vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,128 +0,0 @@
1
- /**
2
- * Rate limiting and retry utilities for API requests
3
- */
4
- const DEFAULT_OPTIONS = {
5
- maxRetries: 5,
6
- initialDelay: 1000, // 1 second
7
- maxDelay: 60000, // 60 seconds
8
- exponentialBase: 2,
9
- jitter: true
10
- };
11
- /**
12
- * Calculate delay with exponential backoff and optional jitter
13
- */
14
- function calculateDelay(attempt, options) {
15
- const exponentialDelay = options.initialDelay * Math.pow(options.exponentialBase, attempt);
16
- const delay = Math.min(exponentialDelay, options.maxDelay);
17
- if (options.jitter) {
18
- // Add random jitter (0-20% of delay) to avoid thundering herd
19
- const jitterAmount = delay * 0.2 * Math.random();
20
- return Math.floor(delay + jitterAmount);
21
- }
22
- return Math.floor(delay);
23
- }
24
- /**
25
- * Extract Retry-After header value from response or error
26
- */
27
- function getRetryAfter(response) {
28
- if (!response)
29
- return null;
30
- const retryAfter = response.headers.get('Retry-After');
31
- if (!retryAfter)
32
- return null;
33
- // Retry-After can be a number of seconds or an HTTP date
34
- const seconds = parseInt(retryAfter, 10);
35
- if (!isNaN(seconds)) {
36
- return seconds * 1000; // Convert to milliseconds
37
- }
38
- // Try parsing as HTTP date
39
- const date = Date.parse(retryAfter);
40
- if (!isNaN(date)) {
41
- return Math.max(0, date - Date.now());
42
- }
43
- return null;
44
- }
45
- /**
46
- * Check if error is a rate limit error (429)
47
- */
48
- function isRateLimitError(error) {
49
- if (error?.status === 429)
50
- return true;
51
- if (error?.response?.status === 429)
52
- return true;
53
- if (error?.message?.includes('rate limit'))
54
- return true;
55
- if (error?.message?.includes('429'))
56
- return true;
57
- return false;
58
- }
59
- /**
60
- * Sleep for specified milliseconds
61
- */
62
- function sleep(ms) {
63
- return new Promise(resolve => setTimeout(resolve, ms));
64
- }
65
- /**
66
- * Retry a function with exponential backoff on rate limit errors
67
- */
68
- export async function retryWithBackoff(fn, options = {}) {
69
- const opts = { ...DEFAULT_OPTIONS, ...options };
70
- let lastError;
71
- let response = null;
72
- for (let attempt = 0; attempt <= opts.maxRetries; attempt++) {
73
- try {
74
- const result = await fn();
75
- return result;
76
- }
77
- catch (error) {
78
- lastError = error;
79
- // Extract response if available
80
- if (error?.response) {
81
- response = error.response;
82
- }
83
- // Only retry on rate limit errors
84
- if (!isRateLimitError(error)) {
85
- throw error;
86
- }
87
- // Don't retry on last attempt
88
- if (attempt >= opts.maxRetries) {
89
- break;
90
- }
91
- // Calculate delay
92
- let delay = getRetryAfter(response);
93
- if (!delay) {
94
- delay = calculateDelay(attempt, opts);
95
- }
96
- console.error(`[Rate Limiter] Rate limited (attempt ${attempt + 1}/${opts.maxRetries + 1}). Waiting ${delay}ms...`);
97
- await sleep(delay);
98
- }
99
- }
100
- throw lastError;
101
- }
102
- /**
103
- * Add a delay between requests to avoid hitting rate limits
104
- */
105
- export async function delayBetweenRequests(ms = 500) {
106
- await sleep(ms);
107
- }
108
- /**
109
- * Rate-limited fetch wrapper with automatic retry
110
- */
111
- export async function rateLimitedFetch(url, options = {}, retryOptions = {}) {
112
- return retryWithBackoff(async () => {
113
- const response = await fetch(url, options);
114
- if (response.status === 429) {
115
- const error = new Error(`Rate limit exceeded: ${response.status}`);
116
- error.status = 429;
117
- error.response = response;
118
- throw error;
119
- }
120
- if (!response.ok) {
121
- const error = new Error(`HTTP error: ${response.status}`);
122
- error.status = response.status;
123
- error.response = response;
124
- throw error;
125
- }
126
- return response;
127
- }, retryOptions);
128
- }
@@ -1,448 +0,0 @@
1
- import { listDatasets, datasetInfo } from "@huggingface/hub";
2
- import { categorizeLicense } from "./license.js";
3
- import { calculateQualityScore } from "./quality.js";
4
- import { classifyDomain } from "./domain.js";
5
- import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
6
- import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
7
- export class HuggingFaceScraper {
8
- /**
9
- * Bulk discovery: Fetch many datasets quickly without deep details.
10
- * Hits the 25k target in minutes.
11
- */
12
- async scrapeBulk(limit = 1000, queryOrIntent) {
13
- const intent = typeof queryOrIntent === "string"
14
- ? await analyzeDatasetQuery(queryOrIntent)
15
- : queryOrIntent;
16
- const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
17
- const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
18
- const filterMsg = query ? `, query: ${query}` : "";
19
- console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
20
- const results = [];
21
- let processed = 0;
22
- try {
23
- const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
24
- const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
25
- for await (const ds of listDatasets({
26
- limit: limit,
27
- additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
28
- search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
29
- ...(hfToken ? { accessToken: hfToken } : {})
30
- })) {
31
- if (results.length >= limit)
32
- break;
33
- processed++;
34
- if (processed % 1000 === 0) {
35
- console.error(`[Bulk Scraper] Found ${processed} datasets...`);
36
- }
37
- const repoId = ds.name || ds.id;
38
- const description = ds.description || "";
39
- const tags = ds.tags || [];
40
- // Filter out non-repo IDs (hex IDs) if they don't have a namespace
41
- if (!repoId.includes("/") && /^[a-f0-9]{24}$/.test(repoId))
42
- continue;
43
- // Hard skip: Empty or very short description (cannot search semantically)
44
- if (!description || description.length < 20)
45
- continue;
46
- const createdAt = ds.createdAt;
47
- const downloads = ds.downloadsAllTime || ds.downloads || 0;
48
- const task = this.extractTask(tags);
49
- // Create "skeleton" metadata
50
- const metadata = {
51
- id: repoId,
52
- source: "huggingface",
53
- name: repoId.split("/").pop() || repoId,
54
- description: description,
55
- quality_warnings: ["Incomplete metadata: run hydration to get full details"],
56
- downloads: downloads,
57
- likes: ds.likes || 0,
58
- stars: 0,
59
- tags: tags,
60
- last_updated: createdAt instanceof Date ? createdAt.toISOString() : new Date().toISOString(),
61
- task: task,
62
- domain: classifyDomain(description, tags, repoId, task),
63
- languages: this.extractLanguages(tags),
64
- splits: [],
65
- license: {
66
- id: "unknown",
67
- category: "unknown",
68
- usage_restrictions: [],
69
- warnings: ["License not verified yet"]
70
- },
71
- quality_score: 10, // Default low score for skeleton
72
- download_url: `https://huggingface.co/datasets/${repoId}`,
73
- total_examples: 0,
74
- is_structured: false,
75
- has_target_column: false,
76
- is_safe_source: true, // Default to true, will be verified during hydration
77
- has_personal_data: false,
78
- is_paywalled: false,
79
- is_scraped_web_data: false,
80
- uses_https: true,
81
- has_train_split: false,
82
- has_test_split: false,
83
- has_validation_split: false,
84
- description_length: description.length,
85
- has_readme: false,
86
- is_incomplete: true // Flag for Phase 2
87
- };
88
- // Hard language exclusion
89
- if (intent && shouldExcludeByLanguage(metadata, intent))
90
- continue;
91
- results.push(metadata);
92
- }
93
- }
94
- catch (e) {
95
- console.error("[Bulk Scraper] Error:", e.message);
96
- }
97
- return results;
98
- }
99
- async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
100
- const intent = typeof queryOrIntent === "string"
101
- ? await analyzeDatasetQuery(queryOrIntent)
102
- : queryOrIntent;
103
- const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
104
- const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
105
- const filterMsg = query ? `, query: ${query}` : "";
106
- console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
107
- const results = [];
108
- let processed = 0;
109
- let skippedMVP = 0;
110
- let rateLimitHits = 0;
111
- let otherErrors = 0;
112
- try {
113
- // Fetch more datasets to account for filtering
114
- const fetchLimit = applyMVPFilters ? limit * 30 : limit * 10;
115
- // Support HuggingFace token from environment variable
116
- const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
117
- // CRITICAL: Low concurrency without token to avoid rate limits
118
- // With token: 10 concurrent (HF allows more)
119
- // Without token: 2 concurrent (stay under ~30 req/min limit)
120
- const CONCURRENCY = hfToken ? 10 : 2;
121
- const queue = [];
122
- if (!hfToken) {
123
- console.error(`[HF Scraper] ⚠️ No HF_TOKEN set - using conservative rate limits`);
124
- }
125
- // Add delay between batches to avoid rate limits
126
- const BATCH_DELAY = hfToken ? 500 : 2000;
127
- const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
128
- for await (const ds of listDatasets({
129
- limit: fetchLimit,
130
- additionalFields: ["description", "tags"],
131
- search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
132
- ...(hfToken ? { accessToken: hfToken } : {})
133
- })) {
134
- if (results.length >= limit)
135
- break;
136
- processed++;
137
- // Add delay every 100 datasets to avoid aggressive rate limits
138
- if (processed % 100 === 0 && processed > 0) {
139
- await delayBetweenRequests(hfToken ? 500 : 2000);
140
- }
141
- const repoId = ds.name || ds.id;
142
- // Filter out non-repo IDs (hex IDs) if they don't have a namespace
143
- if (!repoId.includes("/")) {
144
- if (/^[a-f0-9]{24}$/.test(repoId))
145
- continue;
146
- }
147
- const downloads = ds.downloads || 0;
148
- const tags = ds.tags || [];
149
- const description = ds.description || "";
150
- if (processed % 100 === 0) {
151
- console.error(`[HF Scraper] Processed ${processed}, found ${results.length}, workers: ${queue.length}...`);
152
- }
153
- // Hard skip: Empty or very short description (cannot search semantically)
154
- if (!description || description.length < 20)
155
- continue;
156
- // Process dataset with concurrency and retry logic
157
- const processTask = (async () => {
158
- try {
159
- const fullInfo = await retryWithBackoff(() => datasetInfo({
160
- name: repoId,
161
- additionalFields: ["cardData"],
162
- ...(hfToken ? { accessToken: hfToken } : {})
163
- }), {
164
- maxRetries: 3,
165
- initialDelay: 2000, // Start with 2 seconds for HF API
166
- maxDelay: 30000 // Max 30 seconds
167
- });
168
- const cardData = fullInfo.cardData || {};
169
- // Extract splits from cardData.dataset_info (where HF actually stores them)
170
- // cardData.dataset_info can be an object (single config) or array (multi-config)
171
- let rawSplits = [];
172
- const datasetInfoField = cardData.dataset_info;
173
- if (datasetInfoField) {
174
- const configs = Array.isArray(datasetInfoField) ? datasetInfoField : [datasetInfoField];
175
- for (const config of configs) {
176
- if (config?.splits && Array.isArray(config.splits)) {
177
- rawSplits = rawSplits.concat(config.splits);
178
- }
179
- }
180
- }
181
- // Fallback: try top-level splits from the SDK (rarely populated)
182
- if (rawSplits.length === 0 && fullInfo.splits) {
183
- rawSplits = fullInfo.splits;
184
- }
185
- const splits = rawSplits.map((s) => ({
186
- name: s.name,
187
- num_examples: s.num_examples || s.numExamples || 0,
188
- size_bytes: s.num_bytes || s.sizeBytes || 0
189
- }));
190
- let totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
191
- const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
192
- // Fallback: estimate from size_categories when splits give 0
193
- if (totalExamples === 0) {
194
- const sizeCategories = cardData.size_categories;
195
- if (Array.isArray(sizeCategories) && sizeCategories.length > 0) {
196
- const cat = sizeCategories[0];
197
- const rangeMatch = cat.match(/([\d.]+[KMB]?)\s*<\s*n\s*<\s*([\d.]+[KMB]?)/i);
198
- if (rangeMatch) {
199
- const parseHumanNum = (s) => {
200
- const m = s.match(/^([\d.]+)([KMB])?$/i);
201
- if (!m)
202
- return 0;
203
- const base = parseFloat(m[1]);
204
- const suffix = (m[2] || '').toUpperCase();
205
- if (suffix === 'K')
206
- return base * 1000;
207
- if (suffix === 'M')
208
- return base * 1_000_000;
209
- if (suffix === 'B')
210
- return base * 1_000_000_000;
211
- return base;
212
- };
213
- const lo = parseHumanNum(rangeMatch[1]);
214
- const hi = parseHumanNum(rangeMatch[2]);
215
- totalExamples = Math.round((lo + hi) / 2);
216
- }
217
- }
218
- }
219
- const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
220
- const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
221
- const licenseTag = tags.find(t => t.startsWith("license:"));
222
- const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
223
- const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
224
- const license = categorizeLicense(licenseId, licenseUrl);
225
- if (license.category === "restricted") {
226
- skippedMVP++;
227
- return;
228
- }
229
- const warnings = [];
230
- if (totalExamples < 50)
231
- warnings.push("Dataset has very few examples (< 50)");
232
- if (description.length < 100)
233
- warnings.push("Short description; results may be less relevant");
234
- const lastUpdated = ds.updatedAt || fullInfo.updatedAt;
235
- if (lastUpdated) {
236
- const updateDate = new Date(lastUpdated);
237
- const fourYearsAgo = new Date();
238
- fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
239
- if (updateDate < fourYearsAgo) {
240
- warnings.push(`Stale data: Last updated ${updateDate.getFullYear()}`);
241
- }
242
- }
243
- if (splits.length === 0)
244
- warnings.push("No data splits found; could be a non-standard format");
245
- const descriptionLower = description.toLowerCase();
246
- const tagsLower = tags.map(t => t.toLowerCase()).join(" ");
247
- const hasPersonalData = descriptionLower.includes("personal data") ||
248
- descriptionLower.includes("gdpr") ||
249
- tagsLower.includes("personal-data");
250
- const isPaywalled = descriptionLower.includes("paywall") ||
251
- descriptionLower.includes("paid");
252
- const isScrapedWebData = descriptionLower.includes("scraped") ||
253
- tagsLower.includes("scraped");
254
- const isSafeSource = !isScrapedWebData && !hasPersonalData && !isPaywalled;
255
- if (!isSafeSource)
256
- warnings.push("Contains potentially sensitive or paywalled data sources");
257
- const format = this.extractFormat(tags, cardData);
258
- const columns = this.extractColumns(cardData, splits);
259
- const task = this.extractTask(tags);
260
- const domain = classifyDomain(description, tags, repoId, task);
261
- // REMOVED strict domain filtering that caused search bias
262
- // if (query && domain !== query) return;
263
- const metadata = {
264
- id: repoId,
265
- source: "huggingface",
266
- name: repoId.split("/").pop() || repoId,
267
- description: description,
268
- quality_warnings: warnings,
269
- downloads: downloads,
270
- likes: ds.likes || 0,
271
- stars: fullInfo.stars || 0,
272
- tags: tags,
273
- last_updated: lastUpdated?.toISOString() || new Date().toISOString(),
274
- task: task,
275
- domain: domain,
276
- languages: this.extractLanguages(tags),
277
- splits,
278
- license,
279
- quality_score: calculateQualityScore({
280
- downloads,
281
- likes: ds.likes || 0,
282
- hasDescription: true,
283
- descriptionLength: description.length,
284
- hasTrainSplit: splits.some((s) => s.name === "train"),
285
- hasTestSplit: splits.some((s) => s.name === "test"),
286
- lastUpdated: lastUpdated?.toISOString() || new Date().toISOString(),
287
- licenseCategory: license.category
288
- }),
289
- download_url: `https://huggingface.co/datasets/${repoId}`,
290
- format,
291
- total_examples: totalExamples,
292
- total_size_bytes: totalSizeBytes,
293
- total_size_mb: totalSizeMB,
294
- columns,
295
- is_structured: columns.length > 0 && splits.some((s) => s.name === "train"),
296
- has_target_column: columns.some(c => c.is_target === true),
297
- is_safe_source: isSafeSource,
298
- has_personal_data: hasPersonalData,
299
- is_paywalled: isPaywalled,
300
- is_scraped_web_data: isScrapedWebData,
301
- uses_https: true,
302
- has_train_split: splits.some((s) => s.name === "train"),
303
- has_test_split: splits.some((s) => s.name === "test"),
304
- has_validation_split: hasValidationSplit,
305
- description_length: description.length,
306
- has_readme: !!(cardData.readme || cardData.readme_content)
307
- };
308
- // Hard language exclusion — drop bilingual/multilingual for single-language queries
309
- if (intent && shouldExcludeByLanguage(metadata, intent)) {
310
- // skip — do not push
311
- }
312
- else {
313
- if (intent) {
314
- metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
315
- }
316
- results.push(metadata);
317
- }
318
- }
319
- catch (e) {
320
- // Track all errors for user feedback
321
- if (e?.status === 429 || e?.message?.includes('rate limit')) {
322
- rateLimitHits++;
323
- if (rateLimitHits <= 3) {
324
- console.error(`[HF Scraper] Rate limit hit for ${repoId}`);
325
- }
326
- }
327
- else {
328
- otherErrors++;
329
- // Log first few non-rate-limit errors for debugging
330
- if (otherErrors <= 2) {
331
- console.error(`[HF Scraper] Error for ${repoId}: ${e.message?.slice(0, 80)}`);
332
- }
333
- }
334
- }
335
- })();
336
- queue.push(processTask);
337
- processTask.then(() => {
338
- const idx = queue.indexOf(processTask);
339
- if (idx !== -1)
340
- queue.splice(idx, 1);
341
- });
342
- if (queue.length >= CONCURRENCY) {
343
- await Promise.race(queue);
344
- }
345
- }
346
- // Wait for remaining tasks
347
- await Promise.all(queue);
348
- }
349
- catch (e) {
350
- // Handle rate limit errors with better messaging
351
- if (e?.status === 429 || e?.message?.includes('rate limit')) {
352
- console.error("[HF Scraper] ❌ Scraping failed due to rate limit:", e.message);
353
- console.error("[HF Scraper] 💡 Set HF_TOKEN environment variable for unlimited access");
354
- }
355
- else {
356
- console.error("[HF Scraper] ❌ Scraping failed:", e.message);
357
- }
358
- }
359
- // User-friendly summary
360
- console.error(`[HF Scraper] ✅ Complete: ${results.length} datasets found`);
361
- if (rateLimitHits > 0) {
362
- console.error(`[HF Scraper] ⚠️ ${rateLimitHits} requests rate-limited. Set HF_TOKEN for better results.`);
363
- }
364
- if (otherErrors > 0) {
365
- console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
366
- }
367
- return results.sort((a, b) => {
368
- const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
369
- if (intentDelta !== 0)
370
- return intentDelta;
371
- return b.downloads - a.downloads;
372
- });
373
- }
374
- extractTask(tags) {
375
- const taskTags = [
376
- "text-classification",
377
- "token-classification",
378
- "question-answering",
379
- "translation",
380
- "summarization",
381
- "text-generation",
382
- "image-classification",
383
- "object-detection",
384
- "named-entity-recognition",
385
- "sentiment-analysis",
386
- "machine-translation"
387
- ];
388
- return tags.find(t => taskTags.includes(t)) || "unknown";
389
- }
390
- extractLanguages(tags) {
391
- const langs = tags
392
- .filter(t => t.startsWith("language:"))
393
- .map(t => t.replace("language:", ""));
394
- const isoLangs = tags.filter(t => t.length === 2 && /^[a-z]{2}$/.test(t));
395
- return Array.from(new Set([...langs, ...isoLangs]));
396
- }
397
- extractFormat(tags, cardData) {
398
- // Check tags first
399
- const formatTag = tags.find(t => t.startsWith("format:"));
400
- if (formatTag) {
401
- return formatTag.replace("format:", "").toUpperCase();
402
- }
403
- // Check cardData
404
- if (cardData.format) {
405
- return String(cardData.format).toUpperCase();
406
- }
407
- // Infer from other tags
408
- if (tags.includes("parquet"))
409
- return "PARQUET";
410
- if (tags.includes("csv"))
411
- return "CSV";
412
- if (tags.includes("json"))
413
- return "JSON";
414
- if (tags.includes("arrow"))
415
- return "ARROW";
416
- return undefined;
417
- }
418
- extractColumns(cardData, splits) {
419
- const columns = [];
420
- // Try to get columns from cardData
421
- if (cardData.columns) {
422
- if (Array.isArray(cardData.columns)) {
423
- return cardData.columns.map((col) => ({
424
- name: col.name || String(col),
425
- type: col.type,
426
- is_target: col.is_target || false
427
- }));
428
- }
429
- }
430
- // Try to infer from features if available
431
- if (cardData.features) {
432
- const features = cardData.features;
433
- if (typeof features === 'object') {
434
- for (const [key, value] of Object.entries(features)) {
435
- const feature = value;
436
- columns.push({
437
- name: key,
438
- type: feature?.dtype || feature?.type,
439
- is_target: key.toLowerCase().includes("label") ||
440
- key.toLowerCase().includes("target") ||
441
- key.toLowerCase().includes("y")
442
- });
443
- }
444
- }
445
- }
446
- return columns;
447
- }
448
- }