vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,128 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Rate limiting and retry utilities for API requests
|
|
3
|
-
*/
|
|
4
|
-
// Default retry/backoff configuration consumed by retryWithBackoff().
// Any field can be overridden per-call via its `options` argument.
const DEFAULT_OPTIONS = {
    maxRetries: 5, // retries after the first attempt (total calls = maxRetries + 1)
    initialDelay: 1000, // 1 second
    maxDelay: 60000, // 60 seconds
    exponentialBase: 2, // per-attempt delay multiplier
    jitter: true // add 0-20% randomness to avoid synchronized retries
};
|
|
11
|
-
/**
 * Compute the wait time for a given retry attempt.
 *
 * Exponential backoff (initialDelay * base^attempt) capped at maxDelay,
 * optionally widened by 0-20% random jitter to avoid a thundering herd.
 *
 * @param {number} attempt - Zero-based retry attempt index.
 * @param {{initialDelay: number, exponentialBase: number, maxDelay: number, jitter: boolean}} options
 * @returns {number} Delay in whole milliseconds.
 */
function calculateDelay(attempt, options) {
    const capped = Math.min(options.initialDelay * options.exponentialBase ** attempt, options.maxDelay);
    // Jitter adds up to 20% of the capped delay so concurrent clients desynchronize.
    const extra = options.jitter ? capped * 0.2 * Math.random() : 0;
    return Math.floor(capped + extra);
}
|
|
24
|
-
/**
 * Extract the Retry-After header from a response and convert it to milliseconds.
 *
 * Supports both header forms: a delay in whole seconds, or an HTTP date
 * (in which case the remaining time from now is returned, clamped to 0).
 *
 * @param {Response|null|undefined} response - Fetch-style response (or null).
 * @returns {number|null} Delay in milliseconds, or null when absent/unparseable.
 */
function getRetryAfter(response) {
    const header = response?.headers.get('Retry-After');
    if (!header) {
        return null;
    }
    // Numeric form: seconds until the client may retry.
    const seconds = parseInt(header, 10);
    if (!Number.isNaN(seconds)) {
        return seconds * 1000;
    }
    // HTTP-date form: absolute retry time; never return a negative delay.
    const timestamp = Date.parse(header);
    return Number.isNaN(timestamp) ? null : Math.max(0, timestamp - Date.now());
}
|
|
45
|
-
/**
 * Decide whether an error represents an HTTP 429 / rate-limit condition.
 *
 * Checks the error's own status, its attached response's status, and —
 * as a heuristic fallback — whether the message mentions 'rate limit' or '429'.
 *
 * @param {any} error - Thrown value to classify (may be null/undefined).
 * @returns {boolean} True when the error should be retried as a rate limit.
 */
function isRateLimitError(error) {
    if (error?.status === 429 || error?.response?.status === 429) {
        return true;
    }
    const message = error?.message;
    return Boolean(message && (message.includes('rate limit') || message.includes('429')));
}
|
|
59
|
-
/**
 * Promise-based delay.
 *
 * @param {number} ms - Milliseconds to wait before resolving.
 * @returns {Promise<void>} Resolves after the timeout fires.
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
|
65
|
-
/**
 * Run an async operation, retrying with exponential backoff on rate-limit errors.
 *
 * Only errors classified by isRateLimitError() are retried; anything else is
 * rethrown immediately. The server's Retry-After header (when present on the
 * error's response) takes precedence over the computed backoff delay.
 *
 * Fix vs. previous version: the response used for Retry-After is now taken
 * from the CURRENT attempt's error only. Previously a `response` variable
 * captured outside the loop persisted across attempts, so a stale header from
 * an earlier failure could drive the delay for a later, unrelated error.
 * A `Retry-After: 0` is now also honored (`??` instead of a falsy check).
 *
 * @param {() => Promise<any>} fn - Operation to attempt.
 * @param {object} [options] - Partial override of DEFAULT_OPTIONS.
 * @returns {Promise<any>} Result of the first successful call.
 * @throws The last rate-limit error after exhausting retries, or the first
 *         non-retryable error immediately.
 */
export async function retryWithBackoff(fn, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    let lastError;
    for (let attempt = 0; attempt <= opts.maxRetries; attempt++) {
        try {
            return await fn();
        }
        catch (error) {
            lastError = error;
            // Non-rate-limit errors are not retryable; surface them immediately.
            if (!isRateLimitError(error)) {
                throw error;
            }
            // Out of attempts: fall through and rethrow the last error.
            if (attempt >= opts.maxRetries) {
                break;
            }
            // Prefer the server-provided Retry-After from THIS attempt's
            // response; otherwise fall back to exponential backoff.
            const retryAfter = getRetryAfter(error?.response ?? null);
            const delay = retryAfter ?? calculateDelay(attempt, opts);
            console.error(`[Rate Limiter] Rate limited (attempt ${attempt + 1}/${opts.maxRetries + 1}). Waiting ${delay}ms...`);
            await sleep(delay);
        }
    }
    throw lastError;
}
|
|
102
|
-
/**
 * Add a delay between requests to avoid hitting rate limits.
 *
 * @param {number} [ms=500] - Pause duration in milliseconds.
 * @returns {Promise<void>} Resolves after the pause.
 */
export async function delayBetweenRequests(ms = 500) {
    await sleep(ms);
}
|
|
108
|
-
/**
 * Fetch wrapper that converts 429/non-OK responses into throwable errors and
 * retries rate-limit failures via retryWithBackoff().
 *
 * @param {string|URL} url - Request target.
 * @param {object} [options] - Passed straight through to fetch().
 * @param {object} [retryOptions] - Overrides for the retry/backoff policy.
 * @returns {Promise<Response>} The successful (2xx/3xx `ok`) response.
 * @throws Error with `status` and `response` attached on HTTP failure.
 */
export async function rateLimitedFetch(url, options = {}, retryOptions = {}) {
    // Build an error carrying status + response so the retry layer can
    // classify it and read Retry-After.
    const httpError = (message, response) => {
        const error = new Error(message);
        error.status = response.status;
        error.response = response;
        return error;
    };
    return retryWithBackoff(async () => {
        const response = await fetch(url, options);
        if (response.status === 429) {
            throw httpError(`Rate limit exceeded: ${response.status}`, response);
        }
        if (!response.ok) {
            throw httpError(`HTTP error: ${response.status}`, response);
        }
        return response;
    }, retryOptions);
}
|
|
@@ -1,448 +0,0 @@
|
|
|
1
|
-
import { listDatasets, datasetInfo } from "@huggingface/hub";
|
|
2
|
-
import { categorizeLicense } from "./license.js";
|
|
3
|
-
import { calculateQualityScore } from "./quality.js";
|
|
4
|
-
import { classifyDomain } from "./domain.js";
|
|
5
|
-
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
-
import { analyzeDatasetQuery, buildIntentSearchQuery, buildHuggingFaceFilterTags, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "../search/query-intent.js";
|
|
7
|
-
/**
 * Scrapes dataset metadata from the HuggingFace Hub via @huggingface/hub.
 *
 * Two entry points:
 *  - scrapeBulk(): fast, shallow listing that emits "skeleton" metadata
 *    records (no per-dataset detail calls) flagged `is_incomplete` for a
 *    later hydration pass.
 *  - scrape(): deep listing that fetches per-dataset card data with bounded
 *    concurrency and retry/backoff, producing fully-populated records.
 *
 * All progress/diagnostic output goes to stderr via console.error.
 */
export class HuggingFaceScraper {
    /**
     * Bulk discovery: Fetch many datasets quickly without deep details.
     * Hits the 25k target in minutes.
     *
     * @param {number} [limit=1000] - Maximum number of records to return.
     * @param {string|object} [queryOrIntent] - Free-text query (analyzed into
     *   an intent) or a pre-built intent object from query-intent.js.
     * @returns {Promise<object[]>} Skeleton metadata records (is_incomplete: true).
     */
    async scrapeBulk(limit = 1000, queryOrIntent) {
        // Normalize the argument: a string is analyzed into an intent object.
        const intent = typeof queryOrIntent === "string"
            ? await analyzeDatasetQuery(queryOrIntent)
            : queryOrIntent;
        const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
        const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
        const filterMsg = query ? `, query: ${query}` : "";
        console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
        const results = [];
        let processed = 0;
        try {
            // Optional auth token raises HF API rate limits.
            const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
            for await (const ds of listDatasets({
                limit: limit,
                additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                ...(hfToken ? { accessToken: hfToken } : {})
            })) {
                if (results.length >= limit)
                    break;
                processed++;
                if (processed % 1000 === 0) {
                    console.error(`[Bulk Scraper] Found ${processed} datasets...`);
                }
                const repoId = ds.name || ds.id;
                const description = ds.description || "";
                const tags = ds.tags || [];
                // Filter out non-repo IDs (hex IDs) if they don't have a namespace
                if (!repoId.includes("/") && /^[a-f0-9]{24}$/.test(repoId))
                    continue;
                // Hard skip: Empty or very short description (cannot search semantically)
                if (!description || description.length < 20)
                    continue;
                const createdAt = ds.createdAt;
                const downloads = ds.downloadsAllTime || ds.downloads || 0;
                const task = this.extractTask(tags);
                // Create "skeleton" metadata — most fields are placeholders to be
                // filled by a later hydration pass (see is_incomplete below).
                const metadata = {
                    id: repoId,
                    source: "huggingface",
                    name: repoId.split("/").pop() || repoId,
                    description: description,
                    quality_warnings: ["Incomplete metadata: run hydration to get full details"],
                    downloads: downloads,
                    likes: ds.likes || 0,
                    stars: 0,
                    tags: tags,
                    // Guarded: createdAt may not be a Date depending on SDK version.
                    last_updated: createdAt instanceof Date ? createdAt.toISOString() : new Date().toISOString(),
                    task: task,
                    domain: classifyDomain(description, tags, repoId, task),
                    languages: this.extractLanguages(tags),
                    splits: [],
                    license: {
                        id: "unknown",
                        category: "unknown",
                        usage_restrictions: [],
                        warnings: ["License not verified yet"]
                    },
                    quality_score: 10, // Default low score for skeleton
                    download_url: `https://huggingface.co/datasets/${repoId}`,
                    total_examples: 0,
                    is_structured: false,
                    has_target_column: false,
                    is_safe_source: true, // Default to true, will be verified during hydration
                    has_personal_data: false,
                    is_paywalled: false,
                    is_scraped_web_data: false,
                    uses_https: true,
                    has_train_split: false,
                    has_test_split: false,
                    has_validation_split: false,
                    description_length: description.length,
                    has_readme: false,
                    is_incomplete: true // Flag for Phase 2
                };
                // Hard language exclusion
                if (intent && shouldExcludeByLanguage(metadata, intent))
                    continue;
                results.push(metadata);
            }
        }
        catch (e) {
            // Best-effort: a listing failure returns whatever was collected so far.
            console.error("[Bulk Scraper] Error:", e.message);
        }
        return results;
    }
    /**
     * Deep scrape: lists datasets, then fetches each dataset's card data with
     * bounded concurrency and retry/backoff, emitting fully-populated records.
     *
     * @param {number} [limit=100] - Maximum records to return.
     * @param {boolean} [applyMVPFilters=true] - When true, over-fetches (30x)
     *   to compensate for filtering; restricted-license datasets are dropped.
     * @param {string|object} [queryOrIntent] - Free-text query or intent object.
     * @returns {Promise<object[]>} Records sorted by intent_score then downloads.
     */
    async scrape(limit = 100, applyMVPFilters = true, queryOrIntent) {
        const intent = typeof queryOrIntent === "string"
            ? await analyzeDatasetQuery(queryOrIntent)
            : queryOrIntent;
        const query = typeof queryOrIntent === "string" ? queryOrIntent : intent?.searchQuery;
        const hfQuery = intent ? buildIntentSearchQuery(intent) : query;
        const filterMsg = query ? `, query: ${query}` : "";
        console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
        const results = [];
        let processed = 0;
        let skippedMVP = 0; // NOTE(review): incremented below but never reported in the summary.
        let rateLimitHits = 0;
        let otherErrors = 0;
        try {
            // Fetch more datasets to account for filtering
            const fetchLimit = applyMVPFilters ? limit * 30 : limit * 10;
            // Support HuggingFace token from environment variable
            const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
            // CRITICAL: Low concurrency without token to avoid rate limits
            // With token: 10 concurrent (HF allows more)
            // Without token: 2 concurrent (stay under ~30 req/min limit)
            const CONCURRENCY = hfToken ? 10 : 2;
            // In-flight detail-fetch tasks; capped at CONCURRENCY below.
            const queue = [];
            if (!hfToken) {
                console.error(`[HF Scraper] ⚠️ No HF_TOKEN set - using conservative rate limits`);
            }
            // Add delay between batches to avoid rate limits
            // NOTE(review): BATCH_DELAY is computed but never used — the inline
            // delayBetweenRequests() call below repeats the same literals.
            const BATCH_DELAY = hfToken ? 500 : 2000;
            const hfFilterTags = intent ? buildHuggingFaceFilterTags(intent) : [];
            for await (const ds of listDatasets({
                limit: fetchLimit,
                additionalFields: ["description", "tags"],
                search: { query: hfQuery, tags: hfFilterTags.length > 0 ? hfFilterTags : undefined },
                ...(hfToken ? { accessToken: hfToken } : {})
            })) {
                if (results.length >= limit)
                    break;
                processed++;
                // Add delay every 100 datasets to avoid aggressive rate limits
                if (processed % 100 === 0 && processed > 0) {
                    await delayBetweenRequests(hfToken ? 500 : 2000);
                }
                const repoId = ds.name || ds.id;
                // Filter out non-repo IDs (hex IDs) if they don't have a namespace
                if (!repoId.includes("/")) {
                    if (/^[a-f0-9]{24}$/.test(repoId))
                        continue;
                }
                const downloads = ds.downloads || 0;
                const tags = ds.tags || [];
                const description = ds.description || "";
                if (processed % 100 === 0) {
                    console.error(`[HF Scraper] Processed ${processed}, found ${results.length}, workers: ${queue.length}...`);
                }
                // Hard skip: Empty or very short description (cannot search semantically)
                if (!description || description.length < 20)
                    continue;
                // Process dataset with concurrency and retry logic
                const processTask = (async () => {
                    try {
                        // Per-dataset detail call, retried on 429 with backoff.
                        const fullInfo = await retryWithBackoff(() => datasetInfo({
                            name: repoId,
                            additionalFields: ["cardData"],
                            ...(hfToken ? { accessToken: hfToken } : {})
                        }), {
                            maxRetries: 3,
                            initialDelay: 2000, // Start with 2 seconds for HF API
                            maxDelay: 30000 // Max 30 seconds
                        });
                        const cardData = fullInfo.cardData || {};
                        // Extract splits from cardData.dataset_info (where HF actually stores them)
                        // cardData.dataset_info can be an object (single config) or array (multi-config)
                        let rawSplits = [];
                        const datasetInfoField = cardData.dataset_info;
                        if (datasetInfoField) {
                            const configs = Array.isArray(datasetInfoField) ? datasetInfoField : [datasetInfoField];
                            for (const config of configs) {
                                if (config?.splits && Array.isArray(config.splits)) {
                                    rawSplits = rawSplits.concat(config.splits);
                                }
                            }
                        }
                        // Fallback: try top-level splits from the SDK (rarely populated)
                        if (rawSplits.length === 0 && fullInfo.splits) {
                            rawSplits = fullInfo.splits;
                        }
                        // Normalize split records: both snake_case and camelCase
                        // field names appear depending on the data source.
                        const splits = rawSplits.map((s) => ({
                            name: s.name,
                            num_examples: s.num_examples || s.numExamples || 0,
                            size_bytes: s.num_bytes || s.sizeBytes || 0
                        }));
                        let totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
                        const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
                        // Fallback: estimate from size_categories when splits give 0
                        if (totalExamples === 0) {
                            const sizeCategories = cardData.size_categories;
                            if (Array.isArray(sizeCategories) && sizeCategories.length > 0) {
                                const cat = sizeCategories[0];
                                // e.g. "1K<n<10K" → take the midpoint of the range.
                                const rangeMatch = cat.match(/([\d.]+[KMB]?)\s*<\s*n\s*<\s*([\d.]+[KMB]?)/i);
                                if (rangeMatch) {
                                    // Parse "10K" / "1.5M" / "2B" style human-readable counts.
                                    const parseHumanNum = (s) => {
                                        const m = s.match(/^([\d.]+)([KMB])?$/i);
                                        if (!m)
                                            return 0;
                                        const base = parseFloat(m[1]);
                                        const suffix = (m[2] || '').toUpperCase();
                                        if (suffix === 'K')
                                            return base * 1000;
                                        if (suffix === 'M')
                                            return base * 1_000_000;
                                        if (suffix === 'B')
                                            return base * 1_000_000_000;
                                        return base;
                                    };
                                    const lo = parseHumanNum(rangeMatch[1]);
                                    const hi = parseHumanNum(rangeMatch[2]);
                                    totalExamples = Math.round((lo + hi) / 2);
                                }
                            }
                        }
                        const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
                        const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
                        // License resolution: tag takes precedence over fullInfo.license.
                        const licenseTag = tags.find(t => t.startsWith("license:"));
                        const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
                        const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
                        const license = categorizeLicense(licenseId, licenseUrl);
                        // Restricted-license datasets are dropped entirely.
                        if (license.category === "restricted") {
                            skippedMVP++;
                            return;
                        }
                        const warnings = [];
                        if (totalExamples < 50)
                            warnings.push("Dataset has very few examples (< 50)");
                        if (description.length < 100)
                            warnings.push("Short description; results may be less relevant");
                        const lastUpdated = ds.updatedAt || fullInfo.updatedAt;
                        if (lastUpdated) {
                            const updateDate = new Date(lastUpdated);
                            const fourYearsAgo = new Date();
                            fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
                            if (updateDate < fourYearsAgo) {
                                warnings.push(`Stale data: Last updated ${updateDate.getFullYear()}`);
                            }
                        }
                        if (splits.length === 0)
                            warnings.push("No data splits found; could be a non-standard format");
                        // Keyword heuristics for safety/provenance flags.
                        const descriptionLower = description.toLowerCase();
                        const tagsLower = tags.map(t => t.toLowerCase()).join(" ");
                        const hasPersonalData = descriptionLower.includes("personal data") ||
                            descriptionLower.includes("gdpr") ||
                            tagsLower.includes("personal-data");
                        const isPaywalled = descriptionLower.includes("paywall") ||
                            descriptionLower.includes("paid");
                        const isScrapedWebData = descriptionLower.includes("scraped") ||
                            tagsLower.includes("scraped");
                        const isSafeSource = !isScrapedWebData && !hasPersonalData && !isPaywalled;
                        if (!isSafeSource)
                            warnings.push("Contains potentially sensitive or paywalled data sources");
                        const format = this.extractFormat(tags, cardData);
                        const columns = this.extractColumns(cardData, splits);
                        const task = this.extractTask(tags);
                        const domain = classifyDomain(description, tags, repoId, task);
                        // REMOVED strict domain filtering that caused search bias
                        // if (query && domain !== query) return;
                        const metadata = {
                            id: repoId,
                            source: "huggingface",
                            name: repoId.split("/").pop() || repoId,
                            description: description,
                            quality_warnings: warnings,
                            downloads: downloads,
                            likes: ds.likes || 0,
                            stars: fullInfo.stars || 0,
                            tags: tags,
                            // NOTE(review): lastUpdated may be a string (API JSON) rather
                            // than a Date; ?.toISOString() would then throw and this
                            // dataset would be counted under otherErrors — confirm the
                            // SDK always returns Date here, unlike the guarded scrapeBulk.
                            last_updated: lastUpdated?.toISOString() || new Date().toISOString(),
                            task: task,
                            domain: domain,
                            languages: this.extractLanguages(tags),
                            splits,
                            license,
                            quality_score: calculateQualityScore({
                                downloads,
                                likes: ds.likes || 0,
                                hasDescription: true,
                                descriptionLength: description.length,
                                hasTrainSplit: splits.some((s) => s.name === "train"),
                                hasTestSplit: splits.some((s) => s.name === "test"),
                                lastUpdated: lastUpdated?.toISOString() || new Date().toISOString(),
                                licenseCategory: license.category
                            }),
                            download_url: `https://huggingface.co/datasets/${repoId}`,
                            format,
                            total_examples: totalExamples,
                            total_size_bytes: totalSizeBytes,
                            total_size_mb: totalSizeMB,
                            columns,
                            is_structured: columns.length > 0 && splits.some((s) => s.name === "train"),
                            has_target_column: columns.some(c => c.is_target === true),
                            is_safe_source: isSafeSource,
                            has_personal_data: hasPersonalData,
                            is_paywalled: isPaywalled,
                            is_scraped_web_data: isScrapedWebData,
                            uses_https: true,
                            has_train_split: splits.some((s) => s.name === "train"),
                            has_test_split: splits.some((s) => s.name === "test"),
                            has_validation_split: hasValidationSplit,
                            description_length: description.length,
                            has_readme: !!(cardData.readme || cardData.readme_content)
                        };
                        // Hard language exclusion — drop bilingual/multilingual for single-language queries
                        if (intent && shouldExcludeByLanguage(metadata, intent)) {
                            // skip — do not push
                        }
                        else {
                            if (intent) {
                                metadata.intent_score = scoreDatasetAgainstIntent(metadata, intent);
                            }
                            results.push(metadata);
                        }
                    }
                    catch (e) {
                        // Track all errors for user feedback
                        if (e?.status === 429 || e?.message?.includes('rate limit')) {
                            rateLimitHits++;
                            if (rateLimitHits <= 3) {
                                console.error(`[HF Scraper] Rate limit hit for ${repoId}`);
                            }
                        }
                        else {
                            otherErrors++;
                            // Log first few non-rate-limit errors for debugging
                            if (otherErrors <= 2) {
                                console.error(`[HF Scraper] Error for ${repoId}: ${e.message?.slice(0, 80)}`);
                            }
                        }
                    }
                })();
                queue.push(processTask);
                // Self-removal keeps the queue holding only in-flight tasks.
                processTask.then(() => {
                    const idx = queue.indexOf(processTask);
                    if (idx !== -1)
                        queue.splice(idx, 1);
                });
                // Backpressure: wait for a slot when the concurrency cap is reached.
                if (queue.length >= CONCURRENCY) {
                    await Promise.race(queue);
                }
            }
            // Wait for remaining tasks
            await Promise.all(queue);
        }
        catch (e) {
            // Handle rate limit errors with better messaging
            if (e?.status === 429 || e?.message?.includes('rate limit')) {
                console.error("[HF Scraper] ❌ Scraping failed due to rate limit:", e.message);
                console.error("[HF Scraper] 💡 Set HF_TOKEN environment variable for unlimited access");
            }
            else {
                console.error("[HF Scraper] ❌ Scraping failed:", e.message);
            }
        }
        // User-friendly summary
        console.error(`[HF Scraper] ✅ Complete: ${results.length} datasets found`);
        if (rateLimitHits > 0) {
            console.error(`[HF Scraper] ⚠️ ${rateLimitHits} requests rate-limited. Set HF_TOKEN for better results.`);
        }
        if (otherErrors > 0) {
            console.error(`[HF Scraper] ⚠️ ${otherErrors} datasets skipped due to errors`);
        }
        // Rank by intent relevance first, popularity second.
        return results.sort((a, b) => {
            const intentDelta = Number(b.intent_score || 0) - Number(a.intent_score || 0);
            if (intentDelta !== 0)
                return intentDelta;
            return b.downloads - a.downloads;
        });
    }
    /**
     * Map HF tags to a known ML task name.
     * @param {string[]} tags
     * @returns {string} First matching task tag, or "unknown".
     */
    extractTask(tags) {
        const taskTags = [
            "text-classification",
            "token-classification",
            "question-answering",
            "translation",
            "summarization",
            "text-generation",
            "image-classification",
            "object-detection",
            "named-entity-recognition",
            "sentiment-analysis",
            "machine-translation"
        ];
        return tags.find(t => taskTags.includes(t)) || "unknown";
    }
    /**
     * Collect language codes from tags: explicit "language:xx" tags plus
     * bare two-letter lowercase tags treated as ISO codes. De-duplicated.
     * @param {string[]} tags
     * @returns {string[]} Unique language codes.
     */
    extractLanguages(tags) {
        const langs = tags
            .filter(t => t.startsWith("language:"))
            .map(t => t.replace("language:", ""));
        const isoLangs = tags.filter(t => t.length === 2 && /^[a-z]{2}$/.test(t));
        return Array.from(new Set([...langs, ...isoLangs]));
    }
    /**
     * Determine the data format (uppercased) from tags or card data.
     * @param {string[]} tags
     * @param {object} cardData
     * @returns {string|undefined} e.g. "PARQUET", or undefined when unknown.
     */
    extractFormat(tags, cardData) {
        // Check tags first
        const formatTag = tags.find(t => t.startsWith("format:"));
        if (formatTag) {
            return formatTag.replace("format:", "").toUpperCase();
        }
        // Check cardData
        if (cardData.format) {
            return String(cardData.format).toUpperCase();
        }
        // Infer from other tags
        if (tags.includes("parquet"))
            return "PARQUET";
        if (tags.includes("csv"))
            return "CSV";
        if (tags.includes("json"))
            return "JSON";
        if (tags.includes("arrow"))
            return "ARROW";
        return undefined;
    }
    /**
     * Extract column descriptors from card data: explicit `columns` array
     * when present, otherwise inferred from the `features` object.
     * @param {object} cardData
     * @param {object[]} splits - Accepted but currently unused.
     * @returns {{name: string, type: any, is_target: boolean}[]}
     */
    extractColumns(cardData, splits) {
        const columns = [];
        // Try to get columns from cardData
        if (cardData.columns) {
            if (Array.isArray(cardData.columns)) {
                return cardData.columns.map((col) => ({
                    name: col.name || String(col),
                    type: col.type,
                    is_target: col.is_target || false
                }));
            }
        }
        // Try to infer from features if available
        if (cardData.features) {
            const features = cardData.features;
            if (typeof features === 'object') {
                for (const [key, value] of Object.entries(features)) {
                    const feature = value;
                    columns.push({
                        name: key,
                        type: feature?.dtype || feature?.type,
                        // NOTE(review): includes("y") flags ANY name containing the
                        // letter y ("year", "country", "type") as a target — likely
                        // intended to be an exact match on "y"; confirm before relying
                        // on has_target_column downstream.
                        is_target: key.toLowerCase().includes("label") ||
                            key.toLowerCase().includes("target") ||
                            key.toLowerCase().includes("y")
                    });
                }
            }
        }
        return columns;
    }
}