@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
import { listDatasets, datasetInfo } from "@huggingface/hub";
|
|
2
|
+
import { categorizeLicense } from "./license.js";
|
|
3
|
+
import { calculateQualityScore } from "./quality.js";
|
|
4
|
+
import { classifyDomain } from "./domain.js";
|
|
5
|
+
import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
|
|
6
|
+
export class HuggingFaceScraper {
    /**
     * Bulk discovery: fetch many datasets quickly without deep details.
     * Produces "skeleton" metadata records (flagged `is_incomplete`) meant to
     * be hydrated later. Hits the 25k target in minutes.
     *
     * @param {number} [limit=1000] - Maximum number of records to return.
     * @param {string} [domainFilter] - Optional search query forwarded to the HF API.
     * @returns {Promise<object[]>} Skeleton metadata records. Never rejects;
     *   API errors are logged and the partial result list is returned.
     */
    async scrapeBulk(limit = 1000, domainFilter) {
        const filterMsg = domainFilter ? `, domain: ${domainFilter}` : "";
        console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
        const results = [];
        let processed = 0;
        try {
            // A token raises HuggingFace API rate limits; anonymous otherwise.
            const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
            for await (const ds of listDatasets({
                limit: limit,
                additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
                search: { query: domainFilter },
                ...(hfToken ? { accessToken: hfToken } : {})
            })) {
                if (results.length >= limit)
                    break;
                processed++;
                if (processed % 1000 === 0) {
                    console.error(`[Bulk Scraper] Found ${processed} datasets...`);
                }
                const repoId = ds.name || ds.id;
                const description = ds.description || "";
                const tags = ds.tags || [];
                // Filter out non-repo IDs (24-char hex object IDs) that lack a namespace
                if (!repoId.includes("/") && /^[a-f0-9]{24}$/.test(repoId))
                    continue;
                // Hard skip: empty or very short description (cannot search semantically)
                if (!description || description.length < 20)
                    continue;
                const createdAt = ds.createdAt;
                const downloads = ds.downloadsAllTime || ds.downloads || 0;
                const task = this.extractTask(tags);
                // Create "skeleton" metadata: license/splits/columns are unverified
                // placeholders until hydration (Phase 2) fills them in.
                const metadata = {
                    id: repoId,
                    source: "huggingface",
                    name: repoId.split("/").pop() || repoId,
                    description: description,
                    quality_warnings: ["Incomplete metadata: run hydration to get full details"],
                    downloads: downloads,
                    likes: ds.likes || 0,
                    stars: 0,
                    tags: tags,
                    last_updated: createdAt instanceof Date ? createdAt.toISOString() : new Date().toISOString(),
                    task: task,
                    domain: classifyDomain(description, tags, repoId, task),
                    languages: this.extractLanguages(tags),
                    splits: [],
                    license: {
                        id: "unknown",
                        category: "unknown",
                        usage_restrictions: [],
                        warnings: ["License not verified yet"]
                    },
                    quality_score: 10, // Default low score for skeleton
                    download_url: `https://huggingface.co/datasets/${repoId}`,
                    total_examples: 0,
                    is_structured: false,
                    has_target_column: false,
                    is_safe_source: true, // Default to true, will be verified during hydration
                    has_personal_data: false,
                    is_paywalled: false,
                    is_scraped_web_data: false,
                    uses_https: true,
                    has_train_split: false,
                    has_test_split: false,
                    has_validation_split: false,
                    description_length: description.length,
                    has_readme: false,
                    is_incomplete: true // Flag for Phase 2 hydration
                };
                results.push(metadata);
            }
        }
        catch (e) {
            console.error("[Bulk Scraper] Error:", e.message);
        }
        return results;
    }
    /**
     * Deep scrape: list datasets and hydrate each one with full details
     * (splits, license, card data) via concurrent `datasetInfo` calls,
     * bounded by a fixed worker pool with retry/backoff.
     *
     * FIX: `lastUpdated` is normalized to a valid Date before calling
     * `.toISOString()`. The previous code called it unconditionally, which
     * throws when the API surfaces the timestamp as a string (the sibling
     * `scrapeBulk` already guards with `instanceof Date`); the throw was
     * swallowed by the per-dataset catch, silently dropping the dataset.
     * Also removed the unused `BATCH_DELAY` local.
     *
     * @param {number} [limit=100] - Maximum number of fully-hydrated records to return.
     * @param {boolean} [applyMVPFilters=true] - Widens the fetch window to compensate for filtering.
     * @param {string} [domainFilter] - Optional: filter by domain (medicine, healthcare, security, etc.).
     * @returns {Promise<object[]>} Hydrated metadata sorted by downloads (descending).
     */
    async scrape(limit = 100, applyMVPFilters = true, domainFilter) {
        const filterMsg = domainFilter ? `, domain: ${domainFilter}` : "";
        console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
        const results = [];
        let processed = 0;
        let skippedMVP = 0;
        try {
            // Fetch more datasets than requested to account for filtering losses.
            const fetchLimit = applyMVPFilters ? limit * 30 : limit * 10;
            const CONCURRENCY = 25; // Increased for high-volume indexing
            const queue = [];
            // Support HuggingFace token from environment variable
            const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
            for await (const ds of listDatasets({
                limit: fetchLimit,
                additionalFields: ["description", "tags"],
                search: { query: domainFilter },
                ...(hfToken ? { accessToken: hfToken } : {})
            })) {
                if (results.length >= limit)
                    break;
                processed++;
                // Pause every 100 datasets to avoid aggressive rate limits
                // (shorter delay when authenticated).
                if (processed % 100 === 0) {
                    await delayBetweenRequests(hfToken ? 500 : 2000);
                }
                const repoId = ds.name || ds.id;
                // Filter out non-repo IDs (24-char hex object IDs) without a namespace
                if (!repoId.includes("/")) {
                    if (/^[a-f0-9]{24}$/.test(repoId))
                        continue;
                }
                const downloads = ds.downloads || 0;
                const tags = ds.tags || [];
                const description = ds.description || "";
                if (processed % 100 === 0) {
                    console.error(`[HF Scraper] Processed ${processed}, found ${results.length}, workers: ${queue.length}...`);
                }
                // Hard skip: empty or very short description (cannot search semantically)
                if (!description || description.length < 20)
                    continue;
                // Hydrate this dataset in the background with retry logic.
                const processTask = (async () => {
                    try {
                        const fullInfo = await retryWithBackoff(() => datasetInfo({
                            name: repoId,
                            additionalFields: ["cardData"],
                            ...(hfToken ? { accessToken: hfToken } : {})
                        }), {
                            maxRetries: 3,
                            initialDelay: 2000, // Start with 2 seconds for HF API
                            maxDelay: 30000 // Max 30 seconds
                        });
                        const splits = fullInfo.splits?.map((s) => ({
                            name: s.name,
                            num_examples: s.numExamples || 0,
                            size_bytes: s.sizeBytes
                        })) || [];
                        const totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
                        const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
                        const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
                        const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
                        // License: prefer the license: tag, fall back to the API field.
                        const licenseTag = tags.find(t => t.startsWith("license:"));
                        const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
                        const cardData = fullInfo.cardData || {};
                        const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
                        const license = categorizeLicense(licenseId, licenseUrl);
                        if (license.category === "restricted") {
                            skippedMVP++;
                            return;
                        }
                        const warnings = [];
                        if (totalExamples < 50)
                            warnings.push("Dataset has very few examples (< 50)");
                        if (description.length < 100)
                            warnings.push("Short description; results may be less relevant");
                        const lastUpdated = ds.updatedAt || fullInfo.updatedAt;
                        // Normalize to a Date: the hub API may return either a Date
                        // object or a date string here — TODO confirm against the
                        // @huggingface/hub typings.
                        const updateDate = lastUpdated ? new Date(lastUpdated) : null;
                        const lastUpdatedIso = updateDate && !Number.isNaN(updateDate.getTime())
                            ? updateDate.toISOString()
                            : new Date().toISOString();
                        if (updateDate) {
                            const fourYearsAgo = new Date();
                            fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
                            if (updateDate < fourYearsAgo) {
                                warnings.push(`Stale data: Last updated ${updateDate.getFullYear()}`);
                            }
                        }
                        if (splits.length === 0)
                            warnings.push("No data splits found; could be a non-standard format");
                        // Heuristic safety flags from description/tag keywords.
                        const descriptionLower = description.toLowerCase();
                        const tagsLower = tags.map(t => t.toLowerCase()).join(" ");
                        const hasPersonalData = descriptionLower.includes("personal data") ||
                            descriptionLower.includes("gdpr") ||
                            tagsLower.includes("personal-data");
                        const isPaywalled = descriptionLower.includes("paywall") ||
                            descriptionLower.includes("paid");
                        const isScrapedWebData = descriptionLower.includes("scraped") ||
                            tagsLower.includes("scraped");
                        const isSafeSource = !isScrapedWebData && !hasPersonalData && !isPaywalled;
                        if (!isSafeSource)
                            warnings.push("Contains potentially sensitive or paywalled data sources");
                        const format = this.extractFormat(tags, cardData);
                        const columns = this.extractColumns(cardData, splits);
                        const task = this.extractTask(tags);
                        const domain = classifyDomain(description, tags, repoId, task);
                        if (domainFilter && domain !== domainFilter)
                            return;
                        const metadata = {
                            id: repoId,
                            source: "huggingface",
                            name: repoId.split("/").pop() || repoId,
                            description: description,
                            quality_warnings: warnings,
                            downloads: downloads,
                            likes: ds.likes || 0,
                            stars: fullInfo.stars || 0,
                            tags: tags,
                            last_updated: lastUpdatedIso,
                            task: task,
                            domain: domain,
                            languages: this.extractLanguages(tags),
                            splits,
                            license,
                            quality_score: calculateQualityScore({
                                downloads,
                                likes: ds.likes || 0,
                                hasDescription: true,
                                descriptionLength: description.length,
                                hasTrainSplit: splits.some((s) => s.name === "train"),
                                hasTestSplit: splits.some((s) => s.name === "test"),
                                lastUpdated: lastUpdatedIso,
                                licenseCategory: license.category
                            }),
                            download_url: `https://huggingface.co/datasets/${repoId}`,
                            format,
                            total_examples: totalExamples,
                            total_size_bytes: totalSizeBytes,
                            total_size_mb: totalSizeMB,
                            columns,
                            is_structured: columns.length > 0 && splits.some((s) => s.name === "train"),
                            has_target_column: columns.some(c => c.is_target === true),
                            is_safe_source: isSafeSource,
                            has_personal_data: hasPersonalData,
                            is_paywalled: isPaywalled,
                            is_scraped_web_data: isScrapedWebData,
                            uses_https: true,
                            has_train_split: splits.some((s) => s.name === "train"),
                            has_test_split: splits.some((s) => s.name === "test"),
                            has_validation_split: hasValidationSplit,
                            description_length: description.length,
                            has_readme: !!(cardData.readme || cardData.readme_content)
                        };
                        results.push(metadata);
                    }
                    catch (e) {
                        // Log rate limit errors, silently skip others (hydration
                        // failures are expected at this scale).
                        if (e?.status === 429 || e?.message?.includes('rate limit')) {
                            console.error(`[HF Scraper] Rate limit error for ${repoId}: ${e.message}`);
                        }
                    }
                })();
                queue.push(processTask);
                // Remove the worker from the pool as soon as it settles.
                processTask.then(() => {
                    const idx = queue.indexOf(processTask);
                    if (idx !== -1)
                        queue.splice(idx, 1);
                });
                if (queue.length >= CONCURRENCY) {
                    await Promise.race(queue);
                }
            }
            // Wait for remaining tasks
            await Promise.all(queue);
        }
        catch (e) {
            // Handle rate limit errors with better messaging
            if (e?.status === 429 || e?.message?.includes('rate limit')) {
                console.error("Scraping failed due to rate limit:", e.message);
                console.error("Consider setting HF_TOKEN environment variable to increase rate limits");
            }
            else {
                console.error("Scraping failed overall:", e.message);
            }
        }
        console.error(`[HF Scraper] Complete: ${results.length} datasets scraped, ${skippedMVP} skipped`);
        // Sort by downloads descending
        return results.sort((a, b) => b.downloads - a.downloads);
    }
    /**
     * Return the first tag that names a known ML task, or "unknown".
     * @param {string[]} tags
     * @returns {string}
     */
    extractTask(tags) {
        const taskTags = [
            "text-classification",
            "token-classification",
            "question-answering",
            "translation",
            "summarization",
            "text-generation",
            "image-classification",
            "object-detection",
            "named-entity-recognition",
            "sentiment-analysis",
            "machine-translation"
        ];
        return tags.find(t => taskTags.includes(t)) || "unknown";
    }
    /**
     * Collect language codes from `language:` prefixed tags plus bare
     * two-letter ISO tags, deduplicated, preserving first-seen order.
     * (The anchored regex already enforces length 2, so the previous
     * redundant `t.length === 2` check was dropped.)
     * @param {string[]} tags
     * @returns {string[]}
     */
    extractLanguages(tags) {
        const prefixed = tags
            .filter(t => t.startsWith("language:"))
            .map(t => t.replace("language:", ""));
        const isoLangs = tags.filter(t => /^[a-z]{2}$/.test(t));
        return Array.from(new Set([...prefixed, ...isoLangs]));
    }
    /**
     * Determine the on-disk data format, uppercased: a `format:` tag wins,
     * then `cardData.format`, then well-known bare tags.
     * @param {string[]} tags
     * @param {object} cardData
     * @returns {string|undefined} e.g. "PARQUET", or undefined if unknown.
     */
    extractFormat(tags, cardData) {
        // Check tags first
        const formatTag = tags.find(t => t.startsWith("format:"));
        if (formatTag) {
            return formatTag.replace("format:", "").toUpperCase();
        }
        // Check cardData
        if (cardData.format) {
            return String(cardData.format).toUpperCase();
        }
        // Infer from other tags
        if (tags.includes("parquet"))
            return "PARQUET";
        if (tags.includes("csv"))
            return "CSV";
        if (tags.includes("json"))
            return "JSON";
        if (tags.includes("arrow"))
            return "ARROW";
        return undefined;
    }
    /**
     * Extract column descriptors from the dataset card. An explicit
     * `cardData.columns` array wins; otherwise columns are inferred from the
     * `cardData.features` mapping, guessing the target column by the
     * conventional names "label", "target", or exactly "y".
     *
     * FIX: the target heuristic previously used `.includes("y")`, which
     * marked ANY column whose name merely contains the letter "y"
     * (e.g. "year", "summary", "city") as a target. It now requires the
     * name to be exactly "y".
     *
     * @param {object} cardData
     * @param {object[]} splits - Unused; kept for interface compatibility.
     * @returns {{name: string, type: (string|undefined), is_target: boolean}[]}
     */
    extractColumns(cardData, splits) {
        const columns = [];
        // Try to get columns from cardData
        if (cardData.columns) {
            if (Array.isArray(cardData.columns)) {
                return cardData.columns.map((col) => ({
                    name: col.name || String(col),
                    type: col.type,
                    is_target: col.is_target || false
                }));
            }
        }
        // Try to infer from features if available
        if (cardData.features) {
            const features = cardData.features;
            if (typeof features === 'object') {
                for (const [key, value] of Object.entries(features)) {
                    const feature = value;
                    const lowerName = key.toLowerCase();
                    columns.push({
                        name: key,
                        type: feature?.dtype || feature?.type,
                        is_target: lowerName.includes("label") ||
                            lowerName.includes("target") ||
                            lowerName === "y"
                    });
                }
            }
        }
        return columns;
    }
}
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import Database from "better-sqlite3";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
export class MetadataStore {
|
|
5
|
+
db;
|
|
6
|
+
constructor(dbPath) {
|
|
7
|
+
// Ensure data directory exists
|
|
8
|
+
const dir = path.dirname(dbPath);
|
|
9
|
+
if (!fs.existsSync(dir)) {
|
|
10
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
11
|
+
}
|
|
12
|
+
this.db = new Database(dbPath);
|
|
13
|
+
this.init();
|
|
14
|
+
this.migrate();
|
|
15
|
+
}
|
|
16
|
+
migrate() {
|
|
17
|
+
// Add install_path if missing
|
|
18
|
+
try {
|
|
19
|
+
this.db.exec("ALTER TABLE datasets ADD COLUMN install_path TEXT");
|
|
20
|
+
console.log("[MetadataStore] Migrated: Added install_path column");
|
|
21
|
+
}
|
|
22
|
+
catch (e) {
|
|
23
|
+
// Probably already exists
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
/**
 * Create the full schema idempotently, apply additive column migrations to
 * pre-existing tables, and (re)build indexes. Safe to call on every startup.
 */
init() {
    // Create table with all columns if it doesn't exist
    this.db.exec(`
    CREATE TABLE IF NOT EXISTS datasets (
      id TEXT PRIMARY KEY,
      source TEXT, -- 'huggingface', 'kaggle', 'uci', etc.
      name TEXT,
      description TEXT,
      downloads INTEGER,
      likes INTEGER,
      stars INTEGER,
      tags TEXT, -- JSON string
      license_id TEXT,
      license_category TEXT,
      quality_score INTEGER,
      has_train_split BOOLEAN,
      total_examples INTEGER,
      total_size_mb REAL,
      is_safe_source BOOLEAN,
      is_structured BOOLEAN,
      last_updated TEXT,
      quality_warnings TEXT, -- JSON array string
      metadata_json TEXT, -- Full metadata as JSON
      install_path TEXT -- Path to locally installed data
    );

    CREATE TABLE IF NOT EXISTS jobs (
      id TEXT PRIMARY KEY,
      type TEXT,
      status TEXT,
      priority INTEGER DEFAULT 0,
      progress INTEGER DEFAULT 0,
      status_text TEXT,
      result_url TEXT,
      error TEXT,
      attempts INTEGER DEFAULT 0,
      max_attempts INTEGER DEFAULT 3,
      created_at TEXT,
      updated_at TEXT,
      metadata TEXT
    );

    CREATE TABLE IF NOT EXISTS local_files (
      id TEXT PRIMARY KEY,
      local_path TEXT,
      status TEXT, -- 'downloading', 'completed', 'failed'
      size_bytes INTEGER,
      last_checked TEXT,
      error TEXT
    );

    CREATE TABLE IF NOT EXISTS jobs_archive (
      id TEXT PRIMARY KEY,
      type TEXT,
      status TEXT,
      priority INTEGER DEFAULT 0,
      progress INTEGER DEFAULT 0,
      status_text TEXT,
      result_url TEXT,
      error TEXT,
      attempts INTEGER DEFAULT 0,
      max_attempts INTEGER DEFAULT 3,
      created_at TEXT,
      updated_at TEXT,
      metadata TEXT
    );
    `);
    // Migrate existing tables: add new columns if they don't exist.
    // CREATE TABLE IF NOT EXISTS does nothing for databases created by older
    // versions, so missing columns must be added via ALTER TABLE.
    const tableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
    const existingColumns = new Set(tableInfo.map(col => col.name));
    const migrations = [
        { column: "stars", type: "INTEGER DEFAULT 0" },
        { column: "total_examples", type: "INTEGER" },
        { column: "total_size_mb", type: "REAL" },
        { column: "is_safe_source", type: "BOOLEAN DEFAULT 1" },
        { column: "is_structured", type: "BOOLEAN DEFAULT 0" },
        { column: "domain", type: "TEXT" },
        { column: "source", type: "TEXT DEFAULT 'huggingface'" },
        { column: "quality_warnings", type: "TEXT" }
    ];
    for (const migration of migrations) {
        if (!existingColumns.has(migration.column)) {
            console.error(`[MetadataStore] Migrating: adding column ${migration.column}`);
            this.db.exec(`ALTER TABLE datasets ADD COLUMN ${migration.column} ${migration.type}`);
        }
    }
    // Job migrations (same additive pattern for the jobs table)
    const jobTableInfo = this.db.prepare("PRAGMA table_info(jobs)").all();
    const existingJobColumns = new Set(jobTableInfo.map(col => col.name));
    const jobMigrations = [
        { column: "priority", type: "INTEGER DEFAULT 0" },
        { column: "attempts", type: "INTEGER DEFAULT 0" },
        { column: "max_attempts", type: "INTEGER DEFAULT 3" }
    ];
    for (const migration of jobMigrations) {
        if (!existingJobColumns.has(migration.column)) {
            console.error(`[MetadataStore] Migrating Job: adding column ${migration.column}`);
            this.db.exec(`ALTER TABLE jobs ADD COLUMN ${migration.column} ${migration.type}`);
        }
    }
    // Get updated column list after migrations so index creation can skip
    // columns that (for whatever reason) still don't exist.
    const updatedTableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
    const allColumns = new Set(updatedTableInfo.map(col => col.name));
    // Create indexes (only if columns exist)
    const indexes = [
        { name: "idx_license_category", table: "datasets", column: "license_category" },
        { name: "idx_quality_score", table: "datasets", column: "quality_score" },
        { name: "idx_downloads", table: "datasets", column: "downloads" },
        { name: "idx_has_train_split", table: "datasets", column: "has_train_split" },
        { name: "idx_is_safe_source", table: "datasets", column: "is_safe_source" },
        { name: "idx_is_structured", table: "datasets", column: "is_structured" },
        { name: "idx_domain", table: "datasets", column: "domain" },
        { name: "idx_jobs_status", table: "jobs", column: "status" },
        { name: "idx_jobs_created", table: "jobs", column: "created_at" },
        { name: "idx_jobs_archive_created", table: "jobs_archive", column: "created_at" }
    ];
    for (const idx of indexes) {
        // For datasets table, check if column exists first
        if (idx.table === "datasets" && !allColumns.has(idx.column))
            continue;
        try {
            this.db.exec(`CREATE INDEX IF NOT EXISTS ${idx.name} ON ${idx.table}(${idx.column})`);
        }
        catch (e) {
            // Ignore index errors — indexes are an optimization, not required
            // for correctness.
        }
    }
}
|
|
154
|
+
/**
 * Upsert one dataset row. Denormalized filter columns (downloads, license,
 * quality, etc.) are written alongside the full record serialized into
 * `metadata_json`, which is what reads deserialize.
 *
 * Skeleton records (is_incomplete) never overwrite a previously hydrated
 * row — for those, only the volatile stats are refreshed.
 */
saveDataset(dataset) {
    // If the incoming dataset is incomplete, check if we already have a complete one
    if (dataset.is_incomplete) {
        const existing = this.getDataset(dataset.id);
        if (existing && !existing.is_incomplete) {
            // Already have better data, only update stats
            const updateStats = this.db.prepare(`
        UPDATE datasets SET
          downloads = ?,
          likes = ?,
          stars = ?,
          last_updated = ?
        WHERE id = ?
      `);
            updateStats.run(dataset.downloads, dataset.likes, dataset.stars || 0, dataset.last_updated, dataset.id);
            return;
        }
    }
    const stmt = this.db.prepare(`
      INSERT INTO datasets (
        id, source, name, description, downloads, likes, stars, tags,
        license_id, license_category, quality_score,
        has_train_split, total_examples, total_size_mb,
        is_safe_source, is_structured, last_updated,
        quality_warnings, metadata_json, install_path
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        source=excluded.source,
        name=excluded.name,
        description=excluded.description,
        downloads=excluded.downloads,
        likes=excluded.likes,
        stars=excluded.stars,
        tags=excluded.tags,
        license_id=excluded.license_id,
        license_category=excluded.license_category,
        quality_score=excluded.quality_score,
        has_train_split=excluded.has_train_split,
        total_examples=excluded.total_examples,
        total_size_mb=excluded.total_size_mb,
        is_safe_source=excluded.is_safe_source,
        is_structured=excluded.is_structured,
        last_updated=excluded.last_updated,
        quality_warnings=excluded.quality_warnings,
        metadata_json=excluded.metadata_json,
        install_path=excluded.install_path
    `);
    // Positional args must match the INSERT column order exactly.
    // Booleans are stored as 0/1; arrays and the full record as JSON strings.
    stmt.run(dataset.id, dataset.source, dataset.name, dataset.description, dataset.downloads, dataset.likes, dataset.stars || 0, JSON.stringify(dataset.tags), dataset.license.id, dataset.license.category, dataset.quality_score, dataset.has_train_split ? 1 : 0, dataset.total_examples, dataset.total_size_mb || null, dataset.is_safe_source ? 1 : 0, dataset.is_structured ? 1 : 0, dataset.last_updated, JSON.stringify(dataset.quality_warnings || []), JSON.stringify(dataset), dataset.install_path || null);
}
|
|
203
|
+
updateInstallPath(id, path) {
|
|
204
|
+
this.db.prepare("UPDATE datasets SET install_path = ? WHERE id = ?").run(path, id);
|
|
205
|
+
}
|
|
206
|
+
getDataset(id) {
|
|
207
|
+
const row = this.db.prepare("SELECT metadata_json, install_path FROM datasets WHERE id = ?").get(id);
|
|
208
|
+
if (!row)
|
|
209
|
+
return null;
|
|
210
|
+
const metadata = JSON.parse(row.metadata_json);
|
|
211
|
+
metadata.install_path = row.install_path || undefined;
|
|
212
|
+
return metadata;
|
|
213
|
+
}
|
|
214
|
+
getAllDatasets() {
|
|
215
|
+
const rows = this.db.prepare("SELECT metadata_json FROM datasets").all();
|
|
216
|
+
return rows.map(r => JSON.parse(r.metadata_json));
|
|
217
|
+
}
|
|
218
|
+
/**
 * Open an explicit SQLite transaction; pair with commit() or rollback().
 */
beginTransaction() {
  this.db.exec("BEGIN");
}
|
|
221
|
+
/**
 * Commit the transaction opened by beginTransaction().
 */
commit() {
  this.db.exec("COMMIT");
}
|
|
224
|
+
/**
 * Discard the transaction opened by beginTransaction().
 */
rollback() {
  this.db.exec("ROLLBACK");
}
|
|
227
|
+
search(options) {
|
|
228
|
+
let query = "SELECT metadata_json FROM datasets WHERE 1=1";
|
|
229
|
+
const params = [];
|
|
230
|
+
if (options.licenseCategories && options.licenseCategories.length > 0) {
|
|
231
|
+
const placeholders = options.licenseCategories.map(() => "?").join(",");
|
|
232
|
+
query += ` AND license_category IN (${placeholders})`;
|
|
233
|
+
params.push(...options.licenseCategories);
|
|
234
|
+
}
|
|
235
|
+
if (options.minQualityScore !== undefined) {
|
|
236
|
+
query += " AND quality_score >= ?";
|
|
237
|
+
params.push(options.minQualityScore);
|
|
238
|
+
}
|
|
239
|
+
query += " ORDER BY quality_score DESC";
|
|
240
|
+
if (options.limit) {
|
|
241
|
+
query += " LIMIT ?";
|
|
242
|
+
params.push(options.limit);
|
|
243
|
+
}
|
|
244
|
+
const rows = this.db.prepare(query).all(...params);
|
|
245
|
+
return rows.map(r => JSON.parse(r.metadata_json));
|
|
246
|
+
}
|
|
247
|
+
/**
 * Insert or update a job row, keyed on job id (SQLite UPSERT).
 * On conflict every mutable column is refreshed from the incoming job;
 * `id`, `type`, and `created_at` are only written on first insert.
 * @param {object} job - Job record; optional fields (result_url, error,
 *   metadata) are stored as NULL when absent.
 */
saveJob(job) {
  const upsert = this.db.prepare(`
    INSERT INTO jobs (id, type, status, priority, progress, status_text, result_url, error, attempts, max_attempts, created_at, updated_at, metadata)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(id) DO UPDATE SET
      status=excluded.status,
      priority=excluded.priority,
      progress=excluded.progress,
      status_text=excluded.status_text,
      result_url=excluded.result_url,
      error=excluded.error,
      attempts=excluded.attempts,
      max_attempts=excluded.max_attempts,
      updated_at=excluded.updated_at,
      metadata=excluded.metadata
  `);
  // Argument order must match the INSERT column list above.
  upsert.run(job.id, job.type, job.status, job.priority, job.progress, job.status_text, job.result_url || null, job.error || null, job.attempts, job.max_attempts, job.created_at, job.updated_at, job.metadata || null);
}
|
|
265
|
+
getJob(id) {
|
|
266
|
+
const row = this.db.prepare("SELECT * FROM jobs WHERE id = ?").get(id);
|
|
267
|
+
return row || null;
|
|
268
|
+
}
|
|
269
|
+
/**
 * Register or update a local file record for a dataset (UPSERT on id).
 * `last_checked` is refreshed to the current UTC timestamp on every call.
 * @param {string} id - Dataset identifier.
 * @param {string} localPath - Path of the downloaded file/directory.
 * @param {string} status - Download status value stored as-is.
 * @param {number} [sizeBytes] - Size on disk; defaults to 0 when absent.
 * @param {string} [error] - Failure detail; stored as NULL when absent.
 */
registerDownload(id, localPath, status, sizeBytes, error) {
  const upsert = this.db.prepare(`
    INSERT INTO local_files (id, local_path, status, size_bytes, last_checked, error)
    VALUES (?, ?, ?, ?, ?, ?)
    ON CONFLICT(id) DO UPDATE SET
      local_path=excluded.local_path,
      status=excluded.status,
      size_bytes=excluded.size_bytes,
      last_checked=excluded.last_checked,
      error=excluded.error
  `);
  upsert.run(id, localPath, status, sizeBytes || 0, new Date().toISOString(), error || null);
}
|
|
285
|
+
/**
|
|
286
|
+
* Get download status and path for a dataset
|
|
287
|
+
*/
|
|
288
|
+
getDownloadStatus(id) {
|
|
289
|
+
return this.db.prepare("SELECT * FROM local_files WHERE id = ?").get(id);
|
|
290
|
+
}
|
|
291
|
+
/**
 * Archive old jobs to the cold storage table.
 * Runs copy + delete atomically inside a better-sqlite3 transaction, so a
 * failure leaves both tables untouched. `INSERT OR IGNORE` keeps any row
 * already present in jobs_archive rather than overwriting it.
 * @param days Age in days; jobs created before (now - days) are archived.
 * @returns Number of rows removed from the active jobs table.
 */
archiveOldJobs(days) {
  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - days);
  const cutoffStr = cutoff.toISOString();
  // db.transaction() returns a wrapped function; the trailing () invokes it.
  return this.db.transaction(() => {
    // Copy to archive
    this.db.prepare(`
      INSERT OR IGNORE INTO jobs_archive
      SELECT * FROM jobs
      WHERE created_at < ?
    `).run(cutoffStr);
    // Delete from active jobs
    const info = this.db.prepare(`
      DELETE FROM jobs
      WHERE created_at < ?
    `).run(cutoffStr);
    return info.changes;
  })();
}
|
|
314
|
+
/**
|
|
315
|
+
* Perform database maintenance (VACUUM, ANALYZE).
|
|
316
|
+
*/
|
|
317
|
+
optimize() {
|
|
318
|
+
console.log("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
|
|
319
|
+
this.db.exec("VACUUM");
|
|
320
|
+
this.db.exec("ANALYZE");
|
|
321
|
+
}
|
|
322
|
+
/**
 * Release the underlying SQLite connection handle.
 */
close() {
  this.db.close();
}
|
|
325
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Intentionally empty export: marks this compiled type-only module as an ES module.
export {};
|