@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
package/build/metadata/scraper.js +353 -0
@@ -0,0 +1,353 @@
+ import { listDatasets, datasetInfo } from "@huggingface/hub";
+ import { categorizeLicense } from "./license.js";
+ import { calculateQualityScore } from "./quality.js";
+ import { classifyDomain } from "./domain.js";
+ import { retryWithBackoff, delayBetweenRequests } from "./rate-limiter.js";
+ export class HuggingFaceScraper {
+     /**
+      * Bulk discovery: fetch many datasets quickly without deep details.
+      * Hits the 25k target in minutes.
+      */
+     async scrapeBulk(limit = 1000, domainFilter) {
+         const filterMsg = domainFilter ? `, domain: ${domainFilter}` : "";
+         console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
+         const results = [];
+         let processed = 0;
+         try {
+             const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+             for await (const ds of listDatasets({
+                 limit: limit,
+                 additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
+                 search: { query: domainFilter },
+                 ...(hfToken ? { accessToken: hfToken } : {})
+             })) {
+                 if (results.length >= limit)
+                     break;
+                 processed++;
+                 if (processed % 1000 === 0) {
+                     console.error(`[Bulk Scraper] Found ${processed} datasets...`);
+                 }
+                 const repoId = ds.name || ds.id;
+                 const description = ds.description || "";
+                 const tags = ds.tags || [];
+                 // Filter out non-repo IDs (hex IDs) that lack a namespace
+                 if (!repoId.includes("/") && /^[a-f0-9]{24}$/.test(repoId))
+                     continue;
+                 // Hard skip: empty or very short description (cannot be searched semantically)
+                 if (!description || description.length < 20)
+                     continue;
+                 const createdAt = ds.createdAt;
+                 const downloads = ds.downloadsAllTime || ds.downloads || 0;
+                 const task = this.extractTask(tags);
+                 // Create "skeleton" metadata
+                 const metadata = {
+                     id: repoId,
+                     source: "huggingface",
+                     name: repoId.split("/").pop() || repoId,
+                     description: description,
+                     quality_warnings: ["Incomplete metadata: run hydration to get full details"],
+                     downloads: downloads,
+                     likes: ds.likes || 0,
+                     stars: 0,
+                     tags: tags,
+                     last_updated: createdAt instanceof Date ? createdAt.toISOString() : new Date().toISOString(),
+                     task: task,
+                     domain: classifyDomain(description, tags, repoId, task),
+                     languages: this.extractLanguages(tags),
+                     splits: [],
+                     license: {
+                         id: "unknown",
+                         category: "unknown",
+                         usage_restrictions: [],
+                         warnings: ["License not verified yet"]
+                     },
+                     quality_score: 10, // Default low score for skeleton records
+                     download_url: `https://huggingface.co/datasets/${repoId}`,
+                     total_examples: 0,
+                     is_structured: false,
+                     has_target_column: false,
+                     is_safe_source: true, // Default to true; verified during hydration
+                     has_personal_data: false,
+                     is_paywalled: false,
+                     is_scraped_web_data: false,
+                     uses_https: true,
+                     has_train_split: false,
+                     has_test_split: false,
+                     has_validation_split: false,
+                     description_length: description.length,
+                     has_readme: false,
+                     is_incomplete: true // Flag for Phase 2 hydration
+                 };
+                 results.push(metadata);
+             }
+         }
+         catch (e) {
+             console.error("[Bulk Scraper] Error:", e.message);
+         }
+         return results;
+     }
+     async scrape(limit = 100, applyMVPFilters = true, domainFilter /* Optional: filter by domain (medicine, healthcare, security, etc.) */) {
+         const filterMsg = domainFilter ? `, domain: ${domainFilter}` : "";
+         console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
+         const results = [];
+         let processed = 0;
+         let skippedMVP = 0;
+         try {
+             // Fetch more datasets than requested to account for filtering
+             const fetchLimit = applyMVPFilters ? limit * 30 : limit * 10;
+             const CONCURRENCY = 25; // Increased for high-volume indexing
+             const queue = [];
+             // Support a Hugging Face token from the environment
+             const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+             // Delay between batches to avoid rate limits
+             const BATCH_DELAY = 1000; // 1 second delay between batches
+             for await (const ds of listDatasets({
+                 limit: fetchLimit,
+                 additionalFields: ["description", "tags"],
+                 search: { query: domainFilter },
+                 ...(hfToken ? { accessToken: hfToken } : {})
+             })) {
+                 if (results.length >= limit)
+                     break;
+                 processed++;
+                 // Pause every 100 datasets to avoid aggressive rate limits
+                 if (processed % 100 === 0 && processed > 0) {
+                     await delayBetweenRequests(hfToken ? 500 : 2000);
+                 }
+                 const repoId = ds.name || ds.id;
+                 // Filter out non-repo IDs (hex IDs) that lack a namespace
+                 if (!repoId.includes("/")) {
+                     if (/^[a-f0-9]{24}$/.test(repoId))
+                         continue;
+                 }
+                 const downloads = ds.downloads || 0;
+                 const tags = ds.tags || [];
+                 const description = ds.description || "";
+                 if (processed % 100 === 0) {
+                     console.error(`[HF Scraper] Processed ${processed}, found ${results.length}, workers: ${queue.length}...`);
+                 }
+                 // Hard skip: empty or very short description (cannot be searched semantically)
+                 if (!description || description.length < 20)
+                     continue;
+                 // Process the dataset with bounded concurrency and retry logic
+                 const processTask = (async () => {
+                     try {
+                         const fullInfo = await retryWithBackoff(() => datasetInfo({
+                             name: repoId,
+                             additionalFields: ["cardData"],
+                             ...(hfToken ? { accessToken: hfToken } : {})
+                         }), {
+                             maxRetries: 3,
+                             initialDelay: 2000, // Start with 2 seconds for the HF API
+                             maxDelay: 30000 // Cap at 30 seconds
+                         });
+                         const splits = fullInfo.splits?.map((s) => ({
+                             name: s.name,
+                             num_examples: s.numExamples || 0,
+                             size_bytes: s.sizeBytes
+                         })) || [];
+                         const totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
+                         const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
+                         const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
+                         const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
+                         const licenseTag = tags.find(t => t.startsWith("license:"));
+                         const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
+                         const cardData = fullInfo.cardData || {};
+                         const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
+                         const license = categorizeLicense(licenseId, licenseUrl);
+                         if (license.category === "restricted") {
+                             skippedMVP++;
+                             return;
+                         }
+                         const warnings = [];
+                         if (totalExamples < 50)
+                             warnings.push("Dataset has very few examples (< 50)");
+                         if (description.length < 100)
+                             warnings.push("Short description; results may be less relevant");
+                         const lastUpdated = ds.updatedAt || fullInfo.updatedAt;
+                         if (lastUpdated) {
+                             const updateDate = new Date(lastUpdated);
+                             const fourYearsAgo = new Date();
+                             fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
+                             if (updateDate < fourYearsAgo) {
+                                 warnings.push(`Stale data: last updated ${updateDate.getFullYear()}`);
+                             }
+                         }
+                         if (splits.length === 0)
+                             warnings.push("No data splits found; could be a non-standard format");
+                         const descriptionLower = description.toLowerCase();
+                         const tagsLower = tags.map(t => t.toLowerCase()).join(" ");
+                         const hasPersonalData = descriptionLower.includes("personal data") ||
+                             descriptionLower.includes("gdpr") ||
+                             tagsLower.includes("personal-data");
+                         const isPaywalled = descriptionLower.includes("paywall") ||
+                             descriptionLower.includes("paid");
+                         const isScrapedWebData = descriptionLower.includes("scraped") ||
+                             tagsLower.includes("scraped");
+                         const isSafeSource = !isScrapedWebData && !hasPersonalData && !isPaywalled;
+                         if (!isSafeSource)
+                             warnings.push("Contains potentially sensitive or paywalled data sources");
+                         const format = this.extractFormat(tags, cardData);
+                         const columns = this.extractColumns(cardData, splits);
+                         const task = this.extractTask(tags);
+                         const domain = classifyDomain(description, tags, repoId, task);
+                         if (domainFilter && domain !== domainFilter)
+                             return;
+                         const metadata = {
+                             id: repoId,
+                             source: "huggingface",
+                             name: repoId.split("/").pop() || repoId,
+                             description: description,
+                             quality_warnings: warnings,
+                             downloads: downloads,
+                             likes: ds.likes || 0,
+                             stars: fullInfo.stars || 0,
+                             tags: tags,
+                             last_updated: lastUpdated?.toISOString() || new Date().toISOString(),
+                             task: task,
+                             domain: domain,
+                             languages: this.extractLanguages(tags),
+                             splits,
+                             license,
+                             quality_score: calculateQualityScore({
+                                 downloads,
+                                 likes: ds.likes || 0,
+                                 hasDescription: true,
+                                 descriptionLength: description.length,
+                                 hasTrainSplit: splits.some((s) => s.name === "train"),
+                                 hasTestSplit: splits.some((s) => s.name === "test"),
+                                 lastUpdated: lastUpdated?.toISOString() || new Date().toISOString(),
+                                 licenseCategory: license.category
+                             }),
+                             download_url: `https://huggingface.co/datasets/${repoId}`,
+                             format,
+                             total_examples: totalExamples,
+                             total_size_bytes: totalSizeBytes,
+                             total_size_mb: totalSizeMB,
+                             columns,
+                             is_structured: columns.length > 0 && splits.some((s) => s.name === "train"),
+                             has_target_column: columns.some(c => c.is_target === true),
+                             is_safe_source: isSafeSource,
+                             has_personal_data: hasPersonalData,
+                             is_paywalled: isPaywalled,
+                             is_scraped_web_data: isScrapedWebData,
+                             uses_https: true,
+                             has_train_split: splits.some((s) => s.name === "train"),
+                             has_test_split: splits.some((s) => s.name === "test"),
+                             has_validation_split: hasValidationSplit,
+                             description_length: description.length,
+                             has_readme: !!(cardData.readme || cardData.readme_content)
+                         };
+                         results.push(metadata);
+                     }
+                     catch (e) {
+                         // Log rate-limit errors; silently skip all other errors
+                         if (e?.status === 429 || e?.message?.includes('rate limit')) {
+                             console.error(`[HF Scraper] Rate limit error for ${repoId}: ${e.message}`);
+                         }
+                     }
+                 })();
+                 queue.push(processTask);
+                 processTask.then(() => {
+                     const idx = queue.indexOf(processTask);
+                     if (idx !== -1)
+                         queue.splice(idx, 1);
+                 });
+                 if (queue.length >= CONCURRENCY) {
+                     await Promise.race(queue);
+                 }
+             }
+             // Wait for the remaining tasks
+             await Promise.all(queue);
+         }
+         catch (e) {
+             // Surface rate-limit errors with actionable messaging
+             if (e?.status === 429 || e?.message?.includes('rate limit')) {
+                 console.error("Scraping failed due to rate limit:", e.message);
+                 console.error("Consider setting the HF_TOKEN environment variable to increase rate limits");
+             }
+             else {
+                 console.error("Scraping failed overall:", e.message);
+             }
+         }
+         console.error(`[HF Scraper] Complete: ${results.length} datasets scraped, ${skippedMVP} skipped`);
+         // Sort by downloads, descending
+         return results.sort((a, b) => b.downloads - a.downloads);
+     }
+     extractTask(tags) {
+         const taskTags = [
+             "text-classification",
+             "token-classification",
+             "question-answering",
+             "translation",
+             "summarization",
+             "text-generation",
+             "image-classification",
+             "object-detection",
+             "named-entity-recognition",
+             "sentiment-analysis",
+             "machine-translation"
+         ];
+         return tags.find(t => taskTags.includes(t)) || "unknown";
+     }
+     extractLanguages(tags) {
+         const langs = tags
+             .filter(t => t.startsWith("language:"))
+             .map(t => t.replace("language:", ""));
+         const isoLangs = tags.filter(t => t.length === 2 && /^[a-z]{2}$/.test(t));
+         return Array.from(new Set([...langs, ...isoLangs]));
+     }
+     extractFormat(tags, cardData) {
+         // Check tags first
+         const formatTag = tags.find(t => t.startsWith("format:"));
+         if (formatTag) {
+             return formatTag.replace("format:", "").toUpperCase();
+         }
+         // Then check cardData
+         if (cardData.format) {
+             return String(cardData.format).toUpperCase();
+         }
+         // Otherwise infer from other tags
+         if (tags.includes("parquet"))
+             return "PARQUET";
+         if (tags.includes("csv"))
+             return "CSV";
+         if (tags.includes("json"))
+             return "JSON";
+         if (tags.includes("arrow"))
+             return "ARROW";
+         return undefined;
+     }
+     extractColumns(cardData, splits) {
+         const columns = [];
+         // Try to get columns from cardData
+         if (cardData.columns) {
+             if (Array.isArray(cardData.columns)) {
+                 return cardData.columns.map((col) => ({
+                     name: col.name || String(col),
+                     type: col.type,
+                     is_target: col.is_target || false
+                 }));
+             }
+         }
+         // Otherwise try to infer columns from features, if available
+         if (cardData.features) {
+             const features = cardData.features;
+             if (typeof features === 'object') {
+                 for (const [key, value] of Object.entries(features)) {
+                     const feature = value;
+                     columns.push({
+                         name: key,
+                         type: feature?.dtype || feature?.type,
+                         is_target: key.toLowerCase().includes("label") ||
+                             key.toLowerCase().includes("target") ||
+                             key.toLowerCase().includes("y")
+                     });
+                 }
+             }
+         }
+         return columns;
+     }
+ }
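
Together these two methods form a two-phase pipeline: scrapeBulk() sweeps the Hugging Face listing once and emits cheap "skeleton" records flagged is_incomplete, while scrape() hydrates candidates with one datasetInfo() call each behind a 25-task concurrency gate with retry and backoff. A minimal driver sketch, assuming the build/ layout listed above (hypothetical calling code, not the package's actual entry point; build/scripts/massive-scrape.js presumably plays this role):

// Hypothetical driver for the two-phase pipeline above -- a sketch, not the
// package's CLI. Node 18+ ESM, run from the package root.
import { HuggingFaceScraper } from "./build/metadata/scraper.js";

const scraper = new HuggingFaceScraper();

// Phase 1: bulk discovery. Fast skeleton records flagged is_incomplete,
// with license "unknown" and a default quality_score of 10.
const skeletons = await scraper.scrapeBulk(5000, "medicine");

// Phase 2: hydration. One datasetInfo() call per candidate, 25 in flight,
// restricted licenses dropped; set HF_TOKEN to raise the API rate limits.
const hydrated = await scraper.scrape(100, true, "medicine");

console.error(`skeletons: ${skeletons.length}, hydrated: ${hydrated.length}`);

Note the concurrency design in scrape(): Promise.race over the queue of live tasks is what caps in-flight datasetInfo() calls at CONCURRENCY without pulling in a worker-pool dependency.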
package/build/metadata/store.js +325 -0
@@ -0,0 +1,325 @@
+ import Database from "better-sqlite3";
+ import path from "path";
+ import fs from "fs";
+ export class MetadataStore {
+     db;
+     constructor(dbPath) {
+         // Ensure the data directory exists
+         const dir = path.dirname(dbPath);
+         if (!fs.existsSync(dir)) {
+             fs.mkdirSync(dir, { recursive: true });
+         }
+         this.db = new Database(dbPath);
+         this.init();
+         this.migrate();
+     }
+     migrate() {
+         // Add install_path if missing
+         try {
+             this.db.exec("ALTER TABLE datasets ADD COLUMN install_path TEXT");
+             console.log("[MetadataStore] Migrated: Added install_path column");
+         }
+         catch (e) {
+             // Column probably exists already
+         }
+     }
+     init() {
+         // Create the tables with all columns if they don't exist
+         this.db.exec(`
+             CREATE TABLE IF NOT EXISTS datasets (
+                 id TEXT PRIMARY KEY,
+                 source TEXT, -- 'huggingface', 'kaggle', 'uci', etc.
+                 name TEXT,
+                 description TEXT,
+                 downloads INTEGER,
+                 likes INTEGER,
+                 stars INTEGER,
+                 tags TEXT, -- JSON string
+                 license_id TEXT,
+                 license_category TEXT,
+                 quality_score INTEGER,
+                 has_train_split BOOLEAN,
+                 total_examples INTEGER,
+                 total_size_mb REAL,
+                 is_safe_source BOOLEAN,
+                 is_structured BOOLEAN,
+                 last_updated TEXT,
+                 quality_warnings TEXT, -- JSON array string
+                 metadata_json TEXT, -- Full metadata as JSON
+                 install_path TEXT -- Path to locally installed data
+             );
+
+             CREATE TABLE IF NOT EXISTS jobs (
+                 id TEXT PRIMARY KEY,
+                 type TEXT,
+                 status TEXT,
+                 priority INTEGER DEFAULT 0,
+                 progress INTEGER DEFAULT 0,
+                 status_text TEXT,
+                 result_url TEXT,
+                 error TEXT,
+                 attempts INTEGER DEFAULT 0,
+                 max_attempts INTEGER DEFAULT 3,
+                 created_at TEXT,
+                 updated_at TEXT,
+                 metadata TEXT
+             );
+
+             CREATE TABLE IF NOT EXISTS local_files (
+                 id TEXT PRIMARY KEY,
+                 local_path TEXT,
+                 status TEXT, -- 'downloading', 'completed', 'failed'
+                 size_bytes INTEGER,
+                 last_checked TEXT,
+                 error TEXT
+             );
+
+             CREATE TABLE IF NOT EXISTS jobs_archive (
+                 id TEXT PRIMARY KEY,
+                 type TEXT,
+                 status TEXT,
+                 priority INTEGER DEFAULT 0,
+                 progress INTEGER DEFAULT 0,
+                 status_text TEXT,
+                 result_url TEXT,
+                 error TEXT,
+                 attempts INTEGER DEFAULT 0,
+                 max_attempts INTEGER DEFAULT 3,
+                 created_at TEXT,
+                 updated_at TEXT,
+                 metadata TEXT
+             );
+         `);
+         // Migrate existing tables: add new columns if they don't exist
+         const tableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
+         const existingColumns = new Set(tableInfo.map(col => col.name));
+         const migrations = [
+             { column: "stars", type: "INTEGER DEFAULT 0" },
+             { column: "total_examples", type: "INTEGER" },
+             { column: "total_size_mb", type: "REAL" },
+             { column: "is_safe_source", type: "BOOLEAN DEFAULT 1" },
+             { column: "is_structured", type: "BOOLEAN DEFAULT 0" },
+             { column: "domain", type: "TEXT" },
+             { column: "source", type: "TEXT DEFAULT 'huggingface'" },
+             { column: "quality_warnings", type: "TEXT" }
+         ];
+         for (const migration of migrations) {
+             if (!existingColumns.has(migration.column)) {
+                 console.error(`[MetadataStore] Migrating: adding column ${migration.column}`);
+                 this.db.exec(`ALTER TABLE datasets ADD COLUMN ${migration.column} ${migration.type}`);
+             }
+         }
+         // Jobs-table migrations
+         const jobTableInfo = this.db.prepare("PRAGMA table_info(jobs)").all();
+         const existingJobColumns = new Set(jobTableInfo.map(col => col.name));
+         const jobMigrations = [
+             { column: "priority", type: "INTEGER DEFAULT 0" },
+             { column: "attempts", type: "INTEGER DEFAULT 0" },
+             { column: "max_attempts", type: "INTEGER DEFAULT 3" }
+         ];
+         for (const migration of jobMigrations) {
+             if (!existingJobColumns.has(migration.column)) {
+                 console.error(`[MetadataStore] Migrating Job: adding column ${migration.column}`);
+                 this.db.exec(`ALTER TABLE jobs ADD COLUMN ${migration.column} ${migration.type}`);
+             }
+         }
+         // Get the updated column list after migrations
+         const updatedTableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
+         const allColumns = new Set(updatedTableInfo.map(col => col.name));
+         // Create indexes (only where the indexed column exists)
+         const indexes = [
+             { name: "idx_license_category", table: "datasets", column: "license_category" },
+             { name: "idx_quality_score", table: "datasets", column: "quality_score" },
+             { name: "idx_downloads", table: "datasets", column: "downloads" },
+             { name: "idx_has_train_split", table: "datasets", column: "has_train_split" },
+             { name: "idx_is_safe_source", table: "datasets", column: "is_safe_source" },
+             { name: "idx_is_structured", table: "datasets", column: "is_structured" },
+             { name: "idx_domain", table: "datasets", column: "domain" },
+             { name: "idx_jobs_status", table: "jobs", column: "status" },
+             { name: "idx_jobs_created", table: "jobs", column: "created_at" },
+             { name: "idx_jobs_archive_created", table: "jobs_archive", column: "created_at" }
+         ];
+         for (const idx of indexes) {
+             // For the datasets table, check that the column exists first
+             if (idx.table === "datasets" && !allColumns.has(idx.column))
+                 continue;
+             try {
+                 this.db.exec(`CREATE INDEX IF NOT EXISTS ${idx.name} ON ${idx.table}(${idx.column})`);
+             }
+             catch (e) {
+                 // Ignore index errors
+             }
+         }
+     }
+     saveDataset(dataset) {
+         // If the incoming dataset is incomplete, check whether we already have a complete one
+         if (dataset.is_incomplete) {
+             const existing = this.getDataset(dataset.id);
+             if (existing && !existing.is_incomplete) {
+                 // We already have better data; only update the stats
+                 const updateStats = this.db.prepare(`
+                     UPDATE datasets SET
+                         downloads = ?,
+                         likes = ?,
+                         stars = ?,
+                         last_updated = ?
+                     WHERE id = ?
+                 `);
+                 updateStats.run(dataset.downloads, dataset.likes, dataset.stars || 0, dataset.last_updated, dataset.id);
+                 return;
+             }
+         }
+         const stmt = this.db.prepare(`
+             INSERT INTO datasets (
+                 id, source, name, description, downloads, likes, stars, tags,
+                 license_id, license_category, quality_score,
+                 has_train_split, total_examples, total_size_mb,
+                 is_safe_source, is_structured, last_updated,
+                 quality_warnings, metadata_json, install_path
+             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             ON CONFLICT(id) DO UPDATE SET
+                 source=excluded.source,
+                 name=excluded.name,
+                 description=excluded.description,
+                 downloads=excluded.downloads,
+                 likes=excluded.likes,
+                 stars=excluded.stars,
+                 tags=excluded.tags,
+                 license_id=excluded.license_id,
+                 license_category=excluded.license_category,
+                 quality_score=excluded.quality_score,
+                 has_train_split=excluded.has_train_split,
+                 total_examples=excluded.total_examples,
+                 total_size_mb=excluded.total_size_mb,
+                 is_safe_source=excluded.is_safe_source,
+                 is_structured=excluded.is_structured,
+                 last_updated=excluded.last_updated,
+                 quality_warnings=excluded.quality_warnings,
+                 metadata_json=excluded.metadata_json,
+                 install_path=excluded.install_path
+         `);
+         stmt.run(dataset.id, dataset.source, dataset.name, dataset.description, dataset.downloads, dataset.likes, dataset.stars || 0, JSON.stringify(dataset.tags), dataset.license.id, dataset.license.category, dataset.quality_score, dataset.has_train_split ? 1 : 0, dataset.total_examples, dataset.total_size_mb || null, dataset.is_safe_source ? 1 : 0, dataset.is_structured ? 1 : 0, dataset.last_updated, JSON.stringify(dataset.quality_warnings || []), JSON.stringify(dataset), dataset.install_path || null);
+     }
+     updateInstallPath(id, path) {
+         this.db.prepare("UPDATE datasets SET install_path = ? WHERE id = ?").run(path, id);
+     }
+     getDataset(id) {
+         const row = this.db.prepare("SELECT metadata_json, install_path FROM datasets WHERE id = ?").get(id);
+         if (!row)
+             return null;
+         const metadata = JSON.parse(row.metadata_json);
+         metadata.install_path = row.install_path || undefined;
+         return metadata;
+     }
+     getAllDatasets() {
+         const rows = this.db.prepare("SELECT metadata_json FROM datasets").all();
+         return rows.map(r => JSON.parse(r.metadata_json));
+     }
+     beginTransaction() {
+         this.db.exec("BEGIN");
+     }
+     commit() {
+         this.db.exec("COMMIT");
+     }
+     rollback() {
+         this.db.exec("ROLLBACK");
+     }
+     search(options) {
+         let query = "SELECT metadata_json FROM datasets WHERE 1=1";
+         const params = [];
+         if (options.licenseCategories && options.licenseCategories.length > 0) {
+             const placeholders = options.licenseCategories.map(() => "?").join(",");
+             query += ` AND license_category IN (${placeholders})`;
+             params.push(...options.licenseCategories);
+         }
+         if (options.minQualityScore !== undefined) {
+             query += " AND quality_score >= ?";
+             params.push(options.minQualityScore);
+         }
+         query += " ORDER BY quality_score DESC";
+         if (options.limit) {
+             query += " LIMIT ?";
+             params.push(options.limit);
+         }
+         const rows = this.db.prepare(query).all(...params);
+         return rows.map(r => JSON.parse(r.metadata_json));
+     }
+     saveJob(job) {
+         const upsert = this.db.prepare(`
+             INSERT INTO jobs (id, type, status, priority, progress, status_text, result_url, error, attempts, max_attempts, created_at, updated_at, metadata)
+             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             ON CONFLICT(id) DO UPDATE SET
+                 status=excluded.status,
+                 priority=excluded.priority,
+                 progress=excluded.progress,
+                 status_text=excluded.status_text,
+                 result_url=excluded.result_url,
+                 error=excluded.error,
+                 attempts=excluded.attempts,
+                 max_attempts=excluded.max_attempts,
+                 updated_at=excluded.updated_at,
+                 metadata=excluded.metadata
+         `);
+         upsert.run(job.id, job.type, job.status, job.priority, job.progress, job.status_text, job.result_url || null, job.error || null, job.attempts, job.max_attempts, job.created_at, job.updated_at, job.metadata || null);
+     }
+     getJob(id) {
+         const row = this.db.prepare("SELECT * FROM jobs WHERE id = ?").get(id);
+         return row || null;
+     }
+     /**
+      * Register or update a local file record.
+      */
+     registerDownload(id, localPath, status, sizeBytes, error) {
+         const upsert = this.db.prepare(`
+             INSERT INTO local_files (id, local_path, status, size_bytes, last_checked, error)
+             VALUES (?, ?, ?, ?, ?, ?)
+             ON CONFLICT(id) DO UPDATE SET
+                 local_path=excluded.local_path,
+                 status=excluded.status,
+                 size_bytes=excluded.size_bytes,
+                 last_checked=excluded.last_checked,
+                 error=excluded.error
+         `);
+         upsert.run(id, localPath, status, sizeBytes || 0, new Date().toISOString(), error || null);
+     }
+     /**
+      * Get the download status and path for a dataset.
+      */
+     getDownloadStatus(id) {
+         return this.db.prepare("SELECT * FROM local_files WHERE id = ?").get(id);
+     }
+     /**
+      * Archive old jobs to the cold-storage table.
+      * @param days Age in days
+      */
+     archiveOldJobs(days) {
+         const cutoff = new Date();
+         cutoff.setDate(cutoff.getDate() - days);
+         const cutoffStr = cutoff.toISOString();
+         return this.db.transaction(() => {
+             // Copy to the archive
+             this.db.prepare(`
+                 INSERT OR IGNORE INTO jobs_archive
+                 SELECT * FROM jobs
+                 WHERE created_at < ?
+             `).run(cutoffStr);
+             // Delete from the active jobs table
+             const info = this.db.prepare(`
+                 DELETE FROM jobs
+                 WHERE created_at < ?
+             `).run(cutoffStr);
+             return info.changes;
+         })();
+     }
+     /**
+      * Perform database maintenance (VACUUM, ANALYZE).
+      */
+     optimize() {
+         console.log("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
+         this.db.exec("VACUUM");
+         this.db.exec("ANALYZE");
+     }
+     close() {
+         this.db.close();
+     }
+ }
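
The store keeps the full metadata object as a metadata_json blob alongside denormalized columns (license_category, quality_score, and so on) that exist only for SQL filtering and indexing, so search() can run cheap predicates and still return rich objects. A usage sketch under that schema (hypothetical calling code; "permissive" is assumed to be one of categorizeLicense's output categories, which this diff does not show):

// Hypothetical wiring of scraper output into the store -- a sketch, not the
// package's actual pipeline code.
import { HuggingFaceScraper } from "./build/metadata/scraper.js";
import { MetadataStore } from "./build/metadata/store.js";

const store = new MetadataStore("./data/metadata.db");
const datasets = await new HuggingFaceScraper().scrapeBulk(500);

// Batch the upserts in one transaction; saveDataset() already refuses to let
// a skeleton record overwrite a previously hydrated row.
store.beginTransaction();
try {
    for (const ds of datasets) store.saveDataset(ds);
    store.commit();
}
catch (e) {
    store.rollback();
    throw e;
}

// Filtered search runs against the denormalized columns, then rehydrates
// the stored JSON blobs. "permissive" is an assumed license category.
const top = store.search({
    licenseCategories: ["permissive"],
    minQualityScore: 50,
    limit: 10
});
console.error(`stored ${datasets.length}, matches: ${top.length}`);
store.close();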
@@ -0,0 +1 @@
+ export {};
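
The many one-line types.js files in the build output (metadata/types.js, jobs/types.js, and so on) all compile to this same empty export because their TypeScript sources hold only type declarations, which the compiler erases. As a rough reconstruction from the fields used in scraper.js and store.js above (hypothetical, since the diff does not include the .ts sources), the metadata types might read:

// Hypothetical reconstruction of src/metadata/types.ts -- field names are
// taken from the objects built in scraper.js; the actual source is not shown.
export interface LicenseInfo {
    id: string;
    category: string; // e.g. "restricted" | "unknown", per categorizeLicense
    usage_restrictions: string[];
    warnings: string[];
}

export interface DatasetMetadata {
    id: string;
    source: string; // "huggingface", "kaggle", "uci", ...
    name: string;
    description: string;
    quality_warnings: string[];
    downloads: number;
    likes: number;
    stars: number;
    tags: string[];
    last_updated: string; // ISO-8601
    license: LicenseInfo;
    quality_score: number;
    install_path?: string; // set once the dataset is installed locally
    is_incomplete?: boolean; // skeleton record awaiting hydration
}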