@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
@@ -0,0 +1,49 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
// Path segments of the directory holding the Python adapter scripts.
// path.resolve() anchors them at the current working directory of the process.
const PYTHON_ADAPTER_SEGMENTS = ["src", "python"];
// Absolute path of the World Bank adapter script.
const WB_SCRIPT_PATH = path.resolve(...PYTHON_ADAPTER_SEGMENTS, "worldbank_adapter.py");
// Absolute path of the NASA adapter script.
const NASA_SCRIPT_PATH = path.resolve(...PYTHON_ADAPTER_SEGMENTS, "nasa_adapter.py");
5
/**
 * Scraper that delegates searches to the `worldbank_adapter.py` script.
 */
export class WorldBankScraper {
    /**
     * Runs the adapter script in search mode.
     *
     * @param {string} query - Search text passed to the adapter.
     * @param {number} [limit=10] - Maximum number of results to request.
     * @returns {Promise<any>} Whatever `runAdapter` resolves with.
     */
    async scrape(query, limit = 10) {
        const datasets = await runAdapter(WB_SCRIPT_PATH, query, limit);
        return datasets;
    }
}
10
/**
 * Scraper that delegates searches to the `nasa_adapter.py` script.
 */
export class NASAScraper {
    /**
     * Runs the adapter script in search mode.
     *
     * @param {string} query - Search text passed to the adapter.
     * @param {number} [limit=10] - Maximum number of results to request.
     * @returns {Promise<any>} Whatever `runAdapter` resolves with.
     */
    async scrape(query, limit = 10) {
        const datasets = await runAdapter(NASA_SCRIPT_PATH, query, limit);
        return datasets;
    }
}
15
/**
 * Runs a Python adapter script in "search" mode and returns its parsed output.
 *
 * The script is started as
 * `python <scriptPath> --action search --query <query> --limit <limit>` and is
 * expected to print one JSON document on stdout. Failures of the child process
 * (it cannot be started, it exits non-zero, its payload has an `error`
 * property, or its output is not valid JSON) are logged to stderr and resolve
 * to an empty array instead of rejecting.
 *
 * @param {string} scriptPath - Path of the adapter script to execute.
 * @param {string} query - Search text forwarded via `--query`.
 * @param {number} limit - Result cap forwarded via `--limit`.
 * @returns {Promise<any>} The parsed JSON payload, or `[]` on failure.
 */
async function runAdapter(scriptPath, query, limit) {
    const adapterName = path.basename(scriptPath);
    return new Promise((resolve) => {
        // "error" may or may not be followed by "close"; make sure we only
        // report and resolve once.
        let settled = false;
        const finish = (value) => {
            if (settled) {
                return;
            }
            settled = true;
            resolve(value);
        };
        const pythonProcess = spawn("python", [
            scriptPath,
            "--action", "search",
            "--query", query,
            "--limit", String(limit)
        ]);
        let output = "";
        let errorOutput = "";
        pythonProcess.stdout.on("data", (data) => { output += data.toString(); });
        pythonProcess.stderr.on("data", (data) => { errorOutput += data.toString(); });
        // Without this listener a failed spawn (for example no `python`
        // executable on PATH) surfaces as an uncaught "error" event that takes
        // the whole process down instead of yielding an empty result.
        pythonProcess.on("error", (err) => {
            if (settled) {
                return;
            }
            console.error(`[Adapter] ${adapterName} failed to start: ${err.message}`);
            finish([]);
        });
        pythonProcess.on("close", (code) => {
            if (settled) {
                return;
            }
            if (code !== 0) {
                console.error(`[Adapter] ${adapterName} exited with code ${code}: ${errorOutput}`);
                finish([]);
                return;
            }
            try {
                const results = JSON.parse(output);
                if (results.error) {
                    console.error(`[Adapter] ${adapterName} error: ${results.error}`);
                    finish([]);
                }
                else {
                    finish(results);
                }
            }
            catch (e) {
                console.error(`[Adapter] ${adapterName} JSON error: ${e.message}`);
                finish([]);
            }
        });
    });
}
@@ -0,0 +1,182 @@
1
+ import { categorizeLicense } from "./license.js";
2
+ import { calculateQualityScore } from "./quality.js";
3
+ import { classifyDomain } from "./domain.js";
4
+ import { rateLimitedFetch, delayBetweenRequests } from "./rate-limiter.js";
5
/**
 * Searches the Kaggle dataset listing endpoint and converts every hit into the
 * flat metadata record built by `transform`.
 */
export class KaggleMetadataScraper {
    username;
    key;
    /**
     * @param {string} username - Account name used for HTTP Basic authentication.
     * @param {string} key - API key used for HTTP Basic authentication.
     */
    constructor(username, key) {
        this.username = username;
        this.key = key;
    }
    /**
     * Fetches up to `limit` datasets matching `query`.
     *
     * Pages are requested one after another until `limit` is reached, a page
     * comes back short or empty, or pagination is disabled. Errors are logged
     * and whatever was collected so far is returned, so the promise does not
     * reject for request failures.
     *
     * @param {string} query - Search text.
     * @param {number} [limit=20] - Maximum number of records to return.
     * @param {boolean} [usePagination=true] - When false only the first page is fetched.
     * @returns {Promise<object[]>} At most `limit` transformed records.
     */
    async scrape(query, limit = 20, usePagination = true) {
        console.error(`[Kaggle] Searching for "${query}" (limit: ${limit}, pagination: ${usePagination})...`);
        const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
        const results = [];
        const MAX_PAGE_SIZE = 100; // Kaggle API max page size
        const pageSize = Math.min(limit, MAX_PAGE_SIZE);
        let page = 1;
        let totalFetched = 0;
        let hasMore = true;
        try {
            while (hasMore && totalFetched < limit) {
                const url = `https://www.kaggle.com/api/v1/datasets/list?search=${encodeURIComponent(query)}&page_size=${pageSize}&page=${page}`;
                console.error(`[Kaggle] Fetching page ${page} (${totalFetched}/${limit} datasets so far)...`);
                // Use rate-limited fetch with retry logic
                const response = await rateLimitedFetch(url, {
                    headers: {
                        'Authorization': `Basic ${auth}`,
                        'Content-Type': 'application/json'
                    }
                }, {
                    maxRetries: 3,
                    initialDelay: 2000, // Start with 2 seconds
                    maxDelay: 30000 // Max 30 seconds
                });
                const datasets = await response.json();
                // A non-array body (e.g. an error object) carries no datasets.
                if (!Array.isArray(datasets) || datasets.length === 0) {
                    hasMore = false;
                    break;
                }
                for (let i = 0; i < datasets.length; i++) {
                    // A later page can hold more entries than are still
                    // needed; stop as soon as the requested limit is reached.
                    if (totalFetched >= limit) {
                        break;
                    }
                    const ds = datasets[i];
                    try {
                        const metadata = this.transform(ds);
                        results.push(metadata);
                        totalFetched++;
                        console.error(`[Kaggle] Added: ${ds.ref} (${ds.downloadCount} downloads)`);
                        // Add small delay every 5 datasets
                        if ((i + 1) % 5 === 0 && i < datasets.length - 1) {
                            await delayBetweenRequests(500);
                        }
                    }
                    catch (e) {
                        // A record that cannot be transformed is skipped, not fatal.
                        console.error(`[Kaggle] ERROR: Failed to transform ${ds.ref}:`, e);
                    }
                }
                // A full page means there may be more; a short page is the last one.
                if (usePagination && datasets.length === pageSize && totalFetched < limit) {
                    page++;
                    // Add delay between pages to avoid rate limits
                    await delayBetweenRequests(1000);
                }
                else {
                    hasMore = false;
                }
            }
            console.error(`[Kaggle] Completed: ${results.length} datasets found for "${query}"`);
            return results;
        }
        catch (e) {
            // Handle rate limit errors specifically
            if (e?.status === 429 || e?.message?.includes('rate limit')) {
                console.error("[Kaggle] Rate limit error:", e.message);
                console.error("Consider adding delays between requests or reducing batch size");
            }
            else {
                console.error("[Kaggle] Scrape error:", e.message || e);
            }
            // Return partial results if we got some before the error
            if (results.length > 0) {
                console.error(`[Kaggle] Returning ${results.length} partial results before error`);
            }
            return results;
        }
    }
    /**
     * Converts one entry of the listing response into a metadata record.
     *
     * Reads `ref`, `title`, `description`, `tags[].name`, `licenseName`,
     * `size`, `lastUpdated`, `downloadCount` and `voteCount` from `ds`.
     * Split information is not available here, so a single placeholder split
     * named "data" is emitted and all split flags are false.
     *
     * @param {object} ds - Raw dataset entry from the listing response.
     * @returns {object} The metadata record.
     */
    transform(ds) {
        const repoId = ds.ref;
        const tags = ds.tags?.map(t => t.name) || [];
        const description = ds.description || "";
        const license = categorizeLicense(ds.licenseName);
        const warnings = [];
        // Kaggle doesn't give us splits in the list API easily
        const sizeBytes = this.parseSize(ds.size);
        const splits = [
            {
                name: "data",
                num_examples: 0,
                size_bytes: sizeBytes
            }
        ];
        // Megabytes rounded to two decimals.
        const totalSizeMB = sizeBytes ? Math.round(sizeBytes / (1024 * 1024) * 100) / 100 : 0;
        // Populate warnings
        if (description.length < 100)
            warnings.push("Short description; results may be less relevant");
        const lastUpdatedDate = new Date(ds.lastUpdated);
        const fourYearsAgo = new Date();
        fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
        // An unparseable date compares false here, so it raises no warning.
        if (lastUpdatedDate < fourYearsAgo) {
            warnings.push(`Stale data: Last updated ${lastUpdatedDate.getFullYear()}`);
        }
        warnings.push("No specific data splits identified (Kaggle API limitation)");
        // Classify domain
        const task = this.extractTask(tags);
        const domain = classifyDomain(description, tags, repoId, task);
        return {
            id: repoId,
            source: "kaggle",
            name: ds.title,
            description: description,
            quality_warnings: warnings,
            downloads: ds.downloadCount,
            likes: ds.voteCount,
            stars: 0,
            tags: tags,
            last_updated: ds.lastUpdated,
            task: task,
            domain: domain,
            languages: [],
            splits,
            license,
            quality_score: calculateQualityScore({
                downloads: ds.downloadCount,
                likes: ds.voteCount,
                hasDescription: description.length > 50,
                descriptionLength: description.length,
                hasTrainSplit: false,
                hasTestSplit: false,
                lastUpdated: ds.lastUpdated,
                licenseCategory: license.category
            }),
            download_url: `https://www.kaggle.com/datasets/${ds.ref}`,
            format: undefined,
            total_examples: 0,
            total_size_bytes: sizeBytes,
            total_size_mb: totalSizeMB,
            columns: [],
            is_structured: false,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: description.length,
            has_readme: true
        };
    }
    /**
     * Parses a human-readable size such as "12MB", "1.5 GB" or "512B" into bytes
     * (1 KB = 1024 bytes).
     *
     * @param {string|number|null|undefined} sizeStr - Size text; a finite
     *   non-negative number is taken to be a byte count already.
     * @returns {number} Size in bytes, or 0 when the value is missing or not understood.
     */
    parseSize(sizeStr) {
        if (typeof sizeStr === "number") {
            return Number.isFinite(sizeStr) && sizeStr > 0 ? sizeStr : 0;
        }
        if (typeof sizeStr !== "string" || sizeStr === "") {
            return 0;
        }
        // The unit prefix is optional so that plain-byte sizes ("512B") parse too.
        const match = sizeStr.trim().match(/^(\d+(?:\.\d+)?)\s*([KMGT]?B)$/i);
        if (!match)
            return 0;
        const value = parseFloat(match[1]);
        const unit = match[2].toUpperCase();
        switch (unit) {
            case 'KB': return value * 1024;
            case 'MB': return value * 1024 * 1024;
            case 'GB': return value * 1024 * 1024 * 1024;
            case 'TB': return value * 1024 * 1024 * 1024 * 1024;
            default: return value; // 'B'
        }
    }
    /**
     * Placeholder task extraction: tags are not inspected yet and every
     * dataset is reported as "unknown".
     *
     * @param {string[]} tags - Tag names of the dataset (currently unused).
     * @returns {string} Always "unknown".
     */
    extractTask(tags) {
        // Similar to HF but Kaggle tags might be different
        return "unknown";
    }
}
@@ -0,0 +1,68 @@
1
// Substrings that mark a license id as safe for general (including commercial) use.
const SAFE_KEYWORDS = [
    "mit",
    "apache",
    "bsd",
    "cc0",
    "cc-by-4.0",
    "cc-by-sa-4.0",
    "odc-by",
    "pddl",
    "openrail",
    "creative commons attribution 4.0",
    "public domain",
];
// Substrings that mark a license id as carrying usage restrictions.
const RESTRICTED_KEYWORDS = [
    "nc",
    "non-commercial",
    "research-only",
    "academic",
    "gpl",
    "agpl",
    "proprietary",
    "custom",
];
// Permissive licenses for MVP filter
const PERMISSIVE_LICENSES = [
    "mit",
    "apache",
    "apache-2.0",
    "bsd",
    "cc0",
    "cc-by-4.0",
    "odc-by",
    "pddl",
    "openrail",
];
5
/**
 * Classifies a license identifier as "restricted", "safe" or "unknown".
 *
 * Matching is done on the lower-cased id. Restricted keywords take precedence
 * over safe keywords. The two-letter Creative Commons codes `nc` and `nd` are
 * matched as whole tokens only, so ordinary words that merely contain those
 * letters ("licence", "and", "standard") do not trigger a restriction.
 *
 * @param {string} [licenseId] - License name or id; a missing value is treated as "unknown".
 * @param {string} [licenseUrl] - URL copied into the result unchanged.
 * @returns {object} Record with `id`, `category`, `usage_restrictions`, `url`,
 *   `warnings`, `requires_consent`, plus `commercial_use` for the "restricted"
 *   and "safe" categories.
 */
export function categorizeLicense(licenseId, licenseUrl) {
    const id = String(licenseId || "unknown").toLowerCase();
    // Alphanumeric words of the id, used for exact matching of short codes.
    const tokens = new Set(id.split(/[^a-z0-9]+/).filter(Boolean));
    const mentions = (...needles) => needles.some((needle) => id.includes(needle));
    const isNonCommercial = tokens.has("nc")
        || mentions("non-commercial", "noncommercial", "non commercial");
    const isNoDerivatives = tokens.has("nd")
        || mentions("no-derivatives", "noderivatives", "no derivatives");
    // Check for usage restrictions
    const usageRestrictions = [];
    if (isNonCommercial) {
        usageRestrictions.push("non-commercial");
    }
    if (mentions("research-only", "academic")) {
        usageRestrictions.push("academic-only");
    }
    // GPL-family ids are reported under the same label as ND licenses
    // (GPL requires derivative works to be GPL); the label is listed once.
    if (isNoDerivatives || mentions("gpl")) {
        usageRestrictions.push("no-derivatives");
    }
    // Check if consent is required (GDPR, Kaggle, etc.)
    const requiresConsent = mentions("gdpr", "consent", "kaggle");
    // "nc" is handled by the token check above; every other restricted
    // keyword is long enough to be matched as a substring.
    const isRestricted = isNonCommercial
        || RESTRICTED_KEYWORDS.some((keyword) => keyword !== "nc" && id.includes(keyword));
    if (isRestricted) {
        return {
            id,
            category: "restricted",
            commercial_use: false,
            usage_restrictions: usageRestrictions.length > 0 ? usageRestrictions : ["non-commercial"],
            url: licenseUrl,
            warnings: [
                "Restricted usage terms apply",
                "Verify license terms before commercial application",
            ],
            requires_consent: requiresConsent,
        };
    }
    // If ID is a common safe license
    if (SAFE_KEYWORDS.some((keyword) => id.includes(keyword))) {
        return {
            id,
            category: "safe",
            commercial_use: true,
            usage_restrictions: [],
            url: licenseUrl,
            warnings: [],
            requires_consent: requiresConsent,
        };
    }
    return {
        id: id || "unknown",
        category: "unknown",
        usage_restrictions: usageRestrictions,
        url: licenseUrl,
        warnings: [
            "License information unclear or unknown",
            "Use at your own risk",
        ],
        requires_consent: requiresConsent,
    };
}
65
/**
 * Reports whether a license id contains any entry of `PERMISSIVE_LICENSES`
 * (case-insensitive substring match).
 *
 * @param {string} [licenseId] - License name or id; a missing value is treated as "unknown".
 * @returns {boolean} True when at least one permissive keyword occurs in the id.
 */
export function isPermissiveLicense(licenseId) {
    const normalized = (licenseId || "unknown").toLowerCase();
    for (const keyword of PERMISSIVE_LICENSES) {
        if (normalized.includes(keyword)) {
            return true;
        }
    }
    return false;
}
@@ -0,0 +1,107 @@
1
/**
 * Compares stored dataset metadata against freshly fetched metadata for every
 * active monitor, records what changed and notifies the monitor's webhooks.
 */
export class MonitoringService {
    monitorStore;
    metadataStore;
    /**
     * @param {object} monitorStore - Provides `getActiveMonitors`, `saveMonitor` and `getWebhook`.
     * @param {object} metadataStore - Provides `getDataset` and `saveDataset`.
     */
    constructor(monitorStore, metadataStore) {
        this.monitorStore = monitorStore;
        this.metadataStore = metadataStore;
    }
    /**
     * Checks all active monitors for updates.
     *
     * A monitor is examined when the fetched `last_updated` differs from its
     * `last_checked_version`. The new version is then always recorded and the
     * fetched metadata stored; notifications and auto-reprocessing only happen
     * when at least one tracked field actually changed.
     *
     * @param fetchLatest A function that fetches the latest metadata from the source (HF/Kaggle).
     *   Called as `fetchLatest(datasetId, source)`; a falsy result skips the monitor.
     * @returns {Promise<object[]>} One diff per dataset with tracked changes.
     */
    async checkUpdates(fetchLatest) {
        const monitors = this.monitorStore.getActiveMonitors();
        const results = [];
        for (const monitor of monitors) {
            const current = this.metadataStore.getDataset(monitor.dataset_id);
            if (!current)
                continue;
            const latest = await fetchLatest(monitor.dataset_id, current.source);
            if (!latest)
                continue;
            if (latest.last_updated === monitor.last_checked_version)
                continue;
            const diff = this.compareVersions(current, latest);
            const hasChanges = diff.changes.length > 0;
            if (hasChanges) {
                results.push(diff);
                await this.notify(monitor, diff);
            }
            // Record the new version even when no tracked field changed;
            // otherwise the same version bump is re-diffed on every run and
            // the stored metadata never catches up.
            monitor.last_checked_version = latest.last_updated;
            monitor.updated_at = new Date().toISOString();
            this.monitorStore.saveMonitor(monitor);
            this.metadataStore.saveDataset(latest);
            if (hasChanges && monitor.auto_reprocess) {
                await this.triggerReprocess(monitor.dataset_id);
            }
        }
        return results;
    }
    /**
     * Builds a diff between two metadata records of the same dataset.
     *
     * Scalar fields are compared with `!==`; `splits` is compared through its
     * JSON serialisation.
     *
     * @param {object} oldVer - Previously stored metadata.
     * @param {object} newVer - Freshly fetched metadata.
     * @returns {object} `{ dataset_id, old_version, new_version, changes, impact_score }`.
     */
    compareVersions(oldVer, newVer) {
        const changes = [];
        // Check for significant field changes
        const fieldsToTrack = ["downloads", "likes", "total_examples", "total_size_mb", "quality_score"];
        for (const field of fieldsToTrack) {
            if (oldVer[field] !== newVer[field]) {
                changes.push({
                    field: String(field),
                    old_value: oldVer[field],
                    new_value: newVer[field]
                });
            }
        }
        // Check for split changes
        if (JSON.stringify(oldVer.splits) !== JSON.stringify(newVer.splits)) {
            changes.push({
                field: "splits",
                old_value: oldVer.splits,
                new_value: newVer.splits
            });
        }
        return {
            dataset_id: oldVer.id,
            old_version: oldVer.last_updated,
            new_version: newVer.last_updated,
            changes,
            impact_score: this.calculateImpact(changes)
        };
    }
    /**
     * Scores a change list from 0 to 100: total_examples 40, splits 30,
     * quality_score 20, total_size_mb 10; other fields add nothing.
     *
     * @param {object[]} changes - Entries with a `field` property.
     * @returns {number} Sum of the weights, capped at 100.
     */
    calculateImpact(changes) {
        let score = 0;
        for (const change of changes) {
            if (change.field === "total_examples")
                score += 40;
            if (change.field === "splits")
                score += 30;
            if (change.field === "quality_score")
                score += 20;
            if (change.field === "total_size_mb")
                score += 10;
        }
        return Math.min(score, 100);
    }
    /**
     * Sends the diff to every enabled webhook referenced by the monitor.
     *
     * @param {object} monitor - Monitor whose `webhook_ids` are resolved through the monitor store.
     * @param {object} diff - Diff produced by `compareVersions`.
     * @returns {Promise<void>}
     */
    async notify(monitor, diff) {
        // A monitor stored without webhooks may carry null here.
        for (const webhookId of monitor.webhook_ids ?? []) {
            const webhook = this.monitorStore.getWebhook(webhookId);
            if (webhook && webhook.enabled) {
                await this.sendToWebhook(webhook, diff);
            }
        }
    }
    /**
     * Logs the notification and builds its payload. Nothing is transmitted yet.
     *
     * @param {object} webhook - Webhook configuration (`name`, `channel`, `url`).
     * @param {object} diff - Diff produced by `compareVersions`.
     * @returns {Promise<object>} The payload that would be posted to the webhook.
     */
    async sendToWebhook(webhook, diff) {
        console.log(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
        // In a real implementation, this would be an HTTP POST to webhook.url.
        // For now, we simulate the payload
        const payload = {
            text: `Dataset ${diff.dataset_id} updated!`,
            changes: diff.changes,
            impact: diff.impact_score
        };
        return payload;
    }
    /**
     * Placeholder for re-running processing of a dataset; currently only logs.
     *
     * @param {string} datasetId - Dataset to reprocess.
     * @returns {Promise<void>}
     */
    async triggerReprocess(datasetId) {
        console.log(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
        // This would call IngestionService or similar
    }
}
@@ -0,0 +1,78 @@
1
/**
 * Persistence layer for dataset monitors and webhook configurations.
 *
 * Uses the `exec` / `prepare(...).run|get|all` methods of the database handle
 * passed to the constructor and creates its two tables on construction.
 */
export class MonitoringStore {
    db;
    /**
     * @param {object} db - Database handle exposing `exec` and `prepare`.
     */
    constructor(db) {
        this.db = db;
        this.init();
    }
    /**
     * Creates the `dataset_monitors` and `webhook_configs` tables when absent.
     */
    init() {
        this.db.exec(`
            CREATE TABLE IF NOT EXISTS dataset_monitors (
                dataset_id TEXT PRIMARY KEY,
                enabled BOOLEAN DEFAULT 1,
                auto_reprocess BOOLEAN DEFAULT 0,
                last_checked_version TEXT,
                webhook_ids TEXT, -- JSON array
                created_at TEXT,
                updated_at TEXT
            );

            CREATE TABLE IF NOT EXISTS webhook_configs (
                id TEXT PRIMARY KEY,
                name TEXT,
                channel TEXT,
                url TEXT,
                enabled BOOLEAN DEFAULT 1
            );
        `);
    }
    /**
     * Inserts a monitor or updates the existing row for the same dataset.
     * `created_at` is only written on insert.
     *
     * @param {object} monitor - Monitor record; a missing `webhook_ids` is stored as an empty list.
     */
    saveMonitor(monitor) {
        const upsert = this.db.prepare(`
            INSERT INTO dataset_monitors (dataset_id, enabled, auto_reprocess, last_checked_version, webhook_ids, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(dataset_id) DO UPDATE SET
                enabled=excluded.enabled,
                auto_reprocess=excluded.auto_reprocess,
                last_checked_version=excluded.last_checked_version,
                webhook_ids=excluded.webhook_ids,
                updated_at=excluded.updated_at
        `);
        upsert.run(monitor.dataset_id, monitor.enabled ? 1 : 0, monitor.auto_reprocess ? 1 : 0, monitor.last_checked_version || null, JSON.stringify(monitor.webhook_ids ?? []), monitor.created_at, monitor.updated_at);
    }
    /**
     * @param {string} datasetId - Dataset whose monitor is wanted.
     * @returns {object|null} The monitor, or null when none is stored.
     */
    getMonitor(datasetId) {
        const row = this.db.prepare("SELECT * FROM dataset_monitors WHERE dataset_id = ?").get(datasetId);
        if (!row)
            return null;
        return this.#toMonitor(row);
    }
    /**
     * @returns {object[]} All monitors whose `enabled` column is 1.
     */
    getActiveMonitors() {
        const rows = this.db.prepare("SELECT * FROM dataset_monitors WHERE enabled = 1").all();
        return rows.map((row) => this.#toMonitor(row));
    }
    /**
     * Inserts a webhook configuration or updates the existing row with the same id.
     *
     * @param {object} config - `{ id, name, channel, url, enabled }`.
     */
    saveWebhook(config) {
        // channel is part of the update list so that edits to it are not lost.
        const upsert = this.db.prepare(`
            INSERT INTO webhook_configs (id, name, channel, url, enabled)
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                name=excluded.name,
                channel=excluded.channel,
                url=excluded.url,
                enabled=excluded.enabled
        `);
        upsert.run(config.id, config.name, config.channel, config.url, config.enabled ? 1 : 0);
    }
    /**
     * @param {string} id - Webhook id.
     * @returns {object|null} The webhook configuration, or null when none is stored.
     */
    getWebhook(id) {
        const row = this.db.prepare("SELECT * FROM webhook_configs WHERE id = ?").get(id);
        if (!row)
            return null;
        return { ...row, enabled: Boolean(row.enabled) };
    }
    /**
     * Converts a `dataset_monitors` row into a monitor object: the 0/1 flags
     * become booleans and the JSON text becomes an array (empty when the
     * column is NULL).
     */
    #toMonitor(row) {
        const webhookIds = row.webhook_ids ? JSON.parse(row.webhook_ids) : null;
        return {
            ...row,
            enabled: Boolean(row.enabled),
            auto_reprocess: Boolean(row.auto_reprocess),
            webhook_ids: webhookIds ?? []
        };
    }
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,48 @@
1
/**
 * Derives a 0-100 quality score from dataset metadata.
 *
 * Points are awarded per category: popularity by downloads (up to 30), train
 * and test splits (10 each), description length (up to 20), recency of the
 * last update (up to 15), license category (up to 10) and likes (up to 5).
 *
 * @param {object} data - Reads `downloads`, `likes`, `hasDescription`,
 *   `descriptionLength`, `hasTrainSplit`, `hasTestSplit`, `lastUpdated` and
 *   `licenseCategory`.
 * @returns {number} The score, never above 100.
 */
export function calculateQualityScore(data) {
    // Points of the first tier whose threshold the value strictly exceeds, else 0.
    const pointsAbove = (value, tiers) => {
        for (const [threshold, points] of tiers) {
            if (value > threshold) {
                return points;
            }
        }
        return 0;
    };
    let total = 0;
    // 1. Popularity (max 30)
    total += pointsAbove(data.downloads, [[10000, 30], [1000, 20], [100, 10]]);
    // 2. Structuredness (max 20)
    total += data.hasTrainSplit ? 10 : 0;
    total += data.hasTestSplit ? 10 : 0;
    // 3. Documentation (max 20) - any description is worth at least 5
    if (data.hasDescription) {
        total += pointsAbove(data.descriptionLength, [[1000, 20], [200, 10]]) || 5;
    }
    // 4. Recency (max 15); an unparseable date yields NaN and earns nothing
    const ageInDays = Math.floor((new Date().getTime() - new Date(data.lastUpdated).getTime()) / (1000 * 3600 * 24));
    const recencyTiers = [
        [180, 15], // 6 months
        [365, 10], // 1 year
        [730, 5], // 2 years
    ];
    const recencyTier = recencyTiers.find(([maxDays]) => ageInDays < maxDays);
    if (recencyTier) {
        total += recencyTier[1];
    }
    // 5. License Clarity (max 10)
    switch (data.licenseCategory) {
        case "safe":
            total += 10;
            break;
        case "restricted":
            total += 5;
            break;
        default:
            break;
    }
    // 6. Community (max 5)
    total += pointsAbove(data.likes, [[50, 5], [10, 2]]);
    return Math.min(100, total);
}
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Rate limiting and retry utilities for API requests
3
+ */
4
// Retry settings used for every field the caller does not override.
const DEFAULT_OPTIONS = {
    // Number of retries after the first attempt.
    maxRetries: 5,
    // Backoff for the first retry, in milliseconds (1 second).
    initialDelay: 1000,
    // Upper bound for the computed backoff, in milliseconds (60 seconds).
    maxDelay: 60000,
    // Factor by which the backoff grows per attempt.
    exponentialBase: 2,
    // Whether to add a random extra to each backoff.
    jitter: true
};
11
/**
 * Computes the wait before the next retry.
 *
 * The base wait is `initialDelay * exponentialBase ** attempt`, capped at
 * `maxDelay`. With `jitter` enabled a random 0-20% of that wait is added on
 * top to avoid a thundering herd.
 *
 * @param {number} attempt - Zero-based index of the attempt that just failed.
 * @param {object} options - Reads `initialDelay`, `exponentialBase`, `maxDelay` and `jitter`.
 * @returns {number} Whole number of milliseconds to wait.
 */
function calculateDelay(attempt, options) {
    const { initialDelay, exponentialBase, maxDelay, jitter } = options;
    const cappedDelay = Math.min(initialDelay * exponentialBase ** attempt, maxDelay);
    const jitterAmount = jitter ? cappedDelay * 0.2 * Math.random() : 0;
    return Math.floor(cappedDelay + jitterAmount);
}
24
/**
 * Reads the `Retry-After` header of a response and converts it to milliseconds.
 *
 * Two header forms are understood: a non-negative integer number of seconds,
 * and a date (which must start with a letter, as HTTP dates do), converted to
 * the time remaining from now and never negative. Anything else - including a
 * missing response, a response without a `headers.get` method, or a malformed
 * value such as "-5" or "1.5" - yields null so the caller can fall back to its
 * own backoff.
 *
 * @param {object|null|undefined} response - Response-like object with a `headers.get(name)` method.
 * @returns {number|null} Milliseconds to wait, or null when no usable header is present.
 */
function getRetryAfter(response) {
    // Optional calls: error.response may come from a client whose headers are
    // not a Headers instance.
    const header = response?.headers?.get?.('Retry-After');
    if (!header)
        return null;
    const value = String(header).trim();
    // Retry-After can be a number of seconds or an HTTP date
    if (/^\d+$/.test(value)) {
        return Number(value) * 1000; // Convert to milliseconds
    }
    // Only values that start with a letter are handed to the date parser, so
    // stray numerics are not mistaken for dates.
    if (/^[A-Za-z]/.test(value)) {
        const retryAt = Date.parse(value);
        if (!Number.isNaN(retryAt)) {
            return Math.max(0, retryAt - Date.now());
        }
    }
    return null;
}
45
/**
 * Decides whether an error represents an HTTP 429 / rate-limit condition.
 *
 * An error qualifies when its `status` or `response.status` is 429, or when
 * its message contains "rate limit" or "429".
 *
 * @param {*} error - The caught value; null and undefined are tolerated.
 * @returns {boolean} True for rate-limit errors.
 */
function isRateLimitError(error) {
    return error?.status === 429
        || error?.response?.status === 429
        || Boolean(error?.message?.includes('rate limit'))
        || Boolean(error?.message?.includes('429'));
}
59
/**
 * Returns a promise that resolves after the given number of milliseconds.
 *
 * @param {number} ms - Time to wait, passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
65
/**
 * Calls `fn` and retries it with backoff while it fails with rate-limit errors.
 *
 * Errors that `isRateLimitError` does not recognise are rethrown immediately.
 * For rate-limit errors the wait is taken from the `Retry-After` header of the
 * response attached to that error, falling back to `calculateDelay` when the
 * header is absent or zero. After `maxRetries` retries the last error is thrown.
 *
 * @param {Function} fn - Operation to run; may be async.
 * @param {object} [options] - Overrides for `DEFAULT_OPTIONS`
 *   (`maxRetries`, `initialDelay`, `maxDelay`, `exponentialBase`, `jitter`).
 * @returns {Promise<*>} Whatever `fn` resolves with.
 * @throws The non-rate-limit error, or the last rate-limit error once retries are exhausted.
 */
export async function retryWithBackoff(fn, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    let lastError;
    for (let attempt = 0; attempt <= opts.maxRetries; attempt++) {
        try {
            return await fn();
        }
        catch (error) {
            lastError = error;
            // Only retry on rate limit errors
            if (!isRateLimitError(error)) {
                throw error;
            }
            // Don't retry on last attempt
            if (attempt >= opts.maxRetries) {
                break;
            }
            // Look only at the response of *this* failure: a Retry-After from
            // an earlier attempt says nothing about the current one.
            const response = error?.response ?? null;
            const delay = getRetryAfter(response) || calculateDelay(attempt, opts);
            console.error(`[Rate Limiter] Rate limited (attempt ${attempt + 1}/${opts.maxRetries + 1}). Waiting ${delay}ms...`);
            await sleep(delay);
        }
    }
    throw lastError;
}
102
/**
 * Pauses between consecutive requests to stay below rate limits.
 *
 * @param {number} [ms=500] - Pause length in milliseconds.
 * @returns {Promise<void>} Resolves once the pause has elapsed.
 */
export async function delayBetweenRequests(ms = 500) {
    await new Promise((resume) => setTimeout(resume, ms));
}
108
/**
 * `fetch` wrapper that runs the request through `retryWithBackoff`.
 *
 * A 429 response and any other non-ok response are turned into errors
 * carrying `status` and `response` properties, so that the retry helper can
 * tell them apart; every other response is returned as is.
 *
 * @param {string} url - Request URL.
 * @param {object} [options] - Options forwarded to `fetch`.
 * @param {object} [retryOptions] - Options forwarded to `retryWithBackoff`.
 * @returns {Promise<Response>} The successful response.
 */
export async function rateLimitedFetch(url, options = {}, retryOptions = {}) {
    const attemptRequest = async () => {
        const response = await fetch(url, options);
        const isRateLimited = response.status === 429;
        if (!isRateLimited && response.ok) {
            return response;
        }
        const message = isRateLimited
            ? `Rate limit exceeded: ${response.status}`
            : `HTTP error: ${response.status}`;
        const failure = new Error(message);
        failure.status = response.status;
        failure.response = response;
        throw failure;
    };
    return retryWithBackoff(attemptRequest, retryOptions);
}