@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
@@ -0,0 +1,49 @@
import { spawn } from "child_process";
import path from "path";
const WB_SCRIPT_PATH = path.resolve("src", "python", "worldbank_adapter.py");
const NASA_SCRIPT_PATH = path.resolve("src", "python", "nasa_adapter.py");
export class WorldBankScraper {
    async scrape(query, limit = 10) {
        return runAdapter(WB_SCRIPT_PATH, query, limit);
    }
}
export class NASAScraper {
    async scrape(query, limit = 10) {
        return runAdapter(NASA_SCRIPT_PATH, query, limit);
    }
}
async function runAdapter(scriptPath, query, limit) {
    return new Promise((resolve) => {
        const pythonProcess = spawn("python", [
            scriptPath,
            "--action", "search",
            "--query", query,
            "--limit", String(limit)
        ]);
        let output = "";
        let errorOutput = "";
        pythonProcess.stdout.on("data", (data) => { output += data.toString(); });
        pythonProcess.stderr.on("data", (data) => { errorOutput += data.toString(); });
        pythonProcess.on("close", (code) => {
            if (code !== 0) {
                console.error(`[Adapter] ${path.basename(scriptPath)} exited with code ${code}: ${errorOutput}`);
                resolve([]);
                return;
            }
            try {
                const results = JSON.parse(output);
                if (results.error) {
                    console.error(`[Adapter] ${path.basename(scriptPath)} error: ${results.error}`);
                    resolve([]);
                }
                else {
                    resolve(results);
                }
            }
            catch (e) {
                console.error(`[Adapter] ${path.basename(scriptPath)} JSON error: ${e.message}`);
                resolve([]);
            }
        });
    });
}
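
The 49-line hunk above lines up with the +49 count of package/build/metadata/institutional-scrapers.js in the file list: both scrapers shell out to the bundled Python adapter scripts and resolve to an empty array on any failure. A minimal usage sketch under that assumption; the import path, query strings, and top-level-await context are illustrative, not confirmed by the diff.

// Hypothetical caller; assumes the module path and that `python` plus the
// bundled adapter scripts are reachable from the process working directory.
import { WorldBankScraper, NASAScraper } from "./build/metadata/institutional-scrapers.js";

const worldBank = new WorldBankScraper();
const nasa = new NASAScraper();

// Each scrape() resolves with whatever JSON the adapter printed to stdout,
// or [] if the process exits non-zero or emits unparsable output.
const [wbResults, nasaResults] = await Promise.all([
    worldBank.scrape("climate", 5),
    nasa.scrape("asteroid", 5),
]);
console.log(`World Bank: ${wbResults.length} results, NASA: ${nasaResults.length} results`);
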
@@ -0,0 +1,182 @@
import { categorizeLicense } from "./license.js";
import { calculateQualityScore } from "./quality.js";
import { classifyDomain } from "./domain.js";
import { rateLimitedFetch, delayBetweenRequests } from "./rate-limiter.js";
export class KaggleMetadataScraper {
    username;
    key;
    constructor(username, key) {
        this.username = username;
        this.key = key;
    }
    async scrape(query, limit = 20, usePagination = true) {
        console.error(`[Kaggle] Searching for "${query}" (limit: ${limit}, pagination: ${usePagination})...`);
        const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
        const results = [];
        const MAX_PAGE_SIZE = 100; // Kaggle API max page size
        const pageSize = Math.min(limit, MAX_PAGE_SIZE);
        let page = 1;
        let totalFetched = 0;
        let hasMore = true;
        try {
            while (hasMore && totalFetched < limit) {
                const url = `https://www.kaggle.com/api/v1/datasets/list?search=${encodeURIComponent(query)}&page_size=${pageSize}&page=${page}`;
                console.error(`[Kaggle] Fetching page ${page} (${totalFetched}/${limit} datasets so far)...`);
                // Use rate-limited fetch with retry logic
                const response = await rateLimitedFetch(url, {
                    headers: {
                        'Authorization': `Basic ${auth}`,
                        'Content-Type': 'application/json'
                    }
                }, {
                    maxRetries: 3,
                    initialDelay: 2000, // Start with 2 seconds
                    maxDelay: 30000 // Max 30 seconds
                });
                const datasets = await response.json();
                if (!datasets || datasets.length === 0) {
                    hasMore = false;
                    break;
                }
                // Add delay between processing datasets to avoid rate limits
                for (let i = 0; i < datasets.length; i++) {
                    const ds = datasets[i];
                    try {
                        const metadata = this.transform(ds);
                        results.push(metadata);
                        totalFetched++;
                        console.error(`[Kaggle] Added: ${ds.ref} (${ds.downloadCount} downloads)`);
                        // Add small delay every 5 datasets
                        if ((i + 1) % 5 === 0 && i < datasets.length - 1) {
                            await delayBetweenRequests(500);
                        }
                    }
                    catch (e) {
                        console.error(`[Kaggle] ERROR: Failed to transform ${ds.ref}:`, e);
                    }
                }
                // Check if we should continue pagination
                if (usePagination && datasets.length === pageSize && totalFetched < limit) {
                    page++;
                    // Add delay between pages to avoid rate limits
                    await delayBetweenRequests(1000);
                }
                else {
                    hasMore = false;
                }
            }
            console.error(`[Kaggle] Completed: ${results.length} datasets found for "${query}"`);
            return results;
        }
        catch (e) {
            // Handle rate limit errors specifically
            if (e?.status === 429 || e?.message?.includes('rate limit')) {
                console.error("[Kaggle] Rate limit error:", e.message);
                console.error("Consider adding delays between requests or reducing batch size");
            }
            else {
                console.error("[Kaggle] Scrape error:", e.message || e);
            }
            // Return partial results if we got some before the error
            if (results.length > 0) {
                console.error(`[Kaggle] Returning ${results.length} partial results before error`);
            }
            return results;
        }
    }
    transform(ds) {
        const repoId = ds.ref;
        const tags = ds.tags?.map(t => t.name) || [];
        const description = ds.description || "";
        const license = categorizeLicense(ds.licenseName);
        const warnings = [];
        // Kaggle doesn't give us splits in the list API easily
        const sizeBytes = this.parseSize(ds.size);
        const splits = [
            {
                name: "data",
                num_examples: 0,
                size_bytes: sizeBytes
            }
        ];
        const totalSizeMB = sizeBytes ? Math.round(sizeBytes / (1024 * 1024) * 100) / 100 : 0;
        // Populate warnings
        if (description.length < 100)
            warnings.push("Short description; results may be less relevant");
        const lastUpdatedDate = new Date(ds.lastUpdated);
        const fourYearsAgo = new Date();
        fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
        if (lastUpdatedDate < fourYearsAgo) {
            warnings.push(`Stale data: Last updated ${lastUpdatedDate.getFullYear()}`);
        }
        warnings.push("No specific data splits identified (Kaggle API limitation)");
        // Classify domain
        const task = this.extractTask(tags);
        const domain = classifyDomain(description, tags, repoId, task);
        return {
            id: repoId,
            source: "kaggle",
            name: ds.title,
            description: description,
            quality_warnings: warnings,
            downloads: ds.downloadCount,
            likes: ds.voteCount,
            stars: 0,
            tags: tags,
            last_updated: ds.lastUpdated,
            task: task,
            domain: domain,
            languages: [],
            splits,
            license,
            quality_score: calculateQualityScore({
                downloads: ds.downloadCount,
                likes: ds.voteCount,
                hasDescription: description.length > 50,
                descriptionLength: description.length,
                hasTrainSplit: false,
                hasTestSplit: false,
                lastUpdated: ds.lastUpdated,
                licenseCategory: license.category
            }),
            download_url: `https://www.kaggle.com/datasets/${ds.ref}`,
            format: undefined,
            total_examples: 0,
            total_size_bytes: sizeBytes,
            total_size_mb: totalSizeMB,
            columns: [],
            is_structured: false,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: description.length,
            has_readme: true
        };
    }
    parseSize(sizeStr) {
        if (!sizeStr)
            return 0;
        const match = sizeStr.match(/^(\d+(?:\.\d+)?)\s*([KMGT]B)$/i);
        if (!match)
            return 0;
        const value = parseFloat(match[1]);
        const unit = match[2].toUpperCase();
        switch (unit) {
            case 'KB': return value * 1024;
            case 'MB': return value * 1024 * 1024;
            case 'GB': return value * 1024 * 1024 * 1024;
            case 'TB': return value * 1024 * 1024 * 1024 * 1024;
            default: return value;
        }
    }
    extractTask(tags) {
        // Similar to HF but Kaggle tags might be different
        return "unknown";
    }
}
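
This 182-line hunk lines up with the +182 count of package/build/metadata/kaggle-scraper.js. The scraper paginates the Kaggle list endpoint, throttles itself through the rate-limiter module, and returns partial results instead of throwing when a later page fails. A minimal sketch of driving it; the env-var names and import path are assumptions, not confirmed by the diff.

// Hypothetical driver; credential source and module path are assumed.
import { KaggleMetadataScraper } from "./build/metadata/kaggle-scraper.js";

const scraper = new KaggleMetadataScraper(
    process.env.KAGGLE_USERNAME, // assumed credential source
    process.env.KAGGLE_KEY
);

// A limit <= 100 fits in one page; larger limits paginate, with a 1s pause
// between pages and a 500ms pause after every 5 transformed datasets.
const datasets = await scraper.scrape("credit card fraud", 50);
for (const ds of datasets) {
    console.log(`${ds.id}  score=${ds.quality_score}  license=${ds.license.category}`);
}
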
@@ -0,0 +1,68 @@
const SAFE_KEYWORDS = ["mit", "apache", "bsd", "cc0", "cc-by-4.0", "cc-by-sa-4.0", "odc-by", "pddl", "openrail", "creative commons attribution 4.0", "public domain"];
const RESTRICTED_KEYWORDS = ["nc", "non-commercial", "research-only", "academic", "gpl", "agpl", "proprietary", "custom"];
// Permissive licenses for MVP filter
const PERMISSIVE_LICENSES = ["mit", "apache", "apache-2.0", "bsd", "cc0", "cc-by-4.0", "odc-by", "pddl", "openrail"];
export function categorizeLicense(licenseId, licenseUrl) {
    const id = (licenseId || "unknown").toLowerCase();
    const usageRestrictions = [];
    let requiresConsent = false;
    // Check for usage restrictions
    if (id.includes("nc") || id.includes("non-commercial")) {
        usageRestrictions.push("non-commercial");
    }
    if (id.includes("research-only") || id.includes("academic")) {
        usageRestrictions.push("academic-only");
    }
    if (id.includes("nd") || id.includes("no-derivatives")) {
        usageRestrictions.push("no-derivatives");
    }
    if (id.includes("gpl") || id.includes("agpl")) {
        usageRestrictions.push("no-derivatives"); // GPL requires derivative works to be GPL
    }
    // Check if consent is required (GDPR, Kaggle, etc.)
    if (id.includes("gdpr") || id.includes("consent") || id.includes("kaggle")) {
        requiresConsent = true;
    }
    // If ID contains restricted keywords
    if (RESTRICTED_KEYWORDS.some(k => id.includes(k))) {
        return {
            id,
            category: "restricted",
            commercial_use: false,
            usage_restrictions: usageRestrictions.length > 0 ? usageRestrictions : ["non-commercial"],
            url: licenseUrl,
            warnings: [
                "Restricted usage terms apply",
                "Verify license terms before commercial application",
            ],
            requires_consent: requiresConsent,
        };
    }
    // If ID is a common safe license
    if (SAFE_KEYWORDS.some(k => id.includes(k))) {
        return {
            id,
            category: "safe",
            commercial_use: true,
            usage_restrictions: [],
            url: licenseUrl,
            warnings: [],
            requires_consent: requiresConsent,
        };
    }
    return {
        id: id || "unknown",
        category: "unknown",
        usage_restrictions: usageRestrictions,
        url: licenseUrl,
        warnings: [
            "License information unclear or unknown",
            "Use at your own risk",
        ],
        requires_consent: requiresConsent,
    };
}
export function isPermissiveLicense(licenseId) {
    const id = (licenseId || "unknown").toLowerCase();
    return PERMISSIVE_LICENSES.some(perm => id.includes(perm));
}
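
The 68-line hunk matches the +68 count of package/build/metadata/license.js. Categorization is a keyword match on the lowercased license identifier, with restricted keywords checked before safe ones, so an ID that matches both falls on the restricted side. A short sketch of the resulting shapes; the import path and sample license strings are illustrative.

import { categorizeLicense, isPermissiveLicense } from "./build/metadata/license.js"; // path assumed

// "cc-by-nc-4.0" contains the restricted keyword "nc", so it is classified
// restricted even though plain "cc-by-4.0" would be safe.
console.log(categorizeLicense("cc-by-nc-4.0").category); // "restricted"
console.log(categorizeLicense("apache-2.0").category);   // "safe"
console.log(categorizeLicense(undefined).category);      // "unknown"
console.log(isPermissiveLicense("MIT"));                 // true (case-insensitive substring match)
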
@@ -0,0 +1,107 @@
export class MonitoringService {
    monitorStore;
    metadataStore;
    constructor(monitorStore, metadataStore) {
        this.monitorStore = monitorStore;
        this.metadataStore = metadataStore;
    }
    /**
     * Checks all active monitors for updates.
     * @param fetchLatest A function that fetches the latest metadata from the source (HF/Kaggle)
     */
    async checkUpdates(fetchLatest) {
        const monitors = this.monitorStore.getActiveMonitors();
        const results = [];
        for (const monitor of monitors) {
            const current = this.metadataStore.getDataset(monitor.dataset_id);
            if (!current)
                continue;
            const latest = await fetchLatest(monitor.dataset_id, current.source);
            if (!latest)
                continue;
            if (latest.last_updated !== monitor.last_checked_version) {
                const diff = this.compareVersions(current, latest);
                if (diff.changes.length > 0) {
                    results.push(diff);
                    await this.notify(monitor, diff);
                    // Update monitor
                    monitor.last_checked_version = latest.last_updated;
                    monitor.updated_at = new Date().toISOString();
                    this.monitorStore.saveMonitor(monitor);
                    // Update store
                    this.metadataStore.saveDataset(latest);
                    if (monitor.auto_reprocess) {
                        await this.triggerReprocess(monitor.dataset_id);
                    }
                }
            }
        }
        return results;
    }
    compareVersions(oldVer, newVer) {
        const changes = [];
        // Check for significant field changes
        const fieldsToTrack = ["downloads", "likes", "total_examples", "total_size_mb", "quality_score"];
        for (const field of fieldsToTrack) {
            if (oldVer[field] !== newVer[field]) {
                changes.push({
                    field: String(field),
                    old_value: oldVer[field],
                    new_value: newVer[field]
                });
            }
        }
        // Check for split changes
        if (JSON.stringify(oldVer.splits) !== JSON.stringify(newVer.splits)) {
            changes.push({
                field: "splits",
                old_value: oldVer.splits,
                new_value: newVer.splits
            });
        }
        return {
            dataset_id: oldVer.id,
            old_version: oldVer.last_updated,
            new_version: newVer.last_updated,
            changes,
            impact_score: this.calculateImpact(changes)
        };
    }
    calculateImpact(changes) {
        let score = 0;
        for (const change of changes) {
            if (change.field === "total_examples")
                score += 40;
            if (change.field === "splits")
                score += 30;
            if (change.field === "quality_score")
                score += 20;
            if (change.field === "total_size_mb")
                score += 10;
        }
        return Math.min(score, 100);
    }
    async notify(monitor, diff) {
        for (const webhookId of monitor.webhook_ids) {
            const webhook = this.monitorStore.getWebhook(webhookId);
            if (webhook && webhook.enabled) {
                await this.sendToWebhook(webhook, diff);
            }
        }
    }
    async sendToWebhook(webhook, diff) {
        console.log(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
        // In a real implementation, this would be an HTTP POST
        // For now, we simulate the payload
        const payload = {
            text: `Dataset ${diff.dataset_id} updated!`,
            changes: diff.changes,
            impact: diff.impact_score
        };
        // await axios.post(webhook.url, payload);
    }
    async triggerReprocess(datasetId) {
        console.log(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
        // This would call IngestionService or similar
    }
}
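
The 107-line hunk corresponds to the +107 count of package/build/metadata/monitoring-service.js. checkUpdates is source-agnostic: the caller supplies a fetchLatest(datasetId, source) callback, and the service diffs stored metadata against whatever it returns, notifies webhooks, and persists the new version. A minimal wiring sketch; the module paths, the better-sqlite3 handle, and the stub metadata store are assumptions for illustration only.

import Database from "better-sqlite3";                                   // assumed driver
import { MonitoringStore } from "./build/metadata/monitoring-store.js";   // path assumed
import { MonitoringService } from "./build/metadata/monitoring-service.js"; // path assumed

const monitorStore = new MonitoringStore(new Database(":memory:"));

// Minimal stand-in for the package's metadata store: only the two methods
// MonitoringService calls (getDataset / saveDataset).
const datasets = new Map();
const metadataStore = {
    getDataset: (id) => datasets.get(id) || null,
    saveDataset: (ds) => datasets.set(ds.id, ds),
};

const service = new MonitoringService(monitorStore, metadataStore);

// The callback decides how each source is refreshed; returning null skips the dataset.
const diffs = await service.checkUpdates(async (datasetId, source) => {
    console.log(`would refetch ${datasetId} from ${source}`);
    return null; // plug a real scraper call in here
});
console.log(`${diffs.length} datasets changed`);
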
@@ -0,0 +1,78 @@
export class MonitoringStore {
    db;
    constructor(db) {
        this.db = db;
        this.init();
    }
    init() {
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS dataset_monitors (
        dataset_id TEXT PRIMARY KEY,
        enabled BOOLEAN DEFAULT 1,
        auto_reprocess BOOLEAN DEFAULT 0,
        last_checked_version TEXT,
        webhook_ids TEXT, -- JSON array
        created_at TEXT,
        updated_at TEXT
      );

      CREATE TABLE IF NOT EXISTS webhook_configs (
        id TEXT PRIMARY KEY,
        name TEXT,
        channel TEXT,
        url TEXT,
        enabled BOOLEAN DEFAULT 1
      );
    `);
    }
    saveMonitor(monitor) {
        const upsert = this.db.prepare(`
      INSERT INTO dataset_monitors (dataset_id, enabled, auto_reprocess, last_checked_version, webhook_ids, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(dataset_id) DO UPDATE SET
        enabled=excluded.enabled,
        auto_reprocess=excluded.auto_reprocess,
        last_checked_version=excluded.last_checked_version,
        webhook_ids=excluded.webhook_ids,
        updated_at=excluded.updated_at
    `);
        upsert.run(monitor.dataset_id, monitor.enabled ? 1 : 0, monitor.auto_reprocess ? 1 : 0, monitor.last_checked_version || null, JSON.stringify(monitor.webhook_ids), monitor.created_at, monitor.updated_at);
    }
    getMonitor(datasetId) {
        const row = this.db.prepare("SELECT * FROM dataset_monitors WHERE dataset_id = ?").get(datasetId);
        if (!row)
            return null;
        return {
            ...row,
            enabled: Boolean(row.enabled),
            auto_reprocess: Boolean(row.auto_reprocess),
            webhook_ids: JSON.parse(row.webhook_ids)
        };
    }
    getActiveMonitors() {
        const rows = this.db.prepare("SELECT * FROM dataset_monitors WHERE enabled = 1").all();
        return rows.map(row => ({
            ...row,
            enabled: Boolean(row.enabled),
            auto_reprocess: Boolean(row.auto_reprocess),
            webhook_ids: JSON.parse(row.webhook_ids)
        }));
    }
    saveWebhook(config) {
        const upsert = this.db.prepare(`
      INSERT INTO webhook_configs (id, name, channel, url, enabled)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        name=excluded.name,
        url=excluded.url,
        enabled=excluded.enabled
    `);
        upsert.run(config.id, config.name, config.channel, config.url, config.enabled ? 1 : 0);
    }
    getWebhook(id) {
        const row = this.db.prepare("SELECT * FROM webhook_configs WHERE id = ?").get(id);
        if (!row)
            return null;
        return { ...row, enabled: Boolean(row.enabled) };
    }
}
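
The 78-line hunk matches the +78 count of package/build/metadata/monitoring-store.js. The store relies on synchronous exec/prepare/run/get/all calls and ON CONFLICT upserts, a pattern typical of better-sqlite3; the constructor only needs a handle exposing exec and prepare. A sketch under that assumption, with an illustrative database path and monitor payload.

import Database from "better-sqlite3"; // assumed driver; any handle with exec/prepare works
import { MonitoringStore } from "./build/metadata/monitoring-store.js"; // path assumed

const store = new MonitoringStore(new Database("vesper.db")); // tables are created in the constructor

store.saveWebhook({ id: "wh-1", name: "ops", channel: "slack", url: "https://example.invalid/hook", enabled: true });
store.saveMonitor({
    dataset_id: "owner/some-dataset",
    enabled: true,
    auto_reprocess: false,
    last_checked_version: null,
    webhook_ids: ["wh-1"], // stored as a JSON string, parsed back on read
    created_at: new Date().toISOString(),
    updated_at: new Date().toISOString(),
});

console.log(store.getActiveMonitors()); // booleans and webhook_ids rehydrated
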
@@ -0,0 +1 @@
export {};
@@ -0,0 +1,48 @@
/**
 * Calculates a quality score from 0-100 based on metadata.
 */
export function calculateQualityScore(data) {
    let score = 0;
    // 1. Popularity (max 30)
    if (data.downloads > 10000)
        score += 30;
    else if (data.downloads > 1000)
        score += 20;
    else if (data.downloads > 100)
        score += 10;
    // 2. Structuredness (max 20)
    if (data.hasTrainSplit)
        score += 10;
    if (data.hasTestSplit)
        score += 10;
    // 3. Documentation (max 20)
    if (data.hasDescription) {
        if (data.descriptionLength > 1000)
            score += 20;
        else if (data.descriptionLength > 200)
            score += 10;
        else
            score += 5;
    }
    // 4. Recency (max 15)
    const lastUpdate = new Date(data.lastUpdated);
    const now = new Date();
    const diffDays = Math.floor((now.getTime() - lastUpdate.getTime()) / (1000 * 3600 * 24));
    if (diffDays < 180)
        score += 15; // 6 months
    else if (diffDays < 365)
        score += 10; // 1 year
    else if (diffDays < 730)
        score += 5; // 2 years
    // 5. License Clarity (max 10)
    if (data.licenseCategory === "safe")
        score += 10;
    else if (data.licenseCategory === "restricted")
        score += 5;
    // 6. Community (max 5)
    if (data.likes > 50)
        score += 5;
    else if (data.likes > 10)
        score += 2;
    return Math.min(100, score);
}
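
The 48-line hunk matches the +48 count of package/build/metadata/quality.js. The score is additive over six capped components (popularity 30, splits 20, documentation 20, recency 15, license 10, community 5) and clamped to 100. A worked example with made-up numbers; the import path is assumed.

import { calculateQualityScore } from "./build/metadata/quality.js"; // path assumed

// 2,500 downloads -> 20, train+test splits -> 20, 1,200-char description -> 20,
// updated ~3 months ago -> 15, safe license -> 10, 60 likes -> 5  =>  90 total.
const score = calculateQualityScore({
    downloads: 2500,
    likes: 60,
    hasDescription: true,
    descriptionLength: 1200,
    hasTrainSplit: true,
    hasTestSplit: true,
    lastUpdated: new Date(Date.now() - 90 * 24 * 3600 * 1000).toISOString(),
    licenseCategory: "safe",
});
console.log(score); // 90
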
@@ -0,0 +1,128 @@
/**
 * Rate limiting and retry utilities for API requests
 */
const DEFAULT_OPTIONS = {
    maxRetries: 5,
    initialDelay: 1000, // 1 second
    maxDelay: 60000, // 60 seconds
    exponentialBase: 2,
    jitter: true
};
/**
 * Calculate delay with exponential backoff and optional jitter
 */
function calculateDelay(attempt, options) {
    const exponentialDelay = options.initialDelay * Math.pow(options.exponentialBase, attempt);
    const delay = Math.min(exponentialDelay, options.maxDelay);
    if (options.jitter) {
        // Add random jitter (0-20% of delay) to avoid thundering herd
        const jitterAmount = delay * 0.2 * Math.random();
        return Math.floor(delay + jitterAmount);
    }
    return Math.floor(delay);
}
/**
 * Extract Retry-After header value from response or error
 */
function getRetryAfter(response) {
    if (!response)
        return null;
    const retryAfter = response.headers.get('Retry-After');
    if (!retryAfter)
        return null;
    // Retry-After can be a number of seconds or an HTTP date
    const seconds = parseInt(retryAfter, 10);
    if (!isNaN(seconds)) {
        return seconds * 1000; // Convert to milliseconds
    }
    // Try parsing as HTTP date
    const date = Date.parse(retryAfter);
    if (!isNaN(date)) {
        return Math.max(0, date - Date.now());
    }
    return null;
}
/**
 * Check if error is a rate limit error (429)
 */
function isRateLimitError(error) {
    if (error?.status === 429)
        return true;
    if (error?.response?.status === 429)
        return true;
    if (error?.message?.includes('rate limit'))
        return true;
    if (error?.message?.includes('429'))
        return true;
    return false;
}
/**
 * Sleep for specified milliseconds
 */
function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * Retry a function with exponential backoff on rate limit errors
 */
export async function retryWithBackoff(fn, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    let lastError;
    let response = null;
    for (let attempt = 0; attempt <= opts.maxRetries; attempt++) {
        try {
            const result = await fn();
            return result;
        }
        catch (error) {
            lastError = error;
            // Extract response if available
            if (error?.response) {
                response = error.response;
            }
            // Only retry on rate limit errors
            if (!isRateLimitError(error)) {
                throw error;
            }
            // Don't retry on last attempt
            if (attempt >= opts.maxRetries) {
                break;
            }
            // Calculate delay
            let delay = getRetryAfter(response);
            if (!delay) {
                delay = calculateDelay(attempt, opts);
            }
            console.error(`[Rate Limiter] Rate limited (attempt ${attempt + 1}/${opts.maxRetries + 1}). Waiting ${delay}ms...`);
            await sleep(delay);
        }
    }
    throw lastError;
}
/**
 * Add a delay between requests to avoid hitting rate limits
 */
export async function delayBetweenRequests(ms = 500) {
    await sleep(ms);
}
/**
 * Rate-limited fetch wrapper with automatic retry
 */
export async function rateLimitedFetch(url, options = {}, retryOptions = {}) {
    return retryWithBackoff(async () => {
        const response = await fetch(url, options);
        if (response.status === 429) {
            const error = new Error(`Rate limit exceeded: ${response.status}`);
            error.status = 429;
            error.response = response;
            throw error;
        }
        if (!response.ok) {
            const error = new Error(`HTTP error: ${response.status}`);
            error.status = response.status;
            error.response = response;
            throw error;
        }
        return response;
    }, retryOptions);
}