vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,182 +0,0 @@
|
|
|
1
|
-
import { categorizeLicense } from "./license.js";
|
|
2
|
-
import { calculateQualityScore } from "./quality.js";
|
|
3
|
-
import { classifyDomain } from "./domain.js";
|
|
4
|
-
import { rateLimitedFetch, delayBetweenRequests } from "./rate-limiter.js";
|
|
5
|
-
export class KaggleMetadataScraper {
    // Kaggle credentials, sent as HTTP Basic auth to the public API.
    username;
    key;

    /**
     * @param {string} username Kaggle account username.
     * @param {string} key Kaggle API key.
     */
    constructor(username, key) {
        this.username = username;
        this.key = key;
    }

    /**
     * Searches Kaggle's dataset list API and returns normalized metadata records.
     * Pages through results (up to `limit`) with rate-limited fetches and
     * inter-request delays; on error, returns whatever was collected so far.
     *
     * @param {string} query Search text.
     * @param {number} limit Maximum number of datasets to return.
     * @param {boolean} usePagination Whether to walk additional pages up to `limit`.
     * @returns {Promise<object[]>} Transformed dataset metadata (possibly partial on error).
     */
    async scrape(query, limit = 20, usePagination = true) {
        console.error(`[Kaggle] Searching for "${query}" (limit: ${limit}, pagination: ${usePagination})...`);
        const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
        const results = [];
        const MAX_PAGE_SIZE = 100; // Kaggle API max page size
        const pageSize = Math.min(limit, MAX_PAGE_SIZE);
        let page = 1;
        let totalFetched = 0;
        let hasMore = true;
        try {
            while (hasMore && totalFetched < limit) {
                const url = `https://www.kaggle.com/api/v1/datasets/list?search=${encodeURIComponent(query)}&page_size=${pageSize}&page=${page}`;
                console.error(`[Kaggle] Fetching page ${page} (${totalFetched}/${limit} datasets so far)...`);
                // Use rate-limited fetch with retry logic
                const response = await rateLimitedFetch(url, {
                    headers: {
                        'Authorization': `Basic ${auth}`,
                        'Content-Type': 'application/json'
                    }
                }, {
                    maxRetries: 3,
                    initialDelay: 2000, // Start with 2 seconds
                    maxDelay: 30000 // Max 30 seconds
                });
                const datasets = await response.json();
                if (!datasets || datasets.length === 0) {
                    hasMore = false;
                    break;
                }
                // Add delay between processing datasets to avoid rate limits.
                // BUG FIX: stop exactly at `limit` — previously every page was
                // consumed in full, so results could exceed the requested limit
                // by up to one page.
                for (let i = 0; i < datasets.length && totalFetched < limit; i++) {
                    const ds = datasets[i];
                    try {
                        const metadata = this.transform(ds);
                        results.push(metadata);
                        totalFetched++;
                        console.error(`[Kaggle] Added: ${ds.ref} (${ds.downloadCount} downloads)`);
                        // Add small delay every 5 datasets
                        if ((i + 1) % 5 === 0 && i < datasets.length - 1) {
                            await delayBetweenRequests(500);
                        }
                    }
                    catch (e) {
                        console.error(`[Kaggle] ERROR: Failed to transform ${ds.ref}:`, e);
                    }
                }
                // Check if we should continue pagination
                if (usePagination && datasets.length === pageSize && totalFetched < limit) {
                    page++;
                    // Add delay between pages to avoid rate limits
                    await delayBetweenRequests(1000);
                }
                else {
                    hasMore = false;
                }
            }
            console.error(`[Kaggle] Completed: ${results.length} datasets found for "${query}"`);
            return results;
        }
        catch (e) {
            // Handle rate limit errors specifically
            if (e?.status === 429 || e?.message?.includes('rate limit')) {
                console.error("[Kaggle] Rate limit error:", e.message);
                console.error("Consider adding delays between requests or reducing batch size");
            }
            else {
                console.error("[Kaggle] Scrape error:", e.message || e);
            }
            // Return partial results if we got some before the error
            if (results.length > 0) {
                console.error(`[Kaggle] Returning ${results.length} partial results before error`);
            }
            return results;
        }
    }

    /**
     * Converts a raw Kaggle list-API record into the internal metadata shape,
     * attaching a quality score, license category, domain, and warnings.
     * @param {object} ds Raw dataset record from the Kaggle list API.
     * @returns {object} Normalized dataset metadata.
     */
    transform(ds) {
        const repoId = ds.ref;
        const tags = ds.tags?.map(t => t.name) || [];
        const description = ds.description || "";
        const license = categorizeLicense(ds.licenseName);
        const warnings = [];
        // Kaggle doesn't give us splits in the list API easily
        const sizeBytes = this.parseSize(ds.size);
        const splits = [
            {
                name: "data",
                num_examples: 0,
                size_bytes: sizeBytes
            }
        ];
        const totalSizeMB = sizeBytes ? Math.round(sizeBytes / (1024 * 1024) * 100) / 100 : 0;
        // Populate warnings
        if (description.length < 100)
            warnings.push("Short description; results may be less relevant");
        const lastUpdatedDate = new Date(ds.lastUpdated);
        const fourYearsAgo = new Date();
        fourYearsAgo.setFullYear(fourYearsAgo.getFullYear() - 4);
        if (lastUpdatedDate < fourYearsAgo) {
            warnings.push(`Stale data: Last updated ${lastUpdatedDate.getFullYear()}`);
        }
        warnings.push("No specific data splits identified (Kaggle API limitation)");
        // Classify domain
        const task = this.extractTask(tags);
        const domain = classifyDomain(description, tags, repoId, task);
        return {
            id: repoId,
            source: "kaggle",
            name: ds.title,
            description: description,
            quality_warnings: warnings,
            downloads: ds.downloadCount,
            likes: ds.voteCount,
            stars: 0,
            tags: tags,
            last_updated: ds.lastUpdated,
            task: task,
            domain: domain,
            languages: [],
            splits,
            license,
            quality_score: calculateQualityScore({
                downloads: ds.downloadCount,
                likes: ds.voteCount,
                hasDescription: description.length > 50,
                descriptionLength: description.length,
                hasTrainSplit: false,
                hasTestSplit: false,
                lastUpdated: ds.lastUpdated,
                licenseCategory: license.category
            }),
            download_url: `https://www.kaggle.com/datasets/${ds.ref}`,
            format: undefined,
            total_examples: 0,
            total_size_bytes: sizeBytes,
            total_size_mb: totalSizeMB,
            columns: [],
            is_structured: false,
            has_target_column: false,
            is_safe_source: true,
            has_personal_data: false,
            is_paywalled: false,
            is_scraped_web_data: false,
            uses_https: true,
            has_train_split: false,
            has_test_split: false,
            has_validation_split: false,
            description_length: description.length,
            has_readme: true
        };
    }

    /**
     * Parses a human-readable size string ("12 MB", "1.5GB") into bytes.
     * Unknown or missing values yield 0; a bare number is returned as-is.
     * @param {string|undefined} sizeStr Size string from the Kaggle API.
     * @returns {number} Size in bytes (0 if unparseable).
     */
    parseSize(sizeStr) {
        if (!sizeStr)
            return 0;
        const match = sizeStr.match(/^(\d+(?:\.\d+)?)\s*([KMGT]B)$/i);
        if (!match)
            return 0;
        const value = parseFloat(match[1]);
        const unit = match[2].toUpperCase();
        switch (unit) {
            case 'KB': return value * 1024;
            case 'MB': return value * 1024 * 1024;
            case 'GB': return value * 1024 * 1024 * 1024;
            case 'TB': return value * 1024 * 1024 * 1024 * 1024;
            default: return value;
        }
    }

    /**
     * Maps Kaggle tags to a task label.
     * @param {string[]} tags Tag names from the dataset record.
     * @returns {string} Currently always "unknown" — Kaggle tags are not yet mapped.
     */
    extractTask(tags) {
        // Similar to HF but Kaggle tags might be different
        return "unknown";
    }
}
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
import os from "os";
|
|
5
|
-
export class KaggleSource {
    pythonPath = "python";
    scriptPath;

    /**
     * Locates the Python Kaggle engine script, preferring the per-user
     * ~/.vesper install, then the build tree, then the sibling source tree.
     * @param {string} buildDir Directory to resolve build-relative paths from.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "python", "kaggle_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "kaggle_engine.py"),
        ];
        const found = candidates.find((candidate) => fs.existsSync(candidate));
        // Fall back to the first candidate even when none exists, matching
        // the legacy behavior (the spawn will surface the failure later).
        this.scriptPath = found ?? candidates[0];
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }

    /**
     * Searches Kaggle for datasets matching `query`.
     * @param {string} query Search text.
     * @param {number} limit Maximum results to request.
     * @returns {Promise<object[]>} Discovered dataset records.
     */
    async discover(query, limit = 20) {
        const payload = await this.run(["discover", query, String(limit)]);
        if (!payload.ok) {
            throw new Error(payload.error || "Kaggle discover failed");
        }
        return (payload.results || []);
    }

    /**
     * Downloads a dataset by ref, optionally into `targetDir`.
     * @param {string} datasetRef Kaggle dataset reference (owner/name).
     * @param {string} [targetDir] Optional destination directory.
     * @returns {Promise<{local_path: string, target_dir: string}>}
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir) {
            args.push(targetDir);
        }
        const payload = await this.run(args);
        if (!payload.ok) {
            throw new Error(payload.error || "Kaggle download failed");
        }
        return {
            local_path: payload.local_path,
            target_dir: payload.target_dir,
        };
    }

    /**
     * Spawns the Python engine with `args` and resolves with its parsed
     * JSON stdout; rejects on a non-zero exit or unparseable output.
     * @param {string[]} args CLI arguments for kaggle_engine.py.
     */
    async run(args) {
        return new Promise((resolve, reject) => {
            const child = spawn(this.pythonPath, [this.scriptPath, ...args]);
            let outBuf = "";
            let errBuf = "";
            child.stdout.on("data", (chunk) => {
                outBuf += chunk.toString();
            });
            child.stderr.on("data", (chunk) => {
                errBuf += chunk.toString();
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(errBuf || outBuf || `kaggle_engine exited with code ${code}`));
                    return;
                }
                try {
                    resolve(JSON.parse(outBuf));
                }
                catch {
                    reject(new Error(`Failed to parse kaggle_engine output: ${outBuf}`));
                }
            });
        });
    }
}
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
const SAFE_KEYWORDS = ["mit", "apache", "bsd", "cc0", "cc-by-4.0", "cc-by-sa-4.0", "odc-by", "pddl", "openrail", "creative commons attribution 4.0", "public domain"];
|
|
2
|
-
const RESTRICTED_KEYWORDS = ["nc", "non-commercial", "research-only", "academic", "gpl", "agpl", "proprietary", "custom"];
|
|
3
|
-
// Permissive licenses for MVP filter
|
|
4
|
-
const PERMISSIVE_LICENSES = ["mit", "apache", "apache-2.0", "bsd", "cc0", "cc-by-4.0", "odc-by", "pddl", "openrail"];
|
|
5
|
-
export function categorizeLicense(licenseId, licenseUrl) {
|
|
6
|
-
const id = (licenseId || "unknown").toLowerCase();
|
|
7
|
-
const usageRestrictions = [];
|
|
8
|
-
let requiresConsent = false;
|
|
9
|
-
// Check for usage restrictions
|
|
10
|
-
if (id.includes("nc") || id.includes("non-commercial")) {
|
|
11
|
-
usageRestrictions.push("non-commercial");
|
|
12
|
-
}
|
|
13
|
-
if (id.includes("research-only") || id.includes("academic")) {
|
|
14
|
-
usageRestrictions.push("academic-only");
|
|
15
|
-
}
|
|
16
|
-
if (id.includes("nd") || id.includes("no-derivatives")) {
|
|
17
|
-
usageRestrictions.push("no-derivatives");
|
|
18
|
-
}
|
|
19
|
-
if (id.includes("gpl") || id.includes("agpl")) {
|
|
20
|
-
usageRestrictions.push("no-derivatives"); // GPL requires derivative works to be GPL
|
|
21
|
-
}
|
|
22
|
-
// Check if consent is required (GDPR, Kaggle, etc.)
|
|
23
|
-
if (id.includes("gdpr") || id.includes("consent") || id.includes("kaggle")) {
|
|
24
|
-
requiresConsent = true;
|
|
25
|
-
}
|
|
26
|
-
// If ID contains restricted keywords
|
|
27
|
-
if (RESTRICTED_KEYWORDS.some(k => id.includes(k))) {
|
|
28
|
-
return {
|
|
29
|
-
id,
|
|
30
|
-
category: "restricted",
|
|
31
|
-
commercial_use: false,
|
|
32
|
-
usage_restrictions: usageRestrictions.length > 0 ? usageRestrictions : ["non-commercial"],
|
|
33
|
-
url: licenseUrl,
|
|
34
|
-
warnings: [
|
|
35
|
-
"Restricted usage terms apply",
|
|
36
|
-
"Verify license terms before commercial application",
|
|
37
|
-
],
|
|
38
|
-
requires_consent: requiresConsent,
|
|
39
|
-
};
|
|
40
|
-
}
|
|
41
|
-
// If ID is a common safe license
|
|
42
|
-
if (SAFE_KEYWORDS.some(k => id.includes(k))) {
|
|
43
|
-
return {
|
|
44
|
-
id,
|
|
45
|
-
category: "safe",
|
|
46
|
-
commercial_use: true,
|
|
47
|
-
usage_restrictions: [],
|
|
48
|
-
url: licenseUrl,
|
|
49
|
-
warnings: [],
|
|
50
|
-
requires_consent: requiresConsent,
|
|
51
|
-
};
|
|
52
|
-
}
|
|
53
|
-
return {
|
|
54
|
-
id: id || "unknown",
|
|
55
|
-
category: "unknown",
|
|
56
|
-
usage_restrictions: usageRestrictions,
|
|
57
|
-
url: licenseUrl,
|
|
58
|
-
warnings: [
|
|
59
|
-
"License information unclear or unknown",
|
|
60
|
-
"Use at your own risk",
|
|
61
|
-
],
|
|
62
|
-
requires_consent: requiresConsent,
|
|
63
|
-
};
|
|
64
|
-
}
|
|
65
|
-
export function isPermissiveLicense(licenseId) {
|
|
66
|
-
const id = (licenseId || "unknown").toLowerCase();
|
|
67
|
-
return PERMISSIVE_LICENSES.some(perm => id.includes(perm));
|
|
68
|
-
}
|
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
export class MonitoringService {
    monitorStore;
    metadataStore;

    /**
     * @param monitorStore Store of dataset monitors and webhook configs.
     * @param metadataStore Store of dataset metadata records.
     */
    constructor(monitorStore, metadataStore) {
        this.monitorStore = monitorStore;
        this.metadataStore = metadataStore;
    }

    /**
     * Checks all active monitors for updates.
     * @param fetchLatest A function that fetches the latest metadata from the source (HF/Kaggle)
     * @returns Diffs for every monitored dataset that changed.
     */
    async checkUpdates(fetchLatest) {
        const diffs = [];
        for (const monitor of this.monitorStore.getActiveMonitors()) {
            const current = this.metadataStore.getDataset(monitor.dataset_id);
            if (!current) {
                continue;
            }
            const latest = await fetchLatest(monitor.dataset_id, current.source);
            if (!latest) {
                continue;
            }
            if (latest.last_updated === monitor.last_checked_version) {
                continue;
            }
            const diff = this.compareVersions(current, latest);
            if (diff.changes.length === 0) {
                continue;
            }
            diffs.push(diff);
            await this.notify(monitor, diff);
            // Record the version just observed so the next sweep skips it.
            monitor.last_checked_version = latest.last_updated;
            monitor.updated_at = new Date().toISOString();
            this.monitorStore.saveMonitor(monitor);
            // Persist the refreshed metadata.
            this.metadataStore.saveDataset(latest);
            if (monitor.auto_reprocess) {
                await this.triggerReprocess(monitor.dataset_id);
            }
        }
        return diffs;
    }

    /**
     * Diffs two metadata versions of the same dataset, reporting changed
     * scalar fields plus structural changes to splits, with an impact score.
     */
    compareVersions(oldVer, newVer) {
        const changes = [];
        // Scalar fields whose drift is worth reporting.
        const tracked = ["downloads", "likes", "total_examples", "total_size_mb", "quality_score"];
        for (const field of tracked) {
            if (oldVer[field] === newVer[field]) {
                continue;
            }
            changes.push({
                field: String(field),
                old_value: oldVer[field],
                new_value: newVer[field]
            });
        }
        // Splits are compared structurally via their JSON encoding.
        if (JSON.stringify(oldVer.splits) !== JSON.stringify(newVer.splits)) {
            changes.push({
                field: "splits",
                old_value: oldVer.splits,
                new_value: newVer.splits
            });
        }
        return {
            dataset_id: oldVer.id,
            old_version: oldVer.last_updated,
            new_version: newVer.last_updated,
            changes,
            impact_score: this.calculateImpact(changes)
        };
    }

    /**
     * Scores a change set from 0-100 using per-field weights; fields outside
     * the weight table contribute nothing.
     */
    calculateImpact(changes) {
        const WEIGHTS = {
            total_examples: 40,
            splits: 30,
            quality_score: 20,
            total_size_mb: 10
        };
        let total = 0;
        for (const { field } of changes) {
            total += Object.hasOwn(WEIGHTS, field) ? WEIGHTS[field] : 0;
        }
        return Math.min(total, 100);
    }

    /** Delivers a diff to every enabled webhook attached to the monitor. */
    async notify(monitor, diff) {
        for (const webhookId of monitor.webhook_ids) {
            const webhook = this.monitorStore.getWebhook(webhookId);
            if (webhook && webhook.enabled) {
                await this.sendToWebhook(webhook, diff);
            }
        }
    }

    /** Formats and (currently only logs) a webhook notification payload. */
    async sendToWebhook(webhook, diff) {
        console.error(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
        // In a real implementation, this would be an HTTP POST
        // For now, we simulate the payload
        const payload = {
            text: `Dataset ${diff.dataset_id} updated!`,
            changes: diff.changes,
            impact: diff.impact_score
        };
        // await axios.post(webhook.url, payload);
    }

    /** Kicks off reprocessing for a dataset (stub — logs only). */
    async triggerReprocess(datasetId) {
        console.error(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
        // This would call IngestionService or similar
    }
}
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
export class MonitoringStore {
    db;

    /**
     * @param db SQLite handle (better-sqlite3 style: exec/prepare/get/all/run).
     */
    constructor(db) {
        this.db = db;
        this.init();
    }

    /** Creates the backing tables if absent (idempotent). */
    init() {
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS dataset_monitors (
        dataset_id TEXT PRIMARY KEY,
        enabled BOOLEAN DEFAULT 1,
        auto_reprocess BOOLEAN DEFAULT 0,
        last_checked_version TEXT,
        webhook_ids TEXT, -- JSON array
        created_at TEXT,
        updated_at TEXT
      );

      CREATE TABLE IF NOT EXISTS webhook_configs (
        id TEXT PRIMARY KEY,
        name TEXT,
        channel TEXT,
        url TEXT,
        enabled BOOLEAN DEFAULT 1
      );
    `);
    }

    /** Inserts or updates a monitor row (upsert keyed on dataset_id). */
    saveMonitor(monitor) {
        const stmt = this.db.prepare(`
      INSERT INTO dataset_monitors (dataset_id, enabled, auto_reprocess, last_checked_version, webhook_ids, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(dataset_id) DO UPDATE SET
        enabled=excluded.enabled,
        auto_reprocess=excluded.auto_reprocess,
        last_checked_version=excluded.last_checked_version,
        webhook_ids=excluded.webhook_ids,
        updated_at=excluded.updated_at
    `);
        // Booleans are stored as 0/1; webhook ids as a JSON array string.
        stmt.run(
            monitor.dataset_id,
            monitor.enabled ? 1 : 0,
            monitor.auto_reprocess ? 1 : 0,
            monitor.last_checked_version || null,
            JSON.stringify(monitor.webhook_ids),
            monitor.created_at,
            monitor.updated_at
        );
    }

    /** Fetches one monitor by dataset id, or null when missing. */
    getMonitor(datasetId) {
        const record = this.db.prepare("SELECT * FROM dataset_monitors WHERE dataset_id = ?").get(datasetId);
        if (!record) {
            return null;
        }
        // Rehydrate SQLite's 0/1 flags and JSON-encoded webhook list.
        return Object.assign({}, record, {
            enabled: Boolean(record.enabled),
            auto_reprocess: Boolean(record.auto_reprocess),
            webhook_ids: JSON.parse(record.webhook_ids)
        });
    }

    /** Returns every enabled monitor, rehydrated from its row form. */
    getActiveMonitors() {
        const records = this.db.prepare("SELECT * FROM dataset_monitors WHERE enabled = 1").all();
        return records.map((record) => Object.assign({}, record, {
            enabled: Boolean(record.enabled),
            auto_reprocess: Boolean(record.auto_reprocess),
            webhook_ids: JSON.parse(record.webhook_ids)
        }));
    }

    /** Inserts or updates a webhook config (upsert keyed on id). */
    saveWebhook(config) {
        const stmt = this.db.prepare(`
      INSERT INTO webhook_configs (id, name, channel, url, enabled)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        name=excluded.name,
        url=excluded.url,
        enabled=excluded.enabled
    `);
        stmt.run(config.id, config.name, config.channel, config.url, config.enabled ? 1 : 0);
    }

    /** Fetches one webhook config by id, or null when missing. */
    getWebhook(id) {
        const record = this.db.prepare("SELECT * FROM webhook_configs WHERE id = ?").get(id);
        if (!record) {
            return null;
        }
        return Object.assign({}, record, { enabled: Boolean(record.enabled) });
    }
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
// Empty export: keeps this compiled file a valid ES module with no runtime
// exports (presumably the TypeScript source held only type declarations,
// which are erased at build time — TODO confirm against the source tree).
export {};
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
import os from "os";
|
|
5
|
-
export class OpenMLSource {
    // Interpreter used to launch the engine; refined to a venv binary (or the
    // Windows "py" launcher) by the constructor when available.
    pythonPath = "python";
    // Resolved location of openml_engine.py.
    scriptPath;
    /**
     * Locates the Python helper script and a suitable Python interpreter.
     * Script search order: ~/.vesper/python, <buildDir>/python, then
     * <buildDir>/../src/python; falls back to the first candidate even when
     * none exists so a later spawn fails with a clear error.
     * @param {string} [buildDir] - Base directory for script and venv lookups.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "openml_engine.py"),
            path.resolve(buildDir, "python", "openml_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "openml_engine.py"),
        ];
        this.scriptPath = candidates.find((p) => fs.existsSync(p)) ?? candidates[0];
        if (process.platform === "win32") {
            const venvPy = path.resolve(buildDir, ".venv", "Scripts", "python.exe");
            // "py" is the Windows Python launcher; preferred over bare "python".
            this.pythonPath = fs.existsSync(venvPy) ? venvPy : "py";
        }
        else {
            const venvPy = path.resolve(buildDir, ".venv", "bin", "python");
            if (fs.existsSync(venvPy)) {
                this.pythonPath = venvPy;
            }
        }
    }
    /**
     * Searches OpenML for datasets matching a query.
     * @param {string} query - Free-text search string passed to the engine.
     * @param {number} [limit=20] - Maximum number of results to request.
     * @returns {Promise<any[]>} Engine result list (empty array when absent).
     * @throws {Error} When the engine reports `ok: false`.
     */
    async discover(query, limit = 20) {
        const result = await this.run(["discover", query, String(limit)]);
        if (!result.ok) {
            throw new Error(result.error || "OpenML discover failed");
        }
        return (result.results || []);
    }
    /**
     * Downloads a dataset via the Python engine.
     * @param {string} datasetRef - OpenML dataset identifier.
     * @param {string} [targetDir] - Optional destination directory.
     * @returns {Promise<{local_path: any, target_dir: any}>}
     * @throws {Error} When the engine reports `ok: false`.
     */
    async download(datasetRef, targetDir) {
        const args = ["download", datasetRef];
        if (targetDir)
            args.push(targetDir);
        const result = await this.run(args);
        if (!result.ok) {
            throw new Error(result.error || "OpenML download failed");
        }
        return {
            local_path: result.local_path,
            target_dir: result.target_dir,
        };
    }
    /**
     * Spawns the Python engine with the given CLI args and parses its entire
     * stdout as one JSON document.
     * @param {string[]} args - Arguments appended after the script path.
     * @returns {Promise<any>} Parsed JSON emitted by the engine.
     * @throws {Error} On spawn failure, non-zero exit, or unparsable output.
     */
    run(args) {
        return new Promise((resolve, reject) => {
            const proc = spawn(this.pythonPath, [this.scriptPath, ...args]);
            let stdout = "";
            let stderr = "";
            proc.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            proc.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // BUGFIX: without this handler a failed spawn (e.g. interpreter
            // not found) raises an unhandled 'error' event that crashes the
            // process and leaves this promise forever pending.
            proc.on("error", (err) => {
                reject(new Error(`Failed to start OpenML engine (${this.pythonPath}): ${err.message}`, { cause: err }));
            });
            proc.on("close", (code) => {
                if (code !== 0) {
                    return reject(new Error(`OpenML engine exited with code ${code}: ${stderr}`));
                }
                try {
                    resolve(JSON.parse(stdout.trim()));
                }
                catch (e) {
                    reject(new Error(`Failed to parse OpenML engine output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
/**
 * Calculates a quality score from 0-100 based on metadata.
 *
 * Component weights: popularity 30, train/test splits 20, description 20,
 * recency 15, license clarity 10, community likes 5. Missing or invalid
 * fields simply contribute 0 (an unparsable `lastUpdated` yields NaN, which
 * fails every comparison).
 * @param {object} data - Dataset metadata (downloads, splits, description,
 *   lastUpdated, licenseCategory, likes).
 * @returns {number} Score clamped to at most 100.
 */
export function calculateQualityScore(data) {
    // 1. Popularity (max 30)
    const popularity =
        data.downloads > 10000 ? 30 :
        data.downloads > 1000 ? 20 :
        data.downloads > 100 ? 10 : 0;
    // 2. Structuredness (max 20)
    const structure = (data.hasTrainSplit ? 10 : 0) + (data.hasTestSplit ? 10 : 0);
    // 3. Documentation (max 20)
    let documentation = 0;
    if (data.hasDescription) {
        documentation =
            data.descriptionLength > 1000 ? 20 :
            data.descriptionLength > 200 ? 10 : 5;
    }
    // 4. Recency (max 15): full credit under 6 months, tapering to 2 years.
    const elapsedDays = Math.floor(
        (new Date().getTime() - new Date(data.lastUpdated).getTime()) / (1000 * 3600 * 24)
    );
    const recency =
        elapsedDays < 180 ? 15 :
        elapsedDays < 365 ? 10 :
        elapsedDays < 730 ? 5 : 0;
    // 5. License Clarity (max 10)
    const license =
        data.licenseCategory === "safe" ? 10 :
        data.licenseCategory === "restricted" ? 5 : 0;
    // 6. Community (max 5)
    const community = data.likes > 50 ? 5 : data.likes > 10 ? 2 : 0;
    return Math.min(100, popularity + structure + documentation + recency + license + community);
}
|