vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,171 +0,0 @@
|
|
|
1
|
-
import { listFiles } from "@huggingface/hub";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import path from "path";
|
|
4
|
-
import { RobustDownloader } from "../utils/downloader.js";
|
|
5
|
-
/**
 * Downloads dataset files from the HuggingFace Hub.
 *
 * Responsibilities visible here: list a dataset repo's files, pick the most
 * analysis-friendly data file (parquet > csv > tsv > txt > jsonl/ndjson > json),
 * download it via the shared RobustDownloader, and — when the only thing in the
 * repo is dataset metadata — chase the external raw-data URL recorded in
 * dataset_infos.json.
 */
export class HFDownloader {
    // Optional HF access token captured at construction time (may be undefined).
    hfToken;
    // Resumable HTTP downloader shared by all download calls.
    downloader;
    /**
     * @param {string} [token] - Explicit HF token; falls back to the
     *   HF_TOKEN / HUGGINGFACE_TOKEN environment variables.
     */
    constructor(token) {
        this.hfToken = token || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
        this.downloader = new RobustDownloader();
    }
    /**
     * Resolves the effective token. Re-reads the environment on every call, so
     * a token configured after construction is still picked up.
     * @returns {string|undefined}
     */
    getToken() {
        return this.hfToken || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
    }
    /**
     * Finds the most suitable data file in a repository
     * Returns the relative path within the repo
     * @param {string} repoId - HF dataset id, e.g. "owner/dataset".
     * @returns {Promise<string|null>} Repo-relative path of the chosen file;
     *   null when nothing suitable exists or listing fails for a non-auth reason.
     * @throws {Error} Actionable messages for 401 (auth required),
     *   403 (access denied / gated dataset), and 404 (not found).
     */
    async findBestFile(repoId) {
        try {
            const token = this.getToken();
            // Candidate data files vs. metadata-only JSON files, collected separately.
            const files = [];
            const metadataFiles = [];
            // Repo housekeeping files that are never data.
            const blacklist = [
                ".gitattributes",
                ".gitignore",
                ".git",
                "README.md",
                "LICENSE",
                "package.json",
                "requirements.txt",
                "setup.py"
            ];
            // JSON files that describe the dataset rather than contain it.
            const metadataNamePatterns = [
                /^dataset_infos?\.json$/i,
                /^dataset_dict\.json$/i,
                /^state\.json$/i,
                /^config\.json$/i,
                /^metadata\.json$/i,
                /^stats\.json$/i,
                /^index\.json$/i
            ];
            // listFiles is an async iterator over the full recursive repo tree.
            for await (const file of listFiles({
                repo: { type: "dataset", name: repoId },
                recursive: true,
                ...(token ? { accessToken: token } : {})
            })) {
                if (file.type === "file") {
                    const fileName = path.basename(file.path);
                    const isMetadataJson = metadataNamePatterns.some(p => p.test(fileName));
                    if (isMetadataJson) {
                        metadataFiles.push(file.path);
                    }
                    // Dotfiles, blacklisted names, and metadata JSON are excluded
                    // from the data-file candidates.
                    if (!blacklist.includes(fileName) && !fileName.startsWith(".") && !isMetadataJson) {
                        files.push(file.path);
                    }
                }
            }
            // Priority logic for data scientists: prefer train/data-named files
            // and columnar formats first. First pattern with a match wins.
            const priorities = [
                /train.*\.parquet$/i,
                /data.*\.parquet$/i,
                /.*\.parquet$/i,
                /train.*\.csv$/i,
                /data.*\.csv$/i,
                /.*\.csv$/i,
                /train.*\.tsv$/i,
                /data.*\.tsv$/i,
                /.*\.tsv$/i,
                /train.*\.txt$/i,
                /data.*\.txt$/i,
                /.*\.txt$/i,
                /.*\.jsonl$/i,
                /.*\.ndjson$/i,
                // Keep plain JSON as lowest priority to avoid selecting metadata-like files.
                /.*\.json$/i
            ];
            for (const pattern of priorities) {
                const match = files.find(f => pattern.test(f));
                if (match)
                    return match;
            }
            // Strict fallback: Only return the first file if it has a data-like extension
            const dataExtensions = [".csv", ".parquet", ".jsonl", ".ndjson", ".tsv", ".txt", ".json", ".avro", ".orc"];
            const fallback = files.find(f => {
                const ext = path.extname(f).toLowerCase();
                return dataExtensions.includes(ext);
            });
            if (fallback)
                return fallback;
            // Last-resort: allow dataset metadata file, then resolve external raw URLs later.
            // (resolveExternalDataFromMetadata handles the follow-up download.)
            const metadataFallback = metadataFiles.find(f => /dataset_infos?\.json$/i.test(path.basename(f)));
            return metadataFallback || null;
        }
        catch (error) {
            const msg = String(error?.message || error);
            // Map common HTTP failures to actionable guidance; anything else is
            // logged and reported as "no file found" (null) so callers can try
            // their own fallback path.
            if (msg.includes("401") || msg.toLowerCase().includes("unauthorized")) {
                throw new Error(`Authentication required for dataset '${repoId}'. ` +
                    `This dataset may be gated or private. ` +
                    `Use the configure_keys tool to set your HF_TOKEN, then retry.`);
            }
            if (msg.includes("403") || msg.toLowerCase().includes("forbidden")) {
                throw new Error(`Access denied for dataset '${repoId}'. ` +
                    `You may need to accept the dataset's usage agreement on huggingface.co, ` +
                    `then set HF_TOKEN via configure_keys tool.`);
            }
            if (msg.includes("404") || msg.toLowerCase().includes("not found")) {
                throw new Error(`Dataset '${repoId}' not found on HuggingFace. Check the dataset ID.`);
            }
            console.error(`[HF] Failed to list files for ${repoId}:`, msg);
            return null;
        }
    }
    /**
     * Downloads a file from HF to local path
     * @param {string} repoId - HF dataset id.
     * @param {string} filePath - Repo-relative file path (from findBestFile).
     * @param {string} targetPath - Local destination path.
     * @param {(percent: number) => void} [onProgress] - Called with 0-100.
     */
    async download(repoId, filePath, targetPath, onProgress) {
        const token = this.getToken();
        // Uses the public "resolve" endpoint on the main branch.
        const url = `https://huggingface.co/datasets/${repoId}/resolve/main/${filePath}`;
        await this.downloader.download(url, targetPath, {
            headers: token ? { 'Authorization': `Bearer ${token}` } : {},
            resume: true,
            onProgress: (bytes, total) => {
                // Only report when total size is known (total may be 0/unknown).
                if (total > 0 && onProgress) {
                    onProgress(Math.round((bytes / total) * 100));
                }
            }
        });
    }
    /**
     * If downloaded file is dataset metadata (dataset_infos.json), resolve and download a real data URL.
     * Returns the actual local data path to use.
     *
     * Best-effort: on any parse/download problem it silently returns the
     * original path rather than failing the ingestion.
     * @param {string} localPath - Path of the file just downloaded.
     * @param {(percent: number) => void} [onProgress]
     * @returns {Promise<string>} Path to real data when resolution succeeds,
     *   otherwise localPath unchanged.
     */
    async resolveExternalDataFromMetadata(localPath, onProgress) {
        const ext = path.extname(localPath).toLowerCase();
        // Only JSON files can be dataset_infos-style metadata.
        if (ext !== ".json") {
            return localPath;
        }
        try {
            const raw = fs.readFileSync(localPath, "utf-8");
            const parsed = JSON.parse(raw);
            // dataset_infos.json maps config name -> info; prefer "default",
            // else take the first config present.
            const firstConfig = parsed?.default || Object.values(parsed || {})[0];
            const checksums = firstConfig?.download_checksums;
            if (!checksums || typeof checksums !== "object") {
                return localPath;
            }
            // download_checksums keys are the source URLs; keep http(s) only.
            const candidateUrls = Object.keys(checksums).filter((u) => /^https?:\/\//i.test(u));
            if (candidateUrls.length === 0) {
                return localPath;
            }
            const preferred = candidateUrls.find(u => /train|data/i.test(path.basename(u))) || candidateUrls[0];
            // NOTE: this inner `ext` intentionally shadows the outer one — it is
            // the extension of the remote URL, defaulting to .csv when absent.
            const ext = path.extname(preferred).toLowerCase() || ".csv";
            const resolvedPath = localPath.replace(/\.json$/i, ext);
            await this.downloader.download(preferred, resolvedPath, {
                resume: true,
                onProgress: (bytes, total) => {
                    if (total > 0 && onProgress) {
                        onProgress(Math.round((bytes / total) * 100));
                    }
                }
            });
            // Accept the resolved file only if it actually arrived non-empty.
            if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).size > 0) {
                return resolvedPath;
            }
            return localPath;
        }
        catch {
            // Deliberate best-effort: fall back to the metadata file itself.
            return localPath;
        }
    }
}
|
|
@@ -1,271 +0,0 @@
|
|
|
1
|
-
import path from "path";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import { spawn } from "child_process";
|
|
4
|
-
import { HFDownloader } from "./hf-downloader.js";
|
|
5
|
-
import { KaggleSource } from "../metadata/kaggle-source.js";
|
|
6
|
-
import { OpenMLSource } from "../metadata/openml-source.js";
|
|
7
|
-
import { DataWorldSource } from "../metadata/dataworld-source.js";
|
|
8
|
-
import { SecureKeysManager } from "../config/secure-keys.js";
|
|
9
|
-
/**
 * Orchestrates dataset ingestion from multiple sources (HuggingFace, Kaggle,
 * OpenML, data.world) into a local `data/raw` directory, recording download
 * state (downloading / completed / failed) in the provided store.
 */
export class DataIngestor {
    // Root directory of the active project.
    projectRoot;
    // Metadata store used to persist download status rows.
    store;
    // `<projectRoot>/data/raw` — created on construction if missing.
    rawDataDir;
    // Source adapters, constructed eagerly.
    hfDownloader;
    kaggleSource;
    openmlSource;
    dataworldSource;
    // Secure key storage used for Kaggle credential lookup.
    secureKeys;
    /**
     * @param {string} projectRoot - Project root directory.
     * @param {object} store - Store exposing getDownloadStatus/registerDownload.
     */
    constructor(projectRoot, store) {
        this.projectRoot = projectRoot;
        this.store = store;
        this.rawDataDir = path.join(this.projectRoot, "data", "raw");
        if (!fs.existsSync(this.rawDataDir)) {
            fs.mkdirSync(this.rawDataDir, { recursive: true });
        }
        this.hfDownloader = new HFDownloader();
        this.kaggleSource = new KaggleSource();
        this.openmlSource = new OpenMLSource();
        this.dataworldSource = new DataWorldSource();
        this.secureKeys = new SecureKeysManager();
    }
    /**
     * Check if Kaggle credentials are available
     * Checks, in order: environment variables, the secure key store, and the
     * conventional ~/.kaggle/kaggle.json file.
     * @returns {boolean}
     */
    hasKaggleCredentials() {
        if (process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY)
            return true;
        const keys = this.secureKeys.getAll();
        if (keys.kaggle_username && keys.kaggle_key)
            return true;
        const kaggleJsonPath = path.join(process.env.HOME || process.env.USERPROFILE || "", ".kaggle", "kaggle.json");
        return !!(kaggleJsonPath && fs.existsSync(kaggleJsonPath));
    }
    /**
     * Get helpful error message if Kaggle credentials are missing
     * @returns {string}
     */
    getKaggleCredentialError() {
        return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
    }
    /**
     * Converts a dataset id (e.g. "owner/name" or "source:id") into a string
     * safe to use as a filename, replacing ':' and '/' with '_'.
     * @param {string} datasetId
     * @returns {string}
     */
    toSafeDatasetPath(datasetId) {
        return datasetId.replace(/[:\/]/g, "_");
    }
    /**
     * Ensures a dataset is available locally
     * Returns early if a completed download already exists on disk; otherwise
     * dispatches to the source-specific download path, recording status
     * transitions in the store along the way.
     * @param {string} datasetId - Source-specific dataset id.
     * @param {"huggingface"|"kaggle"|"openml"|"dataworld"} source
     * @param {(msg: string, percent?: number) => void} [onProgress]
     * @returns {Promise<string>} Local path of the downloaded data file.
     * @throws {Error} On missing credentials, auth failures, or download errors.
     */
    async ensureData(datasetId, source, onProgress) {
        // 1. Check database for existing download
        const status = this.store.getDownloadStatus(datasetId);
        if (status && status.status === 'completed' && fs.existsSync(status.local_path)) {
            return status.local_path;
        }
        if (status && status.status === 'downloading') {
            console.error(`[Ingestor] Dataset ${datasetId} status is 'downloading'. Attempting to resume/concurrently monitor.`);
            // In a better system we'd use a lock, but for now we let it resume
            // the RobustDownloader handles the actual file locking/range logic.
        }
        // 2. Trigger source-specific download
        if (source === "huggingface") {
            onProgress?.("Discovering data files on HuggingFace Hub...");
            const remotePath = await this.hfDownloader.findBestFile(datasetId);
            if (remotePath) {
                // Direct file download path (repo has raw data files)
                const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
                const targetPath = this.getTargetPath(datasetId, ext);
                this.store.registerDownload(datasetId, targetPath, "downloading");
                try {
                    await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
                        onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
                    });
                    const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
                        onProgress?.("Resolving external dataset file...", progress);
                    });
                    const stats = fs.statSync(resolvedPath);
                    this.completeDownload(datasetId, resolvedPath, stats.size);
                    return resolvedPath;
                }
                catch (e) {
                    const msg = String(e?.message || e);
                    // If auth error, propagate immediately with helpful message
                    if (msg.includes("401") || msg.includes("403") || msg.includes("Authentication") || msg.includes("Access denied")) {
                        this.failDownload(datasetId, msg);
                        throw e;
                    }
                    // For other download errors, try the fallback
                    onProgress?.(`Direct download failed (${msg}), trying datasets library fallback...`);
                }
            }
            // Fallback: Use Python datasets library to download and convert
            // This runs when findBestFile returns null OR when direct download fails (non-auth)
            if (!fs.existsSync(this.getTargetPath(datasetId, "parquet")) || !this.store.getDownloadStatus(datasetId)?.status?.includes("completed")) {
                onProgress?.("Using HuggingFace datasets library to download...");
                const targetPath = this.getTargetPath(datasetId, "parquet");
                this.store.registerDownload(datasetId, targetPath, "downloading");
                try {
                    const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
                    const stats = fs.statSync(result);
                    this.completeDownload(datasetId, result, stats.size);
                    return result;
                }
                catch (e) {
                    this.failDownload(datasetId, e.message);
                    throw e;
                }
            }
            // NOTE(review): if the fallback condition above is false, control
            // falls through to the "not yet implemented" throw at the bottom
            // even for source === "huggingface" — confirm this is intended.
        }
        else if (source === "kaggle") {
            if (!this.hasKaggleCredentials()) {
                const errorMsg = this.getKaggleCredentialError();
                this.failDownload(datasetId, errorMsg);
                throw new Error(errorMsg);
            }
            const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
            this.store.registerDownload(datasetId, targetDir, "downloading");
            try {
                onProgress?.("Downloading from Kaggle...");
                const result = await this.kaggleSource.download(datasetId, targetDir);
                const stats = fs.statSync(result.local_path);
                this.completeDownload(datasetId, result.local_path, stats.size);
                onProgress?.("Kaggle download complete", 100);
                return result.local_path;
            }
            catch (e) {
                this.failDownload(datasetId, e.message);
                throw e;
            }
        }
        else if (source === "openml") {
            const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
            this.store.registerDownload(datasetId, targetDir, "downloading");
            try {
                onProgress?.("Downloading from OpenML...");
                const result = await this.openmlSource.download(datasetId, targetDir);
                const stats = fs.statSync(result.local_path);
                this.completeDownload(datasetId, result.local_path, stats.size);
                onProgress?.("OpenML download complete", 100);
                return result.local_path;
            }
            catch (e) {
                this.failDownload(datasetId, e.message);
                throw e;
            }
        }
        else if (source === "dataworld") {
            const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
            this.store.registerDownload(datasetId, targetDir, "downloading");
            try {
                onProgress?.("Downloading from data.world...");
                const result = await this.dataworldSource.download(datasetId, targetDir);
                const stats = fs.statSync(result.local_path);
                this.completeDownload(datasetId, result.local_path, stats.size);
                onProgress?.("data.world download complete", 100);
                return result.local_path;
            }
            catch (e) {
                this.failDownload(datasetId, e.message);
                throw e;
            }
        }
        throw new Error(`Download logic for ${source} not yet implemented`);
    }
    /**
     * Register a successful download
     * @param {string} datasetId
     * @param {string} actualPath - Final on-disk path of the data file.
     * @param {number} sizeBytes
     */
    completeDownload(datasetId, actualPath, sizeBytes) {
        this.store.registerDownload(datasetId, actualPath, 'completed', sizeBytes);
    }
    /**
     * Register a failed download
     * Preserves any previously recorded local_path on the status row.
     * @param {string} datasetId
     * @param {string} error - Failure message to persist.
     */
    failDownload(datasetId, error) {
        const existing = this.store.getDownloadStatus(datasetId);
        this.store.registerDownload(datasetId, existing?.local_path || "", 'failed', 0, error);
    }
    /**
     * Generates a safe local filename for a dataset ID
     * @param {string} datasetId
     * @param {string} [extension="parquet"] - Extension without leading dot.
     * @returns {string} Absolute path under rawDataDir.
     */
    getTargetPath(datasetId, extension = "parquet") {
        const safeId = this.toSafeDatasetPath(datasetId);
        return path.join(this.rawDataDir, `${safeId}.${extension}`);
    }
    /**
     * Fallback: Use Python `datasets` library to download a HuggingFace dataset
     * when no raw data files are found in the repo file listing.
     *
     * Spawns `hf_fallback.py` with a JSON payload argument and parses its JSON
     * stdout ({ ok, path, rows, columns } or { error }). Enforces a 10-minute
     * timeout by killing the child process.
     * @param {string} datasetId
     * @param {string} targetPath - Desired output parquet path.
     * @param {(msg: string, percent?: number) => void} [onProgress]
     * @returns {Promise<string>} Path reported by the Python script.
     * @throws {Error} On non-zero exit, unparsable output, or timeout.
     */
    async hfDatasetsFallback(datasetId, targetPath, onProgress) {
        // NOTE(review): on non-Windows this assumes a `python` executable on
        // PATH; many Linux distros only ship `python3` — confirm environment.
        const pyCmd = process.platform === "win32" ? "py" : "python";
        // Resolve the fallback script path
        const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
        const dataRoot = path.join(homeDir, ".vesper");
        // Checked in order: installed data root first, then project-relative
        // locations for development layouts.
        const scriptCandidates = [
            path.resolve(dataRoot, "python", "hf_fallback.py"),
            path.resolve(this.projectRoot, "python", "hf_fallback.py"),
            path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
            path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
        ];
        let scriptPath = scriptCandidates.find(p => fs.existsSync(p));
        if (!scriptPath) {
            scriptPath = scriptCandidates[0]; // Will fail with a clear error
        }
        const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
        const payload = {
            repo_id: datasetId,
            output_path: targetPath,
            token: token || null,
            max_rows: 500000,
        };
        onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
        return new Promise((resolve, reject) => {
            const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
                env: {
                    ...process.env,
                    PYTHONUTF8: "1",
                    PIP_DISABLE_PIP_VERSION_CHECK: "1",
                },
            });
            let stdout = "";
            let stderr = "";
            proc.stdout.on("data", (d) => (stdout += d.toString()));
            proc.stderr.on("data", (d) => {
                const msg = d.toString();
                stderr += msg;
                // Forward progress info
                if (msg.includes("Downloading") || msg.includes("Loading")) {
                    onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
                }
            });
            // Hard timeout: kill the child and reject after 10 minutes.
            const timer = setTimeout(() => {
                try {
                    proc.kill();
                }
                catch { /* no-op */ }
                reject(new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
            }, 600000); // 10 min timeout
            proc.on("close", (code) => {
                clearTimeout(timer);
                if (code !== 0) {
                    let errorMsg = stderr || stdout || `Python exited with code ${code}`;
                    // Prefer a structured error from the script's JSON stdout.
                    try {
                        const parsed = JSON.parse(stdout);
                        if (parsed.error)
                            errorMsg = parsed.error;
                    }
                    catch { /* use stderr */ }
                    reject(new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (!result.ok) {
                        reject(new Error(result.error || "Unknown error from HF fallback"));
                        return;
                    }
                    onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
                    resolve(result.path);
                }
                catch {
                    reject(new Error(`Failed to parse HF fallback output: ${stdout}`));
                }
            });
        });
    }
}
|
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
import fs from "fs";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import AdmZip from "adm-zip";
|
|
4
|
-
import { RobustDownloader } from "../utils/downloader.js";
|
|
5
|
-
export class KaggleDownloader {
|
|
6
|
-
// Kaggle account username (ctor arg, falling back to KAGGLE_USERNAME).
username;
// Kaggle API key (ctor arg, falling back to KAGGLE_KEY).
key;
// Resumable HTTP downloader shared by all download calls.
downloader;
/**
 * @param {string} [username] - Kaggle username; defaults to the
 *   KAGGLE_USERNAME environment variable, else "".
 * @param {string} [key] - Kaggle API key; defaults to the KAGGLE_KEY
 *   environment variable, else "".
 */
constructor(username, key) {
    this.username = username || process.env.KAGGLE_USERNAME || "";
    this.key = key || process.env.KAGGLE_KEY || "";
    this.downloader = new RobustDownloader();
}
|
|
14
|
-
/**
|
|
15
|
-
* Check if Kaggle credentials are available
|
|
16
|
-
*/
|
|
17
|
-
hasCredentials() {
|
|
18
|
-
return !!(this.username && this.key);
|
|
19
|
-
}
|
|
20
|
-
/**
|
|
21
|
-
* Get a helpful error message if credentials are missing
|
|
22
|
-
*/
|
|
23
|
-
getCredentialError() {
|
|
24
|
-
if (!this.username && !this.key) {
|
|
25
|
-
return "Kaggle credentials missing. Please set KAGGLE_USERNAME and KAGGLE_KEY environment variables.\n" +
|
|
26
|
-
"Tip: Get your API token from https://www.kaggle.com/settings -> API -> Create New Token\n" +
|
|
27
|
-
"Alternative: Download the dataset manually and use analyze_quality() on local files.";
|
|
28
|
-
}
|
|
29
|
-
if (!this.username) {
|
|
30
|
-
return "KAGGLE_USERNAME is missing. Please set it in your MCP config or environment variables.";
|
|
31
|
-
}
|
|
32
|
-
if (!this.key) {
|
|
33
|
-
return "KAGGLE_KEY is missing. Please set it in your MCP config or environment variables.";
|
|
34
|
-
}
|
|
35
|
-
return "";
|
|
36
|
-
}
|
|
37
|
-
/**
 * Downloads and extracts a Kaggle dataset
 * returns the path to the primary data file
 * @param {string} repoId - Kaggle dataset slug, e.g. "owner/dataset".
 * @param {string} targetDir - Directory to download and extract into
 *   (created if missing).
 * @param {(percent: number) => void} [onProgress] - Called with 0-100 while
 *   the ZIP archive is downloading.
 * @returns {Promise<string>} Path of the best extracted data file
 *   (parquet > csv > jsonl > json).
 * @throws {Error} When credentials are missing or no suitable data file is
 *   found in the archive.
 */
async download(repoId, targetDir, onProgress) {
    if (!this.hasCredentials()) {
        throw new Error(this.getCredentialError());
    }
    // Kaggle's v1 API uses HTTP Basic auth: base64("username:key").
    const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
    const url = `https://www.kaggle.com/api/v1/datasets/download/${repoId}`;
    // Ensure target directory exists
    if (!fs.existsSync(targetDir)) {
        fs.mkdirSync(targetDir, { recursive: true });
    }
    // The API always returns a ZIP archive.
    const zipPath = path.join(targetDir, "data.zip");
    await this.downloader.download(url, zipPath, {
        headers: {
            'Authorization': `Basic ${auth}`
        },
        resume: true,
        onProgress: (bytes, total) => {
            // Only report when the total size is known.
            if (total > 0 && onProgress) {
                onProgress(Math.round((bytes / total) * 100));
            }
        }
    });
    // Unzip (overwriting any existing files in targetDir).
    const zip = new AdmZip(zipPath);
    zip.extractAllTo(targetDir, true);
    // Find best file: first pattern with a match wins; the archive itself
    // is excluded explicitly.
    const extractedFiles = this.getAllFiles(targetDir);
    const priorities = [
        /.*\.parquet$/i,
        /.*\.csv$/i,
        /.*\.jsonl$/i,
        /.*\.json$/i
    ];
    let bestFile = null;
    for (const pattern of priorities) {
        const match = extractedFiles.find(f => pattern.test(f) && !f.endsWith(".zip"));
        if (match) {
            bestFile = match;
            break;
        }
    }
    // Cleanup ZIP
    fs.unlinkSync(zipPath);
    if (!bestFile) {
        throw new Error("No suitable data files found in Kaggle archive");
    }
    return bestFile;
}
|
|
89
|
-
getAllFiles(dir, allFiles = []) {
|
|
90
|
-
const files = fs.readdirSync(dir);
|
|
91
|
-
files.forEach(file => {
|
|
92
|
-
const name = path.join(dir, file);
|
|
93
|
-
if (fs.statSync(name).isDirectory()) {
|
|
94
|
-
this.getAllFiles(name, allFiles);
|
|
95
|
-
}
|
|
96
|
-
else {
|
|
97
|
-
allFiles.push(name);
|
|
98
|
-
}
|
|
99
|
-
});
|
|
100
|
-
return allFiles;
|
|
101
|
-
}
|
|
102
|
-
}
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import fs from "fs";
|
|
2
|
-
import path from "path";
|
|
3
|
-
export class InstallService {
|
|
4
|
-
projectRoot;
|
|
5
|
-
metadataStore;
|
|
6
|
-
constructor(projectRoot, metadataStore) {
|
|
7
|
-
this.projectRoot = projectRoot;
|
|
8
|
-
this.metadataStore = metadataStore;
|
|
9
|
-
}
|
|
10
|
-
/**
|
|
11
|
-
* Installs a prepared dataset file into the ./datasets directory
|
|
12
|
-
* @param datasetId The ID of the dataset
|
|
13
|
-
* @param sourcePath The current location of the processed file
|
|
14
|
-
* @returns The absolute path to the installed file
|
|
15
|
-
*/
|
|
16
|
-
async install(datasetId, sourcePath, targetDir) {
|
|
17
|
-
if (!fs.existsSync(sourcePath)) {
|
|
18
|
-
throw new Error(`Source file not found for installation: ${sourcePath}`);
|
|
19
|
-
}
|
|
20
|
-
const dataset = this.metadataStore.getDataset(datasetId);
|
|
21
|
-
// Create target directory
|
|
22
|
-
const installLabel = dataset?.name || datasetId;
|
|
23
|
-
const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
|
|
24
|
-
// If caller specified a target dir, use it directly
|
|
25
|
-
// Otherwise use the current working directory
|
|
26
|
-
const installDir = targetDir
|
|
27
|
-
? path.resolve(targetDir)
|
|
28
|
-
: path.resolve(process.cwd(), sanitizedName);
|
|
29
|
-
console.error(`[InstallService] Resolved install directory: ${installDir}`);
|
|
30
|
-
if (!fs.existsSync(installDir)) {
|
|
31
|
-
fs.mkdirSync(installDir, { recursive: true });
|
|
32
|
-
}
|
|
33
|
-
const extension = path.extname(sourcePath);
|
|
34
|
-
const targetFilename = `${sanitizedName}${extension}`;
|
|
35
|
-
const targetPath = path.join(installDir, targetFilename);
|
|
36
|
-
// Copy file
|
|
37
|
-
fs.copyFileSync(sourcePath, targetPath);
|
|
38
|
-
// Update metadata
|
|
39
|
-
const absolutePath = path.resolve(targetPath);
|
|
40
|
-
if (dataset) {
|
|
41
|
-
this.metadataStore.updateInstallPath(datasetId, absolutePath);
|
|
42
|
-
}
|
|
43
|
-
console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
|
|
44
|
-
return absolutePath;
|
|
45
|
-
}
|
|
46
|
-
}
|