vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,171 +0,0 @@
1
- import { listFiles } from "@huggingface/hub";
2
- import fs from "fs";
3
- import path from "path";
4
- import { RobustDownloader } from "../utils/downloader.js";
5
- export class HFDownloader {
6
- hfToken;
7
- downloader;
8
- constructor(token) {
9
- this.hfToken = token || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
10
- this.downloader = new RobustDownloader();
11
- }
12
- getToken() {
13
- return this.hfToken || process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
14
- }
15
- /**
16
- * Finds the most suitable data file in a repository
17
- * Returns the relative path within the repo
18
- */
19
- async findBestFile(repoId) {
20
- try {
21
- const token = this.getToken();
22
- const files = [];
23
- const metadataFiles = [];
24
- const blacklist = [
25
- ".gitattributes",
26
- ".gitignore",
27
- ".git",
28
- "README.md",
29
- "LICENSE",
30
- "package.json",
31
- "requirements.txt",
32
- "setup.py"
33
- ];
34
- const metadataNamePatterns = [
35
- /^dataset_infos?\.json$/i,
36
- /^dataset_dict\.json$/i,
37
- /^state\.json$/i,
38
- /^config\.json$/i,
39
- /^metadata\.json$/i,
40
- /^stats\.json$/i,
41
- /^index\.json$/i
42
- ];
43
- for await (const file of listFiles({
44
- repo: { type: "dataset", name: repoId },
45
- recursive: true,
46
- ...(token ? { accessToken: token } : {})
47
- })) {
48
- if (file.type === "file") {
49
- const fileName = path.basename(file.path);
50
- const isMetadataJson = metadataNamePatterns.some(p => p.test(fileName));
51
- if (isMetadataJson) {
52
- metadataFiles.push(file.path);
53
- }
54
- if (!blacklist.includes(fileName) && !fileName.startsWith(".") && !isMetadataJson) {
55
- files.push(file.path);
56
- }
57
- }
58
- }
59
- // Priority logic for data scientists
60
- const priorities = [
61
- /train.*\.parquet$/i,
62
- /data.*\.parquet$/i,
63
- /.*\.parquet$/i,
64
- /train.*\.csv$/i,
65
- /data.*\.csv$/i,
66
- /.*\.csv$/i,
67
- /train.*\.tsv$/i,
68
- /data.*\.tsv$/i,
69
- /.*\.tsv$/i,
70
- /train.*\.txt$/i,
71
- /data.*\.txt$/i,
72
- /.*\.txt$/i,
73
- /.*\.jsonl$/i,
74
- /.*\.ndjson$/i,
75
- // Keep plain JSON as lowest priority to avoid selecting metadata-like files.
76
- /.*\.json$/i
77
- ];
78
- for (const pattern of priorities) {
79
- const match = files.find(f => pattern.test(f));
80
- if (match)
81
- return match;
82
- }
83
- // Strict fallback: Only return the first file if it has a data-like extension
84
- const dataExtensions = [".csv", ".parquet", ".jsonl", ".ndjson", ".tsv", ".txt", ".json", ".avro", ".orc"];
85
- const fallback = files.find(f => {
86
- const ext = path.extname(f).toLowerCase();
87
- return dataExtensions.includes(ext);
88
- });
89
- if (fallback)
90
- return fallback;
91
- // Last-resort: allow dataset metadata file, then resolve external raw URLs later.
92
- const metadataFallback = metadataFiles.find(f => /dataset_infos?\.json$/i.test(path.basename(f)));
93
- return metadataFallback || null;
94
- }
95
- catch (error) {
96
- const msg = String(error?.message || error);
97
- if (msg.includes("401") || msg.toLowerCase().includes("unauthorized")) {
98
- throw new Error(`Authentication required for dataset '${repoId}'. ` +
99
- `This dataset may be gated or private. ` +
100
- `Use the configure_keys tool to set your HF_TOKEN, then retry.`);
101
- }
102
- if (msg.includes("403") || msg.toLowerCase().includes("forbidden")) {
103
- throw new Error(`Access denied for dataset '${repoId}'. ` +
104
- `You may need to accept the dataset's usage agreement on huggingface.co, ` +
105
- `then set HF_TOKEN via configure_keys tool.`);
106
- }
107
- if (msg.includes("404") || msg.toLowerCase().includes("not found")) {
108
- throw new Error(`Dataset '${repoId}' not found on HuggingFace. Check the dataset ID.`);
109
- }
110
- console.error(`[HF] Failed to list files for ${repoId}:`, msg);
111
- return null;
112
- }
113
- }
114
- /**
115
- * Downloads a file from HF to local path
116
- */
117
- async download(repoId, filePath, targetPath, onProgress) {
118
- const token = this.getToken();
119
- const url = `https://huggingface.co/datasets/${repoId}/resolve/main/${filePath}`;
120
- await this.downloader.download(url, targetPath, {
121
- headers: token ? { 'Authorization': `Bearer ${token}` } : {},
122
- resume: true,
123
- onProgress: (bytes, total) => {
124
- if (total > 0 && onProgress) {
125
- onProgress(Math.round((bytes / total) * 100));
126
- }
127
- }
128
- });
129
- }
130
- /**
131
- * If downloaded file is dataset metadata (dataset_infos.json), resolve and download a real data URL.
132
- * Returns the actual local data path to use.
133
- */
134
- async resolveExternalDataFromMetadata(localPath, onProgress) {
135
- const ext = path.extname(localPath).toLowerCase();
136
- if (ext !== ".json") {
137
- return localPath;
138
- }
139
- try {
140
- const raw = fs.readFileSync(localPath, "utf-8");
141
- const parsed = JSON.parse(raw);
142
- const firstConfig = parsed?.default || Object.values(parsed || {})[0];
143
- const checksums = firstConfig?.download_checksums;
144
- if (!checksums || typeof checksums !== "object") {
145
- return localPath;
146
- }
147
- const candidateUrls = Object.keys(checksums).filter((u) => /^https?:\/\//i.test(u));
148
- if (candidateUrls.length === 0) {
149
- return localPath;
150
- }
151
- const preferred = candidateUrls.find(u => /train|data/i.test(path.basename(u))) || candidateUrls[0];
152
- const ext = path.extname(preferred).toLowerCase() || ".csv";
153
- const resolvedPath = localPath.replace(/\.json$/i, ext);
154
- await this.downloader.download(preferred, resolvedPath, {
155
- resume: true,
156
- onProgress: (bytes, total) => {
157
- if (total > 0 && onProgress) {
158
- onProgress(Math.round((bytes / total) * 100));
159
- }
160
- }
161
- });
162
- if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).size > 0) {
163
- return resolvedPath;
164
- }
165
- return localPath;
166
- }
167
- catch {
168
- return localPath;
169
- }
170
- }
171
- }
@@ -1,271 +0,0 @@
1
- import path from "path";
2
- import fs from "fs";
3
- import { spawn } from "child_process";
4
- import { HFDownloader } from "./hf-downloader.js";
5
- import { KaggleSource } from "../metadata/kaggle-source.js";
6
- import { OpenMLSource } from "../metadata/openml-source.js";
7
- import { DataWorldSource } from "../metadata/dataworld-source.js";
8
- import { SecureKeysManager } from "../config/secure-keys.js";
9
- export class DataIngestor {
10
- projectRoot;
11
- store;
12
- rawDataDir;
13
- hfDownloader;
14
- kaggleSource;
15
- openmlSource;
16
- dataworldSource;
17
- secureKeys;
18
- constructor(projectRoot, store) {
19
- this.projectRoot = projectRoot;
20
- this.store = store;
21
- this.rawDataDir = path.join(this.projectRoot, "data", "raw");
22
- if (!fs.existsSync(this.rawDataDir)) {
23
- fs.mkdirSync(this.rawDataDir, { recursive: true });
24
- }
25
- this.hfDownloader = new HFDownloader();
26
- this.kaggleSource = new KaggleSource();
27
- this.openmlSource = new OpenMLSource();
28
- this.dataworldSource = new DataWorldSource();
29
- this.secureKeys = new SecureKeysManager();
30
- }
31
- /**
32
- * Check if Kaggle credentials are available
33
- */
34
- hasKaggleCredentials() {
35
- if (process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY)
36
- return true;
37
- const keys = this.secureKeys.getAll();
38
- if (keys.kaggle_username && keys.kaggle_key)
39
- return true;
40
- const kaggleJsonPath = path.join(process.env.HOME || process.env.USERPROFILE || "", ".kaggle", "kaggle.json");
41
- return !!(kaggleJsonPath && fs.existsSync(kaggleJsonPath));
42
- }
43
- /**
44
- * Get helpful error message if Kaggle credentials are missing
45
- */
46
- getKaggleCredentialError() {
47
- return "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds), or provide ~/.kaggle/kaggle.json.";
48
- }
49
- toSafeDatasetPath(datasetId) {
50
- return datasetId.replace(/[:\/]/g, "_");
51
- }
52
- /**
53
- * Ensures a dataset is available locally
54
- */
55
- async ensureData(datasetId, source, onProgress) {
56
- // 1. Check database for existing download
57
- const status = this.store.getDownloadStatus(datasetId);
58
- if (status && status.status === 'completed' && fs.existsSync(status.local_path)) {
59
- return status.local_path;
60
- }
61
- if (status && status.status === 'downloading') {
62
- console.error(`[Ingestor] Dataset ${datasetId} status is 'downloading'. Attempting to resume/concurrently monitor.`);
63
- // In a better system we'd use a lock, but for now we let it resume
64
- // the RobustDownloader handles the actual file locking/range logic.
65
- }
66
- // 2. Trigger source-specific download
67
- if (source === "huggingface") {
68
- onProgress?.("Discovering data files on HuggingFace Hub...");
69
- const remotePath = await this.hfDownloader.findBestFile(datasetId);
70
- if (remotePath) {
71
- // Direct file download path (repo has raw data files)
72
- const ext = path.extname(remotePath).substring(1).toLowerCase() || "csv";
73
- const targetPath = this.getTargetPath(datasetId, ext);
74
- this.store.registerDownload(datasetId, targetPath, "downloading");
75
- try {
76
- await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
77
- onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
78
- });
79
- const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
80
- onProgress?.("Resolving external dataset file...", progress);
81
- });
82
- const stats = fs.statSync(resolvedPath);
83
- this.completeDownload(datasetId, resolvedPath, stats.size);
84
- return resolvedPath;
85
- }
86
- catch (e) {
87
- const msg = String(e?.message || e);
88
- // If auth error, propagate immediately with helpful message
89
- if (msg.includes("401") || msg.includes("403") || msg.includes("Authentication") || msg.includes("Access denied")) {
90
- this.failDownload(datasetId, msg);
91
- throw e;
92
- }
93
- // For other download errors, try the fallback
94
- onProgress?.(`Direct download failed (${msg}), trying datasets library fallback...`);
95
- }
96
- }
97
- // Fallback: Use Python datasets library to download and convert
98
- // This runs when findBestFile returns null OR when direct download fails (non-auth)
99
- if (!fs.existsSync(this.getTargetPath(datasetId, "parquet")) || !this.store.getDownloadStatus(datasetId)?.status?.includes("completed")) {
100
- onProgress?.("Using HuggingFace datasets library to download...");
101
- const targetPath = this.getTargetPath(datasetId, "parquet");
102
- this.store.registerDownload(datasetId, targetPath, "downloading");
103
- try {
104
- const result = await this.hfDatasetsFallback(datasetId, targetPath, onProgress);
105
- const stats = fs.statSync(result);
106
- this.completeDownload(datasetId, result, stats.size);
107
- return result;
108
- }
109
- catch (e) {
110
- this.failDownload(datasetId, e.message);
111
- throw e;
112
- }
113
- }
114
- }
115
- else if (source === "kaggle") {
116
- if (!this.hasKaggleCredentials()) {
117
- const errorMsg = this.getKaggleCredentialError();
118
- this.failDownload(datasetId, errorMsg);
119
- throw new Error(errorMsg);
120
- }
121
- const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
122
- this.store.registerDownload(datasetId, targetDir, "downloading");
123
- try {
124
- onProgress?.("Downloading from Kaggle...");
125
- const result = await this.kaggleSource.download(datasetId, targetDir);
126
- const stats = fs.statSync(result.local_path);
127
- this.completeDownload(datasetId, result.local_path, stats.size);
128
- onProgress?.("Kaggle download complete", 100);
129
- return result.local_path;
130
- }
131
- catch (e) {
132
- this.failDownload(datasetId, e.message);
133
- throw e;
134
- }
135
- }
136
- else if (source === "openml") {
137
- const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
138
- this.store.registerDownload(datasetId, targetDir, "downloading");
139
- try {
140
- onProgress?.("Downloading from OpenML...");
141
- const result = await this.openmlSource.download(datasetId, targetDir);
142
- const stats = fs.statSync(result.local_path);
143
- this.completeDownload(datasetId, result.local_path, stats.size);
144
- onProgress?.("OpenML download complete", 100);
145
- return result.local_path;
146
- }
147
- catch (e) {
148
- this.failDownload(datasetId, e.message);
149
- throw e;
150
- }
151
- }
152
- else if (source === "dataworld") {
153
- const targetDir = path.join(this.rawDataDir, this.toSafeDatasetPath(datasetId));
154
- this.store.registerDownload(datasetId, targetDir, "downloading");
155
- try {
156
- onProgress?.("Downloading from data.world...");
157
- const result = await this.dataworldSource.download(datasetId, targetDir);
158
- const stats = fs.statSync(result.local_path);
159
- this.completeDownload(datasetId, result.local_path, stats.size);
160
- onProgress?.("data.world download complete", 100);
161
- return result.local_path;
162
- }
163
- catch (e) {
164
- this.failDownload(datasetId, e.message);
165
- throw e;
166
- }
167
- }
168
- throw new Error(`Download logic for ${source} not yet implemented`);
169
- }
170
- /**
171
- * Register a successful download
172
- */
173
- completeDownload(datasetId, actualPath, sizeBytes) {
174
- this.store.registerDownload(datasetId, actualPath, 'completed', sizeBytes);
175
- }
176
- /**
177
- * Register a failed download
178
- */
179
- failDownload(datasetId, error) {
180
- const existing = this.store.getDownloadStatus(datasetId);
181
- this.store.registerDownload(datasetId, existing?.local_path || "", 'failed', 0, error);
182
- }
183
- /**
184
- * Generates a safe local filename for a dataset ID
185
- */
186
- getTargetPath(datasetId, extension = "parquet") {
187
- const safeId = this.toSafeDatasetPath(datasetId);
188
- return path.join(this.rawDataDir, `${safeId}.${extension}`);
189
- }
190
- /**
191
- * Fallback: Use Python `datasets` library to download a HuggingFace dataset
192
- * when no raw data files are found in the repo file listing.
193
- */
194
- async hfDatasetsFallback(datasetId, targetPath, onProgress) {
195
- const pyCmd = process.platform === "win32" ? "py" : "python";
196
- // Resolve the fallback script path
197
- const homeDir = process.env.HOME || process.env.USERPROFILE || this.projectRoot;
198
- const dataRoot = path.join(homeDir, ".vesper");
199
- const scriptCandidates = [
200
- path.resolve(dataRoot, "python", "hf_fallback.py"),
201
- path.resolve(this.projectRoot, "python", "hf_fallback.py"),
202
- path.resolve(this.projectRoot, "..", "src", "python", "hf_fallback.py"),
203
- path.resolve(this.projectRoot, "..", "python", "hf_fallback.py"),
204
- ];
205
- let scriptPath = scriptCandidates.find(p => fs.existsSync(p));
206
- if (!scriptPath) {
207
- scriptPath = scriptCandidates[0]; // Will fail with a clear error
208
- }
209
- const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN || undefined;
210
- const payload = {
211
- repo_id: datasetId,
212
- output_path: targetPath,
213
- token: token || null,
214
- max_rows: 500000,
215
- };
216
- onProgress?.("Downloading via datasets library (this may take a moment)...", 30);
217
- return new Promise((resolve, reject) => {
218
- const proc = spawn(pyCmd, [scriptPath, JSON.stringify(payload)], {
219
- env: {
220
- ...process.env,
221
- PYTHONUTF8: "1",
222
- PIP_DISABLE_PIP_VERSION_CHECK: "1",
223
- },
224
- });
225
- let stdout = "";
226
- let stderr = "";
227
- proc.stdout.on("data", (d) => (stdout += d.toString()));
228
- proc.stderr.on("data", (d) => {
229
- const msg = d.toString();
230
- stderr += msg;
231
- // Forward progress info
232
- if (msg.includes("Downloading") || msg.includes("Loading")) {
233
- onProgress?.(msg.trim().split("\n").pop() || "Downloading...", 50);
234
- }
235
- });
236
- const timer = setTimeout(() => {
237
- try {
238
- proc.kill();
239
- }
240
- catch { /* no-op */ }
241
- reject(new Error(`HuggingFace datasets download timed out after 10 minutes for ${datasetId}`));
242
- }, 600000); // 10 min timeout
243
- proc.on("close", (code) => {
244
- clearTimeout(timer);
245
- if (code !== 0) {
246
- let errorMsg = stderr || stdout || `Python exited with code ${code}`;
247
- try {
248
- const parsed = JSON.parse(stdout);
249
- if (parsed.error)
250
- errorMsg = parsed.error;
251
- }
252
- catch { /* use stderr */ }
253
- reject(new Error(`HuggingFace datasets fallback failed: ${errorMsg}`));
254
- return;
255
- }
256
- try {
257
- const result = JSON.parse(stdout);
258
- if (!result.ok) {
259
- reject(new Error(result.error || "Unknown error from HF fallback"));
260
- return;
261
- }
262
- onProgress?.(`Downloaded ${result.rows?.toLocaleString() || "?"} rows (${result.columns?.length || "?"} columns)`, 90);
263
- resolve(result.path);
264
- }
265
- catch {
266
- reject(new Error(`Failed to parse HF fallback output: ${stdout}`));
267
- }
268
- });
269
- });
270
- }
271
- }
@@ -1,102 +0,0 @@
1
- import fs from "fs";
2
- import path from "path";
3
- import AdmZip from "adm-zip";
4
- import { RobustDownloader } from "../utils/downloader.js";
5
- export class KaggleDownloader {
6
- username;
7
- key;
8
- downloader;
9
- constructor(username, key) {
10
- this.username = username || process.env.KAGGLE_USERNAME || "";
11
- this.key = key || process.env.KAGGLE_KEY || "";
12
- this.downloader = new RobustDownloader();
13
- }
14
- /**
15
- * Check if Kaggle credentials are available
16
- */
17
- hasCredentials() {
18
- return !!(this.username && this.key);
19
- }
20
- /**
21
- * Get a helpful error message if credentials are missing
22
- */
23
- getCredentialError() {
24
- if (!this.username && !this.key) {
25
- return "Kaggle credentials missing. Please set KAGGLE_USERNAME and KAGGLE_KEY environment variables.\n" +
26
- "Tip: Get your API token from https://www.kaggle.com/settings -> API -> Create New Token\n" +
27
- "Alternative: Download the dataset manually and use analyze_quality() on local files.";
28
- }
29
- if (!this.username) {
30
- return "KAGGLE_USERNAME is missing. Please set it in your MCP config or environment variables.";
31
- }
32
- if (!this.key) {
33
- return "KAGGLE_KEY is missing. Please set it in your MCP config or environment variables.";
34
- }
35
- return "";
36
- }
37
- /**
38
- * Downloads and extracts a Kaggle dataset
39
- * returns the path to the primary data file
40
- */
41
- async download(repoId, targetDir, onProgress) {
42
- if (!this.hasCredentials()) {
43
- throw new Error(this.getCredentialError());
44
- }
45
- const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
46
- const url = `https://www.kaggle.com/api/v1/datasets/download/${repoId}`;
47
- // Ensure target directory exists
48
- if (!fs.existsSync(targetDir)) {
49
- fs.mkdirSync(targetDir, { recursive: true });
50
- }
51
- const zipPath = path.join(targetDir, "data.zip");
52
- await this.downloader.download(url, zipPath, {
53
- headers: {
54
- 'Authorization': `Basic ${auth}`
55
- },
56
- resume: true,
57
- onProgress: (bytes, total) => {
58
- if (total > 0 && onProgress) {
59
- onProgress(Math.round((bytes / total) * 100));
60
- }
61
- }
62
- });
63
- // Unzip
64
- const zip = new AdmZip(zipPath);
65
- zip.extractAllTo(targetDir, true);
66
- // Find best file
67
- const extractedFiles = this.getAllFiles(targetDir);
68
- const priorities = [
69
- /.*\.parquet$/i,
70
- /.*\.csv$/i,
71
- /.*\.jsonl$/i,
72
- /.*\.json$/i
73
- ];
74
- let bestFile = null;
75
- for (const pattern of priorities) {
76
- const match = extractedFiles.find(f => pattern.test(f) && !f.endsWith(".zip"));
77
- if (match) {
78
- bestFile = match;
79
- break;
80
- }
81
- }
82
- // Cleanup ZIP
83
- fs.unlinkSync(zipPath);
84
- if (!bestFile) {
85
- throw new Error("No suitable data files found in Kaggle archive");
86
- }
87
- return bestFile;
88
- }
89
- getAllFiles(dir, allFiles = []) {
90
- const files = fs.readdirSync(dir);
91
- files.forEach(file => {
92
- const name = path.join(dir, file);
93
- if (fs.statSync(name).isDirectory()) {
94
- this.getAllFiles(name, allFiles);
95
- }
96
- else {
97
- allFiles.push(name);
98
- }
99
- });
100
- return allFiles;
101
- }
102
- }
@@ -1,46 +0,0 @@
1
- import fs from "fs";
2
- import path from "path";
3
- export class InstallService {
4
- projectRoot;
5
- metadataStore;
6
- constructor(projectRoot, metadataStore) {
7
- this.projectRoot = projectRoot;
8
- this.metadataStore = metadataStore;
9
- }
10
- /**
11
- * Installs a prepared dataset file into the ./datasets directory
12
- * @param datasetId The ID of the dataset
13
- * @param sourcePath The current location of the processed file
14
- * @returns The absolute path to the installed file
15
- */
16
- async install(datasetId, sourcePath, targetDir) {
17
- if (!fs.existsSync(sourcePath)) {
18
- throw new Error(`Source file not found for installation: ${sourcePath}`);
19
- }
20
- const dataset = this.metadataStore.getDataset(datasetId);
21
- // Create target directory
22
- const installLabel = dataset?.name || datasetId;
23
- const sanitizedName = installLabel.replace(/[^a-z0-9]/gi, "_").toLowerCase();
24
- // If caller specified a target dir, use it directly
25
- // Otherwise use the current working directory
26
- const installDir = targetDir
27
- ? path.resolve(targetDir)
28
- : path.resolve(process.cwd(), sanitizedName);
29
- console.error(`[InstallService] Resolved install directory: ${installDir}`);
30
- if (!fs.existsSync(installDir)) {
31
- fs.mkdirSync(installDir, { recursive: true });
32
- }
33
- const extension = path.extname(sourcePath);
34
- const targetFilename = `${sanitizedName}${extension}`;
35
- const targetPath = path.join(installDir, targetFilename);
36
- // Copy file
37
- fs.copyFileSync(sourcePath, targetPath);
38
- // Update metadata
39
- const absolutePath = path.resolve(targetPath);
40
- if (dataset) {
41
- this.metadataStore.updateInstallPath(datasetId, absolutePath);
42
- }
43
- console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
44
- return absolutePath;
45
- }
46
- }