vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,340 +0,0 @@
1
- import Database from "better-sqlite3";
2
- import path from "path";
3
- import fs from "fs";
4
export class MetadataStore {
    /**
     * Persistence layer over a better-sqlite3 database holding dataset
     * metadata, background jobs (active + archived), and local download
     * records. All methods are synchronous (better-sqlite3 style).
     */
    db;

    /**
     * Open (creating if necessary) the SQLite database at dbPath, then run
     * schema creation and column migrations.
     */
    constructor(dbPath) {
        // Ensure data directory exists before SQLite tries to create the file.
        const dir = path.dirname(dbPath);
        if (!fs.existsSync(dir)) {
            fs.mkdirSync(dir, { recursive: true });
        }
        this.db = new Database(dbPath);
        this.init();
        this.migrate();
    }

    /**
     * Add the install_path column to legacy databases that predate it.
     * Fix: guard with PRAGMA table_info (same approach init() uses) instead
     * of firing a blind ALTER TABLE and swallowing the "duplicate column"
     * error — the old catch also hid real failures such as a locked DB.
     */
    migrate() {
        const existing = new Set(this.db
            .prepare("PRAGMA table_info(datasets)")
            .all()
            .map(col => col.name));
        if (!existing.has("install_path")) {
            this.db.exec("ALTER TABLE datasets ADD COLUMN install_path TEXT");
            console.error("[MetadataStore] Migrated: Added install_path column");
        }
    }

    /**
     * Create all tables if absent, apply column migrations to pre-existing
     * tables, and (re)create indexes.
     */
    init() {
        // Create table with all columns if it doesn't exist
        this.db.exec(`
      CREATE TABLE IF NOT EXISTS datasets (
        id TEXT PRIMARY KEY,
        source TEXT, -- 'huggingface', 'kaggle', 'uci', etc.
        name TEXT,
        description TEXT,
        downloads INTEGER,
        likes INTEGER,
        stars INTEGER,
        tags TEXT, -- JSON string
        license_id TEXT,
        license_category TEXT,
        quality_score INTEGER,
        has_train_split BOOLEAN,
        total_examples INTEGER,
        total_size_mb REAL,
        is_safe_source BOOLEAN,
        is_structured BOOLEAN,
        last_updated TEXT,
        quality_warnings TEXT, -- JSON array string
        metadata_json TEXT, -- Full metadata as JSON
        install_path TEXT -- Path to locally installed data
      );

      CREATE TABLE IF NOT EXISTS jobs (
        id TEXT PRIMARY KEY,
        type TEXT,
        status TEXT,
        priority INTEGER DEFAULT 0,
        progress INTEGER DEFAULT 0,
        status_text TEXT,
        result_url TEXT,
        error TEXT,
        attempts INTEGER DEFAULT 0,
        max_attempts INTEGER DEFAULT 3,
        created_at TEXT,
        updated_at TEXT,
        metadata TEXT
      );

      CREATE TABLE IF NOT EXISTS local_files (
        id TEXT PRIMARY KEY,
        local_path TEXT,
        status TEXT, -- 'downloading', 'completed', 'failed'
        size_bytes INTEGER,
        last_checked TEXT,
        error TEXT
      );

      CREATE TABLE IF NOT EXISTS jobs_archive (
        id TEXT PRIMARY KEY,
        type TEXT,
        status TEXT,
        priority INTEGER DEFAULT 0,
        progress INTEGER DEFAULT 0,
        status_text TEXT,
        result_url TEXT,
        error TEXT,
        attempts INTEGER DEFAULT 0,
        max_attempts INTEGER DEFAULT 3,
        created_at TEXT,
        updated_at TEXT,
        metadata TEXT
      );
    `);
        // Migrate existing tables: add new columns if they don't exist.
        const tableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
        const existingColumns = new Set(tableInfo.map(col => col.name));
        const migrations = [
            { column: "stars", type: "INTEGER DEFAULT 0" },
            { column: "total_examples", type: "INTEGER" },
            { column: "total_size_mb", type: "REAL" },
            { column: "is_safe_source", type: "BOOLEAN DEFAULT 1" },
            { column: "is_structured", type: "BOOLEAN DEFAULT 0" },
            { column: "domain", type: "TEXT" },
            { column: "source", type: "TEXT DEFAULT 'huggingface'" },
            { column: "quality_warnings", type: "TEXT" }
        ];
        for (const migration of migrations) {
            if (!existingColumns.has(migration.column)) {
                console.error(`[MetadataStore] Migrating: adding column ${migration.column}`);
                this.db.exec(`ALTER TABLE datasets ADD COLUMN ${migration.column} ${migration.type}`);
            }
        }
        // Job migrations
        const jobTableInfo = this.db.prepare("PRAGMA table_info(jobs)").all();
        const existingJobColumns = new Set(jobTableInfo.map(col => col.name));
        const jobMigrations = [
            { column: "priority", type: "INTEGER DEFAULT 0" },
            { column: "attempts", type: "INTEGER DEFAULT 0" },
            { column: "max_attempts", type: "INTEGER DEFAULT 3" }
        ];
        for (const migration of jobMigrations) {
            if (!existingJobColumns.has(migration.column)) {
                console.error(`[MetadataStore] Migrating Job: adding column ${migration.column}`);
                this.db.exec(`ALTER TABLE jobs ADD COLUMN ${migration.column} ${migration.type}`);
            }
        }
        // Get updated column list after migrations (index targets may be new).
        const updatedTableInfo = this.db.prepare("PRAGMA table_info(datasets)").all();
        const allColumns = new Set(updatedTableInfo.map(col => col.name));
        // Create indexes (only if columns exist)
        const indexes = [
            { name: "idx_license_category", table: "datasets", column: "license_category" },
            { name: "idx_quality_score", table: "datasets", column: "quality_score" },
            { name: "idx_downloads", table: "datasets", column: "downloads" },
            { name: "idx_has_train_split", table: "datasets", column: "has_train_split" },
            { name: "idx_is_safe_source", table: "datasets", column: "is_safe_source" },
            { name: "idx_is_structured", table: "datasets", column: "is_structured" },
            { name: "idx_domain", table: "datasets", column: "domain" },
            { name: "idx_jobs_status", table: "jobs", column: "status" },
            { name: "idx_jobs_created", table: "jobs", column: "created_at" },
            { name: "idx_jobs_archive_created", table: "jobs_archive", column: "created_at" }
        ];
        for (const idx of indexes) {
            // For datasets table, check if column exists first
            if (idx.table === "datasets" && !allColumns.has(idx.column))
                continue;
            try {
                this.db.exec(`CREATE INDEX IF NOT EXISTS ${idx.name} ON ${idx.table}(${idx.column})`);
            }
            catch (e) {
                // Ignore index errors: best-effort; a failed index never blocks startup.
            }
        }
    }

    /**
     * Upsert a dataset row. If the incoming record is flagged incomplete and
     * a complete record already exists, only the volatile stats
     * (downloads/likes/stars/last_updated) are refreshed.
     */
    saveDataset(dataset) {
        if (dataset.is_incomplete) {
            const existing = this.getDataset(dataset.id);
            if (existing && !existing.is_incomplete) {
                // Already have better data, only update stats
                const updateStats = this.db.prepare(`
        UPDATE datasets SET
          downloads = ?,
          likes = ?,
          stars = ?,
          last_updated = ?
        WHERE id = ?
      `);
                updateStats.run(dataset.downloads, dataset.likes, dataset.stars || 0, dataset.last_updated, dataset.id);
                return;
            }
        }
        const stmt = this.db.prepare(`
      INSERT INTO datasets (
        id, source, name, description, downloads, likes, stars, tags,
        license_id, license_category, quality_score,
        has_train_split, total_examples, total_size_mb,
        is_safe_source, is_structured, last_updated,
        quality_warnings, metadata_json, install_path
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        source=excluded.source,
        name=excluded.name,
        description=excluded.description,
        downloads=excluded.downloads,
        likes=excluded.likes,
        stars=excluded.stars,
        tags=excluded.tags,
        license_id=excluded.license_id,
        license_category=excluded.license_category,
        quality_score=excluded.quality_score,
        has_train_split=excluded.has_train_split,
        total_examples=excluded.total_examples,
        total_size_mb=excluded.total_size_mb,
        is_safe_source=excluded.is_safe_source,
        is_structured=excluded.is_structured,
        last_updated=excluded.last_updated,
        quality_warnings=excluded.quality_warnings,
        metadata_json=excluded.metadata_json,
        install_path=excluded.install_path
    `);
        // Booleans are stored as 0/1; tags/warnings/full metadata as JSON text.
        stmt.run(dataset.id, dataset.source, dataset.name, dataset.description, dataset.downloads, dataset.likes, dataset.stars || 0, JSON.stringify(dataset.tags), dataset.license.id, dataset.license.category, dataset.quality_score, dataset.has_train_split ? 1 : 0, dataset.total_examples, dataset.total_size_mb || null, dataset.is_safe_source ? 1 : 0, dataset.is_structured ? 1 : 0, dataset.last_updated, JSON.stringify(dataset.quality_warnings || []), JSON.stringify(dataset), dataset.install_path || null);
    }

    /** Record where a dataset was installed locally. */
    updateInstallPath(id, path) {
        this.db.prepare("UPDATE datasets SET install_path = ? WHERE id = ?").run(path, id);
    }

    /**
     * Fetch a dataset by id, rehydrated from its stored JSON blob with the
     * live install_path column merged in. Returns null if absent.
     */
    getDataset(id) {
        const row = this.db.prepare("SELECT metadata_json, install_path FROM datasets WHERE id = ?").get(id);
        if (!row)
            return null;
        const metadata = JSON.parse(row.metadata_json);
        metadata.install_path = row.install_path || undefined;
        return metadata;
    }

    /** Return every dataset's stored metadata blob, parsed. */
    getAllDatasets() {
        const rows = this.db.prepare("SELECT metadata_json FROM datasets").all();
        return rows.map(r => JSON.parse(r.metadata_json));
    }

    beginTransaction() {
        this.db.exec("BEGIN");
    }

    commit() {
        this.db.exec("COMMIT");
    }

    rollback() {
        this.db.exec("ROLLBACK");
    }

    /**
     * Filtered dataset search (license categories, minimum quality score,
     * optional limit), ordered by quality_score descending. All filter values
     * are bound as parameters, never interpolated.
     */
    search(options) {
        let query = "SELECT metadata_json FROM datasets WHERE 1=1";
        const params = [];
        if (options.licenseCategories && options.licenseCategories.length > 0) {
            const placeholders = options.licenseCategories.map(() => "?").join(",");
            query += ` AND license_category IN (${placeholders})`;
            params.push(...options.licenseCategories);
        }
        if (options.minQualityScore !== undefined) {
            query += " AND quality_score >= ?";
            params.push(options.minQualityScore);
        }
        query += " ORDER BY quality_score DESC";
        if (options.limit) {
            query += " LIMIT ?";
            params.push(options.limit);
        }
        const rows = this.db.prepare(query).all(...params);
        return rows.map(r => JSON.parse(r.metadata_json));
    }

    /** Upsert a job row (type/created_at are immutable after insert). */
    saveJob(job) {
        const upsert = this.db.prepare(`
      INSERT INTO jobs (id, type, status, priority, progress, status_text, result_url, error, attempts, max_attempts, created_at, updated_at, metadata)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        status=excluded.status,
        priority=excluded.priority,
        progress=excluded.progress,
        status_text=excluded.status_text,
        result_url=excluded.result_url,
        error=excluded.error,
        attempts=excluded.attempts,
        max_attempts=excluded.max_attempts,
        updated_at=excluded.updated_at,
        metadata=excluded.metadata
    `);
        upsert.run(job.id, job.type, job.status, job.priority, job.progress, job.status_text, job.result_url || null, job.error || null, job.attempts, job.max_attempts, job.created_at, job.updated_at, job.metadata || null);
    }

    /** Fetch a job row by id, or null if absent. */
    getJob(id) {
        const row = this.db.prepare("SELECT * FROM jobs WHERE id = ?").get(id);
        return row || null;
    }

    /**
     * Register or update a local file record
     */
    registerDownload(id, localPath, status, sizeBytes, error) {
        const upsert = this.db.prepare(`
      INSERT INTO local_files (id, local_path, status, size_bytes, last_checked, error)
      VALUES (?, ?, ?, ?, ?, ?)
      ON CONFLICT(id) DO UPDATE SET
        local_path=excluded.local_path,
        status=excluded.status,
        size_bytes=excluded.size_bytes,
        last_checked=excluded.last_checked,
        error=excluded.error
    `);
        upsert.run(id, localPath, status, sizeBytes || 0, new Date().toISOString(), error || null);
    }

    /**
     * Get download status and path for a dataset
     */
    getDownloadStatus(id) {
        return this.db.prepare("SELECT * FROM local_files WHERE id = ?").get(id);
    }

    /**
     * Archive old jobs to the cold storage table.
     * Copy-then-delete runs inside one SQLite transaction so a crash cannot
     * lose jobs. Returns the number of jobs moved.
     * @param days Age in days
     */
    archiveOldJobs(days) {
        const cutoff = new Date();
        cutoff.setDate(cutoff.getDate() - days);
        const cutoffStr = cutoff.toISOString();
        return this.db.transaction(() => {
            // Copy to archive
            this.db.prepare(`
        INSERT OR IGNORE INTO jobs_archive
        SELECT * FROM jobs
        WHERE created_at < ?
      `).run(cutoffStr);
            // Delete from active jobs
            const info = this.db.prepare(`
        DELETE FROM jobs
        WHERE created_at < ?
      `).run(cutoffStr);
            return info.changes;
        })();
    }

    /**
     * Perform database maintenance (VACUUM, ANALYZE).
     */
    optimize() {
        console.error("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
        this.db.exec("VACUUM");
        this.db.exec("ANALYZE");
    }

    /**
     * Delete all datasets from a specific source
     */
    deleteBySource(source) {
        const info = this.db.prepare("DELETE FROM datasets WHERE source = ?").run(source);
        console.error(`[MetadataStore] Deleted ${info.changes} datasets from source: ${source}`);
        return info.changes;
    }

    /**
     * Get all dataset IDs from a specific source
     */
    getDatasetIdsBySource(source) {
        const rows = this.db.prepare("SELECT id FROM datasets WHERE source = ?").all(source);
        return rows.map(r => r.id);
    }

    /** Close the underlying SQLite handle. */
    close() {
        this.db.close();
    }
}
@@ -1 +0,0 @@
1
- export {};
@@ -1,49 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
const PYTHON_SCRIPT_PATH = path.resolve("src", "python", "uci_adapter.py");
export class UCIScraper {
    /**
     * Search UCI repository using the Python adapter.
     * Fails gracefully: every failure mode resolves to [] rather than
     * rejecting, and the promise always settles.
     */
    async scrape(query, limit = 10) {
        return new Promise((resolve) => {
            const pythonProcess = spawn("python", [
                PYTHON_SCRIPT_PATH,
                "--action", "search",
                "--query", query,
                "--limit", String(limit)
            ]);
            let output = "";
            let errorOutput = "";
            pythonProcess.stdout.on("data", (data) => {
                output += data.toString();
            });
            pythonProcess.stderr.on("data", (data) => {
                errorOutput += data.toString();
            });
            // Fix: if the `python` binary is missing, spawn emits "error" and
            // "close" never fires — without this handler the promise hangs forever.
            pythonProcess.on("error", (err) => {
                console.error(`[UCIScraper] Failed to spawn python: ${err.message}`);
                resolve([]);
            });
            pythonProcess.on("close", (code) => {
                if (code !== 0) {
                    // It's possible for python to emit stderr warnings but still succeed
                    // But exit code != 0 is definitely an error
                    console.error(`[UCIScraper] Process exited with code ${code}: ${errorOutput}`);
                    resolve([]); // Fail gracefully by returning empty
                    return;
                }
                try {
                    const results = JSON.parse(output);
                    if (results.error) {
                        console.error(`[UCIScraper] Internal error: ${results.error}`);
                        resolve([]);
                    }
                    else {
                        resolve(results);
                    }
                }
                catch (e) {
                    console.error(`[UCIScraper] JSON parse error: ${e.message}. Output: ${output.substring(0, 100)}...`);
                    resolve([]);
                }
            });
        });
    }
}
@@ -1,76 +0,0 @@
1
export class MockErrorTracker {
    // In-memory sink used when no real error tracker (e.g. Sentry) is wired up.
    // Captured entries are kept so tests can inspect them.
    exceptions = [];
    messages = [];

    /** Store an exception plus optional context, and log it to stderr. */
    captureException(error, context) {
        const entry = { error, context };
        this.exceptions.push(entry);
        console.error(`[ErrorTracker] Exception captured: ${error.message}`);
    }

    /** Store a message at the given severity (defaults to "info") and log it. */
    captureMessage(message, level = "info") {
        const entry = { message, level };
        this.messages.push(entry);
        console.error(`[ErrorTracker] Message captured (${level}): ${message}`);
    }
}
13
export class ObservabilityService {
    /**
     * Collects in-process job metrics — per-type success/failure counters and
     * duration aggregates — and forwards failures to an error tracker.
     */
    errorTracker;
    jobSuccessCounter = new Map();
    jobFailureCounter = new Map();
    // type -> { count, sum, min, max } of durations in milliseconds
    jobDurationHistogram = new Map();

    constructor(errorTracker = new MockErrorTracker()) {
        this.errorTracker = errorTracker;
    }

    /** Record a successful job of the given type and its duration (ms). */
    recordJobSuccess(type, durationMs) {
        // Increment success counter
        this.jobSuccessCounter.set(type, (this.jobSuccessCounter.get(type) || 0) + 1);
        // Record duration
        this.updateMetricRecord(type, durationMs);
    }

    /** Record a failed job and report the error to the tracker. */
    recordJobFailure(type, error) {
        // Increment failure counter
        this.jobFailureCounter.set(type, (this.jobFailureCounter.get(type) || 0) + 1);
        // Track error
        this.errorTracker.captureException(error, { jobType: type });
    }

    /** Render all metrics in Prometheus text exposition format. */
    getPrometheusMetrics() {
        let out = "# HELP jobs_processed_total Total number of jobs successfully processed\n";
        out += "# TYPE jobs_processed_total counter\n";
        for (const [type, count] of this.jobSuccessCounter) {
            out += `jobs_processed_total{type="${type}"} ${count}\n`;
        }
        out += "\n# HELP jobs_failed_total Total number of failed jobs\n";
        out += "# TYPE jobs_failed_total counter\n";
        for (const [type, count] of this.jobFailureCounter) {
            out += `jobs_failed_total{type="${type}"} ${count}\n`;
        }
        out += "\n# HELP job_duration_seconds_sum Latency of job processing in seconds\n";
        out += "# TYPE job_duration_seconds_sum counter\n";
        for (const [type, record] of this.jobDurationHistogram) {
            out += `job_duration_seconds_sum{type="${type}"} ${record.sum / 1000}\n`;
            out += `job_duration_seconds_count{type="${type}"} ${record.count}\n`;
            out += `job_duration_seconds_max{type="${type}"} ${record.max / 1000}\n`;
        }
        return out;
    }

    /**
     * Summarize per-type statistics.
     * Fix: iterate the union of all known job types. The old version walked
     * only jobDurationHistogram, so types that had only failures (failures
     * never create a duration record) were silently omitted from the stats.
     */
    getStats() {
        const stats = {};
        const types = new Set([
            ...this.jobSuccessCounter.keys(),
            ...this.jobFailureCounter.keys(),
            ...this.jobDurationHistogram.keys()
        ]);
        for (const type of types) {
            const record = this.jobDurationHistogram.get(type);
            stats[type] = {
                successCount: this.jobSuccessCounter.get(type) || 0,
                failureCount: this.jobFailureCounter.get(type) || 0,
                avgDuration: record ? (record.sum / record.count).toFixed(2) + "ms" : "0.00ms",
                maxDuration: record ? record.max + "ms" : "0ms"
            };
        }
        return stats;
    }

    /** Create or update the running count/sum/min/max for a job type. */
    updateMetricRecord(type, value) {
        let record = this.jobDurationHistogram.get(type);
        if (!record) {
            record = { count: 0, sum: 0, min: value, max: value };
            this.jobDurationHistogram.set(type, record);
        }
        record.count++;
        record.sum += value;
        record.min = Math.min(record.min, value);
        record.max = Math.max(record.max, value);
    }
}
@@ -1,75 +0,0 @@
1
- import { spawn } from "child_process";
2
- import path from "path";
3
- import fs from "fs";
4
export class TargetDetector {
    /**
     * Wrapper around the Python target_engine.py script that detects and
     * validates target (label) columns in tabular datasets.
     */
    pythonPath = "python";
    scriptPath;

    constructor(buildDir = process.cwd()) {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        // Use same robust path resolution as other services: probe known
        // locations in priority order, fall back to the first candidate.
        const candidates = [
            path.resolve(dataRoot, "python", "target_engine.py"),
            path.resolve(buildDir, "python", "target_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "target_engine.py"),
            path.resolve(buildDir, "..", "python", "target_engine.py")
        ];
        this.scriptPath = candidates.find(p => fs.existsSync(p)) || candidates[0];
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }

    /**
     * Detect probable target columns in a dataset
     */
    async detectTarget(filePath) {
        return this.runPython("detect", [filePath]);
    }

    /**
     * Validate a specific column as a target
     */
    async validateTarget(filePath, targetColumn) {
        return this.runPython("validate", [filePath, targetColumn]);
    }

    /**
     * Spawn the Python engine and parse its JSON stdout.
     * Rejects on non-zero exit, on an error payload, on unparseable output,
     * and — fix — on spawn failure (missing interpreter), which previously
     * left the promise pending forever because "close" never fires.
     */
    async runPython(action, args) {
        return new Promise((resolve, reject) => {
            const childProcess = spawn(this.pythonPath, [this.scriptPath, action, ...args], {
                env: { ...process.env, PYTHONIOENCODING: 'utf-8' }
            });
            let stdout = "";
            let stderr = "";
            childProcess.stdout.on("data", (data) => stdout += data.toString());
            childProcess.stderr.on("data", (data) => stderr += data.toString());
            childProcess.on("error", (err) => {
                reject(new Error(`Target Detector (${action}) failed to start: ${err.message}`));
            });
            childProcess.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Target Detector (${action}) failed: ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse target detector output: ${stdout}\nError: ${e}`));
                }
            });
        });
    }
}
@@ -1,94 +0,0 @@
1
- import argparse
2
- import asyncio
3
- import json
4
- import os
5
- import sys
6
- import warnings
7
- from pathlib import Path
8
- from typing import Any, Dict
9
-
10
- # Suppress noisy HF warnings
11
- warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
12
- warnings.filterwarnings("ignore", message=".*legacy.*")
13
-
14
- CURRENT_DIR = Path(__file__).resolve().parent
15
- if str(CURRENT_DIR) not in sys.path:
16
- sys.path.insert(0, str(CURRENT_DIR))
17
-
18
- from vesper.core.asset_downloader import AssetDownloader
19
- from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
20
-
21
-
22
- def _print(payload: Dict[str, Any]) -> None:
23
- print(json.dumps(payload, ensure_ascii=False))
24
-
25
-
26
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
    """Run one asset download described by the JSON payload in *args*.

    Returns ``{"ok": True, "result": <downloader result>}``; errors propagate
    to the caller (main() converts them into an error payload).
    """
    request = json.loads(args.payload)

    default_root = str(Path.home() / ".vesper" / "data" / "assets")
    root = request.get("output_root") or default_root
    target_dir = request.get("output_dir")
    worker_count = int(request.get("workers") or 8)
    recipes_dir = request.get("recipes_dir")

    # Auto-set HF token from payload if provided, so the HF libraries
    # underneath pick it up from the environment.
    auth_token = request.get("token") or request.get("hf_token")
    if auth_token:
        os.environ["HF_TOKEN"] = str(auth_token)

    engine = AssetDownloader(output_root=root, workers=worker_count, recipes_dir=recipes_dir)

    outcome = await engine.download_assets(
        dataset_id=str(request.get("dataset_id")),
        source=request.get("source"),
        repo_id=request.get("repo_id"),
        kaggle_ref=request.get("kaggle_ref"),
        urls=request.get("urls"),
        output_format=request.get("output_format", "webdataset"),
        output_dir=str(target_dir) if target_dir else None,
        max_items=request.get("max_items"),
        image_column=request.get("image_column"),
    )
    return {"ok": True, "result": outcome}
52
-
53
-
54
def main() -> None:
    """CLI entry point: dispatch on the action argument and print one JSON line."""
    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
    parser.add_argument("payload", help="JSON payload")
    args = parser.parse_args()

    try:
        if args.action == "download":
            _print(asyncio.run(_run_download(args)))
            return

        request = json.loads(args.payload)
        if args.action == "build_recipe":
            recipe = build_download_recipe(request)
            location = save_recipe(recipe, request.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe, "saved_to": location})
        elif args.action == "get_recipe":
            dataset_id = str(request.get("dataset_id"))
            recipe = get_download_recipe(dataset_id, request.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe})
        else:
            _print({"ok": False, "error": f"Unknown action: {args.action}"})
    except Exception as e:
        message = str(e)
        # Rewrite known failure signatures into actionable guidance.
        if any(marker in message for marker in ("401", "403", "Unauthorized")):
            message = (
                "Authentication required. This dataset may be gated/private. "
                "Use configure_keys tool to set HF_TOKEN, then retry."
            )
        elif "No image column" in message:
            message += " Hint: specify image_column parameter with the name of the column containing images."
        _print({"ok": False, "error": message})


if __name__ == "__main__":
    main()