vesper-wizard 2.0.5 → 2.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/scripts/wizard.cjs +625 -0
  174. package/{wizard.js → scripts/wizard.js} +99 -21
  175. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  179. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  180. package/src/python/asset_downloader_engine.py +92 -0
  181. package/src/python/cleaner.py +226 -0
  182. package/src/python/config.py +263 -0
  183. package/src/python/dataworld_engine.py +208 -0
  184. package/src/python/export_engine.py +243 -0
  185. package/src/python/framework_adapters.py +100 -0
  186. package/src/python/fusion_engine.py +368 -0
  187. package/src/python/github_adapter.py +106 -0
  188. package/src/python/hf_fallback.py +298 -0
  189. package/src/python/image_engine.py +86 -0
  190. package/src/python/kaggle_engine.py +295 -0
  191. package/src/python/media_engine.py +133 -0
  192. package/src/python/nasa_adapter.py +82 -0
  193. package/src/python/openml_engine.py +146 -0
  194. package/src/python/quality_engine.py +267 -0
  195. package/src/python/row_count.py +54 -0
  196. package/src/python/splitter_engine.py +283 -0
  197. package/src/python/target_engine.py +154 -0
  198. package/src/python/test_framework_adapters.py +61 -0
  199. package/src/python/test_fusion_engine.py +89 -0
  200. package/src/python/uci_adapter.py +94 -0
  201. package/src/python/vesper/__init__.py +1 -0
  202. package/src/python/vesper/core/__init__.py +1 -0
  203. package/src/python/vesper/core/asset_downloader.py +675 -0
  204. package/src/python/vesper/core/download_recipe.py +104 -0
  205. package/src/python/worldbank_adapter.py +99 -0
  206. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,157 @@
1
+ const fs = require('fs');
2
+ const path = require('path');
3
+ const os = require('os');
4
+
5
+ const { argv, cwd } = process;
6
+
7
// usage(): prints the command-line help for this script with console.log — the invocation synopsis followed by the --scan, --target, --out and --no-count options. Takes no arguments and returns nothing.
+ function usage() {
8
+ console.log(`Usage: node scripts/preindex_registry.cjs [--scan dir1 dir2 ...] [--target N] [--out path] [--no-count]
9
+
10
+ Options:
11
+ --scan Directories to recursively scan for datasets (default: ./e2e_demo_output ./datasets)
12
+ --target Target total registry entries (if larger than scanned, will synthesize entries)
13
+ --out Output registry path (default: ~/.vesper/registry.json)
14
+ --no-count Skip expensive row counting for CSV/JSONL
15
+ `);
16
+ }
17
+
18
// CLI state. Defaults apply when the matching option is not given.
let scanDirs = [];
let target = 0;
let outPath = path.join(os.homedir(), '.vesper', 'registry.json');
let doCount = true;

/**
 * Return the value that follows an option flag, or abort with exit code 2.
 *
 * Without this check `--out` as the last argument crashes inside
 * `path.resolve(undefined)`, and `--target --no-count` silently consumes
 * `--no-count` as the (non-numeric) target.
 *
 * @param {string} flag - The option being parsed (used in the error message).
 * @param {number} index - Position in `argv` where the value is expected.
 * @returns {string} The option value.
 */
function requireOptionValue(flag, index) {
  const value = argv[index];
  if (value === undefined || value.startsWith('--')) {
    console.error('Missing value for', flag);
    usage();
    process.exit(2);
  }
  return value;
}

// argv[0] is the node binary and argv[1] the script path; options start at 2.
for (let i = 2; i < argv.length; i++) {
  const a = argv[i];
  if (a === '--scan') {
    // --scan takes every following argument up to the next `--option`.
    i++;
    while (i < argv.length && !argv[i].startsWith('--')) {
      scanDirs.push(argv[i]);
      i++;
    }
    // Step back so the loop's own i++ lands on the next option.
    i--;
  } else if (a === '--target') {
    // A non-numeric value falls back to 0, which disables synthesis.
    target = parseInt(requireOptionValue(a, ++i), 10) || 0;
  } else if (a === '--out') {
    outPath = path.resolve(requireOptionValue(a, ++i));
  } else if (a === '--no-count') {
    doCount = false;
  } else if (a === '--help' || a === '-h') {
    usage();
    process.exit(0);
  } else {
    console.error('Unknown arg', a);
    usage();
    process.exit(2);
  }
}

// No --scan given (or given with no directories): use the default locations
// relative to the current working directory.
if (scanDirs.length === 0) scanDirs = [path.join(cwd(), 'e2e_demo_output'), path.join(cwd(), 'datasets')];
49
+
50
/**
 * Turn an arbitrary string (typically a relative file path) into a registry id.
 *
 * Each run of characters outside `[A-Za-z0-9]` acts as a separator; the
 * remaining alphanumeric tokens are joined with single underscores and the
 * result is lower-cased, so it never starts or ends with an underscore.
 *
 * @param {string} s - Raw text to normalize.
 * @returns {string} Lower-case id made of alphanumeric tokens joined by `_`.
 */
function normalizeId(s) {
  const tokens = s.split(/[^a-z0-9]+/i).filter((token) => token.length > 0);
  return tokens.join('_').toLowerCase();
}
53
+
54
/**
 * Recursively collect dataset files below a directory.
 *
 * Matches are appended to one shared array instead of being spread into
 * `push(...)` per directory: spreading a very large sub-result can exceed the
 * engine's argument limit, and the resulting RangeError used to be swallowed
 * by the blanket catch, silently dropping files.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} [exts] - Lower-case extensions (with leading dot) to keep.
 * @returns {string[]} Paths of matching files, in directory-listing order with
 *   each subdirectory's files placed where that subdirectory was listed.
 */
function walk(dir, exts = ['.csv', '.jsonl', '.json', '.arrow', '.parquet', '.feather']) {
  const results = [];

  const visit = (current) => {
    let items;
    try {
      items = fs.readdirSync(current, { withFileTypes: true });
    } catch (e) {
      // Best effort: a missing, unreadable or non-directory path contributes
      // nothing (the default scan directories frequently do not exist).
      return;
    }
    for (const it of items) {
      const p = path.join(current, it.name);
      if (it.isDirectory()) {
        visit(p);
      } else if (it.isFile() && exts.includes(path.extname(it.name).toLowerCase())) {
        // Extension comparison is case-insensitive on the file-name side.
        results.push(p);
      }
    }
  };

  visit(dir);
  return results;
}
71
+
72
/**
 * Count the lines of a text file by streaming it.
 *
 * Used for both CSV and JSONL files. The count is the number of physical
 * lines, so for a CSV it includes the header line. A final line that is not
 * terminated by a newline is counted too (previously it was dropped, so the
 * result depended on whether the file happened to end with "\n").
 *
 * The stream is read as raw bytes: 0x0A never occurs inside a multi-byte
 * UTF-8 sequence, so no decoding is needed to find line feeds.
 *
 * @param {string} filePath - File to scan.
 * @returns {Promise<number>} Number of lines; 0 for an empty file. Rejects
 *   with the stream error if the file cannot be read.
 */
function countCsvRows(filePath) {
  return new Promise((resolve, reject) => {
    const LINE_FEED = 0x0a;
    let count = 0;
    // Starts as a line feed so an empty file reports 0 lines.
    let lastByte = LINE_FEED;
    const rs = fs.createReadStream(filePath);
    rs.on('data', (chunk) => {
      let pos = chunk.indexOf(LINE_FEED);
      while (pos !== -1) {
        count++;
        pos = chunk.indexOf(LINE_FEED, pos + 1);
      }
      if (chunk.length > 0) lastByte = chunk[chunk.length - 1];
    });
    rs.on('end', () => resolve(lastByte === LINE_FEED ? count : count + 1));
    rs.on('error', reject);
  });
}
83
+
84
/**
 * Read just the first line of a text file without loading the whole file.
 *
 * @param {string} filePath - File to read.
 * @returns {string} The first line without its line terminator ('' if empty).
 */
function readFirstLine(filePath) {
  const fd = fs.openSync(filePath, 'r');
  try {
    const chunk = Buffer.alloc(64 * 1024);
    const parts = [];
    let bytesRead;
    while ((bytesRead = fs.readSync(fd, chunk, 0, chunk.length, null)) > 0) {
      const view = chunk.subarray(0, bytesRead);
      const newlineAt = view.indexOf(0x0a);
      if (newlineAt !== -1) {
        parts.push(Buffer.from(view.subarray(0, newlineAt)));
        break;
      }
      // Copy: `chunk` is reused by the next read.
      parts.push(Buffer.from(view));
    }
    return Buffer.concat(parts).toString('utf8').replace(/\r$/, '');
  } finally {
    fs.closeSync(fd);
  }
}

/**
 * Build (or refresh) the registry JSON file.
 *
 * 1. Loads the existing registry at `outPath`, if any, keyed by
 *    `normalized_id` (falling back to `id`).
 * 2. Adds or replaces one entry per dataset file found under `scanDirs`.
 *    Unless counting is disabled, CSV files get a column count (a plain
 *    comma split of the header line, so quoted commas are over-counted) and
 *    CSV/JSONL files get a line count.
 * 3. If `target` exceeds the number of entries, pads the registry with
 *    placeholder "synthesized" entries whose row/column figures are random
 *    and whose paths point into `local_library` (no files are created).
 * 4. Writes the registry and prints a summary.
 */
(async function main() {
  const registryDir = path.dirname(outPath);
  fs.mkdirSync(registryDir, { recursive: true });

  let existing = [];
  if (fs.existsSync(outPath)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(outPath, 'utf8'));
      // A registry that is valid JSON but not an array cannot be iterated.
      if (Array.isArray(parsed)) existing = parsed;
      else console.warn(`Ignoring existing registry at ${outPath}: not a JSON array`);
    } catch (e) {
      console.warn(`Ignoring unreadable registry at ${outPath}: ${e.message}`);
      existing = [];
    }
  }
  const map = new Map();
  for (const e of existing) {
    if (e && typeof e === 'object') map.set(e.normalized_id || e.id, e);
  }

  let scanned = 0;
  for (const dir of scanDirs) {
    const abs = path.resolve(dir);
    const files = walk(abs);
    for (const f of files) {
      let stats;
      try {
        stats = fs.statSync(f);
      } catch (e) {
        // The file disappeared or became unreadable after the directory walk.
        console.warn(`Skipping ${f}: ${e.message}`);
        continue;
      }
      // Lower-cased so `.CSV` is treated like `.csv`, matching walk().
      const ext = path.extname(f).toLowerCase();
      const base = path.basename(f, path.extname(f));
      const rel = path.relative(process.cwd(), f);
      const id = normalizeId(rel || base);
      let cols = null;
      let rows = null;
      if (doCount && (ext === '.csv' || ext === '.jsonl')) {
        try {
          if (ext === '.csv') {
            const header = readFirstLine(f);
            cols = header ? header.split(',').length : 0;
          }
          rows = await countCsvRows(f);
        } catch (e) {
          // Row/column figures are optional metadata; leave them null.
        }
      }
      const entry = {
        id: id,
        normalized_id: id,
        source: 'scanned',
        path: f,
        size: stats.size,
        mtime: stats.mtime.toISOString(),
        meta: { rows, cols }
      };
      map.set(id, entry);
      scanned++;
    }
  }

  // Synthesize if target requested
  let synthesized = 0;
  if (target > map.size) {
    const synthDir = path.join(registryDir, 'local_library');
    fs.mkdirSync(synthDir, { recursive: true });
    // Keep generating until the target is really reached, skipping ids that
    // already exist from an earlier run so no entry is overwritten.
    let idx = map.size;
    while (map.size < target) {
      idx++;
      const id = `synth_${String(idx).padStart(6, '0')}`;
      if (map.has(id)) continue;
      const entry = {
        id,
        normalized_id: id,
        source: 'synthesized',
        path: path.join(synthDir, `${id}.csv`),
        size: 0,
        mtime: new Date().toISOString(),
        meta: { rows: Math.floor(Math.random() * 1000000), cols: Math.floor(Math.random() * 200) + 1 }
      };
      map.set(id, entry);
      synthesized++;
    }
  }

  const outArr = Array.from(map.values());
  fs.writeFileSync(outPath, JSON.stringify(outArr, null, 2), 'utf8');
  console.log(`Wrote ${outArr.length} registry entries to ${outPath} (${scanned} scanned, ${synthesized} synthesized)`);
})().catch((err) => {
  // Without this handler a failure surfaces as an unhandled rejection.
  console.error('preindex_registry failed:', err && err.message ? err.message : err);
  process.exit(1);
});
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env node
2
+
3
+ const { spawnSync } = require("child_process");
4
+ const fs = require("fs");
5
+ const path = require("path");
6
+ const os = require("os");
7
+ const Database = require("better-sqlite3");
8
+
9
/**
 * Run a command synchronously and throw unless it exits with status 0.
 *
 * Output is inherited from this process by default, and a shell is used on
 * Windows; both can be overridden through `options`, which is spread over the
 * defaults and passed to `spawnSync`.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments for the executable.
 * @param {object} [options] - Extra `spawnSync` options.
 * @throws {Error} If the process could not be started, was terminated by a
 *   signal, or exited with a non-zero status.
 */
function runCommand(command, args, options = {}) {
  const result = spawnSync(command, args, {
    stdio: "inherit",
    shell: process.platform === "win32",
    ...options,
  });

  // When the process cannot be spawned (or times out) `status` is null and
  // the real reason is in `result.error`; report that instead of "exit null".
  if (result.error) {
    throw new Error(
      `Command failed: ${command} ${args.join(" ")} (${result.error.message})`,
      { cause: result.error }
    );
  }

  if (result.status !== 0) {
    // A process killed by a signal also has a null status.
    const reason = result.signal ? `signal ${result.signal}` : `exit ${result.status}`;
    throw new Error(`Command failed: ${command} ${args.join(" ")} (${reason})`);
  }
}
20
+
21
/**
 * Count the rows of the `datasets` table in a SQLite database file.
 *
 * @param {string} dbPath - Path to the database file.
 * @returns {number|string} The row count, or "N/A" if the file does not exist.
 * @throws If the database cannot be opened or the query fails (for example
 *   when the `datasets` table is missing).
 */
function countDatasets(dbPath) {
  if (!fs.existsSync(dbPath)) return "N/A";
  const db = new Database(dbPath);
  try {
    return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
  } finally {
    // Always release the handle, including when the query throws.
    db.close();
  }
}
28
+
29
/**
 * Report how many vectors a vector-index JSON file describes.
 *
 * Uses the numeric `count` field when present, otherwise the length of the
 * `ids` array.
 *
 * @param {string} jsonPath - Path to the JSON file.
 * @returns {number|string} The vector count, or "N/A" when the file is
 *   missing or carries neither field.
 * @throws {SyntaxError} If the file is not valid JSON.
 */
function countVectors(jsonPath) {
  if (!fs.existsSync(jsonPath)) return "N/A";
  const data = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
  // `null` is valid JSON but has no properties to inspect.
  if (data === null || typeof data !== "object") return "N/A";
  if (typeof data.count === "number") return data.count;
  if (Array.isArray(data.ids)) return data.ids.length;
  return "N/A";
}
36
+
37
/**
 * Copy the workspace index files into the per-user runtime directory
 * (`<home>/.vesper/data`).
 *
 * All source files are verified before anything is copied, so a missing file
 * can no longer leave the runtime directory with a mix of freshly copied and
 * stale index files.
 *
 * @param {string} workspaceRoot - Directory whose `data` folder holds
 *   `metadata.db`, `vectors.json` and `vectors.bin`.
 * @returns {string} The runtime directory the files were copied into.
 * @throws {Error} If any source file is missing (nothing is copied then).
 */
function syncRuntime(workspaceRoot) {
  const runtimeDir = path.join(os.homedir(), ".vesper", "data");
  const files = ["metadata.db", "vectors.json", "vectors.bin"];

  const missing = files
    .map((file) => path.join(workspaceRoot, "data", file))
    .filter((src) => !fs.existsSync(src));
  if (missing.length > 0) {
    throw new Error(`Missing source file: ${missing.join(", ")}`);
  }

  fs.mkdirSync(runtimeDir, { recursive: true });
  for (const file of files) {
    fs.copyFileSync(path.join(workspaceRoot, "data", file), path.join(runtimeDir, file));
  }

  return runtimeDir;
}
53
+
54
/**
 * Refresh the dataset index end to end.
 *
 * Runs the `massive-scrape` npm script, then the `index` npm script with
 * NODE_OPTIONS set to an 8 GiB old-space limit, copies the resulting index
 * files from the workspace `data` directory to the runtime directory, and
 * prints dataset/vector counts for both locations. Any failing step throws.
 */
function main() {
  const workspaceRoot = process.cwd();
  const runtimeData = path.join(os.homedir(), ".vesper", "data");
  const workspaceData = path.join(workspaceRoot, "data");

  const runtimeDbPath = path.join(runtimeData, "metadata.db");
  const runtimeVecPath = path.join(runtimeData, "vectors.json");
  const workspaceDbPath = path.join(workspaceData, "metadata.db");
  const workspaceVecPath = path.join(workspaceData, "vectors.json");

  console.log("\n[refresh-index] Step 1/3: Massive scrape...");
  runCommand("npm", ["run", "massive-scrape"]);

  console.log("\n[refresh-index] Step 2/3: High-memory indexing...");
  // The indexing child process gets the current environment with NODE_OPTIONS
  // replaced by the heap-size flag.
  const indexingEnv = Object.assign({}, process.env, {
    NODE_OPTIONS: "--max-old-space-size=8192",
  });
  runCommand("npm", ["run", "index"], { env: indexingEnv });

  console.log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
  const runtimeDir = syncRuntime(workspaceRoot);

  // Gather the figures for the closing summary (workspace first, then runtime).
  const workspaceCounts = {
    db: countDatasets(workspaceDbPath),
    vectors: countVectors(workspaceVecPath),
  };
  const runtimeCounts = {
    db: countDatasets(runtimeDbPath),
    vectors: countVectors(runtimeVecPath),
  };

  console.log("\n[refresh-index] Completed successfully.");
  console.log(`[refresh-index] Workspace: DB=${workspaceCounts.db}, VECTORS=${workspaceCounts.vectors}`);
  console.log(`[refresh-index] Runtime: DB=${runtimeCounts.db}, VECTORS=${runtimeCounts.vectors}`);
  console.log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}

// Script entry point: report any failure from main() and exit with status 1.
try {
  main();
} catch (error) {
  console.error("\n[refresh-index] Failed:", error.message);
  process.exit(1);
}