vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1 +0,0 @@
1
- export {};
@@ -1,54 +0,0 @@
1
- import { Embedder } from "../search/embedder.js";
2
- import { VectorStore } from "../search/vector-store.js";
3
- import { MetadataStore } from "../metadata/store.js";
4
- import path from "path";
5
- async function main() {
6
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
7
- const vectorPath = path.join(process.cwd(), "data", "vectors.json");
8
- const metadataStore = new MetadataStore(dbPath);
9
- const vectorStore = new VectorStore(vectorPath);
10
- const embedder = Embedder.getInstance();
11
- const datasets = metadataStore.getAllDatasets();
12
- const indexedIds = new Set(vectorStore.getAllIds());
13
- // Filter to only new datasets
14
- const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
15
- console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
16
- const BATCH_SIZE = 20;
17
- let processed = 0;
18
- for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
19
- const batch = toIndex.slice(i, i + BATCH_SIZE);
20
- try {
21
- // Prepare texts for batch embedding
22
- const texts = batch.map(ds => [
23
- ds.name,
24
- ds.description,
25
- `Task: ${ds.task}`,
26
- `Languages: ${ds.languages?.join(", ") || ""}`,
27
- `Tags: ${ds.tags?.join(" ") || ""}`
28
- ].join(" ").slice(0, 1500));
29
- // Memory-safe sequential embedding (avoids OOM on large libraries)
30
- for (let idx = 0; idx < batch.length; idx++) {
31
- const ds = batch[idx];
32
- try {
33
- const vector = await embedder.embed(texts[idx]);
34
- vectorStore.add(ds.id, vector);
35
- }
36
- catch (err) {
37
- console.error(`Failed to index ${ds.id}:`, err);
38
- }
39
- }
40
- processed += batch.length;
41
- if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
42
- console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
43
- vectorStore.save();
44
- }
45
- }
46
- catch (err) {
47
- console.error(`Batch around ${i} failed:`, err);
48
- }
49
- }
50
- vectorStore.save();
51
- console.error("Vector indexing complete.");
52
- metadataStore.close();
53
- }
54
- main().catch(console.error);
@@ -1,73 +0,0 @@
1
- import Database from "better-sqlite3";
2
- import path from "path";
3
- import fs from "fs";
4
- async function main() {
5
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
6
- if (!fs.existsSync(dbPath)) {
7
- console.error("Database not found. Run 'npm run scrape' first.");
8
- process.exit(1);
9
- }
10
- const db = new Database(dbPath);
11
- try {
12
- // Get total count
13
- const count = db.prepare("SELECT COUNT(*) as count FROM datasets").get();
14
- console.log(`\nTotal datasets in database: ${count.count}\n`);
15
- // Check which columns exist
16
- const tableInfo = db.prepare("PRAGMA table_info(datasets)").all();
17
- const columns = tableInfo.map(col => col.name);
18
- const hasNewColumns = columns.includes("is_safe_source");
19
- // Get basic statistics (works with both old and new schema)
20
- const stats = db.prepare(`
21
- SELECT
22
- COUNT(*) as total,
23
- SUM(downloads) as total_downloads,
24
- AVG(quality_score) as avg_quality,
25
- SUM(CASE WHEN license_category = 'safe' THEN 1 ELSE 0 END) as safe_licenses,
26
- SUM(CASE WHEN has_train_split = 1 THEN 1 ELSE 0 END) as with_train_split
27
- FROM datasets
28
- `).get();
29
- console.log("Statistics:");
30
- console.log(` Total downloads: ${stats.total_downloads?.toLocaleString() || 0}`);
31
- console.log(` Average quality score: ${Math.round(stats.avg_quality || 0)}`);
32
- console.log(` Safe licenses: ${stats.safe_licenses || 0}`);
33
- console.log(` With train split: ${stats.with_train_split || 0}`);
34
- // Show extended stats if new schema is available
35
- if (hasNewColumns) {
36
- const extendedStats = db.prepare(`
37
- SELECT
38
- SUM(CASE WHEN is_safe_source = 1 THEN 1 ELSE 0 END) as safe_sources,
39
- SUM(CASE WHEN is_structured = 1 THEN 1 ELSE 0 END) as structured,
40
- SUM(total_examples) as total_examples
41
- FROM datasets
42
- `).get();
43
- console.log(` Safe sources: ${extendedStats.safe_sources || 0}`);
44
- console.log(` Structured datasets: ${extendedStats.structured || 0}`);
45
- console.log(` Total examples: ${extendedStats.total_examples?.toLocaleString() || 0}`);
46
- }
47
- else {
48
- console.log(` WARNING: Database uses old schema. Re-scrape to get extended statistics.`);
49
- }
50
- console.log();
51
- // Top 5 by downloads
52
- const top5 = db.prepare(`
53
- SELECT id, name, downloads, quality_score, license_category
54
- FROM datasets
55
- ORDER BY downloads DESC
56
- LIMIT 5
57
- `).all();
58
- console.log("Top 5 datasets by downloads:");
59
- top5.forEach((ds, i) => {
60
- console.log(` ${i + 1}. ${ds.id}`);
61
- console.log(` Downloads: ${ds.downloads.toLocaleString()}, Quality: ${ds.quality_score}, License: ${ds.license_category}`);
62
- });
63
- console.log();
64
- }
65
- catch (error) {
66
- console.error("Error reading database:", error);
67
- process.exit(1);
68
- }
69
- finally {
70
- db.close();
71
- }
72
- }
73
- main();
@@ -1,24 +0,0 @@
1
- import Database from "better-sqlite3";
2
- import path from "path";
3
- // Checking all plausible databases for jobs
4
- const dbs = ["metadata.db", "vesper.db", "datasets.db"];
5
- for (const dbName of dbs) {
6
- const dbPath = path.resolve("data", dbName);
7
- try {
8
- const db = new Database(dbPath);
9
- const tables = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'").all();
10
- if (tables.length > 0) {
11
- console.log(`\n--- Checking jobs in ${dbName} ---`);
12
- const jobs = db.prepare("SELECT * FROM jobs ORDER BY created_at DESC LIMIT 20").all();
13
- for (const job of jobs) {
14
- if (JSON.stringify(job).toLowerCase().includes("naruto")) {
15
- console.log(JSON.stringify(job, null, 2));
16
- }
17
- }
18
- }
19
- db.close();
20
- }
21
- catch (e) {
22
- // Silently skip if DB doesn't exist or table missing
23
- }
24
- }
@@ -1,17 +0,0 @@
1
- import Database from "better-sqlite3";
2
- import path from "path";
3
- // Trying metadata.db which is larger
4
- const dbPath = path.resolve("data", "metadata.db");
5
- const db = new Database(dbPath);
6
- try {
7
- const query = "naruto";
8
- const results = db.prepare("SELECT * FROM datasets WHERE name LIKE ? OR description LIKE ?").all(`%${query}%`, `%${query}%`);
9
- console.log(`Found ${results.length} results for "${query}" in metadata.db:`);
10
- console.log(JSON.stringify(results, null, 2));
11
- }
12
- catch (e) {
13
- console.error("Error checking database:", e.message);
14
- }
15
- finally {
16
- db.close();
17
- }
@@ -1,41 +0,0 @@
1
- #!/usr/bin/env node
2
- /**
3
- * Cleanup script to remove all Kaggle datasets from Vesper
4
- */
5
- import { fileURLToPath } from "url";
6
- import path from "path";
7
- import { MetadataStore } from "../metadata/store.js";
8
- import { VectorStore } from "../search/vector-store.js";
9
- const __filename = fileURLToPath(import.meta.url);
10
- const __dirname = path.dirname(__filename);
11
- const homeDir = process.env.HOME || process.env.USERPROFILE || path.join(__dirname, "..");
12
- const dataRoot = path.join(homeDir, ".vesper");
13
- const dbPath = path.join(dataRoot, "data", "metadata.db");
14
- const vectorPath = path.join(dataRoot, "data", "vectors.json");
15
- console.log("🧹 Vesper Kaggle Cleanup");
16
- console.log("========================\n");
17
- try {
18
- const metadataStore = new MetadataStore(dbPath);
19
- const vectorStore = new VectorStore(vectorPath);
20
- // Get all Kaggle dataset IDs
21
- const kaggleIds = metadataStore.getDatasetIdsBySource("kaggle");
22
- console.log(`Found ${kaggleIds.length} Kaggle datasets in database`);
23
- if (kaggleIds.length === 0) {
24
- console.log("✅ No Kaggle datasets to remove");
25
- process.exit(0);
26
- }
27
- // Delete from vector store
28
- const vectorsDeleted = vectorStore.deleteMany(kaggleIds);
29
- console.log(`🗑️ Deleted ${vectorsDeleted} vectors from vector store`);
30
- vectorStore.save();
31
- // Delete from metadata database
32
- const datasetsDeleted = metadataStore.deleteBySource("kaggle");
33
- console.log(`🗑️ Deleted ${datasetsDeleted} datasets from metadata database`);
34
- metadataStore.close();
35
- console.log("\n✅ Cleanup complete! Kaggle datasets have been removed.");
36
- console.log(" You can now search without seeing Kaggle results.");
37
- }
38
- catch (error) {
39
- console.error("❌ Cleanup failed:", error.message);
40
- process.exit(1);
41
- }
@@ -1,62 +0,0 @@
1
- import { PipelineExecutor } from "../cleaning/executor.js";
2
- import { ScriptGenerator } from "../cleaning/exporter.js";
3
- import fs from "fs";
4
- import path from "path";
5
- async function main() {
6
- console.log(" Vesper Dataset Ops Engine: Full Demo\n");
7
- const executor = new PipelineExecutor();
8
- const exporter = new ScriptGenerator();
9
- const demoFile = path.join(process.cwd(), "vesper_demo_data.csv");
10
- // 1. Create a Realistic Dirty Dataset
11
- // - duplicate: Duplicate Customer
12
- // - age: Mixed types ("25", "twenty"), Outliers (200)
13
- // - email: PII
14
- // - empty_col: 100% missing
15
- // - score: Good data
16
- const csvContent = `customer_id,age,email,score,empty_col
17
- C001,25,john.doe@example.com,88.5,
18
- C002,"30",jane.smith@work.org,92.0,
19
- C003,200,bob.jones@gmail.com,15.0,
20
- C001,25,john.doe@example.com,88.5,
21
- C004,"forty",alice@co.uk,80.0,
22
- C005,35,,75.0,`;
23
- fs.writeFileSync(demoFile, csvContent);
24
- console.log(`📦 Created dirty dataset: ${demoFile}`);
25
- console.log(`Contains: Duplicates, PII (Emails), Mixed Types (Age), Outliers, Empty Columns.\n`);
26
- try {
27
- // 2. Run the Auto-Cleaning Pipeline
28
- console.log(" Running Auto-Cleaning Pipeline...");
29
- const result = await executor.runPipeline("demo-dataset", demoFile);
30
- console.log("\n --- Quality Inspection Report ---");
31
- console.log(` Duplicates: ${result.initial_quality.duplicate_rows} rows`);
32
- console.log(` PII Warnings: ${result.initial_quality.pii_warnings?.length || 0}`);
33
- if (result.initial_quality.schema_warnings.length > 0) {
34
- console.log(" Schema Issues:");
35
- result.initial_quality.schema_warnings.forEach(w => console.log(` ⚠️ ${w}`));
36
- }
37
- console.log("\n --- Generated Cleaning Plan ---");
38
- result.plan.operations.forEach((op, i) => {
39
- console.log(` ${i + 1}. [${op.type}] Reason: ${op.reason}`);
40
- });
41
- console.log("\n --- Execution Result ---");
42
- if (result.cleaning_result.success) {
43
- console.log(` Success! Cleaned file saved to:`);
44
- console.log(` ${result.final_output_path}`);
45
- }
46
- else {
47
- console.error(` Failed: ${result.cleaning_result.error}`);
48
- }
49
- // 3. Generate Reproducibility Script
50
- console.log("\n --- Reproducibility ---");
51
- const pythonScript = exporter.generatePythonScript(result.plan, demoFile);
52
- const scriptPath = path.join(process.cwd(), "demo_cleaning_script.py");
53
- fs.writeFileSync(scriptPath, pythonScript);
54
- console.log(` Generated Python script: ${scriptPath}`);
55
- console.log(` (You can run this script independently to reproduce these steps!)`);
56
- console.log("\n Demo Complete.");
57
- }
58
- catch (error) {
59
- console.error("\n Demo failed:", error);
60
- }
61
- }
62
- main().catch(console.error);
@@ -1,58 +0,0 @@
1
- import { SearchEngine } from "../search/engine.js";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import { VectorStore } from "../search/vector-store.js";
4
- import { Embedder } from "../search/embedder.js";
5
- import { formatSearchResults, formatDatasetInfo } from "../tools/formatter.js";
6
- import path from "path";
7
- /**
8
- * Demo script to showcase the new formatted UI
9
- */
10
- async function main() {
11
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
12
- const vectorPath = path.join(process.cwd(), "data", "vectors.json");
13
- const store = new MetadataStore(dbPath);
14
- const vectorStore = new VectorStore(vectorPath);
15
- const embedder = Embedder.getInstance();
16
- await embedder.init();
17
- const engine = new SearchEngine(store, vectorStore, embedder);
18
- console.log("\n" + "═".repeat(80));
19
- console.log("VESPER UI DEMO - Formatted Search Results");
20
- console.log("═".repeat(80) + "\n");
21
- // Demo 1: Medical datasets
22
- console.log("Demo 1: Medical Dataset Search\n");
23
- const medicalResults = await engine.search("diabetes prediction machine learning", {
24
- limit: 3,
25
- safeOnly: true
26
- });
27
- console.log(formatSearchResults(medicalResults));
28
- // Demo 2: Detailed dataset info
29
- if (medicalResults.length > 0) {
30
- console.log("\n" + "═".repeat(80));
31
- console.log("Demo 2: Detailed Dataset Information");
32
- console.log("═".repeat(80) + "\n");
33
- console.log(formatDatasetInfo(medicalResults[0]));
34
- }
35
- // Demo 3: Computer Vision
36
- console.log("\n" + "═".repeat(80));
37
- console.log("Demo 3: Computer Vision Dataset Search");
38
- console.log("═".repeat(80) + "\n");
39
- const cvResults = await engine.search("image classification cats dogs", {
40
- limit: 3,
41
- safeOnly: true
42
- });
43
- console.log(formatSearchResults(cvResults));
44
- // Demo 4: Show quality warnings
45
- console.log("\n" + "═".repeat(80));
46
- console.log("Demo 4: Niche Query with Quality Warnings");
47
- console.log("═".repeat(80) + "\n");
48
- const nicheResults = await engine.search("ancient manuscript text recognition", {
49
- limit: 3,
50
- safeOnly: true
51
- });
52
- console.log(formatSearchResults(nicheResults));
53
- console.log("\n" + "═".repeat(80));
54
- console.log("Demo Complete!");
55
- console.log("═".repeat(80) + "\n");
56
- store.close();
57
- }
58
- main().catch(console.error);
@@ -1,72 +0,0 @@
1
- import { QualityAnalyzer } from "../quality/analyzer.js";
2
- import { PipelineExecutor } from "../cleaning/executor.js";
3
- import { DataSplitter } from "../splitting/splitter.js";
4
- import { MetadataPackager } from "../export/packager.js";
5
- import { DataExporter } from "../export/exporter.js";
6
- import fs from "fs";
7
- import path from "path";
8
- async function main() {
9
- console.log(" Vesper Data Ops Engine - End-to-End Pipeline Demo\n");
10
- const sessionDir = path.join(process.cwd(), "e2e_demo_output");
11
- if (fs.existsSync(sessionDir))
12
- fs.rmSync(sessionDir, { recursive: true, force: true });
13
- fs.mkdirSync(sessionDir);
14
- const rawFile = path.join(sessionDir, "raw_data.csv");
15
- // --- STEP 0: Create Raw "Dirty" Data ---
16
- console.log(" Step 0: Initializing Raw Dataset...");
17
- let content = "id,name,age,salary,joined_date\n";
18
- content += "1,Alice,25,50000,2023-01-01\n";
19
- content += "2,Bob,,60000,2023-01-05\n"; // Missing age
20
- content += "3,Charlie,35,70000,2023-01-10\n";
21
- content += "1,Alice,25,50000,2023-01-01\n"; // Duplicate
22
- content += "4,Diana,40,invalid,2023-02-01\n"; // Type mismatch
23
- fs.writeFileSync(rawFile, content);
24
- // --- STEP 1: Quality Analysis ---
25
- console.log(" Step 1: Running Quality Analysis...");
26
- const analyzer = new QualityAnalyzer();
27
- const report = await analyzer.analyze(rawFile);
28
- console.log(` - Quality Score: ${report.overall_score}/100`);
29
- console.log(` - Warnings Found: ${report.warnings.length}`);
30
- // --- STEP 2: Auto-Cleaning ---
31
- console.log(" Step 2: Generating and Executing Cleaning Pipeline...");
32
- const executor = new PipelineExecutor();
33
- const pipelineResult = await executor.runPipeline("demo-dataset", rawFile);
34
- const cleanedFile = pipelineResult.final_output_path;
35
- console.log(` - Cleaned file: ${path.basename(cleanedFile)}`);
36
- // --- STEP 3: Smart Splitting ---
37
- console.log(" Step 3: Splitting into Train/Val/Test/Holdout (Stratified)...");
38
- const splitter = new DataSplitter();
39
- const splitResult = await splitter.split(cleanedFile, {
40
- type: "random", // Using random since dummy data too small for stratified
41
- ratios: { train: 0.6, val: 0.2, test: 0.1, holdout: 0.1 },
42
- shuffle: true,
43
- random_seed: 42
44
- });
45
- console.log(` - Splits created: ${Object.keys(splitResult.paths).join(", ")}`);
46
- // --- STEP 4: Export to Parquet ---
47
- console.log(" Step 4: Exporting Final Train Set to Parquet...");
48
- const exporter = new DataExporter();
49
- const parquetFile = path.join(sessionDir, "train_final.parquet");
50
- await exporter.export(splitResult.paths.train, parquetFile, "parquet");
51
- console.log(` - Exported: ${path.basename(parquetFile)}`);
52
- // --- STEP 5: Metadata Packaging ---
53
- console.log(" Step 5: Creating Final Data Package...");
54
- const packager = new MetadataPackager();
55
- const packageResult = await packager.createPackage(path.join(sessionDir, "vesper_package"), [
56
- { path: parquetFile, format: "parquet", name: "train-set" },
57
- { path: splitResult.paths.holdout, format: "csv", name: "holdout-set" }
58
- ], {
59
- name: "Vesper E2E Demo",
60
- version: "1.0.0",
61
- description: "Automatically cleaned and partitioned dataset.",
62
- license: "MIT",
63
- author: "Vesper Engine"
64
- }, { qualityReport: report });
65
- console.log(" Pipeline Finished Successfully!");
66
- console.log(` Package Location: ${packageResult.packagePath}`);
67
- console.log(` Manifest: datapackage.json created.`);
68
- }
69
- main().catch(err => {
70
- console.error(" Pipeline Failed:", err);
71
- process.exit(1);
72
- });
@@ -1,103 +0,0 @@
1
- import { HuggingFaceScraper } from "../metadata/scraper.js";
2
- import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
3
- import { MetadataStore } from "../metadata/store.js";
4
- import path from "path";
5
- /**
6
- * Realistic massive scraper: Get maximum from HF + extensive Kaggle coverage
7
- * Phase 1: Bulk Discovery (Skeleton Indexing)
8
- * Phase 2: Kaggle Discovery
9
- */
10
- async function main() {
11
- const scraper = new HuggingFaceScraper();
12
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
13
- const store = new MetadataStore(dbPath);
14
- const allDatasets = new Map();
15
- try {
16
- // Load existing datasets to avoid duplicates
17
- console.error(`\nLoading existing datasets from database...`);
18
- const existing = store.getAllDatasets();
19
- for (const ds of existing) {
20
- allDatasets.set(ds.id, ds);
21
- }
22
- console.error(`Found ${existing.length} existing datasets in database`);
23
- // Phase 1: Bulk discovery from HuggingFace
24
- console.error(`\nPhase 1: Bulk HuggingFace Discovery (Target: 30,000)`);
25
- const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
26
- if (hfToken) {
27
- console.error(`Using HuggingFace token (rate limits should be higher)`);
28
- }
29
- else {
30
- console.error(`WARNING: No HF_TOKEN found. Bulk scraping may be slower.`);
31
- }
32
- const hfLimit = 30000;
33
- const hfDatasets = await scraper.scrapeBulk(hfLimit);
34
- let newHfCount = 0;
35
- for (const ds of hfDatasets) {
36
- if (!allDatasets.has(ds.id)) {
37
- allDatasets.set(ds.id, ds);
38
- newHfCount++;
39
- }
40
- }
41
- console.error(`HuggingFace Bulk Discovery: ${newHfCount} new datasets (${allDatasets.size} total unique)`);
42
- // Save HF progress
43
- console.error(`Saving HF discovery results to database...`);
44
- store.beginTransaction();
45
- try {
46
- for (const ds of hfDatasets) {
47
- store.saveDataset(ds);
48
- }
49
- store.commit();
50
- }
51
- catch (e) {
52
- store.rollback();
53
- console.error("Failed to save HF discovery progress:", e);
54
- }
55
- // Phase 2: Extensive Kaggle scraping across many categories
56
- const kaggleUser = process.env.KAGGLE_USERNAME;
57
- const kaggleKey = process.env.KAGGLE_KEY;
58
- if (kaggleUser && kaggleKey) {
59
- console.error(`\nPhase 2: Extensive Kaggle scraping`);
60
- const kaggleScraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
61
- // Comprehensive Kaggle search terms
62
- const kaggleSearches = [
63
- "machine learning", "deep learning", "data science",
64
- "classification", "regression", "clustering", "anomaly detection",
65
- "natural language processing", "text classification", "sentiment analysis",
66
- "image classification", "object detection", "medical imaging",
67
- "time series", "forecasting", "financial forecasting",
68
- "healthcare", "medical diagnosis", "e-commerce", "social media"
69
- ];
70
- for (const search of kaggleSearches) {
71
- console.error(` [Kaggle: "${search}"] Fetching...`);
72
- try {
73
- const kaggleDatasets = await kaggleScraper.scrape(search, 100, true);
74
- let newKaggleCount = 0;
75
- for (const ds of kaggleDatasets) {
76
- ds.id = `kaggle:${ds.id}`;
77
- if (!allDatasets.has(ds.id)) {
78
- allDatasets.set(ds.id, ds);
79
- newKaggleCount++;
80
- store.saveDataset(ds); // Save individually to avoid long transactions
81
- }
82
- }
83
- console.error(` ${newKaggleCount} new datasets (${kaggleDatasets.length} total fetched, ${allDatasets.size} total unique)`);
84
- }
85
- catch (e) {
86
- console.error(` ERROR: ${e.message}`);
87
- }
88
- await new Promise(resolve => setTimeout(resolve, 2000));
89
- }
90
- }
91
- console.error(`\nBulk Discovery Complete!`);
92
- console.error(`Total unique datasets in library: ${allDatasets.size}`);
93
- console.error(`\nNext step: Run 'npm run index' to update vectors.`);
94
- }
95
- catch (error) {
96
- console.error("\nERROR: Massive scraping failed:", error);
97
- process.exit(1);
98
- }
99
- finally {
100
- store.close();
101
- }
102
- }
103
- main();
@@ -1,33 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { JobManager } from "../jobs/manager.js";
3
- import path from "path";
4
- async function showDashboard() {
5
- const dbPath = path.resolve("data", "vesper.db");
6
- const store = new MetadataStore(dbPath);
7
- // In a real app, this would be a shared instance in a long-running process
8
- const manager = JobManager.getInstance(store);
9
- const stats = manager.stats;
10
- console.clear();
11
- console.log("==========================================");
12
- console.log(" VESPER OPERATIONS DASHBOARD ");
13
- console.log("==========================================\n");
14
- const prometheus = stats.getPrometheusMetrics();
15
- const summary = stats.getStats();
16
- if (Object.keys(summary).length === 0) {
17
- console.log("No metrics recorded yet. Process some jobs to see data.");
18
- }
19
- else {
20
- console.log("--- SYSTEM METRICS (JSON) ---");
21
- console.log(JSON.stringify(summary, null, 2));
22
- console.log("\n--- PROMETHEUS EXPORT (RAW) ---");
23
- console.log(prometheus);
24
- }
25
- console.log("\n==========================================");
26
- console.log(" Press Ctrl+C to exit dashboard loop ");
27
- console.log("==========================================");
28
- store.close();
29
- }
30
- // Simple loop for "real-time" feel (simulated)
31
- console.log("Starting Dashboard...");
32
- showDashboard();
33
- setInterval(showDashboard, 5000);
@@ -1,37 +0,0 @@
1
- import { JobManager } from "../jobs/manager.js";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import fs from "fs";
4
- async function repro() {
5
- const dbPath = "repro_test.db";
6
- if (fs.existsSync(dbPath))
7
- fs.unlinkSync(dbPath);
8
- const store = new MetadataStore(dbPath);
9
- const jobManager = JobManager.getInstance(store);
10
- console.log("Setting up listener...");
11
- jobManager.on("processJob", async (job, execute) => {
12
- console.log(`Listener received job ${job.id}`);
13
- const task = async () => {
14
- console.log("Running task...");
15
- return "success";
16
- };
17
- try {
18
- await execute(task);
19
- console.log("Execute finished");
20
- }
21
- catch (e) {
22
- console.error("Execute failed in listener:", e.message);
23
- }
24
- });
25
- console.log("Creating job...");
26
- const job = jobManager.createJob("prepare", 0, { query: "test" });
27
- console.log(`Job created: ${job.id}`);
28
- // Wait for a bit
29
- await new Promise(r => setTimeout(r, 2000));
30
- const finalJob = store.getJob(job.id);
31
- console.log("Final job status:", finalJob?.status);
32
- console.log("Final job status text:", finalJob?.status_text);
33
- store.close();
34
- if (fs.existsSync(dbPath))
35
- fs.unlinkSync(dbPath);
36
- }
37
- repro().catch(console.error);
@@ -1,56 +0,0 @@
1
- import path from "path";
2
- import fs from "fs";
3
- import { spawnSync } from "child_process";
4
- const pythonPath = "python";
5
- const scriptPath = path.join(process.cwd(), "src", "python", "cleaner.py");
6
- const testDir = path.join(process.cwd(), "test_repro");
7
- if (!fs.existsSync(testDir))
8
- fs.mkdirSync(testDir);
9
- async function runRepro() {
10
- console.log("=== Reproducing CSV Export Bug ===\n");
11
- const parquetFile = path.join(testDir, "test_nested.parquet");
12
- const csvOutput = path.join(testDir, "test_nested_cleaned.csv");
13
- // 1. Create a Parquet file with nested data (Lists/Structs) using Python
14
- console.log("Creating nested Parquet file...");
15
- const createScript = `
16
- import polars as pl
17
- df = pl.DataFrame({
18
- "id": [1, 2, 3],
19
- "tags": [["a", "b"], ["c"], []],
20
- "meta": [{"score": 0.9, "safe": True}, {"score": 0.4, "safe": False}, {"score": 0.1, "safe": True}]
21
- })
22
- df.write_parquet(r"${parquetFile}")
23
- `;
24
- fs.writeFileSync(path.join(testDir, "create_data.py"), createScript);
25
- spawnSync(pythonPath, [path.join(testDir, "create_data.py")], { stdio: 'inherit' });
26
- // 2. Call cleaner.py to convert to CSV
27
- console.log("Calling cleaner.py to convert to CSV...");
28
- const result = spawnSync(pythonPath, [
29
- scriptPath,
30
- parquetFile,
31
- "[]",
32
- "csv"
33
- ]);
34
- console.log("Exit Code:", result.status);
35
- console.log("Stdout:", result.stdout?.toString());
36
- console.log("Stderr:", result.stderr?.toString());
37
- if (result.status === 0) {
38
- try {
39
- const data = JSON.parse(result.stdout.toString());
40
- if (data.success) {
41
- console.log("SUCCESS! Output file:", data.output_path);
42
- if (fs.existsSync(data.output_path)) {
43
- console.log("File exists on disk.");
44
- }
45
- }
46
- else {
47
- console.error("cleaner.py reported failure:", data.error);
48
- }
49
- }
50
- catch (e) {
51
- console.error("Failed to parse JSON output:", e);
52
- }
53
- }
54
- }
55
- runRepro().catch(console.error);
56
- runRepro().catch(console.error);