vesper-wizard 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,34 +0,0 @@
1
- import { KaggleDownloader } from "../ingestion/kaggle-downloader.js";
2
- import path from "path";
3
- import fs from "fs";
4
- import { fileURLToPath } from "url";
5
- const __filename = fileURLToPath(import.meta.url);
6
- const __dirname = path.dirname(__filename);
7
- const projectRoot = path.join(__dirname, "..", "..");
8
- async function testKaggleDownload() {
9
- // Try to get credentials from env
10
- const user = process.env.KAGGLE_USERNAME;
11
- const key = process.env.KAGGLE_KEY;
12
- if (!user || !key) {
13
- console.error("KAGGLE_USERNAME and KAGGLE_KEY must be set to run this test.");
14
- return;
15
- }
16
- const downloader = new KaggleDownloader(user, key);
17
- const repoId = "shivam2503/diamonds"; // Small classic dataset
18
- console.log(`Testing Kaggle Download for ${repoId}...`);
19
- const targetDir = path.join(projectRoot, "data", "test", "kaggle_diamonds");
20
- if (fs.existsSync(targetDir))
21
- fs.rmSync(targetDir, { recursive: true });
22
- try {
23
- const bestFile = await downloader.download(repoId, targetDir, (p) => {
24
- process.stdout.write(`\rProgress: ${p}%`);
25
- });
26
- console.log(`\nDownload complete!`);
27
- console.log(`Best file found: ${bestFile}`);
28
- console.log(`Size: ${fs.statSync(bestFile).size} bytes`);
29
- }
30
- catch (e) {
31
- console.error("\nKaggle Download failed:", e.message);
32
- }
33
- }
34
- testKaggleDownload().catch(console.error);
@@ -1,50 +0,0 @@
1
- import { StreamProcessor } from "../data/streaming.js";
2
- import { WorkerPool } from "../data/worker-pool.js";
3
- import { Readable } from "stream";
4
- async function runTest() {
5
- console.log("--- Initializing Large Dataset Handling Test ---");
6
- // 1. Generate Mock Data (10,000 records)
7
- const records = Array.from({ length: 10000 }, (_, i) => ({
8
- id: i,
9
- name: `Record ${i}`,
10
- value: Math.random() * 100,
11
- category: i % 2 === 0 ? "A" : "B"
12
- }));
13
- console.log(`Generated ${records.length} mock records.`);
14
- // 2. Test Sampling Strategy
15
- console.log("\n--- Testing Sampling Strategy (10%) ---");
16
- const isSampled = StreamProcessor.createSampler(10);
17
- const sample = records.filter(isSampled);
18
- console.log(`Sample size: ${sample.length} (Expected ~1000)`);
19
- // 3. Test Streaming Utility
20
- console.log("\n--- Testing Streaming Utility (Chunk size: 2000) ---");
21
- const stream = Readable.from(records);
22
- let chunkCount = 0;
23
- const processChunk = async (chunk) => {
24
- chunkCount++;
25
- console.log(`Processing chunk ${chunkCount} of size ${chunk.length}`);
26
- return chunk.map(r => ({ ...r, status: "streamed" }));
27
- };
28
- const streamedResults = await StreamProcessor.processInChunks(stream, { chunkSize: 2000 }, processChunk);
29
- console.log(`Total streamed records: ${streamedResults.length}`);
30
- // 4. Test Parallel Worker Pool
31
- console.log("\n--- Testing Parallel Worker Pool (Optional) ---");
32
- try {
33
- const pool = new WorkerPool(4);
34
- console.log("Running parallel transformation on all records...");
35
- const start = Date.now();
36
- const parallelResults = await pool.processParallel(records, 2500);
37
- const end = Date.now();
38
- console.log(`Parallel processing took ${end - start}ms`);
39
- }
40
- catch (err) {
41
- console.warn(" Parallel Worker Pool skipped or failed (Common with tsx + worker_threads). Streaming and Sampling are still functional.");
42
- }
43
- if (streamedResults.length === 10000) {
44
- console.log("\n✅ Success: Chunked streaming and sampling verified.");
45
- }
46
- else {
47
- console.error("\n❌ Failure: Processing mismatch.");
48
- }
49
- }
50
- runTest().catch(console.error);
@@ -1,74 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { JobManager } from "../jobs/manager.js";
3
- import { SearchEngine } from "../search/engine.js";
4
- import { VectorStore } from "../search/vector-store.js";
5
- import { Embedder } from "../search/embedder.js";
6
- import { PipelineExecutor } from "../cleaning/executor.js";
7
- import path from "path";
8
- import fs from "fs";
9
- async function testPhase5Tools() {
10
- console.log(" Testing Phase 5.2: New MCP Tools Integration\n");
11
- const dbPath = path.join(process.cwd(), "data", "metadata.db");
12
- const vectorPath = path.join(process.cwd(), "data", "vectors.json");
13
- const store = new MetadataStore(dbPath);
14
- const vectorStore = new VectorStore(vectorPath);
15
- const embedder = Embedder.getInstance();
16
- const searchEngine = new SearchEngine(store, vectorStore, embedder);
17
- const jobManager = JobManager.getInstance(store);
18
- const executor = new PipelineExecutor();
19
- // 1. Test compare_datasets
20
- console.log("Step 1: Testing compare_datasets...");
21
- const allDs = store.getAllDatasets();
22
- if (allDs.length < 2) {
23
- console.warn(" ! Need at least 2 datasets in DB to test comparison. Skipping.");
24
- }
25
- else {
26
- const ids = allDs.slice(0, 2).map(d => d.id);
27
- console.log(` - Comparing: ${ids.join(", ")}`);
28
- // Simulating comparison table logic from index.ts
29
- const datasets = ids.map(id => store.getDataset(id)).filter(d => !!d);
30
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
31
- comparison += "| Quality Score | " + datasets.map(d => d.quality_score).join(" | ") + " |\n";
32
- console.log(comparison);
33
- }
34
- // 2. Test analyze_quality / preview_cleaning (Local logic)
35
- console.log("\nStep 2: Testing Analysis/Preview (Simulated)...");
36
- const testFile = path.join(process.cwd(), "e2e_demo_output", "raw_data.csv");
37
- if (fs.existsSync(testFile)) {
38
- console.log(" - Testing on e2e_demo_output/raw_data.csv");
39
- // Logic check
40
- }
41
- else {
42
- console.warn(" ! No test file found at e2e_demo_output/raw_data.csv. Run 'tsx src/scripts/e2e-demo.ts' first.");
43
- }
44
- // 3. Test prepare_dataset (Async Job)
45
- console.log("\nStep 3: Testing prepare_dataset (Autonomous Orchestrator)...");
46
- const query = "financial data for stock prediction";
47
- // Create job (Logic from index.ts)
48
- const job = jobManager.createJob("prepare", 0, { query });
49
- console.log(` - Job Created: ${job.id}`);
50
- // Register listener for simulated work (Logic from index.ts)
51
- jobManager.on("processJob", async (currJob, execute) => {
52
- if (currJob.id !== job.id)
53
- return;
54
- await execute(async () => {
55
- console.log(" - [Worker] Starting autonomous preparation task...");
56
- await new Promise(r => setTimeout(r, 1000));
57
- console.log(" - [Worker] Phase 1: Search complete");
58
- await new Promise(r => setTimeout(r, 1000));
59
- console.log(" - [Worker] Phase 2: Quality analysis complete");
60
- return "data/exports/prepared_dataset.parquet";
61
- });
62
- });
63
- // 4. Test check_job_status (Polling)
64
- console.log("\nStep 4: Polling Job Status (Simulating UI Check)...");
65
- for (let i = 0; i < 5; i++) {
66
- const currentJob = store.getJob(job.id);
67
- console.log(` - [TS +${i}s] Status: ${currentJob?.status}, Progress: ${currentJob?.progress}%, Activity: ${currentJob?.status_text}`);
68
- if (currentJob?.status === "completed")
69
- break;
70
- await new Promise(r => setTimeout(r, 800));
71
- }
72
- console.log("\n Phase 5 tools logic verified.");
73
- }
74
- testPhase5Tools().catch(console.error);
@@ -1,61 +0,0 @@
1
- import { MediaAnalyzer } from "../quality/media-analyzer.js";
2
- import path from "path";
3
- import fs from "fs";
4
- import { execSync } from "child_process";
5
- async function runTest() {
6
- console.log("--- Testing Media Quality Analysis ---");
7
- const projectRoot = path.resolve(".");
8
- const analyzer = new MediaAnalyzer(projectRoot);
9
- // 1. Create a sample audio file using Python (sine wave)
10
- const testMediaDir = path.join(projectRoot, "data", "test-media");
11
- if (!fs.existsSync(testMediaDir))
12
- fs.mkdirSync(testMediaDir, { recursive: true });
13
- const audioPath = path.join(testMediaDir, "test_audio.wav");
14
- console.log("Generating test audio file (sine wave)...");
15
- const pythonScript = `
16
- import numpy as np
17
- import soundfile as sf
18
- sample_rate = 44100
19
- duration = 2.0
20
- frequency = 440.0
21
- t = np.linspace(0, duration, int(sample_rate * duration))
22
- audio = 0.5 * np.sin(2 * np.pi * frequency * t)
23
- sf.write('${audioPath.replace(/\\/g, "\\\\")}', audio, sample_rate)
24
- `;
25
- fs.writeFileSync(path.join(testMediaDir, "gen_audio.py"), pythonScript);
26
- try {
27
- execSync(`python "${path.join(testMediaDir, "gen_audio.py")}"`);
28
- }
29
- catch (e) {
30
- console.log("⚠️ Could not generate audio (soundfile may not be installed). Skipping audio test.");
31
- console.log("VERIFICATION_STATUS: ⚠️ PARTIAL (Audio generation failed)");
32
- return;
33
- }
34
- // 2. Run Analysis
35
- console.log(`Analyzing ${audioPath}...`);
36
- try {
37
- const report = await analyzer.analyze(audioPath);
38
- console.log("Analysis Result:");
39
- console.log(`- Total Files: ${report.total_files}`);
40
- console.log(`- OK Files: ${report.ok_files}`);
41
- if ('avg_audio_duration' in report) {
42
- console.log(`- Average Audio Duration: ${report.avg_audio_duration}s`);
43
- const firstAudio = report.details[0];
44
- if (firstAudio && 'sample_rate' in firstAudio) {
45
- console.log(`- Sample Rate: ${firstAudio.sample_rate}Hz`);
46
- console.log(`- Duration: ${firstAudio.duration}s`);
47
- }
48
- }
49
- if (report.ok_files === 1 && report.total_files === 1) {
50
- console.log("\nVERIFICATION_STATUS: ✅ PASS");
51
- }
52
- else {
53
- console.log("\nVERIFICATION_STATUS: ❌ FAIL - Incorrect stats");
54
- }
55
- }
56
- catch (e) {
57
- console.error(`Analysis failed: ${e.message}`);
58
- console.log("\nVERIFICATION_STATUS: ❌ FAIL");
59
- }
60
- }
61
- runTest().catch(console.error);
@@ -1,91 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { MonitoringStore } from "../metadata/monitoring-store.js";
3
- import { MonitoringService } from "../metadata/monitoring-service.js";
4
- import path from "path";
5
- async function runTest() {
6
- const dbPath = path.resolve("data", "test-monitoring.db");
7
- const metadataStore = new MetadataStore(dbPath);
8
- const monitoringStore = new MonitoringStore(metadataStore.db);
9
- const service = new MonitoringService(monitoringStore, metadataStore);
10
- // 1. Setup mock data
11
- const dsId = "test/dataset-1";
12
- const initialMetadata = {
13
- id: dsId,
14
- source: "huggingface",
15
- name: "Test Dataset",
16
- description: "Initial description",
17
- downloads: 100,
18
- likes: 10,
19
- last_updated: "2023-01-01T00:00:00Z",
20
- download_url: "https://huggingface.co/datasets/test/dataset-1",
21
- quality_score: 85,
22
- splits: [{ name: "train", num_examples: 1000 }],
23
- license: { id: "mit", category: "safe", usage_restrictions: [], warnings: [] },
24
- tags: [],
25
- task: "text-classification",
26
- languages: ["en"],
27
- is_structured: true,
28
- has_target_column: true,
29
- total_examples: 1000,
30
- is_safe_source: true,
31
- has_personal_data: false,
32
- is_paywalled: false,
33
- is_scraped_web_data: false,
34
- uses_https: true,
35
- has_train_split: true,
36
- has_test_split: false,
37
- has_validation_split: false,
38
- description_length: 100,
39
- has_readme: true,
40
- quality_warnings: []
41
- };
42
- metadataStore.saveDataset(initialMetadata);
43
- // 2. Setup Monitor and Webhook
44
- monitoringStore.saveWebhook({
45
- id: "slack-1",
46
- name: "General Slack",
47
- channel: "slack",
48
- url: "https://hooks.slack.com/services/...",
49
- enabled: true
50
- });
51
- monitoringStore.saveMonitor({
52
- dataset_id: dsId,
53
- enabled: true,
54
- auto_reprocess: true,
55
- last_checked_version: initialMetadata.last_updated,
56
- webhook_ids: ["slack-1"],
57
- created_at: new Date().toISOString(),
58
- updated_at: new Date().toISOString()
59
- });
60
- console.log("--- Monitoring Setup Complete ---");
61
- // 3. Simulate an update
62
- const updatedMetadata = {
63
- ...initialMetadata,
64
- last_updated: "2023-02-01T00:00:00Z", // New version
65
- downloads: 150,
66
- quality_score: 90,
67
- total_examples: 1200 // Significant change
68
- };
69
- console.log("Checking for updates...");
70
- // Mock fetch function
71
- const mockFetch = async (id, source) => {
72
- if (id === dsId)
73
- return updatedMetadata;
74
- return null;
75
- };
76
- const diffs = await service.checkUpdates(mockFetch);
77
- console.log(`Found ${diffs.length} updates.`);
78
- if (diffs.length > 0) {
79
- const diff = diffs[0];
80
- console.log(`Version Change: ${diff.old_version} -> ${diff.new_version}`);
81
- console.log(`Impact Score: ${diff.impact_score}`);
82
- console.log("Changes:", JSON.stringify(diff.changes, null, 2));
83
- }
84
- if (diffs.length > 0 && diffs[0].impact_score > 0) {
85
- console.log("\n✅ Success: Dataset update detected and diffed correctly.");
86
- }
87
- else {
88
- console.error("\n❌ Failure: Update not detected.");
89
- }
90
- }
91
- runTest().catch(console.error);
@@ -1,106 +0,0 @@
1
- import { MetadataStore } from "../metadata/store.js";
2
- import { JobManager } from "../jobs/manager.js";
3
- import { ObservabilityService, MockErrorTracker } from "../monitoring/observability.js";
4
- import path from "path";
5
- async function runTest() {
6
- console.log("--- Initializing Enhanced Observability Test ---");
7
- const dbPath = path.resolve("data", "test-obs-10.db");
8
- const store = new MetadataStore(dbPath);
9
- const errorTracker = new MockErrorTracker();
10
- const stats = new ObservabilityService(errorTracker);
11
- const manager = JobManager.getInstance(store, stats);
12
- manager.setConcurrency(5); // Increased for faster test
13
- const jobsToRun = 10;
14
- // Listener to simulate job processing
15
- manager.on("processJob", async (job, run) => {
16
- // Fix: Parse JSON metadata correctly
17
- const meta = job.metadata ? JSON.parse(job.metadata) : null;
18
- const isFailing = meta === "FAIL";
19
- const isSlow = meta === "SLOW";
20
- await run(async () => {
21
- // Simulated delay: 100ms for fast, 800ms for slow
22
- const delay = isSlow ? 800 : 100;
23
- await new Promise(r => setTimeout(r, delay));
24
- if (isFailing) {
25
- console.log(`[Worker] Failing Job: ${job.id}`);
26
- throw new Error("Simulated job failure for metrics tracking");
27
- }
28
- console.log(`[Worker] Finished Job: ${job.id}`);
29
- });
30
- });
31
- console.log(`\nEnqueuing ${jobsToRun} mixed jobs...`);
32
- // Enqueue 10 mixed jobs
33
- // [Success, Success, FAIL, SlowSuccess, FAIL, Success, Success, SlowSuccess, FAIL, Success] -> 7 Success, 3 Fail
34
- const jobConfig = [
35
- { type: "clean", meta: "SUCCESS" }, // 1
36
- { type: "prepare", meta: "SUCCESS" }, // 2
37
- { type: "fusion", meta: "FAIL" }, // 3
38
- { type: "clean", meta: "SLOW" }, // 4
39
- { type: "prepare", meta: "FAIL" }, // 5
40
- { type: "fusion", meta: "SUCCESS" }, // 6
41
- { type: "clean", meta: "SUCCESS" }, // 7
42
- { type: "prepare", meta: "SLOW" }, // 8
43
- { type: "fusion", meta: "FAIL" }, // 9
44
- { type: "clean", meta: "SUCCESS" } // 10
45
- ];
46
- for (const conf of jobConfig) {
47
- // Fix: Use maxAttempts: 1 to avoid retries and get immediate metrics
48
- manager.createJob(conf.type, 0, conf.meta, 1);
49
- }
50
- // Monitor for completion
51
- return new Promise((resolve) => {
52
- const interval = setInterval(() => {
53
- const currentStats = stats.getStats();
54
- const totalFinished = Object.values(currentStats).reduce((acc, s) => acc + s.successCount + s.failureCount, 0);
55
- if (totalFinished >= jobsToRun) {
56
- clearInterval(interval);
57
- verify(stats, errorTracker, resolve);
58
- }
59
- }, 500);
60
- });
61
- }
62
- function verify(stats, errorTracker, resolve) {
63
- console.log("\n--- Verification Results ---");
64
- const prometheus = stats.getPrometheusMetrics();
65
- const summary = stats.getStats();
66
- const actualSuccess = Object.values(summary).reduce((acc, s) => acc + s.successCount, 0);
67
- const actualFailure = Object.values(summary).reduce((acc, s) => acc + s.failureCount, 0);
68
- console.log(`Measured Success: ${actualSuccess} (Expected 7)`);
69
- console.log(`Measured Failure: ${actualFailure} (Expected 3)`);
70
- let failed = false;
71
- if (actualSuccess === 7 && actualFailure === 3) {
72
- console.log("✅ Jobs processed/hour (total counts) verified.");
73
- }
74
- else {
75
- console.error("❌ Stats mismatch.");
76
- failed = true;
77
- }
78
- // 2. Verify Latency Histogram
79
- const hasLatency = prometheus.includes("job_duration_seconds_sum");
80
- const maxLatencyClean = summary.clean?.maxDuration;
81
- console.log(`Max Latency (clean): ${maxLatencyClean} (Expected > 100ms)`);
82
- if (hasLatency && parseFloat(maxLatencyClean) > 0) {
83
- console.log("✅ Latency histogram contains recorded values.");
84
- }
85
- else {
86
- console.error("❌ Latency data missing.");
87
- failed = true;
88
- }
89
- // 3. Mock Sentry / Error Tracking Verification
90
- console.log(`Exceptions Captured: ${errorTracker.exceptions.length} (Expected 3)`);
91
- if (errorTracker.exceptions.length === 3) {
92
- console.log("✅ Error tracking successfully logged all failed jobs.");
93
- }
94
- else {
95
- console.error("❌ Error tracking mismatch.");
96
- failed = true;
97
- }
98
- if (!failed) {
99
- console.log("\n✅ Success: Enhanced observability verification complete.");
100
- }
101
- else {
102
- console.error("\n❌ Failure: Observability verification failed.");
103
- }
104
- resolve();
105
- }
106
- runTest().catch(console.error);
@@ -1,55 +0,0 @@
1
- import { MetadataPackager } from "../export/packager.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Metadata Packager Test ===\n");
6
- const packager = new MetadataPackager();
7
- const testDir = path.join(process.cwd(), "test_package_input");
8
- const outputDir = path.join(process.cwd(), "test_package_output");
9
- if (!fs.existsSync(testDir))
10
- fs.mkdirSync(testDir);
11
- if (fs.existsSync(outputDir))
12
- fs.rmSync(outputDir, { recursive: true, force: true });
13
- // 1. Create Mock Data
14
- const dataPath = path.join(testDir, "train.csv");
15
- fs.writeFileSync(dataPath, "id,value\n1,10\n2,20\n");
16
- const qualityReport = {
17
- score: 85,
18
- issues: ["Missing values in column 'label'"]
19
- };
20
- const metadata = {
21
- name: "Test Dataset",
22
- version: "1.0.0",
23
- description: "A test dataset for packaging",
24
- license: "MIT",
25
- author: "Vesper Team",
26
- tags: ["test", "ml"]
27
- };
28
- // 2. Create Package
29
- console.log("Creating package...");
30
- const result = await packager.createPackage(outputDir, [{ path: dataPath, format: "csv", name: "training-data" }], metadata, { qualityReport });
31
- console.log("Result:", JSON.stringify(result, null, 2));
32
- if (result.success) {
33
- console.log("\nVerifying files in output...");
34
- const files = fs.readdirSync(outputDir);
35
- console.log("Files found:", files);
36
- if (files.includes("datapackage.json") && files.includes("train.csv") && files.includes("quality_report.json")) {
37
- console.log("PASS: All files present in package");
38
- const manifest = JSON.parse(fs.readFileSync(path.join(outputDir, "datapackage.json"), "utf8"));
39
- console.log("Manifest resources count:", manifest.resources.length);
40
- if (manifest.resources.length === 2 && manifest.resources[0].hash.startsWith("sha256:")) {
41
- console.log("PASS: Manifest entry and hash verified");
42
- }
43
- }
44
- else {
45
- console.error("FAIL: Missing files in package");
46
- }
47
- }
48
- else {
49
- console.error("FAIL: Packaging failed:", result.error);
50
- }
51
- // Cleanup
52
- // fs.rmSync(testDir, { recursive: true, force: true });
53
- // fs.rmSync(outputDir, { recursive: true, force: true });
54
- }
55
- main().catch(console.error);
@@ -1,50 +0,0 @@
1
- import { PipelineExecutor } from "../cleaning/executor.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Full Pipeline Test ===\n");
6
- const executor = new PipelineExecutor();
7
- const testFile = path.join(process.cwd(), "test_pipeline.csv");
8
- // Create Dirty Data
9
- // - duplicate: Duplicate row
10
- // - empty: 100% missing (should drop)
11
- // - price: "100.5" string (should fix type)
12
- const csvContent = `id,price,empty,desc
13
- 1,"100.5",,A
14
- 2,"50.2",,B
15
- 1,"100.5",,A
16
- 3,"75.0",,C`;
17
- fs.writeFileSync(testFile, csvContent);
18
- console.log(`Created dirty file: ${testFile}`);
19
- try {
20
- console.log("\nRunning Pipeline...");
21
- const result = await executor.runPipeline("test-pipeline-ds", testFile);
22
- console.log("\n=== Pipeline Report ===");
23
- console.log(`Plan Generated: ${result.plan.operations.length} operations`);
24
- // Assertions
25
- const ops = result.plan.operations.map(o => o.type);
26
- console.log("Operations:", ops);
27
- const hasDedupe = ops.includes("RemoveDuplicates");
28
- const hasDrop = ops.includes("DropColumns");
29
- const hasFix = ops.includes("FixTypes");
30
- if (hasDedupe && hasDrop && hasFix) {
31
- console.log("Plan Logic: PASS (Generated precise cleaning steps)");
32
- }
33
- else {
34
- console.error("Plan Logic: FAIL (Missed some cleaning steps)");
35
- }
36
- if (result.cleaning_result.success && fs.existsSync(result.final_output_path)) {
37
- console.log("Execution: PASS (Created cleaned file)");
38
- // Cleanup
39
- fs.unlinkSync(testFile);
40
- fs.unlinkSync(result.final_output_path);
41
- }
42
- else {
43
- console.error("Execution: FAIL");
44
- }
45
- }
46
- catch (error) {
47
- console.error("\nTest failed:", error);
48
- }
49
- }
50
- main().catch(console.error);
@@ -1,64 +0,0 @@
1
- import { CleaningPlanner } from "../cleaning/planner.js";
2
- async function main() {
3
- console.log("=== Vesper Cleaning Planner Test ===\n");
4
- const planner = new CleaningPlanner();
5
- // Mock Report
6
- const mockReport = {
7
- row_count: 1000,
8
- column_count: 4,
9
- duplicate_rows: 50,
10
- duplicate_percentage: 5.0,
11
- columns: [
12
- {
13
- name: "id", type: "Int64", inferred_type: "Int64",
14
- missing_count: 0, missing_percentage: 0, unique_count: 1000,
15
- is_constant: false, is_mixed_type: false
16
- },
17
- {
18
- name: "empty_col", type: "String", inferred_type: "String",
19
- missing_count: 950, missing_percentage: 95.0, unique_count: 1,
20
- is_constant: false, is_mixed_type: false
21
- },
22
- {
23
- name: "age", type: "String", inferred_type: "Numeric (Stored as String)",
24
- missing_count: 10, missing_percentage: 1.0, unique_count: 50,
25
- is_constant: false, is_mixed_type: false
26
- },
27
- {
28
- name: "score", type: "Float64", inferred_type: "Float64",
29
- missing_count: 5, missing_percentage: 0.5, unique_count: 900,
30
- is_constant: false, is_mixed_type: false
31
- }
32
- ],
33
- warnings: [],
34
- schema_warnings: [],
35
- overall_score: 50
36
- };
37
- console.log("Input Scenario:");
38
- console.log("- 50 duplicate rows -> Expect RemoveDuplicates");
39
- console.log("- 'empty_col' (95% missing) -> Expect DropColumns");
40
- console.log("- 'age' (String but looks Numeric) -> Expect FixTypes");
41
- console.log("- 'score' (Missing values) -> Expect FillMissing");
42
- const plan = await planner.generatePlan("test-dataset", mockReport);
43
- console.log("\n=== Generated Plan ===");
44
- plan.operations.forEach((op, i) => {
45
- console.log(`${i + 1}. [${op.type}] Reason: ${op.reason}`);
46
- if (op.params && Object.keys(op.params).length > 0) {
47
- console.log(` Params: ${JSON.stringify(op.params)}`);
48
- }
49
- });
50
- // Verification
51
- const types = plan.operations.map(o => o.type);
52
- const hasDedupe = types.includes("RemoveDuplicates");
53
- const hasDrop = types.includes("DropColumns");
54
- const hasFix = types.includes("FixTypes");
55
- const hasFill = types.includes("FillMissing");
56
- if (hasDedupe && hasDrop && hasFix && hasFill) {
57
- console.log("\nTest passed! All expected operations generated.");
58
- }
59
- else {
60
- console.error("\nTest failed! Missing expected operations.");
61
- console.log("Found:", types);
62
- }
63
- }
64
- main().catch(console.error);
@@ -1,38 +0,0 @@
1
- import { QualityAnalyzer } from "../quality/analyzer.js";
2
- import fs from "fs";
3
- import path from "path";
4
- async function main() {
5
- console.log("=== Vesper Privacy Scanner Test ===\n");
6
- const analyzer = new QualityAnalyzer();
7
- const testFile = path.join(process.cwd(), "test_privacy.csv");
8
- // Create a CSV with PII
9
- const csvContent = `id,name,contact,notes
10
- 1,John Doe,john@example.com,"User IP: 192.168.1.1"
11
- 2,Jane Smith,(555) 123-4567,"SSN is 123-45-6789"
12
- 3,Bob Jones,bob@work.com,"Safe notes"`;
13
- fs.writeFileSync(testFile, csvContent);
14
- console.log(`Created test file: ${testFile}`);
15
- console.log("Expectations:");
16
- console.log(" - Should detect 'Email' in 'contact' column");
17
- console.log(" - Should detect 'Phone' in 'contact' column");
18
- console.log(" - Should detect 'SSN' in 'notes' column");
19
- console.log(" - Should detect 'IPv4' in 'notes' column");
20
- try {
21
- console.log("\nRunning analyzer...");
22
- const report = await analyzer.analyze(testFile);
23
- console.log("\n=== Privacy Report ===");
24
- if (report.pii_warnings && report.pii_warnings.length > 0) {
25
- report.pii_warnings.forEach(w => console.log(`[!] ${w}`));
26
- console.log("\nTest passed! PII detected.");
27
- }
28
- else {
29
- console.error("\nTest failed! No PII detected.");
30
- }
31
- // Cleanup
32
- fs.unlinkSync(testFile);
33
- }
34
- catch (error) {
35
- console.error("\nTest failed:", error);
36
- }
37
- }
38
- main().catch(console.error);
@@ -1,36 +0,0 @@
1
- import { JobManager } from "../jobs/manager.js";
2
- import { MetadataStore } from "../metadata/store.js";
3
- import fs from "fs";
4
- async function testSync() {
5
- console.log("Starting Production Sync Test...");
6
- const dbPath = "prod_sync.db";
7
- if (fs.existsSync(dbPath))
8
- fs.unlinkSync(dbPath);
9
- const store = new MetadataStore(dbPath);
10
- const jobManager = JobManager.getInstance(store);
11
- console.log("Attaching listener (same as index.ts)...");
12
- jobManager.on("processJob", async (job, execute) => {
13
- console.log(`[Listener 1] Received job ${job.id}`);
14
- if (typeof execute !== 'function') {
15
- console.error(`[Listener 1] ERROR: execute is not a function! It is: ${typeof execute}`);
16
- return;
17
- }
18
- const task = async () => {
19
- console.log("[Listener 1] Task running...");
20
- return "ok";
21
- };
22
- await execute(task);
23
- console.log("[Listener 1] Task finished.");
24
- });
25
- console.log("Emitting job...");
26
- const job = jobManager.createJob("prepare", 0, { query: "test" });
27
- // Wait for the background loop
28
- await new Promise(r => setTimeout(r, 1000));
29
- const finalJob = store.getJob(job.id);
30
- console.log(`Job Result: ${finalJob?.status} - ${finalJob?.status_text}`);
31
- store.close();
32
- if (fs.existsSync(dbPath))
33
- fs.unlinkSync(dbPath);
34
- console.log("Test Complete.");
35
- }
36
- testSync().catch(console.error);