@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
package/build/scripts/demo-ui.js
@@ -0,0 +1,58 @@
+ import { SearchEngine } from "../search/engine.js";
+ import { MetadataStore } from "../metadata/store.js";
+ import { VectorStore } from "../search/vector-store.js";
+ import { Embedder } from "../search/embedder.js";
+ import { formatSearchResults, formatDatasetInfo } from "../tools/formatter.js";
+ import path from "path";
+ /**
+ * Demo script to showcase the new formatted UI
+ */
+ async function main() {
+ const dbPath = path.join(process.cwd(), "data", "metadata.db");
+ const vectorPath = path.join(process.cwd(), "data", "vectors.json");
+ const store = new MetadataStore(dbPath);
+ const vectorStore = new VectorStore(vectorPath);
+ const embedder = Embedder.getInstance();
+ await embedder.init();
+ const engine = new SearchEngine(store, vectorStore, embedder);
+ console.log("\n" + "═".repeat(80));
+ console.log("VESPER UI DEMO - Formatted Search Results");
+ console.log("═".repeat(80) + "\n");
+ // Demo 1: Medical datasets
+ console.log("Demo 1: Medical Dataset Search\n");
+ const medicalResults = await engine.search("diabetes prediction machine learning", {
+ limit: 3,
+ safeOnly: true
+ });
+ console.log(formatSearchResults(medicalResults));
+ // Demo 2: Detailed dataset info
+ if (medicalResults.length > 0) {
+ console.log("\n" + "═".repeat(80));
+ console.log("Demo 2: Detailed Dataset Information");
+ console.log("═".repeat(80) + "\n");
+ console.log(formatDatasetInfo(medicalResults[0]));
+ }
+ // Demo 3: Computer Vision
+ console.log("\n" + "═".repeat(80));
+ console.log("Demo 3: Computer Vision Dataset Search");
+ console.log("═".repeat(80) + "\n");
+ const cvResults = await engine.search("image classification cats dogs", {
+ limit: 3,
+ safeOnly: true
+ });
+ console.log(formatSearchResults(cvResults));
+ // Demo 4: Show quality warnings
+ console.log("\n" + "═".repeat(80));
+ console.log("Demo 4: Niche Query with Quality Warnings");
+ console.log("═".repeat(80) + "\n");
+ const nicheResults = await engine.search("ancient manuscript text recognition", {
+ limit: 3,
+ safeOnly: true
+ });
+ console.log(formatSearchResults(nicheResults));
+ console.log("\n" + "═".repeat(80));
+ console.log("Demo Complete!");
+ console.log("═".repeat(80) + "\n");
+ store.close();
+ }
+ main().catch(console.error);
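
One caveat in the demo above: store.close() runs only on the happy path, so an error thrown by any engine.search call leaves the database handle open. A minimal sketch of a try/finally wrapper (hypothetical; withStore is not part of the package):

// Hypothetical helper: guarantee the store is closed even when a demo
// step throws, instead of relying on reaching the final store.close().
async function withStore(store, fn) {
  try {
    return await fn(store);
  } finally {
    store.close();
  }
}

// Usage sketch: await withStore(new MetadataStore(dbPath), async (s) => { /* demos */ });
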
package/build/scripts/e2e-demo.js
@@ -0,0 +1,72 @@
+ import { QualityAnalyzer } from "../quality/analyzer.js";
+ import { PipelineExecutor } from "../cleaning/executor.js";
+ import { DataSplitter } from "../splitting/splitter.js";
+ import { MetadataPackager } from "../export/packager.js";
+ import { DataExporter } from "../export/exporter.js";
+ import fs from "fs";
+ import path from "path";
+ async function main() {
+ console.log(" Vesper Data Ops Engine - End-to-End Pipeline Demo\n");
+ const sessionDir = path.join(process.cwd(), "e2e_demo_output");
+ if (fs.existsSync(sessionDir))
+ fs.rmSync(sessionDir, { recursive: true, force: true });
+ fs.mkdirSync(sessionDir);
+ const rawFile = path.join(sessionDir, "raw_data.csv");
+ // --- STEP 0: Create Raw "Dirty" Data ---
+ console.log(" Step 0: Initializing Raw Dataset...");
+ let content = "id,name,age,salary,joined_date\n";
+ content += "1,Alice,25,50000,2023-01-01\n";
+ content += "2,Bob,,60000,2023-01-05\n"; // Missing age
+ content += "3,Charlie,35,70000,2023-01-10\n";
+ content += "1,Alice,25,50000,2023-01-01\n"; // Duplicate
+ content += "4,Diana,40,invalid,2023-02-01\n"; // Type mismatch
+ fs.writeFileSync(rawFile, content);
+ // --- STEP 1: Quality Analysis ---
+ console.log(" Step 1: Running Quality Analysis...");
+ const analyzer = new QualityAnalyzer();
+ const report = await analyzer.analyze(rawFile);
+ console.log(` - Quality Score: ${report.overall_score}/100`);
+ console.log(` - Warnings Found: ${report.warnings.length}`);
+ // --- STEP 2: Auto-Cleaning ---
+ console.log(" Step 2: Generating and Executing Cleaning Pipeline...");
+ const executor = new PipelineExecutor();
+ const pipelineResult = await executor.runPipeline("demo-dataset", rawFile);
+ const cleanedFile = pipelineResult.final_output_path;
+ console.log(` - Cleaned file: ${path.basename(cleanedFile)}`);
+ // --- STEP 3: Smart Splitting ---
+ console.log(" Step 3: Splitting into Train/Val/Test/Holdout (Stratified)...");
+ const splitter = new DataSplitter();
+ const splitResult = await splitter.split(cleanedFile, {
+ type: "random", // Using random since the dummy data is too small for stratified splitting
+ ratios: { train: 0.6, val: 0.2, test: 0.1, holdout: 0.1 },
+ shuffle: true,
+ random_seed: 42
+ });
+ console.log(` - Splits created: ${Object.keys(splitResult.paths).join(", ")}`);
+ // --- STEP 4: Export to Parquet ---
+ console.log(" Step 4: Exporting Final Train Set to Parquet...");
+ const exporter = new DataExporter();
+ const parquetFile = path.join(sessionDir, "train_final.parquet");
+ await exporter.export(splitResult.paths.train, parquetFile, "parquet");
+ console.log(` - Exported: ${path.basename(parquetFile)}`);
+ // --- STEP 5: Metadata Packaging ---
+ console.log(" Step 5: Creating Final Data Package...");
+ const packager = new MetadataPackager();
+ const packageResult = await packager.createPackage(path.join(sessionDir, "vesper_package"), [
+ { path: parquetFile, format: "parquet", name: "train-set" },
+ { path: splitResult.paths.holdout, format: "csv", name: "holdout-set" }
+ ], {
+ name: "Vesper E2E Demo",
+ version: "1.0.0",
+ description: "Automatically cleaned and partitioned dataset.",
+ license: "MIT",
+ author: "Vesper Engine"
+ }, { qualityReport: report });
+ console.log(" Pipeline Finished Successfully!");
+ console.log(` Package Location: ${packageResult.packagePath}`);
+ console.log(` Manifest: datapackage.json created.`);
+ }
+ main().catch(err => {
+ console.error(" Pipeline Failed:", err);
+ process.exit(1);
+ });
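
Step 3 hard-codes ratios that happen to sum to 1.0. Assuming the splitter expects exactly that (reasonable, though the excerpt does not document it), a small guard catches typos early; assertRatios below is a hypothetical helper, not part of the package:

// Hypothetical guard: verify split ratios sum to 1 before calling
// DataSplitter.split, allowing for floating-point rounding.
function assertRatios(ratios, epsilon = 1e-9) {
  const total = Object.values(ratios).reduce((sum, r) => sum + r, 0);
  if (Math.abs(total - 1) > epsilon) {
    throw new Error(`Split ratios must sum to 1, got ${total}`);
  }
  return ratios;
}

assertRatios({ train: 0.6, val: 0.2, test: 0.1, holdout: 0.1 }); // passes
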
package/build/scripts/massive-scrape.js
@@ -0,0 +1,103 @@
+ import { HuggingFaceScraper } from "../metadata/scraper.js";
+ import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
+ import { MetadataStore } from "../metadata/store.js";
+ import path from "path";
+ /**
+ * Realistic massive scraper: Get maximum from HF + extensive Kaggle coverage
+ * Phase 1: Bulk Discovery (Skeleton Indexing)
+ * Phase 2: Kaggle Discovery
+ */
+ async function main() {
+ const scraper = new HuggingFaceScraper();
+ const dbPath = path.join(process.cwd(), "data", "metadata.db");
+ const store = new MetadataStore(dbPath);
+ const allDatasets = new Map();
+ try {
+ // Load existing datasets to avoid duplicates
+ console.error(`\nLoading existing datasets from database...`);
+ const existing = store.getAllDatasets();
+ for (const ds of existing) {
+ allDatasets.set(ds.id, ds);
+ }
+ console.error(`Found ${existing.length} existing datasets in database`);
+ // Phase 1: Bulk discovery from HuggingFace
+ console.error(`\nPhase 1: Bulk HuggingFace Discovery (Target: 30,000)`);
+ const hfToken = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
+ if (hfToken) {
+ console.error(`Using HuggingFace token (rate limits should be higher)`);
+ }
+ else {
+ console.error(`WARNING: No HF_TOKEN found. Bulk scraping may be slower.`);
+ }
+ const hfLimit = 30000;
+ const hfDatasets = await scraper.scrapeBulk(hfLimit);
+ let newHfCount = 0;
+ for (const ds of hfDatasets) {
+ if (!allDatasets.has(ds.id)) {
+ allDatasets.set(ds.id, ds);
+ newHfCount++;
+ }
+ }
+ console.error(`HuggingFace Bulk Discovery: ${newHfCount} new datasets (${allDatasets.size} total unique)`);
+ // Save HF progress
+ console.error(`Saving HF discovery results to database...`);
+ store.beginTransaction();
+ try {
+ for (const ds of hfDatasets) {
+ store.saveDataset(ds);
+ }
+ store.commit();
+ }
+ catch (e) {
+ store.rollback();
+ console.error("Failed to save HF discovery progress:", e);
+ }
+ // Phase 2: Extensive Kaggle scraping across many categories
+ const kaggleUser = process.env.KAGGLE_USERNAME;
+ const kaggleKey = process.env.KAGGLE_KEY;
+ if (kaggleUser && kaggleKey) {
+ console.error(`\nPhase 2: Extensive Kaggle scraping`);
+ const kaggleScraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
+ // Comprehensive Kaggle search terms
+ const kaggleSearches = [
+ "machine learning", "deep learning", "data science",
+ "classification", "regression", "clustering", "anomaly detection",
+ "natural language processing", "text classification", "sentiment analysis",
+ "image classification", "object detection", "medical imaging",
+ "time series", "forecasting", "financial forecasting",
+ "healthcare", "medical diagnosis", "e-commerce", "social media"
+ ];
+ for (const search of kaggleSearches) {
+ console.error(` [Kaggle: "${search}"] Fetching...`);
+ try {
+ const kaggleDatasets = await kaggleScraper.scrape(search, 100, true);
+ let newKaggleCount = 0;
+ for (const ds of kaggleDatasets) {
+ ds.id = `kaggle:${ds.id}`;
+ if (!allDatasets.has(ds.id)) {
+ allDatasets.set(ds.id, ds);
+ newKaggleCount++;
+ store.saveDataset(ds); // Save individually to avoid long transactions
+ }
+ }
+ console.error(` ${newKaggleCount} new datasets (${kaggleDatasets.length} total fetched, ${allDatasets.size} total unique)`);
+ }
+ catch (e) {
+ console.error(` ERROR: ${e.message}`);
+ }
+ await new Promise(resolve => setTimeout(resolve, 2000));
+ }
+ }
+ console.error(`\nBulk Discovery Complete!`);
+ console.error(`Total unique datasets in library: ${allDatasets.size}`);
+ console.error(`\nNext step: Run 'npm run index' to update vectors.`);
+ }
+ catch (error) {
+ console.error("\nERROR: Massive scraping failed:", error);
+ process.exit(1);
+ }
+ finally {
+ store.close();
+ }
+ }
+ main();
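
The script uses two write strategies: one large transaction for the HF batch, then per-row saves for Kaggle "to avoid long transactions". A chunked middle ground is possible with the same store methods the script already calls; this is a sketch of an alternative, not what the package ships:

// Hypothetical alternative: commit in fixed-size chunks, so a failure
// loses at most one chunk and no transaction spans a network call.
function saveInChunks(store, datasets, chunkSize = 500) {
  for (let i = 0; i < datasets.length; i += chunkSize) {
    store.beginTransaction();
    try {
      for (const ds of datasets.slice(i, i + chunkSize)) {
        store.saveDataset(ds);
      }
      store.commit();
    } catch (e) {
      store.rollback();
      throw e;
    }
  }
}
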
package/build/scripts/ops-dashboard.js
@@ -0,0 +1,33 @@
+ import { MetadataStore } from "../metadata/store.js";
+ import { JobManager } from "../jobs/manager.js";
+ import path from "path";
+ async function showDashboard() {
+ const dbPath = path.resolve("data", "vesper.db");
+ const store = new MetadataStore(dbPath);
+ // In a real app, this would be a shared instance in a long-running process
+ const manager = JobManager.getInstance(store);
+ const stats = manager.stats;
+ console.clear();
+ console.log("==========================================");
+ console.log(" VESPER OPERATIONS DASHBOARD ");
+ console.log("==========================================\n");
+ const prometheus = stats.getPrometheusMetrics();
+ const summary = stats.getStats();
+ if (Object.keys(summary).length === 0) {
+ console.log("No metrics recorded yet. Process some jobs to see data.");
+ }
+ else {
+ console.log("--- SYSTEM METRICS (JSON) ---");
+ console.log(JSON.stringify(summary, null, 2));
+ console.log("\n--- PROMETHEUS EXPORT (RAW) ---");
+ console.log(prometheus);
+ }
+ console.log("\n==========================================");
+ console.log(" Press Ctrl+C to exit dashboard loop ");
+ console.log("==========================================");
+ store.close();
+ }
+ // Simple loop for "real-time" feel (simulated)
+ console.log("Starting Dashboard...");
+ showDashboard();
+ setInterval(showDashboard, 5000);
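
The dashboard polls with a bare setInterval, so it only stops when the process is killed. A sketch of a cleaner shutdown (hypothetical; the render stand-in takes the place of showDashboard above):

// Hypothetical shutdown handling for a polling loop like the one above.
const render = () => console.log("refresh @", new Date().toISOString()); // stand-in for showDashboard
const timer = setInterval(render, 5000);
process.on("SIGINT", () => {
  clearInterval(timer); // stop polling before exiting
  process.exit(0);
});
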
package/build/scripts/scrape-metadata.js
@@ -0,0 +1,100 @@
+ import { HuggingFaceScraper } from "../metadata/scraper.js";
+ import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
+ import { MetadataStore } from "../metadata/store.js";
+ import path from "path";
+ async function main() {
+ const scraper = new HuggingFaceScraper();
+ const dbPath = path.join(process.cwd(), "data", "metadata.db");
+ const store = new MetadataStore(dbPath);
+ // Get limit from command line args or default to 100
+ let limit = 100;
+ if (process.argv[2] && !process.argv[2].startsWith("-")) {
+ limit = parseInt(process.argv[2], 10);
+ if (isNaN(limit))
+ limit = 100;
+ }
+ // Check if MVP filters should be disabled (--no-mvp flag)
+ const applyMVPFilters = !process.argv.includes("--no-mvp");
+ // Get domain filter if specified (--domain medicine, --domain healthcare, etc.)
+ const domainArgIndex = process.argv.indexOf("--domain");
+ const domainFilter = domainArgIndex !== -1 && process.argv[domainArgIndex + 1]
+ ? process.argv[domainArgIndex + 1]
+ : undefined;
+ try {
+ const datasets = [];
+ const domainMsg = domainFilter ? `, domain: ${domainFilter}` : "";
+ // 1. HuggingFace
+ const hfLimit = limit > 1000 ? limit : 1000; // Aim for at least 1k for "massive"
+ console.error(`Scraping HF: ${hfLimit} datasets with MVP filters: ${applyMVPFilters}${domainMsg}`);
+ const hfDatasets = await scraper.scrape(hfLimit, applyMVPFilters, domainFilter);
+ datasets.push(...hfDatasets);
+ // 2. Kaggle (Optional - only if credentials provided)
+ const kaggleUser = process.env.KAGGLE_USERNAME;
+ const kaggleKey = process.env.KAGGLE_KEY;
+ if (kaggleUser && kaggleKey) {
+ const kaggleLimit = Math.max(100, Math.floor(hfLimit / 4)); // Balance HF/Kaggle ratio
+ console.error(`\nScraping Kaggle: searching for "${domainFilter || 'all'}" (limit: ${kaggleLimit})`);
+ const kaggleScraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
+ // For massive scraping, we might want to iterate through multiple common search terms if no domainFilter is set
+ const searchTerms = domainFilter ? [domainFilter] : ["machine learning", "data science", "nlp", "computer vision", "healthcare"];
+ const termLimit = Math.floor(kaggleLimit / searchTerms.length);
+ for (const term of searchTerms) {
+ console.error(`[Kaggle] Fetching "${term}"...`);
+ const kaggleDatasets = await kaggleScraper.scrape(term, termLimit);
+ kaggleDatasets.forEach(d => {
+ d.id = `kaggle:${d.id}`;
+ if (!datasets.some(existing => existing.id === d.id)) {
+ datasets.push(d);
+ }
+ });
+ }
+ }
+ else {
+ console.error("\n[Kaggle] Skipping (KAGGLE_USERNAME/KAGGLE_KEY not set).");
+ }
+ console.error(`\nTotal Scraped: ${datasets.length} datasets.`);
+ let saved = 0;
+ store.beginTransaction(); // Performance optimization for large batch
+ try {
+ for (const ds of datasets) {
+ store.saveDataset(ds);
+ saved++;
+ if (saved % 500 === 0)
+ console.error(`[DB] Saved ${saved} records...`);
+ }
+ store.commit();
+ }
+ catch (e) {
+ store.rollback();
+ throw e;
+ }
+ console.error(`Successfully saved ${saved} datasets to MetadataStore.`);
+ console.error(`\nSummary:`);
+ console.error(` - Total examples: ${datasets.reduce((sum, d) => sum + d.total_examples, 0).toLocaleString()}`);
+ console.error(` - Safe sources: ${datasets.filter(d => d.is_safe_source).length}`);
+ console.error(` - Structured datasets: ${datasets.filter(d => d.is_structured).length}`);
+ console.error(` - Permissive licenses: ${datasets.filter(d => d.license.category === "safe").length}`);
+ // Show domain distribution
+ const domainCounts = new Map();
+ datasets.forEach(d => {
+ const domain = d.domain || "unknown";
+ domainCounts.set(domain, (domainCounts.get(domain) || 0) + 1);
+ });
+ if (domainCounts.size > 0) {
+ console.error(` - Domains:`);
+ Array.from(domainCounts.entries())
+ .sort((a, b) => b[1] - a[1])
+ .forEach(([domain, count]) => {
+ console.error(` ${domain}: ${count}`);
+ });
+ }
+ }
+ catch (error) {
+ console.error("Scraping failed:", error);
+ process.exit(1);
+ }
+ finally {
+ store.close();
+ }
+ }
+ main();
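
The CLI parsing above (positional limit, --no-mvp, --domain <name>) is spread through main. Collected into one helper with the same semantics it reads like this; a hypothetical refactor, not code from the package:

// Hypothetical consolidation of the inline argv handling above.
function parseArgs(argv = process.argv.slice(2)) {
  const positional = argv[0] && !argv[0].startsWith("-") ? parseInt(argv[0], 10) : NaN;
  const domainIdx = argv.indexOf("--domain");
  return {
    limit: Number.isNaN(positional) ? 100 : positional, // default limit: 100
    applyMVPFilters: !argv.includes("--no-mvp"),
    domainFilter: domainIdx !== -1 ? argv[domainIdx + 1] : undefined,
  };
}

console.log(parseArgs(["500", "--domain", "healthcare"]));
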
package/build/scripts/search-cli.js
@@ -0,0 +1,26 @@
+ import path from "path";
+ import { MetadataStore } from "../metadata/store.js";
+ import { VectorStore } from "../search/vector-store.js";
+ import { Embedder } from "../search/embedder.js";
+ import { SearchEngine } from "../search/engine.js";
+ const query = process.argv[2];
+ if (!query) {
+ console.error("Usage: npx tsx src/scripts/search-cli.ts \"your query\"");
+ process.exit(1);
+ }
+ const dbPath = path.join(process.cwd(), "data", "metadata.db");
+ const vectorPath = path.join(process.cwd(), "data", "vectors.json");
+ const metadataStore = new MetadataStore(dbPath);
+ const vectorStore = new VectorStore(vectorPath);
+ const embedder = Embedder.getInstance();
+ const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
+ async function run() {
+ console.error(`Searching for: "${query}"...`);
+ const results = await searchEngine.search(query, { limit: 5, safeOnly: true });
+ if (results.length === 0) {
+ console.log("No results found.");
+ return;
+ }
+ console.log(JSON.stringify(results, null, 2));
+ }
+ run().catch(console.error);
package/build/scripts/test-bias.js
@@ -0,0 +1,45 @@
+ import { QualityAnalyzer } from "../quality/analyzer.js";
+ import fs from "fs";
+ import path from "path";
+ async function main() {
+ console.log("=== Vesper Bias Analysis Test ===\n");
+ const analyzer = new QualityAnalyzer();
+ const testFile = path.join(process.cwd(), "test_bias.csv");
+ // Create a CSV with severe class imbalance
+ // 'label': 95% 'A', 5% 'B' -> Should trigger warning
+ // 'gender': 95% M, 5% F -> Should NOT trigger warning (see expectations below)
+ let csvContent = "id,gender,label\n";
+ for (let i = 0; i < 95; i++)
+ csvContent += `${i},M,A\n`;
+ for (let i = 95; i < 100; i++)
+ csvContent += `${i},F,B\n`;
+ fs.writeFileSync(testFile, csvContent);
+ console.log(`Created test file: ${testFile}`);
+ console.log("Expectations:");
+ console.log(" - Should warn about 'Severe imbalance' in 'label' column (95% A)");
+ console.log(" - Should NOT warn about 'gender' (skewed but just a test sample)");
+ try {
+ console.log("\nRunning analyzer...");
+ const report = await analyzer.analyze(testFile);
+ console.log("\n=== Bias Report ===");
+ if (report.class_imbalance_warnings && report.class_imbalance_warnings.length > 0) {
+ report.class_imbalance_warnings.forEach(w => console.log(`[!] ${w}`));
+ }
+ else {
+ console.log("No bias detected.");
+ }
+ const labelWarning = report.class_imbalance_warnings?.find(w => w.includes("'label'"));
+ if (labelWarning) {
+ console.log("\nTest passed! Detected imbalance.");
+ }
+ else {
+ console.error("\nTest failed! Did not detect imbalance.");
+ }
+ // Cleanup
+ fs.unlinkSync(testFile);
+ }
+ catch (error) {
+ console.error("\nTest failed:", error);
+ }
+ }
+ main().catch(console.error);
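
For reference, the kind of rule this test exercises can be stated in a few lines. This is an assumption about the analyzer's behavior (the real check presumably lives behind QualityAnalyzer in src/python/quality_engine.py, and its threshold is not shown in this diff):

// Hypothetical imbalance check: flag a column when one class exceeds
// a dominance threshold.
function imbalanceWarning(column, values, threshold = 0.9) {
  const counts = new Map();
  for (const v of values) counts.set(v, (counts.get(v) || 0) + 1);
  const share = Math.max(...counts.values()) / values.length;
  return share > threshold
    ? `Severe imbalance in '${column}': dominant class is ${(share * 100).toFixed(0)}%`
    : null;
}

const labels = Array(95).fill("A").concat(Array(5).fill("B"));
console.log(imbalanceWarning("label", labels)); // dominant class is 95%
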
package/build/scripts/test-caching.js
@@ -0,0 +1,51 @@
+ import { MockRedisProvider, CacheService } from "../cache/service.js";
+ import { CDNService } from "../cache/cdn.js";
+ import { CleaningPlanner } from "../cleaning/planner.js";
+ async function runTest() {
+ console.log("--- Initializing Caching Layer Test ---");
+ const cacheProvider = new MockRedisProvider();
+ const cache = new CacheService(cacheProvider);
+ const planner = new CleaningPlanner(cache);
+ const cdn = new CDNService("data/cdn_mock", "https://cdn.vesper.ai");
+ const dsId = "test/cache-dataset";
+ const mockReport = {
+ row_count: 1000,
+ column_count: 2,
+ duplicate_rows: 5,
+ duplicate_percentage: 0.5,
+ columns: [
+ { name: "col1", type: "Utf8", inferred_type: "String", missing_count: 0, missing_percentage: 0, unique_count: 1000, is_constant: false, is_mixed_type: false }
+ ],
+ warnings: [],
+ schema_warnings: [],
+ overall_score: 80
+ };
+ // 1. Test Planning Cache
+ console.log("\n--- Testing Cleaning Plan Cache ---");
+ console.log("First Run (Cache Miss)...");
+ const start1 = Date.now();
+ await planner.generatePlan(dsId, mockReport);
+ console.log(`First run took ${Date.now() - start1}ms`);
+ console.log("Second Run (Cache Hit)...");
+ const start2 = Date.now();
+ await planner.generatePlan(dsId, mockReport);
+ console.log(`Second run took ${Date.now() - start2}ms (Expected to be near 0ms)`);
+ // 2. Test TTL / Hash mismatch
+ console.log("\n--- Testing Cache Sensitivity (Config Change) ---");
+ console.log("Running with a ruleSet (New Cache Key)...");
+ const start3 = Date.now();
+ await planner.generatePlan(dsId, mockReport, { id: "rules-1", name: "Custom", rules: [] });
+ console.log(`Config change run took ${Date.now() - start3}ms (Cache Miss)`);
+ // 3. Test CDN Upload
+ console.log("\n--- Testing CDN Mock ---");
+ const reportContent = JSON.stringify(mockReport, null, 2);
+ const url = await cdn.upload("report_latest.json", reportContent);
+ console.log(`Report uploaded to CDN: ${url}`);
+ if (url.startsWith("https://cdn.vesper.ai")) {
+ console.log("\n✅ Success: Caching and CDN layers verified.");
+ }
+ else {
+ console.error("\n❌ Failure: CDN URL generation mismatch.");
+ }
+ }
+ runTest().catch(console.error);
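
The test relies on the plan cache keying off both the quality report and the rule set, which is why passing a ruleSet forces a fresh miss. One plausible key derivation is sketched below; this is an assumption, since CleaningPlanner's actual hashing in build/cleaning/planner.js is not shown in this excerpt:

import { createHash } from "crypto";

// Hypothetical cache-key derivation: any change to the dataset id,
// report, or rule set produces a different key, matching the
// miss/hit/miss pattern the test expects.
function planCacheKey(datasetId, report, ruleSet) {
  const payload = JSON.stringify({ datasetId, report, ruleSet: ruleSet ?? null });
  return "plan:" + createHash("sha256").update(payload).digest("hex");
}

console.log(planCacheKey("test/cache-dataset", { overall_score: 80 }));
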
package/build/scripts/test-cleaning.js
@@ -0,0 +1,76 @@
+ import { DataCleaner } from "../cleaning/cleaner.js";
+ import fs from "fs";
+ import path from "path";
+ async function main() {
+ console.log("=== Vesper Auto-Cleaning Pipeline Test ===\n");
+ const cleaner = new DataCleaner();
+ const testFile = path.join(process.cwd(), "test_cleaning.csv");
+ // Create Dirty Data
+ // - id: Duplicate rows (1)
+ // - age: Missing values, Outlier (200), Wrong Type ("30" stored as a string)
+ // - score: Good float
+ // - unnecessary: Column to drop
+ const csvContent = `id,age,score,unnecessary
+ 1,25,88.5,trash
+ 2,,92.0,trash
+ 3,200,15.0,trash
+ 1,25,88.5,trash
+ 4,"30",80.0,trash`;
+ fs.writeFileSync(testFile, csvContent);
+ console.log(`Created dirty file: ${testFile}`);
+ // Define Cleaning Plan
+ const ops = [
+ {
+ type: "DropColumns",
+ params: { columns: ["unnecessary"] },
+ reason: "Not useful"
+ },
+ {
+ type: "RemoveDuplicates",
+ params: {},
+ reason: "Duplicate rows"
+ },
+ {
+ type: "FillMissing",
+ params: { column: "age", method: "constant", value: 0 },
+ reason: "Impute missing age"
+ },
+ // Note: Polars can't easily fix "thirty" to 30 automatically without specific logic,
+ // so we just cast what we can. "30" string -> 30 int works.
+ {
+ type: "FixTypes",
+ params: { column: "age", type: "float" },
+ reason: "Convert age to number"
+ },
+ {
+ type: "RemoveOutliers",
+ params: { column: "age", method: "iqr", threshold: 1.5 },
+ reason: "Remove age > 100"
+ }
+ ];
+ try {
+ console.log("Executing cleaning plan...");
+ const result = await cleaner.clean(testFile, ops);
+ console.log("\n=== Cleaning Result ===");
+ console.log(`Success: ${result.success}`);
+ console.log(`Output: ${result.output_path || "None"}`);
+ console.log(`Rows Affected: ${result.rows_affected}`);
+ console.log("\nLogs:");
+ result.logs.forEach(l => console.log(` - ${l}`));
+ // Validate Output File Exists
+ if (fs.existsSync(result.output_path)) {
+ console.log("\nCleaned file created successfully.");
+ // Cleanup
+ fs.unlinkSync(testFile);
+ fs.unlinkSync(result.output_path);
+ console.log("Test passed!");
+ }
+ else {
+ console.error("\nTest failed! Output file missing.");
+ }
+ }
+ catch (error) {
+ console.error("\nTest failed:", error);
+ }
+ }
+ main().catch(console.error);
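
The RemoveOutliers step names the IQR method with threshold 1.5. As a reminder of what that fence means on this file's age column, here is a sketch using nearest-rank quantiles; the actual computation happens in the Python cleaner and may interpolate differently:

// Hypothetical illustration of 1.5 * IQR fences (nearest-rank quantiles).
function iqrFences(values, k = 1.5) {
  const sorted = [...values].sort((a, b) => a - b);
  const q = (p) => sorted[Math.floor(p * (sorted.length - 1))];
  const q1 = q(0.25), q3 = q(0.75), iqr = q3 - q1;
  return { lower: q1 - k * iqr, upper: q3 + k * iqr };
}

// Ages after imputing the missing value with 0: [25, 0, 200, 25, 30]
console.log(iqrFences([25, 0, 200, 25, 30])); // { lower: 17.5, upper: 37.5 }; both 0 and 200 fall outside
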
package/build/scripts/test-cloud-storage.js
@@ -0,0 +1,48 @@
+ import { StorageManager } from "../cloud/storage-manager.js";
+ import fs from "fs";
+ import path from "path";
+ async function main() {
+ console.log("=== Vesper Cloud Storage Test ===\n");
+ const vaultDir = path.join(process.cwd(), "test_vault");
+ const adapter = StorageManager.createAdapter({
+ type: "local",
+ options: { basePath: vaultDir }
+ });
+ const testFile = "test_upload.txt";
+ fs.writeFileSync(testFile, "Hello Cloud Storage!");
+ try {
+ console.log("Testing upload...");
+ const remotePath = "datasets/v1/test.txt";
+ const url = await adapter.upload(testFile, remotePath);
+ console.log("Upload URL:", url);
+ const expectedFile = path.join(vaultDir, remotePath);
+ if (fs.existsSync(expectedFile)) {
+ console.log("PASS: File successfully uploaded to local vault.");
+ const content = fs.readFileSync(expectedFile, "utf8");
+ if (content === "Hello Cloud Storage!") {
+ console.log("PASS: Content integrity verified.");
+ }
+ }
+ else {
+ console.error("FAIL: Uploaded file not found in vault.");
+ }
+ console.log("\nTesting signed URL...");
+ const signedUrl = await adapter.getSignedUrl(remotePath);
+ console.log("Signed URL:", signedUrl);
+ console.log("\nTesting deletion...");
+ await adapter.delete(remotePath);
+ if (!fs.existsSync(expectedFile)) {
+ console.log("PASS: File successfully deleted from vault.");
+ }
+ }
+ catch (e) {
+ console.error("FAIL: Cloud storage test failed:", e);
+ }
+ finally {
+ if (fs.existsSync(testFile))
+ fs.unlinkSync(testFile);
+ if (fs.existsSync(vaultDir))
+ fs.rmSync(vaultDir, { recursive: true, force: true });
+ }
+ }
+ main().catch(console.error);
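
One hardening note on the local adapter pattern under test: remote paths like datasets/v1/test.txt are joined onto the vault directory, so a hostile path containing ".." could escape it. A guard sketch follows (hypothetical; whether build/cloud/adapters/local.js already does this is not visible in this excerpt):

import path from "path";

// Hypothetical traversal guard for a local "vault" storage adapter.
function resolveInVault(vaultDir, remotePath) {
  const root = path.resolve(vaultDir);
  const full = path.resolve(root, remotePath);
  if (full !== root && !full.startsWith(root + path.sep)) {
    throw new Error(`Refusing path outside vault: ${remotePath}`);
  }
  return full;
}

console.log(resolveInVault("test_vault", "datasets/v1/test.txt")); // resolves inside the vault
// resolveInVault("test_vault", "../../etc/passwd") would throw
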