@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
@@ -0,0 +1,86 @@
1
import { QualityOrchestrator } from "../quality/quality-orchestrator.js";
import path from "path";
import fs from "fs";
import { execSync } from "child_process";
/**
 * End-to-end smoke test for the unified quality report.
 *
 * Builds a small multimodal dataset under data/test-unified (CSV always;
 * PNG and WAV only when the local `python` has numpy/PIL/soundfile), runs
 * QualityOrchestrator.generateReport over it, prints the report, and emits a
 * VERIFICATION_STATUS line: PASS when text plus at least one media modality
 * are detected with a positive overall score, PARTIAL otherwise, FAIL on error.
 */
async function runTest() {
    console.log("--- Testing Unified Quality Report ---");
    const projectRoot = path.resolve(".");
    const orchestrator = new QualityOrchestrator(projectRoot);
    // Create a mixed dataset directory
    const testDataDir = path.join(projectRoot, "data", "test-unified");
    if (!fs.existsSync(testDataDir))
        fs.mkdirSync(testDataDir, { recursive: true });
    // 1. Create a CSV file (text modality)
    const csvPath = path.join(testDataDir, "data.csv");
    fs.writeFileSync(csvPath, "id,name,value\\n1,Alice,10\\n2,Bob,20\\n3,Charlie,30\\n");
    // 2. Create a test image (using Python)
    // The path is interpolated into Python source, so Windows backslashes
    // must be doubled to survive the Python string literal.
    const imagePath = path.join(testDataDir, "test_image.png");
    const pythonScript = `
import numpy as np
from PIL import Image
img = Image.fromarray(np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8))
img.save('${imagePath.replace(/\\/g, "\\\\")}')
`;
    fs.writeFileSync(path.join(testDataDir, "gen_image.py"), pythonScript);
    try {
        execSync(`python "${path.join(testDataDir, "gen_image.py")}"`);
    }
    catch (e) {
        // Best-effort: missing python/PIL just drops the image modality.
        console.log("⚠️ Could not generate image. Skipping image modality.");
    }
    // 3. Create a test audio file (using Python): one second of a 440 Hz sine.
    const audioPath = path.join(testDataDir, "test_audio.wav");
    const audioScript = `
import numpy as np
import soundfile as sf
sample_rate = 44100
duration = 1.0
frequency = 440.0
t = np.linspace(0, duration, int(sample_rate * duration))
audio = 0.5 * np.sin(2 * np.pi * frequency * t)
sf.write('${audioPath.replace(/\\/g, "\\\\")}', audio, sample_rate)
`;
    fs.writeFileSync(path.join(testDataDir, "gen_audio.py"), audioScript);
    try {
        execSync(`python "${path.join(testDataDir, "gen_audio.py")}"`);
    }
    catch (e) {
        // Best-effort: missing python/soundfile just drops the audio modality.
        console.log("⚠️ Could not generate audio. Skipping audio modality.");
    }
    // 4. Run Unified Quality Analysis
    console.log(`\\nAnalyzing multimodal dataset at ${testDataDir}...`);
    try {
        const report = await orchestrator.generateReport("test-unified-dataset", testDataDir, null);
        console.log("\\n📊 Unified Quality Report:");
        console.log(`- Dataset ID: ${report.dataset_id}`);
        console.log(`- Modalities: ${report.modalities.join(", ")}`);
        console.log(`- Overall Quality Score: ${report.overall_quality_score}/100`);
        // Per-modality sections are optional; only print what was produced.
        if (report.image_quality) {
            console.log(`\\n🖼️ Image Quality:`);
            console.log(` - Total Images: ${report.image_quality.total_images}`);
            console.log(` - Avg Resolution: ${report.image_quality.avg_resolution}`);
        }
        if (report.audio_quality) {
            console.log(`\\n🎵 Audio Quality:`);
            console.log(` - Total Files: ${report.audio_quality.total_files}`);
            console.log(` - Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s`);
        }
        console.log(`\\n💡 Recommendations:`);
        report.recommendations.forEach(rec => console.log(` - ${rec}`));
        // Verify modalities detected: text is mandatory, plus at least one of
        // image/audio (either may have been skipped above).
        const hasText = report.modalities.includes("text");
        const hasImage = report.modalities.includes("image");
        const hasAudio = report.modalities.includes("audio");
        if (hasText && (hasImage || hasAudio) && report.overall_quality_score > 0) {
            console.log("\\nVERIFICATION_STATUS: ✅ PASS");
        }
        else {
            console.log("\\nVERIFICATION_STATUS: ⚠️ PARTIAL (Some modalities missing)");
        }
    }
    catch (e) {
        console.error(`Analysis failed: ${e.message}`);
        console.log("\\nVERIFICATION_STATUS: ❌ FAIL");
    }
}
runTest().catch(console.error);
@@ -0,0 +1,34 @@
1
import { pipeline, env } from "@xenova/transformers";
// Disable local model check to ensure it downloads if not found
env.allowLocalModels = false;
env.useBrowserCache = false;
/**
 * Lazy-loading singleton around a @xenova/transformers feature-extraction
 * pipeline. The model is downloaded/loaded on first use, not at import time.
 */
export class Embedder {
    static instance;
    extractor = null;
    // In-flight model load, memoized so concurrent callers share one load.
    initPromise = null;
    modelName = "Xenova/paraphrase-multilingual-MiniLM-L12-v2";
    constructor() { }
    /** @returns {Embedder} the process-wide shared instance. */
    static getInstance() {
        if (!Embedder.instance) {
            Embedder.instance = new Embedder();
        }
        return Embedder.instance;
    }
    /**
     * Loads the feature-extraction pipeline exactly once.
     *
     * Fix: the original awaited pipeline() directly, so two embed() calls
     * arriving before the first load finished each started their own model
     * load. The load is now memoized in initPromise and every concurrent
     * caller awaits the same promise.
     */
    async init() {
        if (this.extractor)
            return;
        if (!this.initPromise) {
            this.initPromise = (async () => {
                console.error(`Loading embedding model: ${this.modelName}...`);
                this.extractor = await pipeline("feature-extraction", this.modelName);
                console.error("Embedding model loaded successfully.");
            })();
        }
        await this.initPromise;
    }
    /**
     * Embeds `text` as a mean-pooled, normalized vector.
     * @param {string} text
     * @returns {Promise<Float32Array>} the embedding
     */
    async embed(text) {
        if (!this.extractor) {
            await this.init();
        }
        const result = await this.extractor(text, {
            pooling: "mean",
            normalize: true,
        });
        // result.data is already a Float32Array in Xenova/transformers
        return result.data;
    }
}
@@ -0,0 +1,129 @@
1
+ import { JITOrchestrator } from "./jit-orchestrator.js";
2
+ import fs from "fs";
3
// Append one ISO-timestamped line to debug.log in the current working directory.
function log(msg) {
    const line = `${new Date().toISOString()} ${msg}\n`;
    fs.appendFileSync("debug.log", line);
}
6
/**
 * Hybrid dataset search engine: combines vector similarity with lexical
 * keyword matching and heuristic penalties, and falls back to a Just-In-Time
 * live scrape (JITOrchestrator) when local results look weak.
 */
export class SearchEngine {
    metadataStore;   // dataset metadata lookup (project store)
    vectorStore;     // embedding index (project store)
    embedder;        // text -> vector (see Embedder)
    jitOrchestrator; // live-source fallback, built from the three above
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
        this.jitOrchestrator = new JITOrchestrator(metadataStore, vectorStore, embedder);
    }
    /**
     * Top-level search. Runs a local hybrid search; if results are sparse or
     * low-confidence (see shouldTriggerJIT) and JIT is enabled, scrapes live
     * sources and re-runs the local search against the updated index.
     * @param {string} query - free text; words prefixed "-" are exclusions
     * @param {object} [options] - { limit?, safeOnly?, enableJIT? }
     * @returns {Promise<object[]>} ranked dataset metadata records
     */
    async search(query, options = {}) {
        // NOTE(review): `limit` here is only used for logging; localSearch
        // recomputes it from options itself.
        const limit = options.limit || 5;
        const enableJIT = options.enableJIT !== false; // Default: true
        log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
        // 1. Perform local search
        const localResults = await this.localSearch(query, options);
        // 2. Check if JIT should be triggered
        const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
        if (!shouldTrigger) {
            log(`JIT not triggered. Returning ${localResults.length} local results`);
            return localResults;
        }
        // 3. Trigger JIT fallback
        console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
        await this.jitOrchestrator.fetchAndIngest(query, 10);
        // 4. Re-run local search with updated index
        console.error(`Re-searching with updated library...`);
        const enhancedResults = await this.localSearch(query, options);
        // NOTE(review): both result lists are already capped at `limit`, so
        // this difference understates how many new datasets were ingested.
        const newCount = enhancedResults.length - localResults.length;
        if (newCount > 0) {
            console.error(`Found ${newCount} additional results\n`);
        }
        return enhancedResults;
    }
    /**
     * Perform hybrid search (Vector + Lexical + Penalties)
     *
     * Scoring: 0.7 * cosine-similarity + 0.3 * keyword-hit-ratio, minus
     * penalties for (a) crypto results on non-crypto finance queries (-0.4)
     * and (b) zero keyword overlap on multi-word queries (-0.2).
     * @returns {Promise<object[]>} top `options.limit` (default 5) records,
     *   each annotated with relevance_score / vector_score / lexical_score
     */
    async localSearch(query, options) {
        const limit = options.limit || 5;
        // 1. Parse Query: "-word" terms are exclusions; positive terms must be
        // longer than 2 chars to skip stop-word noise.
        const words = query.toLowerCase().split(/\s+/);
        const positiveKeywords = words.filter(w => !w.startsWith("-") && w.length > 2);
        const negativeKeywords = words.filter(w => w.startsWith("-")).map(w => w.slice(1));
        // Automatic Penalty Detection: Is this a "Finance" query without "Crypto" mentioned?
        const financeTerms = ["financial", "finance", "banking", "economy", "stock", "loan", "forecasting", "bank"];
        const isFinanceQuery = positiveKeywords.some(w => financeTerms.includes(w));
        const mentionsCrypto = query.toLowerCase().includes("crypto") || query.toLowerCase().includes("bitcoin");
        // 2. Get query vector
        const queryVector = await this.embedder.embed(query);
        log(`Vector generated, length=${queryVector.length}`);
        // 3. Search in vector store (fetch more candidates for reranking)
        const matches = this.vectorStore.search(queryVector, 40);
        log(`Vector search found ${matches.length} matches`);
        // 4. Score and filter candidates
        const results = [];
        for (const match of matches) {
            const metadata = this.metadataStore.getDataset(match.id);
            if (!metadata)
                continue;
            // Filter: Safe only
            if (options.safeOnly && metadata.license.category === "restricted")
                continue;
            // Lexical matching runs over name + description + tags.
            const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
            // Filter: Explicit Negative Keywords
            if (negativeKeywords.some(neg => text.includes(neg))) {
                log(`Negative penalty: Dropped ${match.id} due to keyword match`);
                continue;
            }
            // A. Vector Score (0.0 to 1.0)
            const vectorScore = match.score;
            // B. Lexical Score (Keyword Match): fraction of query words present.
            let lexicalScore = 0;
            if (positiveKeywords.length > 0) {
                const matchesCount = positiveKeywords.filter(kw => text.includes(kw)).length;
                lexicalScore = matchesCount / positiveKeywords.length;
            }
            // C. Penalties
            let penalty = 0;
            // Penalty: Domain Drift (Finance vs Crypto)
            // If user asks for finance but NOT crypto, and the result is crypto, penalize heavily
            if (isFinanceQuery && !mentionsCrypto) {
                const isCryptoResult = text.includes("crypto") || text.includes("bitcoin") || text.includes("ethereum") || text.includes("blockchain");
                if (isCryptoResult) {
                    log(`Penalty applied: Crypto result in Finance query for ${match.id}`);
                    penalty += 0.4; // Massive penalty
                }
            }
            // Penalty: Weak Lexical Match
            // If it's a "semantic vibe" match but has ZERO matching keywords from the query
            if (lexicalScore === 0 && positiveKeywords.length > 1) {
                penalty += 0.2;
            }
            // Final Combined Score
            // 70% Vector, 30% Lexical, minus Penalties
            const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty;
            // Scores are rounded to 2 decimals and attached to the metadata
            // record itself for downstream display.
            metadata.relevance_score = Math.round(finalScore * 100) / 100;
            metadata.vector_score = Math.round(vectorScore * 100) / 100;
            metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
            results.push(metadata);
        }
        // Sort by final score and limit
        return results
            .sort((a, b) => b.relevance_score - a.relevance_score)
            .slice(0, limit);
    }
    /**
     * Determine if JIT should be triggered
     * @param {object[]} results - local search results, best first
     * @param {string} query - unused; kept for call-site compatibility
     * @returns {boolean}
     */
    shouldTriggerJIT(results, query) {
        // Condition 1: Very few results
        if (results.length < 3) {
            log(`JIT trigger: Only ${results.length} results found`);
            return true;
        }
        // Condition 2: Low confidence (top result has low similarity)
        const topScore = results[0]?.relevance_score || 0;
        if (topScore < 0.60) {
            log(`JIT trigger: Low confidence (top score: ${topScore})`);
            return true;
        }
        return false;
    }
}
@@ -0,0 +1,232 @@
1
+ import { HuggingFaceScraper } from "../metadata/scraper.js";
2
+ import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
3
+ import { UCIScraper } from "../metadata/uci-scraper.js";
4
+ import { GitHubScraper } from "../metadata/github-scraper.js";
5
+ import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
6
/**
 * Just-In-Time Orchestrator
 * Automatically fetches and indexes new datasets when local search is insufficient
 */
export class JITOrchestrator {
    metadataStore; // dataset metadata persistence (project store)
    vectorStore;   // embedding index (project store)
    embedder;      // text -> vector
    // query -> epoch ms of its last JIT trigger, for per-query rate limiting
    lastTriggerTime = new Map();
    RATE_LIMIT_MS = 30000; // 30 seconds between triggers for same query
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
    }
    /**
     * Check if JIT should be triggered based on rate limiting
     * @param {string} query
     * @returns {boolean} true when the query never triggered, or more than
     *   RATE_LIMIT_MS has elapsed since its last trigger
     */
    canTrigger(query) {
        const lastTrigger = this.lastTriggerTime.get(query);
        if (!lastTrigger)
            return true;
        const elapsed = Date.now() - lastTrigger;
        return elapsed > this.RATE_LIMIT_MS;
    }
    /**
     * Append scraped datasets to `newDatasets`, skipping IDs already in
     * `existingIds`, and record every accepted ID. When `idPrefix` is given
     * it is applied to every result's id first (mutating the record), which
     * matches the original inline Kaggle behavior of namespacing ids.
     *
     * Extracted from fetchAndIngest, where this loop was copy-pasted six times.
     * @param {object[]} results
     * @param {object[]} newDatasets - output accumulator (mutated)
     * @param {Set<string>} existingIds - dedupe set (mutated)
     * @param {string} [idPrefix] - optional id namespace, e.g. "kaggle:"
     */
    collectNew(results, newDatasets, existingIds, idPrefix = "") {
        for (const ds of results) {
            if (idPrefix) {
                ds.id = `${idPrefix}${ds.id}`;
            }
            if (!existingIds.has(ds.id)) {
                newDatasets.push(ds);
                existingIds.add(ds.id);
            }
        }
    }
    /**
     * Main JIT workflow: fetch, save, index, return new datasets
     * @param {string} query - free-text search query
     * @param {number} [limit=10] - HuggingFace budget; other sources get limit/2
     * @returns {Promise<object[]>} newly ingested datasets ([] on rate limit or error)
     */
    async fetchAndIngest(query, limit = 10) {
        // Rate limiting check
        if (!this.canTrigger(query)) {
            console.error(`[JIT] Rate limit: Query "${query}" triggered too recently`);
            return [];
        }
        console.error(`\n[JIT] Searching live sources for: "${query}"`);
        this.lastTriggerTime.set(query, Date.now());
        const newDatasets = [];
        const existingIds = new Set();
        try {
            // Get existing dataset IDs to avoid duplicates
            const existing = this.metadataStore.getAllDatasets();
            existing.forEach(ds => existingIds.add(ds.id));
            // 1. Scrape HuggingFace
            const hfResults = await this.scrapeHuggingFace(query, limit);
            console.error(` HuggingFace: Found ${hfResults.length} datasets`);
            this.collectNew(hfResults, newDatasets, existingIds);
            // 2. Scrape Kaggle (if credentials available)
            if (process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY) {
                const kaggleResults = await this.scrapeKaggle(query, Math.floor(limit / 2));
                console.error(` Kaggle: Found ${kaggleResults.length} datasets`);
                this.collectNew(kaggleResults, newDatasets, existingIds, "kaggle:");
            }
            // 3. Scrape UCI
            const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
            console.error(` UCI: Found ${uciResults.length} datasets`);
            this.collectNew(uciResults, newDatasets, existingIds);
            // 4. Scrape GitHub
            const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
            console.error(` GitHub: Found ${githubResults.length} datasets`);
            this.collectNew(githubResults, newDatasets, existingIds);
            // 5. Scrape World Bank
            const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
            console.error(` World Bank: Found ${wbResults.length} datasets`);
            this.collectNew(wbResults, newDatasets, existingIds);
            // 6. Scrape NASA
            const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
            console.error(` NASA: Found ${nasaResults.length} datasets`);
            this.collectNew(nasaResults, newDatasets, existingIds);
            // 7. Save and index new datasets (was mislabeled "3." upstream)
            if (newDatasets.length > 0) {
                await this.saveAndIndex(newDatasets);
                console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
            }
            else {
                console.error(` [JIT] No new datasets found`);
            }
            return newDatasets;
        }
        catch (error) {
            console.error(`ERROR [JIT] Error during fetch and ingest:`, error.message);
            return [];
        }
    }
    /**
     * Scrape HuggingFace with free-text search
     * @returns {Promise<object[]>} [] on failure
     */
    async scrapeHuggingFace(query, limit) {
        const scraper = new HuggingFaceScraper();
        try {
            // Use the query as a domain filter for now
            // In the future, we can add a freeTextSearch parameter to the scraper
            return await scraper.scrape(limit, true, query);
        }
        catch (error) {
            console.error(` ERROR: HuggingFace scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape Kaggle with search query
     * Requires KAGGLE_USERNAME / KAGGLE_KEY; returns [] without them.
     * @returns {Promise<object[]>} [] on failure
     */
    async scrapeKaggle(query, limit) {
        const kaggleUser = process.env.KAGGLE_USERNAME;
        const kaggleKey = process.env.KAGGLE_KEY;
        if (!kaggleUser || !kaggleKey) {
            return [];
        }
        try {
            const scraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: Kaggle scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape UCI
     * @returns {Promise<object[]>} [] on failure
     */
    async scrapeUCI(query, limit) {
        const scraper = new UCIScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: UCI scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape GitHub
     * @returns {Promise<object[]>} [] on failure
     */
    async scrapeGitHub(query, limit) {
        const scraper = new GitHubScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: GitHub scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape World Bank
     * @returns {Promise<object[]>} [] on failure
     */
    async scrapeWorldBank(query, limit) {
        const scraper = new WorldBankScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: World Bank scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape NASA
     * @returns {Promise<object[]>} [] on failure
     */
    async scrapeNASA(query, limit) {
        const scraper = new NASAScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: NASA scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Save datasets to DB and generate embeddings
     * Metadata rows are written in one transaction (rolled back on failure);
     * embeddings are then generated and the vector store persisted.
     * @param {object[]} datasets
     */
    async saveAndIndex(datasets) {
        // 1. Save to database
        this.metadataStore.beginTransaction();
        try {
            for (const ds of datasets) {
                this.metadataStore.saveDataset(ds);
            }
            this.metadataStore.commit();
        }
        catch (e) {
            this.metadataStore.rollback();
            throw e;
        }
        // 2. Generate embeddings and update vector store
        for (const ds of datasets) {
            const text = `${ds.name} ${ds.description} ${ds.tags.join(" ")}`;
            const vector = await this.embedder.embed(text);
            this.vectorStore.add(ds.id, vector);
        }
        // 3. Persist vector store to disk
        this.vectorStore.save();
    }
}
@@ -0,0 +1,105 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
/**
 * Flat-file vector index. Metadata (ids, dimension) lives in a JSON file;
 * the raw float32 embedding bytes live in a sibling ".bin" file. Search is
 * brute-force cosine similarity over all stored vectors.
 */
export class VectorStore {
    // dataset id -> Float32Array embedding
    idToVector = new Map();
    filePath; // JSON metadata path
    binPath;  // binary vector path, derived from filePath
    /**
     * @param {string} filePath - JSON metadata path, conventionally "*.json"
     */
    constructor(filePath) {
        this.filePath = filePath;
        // Fix: the original used filePath.replace(".json", ".bin"), which
        // rewrites the FIRST ".json" anywhere in the path (wrong for e.g.
        // "my.json.store/v.json") and silently aliases binPath to filePath
        // when the extension is absent. Only the suffix is swapped now, with
        // ".bin" appended otherwise so the two paths never collide.
        this.binPath = filePath.endsWith(".json")
            ? `${filePath.slice(0, -".json".length)}.bin`
            : `${filePath}.bin`;
        this.load();
    }
    /**
     * Loads vectors from disk. Prefers the binary format (metadata + .bin);
     * falls back to migrating a legacy all-JSON array, converting it to
     * binary immediately. Missing files leave the store empty.
     */
    load() {
        if (fs.existsSync(this.filePath) && fs.existsSync(this.binPath)) {
            try {
                const metadata = JSON.parse(fs.readFileSync(this.filePath, "utf-8"));
                const buffer = fs.readFileSync(this.binPath);
                const ids = metadata.ids;
                const dim = metadata.dimension;
                for (let i = 0; i < ids.length; i++) {
                    const start = i * dim * 4; // 4 bytes per float32
                    const vector = new Float32Array(buffer.buffer, buffer.byteOffset + start, dim);
                    this.idToVector.set(ids[i], new Float32Array(vector)); // Copy to ensure independence
                }
                console.error(`Loaded ${this.idToVector.size} vectors from binary store`);
            }
            catch (err) {
                console.error("Failed to load vector store:", err);
                this.idToVector.clear();
            }
        }
        else if (fs.existsSync(this.filePath)) {
            // Migration from old JSON format
            try {
                const data = JSON.parse(fs.readFileSync(this.filePath, "utf-8"));
                if (Array.isArray(data)) {
                    for (const entry of data) {
                        this.idToVector.set(entry.id, new Float32Array(entry.vector));
                    }
                    console.error(`Migrated ${this.idToVector.size} vectors from legacy JSON`);
                    this.save(); // Convert to binary immediately
                }
            }
            catch (err) {
                console.error("Migration failed:", err);
            }
        }
    }
    /**
     * Persists the store: packed float32 bytes to binPath, {ids, dimension,
     * count, updatedAt} to filePath. No-op when empty (existing files kept).
     * All vectors are assumed to share the first vector's dimension.
     */
    save() {
        const dir = path.dirname(this.filePath);
        if (!fs.existsSync(dir)) {
            fs.mkdirSync(dir, { recursive: true });
        }
        const ids = Array.from(this.idToVector.keys());
        if (ids.length === 0)
            return;
        const dim = this.idToVector.get(ids[0]).length;
        // Fix: the original wrote each float with writeFloatLE while load()
        // reads via a Float32Array view (native byte order) — inconsistent on
        // big-endian platforms. Packing into one Float32Array and dumping its
        // backing bytes makes write and read use the same byte order, and is
        // identical on little-endian machines.
        const packed = new Float32Array(ids.length * dim);
        for (let i = 0; i < ids.length; i++) {
            packed.set(this.idToVector.get(ids[i]), i * dim);
        }
        fs.writeFileSync(this.binPath, Buffer.from(packed.buffer));
        fs.writeFileSync(this.filePath, JSON.stringify({
            ids,
            dimension: dim,
            count: ids.length,
            updatedAt: new Date().toISOString()
        }, null, 2));
        console.error(`Saved ${ids.length} vectors to binary store (${this.binPath})`);
    }
    /**
     * Adds or replaces a vector (converted to Float32Array if needed).
     * In-memory only; call save() to persist.
     */
    add(id, vector) {
        this.idToVector.set(id, vector instanceof Float32Array ? vector : new Float32Array(vector));
    }
    /**
     * Brute-force cosine-similarity search over every stored vector.
     * @param {Float32Array|number[]} queryVector
     * @param {number} [limit=10]
     * @returns {{id: string, score: number}[]} best matches, highest first
     */
    search(queryVector, limit = 10) {
        const q = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);
        const results = [];
        for (const [id, v] of this.idToVector.entries()) {
            results.push({
                id,
                score: this.cosineSimilarity(q, v)
            });
        }
        return results
            .sort((a, b) => b.score - a.score)
            .slice(0, limit);
    }
    /**
     * Cosine similarity of two equal-length vectors; 0 when either has zero
     * magnitude (avoids division by zero).
     */
    cosineSimilarity(v1, v2) {
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < v1.length; i++) {
            dotProduct += v1[i] * v2[i];
            normA += v1[i] * v1[i];
            normB += v2[i] * v2[i];
        }
        const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
        return magnitude === 0 ? 0 : dotProduct / magnitude;
    }
    /** @returns {string[]} all stored dataset ids. */
    getAllIds() {
        return Array.from(this.idToVector.keys());
    }
}
@@ -0,0 +1,57 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
/**
 * Thin wrapper that shells out to src/python/splitter_engine.py and parses
 * its JSON stdout. The Python side owns all splitting/validation logic.
 */
export class DataSplitter {
    pythonPath = "python"; // interpreter; overridable for testing/venvs
    scriptPath;            // absolute path to splitter_engine.py
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "splitter_engine.py");
    }
    /**
     * Splits a dataset into Train/Val/Test sets based on config
     * @param {string} filePath - dataset file to split
     * @param {object} config - split configuration, serialized as JSON argv
     * @returns {Promise<object>} parsed engine output
     */
    async split(filePath, config) {
        return this.runPython("split", [filePath, JSON.stringify(config)]);
    }
    /**
     * Validates a split for leakage and distribution
     * @param {object} paths - split-file paths handed to the engine
     * @param {object} [options] - { id_column?, target_column? }
     * @returns {Promise<object>} parsed engine output
     */
    async validate(paths, options) {
        const config = {
            paths,
            id_column: options?.id_column,
            target_column: options?.target_column
        };
        return this.runPython("validate", [JSON.stringify(config)]);
    }
    /**
     * Spawns the Python engine and resolves with its parsed JSON stdout.
     * Rejects on: spawn failure (e.g. interpreter not installed), non-zero
     * exit (stderr included), an {error} payload, or unparseable output.
     */
    async runPython(action, args) {
        return new Promise((resolve, reject) => {
            // Renamed from `process` — the original shadowed the Node global.
            const child = spawn(this.pythonPath, [this.scriptPath, action, ...args]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // Fix: without this handler a missing/unlaunchable interpreter
            // emitted an unhandled 'error' event and crashed the process
            // instead of rejecting the promise.
            child.on("error", (err) => {
                reject(new Error(`Data Splitter (${action}) failed to start: ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Data Splitter (${action}) failed: ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse output: ${stdout}`));
                }
            });
        });
    }
}
@@ -0,0 +1 @@
1
// Compiled output of a type-only TypeScript module: every export was erased
// at build time. The empty export statement keeps the file an ES module.
export {};