@vespermcp/mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +259 -0
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +50 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +60 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +111 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/storage-manager.js +20 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +45 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/index.js +632 -0
- package/build/ingestion/hf-downloader.js +64 -0
- package/build/ingestion/ingestor.js +96 -0
- package/build/ingestion/kaggle-downloader.js +79 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +129 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +353 -0
- package/build/metadata/store.js +325 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/quality/analyzer.js +57 -0
- package/build/quality/image-analyzer.js +46 -0
- package/build/quality/media-analyzer.js +46 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +73 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +129 -0
- package/build/search/jit-orchestrator.js +232 -0
- package/build/search/vector-store.js +105 -0
- package/build/splitting/splitter.js +57 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +227 -0
- package/build/utils/downloader.js +52 -0
- package/mcp-config-template.json +15 -0
- package/package.json +84 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/cleaner.py +196 -0
- package/src/python/export_engine.py +112 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/quality_engine.py +243 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/worldbank_adapter.py +99 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { QualityOrchestrator } from "../quality/quality-orchestrator.js";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
import { execSync } from "child_process";
|
|
5
|
+
// Exercises the unified quality pipeline end-to-end: builds a small
// multimodal fixture (CSV + Python-generated image + Python-generated audio),
// runs QualityOrchestrator over it, and prints a PASS/PARTIAL/FAIL status.
async function runTest() {
    console.log("--- Testing Unified Quality Report ---");
    const projectRoot = path.resolve(".");
    const orchestrator = new QualityOrchestrator(projectRoot);

    // Working directory for the mixed-modality fixture.
    const testDataDir = path.join(projectRoot, "data", "test-unified");
    if (!fs.existsSync(testDataDir)) {
        fs.mkdirSync(testDataDir, { recursive: true });
    }

    // Text modality: a tiny CSV table.
    const csvPath = path.join(testDataDir, "data.csv");
    fs.writeFileSync(csvPath, "id,name,value\\n1,Alice,10\\n2,Bob,20\\n3,Charlie,30\\n");

    // Image modality: rendered by a short Python helper (needs numpy + PIL).
    const imagePath = path.join(testDataDir, "test_image.png");
    const imageScript = `
import numpy as np
from PIL import Image
img = Image.fromarray(np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8))
img.save('${imagePath.replace(/\\/g, "\\\\")}')
`;
    fs.writeFileSync(path.join(testDataDir, "gen_image.py"), imageScript);
    try {
        execSync(`python "${path.join(testDataDir, "gen_image.py")}"`);
    } catch {
        console.log("⚠️ Could not generate image. Skipping image modality.");
    }

    // Audio modality: a 1-second 440 Hz sine wave (needs numpy + soundfile).
    const audioPath = path.join(testDataDir, "test_audio.wav");
    const wavScript = `
import numpy as np
import soundfile as sf
sample_rate = 44100
duration = 1.0
frequency = 440.0
t = np.linspace(0, duration, int(sample_rate * duration))
audio = 0.5 * np.sin(2 * np.pi * frequency * t)
sf.write('${audioPath.replace(/\\/g, "\\\\")}', audio, sample_rate)
`;
    fs.writeFileSync(path.join(testDataDir, "gen_audio.py"), wavScript);
    try {
        execSync(`python "${path.join(testDataDir, "gen_audio.py")}"`);
    } catch {
        console.log("⚠️ Could not generate audio. Skipping audio modality.");
    }

    // Run the unified analysis and report what was detected.
    console.log(`\\nAnalyzing multimodal dataset at ${testDataDir}...`);
    try {
        const report = await orchestrator.generateReport("test-unified-dataset", testDataDir, null);
        console.log("\\n📊 Unified Quality Report:");
        console.log(`- Dataset ID: ${report.dataset_id}`);
        console.log(`- Modalities: ${report.modalities.join(", ")}`);
        console.log(`- Overall Quality Score: ${report.overall_quality_score}/100`);
        if (report.image_quality) {
            console.log(`\\n🖼️ Image Quality:`);
            console.log(` - Total Images: ${report.image_quality.total_images}`);
            console.log(` - Avg Resolution: ${report.image_quality.avg_resolution}`);
        }
        if (report.audio_quality) {
            console.log(`\\n🎵 Audio Quality:`);
            console.log(` - Total Files: ${report.audio_quality.total_files}`);
            console.log(` - Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s`);
        }
        console.log(`\\n💡 Recommendations:`);
        report.recommendations.forEach(rec => console.log(` - ${rec}`));

        // PASS requires text plus at least one media modality and a nonzero score.
        const detected = new Set(report.modalities);
        const hasText = detected.has("text");
        const hasMedia = detected.has("image") || detected.has("audio");
        if (hasText && hasMedia && report.overall_quality_score > 0) {
            console.log("\\nVERIFICATION_STATUS: ✅ PASS");
        } else {
            console.log("\\nVERIFICATION_STATUS: ⚠️ PARTIAL (Some modalities missing)");
        }
    } catch (e) {
        console.error(`Analysis failed: ${e.message}`);
        console.log("\\nVERIFICATION_STATUS: ❌ FAIL");
    }
}

runTest().catch(console.error);
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { pipeline, env } from "@xenova/transformers";
|
|
2
|
+
// Disable local model check to ensure it downloads if not found
|
|
3
|
+
env.allowLocalModels = false;
|
|
4
|
+
env.useBrowserCache = false;
|
|
5
|
+
/**
 * Lazily-initialized singleton wrapper around a transformers.js
 * feature-extraction pipeline.
 */
export class Embedder {
    static instance;
    extractor = null;
    // In-flight model load, memoized so concurrent callers share one load
    // instead of each triggering a separate pipeline download/initialization.
    initPromise = null;
    modelName = "Xenova/paraphrase-multilingual-MiniLM-L12-v2";
    constructor() { }
    /**
     * Returns the process-wide singleton instance.
     * @returns {Embedder}
     */
    static getInstance() {
        if (!Embedder.instance) {
            Embedder.instance = new Embedder();
        }
        return Embedder.instance;
    }
    /**
     * Loads the feature-extraction pipeline once. Safe to call repeatedly and
     * concurrently: the first call starts the load, later calls await the
     * same promise. (Previously, concurrent embed() calls before the model
     * finished loading would each invoke pipeline() again.)
     */
    async init() {
        if (this.extractor)
            return;
        if (!this.initPromise) {
            this.initPromise = (async () => {
                console.error(`Loading embedding model: ${this.modelName}...`);
                this.extractor = await pipeline("feature-extraction", this.modelName);
                console.error("Embedding model loaded successfully.");
            })();
        }
        return this.initPromise;
    }
    /**
     * Embeds text into a mean-pooled, normalized vector.
     * @param {string} text - Input text.
     * @returns {Promise<Float32Array>} Embedding vector.
     */
    async embed(text) {
        if (!this.extractor) {
            await this.init();
        }
        const result = await this.extractor(text, {
            pooling: "mean",
            normalize: true,
        });
        // result.data is already a Float32Array in Xenova/transformers
        return result.data;
    }
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
// Append a timestamped line to the local debug log (relative to cwd).
function log(msg) {
    const line = `${new Date().toISOString()} ${msg}\n`;
    fs.appendFileSync("debug.log", line);
}
|
|
6
|
+
/**
 * Hybrid search engine over the local dataset library.
 *
 * Ranking blends vector similarity (70%) with lexical keyword overlap (30%),
 * minus heuristic penalties. When local results look sparse or low-confidence,
 * it triggers the JIT orchestrator to scrape live sources and re-searches.
 */
export class SearchEngine {
    metadataStore; // dataset metadata lookup by id (supplies name/description/tags/license)
    vectorStore; // embedding index; search() returns { id, score } matches
    embedder; // turns query text into an embedding vector
    jitOrchestrator; // live-source fallback built from the three stores above
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
        this.jitOrchestrator = new JITOrchestrator(metadataStore, vectorStore, embedder);
    }
    /**
     * Search the library, transparently falling back to JIT ingestion.
     * @param {string} query - Free-text query; "-word" terms act as exclusions.
     * @param {{limit?: number, safeOnly?: boolean, enableJIT?: boolean}} options
     * @returns {Promise<object[]>} Ranked dataset metadata, scores attached.
     */
    async search(query, options = {}) {
        const limit = options.limit || 5;
        const enableJIT = options.enableJIT !== false; // Default: true
        log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
        // 1. Perform local search
        const localResults = await this.localSearch(query, options);
        // 2. Check if JIT should be triggered
        const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
        if (!shouldTrigger) {
            log(`JIT not triggered. Returning ${localResults.length} local results`);
            return localResults;
        }
        // 3. Trigger JIT fallback (fetches and indexes up to 10 new datasets)
        console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
        await this.jitOrchestrator.fetchAndIngest(query, 10);
        // 4. Re-run local search with updated index
        console.error(`Re-searching with updated library...`);
        const enhancedResults = await this.localSearch(query, options);
        // NOTE(review): both passes are capped at `limit`, so this delta can
        // understate how many datasets were actually ingested.
        const newCount = enhancedResults.length - localResults.length;
        if (newCount > 0) {
            console.error(`Found ${newCount} additional results\n`);
        }
        return enhancedResults;
    }
    /**
     * Perform hybrid search (Vector + Lexical + Penalties)
     *
     * Fetches 40 vector candidates, filters (license, explicit negative
     * keywords), then reranks with: 0.7 * vectorScore + 0.3 * lexicalScore
     * - penalties (crypto drift on finance queries, zero-keyword matches).
     */
    async localSearch(query, options) {
        const limit = options.limit || 5;
        // 1. Parse Query: positive terms (len > 2) vs "-term" exclusions
        const words = query.toLowerCase().split(/\s+/);
        const positiveKeywords = words.filter(w => !w.startsWith("-") && w.length > 2);
        const negativeKeywords = words.filter(w => w.startsWith("-")).map(w => w.slice(1));
        // Automatic Penalty Detection: Is this a "Finance" query without "Crypto" mentioned?
        const financeTerms = ["financial", "finance", "banking", "economy", "stock", "loan", "forecasting", "bank"];
        const isFinanceQuery = positiveKeywords.some(w => financeTerms.includes(w));
        const mentionsCrypto = query.toLowerCase().includes("crypto") || query.toLowerCase().includes("bitcoin");
        // 2. Get query vector
        const queryVector = await this.embedder.embed(query);
        log(`Vector generated, length=${queryVector.length}`);
        // 3. Search in vector store (fetch more candidates for reranking)
        const matches = this.vectorStore.search(queryVector, 40);
        log(`Vector search found ${matches.length} matches`);
        // 4. Score and filter candidates
        const results = [];
        for (const match of matches) {
            const metadata = this.metadataStore.getDataset(match.id);
            if (!metadata)
                continue;
            // Filter: Safe only — drop restricted-license datasets when requested
            if (options.safeOnly && metadata.license.category === "restricted")
                continue;
            // Searchable haystack for lexical matching and penalty detection
            const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
            // Filter: Explicit Negative Keywords
            if (negativeKeywords.some(neg => text.includes(neg))) {
                log(`Negative penalty: Dropped ${match.id} due to keyword match`);
                continue;
            }
            // A. Vector Score (0.0 to 1.0)
            const vectorScore = match.score;
            // B. Lexical Score (fraction of positive keywords appearing in text)
            let lexicalScore = 0;
            if (positiveKeywords.length > 0) {
                const matchesCount = positiveKeywords.filter(kw => text.includes(kw)).length;
                lexicalScore = matchesCount / positiveKeywords.length;
            }
            // C. Penalties
            let penalty = 0;
            // Penalty: Domain Drift (Finance vs Crypto)
            // If user asks for finance but NOT crypto, and the result is crypto, penalize heavily
            if (isFinanceQuery && !mentionsCrypto) {
                const isCryptoResult = text.includes("crypto") || text.includes("bitcoin") || text.includes("ethereum") || text.includes("blockchain");
                if (isCryptoResult) {
                    log(`Penalty applied: Crypto result in Finance query for ${match.id}`);
                    penalty += 0.4; // Massive penalty
                }
            }
            // Penalty: Weak Lexical Match
            // If it's a "semantic vibe" match but has ZERO matching keywords from the query
            if (lexicalScore === 0 && positiveKeywords.length > 1) {
                penalty += 0.2;
            }
            // Final Combined Score
            // 70% Vector, 30% Lexical, minus Penalties
            const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty;
            // NOTE(review): scores are written directly onto the metadata
            // object returned by the store — callers see mutated objects.
            metadata.relevance_score = Math.round(finalScore * 100) / 100;
            metadata.vector_score = Math.round(vectorScore * 100) / 100;
            metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
            results.push(metadata);
        }
        // Sort by final score and limit
        return results
            .sort((a, b) => b.relevance_score - a.relevance_score)
            .slice(0, limit);
    }
    /**
     * Determine if JIT should be triggered.
     * Triggers on fewer than 3 results or a top combined score below 0.60.
     * (`query` is currently unused; kept for future query-aware heuristics.)
     */
    shouldTriggerJIT(results, query) {
        // Condition 1: Very few results
        if (results.length < 3) {
            log(`JIT trigger: Only ${results.length} results found`);
            return true;
        }
        // Condition 2: Low confidence (top result has low similarity)
        const topScore = results[0]?.relevance_score || 0;
        if (topScore < 0.60) {
            log(`JIT trigger: Low confidence (top score: ${topScore})`);
            return true;
        }
        return false;
    }
}
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
2
|
+
import { KaggleMetadataScraper } from "../metadata/kaggle-scraper.js";
|
|
3
|
+
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
4
|
+
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
5
|
+
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
6
|
+
/**
|
|
7
|
+
* Just-In-Time Orchestrator
|
|
8
|
+
* Automatically fetches and indexes new datasets when local search is insufficient
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Just-In-Time Orchestrator
 * Automatically fetches and indexes new datasets when local search is insufficient
 */
export class JITOrchestrator {
    metadataStore; // dataset metadata persistence (transactions + lookup)
    vectorStore; // embedding index, persisted to disk
    embedder; // text -> embedding vector
    lastTriggerTime = new Map(); // query -> last trigger timestamp (ms)
    RATE_LIMIT_MS = 30000; // 30 seconds between triggers for same query
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
    }
    /**
     * Check if JIT should be triggered based on rate limiting
     */
    canTrigger(query) {
        const lastTrigger = this.lastTriggerTime.get(query);
        if (!lastTrigger)
            return true;
        const elapsed = Date.now() - lastTrigger;
        return elapsed > this.RATE_LIMIT_MS;
    }
    /**
     * Append candidates whose id is not yet known, updating the id set so
     * later sources cannot re-add the same dataset.
     * (Replaces six copy-pasted dedupe loops.)
     */
    collectUnique(candidates, existingIds, sink) {
        for (const ds of candidates) {
            if (!existingIds.has(ds.id)) {
                sink.push(ds);
                existingIds.add(ds.id);
            }
        }
    }
    /**
     * Run a scraper call, logging and swallowing failures so one broken
     * source never aborts the whole JIT pass.
     * @param {string} label - Source name used in the error message.
     * @param {() => Promise<object[]>} fn - The scrape invocation.
     * @returns {Promise<object[]>} Results, or [] on failure.
     */
    async safeScrape(label, fn) {
        try {
            return await fn();
        }
        catch (error) {
            console.error(` ERROR: ${label} scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Main JIT workflow: fetch, save, index, return new datasets
     */
    async fetchAndIngest(query, limit = 10) {
        // Rate limiting check
        if (!this.canTrigger(query)) {
            console.error(`[JIT] Rate limit: Query "${query}" triggered too recently`);
            return [];
        }
        console.error(`\n[JIT] Searching live sources for: "${query}"`);
        this.lastTriggerTime.set(query, Date.now());
        const newDatasets = [];
        const existingIds = new Set();
        try {
            // Seed with existing dataset IDs to avoid duplicates
            const existing = this.metadataStore.getAllDatasets();
            existing.forEach(ds => existingIds.add(ds.id));
            // Secondary sources each get half the requested budget.
            const perSource = Math.floor(limit / 2);
            // 1. Scrape HuggingFace
            const hfResults = await this.scrapeHuggingFace(query, limit);
            console.error(` HuggingFace: Found ${hfResults.length} datasets`);
            this.collectUnique(hfResults, existingIds, newDatasets);
            // 2. Scrape Kaggle (if credentials available)
            if (process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY) {
                const kaggleResults = await this.scrapeKaggle(query, perSource);
                console.error(` Kaggle: Found ${kaggleResults.length} datasets`);
                // Namespace Kaggle ids so they cannot collide with other sources.
                kaggleResults.forEach(ds => { ds.id = `kaggle:${ds.id}`; });
                this.collectUnique(kaggleResults, existingIds, newDatasets);
            }
            // 3. Scrape UCI
            const uciResults = await this.scrapeUCI(query, perSource);
            console.error(` UCI: Found ${uciResults.length} datasets`);
            this.collectUnique(uciResults, existingIds, newDatasets);
            // 4. Scrape GitHub
            const githubResults = await this.scrapeGitHub(query, perSource);
            console.error(` GitHub: Found ${githubResults.length} datasets`);
            this.collectUnique(githubResults, existingIds, newDatasets);
            // 5. Scrape World Bank
            const wbResults = await this.scrapeWorldBank(query, perSource);
            console.error(` World Bank: Found ${wbResults.length} datasets`);
            this.collectUnique(wbResults, existingIds, newDatasets);
            // 6. Scrape NASA
            const nasaResults = await this.scrapeNASA(query, perSource);
            console.error(` NASA: Found ${nasaResults.length} datasets`);
            this.collectUnique(nasaResults, existingIds, newDatasets);
            // 7. Save and index new datasets
            if (newDatasets.length > 0) {
                await this.saveAndIndex(newDatasets);
                console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
            }
            else {
                console.error(` [JIT] No new datasets found`);
            }
            return newDatasets;
        }
        catch (error) {
            console.error(`ERROR [JIT] Error during fetch and ingest:`, error.message);
            return [];
        }
    }
    /**
     * Scrape HuggingFace with free-text search
     */
    async scrapeHuggingFace(query, limit) {
        const scraper = new HuggingFaceScraper();
        // Use the query as a domain filter for now.
        // In the future, we can add a freeTextSearch parameter to the scraper.
        return this.safeScrape("HuggingFace", () => scraper.scrape(limit, true, query));
    }
    /**
     * Scrape Kaggle with search query (no-op without credentials)
     */
    async scrapeKaggle(query, limit) {
        const kaggleUser = process.env.KAGGLE_USERNAME;
        const kaggleKey = process.env.KAGGLE_KEY;
        if (!kaggleUser || !kaggleKey) {
            return [];
        }
        return this.safeScrape("Kaggle", () => {
            const scraper = new KaggleMetadataScraper(kaggleUser, kaggleKey);
            return scraper.scrape(query, limit);
        });
    }
    /**
     * Scrape UCI
     */
    async scrapeUCI(query, limit) {
        const scraper = new UCIScraper();
        return this.safeScrape("UCI", () => scraper.scrape(query, limit));
    }
    /**
     * Scrape GitHub
     */
    async scrapeGitHub(query, limit) {
        const scraper = new GitHubScraper();
        return this.safeScrape("GitHub", () => scraper.scrape(query, limit));
    }
    /**
     * Scrape World Bank
     */
    async scrapeWorldBank(query, limit) {
        const scraper = new WorldBankScraper();
        return this.safeScrape("World Bank", () => scraper.scrape(query, limit));
    }
    /**
     * Scrape NASA
     */
    async scrapeNASA(query, limit) {
        const scraper = new NASAScraper();
        return this.safeScrape("NASA", () => scraper.scrape(query, limit));
    }
    /**
     * Save datasets to DB (transactionally) and generate embeddings
     */
    async saveAndIndex(datasets) {
        // 1. Save to database — all-or-nothing
        this.metadataStore.beginTransaction();
        try {
            for (const ds of datasets) {
                this.metadataStore.saveDataset(ds);
            }
            this.metadataStore.commit();
        }
        catch (e) {
            this.metadataStore.rollback();
            throw e;
        }
        // 2. Generate embeddings and update vector store
        for (const ds of datasets) {
            const text = `${ds.name} ${ds.description} ${ds.tags.join(" ")}`;
            const vector = await this.embedder.embed(text);
            this.vectorStore.add(ds.id, vector);
        }
        // 3. Persist vector store to disk
        this.vectorStore.save();
    }
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import path from "path";
|
|
3
|
+
/**
 * On-disk vector index: a JSON metadata file ({ ids, dimension, count,
 * updatedAt }) plus a sibling ".bin" file of raw little-endian float32s,
 * with brute-force cosine-similarity search in memory.
 */
export class VectorStore {
    idToVector = new Map(); // id -> Float32Array embedding
    filePath; // JSON metadata path
    binPath; // binary vector blob path (filePath with .bin extension)
    /**
     * @param {string} filePath - JSON metadata path. Any existing store at
     *   this location is loaded eagerly.
     */
    constructor(filePath) {
        this.filePath = filePath;
        // Anchor at end-of-string so a ".json" appearing mid-path is untouched.
        this.binPath = filePath.replace(/\.json$/, ".bin");
        this.load();
    }
    /**
     * Load vectors from disk. Prefers the binary format; falls back to
     * migrating a legacy JSON array of { id, vector } entries.
     */
    load() {
        if (fs.existsSync(this.filePath) && fs.existsSync(this.binPath)) {
            try {
                const metadata = JSON.parse(fs.readFileSync(this.filePath, "utf-8"));
                const buffer = fs.readFileSync(this.binPath);
                const ids = metadata.ids;
                const dim = metadata.dimension;
                for (let i = 0; i < ids.length; i++) {
                    const start = i * dim * 4;
                    // Read with readFloatLE rather than a Float32Array view on
                    // buffer.buffer: pooled Buffers are not guaranteed 4-byte
                    // aligned (a raw view can throw RangeError and silently
                    // drop the whole index via the catch below), and this
                    // matches the explicit little-endian writeFloatLE in save().
                    const vector = new Float32Array(dim);
                    for (let j = 0; j < dim; j++) {
                        vector[j] = buffer.readFloatLE(start + j * 4);
                    }
                    this.idToVector.set(ids[i], vector);
                }
                console.error(`Loaded ${this.idToVector.size} vectors from binary store`);
            }
            catch (err) {
                console.error("Failed to load vector store:", err);
                this.idToVector.clear();
            }
        }
        else if (fs.existsSync(this.filePath)) {
            // Migration from old JSON format
            try {
                const data = JSON.parse(fs.readFileSync(this.filePath, "utf-8"));
                if (Array.isArray(data)) {
                    for (const entry of data) {
                        this.idToVector.set(entry.id, new Float32Array(entry.vector));
                    }
                    console.error(`Migrated ${this.idToVector.size} vectors from legacy JSON`);
                    this.save(); // Convert to binary immediately
                }
            }
            catch (err) {
                console.error("Migration failed:", err);
            }
        }
    }
    /**
     * Persist all vectors: raw little-endian float32s to binPath, ids and
     * dimension to filePath. No-op when the store is empty.
     */
    save() {
        const dir = path.dirname(this.filePath);
        if (!fs.existsSync(dir)) {
            fs.mkdirSync(dir, { recursive: true });
        }
        const ids = Array.from(this.idToVector.keys());
        if (ids.length === 0)
            return;
        // All vectors are assumed to share the first vector's dimension.
        const dim = this.idToVector.get(ids[0]).length;
        const buffer = Buffer.alloc(ids.length * dim * 4);
        for (let i = 0; i < ids.length; i++) {
            const vector = this.idToVector.get(ids[i]);
            const byteOffset = i * dim * 4;
            for (let j = 0; j < dim; j++) {
                buffer.writeFloatLE(vector[j], byteOffset + j * 4);
            }
        }
        fs.writeFileSync(this.binPath, buffer);
        fs.writeFileSync(this.filePath, JSON.stringify({
            ids,
            dimension: dim,
            count: ids.length,
            updatedAt: new Date().toISOString()
        }, null, 2));
        console.error(`Saved ${ids.length} vectors to binary store (${this.binPath})`);
    }
    /**
     * Insert or replace a vector (coerced to Float32Array). In-memory only;
     * call save() to persist.
     */
    add(id, vector) {
        this.idToVector.set(id, vector instanceof Float32Array ? vector : new Float32Array(vector));
    }
    /**
     * Brute-force cosine-similarity search over all stored vectors.
     * @param {Float32Array|number[]} queryVector
     * @param {number} [limit=10] - Max results.
     * @returns {{id: string, score: number}[]} Best matches, highest score first.
     */
    search(queryVector, limit = 10) {
        const q = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);
        const results = [];
        for (const [id, v] of this.idToVector.entries()) {
            results.push({
                id,
                score: this.cosineSimilarity(q, v)
            });
        }
        return results
            .sort((a, b) => b.score - a.score)
            .slice(0, limit);
    }
    /**
     * Cosine similarity of two equal-length vectors; 0 when either is all-zero.
     */
    cosineSimilarity(v1, v2) {
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < v1.length; i++) {
            dotProduct += v1[i] * v2[i];
            normA += v1[i] * v1[i];
            normB += v2[i] * v2[i];
        }
        const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
        return magnitude === 0 ? 0 : dotProduct / magnitude;
    }
    /** All stored vector ids (insertion order). */
    getAllIds() {
        return Array.from(this.idToVector.keys());
    }
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { spawn } from "child_process";
|
|
2
|
+
import path from "path";
|
|
3
|
+
/**
 * Thin wrapper around the Python splitter engine (splitter_engine.py):
 * spawns the interpreter and exchanges JSON over stdout.
 */
export class DataSplitter {
    // Interpreter command; assumes "python" is on PATH.
    pythonPath = "python";
    scriptPath; // absolute path to splitter_engine.py
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "splitter_engine.py");
    }
    /**
     * Splits a dataset into Train/Val/Test sets based on config
     */
    async split(filePath, config) {
        return this.runPython("split", [filePath, JSON.stringify(config)]);
    }
    /**
     * Validates a split for leakage and distribution
     */
    async validate(paths, options) {
        const config = {
            paths,
            id_column: options?.id_column,
            target_column: options?.target_column
        };
        return this.runPython("validate", [JSON.stringify(config)]);
    }
    /**
     * Spawn the Python engine and parse its JSON stdout.
     * @param {string} action - Engine subcommand ("split" | "validate").
     * @param {string[]} args - Additional CLI arguments.
     * @returns {Promise<object>} Parsed JSON result.
     * @throws {Error} On spawn failure, non-zero exit, an engine-reported
     *   `error` field, or unparseable output.
     */
    async runPython(action, args) {
        return new Promise((resolve, reject) => {
            // Named "child" — the previous `const process` shadowed the
            // Node.js global `process`.
            const child = spawn(this.pythonPath, [this.scriptPath, action, ...args]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // Without this handler a missing interpreter emits "error" and the
            // promise could hang forever instead of rejecting.
            child.on("error", (err) => {
                reject(new Error(`Data Splitter (${action}) failed to start: ${err.message}`, { cause: err }));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Data Splitter (${action}) failed: ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    // Preserve the parse failure as the cause for debugging.
                    reject(new Error(`Failed to parse output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|