vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
package/build/search/engine.js
DELETED
|
@@ -1,190 +0,0 @@
|
|
|
1
|
-
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
-
import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
/**
 * Append a timestamped diagnostic line to `debug.log` in the current
 * working directory.
 *
 * Logging is best-effort: if the file cannot be written (for example the
 * working directory is read-only), the failure is reported once on stderr
 * and otherwise ignored so that a diagnostic message can never abort the
 * operation that emitted it.
 *
 * @param {string} msg - Text to record after the ISO-8601 timestamp.
 * @returns {void}
 */
function log(msg) {
    try {
        fs.appendFileSync("debug.log", `${new Date().toISOString()} ${msg}\n`);
    }
    catch (err) {
        // Warn a single time instead of on every call to avoid flooding stderr.
        if (!log.warned) {
            log.warned = true;
            console.error(`WARNING: unable to write debug.log: ${err.message}`);
        }
    }
}
|
|
7
|
-
/**
 * Hybrid dataset search: vector similarity re-ranked with lexical overlap,
 * heuristic penalties, source bonuses and an intent score, with an optional
 * just-in-time (JIT) fetch from live sources when local results look weak.
 */
export class SearchEngine {
    metadataStore;
    vectorStore;
    embedder;
    jitOrchestrator;
    /**
     * @param {object} metadataStore - Must provide `getDataset(id)`.
     * @param {object} vectorStore - Must provide `search(vector, k)` returning `{ id, score }` matches.
     * @param {object} embedder - Must provide async `embed(text)`.
     */
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
        this.jitOrchestrator = new JITOrchestrator(metadataStore, vectorStore, embedder);
    }
    /**
     * Search locally and, when the local results are judged insufficient,
     * ingest fresh datasets through the JIT orchestrator and search again.
     *
     * @param {string} query - Free-text query; words prefixed with `-` exclude matches.
     * @param {object} [options]
     * @param {number} [options.limit] - Maximum results (falls back to 5 when falsy).
     * @param {boolean} [options.safeOnly] - Drop datasets whose license category is "restricted".
     * @param {boolean} [options.enableJIT] - JIT fallback is on unless explicitly `false`.
     * @returns {Promise<object[]>} Dataset metadata objects sorted by `relevance_score`.
     */
    async search(query, options = {}) {
        const limit = options.limit || 5;
        const enableJIT = options.enableJIT !== false; // Default: true
        const intent = await analyzeDatasetQuery(query);
        log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
        // 1. Perform local search
        const localResults = await this.localSearch(query, options, intent);
        // 2. Check if JIT should be triggered
        const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
        if (!shouldTrigger) {
            log(`JIT not triggered. Returning ${localResults.length} local results`);
            return localResults;
        }
        // 3. Trigger JIT fallback
        console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
        await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
        // 4. Re-run local search with updated index
        console.error(`Re-searching with updated library...`);
        const enhancedResults = await this.localSearch(query, options, intent);
        const newCount = enhancedResults.length - localResults.length;
        if (newCount > 0) {
            console.error(`Found ${newCount} additional results\n`);
        }
        return enhancedResults;
    }
    /**
     * Perform hybrid search (Vector + Lexical + Penalties).
     *
     * Each candidate's final score is
     * `0.7 * vector + 0.3 * lexical - penalties + sourceBonus + intentScore`,
     * rounded to two decimals. The score components are written onto the
     * metadata object returned by the metadata store before it is returned.
     *
     * @param {string} query - Free-text query.
     * @param {object} [options] - Same options as {@link SearchEngine#search}.
     * @param {object} intent - Parsed query intent, forwarded to the intent helpers.
     * @returns {Promise<object[]>} At most `limit` datasets, best first.
     */
    async localSearch(query, options = {}, intent) {
        const limit = options.limit || 5;
        const queryLower = query.toLowerCase();
        // 1. Parse Query
        const words = queryLower.split(/\s+/);
        const positiveKeywords = words.filter(w => !w.startsWith("-") && w.length > 2);
        // A lone "-" would become the empty string, which every text "includes";
        // discard empty exclusions so they cannot wipe out the whole result set.
        const negativeKeywords = words
            .filter(w => w.startsWith("-"))
            .map(w => w.slice(1))
            .filter(w => w.length > 0);
        // Automatic Penalty Detection: Is this a "Finance" query without "Crypto" mentioned?
        const financeTerms = ["financial", "finance", "banking", "economy", "stock", "loan", "forecasting", "bank"];
        const isFinanceQuery = positiveKeywords.some(w => financeTerms.includes(w));
        const mentionsCrypto = queryLower.includes("crypto") || queryLower.includes("bitcoin");
        // Expected modality depends only on the query, so infer it once up front.
        // e.g., "anime quotes" is text, so image-classification datasets get penalized.
        const textIndicators = ["quotes", "text", "nlp", "sentiment", "review", "comment", "caption", "dialogue", "chat", "translation", "summarization", "classification"];
        const imageIndicators = ["image", "photo", "picture", "vision", "detection", "segmentation", "face", "background"];
        const queryLooksText = textIndicators.some(t => queryLower.includes(t));
        const queryLooksImage = imageIndicators.some(t => queryLower.includes(t));
        // Accessibility bonuses (prioritize low-friction sources). A Map avoids
        // accidental hits on inherited object properties for odd source names.
        const sourceBonuses = new Map([
            ["huggingface", 0.1],
            ["uci", 0.1],
            ["github", 0.1],
            ["worldbank", 0.1],
            ["nasa", 0.1],
        ]);
        // 2. Get query vector
        const queryVector = await this.embedder.embed(query);
        log(`Vector generated, length=${queryVector.length}`);
        // 3. Search in vector store (fetch more candidates for reranking)
        const matches = this.vectorStore.search(queryVector, 40);
        log(`Vector search found ${matches.length} matches`);
        // 4. Score and filter candidates
        const results = [];
        for (const match of matches) {
            const metadata = this.metadataStore.getDataset(match.id);
            if (!metadata) {
                continue;
            }
            // Filter: Safe only
            if (options.safeOnly && metadata.license.category === "restricted") {
                continue;
            }
            // Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
            // when user explicitly requested a single language
            if (shouldExcludeByLanguage(metadata, intent)) {
                log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
                continue;
            }
            // Datasets without a tag list are scored on name/description only.
            const tags = Array.isArray(metadata.tags) ? metadata.tags : [];
            const text = `${metadata.name} ${metadata.description} ${tags.join(" ")}`.toLowerCase();
            // Filter: Explicit Negative Keywords
            if (negativeKeywords.some(neg => text.includes(neg))) {
                log(`Negative penalty: Dropped ${match.id} due to keyword match`);
                continue;
            }
            // A. Vector Score, taken as-is from the vector store match
            const vectorScore = match.score;
            // B. Lexical Score: fraction of positive keywords found in the text
            let lexicalScore = 0;
            if (positiveKeywords.length > 0) {
                const matchesCount = positiveKeywords.filter(kw => text.includes(kw)).length;
                lexicalScore = matchesCount / positiveKeywords.length;
            }
            // C. Penalties
            let penalty = 0;
            // Penalty: Domain Drift (Finance vs Crypto)
            // If user asks for finance but NOT crypto, and the result is crypto, penalize heavily
            if (isFinanceQuery && !mentionsCrypto) {
                const isCryptoResult = text.includes("crypto") || text.includes("bitcoin") || text.includes("ethereum") || text.includes("blockchain");
                if (isCryptoResult) {
                    log(`Penalty applied: Crypto result in Finance query for ${match.id}`);
                    penalty += 0.4; // Massive penalty
                }
            }
            // Penalty: Weak Lexical Match
            // If it's a "semantic vibe" match but has ZERO matching keywords from the query
            if (lexicalScore === 0 && positiveKeywords.length > 1) {
                penalty += 0.2;
            }
            // Penalty: Modality Mismatch
            const resultTask = (metadata.task || "").toLowerCase();
            if (queryLooksText && !queryLooksImage) {
                const isImageResult = resultTask.includes("image") || resultTask.includes("object-detection") ||
                    text.includes("image classification") || text.includes("image-classification") ||
                    text.includes("object detection") || text.includes("image segmentation");
                if (isImageResult) {
                    log(`Modality penalty: text query but image dataset ${match.id}`);
                    penalty += 0.35;
                }
            }
            if (queryLooksImage && !queryLooksText) {
                const isTextResult = resultTask.includes("text-classification") || resultTask.includes("text-generation") ||
                    resultTask.includes("translation") || resultTask.includes("summarization") ||
                    resultTask.includes("question-answering");
                if (isTextResult) {
                    log(`Modality penalty: image query but text dataset ${match.id}`);
                    penalty += 0.35;
                }
            }
            // D. Accessibility bonus for the dataset's source
            const bonus = sourceBonuses.get(metadata.source) || 0;
            // Final Combined Score
            // 70% Vector, 30% Lexical, minus Penalties, plus Bonuses and intent score
            const intentScore = scoreDatasetAgainstIntent(metadata, intent);
            const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
            metadata.relevance_score = Math.round(finalScore * 100) / 100;
            metadata.vector_score = Math.round(vectorScore * 100) / 100;
            metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
            metadata.accessibility_bonus = bonus;
            metadata.intent_score = intentScore;
            results.push(metadata);
        }
        // Sort by final score and limit
        return results
            .sort((a, b) => b.relevance_score - a.relevance_score)
            .slice(0, limit);
    }
    /**
     * Determine if JIT should be triggered.
     *
     * Triggers when there are fewer than 3 results, when the top
     * `relevance_score` is below 0.75, or when the query has at least four
     * words and the top score is below 0.85.
     *
     * @param {object[]} results - Local results, best first.
     * @param {string} query - The original query text.
     * @returns {boolean} `true` when live sources should be consulted.
     */
    shouldTriggerJIT(results, query) {
        const queryWords = query.trim().split(/\s+/).length;
        // Condition 1: Zero results (Always trigger JIT)
        if (results.length === 0) {
            log(`JIT trigger: No results found for "${query}"`);
            return true;
        }
        // Condition 2: Very few results
        if (results.length < 3) {
            log(`JIT trigger: Only ${results.length} results found`);
            return true;
        }
        // Condition 3: Low confidence (top result has low similarity)
        const topScore = results[0]?.relevance_score || 0;
        if (topScore < 0.75) {
            log(`JIT trigger: Low confidence (top score: ${topScore}, threshold: 0.75)`);
            return true;
        }
        // Condition 4: Long specific queries with mediocre matches
        if (queryWords >= 4 && topScore < 0.85) {
            log(`JIT trigger: Long query with mediocre top score (${topScore})`);
            return true;
        }
        return false;
    }
}
|
|
@@ -1,262 +0,0 @@
|
|
|
1
|
-
import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
2
|
-
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
3
|
-
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
4
|
-
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
5
|
-
import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
|
|
6
|
-
// Common English stop words (plus generic search filler such as "dataset"
// or "find") that are removed before keywords are sent to remote APIs.
const STOP_WORDS = new Set([
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
    "be", "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "must", "shall", "can", "need",
    "about", "into", "through", "during", "before", "after", "above",
    "below", "between", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "each",
    "few", "more", "most", "other", "some", "such", "no", "nor", "not",
    "only", "own", "same", "so", "than", "too", "very", "just", "also",
    "dataset", "datasets", "data", "find", "search", "get", "looking"
]);
/**
 * Just-In-Time Orchestrator
 * Automatically fetches and indexes new datasets when local search is insufficient
 */
export class JITOrchestrator {
    metadataStore;
    vectorStore;
    embedder;
    // Normalized query -> timestamp (ms) of the last JIT run for that query.
    lastTriggerTime = new Map();
    RATE_LIMIT_MS = 30000; // 30 seconds between triggers for same query
    /**
     * @param {object} metadataStore - Must provide `getAllDatasets`, `saveDataset` and transaction methods.
     * @param {object} vectorStore - Must provide `add(id, vector)` and `save()`.
     * @param {object} embedder - Must provide async `embed(text)`.
     */
    constructor(metadataStore, vectorStore, embedder) {
        this.metadataStore = metadataStore;
        this.vectorStore = vectorStore;
        this.embedder = embedder;
    }
    /**
     * Simplify a complex user query into keywords that work better with APIs.
     * HuggingFace and other APIs often fail on long multi-word queries.
     *
     * Punctuation is removed while letters, combining marks and digits of any
     * script are kept, so accented or non-Latin words are not mangled.
     *
     * @param {string} query - Free-text query.
     * @returns {string[]} Up to three unique keywords; when more than three
     *   survive filtering, the longest ones are preferred.
     */
    simplifyQuery(query) {
        const words = query.toLowerCase()
            .replace(/[^\p{L}\p{M}\p{N}_\s-]/gu, "")
            .split(/\s+/)
            .filter(w => w.length > 2 && !STOP_WORDS.has(w));
        const unique = [...new Set(words)];
        // If we have a lot of words, prioritize longer/more specific ones
        if (unique.length > 3) {
            unique.sort((a, b) => b.length - a.length);
        }
        return unique.slice(0, 3);
    }
    /**
     * Build the rate-limit key for a query: trimmed, lower-cased and with
     * whitespace runs collapsed, so trivially different spellings of the
     * same query share one rate-limit slot.
     *
     * @param {string} query
     * @returns {string}
     */
    rateLimitKey(query) {
        return String(query).trim().toLowerCase().replace(/\s+/g, " ");
    }
    /**
     * Check if JIT should be triggered based on rate limiting.
     *
     * @param {string} query
     * @returns {boolean} `true` when the query has no recorded run or the last
     *   run is older than `RATE_LIMIT_MS`.
     */
    canTrigger(query) {
        const lastTrigger = this.lastTriggerTime.get(this.rateLimitKey(query));
        if (lastTrigger === undefined) {
            return true;
        }
        return Date.now() - lastTrigger > this.RATE_LIMIT_MS;
    }
    /**
     * Record a JIT run for `query` and drop entries whose rate-limit window
     * has already passed, keeping the map from growing without bound.
     *
     * @param {string} query
     * @returns {void}
     */
    recordTrigger(query) {
        const now = Date.now();
        for (const [key, timestamp] of this.lastTriggerTime) {
            if (now - timestamp > this.RATE_LIMIT_MS) {
                this.lastTriggerTime.delete(key);
            }
        }
        this.lastTriggerTime.set(this.rateLimitKey(query), now);
    }
    /**
     * Main JIT workflow: fetch, save, index, return new datasets.
     *
     * Sources are queried one after another (HuggingFace first, then UCI,
     * GitHub, World Bank and NASA); datasets whose id is already in the
     * metadata store or was seen from an earlier source are skipped.
     *
     * @param {string} query - Original user query.
     * @param {number} [limit=10] - HuggingFace result budget; other sources get half of it.
     * @param {object} [providedIntent] - Pre-computed intent; analysed from `query` when omitted.
     * @returns {Promise<object[]>} Newly ingested datasets, or `[]` when rate
     *   limited or when an error occurs (the error message is logged to stderr).
     */
    async fetchAndIngest(query, limit = 10, providedIntent) {
        // Rate limiting check
        if (!this.canTrigger(query)) {
            console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
            return [];
        }
        console.error(`\n[JIT] Searching live sources for: "${query}"`);
        this.recordTrigger(query);
        const intent = providedIntent || await analyzeDatasetQuery(query);
        const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
        if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
            console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
        }
        else if (keywords.length > 0) {
            console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
        }
        const newDatasets = [];
        const existingIds = new Set();
        const sourceResults = {};
        // Record one source's hit count and keep only datasets not seen before.
        const collect = (sourceKey, label, found) => {
            sourceResults[sourceKey] = found.length;
            console.error(` [source] ${label}: ${found.length} datasets`);
            for (const ds of found) {
                if (!existingIds.has(ds.id)) {
                    newDatasets.push(ds);
                    existingIds.add(ds.id);
                }
            }
        };
        try {
            // Get existing dataset IDs to avoid duplicates
            for (const ds of this.metadataStore.getAllDatasets()) {
                existingIds.add(ds.id);
            }
            // 1. Scrape HuggingFace with the full intent first.
            // NOTE(review): the intent object is passed where the keyword fallback
            // below passes a string — confirm the scraper accepts both.
            const hfResults = await this.scrapeHuggingFace(intent, limit);
            if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
                // Too few hits: retry per keyword and merge the unique results.
                const seenHfIds = new Set(hfResults.map(ds => ds.id));
                const perKeywordLimit = Math.ceil(limit / Math.max(keywords.length, 1));
                for (const keyword of keywords) {
                    if (hfResults.length >= limit) {
                        break;
                    }
                    const results = await this.scrapeHuggingFace(keyword, perKeywordLimit);
                    for (const ds of results) {
                        if (!seenHfIds.has(ds.id)) {
                            seenHfIds.add(ds.id);
                            hfResults.push(ds);
                        }
                    }
                }
            }
            collect("HuggingFace", "HuggingFace", hfResults);
            // 2-5. Remaining open-access sources, queried sequentially with half the budget.
            const secondaryLimit = Math.floor(limit / 2);
            const secondarySources = [
                ["UCI", "UCI", () => this.scrapeUCI(query, secondaryLimit)],
                ["GitHub", "GitHub", () => this.scrapeGitHub(query, secondaryLimit)],
                ["WorldBank", "World Bank", () => this.scrapeWorldBank(query, secondaryLimit)],
                ["NASA", "NASA", () => this.scrapeNASA(query, secondaryLimit)],
            ];
            for (const [sourceKey, label, scrape] of secondarySources) {
                collect(sourceKey, label, await scrape());
            }
            // Save and index new datasets
            if (newDatasets.length > 0) {
                await this.saveAndIndex(newDatasets);
                console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
            }
            else {
                // Provide helpful feedback when no results found
                const allZero = Object.values(sourceResults).every(v => v === 0);
                if (allZero) {
                    console.error(`[JIT] No datasets found across all sources.`);
                    console.error(`[JIT] Try: broader keywords, or set HF_TOKEN for better HuggingFace access`);
                }
                else {
                    console.error(`[JIT] All found datasets already in index`);
                }
            }
            return newDatasets;
        }
        catch (error) {
            console.error(`ERROR [JIT] Error during fetch and ingest:`, error.message);
            return [];
        }
    }
    /**
     * Scrape HuggingFace with free-text search.
     *
     * @param {string|object} query - Search input forwarded to the scraper.
     * @param {number} limit - Maximum datasets to request.
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape fails.
     */
    async scrapeHuggingFace(query, limit) {
        const scraper = new HuggingFaceScraper();
        try {
            return await scraper.scrape(limit, true, query);
        }
        catch (error) {
            console.error(` ERROR: HuggingFace scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape UCI.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape fails.
     */
    async scrapeUCI(query, limit) {
        const scraper = new UCIScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: UCI scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape GitHub.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape fails.
     */
    async scrapeGitHub(query, limit) {
        const scraper = new GitHubScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: GitHub scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape World Bank.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape fails.
     */
    async scrapeWorldBank(query, limit) {
        const scraper = new WorldBankScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: World Bank scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Scrape NASA.
     *
     * @param {string} query
     * @param {number} limit
     * @returns {Promise<object[]>} Scraped datasets, or `[]` if the scrape fails.
     */
    async scrapeNASA(query, limit) {
        const scraper = new NASAScraper();
        try {
            return await scraper.scrape(query, limit);
        }
        catch (error) {
            console.error(` ERROR: NASA scrape failed: ${error.message}`);
            return [];
        }
    }
    /**
     * Save datasets to DB and generate embeddings.
     *
     * Embeddings are computed before anything is written: if embedding fails,
     * nothing has been committed, so no dataset ends up stored without a
     * vector (and then skipped as "already existing" on later runs).
     *
     * @param {object[]} datasets - Datasets with `id`, `name`, `description` and optional `tags`.
     * @returns {Promise<void>}
     * @throws Re-throws embedding errors and, after rolling back, database errors.
     */
    async saveAndIndex(datasets) {
        // 1. Generate embeddings up front (no side effects on the stores yet)
        const embedded = [];
        for (const ds of datasets) {
            const tags = Array.isArray(ds.tags) ? ds.tags : [];
            const text = `${ds.name} ${ds.description} ${tags.join(" ")}`;
            embedded.push([ds.id, await this.embedder.embed(text)]);
        }
        // 2. Save to database in a single transaction
        this.metadataStore.beginTransaction();
        try {
            for (const ds of datasets) {
                this.metadataStore.saveDataset(ds);
            }
            this.metadataStore.commit();
        }
        catch (e) {
            this.metadataStore.rollback();
            throw e;
        }
        // 3. Update the vector store and persist it to disk
        for (const [id, vector] of embedded) {
            this.vectorStore.add(id, vector);
        }
        this.vectorStore.save();
    }
}
|