@vespermcp/mcp-server 1.2.21 → 1.2.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +410 -0
- package/build/index.js +1587 -845
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/scraper.js +85 -14
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/search/engine.js +43 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/package.json +7 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +601 -0
- package/scripts/wizard.js +306 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
package/build/search/engine.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
+
import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
|
|
2
3
|
import fs from "fs";
|
|
3
4
|
function log(msg) {
|
|
4
5
|
fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
|
|
@@ -17,9 +18,10 @@ export class SearchEngine {
|
|
|
17
18
|
async search(query, options = {}) {
|
|
18
19
|
const limit = options.limit || 5;
|
|
19
20
|
const enableJIT = options.enableJIT !== false; // Default: true
|
|
21
|
+
const intent = await analyzeDatasetQuery(query);
|
|
20
22
|
log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
|
|
21
23
|
// 1. Perform local search
|
|
22
|
-
const localResults = await this.localSearch(query, options);
|
|
24
|
+
const localResults = await this.localSearch(query, options, intent);
|
|
23
25
|
// 2. Check if JIT should be triggered
|
|
24
26
|
const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
|
|
25
27
|
if (!shouldTrigger) {
|
|
@@ -28,10 +30,10 @@ export class SearchEngine {
|
|
|
28
30
|
}
|
|
29
31
|
// 3. Trigger JIT fallback
|
|
30
32
|
console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
|
|
31
|
-
await this.jitOrchestrator.fetchAndIngest(query, 10);
|
|
33
|
+
await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
|
|
32
34
|
// 4. Re-run local search with updated index
|
|
33
35
|
console.error(`Re-searching with updated library...`);
|
|
34
|
-
const enhancedResults = await this.localSearch(query, options);
|
|
36
|
+
const enhancedResults = await this.localSearch(query, options, intent);
|
|
35
37
|
const newCount = enhancedResults.length - localResults.length;
|
|
36
38
|
if (newCount > 0) {
|
|
37
39
|
console.error(`Found ${newCount} additional results\n`);
|
|
@@ -41,7 +43,7 @@ export class SearchEngine {
|
|
|
41
43
|
/**
|
|
42
44
|
* Perform hybrid search (Vector + Lexical + Penalties)
|
|
43
45
|
*/
|
|
44
|
-
async localSearch(query, options) {
|
|
46
|
+
async localSearch(query, options, intent) {
|
|
45
47
|
const limit = options.limit || 5;
|
|
46
48
|
// 1. Parse Query
|
|
47
49
|
const words = query.toLowerCase().split(/\s+/);
|
|
@@ -66,6 +68,12 @@ export class SearchEngine {
|
|
|
66
68
|
// Filter: Safe only
|
|
67
69
|
if (options.safeOnly && metadata.license.category === "restricted")
|
|
68
70
|
continue;
|
|
71
|
+
// Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
|
|
72
|
+
// when user explicitly requested a single language
|
|
73
|
+
if (shouldExcludeByLanguage(metadata, intent)) {
|
|
74
|
+
log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
|
|
75
|
+
continue;
|
|
76
|
+
}
|
|
69
77
|
const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
|
|
70
78
|
// Filter: Explicit Negative Keywords
|
|
71
79
|
if (negativeKeywords.some(neg => text.includes(neg))) {
|
|
@@ -96,6 +104,34 @@ export class SearchEngine {
|
|
|
96
104
|
if (lexicalScore === 0 && positiveKeywords.length > 1) {
|
|
97
105
|
penalty += 0.2;
|
|
98
106
|
}
|
|
107
|
+
// Penalty: Modality Mismatch
|
|
108
|
+
// Infer the expected modality from the query and penalize mismatches.
|
|
109
|
+
// e.g., "anime quotes" is text, so image-classification datasets get penalized.
|
|
110
|
+
const textIndicators = ["quotes", "text", "nlp", "sentiment", "review", "comment", "caption", "dialogue", "chat", "translation", "summarization", "classification"];
|
|
111
|
+
const imageIndicators = ["image", "photo", "picture", "vision", "detection", "segmentation", "face", "background"];
|
|
112
|
+
const queryLower = query.toLowerCase();
|
|
113
|
+
const queryLooksText = textIndicators.some(t => queryLower.includes(t));
|
|
114
|
+
const queryLooksImage = imageIndicators.some(t => queryLower.includes(t));
|
|
115
|
+
if (queryLooksText && !queryLooksImage) {
|
|
116
|
+
const resultTask = (metadata.task || "").toLowerCase();
|
|
117
|
+
const isImageResult = resultTask.includes("image") || resultTask.includes("object-detection") ||
|
|
118
|
+
text.includes("image classification") || text.includes("image-classification") ||
|
|
119
|
+
text.includes("object detection") || text.includes("image segmentation");
|
|
120
|
+
if (isImageResult) {
|
|
121
|
+
log(`Modality penalty: text query but image dataset ${match.id}`);
|
|
122
|
+
penalty += 0.35;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
if (queryLooksImage && !queryLooksText) {
|
|
126
|
+
const resultTask = (metadata.task || "").toLowerCase();
|
|
127
|
+
const isTextResult = resultTask.includes("text-classification") || resultTask.includes("text-generation") ||
|
|
128
|
+
resultTask.includes("translation") || resultTask.includes("summarization") ||
|
|
129
|
+
resultTask.includes("question-answering");
|
|
130
|
+
if (isTextResult) {
|
|
131
|
+
log(`Modality penalty: image query but text dataset ${match.id}`);
|
|
132
|
+
penalty += 0.35;
|
|
133
|
+
}
|
|
134
|
+
}
|
|
99
135
|
// D. Accessibility Bonuses (Prioritize low-friction sources)
|
|
100
136
|
let bonus = 0;
|
|
101
137
|
const sourceBonuses = {
|
|
@@ -108,11 +144,13 @@ export class SearchEngine {
|
|
|
108
144
|
bonus = sourceBonuses[metadata.source] || 0;
|
|
109
145
|
// Final Combined Score
|
|
110
146
|
// 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
|
|
111
|
-
const
|
|
147
|
+
const intentScore = scoreDatasetAgainstIntent(metadata, intent);
|
|
148
|
+
const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
|
|
112
149
|
metadata.relevance_score = Math.round(finalScore * 100) / 100;
|
|
113
150
|
metadata.vector_score = Math.round(vectorScore * 100) / 100;
|
|
114
151
|
metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
|
|
115
152
|
metadata.accessibility_bonus = bonus;
|
|
153
|
+
metadata.intent_score = intentScore;
|
|
116
154
|
results.push(metadata);
|
|
117
155
|
}
|
|
118
156
|
// Sort by final score and limit
|
|
@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
|
2
2
|
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
3
3
|
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
4
4
|
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
5
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
|
|
5
6
|
// Common stop words to filter out for better search
|
|
6
7
|
const STOP_WORDS = new Set([
|
|
7
8
|
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
|
@@ -61,7 +62,7 @@ export class JITOrchestrator {
|
|
|
61
62
|
/**
|
|
62
63
|
* Main JIT workflow: fetch, save, index, return new datasets
|
|
63
64
|
*/
|
|
64
|
-
async fetchAndIngest(query, limit = 10) {
|
|
65
|
+
async fetchAndIngest(query, limit = 10, providedIntent) {
|
|
65
66
|
// Rate limiting check
|
|
66
67
|
if (!this.canTrigger(query)) {
|
|
67
68
|
console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
|
|
@@ -69,9 +70,12 @@ export class JITOrchestrator {
|
|
|
69
70
|
}
|
|
70
71
|
console.error(`\n[JIT] Searching live sources for: "${query}"`);
|
|
71
72
|
this.lastTriggerTime.set(query, Date.now());
|
|
72
|
-
|
|
73
|
-
const keywords = this.simplifyQuery(
|
|
74
|
-
if (
|
|
73
|
+
const intent = providedIntent || await analyzeDatasetQuery(query);
|
|
74
|
+
const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
|
|
75
|
+
if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
|
|
76
|
+
console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
|
|
77
|
+
}
|
|
78
|
+
else if (keywords.length > 0) {
|
|
75
79
|
console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
|
|
76
80
|
}
|
|
77
81
|
const newDatasets = [];
|
|
@@ -81,15 +85,16 @@ export class JITOrchestrator {
|
|
|
81
85
|
// Get existing dataset IDs to avoid duplicates
|
|
82
86
|
const existing = this.metadataStore.getAllDatasets();
|
|
83
87
|
existing.forEach(ds => existingIds.add(ds.id));
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
88
|
+
let hfResults = await this.scrapeHuggingFace(intent, limit);
|
|
89
|
+
if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
|
|
90
|
+
for (const keyword of keywords) {
|
|
91
|
+
if (hfResults.length >= limit)
|
|
92
|
+
break;
|
|
93
|
+
const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
|
|
94
|
+
for (const ds of results) {
|
|
95
|
+
if (!hfResults.some(existing => existing.id === ds.id)) {
|
|
96
|
+
hfResults.push(ds);
|
|
97
|
+
}
|
|
93
98
|
}
|
|
94
99
|
}
|
|
95
100
|
}
|
|
@@ -170,7 +175,6 @@ export class JITOrchestrator {
|
|
|
170
175
|
async scrapeHuggingFace(query, limit) {
|
|
171
176
|
const scraper = new HuggingFaceScraper();
|
|
172
177
|
try {
|
|
173
|
-
// Pass the query as a general search term
|
|
174
178
|
return await scraper.scrape(limit, true, query);
|
|
175
179
|
}
|
|
176
180
|
catch (error) {
|
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
import { classifyDomain } from "../metadata/domain.js";
|
|
2
|
+
const STOP_WORDS = new Set([
|
|
3
|
+
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
|
4
|
+
"of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
|
|
5
|
+
"be", "have", "has", "had", "do", "does", "did", "will", "would",
|
|
6
|
+
"could", "should", "may", "might", "must", "shall", "can", "need",
|
|
7
|
+
"about", "into", "through", "during", "before", "after", "above",
|
|
8
|
+
"below", "between", "under", "again", "further", "then", "once",
|
|
9
|
+
"here", "there", "when", "where", "why", "how", "all", "each",
|
|
10
|
+
"few", "more", "most", "other", "some", "such", "no", "nor", "not",
|
|
11
|
+
"only", "own", "same", "so", "than", "too", "very", "just", "also",
|
|
12
|
+
"dataset", "datasets", "data", "find", "search", "looking", "need", "want",
|
|
13
|
+
"give", "show", "me", "please"
|
|
14
|
+
]);
|
|
15
|
+
const LANGUAGE_ALIASES = {
|
|
16
|
+
english: ["english", "en", "eng"],
|
|
17
|
+
spanish: ["spanish", "es", "spa"],
|
|
18
|
+
french: ["french", "fr", "fra"],
|
|
19
|
+
german: ["german", "de", "deu"],
|
|
20
|
+
portuguese: ["portuguese", "pt", "por"],
|
|
21
|
+
chinese: ["chinese", "zh", "cmn"],
|
|
22
|
+
japanese: ["japanese", "ja", "jpn"],
|
|
23
|
+
korean: ["korean", "ko", "kor"],
|
|
24
|
+
arabic: ["arabic", "ar", "ara"],
|
|
25
|
+
russian: ["russian", "ru", "rus"],
|
|
26
|
+
hindi: ["hindi", "hi", "hin"],
|
|
27
|
+
multilingual: ["multilingual", "bilingual", "cross-lingual", "crosslingual"],
|
|
28
|
+
};
|
|
29
|
+
/**
 * Ordered task-detection table. Lookup takes the FIRST entry with a matching
 * pattern, so entries with specific phrases must come before entries whose
 * patterns are generic single words.
 */
const TASK_PATTERNS = [
    { task: "translation", patterns: [/\btranslation\b/i, /\bmachine translation\b/i, /\bparallel corpus\b/i] },
    { task: "question-answering", patterns: [/\bquestion answering\b/i, /\bqa\b/i, /\bq&a\b/i] },
    { task: "summarization", patterns: [/\bsummarization\b/i, /\bsummary\b/i, /\btl;dr\b/i] },
    { task: "sentiment-analysis", patterns: [/\bsentiment\b/i, /\bsentiment analysis\b/i] },
    // These two must precede text-classification: its bare /\bclassification\b/
    // pattern would otherwise claim "image classification" and "token classification".
    { task: "image-classification", patterns: [/\bimage classification\b/i] },
    { task: "token-classification", patterns: [/\bner\b/i, /\bnamed entity\b/i, /\btoken classification\b/i] },
    { task: "text-classification", patterns: [/\bclassification\b/i, /\bclassifier\b/i, /\btext classification\b/i] },
    { task: "text-generation", patterns: [/\btext generation\b/i, /\bgenerative\b/i, /\binstruction\b/i, /\bchat\b/i] },
    // NOTE(review): the bare "detection" pattern also captures queries such as
    // "fraud detection"; confirm whether that is intended.
    { task: "object-detection", patterns: [/\bobject detection\b/i, /\bdetection\b/i] },
];
|
|
40
|
+
const intentCache = new Map();
|
|
41
|
+
/**
 * Parse a free-text dataset query into a structured intent.
 *
 * The heuristic intent is always computed; when an LLM result is available it
 * is merged on top. The in-flight promise (not the resolved value) is cached
 * per query/requirements pair, so concurrent callers share one analysis.
 *
 * @param {string} query - Raw user query.
 * @param {string} [requirements] - Optional extra requirements text.
 * @returns {Promise<object>} The resolved intent object.
 */
export async function analyzeDatasetQuery(query, requirements) {
    const key = `${query || ""}::${requirements || ""}`;
    const resolveIntent = async () => {
        const heuristic = buildHeuristicIntent(query, requirements);
        const fromLlm = await tryLlmIntent(heuristic, requirements);
        if (!fromLlm) {
            return heuristic;
        }
        return mergeIntent(heuristic, fromLlm);
    };
    let pending = intentCache.get(key);
    if (!pending) {
        pending = resolveIntent();
        intentCache.set(key, pending);
    }
    return pending;
}
|
|
55
|
+
/**
 * Compute an additive relevance adjustment for a dataset given the parsed intent.
 *
 * Language, task, domain, requested size and positive terms add to the score
 * when they match; mismatches and negative-term hits subtract from it.
 *
 * @param {object} dataset - Dataset metadata; reads name, description, task,
 *   domain, tags, languages and total_examples.
 * @param {object} [intent] - Parsed query intent. A falsy intent scores 0.
 * @returns {number} The adjustment, rounded to two decimals.
 */
export function scoreDatasetAgainstIntent(dataset, intent) {
    if (!intent)
        return 0;
    // Tolerate sparse metadata instead of throwing on a missing array.
    const tags = dataset.tags ?? [];
    const languages = dataset.languages ?? [];
    const text = [
        dataset.name,
        dataset.description,
        dataset.task,
        dataset.domain || "",
        tags.join(" "),
        languages.join(" "),
    ].join(" ").toLowerCase();
    // Short language codes ("en", "de", "es") occur inside ordinary words, so a
    // substring test would match almost any text. Codes of three characters or
    // fewer must appear as a whole token; longer names keep the substring test.
    const textTokens = new Set(text.split(/[^a-z0-9]+/).filter(Boolean));
    const mentionsAlias = (alias) => (alias.length > 3 ? text.includes(alias) : textTokens.has(alias));
    let score = 0;
    if (intent.language) {
        const aliases = getLanguageAliases(intent.language);
        const datasetLanguages = languages.map(normalizeToken);
        const languageMatch = aliases.some(alias => datasetLanguages.includes(alias) || mentionsAlias(alias));
        const isMultilingualIntent = intent.language === "multilingual";
        if (languageMatch) {
            // Distinguish a dataset that is only in the requested language from a mixed one.
            const nonRequestedLanguages = datasetLanguages.filter(lang => !aliases.includes(lang) && lang !== "" && lang !== "unknown");
            if (nonRequestedLanguages.length === 0 || isMultilingualIntent) {
                // Purely the requested language (or the user wants multilingual): full boost.
                score += 0.55;
            }
            else {
                // Contains the language but is not exclusive to it: scale the
                // adjustment by the share of other languages.
                const ratio = nonRequestedLanguages.length / Math.max(datasetLanguages.length, 1);
                score += 0.1 - (ratio * 0.4); // from +0.1 (mostly target language) down to -0.3
            }
        }
        else if (languages.length > 0) {
            // Tagged with languages, none of them the requested one.
            score -= 0.65;
        }
        else {
            // No language information at all: mild penalty.
            score -= 0.1;
        }
    }
    if (intent.task) {
        if (matchesTask(dataset, intent.task, text)) {
            score += 0.35;
        }
        else {
            score -= 0.3;
        }
    }
    if (intent.domain && intent.domain !== "general" && intent.domain !== "unknown") {
        const datasetDomain = String(dataset.domain || "").toLowerCase();
        if (datasetDomain === intent.domain || text.includes(intent.domain)) {
            score += 0.25;
        }
        else {
            score -= 0.2;
        }
    }
    if (intent.minRows && intent.minRows > 0) {
        const totalExamples = Number(dataset.total_examples || 0);
        if (totalExamples > 0) {
            const ratio = totalExamples / intent.minRows;
            if (ratio >= 1) {
                // Meets the requested size: logarithmic bonus capped at 0.45.
                score += Math.min(0.45, 0.18 + (Math.log10(ratio + 1) * 0.15));
            }
            else if (ratio < 0.05) {
                score -= 1.2;
            }
            else if (ratio < 0.25) {
                score -= 0.8;
            }
            else if (ratio < 0.5) {
                score -= 0.45;
            }
            else {
                score -= 0.15;
            }
        }
        else {
            // Size unknown while the user asked for a minimum.
            score -= 0.08;
        }
    }
    const positiveTerms = intent.positiveTerms ?? [];
    if (positiveTerms.length > 0) {
        const matches = positiveTerms.filter(term => text.includes(term)).length;
        score += Math.min(0.25, matches * 0.06);
    }
    const negativeTerms = intent.negativeTerms ?? [];
    if (negativeTerms.some(term => text.includes(term))) {
        score -= 0.7;
    }
    return Math.round(score * 100) / 100;
}
|
|
143
|
+
/**
 * Return the search string stored on a parsed intent.
 *
 * @param {object} intent - Parsed intent object.
 * @returns {string} The intent's `searchQuery` value.
 */
export function buildIntentSearchQuery(intent) {
    const { searchQuery } = intent;
    return searchQuery;
}
|
|
146
|
+
/**
 * Translate a parsed intent into HuggingFace filter tags.
 *
 * A language tag is emitted only for a single concrete language that has a
 * known code; a task tag is emitted whenever the intent carries a task.
 * Example result: ["language:en", "task_ids:text-classification"].
 *
 * @param {object} intent - Parsed intent (reads `language` and `task`).
 * @returns {string[]} Filter tags, possibly empty.
 */
export function buildHuggingFaceFilterTags(intent) {
    const { language, task } = intent;
    const filters = [];
    const wantsSingleLanguage = Boolean(language) && language !== "multilingual";
    const code = wantsSingleLanguage ? LANGUAGE_TO_CODE[language] : undefined;
    if (code) {
        filters.push(`language:${code}`);
    }
    if (task) {
        filters.push(`task_ids:${task}`);
    }
    return filters;
}
|
|
162
|
+
const LANGUAGE_TO_CODE = {
|
|
163
|
+
english: "en",
|
|
164
|
+
spanish: "es",
|
|
165
|
+
french: "fr",
|
|
166
|
+
german: "de",
|
|
167
|
+
portuguese: "pt",
|
|
168
|
+
chinese: "zh",
|
|
169
|
+
japanese: "ja",
|
|
170
|
+
korean: "ko",
|
|
171
|
+
arabic: "ar",
|
|
172
|
+
russian: "ru",
|
|
173
|
+
hindi: "hi",
|
|
174
|
+
};
|
|
175
|
+
const BILINGUAL_INDICATORS = [
|
|
176
|
+
"translation", "parallel", "bilingual", "multilingual",
|
|
177
|
+
"cross-lingual", "crosslingual", "machine-translation",
|
|
178
|
+
"aligned", "comparable corpus",
|
|
179
|
+
];
|
|
180
|
+
/**
 * Decide whether a dataset must be dropped for a single-language request.
 *
 * Applies only when the intent names one concrete language (not
 * "multilingual"). A dataset is excluded when any of its language tags falls
 * outside the requested language's aliases, when its name/description/tags
 * contain a bilingual indicator, or when they mention another known language
 * by its full name.
 *
 * @param {object} dataset - Dataset metadata (reads languages, name, description, tags).
 * @param {object} [intent] - Parsed intent.
 * @returns {boolean} true when the dataset should be EXCLUDED.
 */
export function shouldExcludeByLanguage(dataset, intent) {
    const requested = intent?.language;
    if (!requested || requested === "multilingual") {
        return false;
    }
    const accepted = getLanguageAliases(requested);
    const tagged = dataset.languages
        .map(normalizeToken)
        .filter(code => code && code !== "unknown");
    // One tag outside the accepted aliases is enough to exclude. This covers
    // mixed-language datasets and datasets in a different language entirely.
    if (tagged.some(code => !accepted.includes(code))) {
        return true;
    }
    // Fall back to free text: name, description and tags.
    const haystack = [
        dataset.name,
        dataset.description,
        dataset.tags.join(" "),
    ].join(" ").toLowerCase();
    if (BILINGUAL_INDICATORS.some(marker => haystack.includes(marker))) {
        return true;
    }
    // Only full language names are checked here; short codes could appear in regular text.
    return Object.entries(LANGUAGE_ALIASES).some(([name, names]) => {
        if (name === requested || name === "multilingual") {
            return false;
        }
        return Boolean(names && names[0] && haystack.includes(names[0]));
    });
}
|
|
220
|
+
/**
 * Build an intent from the query text using local heuristics only.
 *
 * Terms written as "-term" are collected as negative terms. They are removed
 * from the text used to detect language, task and domain, so an exclusion such
 * as "-french" cannot be mistaken for a request for French data.
 *
 * @param {string} query - Raw user query.
 * @param {string} [requirements] - Optional extra requirements text, appended to the query.
 * @returns {object} Intent with originalQuery, normalizedQuery, searchQuery,
 *   positiveTerms, negativeTerms, language, task, domain, minRows and llmBacked=false.
 */
function buildHeuristicIntent(query, requirements) {
    const originalQuery = `${query || ""} ${requirements || ""}`.trim();
    const normalizedQuery = originalQuery.toLowerCase();
    const negativeTerms = [...normalizedQuery.matchAll(/(?:^|\s)-([\w-]{2,})/g)].map(match => normalizeToken(match[1]));
    // Same pattern as above, used to strip the exclusions before detection.
    // Hyphens inside words ("cross-lingual") are untouched because the dash
    // must follow start-of-text or whitespace.
    const detectionText = normalizedQuery.replace(/(^|\s)-[\w-]{2,}/g, "$1").trim();
    const positiveTerms = tokenize(normalizedQuery)
        .filter(token => !negativeTerms.includes(token))
        .slice(0, 8);
    const task = detectTask(detectionText);
    const language = detectLanguage(detectionText);
    const domain = classifyDomain(detectionText, [], detectionText, task);
    const minRows = extractRequestedRows(normalizedQuery);
    // Language, task and a specific domain lead the search string, followed by
    // the positive terms; duplicates and empty values are dropped.
    const searchTerms = [
        language,
        task,
        domain !== "general" && domain !== "unknown" ? domain : undefined,
        ...positiveTerms,
    ].filter((value, index, self) => !!value && self.indexOf(value) === index);
    return {
        originalQuery,
        normalizedQuery,
        searchQuery: searchTerms.slice(0, 6).join(" ") || normalizedQuery,
        positiveTerms,
        negativeTerms,
        language,
        task: task || undefined,
        domain,
        minRows,
        llmBacked: false,
    };
}
|
|
250
|
+
/**
 * Overlay an LLM-derived intent on the heuristic one.
 *
 * Scalar fields come from the LLM when it supplied a value, otherwise from the
 * heuristic. Term lists are concatenated (LLM terms first), normalised and
 * de-duplicated. The search query is rebuilt from the merged fields.
 *
 * @param {object} base - Heuristic intent.
 * @param {object} llmIntent - Parsed LLM payload.
 * @returns {object} Merged intent with llmBacked=true.
 */
function mergeIntent(base, llmIntent) {
    const preferLlm = (llmValue, fallback) => (llmValue ? normalizeToken(llmValue) : fallback);
    const combineTerms = (llmTerms, baseTerms) => {
        const normalized = [...(llmTerms || []), ...baseTerms].map(normalizeToken);
        return Array.from(new Set(normalized)).filter(Boolean);
    };
    const llmRows = llmIntent.minRows;
    const hasLlmRows = typeof llmRows === "number" && Number.isFinite(llmRows);
    const merged = {
        ...base,
        language: preferLlm(llmIntent.language, base.language),
        task: preferLlm(llmIntent.task, base.task),
        domain: preferLlm(llmIntent.domain, base.domain),
        minRows: hasLlmRows ? llmRows : base.minRows,
        positiveTerms: combineTerms(llmIntent.positiveTerms, base.positiveTerms),
        negativeTerms: combineTerms(llmIntent.negativeTerms, base.negativeTerms),
        llmBacked: true,
    };
    const specificDomain = merged.domain !== "general" && merged.domain !== "unknown" ? merged.domain : undefined;
    const candidates = [merged.language, merged.task, specificDomain, ...merged.positiveTerms];
    // Keep the first occurrence of every non-empty term.
    const uniqueTerms = [];
    for (const term of candidates) {
        if (term && !uniqueTerms.includes(term)) {
            uniqueTerms.push(term);
        }
    }
    merged.searchQuery = uniqueTerms.slice(0, 6).join(" ") || merged.normalizedQuery;
    return merged;
}
|
|
277
|
+
/**
 * Best-effort LLM intent extraction.
 *
 * Uses OpenAI when OPENAI_API_KEY is set, otherwise Gemini when
 * GEMINI_API_KEY or GOOGLE_API_KEY is set. Any provider failure is
 * deliberately converted to `undefined` so the caller falls back to the
 * heuristic intent.
 *
 * @param {object} base - Heuristic intent passed to the provider as context.
 * @param {string} [requirements] - Optional extra requirements text.
 * @returns {Promise<object|undefined>} Parsed LLM intent, or undefined.
 */
async function tryLlmIntent(base, requirements) {
    const ignoreFailure = () => undefined;
    if (process.env.OPENAI_API_KEY) {
        return callOpenAiIntent(base, requirements).catch(ignoreFailure);
    }
    const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
    if (!geminiKey) {
        return undefined;
    }
    return callGeminiIntent(base, requirements, geminiKey).catch(ignoreFailure);
}
|
|
288
|
+
/**
 * Ask the OpenAI chat-completions endpoint to extract a dataset intent.
 *
 * The request is aborted after 5 seconds. A non-OK HTTP status yields
 * `undefined`; network/abort errors propagate to the caller.
 *
 * @param {object} base - Heuristic intent, sent along as context.
 * @param {string} [requirements] - Optional extra requirements text.
 * @returns {Promise<object|undefined>} Parsed intent payload, or undefined.
 */
async function callOpenAiIntent(base, requirements) {
    const abort = new AbortController();
    const timer = setTimeout(() => abort.abort(), 5000);
    try {
        const systemMessage = {
            role: "system",
            content: "Extract dataset search intent as JSON with keys: language, task, domain, minRows, positiveTerms, negativeTerms. Use null for unknowns.",
        };
        const userMessage = {
            role: "user",
            content: JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base }),
        };
        const payload = {
            model: process.env.OPENAI_MODEL || "gpt-4o-mini",
            temperature: 0,
            response_format: { type: "json_object" },
            messages: [systemMessage, userMessage],
        };
        const response = await fetch("https://api.openai.com/v1/chat/completions", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
            },
            body: JSON.stringify(payload),
            signal: abort.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const reply = await response.json();
        return parseIntentPayload(reply?.choices?.[0]?.message?.content);
    }
    finally {
        // Always release the timer, whether the request succeeded or threw.
        clearTimeout(timer);
    }
}
|
|
326
|
+
/**
 * Ask the Gemini generateContent endpoint to extract a dataset intent.
 *
 * The API key is sent in the `x-goog-api-key` header rather than the URL
 * query string, so it does not end up in request logs or proxy traces. The
 * request is aborted after 5 seconds. A non-OK HTTP status yields
 * `undefined`; network/abort errors propagate to the caller.
 *
 * @param {object} base - Heuristic intent, sent along as context.
 * @param {string} [requirements] - Optional extra requirements text.
 * @param {string} apiKey - Gemini / Google API key.
 * @returns {Promise<object|undefined>} Parsed intent payload, or undefined.
 */
async function callGeminiIntent(base, requirements, apiKey) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 5000);
    try {
        const model = process.env.GEMINI_MODEL || "gemini-1.5-flash";
        const response = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent`, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "x-goog-api-key": apiKey,
            },
            body: JSON.stringify({
                generationConfig: {
                    temperature: 0,
                    responseMimeType: "application/json",
                },
                contents: [{
                        role: "user",
                        parts: [{
                                text: `Extract dataset search intent as JSON with keys language, task, domain, minRows, positiveTerms, negativeTerms. Query payload: ${JSON.stringify({ query: base.originalQuery, requirements: requirements || null, heuristic: base })}`,
                            }],
                    }],
            }),
            signal: controller.signal,
        });
        if (!response.ok) {
            return undefined;
        }
        const body = await response.json();
        const content = body?.candidates?.[0]?.content?.parts?.[0]?.text;
        return parseIntentPayload(content);
    }
    finally {
        // Always release the timer, whether the request succeeded or threw.
        clearTimeout(timeout);
    }
}
|
|
361
|
+
/**
 * Convert raw LLM text into a sanitised intent payload.
 *
 * Accepts both camelCase and snake_case keys for minRows and the term lists
 * (camelCase wins). Fields of the wrong type become `undefined`.
 *
 * @param {*} content - Raw text returned by the model.
 * @returns {object|undefined} Payload with language, task, domain, minRows,
 *   positiveTerms and negativeTerms, or undefined when nothing parseable was found.
 */
function parseIntentPayload(content) {
    if (typeof content !== "string" || !content.trim()) {
        return undefined;
    }
    const jsonText = extractJsonObject(content);
    if (!jsonText) {
        return undefined;
    }
    const stringOrUndefined = (value) => (typeof value === "string" ? value : undefined);
    const firstNumber = (...candidates) => candidates.find(value => typeof value === "number");
    const firstStringList = (...candidates) => {
        const list = candidates.find(value => Array.isArray(value));
        return list ? list.filter((item) => typeof item === "string") : undefined;
    };
    try {
        const parsed = JSON.parse(jsonText);
        return {
            language: stringOrUndefined(parsed.language),
            task: stringOrUndefined(parsed.task),
            domain: stringOrUndefined(parsed.domain),
            minRows: firstNumber(parsed.minRows, parsed.min_rows),
            positiveTerms: firstStringList(parsed.positiveTerms, parsed.positive_terms),
            negativeTerms: firstStringList(parsed.negativeTerms, parsed.negative_terms),
        };
    }
    catch {
        // Malformed JSON from the model is treated as "no LLM intent".
        return undefined;
    }
}
|
|
396
|
+
/**
 * Pull the outermost `{ ... }` span out of a text blob.
 *
 * Takes everything from the first "{" to the last "}" of the trimmed text, so
 * surrounding prose or code fences are discarded.
 *
 * @param {string} text - Text that may contain a JSON object.
 * @returns {string|undefined} The brace-delimited substring, or undefined when
 *   no opening brace precedes a closing brace.
 */
function extractJsonObject(text) {
    const candidate = text.trim();
    const open = candidate.indexOf("{");
    const close = candidate.lastIndexOf("}");
    const hasObject = open !== -1 && close > open;
    return hasObject ? candidate.slice(open, close + 1) : undefined;
}
|
|
408
|
+
/**
 * Find the first language whose alias appears in the text as a standalone word.
 *
 * An alias counts when it is bounded on both sides by a non-letter or by the
 * start/end of the text. Languages are tried in table order.
 *
 * @param {string} text - Lower-cased query text.
 * @returns {string|undefined} Canonical language name, or undefined.
 */
function detectLanguage(text) {
    const appearsAsWord = (alias) => {
        const pattern = new RegExp(`(^|[^a-z])${escapeRegex(alias)}([^a-z]|$)`, "i");
        return pattern.test(text);
    };
    const hit = Object.entries(LANGUAGE_ALIASES).find(([, aliases]) => aliases.some(appearsAsWord));
    return hit ? hit[0] : undefined;
}
|
|
416
|
+
/**
 * Return the task of the first pattern-table entry that matches the text.
 *
 * @param {string} text - Query text to inspect.
 * @returns {string|undefined} Task identifier, or undefined when nothing matches.
 */
function detectTask(text) {
    for (const { task, patterns } of TASK_PATTERNS) {
        if (patterns.some(expression => expression.test(text))) {
            return task;
        }
    }
    return undefined;
}
|
|
420
|
+
/**
 * Split text into unique, meaningful search tokens.
 *
 * Punctuation other than hyphens is replaced by spaces; each piece is
 * normalised; tokens of two characters or fewer, stop words and pure numbers
 * are discarded. First-occurrence order is kept.
 *
 * @param {string} text - Text to tokenise.
 * @returns {string[]} De-duplicated tokens.
 */
function tokenize(text) {
    const unique = new Set();
    const pieces = text.replace(/[^\w\s-]/g, " ").split(/\s+/);
    for (const piece of pieces) {
        const token = normalizeToken(piece);
        const tooShort = token.length <= 2;
        if (tooShort || STOP_WORDS.has(token) || /^\d+$/.test(token)) {
            continue;
        }
        unique.add(token);
    }
    return [...unique];
}
|
|
427
|
+
/**
 * Normalise a token: lower-case it, strip leading non-alphanumerics and
 * trailing characters that are neither alphanumeric nor a hyphen.
 *
 * Missing values (`undefined`/`null`) normalise to the empty string and other
 * non-string values are stringified first, so callers may pass optional
 * metadata fields directly.
 *
 * @param {*} value - Token to normalise.
 * @returns {string} The normalised token (possibly empty).
 */
function normalizeToken(value) {
    const raw = String(value ?? "");
    return raw.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9-]+$/g, "").trim();
}
|
|
430
|
+
/**
 * Extract the minimum number of rows the user asked for, if any.
 *
 * Strategies are tried in order; the first that yields a positive number wins:
 *   1. number + word multiplier ("1.5 million", "500 thousand", "2 mil")
 *   2. number followed by samples/rows/records ("10 000 rows")
 *   3. number + k/m/b suffix followed by samples/rows/records ("10k rows")
 *   4. the largest comma-grouped number ("1,000,000")
 *   5. the largest number with a standalone k/m/b suffix ("50k")
 *   6. the largest bare 4-9 digit number that does not look like a calendar year
 *
 * @param {string} text - Lower-cased query text.
 * @returns {number|undefined} Requested row count, or undefined when none is found.
 */
function extractRequestedRows(text) {
    const isPositive = (value) => Number.isFinite(value) && value > 0;
    const suffixMultiplier = (suffix) => (suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000);
    // 1. "1 million", "2.5 billion", "500 thousand" etc.
    const wordMultipliers = {
        thousand: 1_000, million: 1_000_000, billion: 1_000_000_000,
        mil: 1_000_000, bil: 1_000_000_000,
    };
    const wordPattern = new RegExp(`(\\d+(?:\\.\\d+)?)\\s*(${Object.keys(wordMultipliers).join("|")})\\b`, "i");
    const wordMatch = text.match(wordPattern);
    if (wordMatch) {
        const value = Math.round(Number(wordMatch[1]) * wordMultipliers[wordMatch[2].toLowerCase()]);
        if (isPositive(value))
            return value;
    }
    // 2. Explicit unit: digits (optionally grouped by commas/spaces) + unit word.
    const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
    if (explicit) {
        const value = Number(explicit[1].replace(/[\s,]/g, ""));
        if (isPositive(value)) {
            return value;
        }
    }
    // 3. Suffix + unit: "10k rows", "2.5m samples".
    const humanSized = text.match(/(\d+(?:\.\d+)?)\s*([kmb])\s*(samples?|rows?|records?)/i);
    if (humanSized) {
        const value = Math.round(Number(humanSized[1]) * suffixMultiplier(humanSized[2].toLowerCase()));
        if (isPositive(value)) {
            return value;
        }
    }
    // 4. Comma-grouped numbers anywhere in the text.
    const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
        .map(match => Number(match[0].replace(/,/g, "")))
        .filter(isPositive);
    if (commaNumbers.length > 0) {
        return Math.max(...commaNumbers);
    }
    // 5. Standalone suffixed numbers anywhere in the text.
    const humanSizedAnywhere = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
        .map(match => Math.round(Number(match[1]) * suffixMultiplier(match[2].toLowerCase())))
        .filter(isPositive);
    if (humanSizedAnywhere.length > 0) {
        return Math.max(...humanSizedAnywhere);
    }
    // 6. Bare numbers. A four-digit value in 1900-2099 with no unit is far more
    // likely a year ("tweets from 2020") than a row count, so it is skipped.
    // "2020 rows" is still recognised by strategy 2.
    const looksLikeYear = (digits) => digits.length === 4 && Number(digits) >= 1900 && Number(digits) <= 2099;
    const bareNumbers = [...text.matchAll(/\b\d{4,9}\b/g)]
        .filter(match => !looksLikeYear(match[0]))
        .map(match => Number(match[0]))
        .filter(isPositive);
    if (bareNumbers.length > 0) {
        return Math.max(...bareNumbers);
    }
    return undefined;
}
|
|
487
|
+
/**
 * Check whether a dataset matches a requested task.
 *
 * The task is expanded to its known spelling variants. The dataset matches
 * when its own task field, or the supplied searchable text, contains any
 * variant. A dataset without a task field is matched on the text alone
 * rather than throwing.
 *
 * @param {object} dataset - Dataset metadata (reads the optional `task` field).
 * @param {string} task - Requested task identifier.
 * @param {string} text - Lower-cased searchable text for the dataset.
 * @returns {boolean} true when any variant is found.
 */
function matchesTask(dataset, task, text) {
    const normalizedTask = normalizeToken(task);
    const aliases = {
        "question-answering": ["question-answering", "qa", "question answering"],
        "text-classification": ["text-classification", "classification", "text classification"],
        "token-classification": ["token-classification", "ner", "named entity"],
        "sentiment-analysis": ["sentiment-analysis", "sentiment"],
        translation: ["translation", "machine-translation", "parallel corpus"],
        summarization: ["summarization", "summary"],
        "text-generation": ["text-generation", "generation", "chat", "instruction"],
        "image-classification": ["image-classification", "image classification"],
        "object-detection": ["object-detection", "object detection"],
    };
    const variants = aliases[normalizedTask] || [normalizedTask];
    // Normalise once, and treat a missing task as empty.
    const datasetTask = normalizeToken(dataset.task || "");
    return variants.some(variant => datasetTask.includes(variant) || text.includes(variant));
}
|
|
503
|
+
/**
 * Resolve a language name to its list of normalised aliases.
 *
 * A language missing from the alias table resolves to a single-element list
 * holding its own normalised name.
 *
 * @param {string} language - Language name to look up.
 * @returns {string[]} Normalised aliases.
 */
function getLanguageAliases(language) {
    const key = normalizeToken(language);
    const known = LANGUAGE_ALIASES[key];
    const aliases = known ? known : [key];
    return aliases.map(normalizeToken);
}
|
|
507
|
+
/**
 * Escape regular-expression metacharacters so a string can be embedded
 * literally in a RegExp source.
 *
 * @param {string} value - Text to escape.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(value) {
    const metacharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metacharacters, "\\$&");
}
|