@vespermcp/mcp-server 1.2.21 → 1.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cache/service.js +7 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +441 -0
- package/build/index.js +1815 -839
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/arxiv-source.js +229 -0
- package/build/metadata/circuit-breaker.js +62 -0
- package/build/metadata/github-source.js +203 -0
- package/build/metadata/hackernews-source.js +123 -0
- package/build/metadata/quality.js +27 -0
- package/build/metadata/scraper.js +85 -14
- package/build/metadata/semantic-scholar-source.js +138 -0
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/scripts/test-phase1-webcore-quality.js +104 -0
- package/build/search/engine.js +45 -6
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/build/web/extract-web.js +297 -0
- package/build/web/fusion-engine.js +457 -0
- package/build/web/types.js +1 -0
- package/build/web/web-core.js +242 -0
- package/package.json +12 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +652 -0
- package/scripts/wizard.js +338 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
|
@@ -12,6 +12,19 @@ except Exception:
|
|
|
12
12
|
HAS_KAGGLE = False
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
# Lower-case file suffixes (leading dot included) that are matched against a
# file's extension when looking for image files.
IMAGE_EXTENSIONS = set(
    (
        ".jpg",
        ".jpeg",
        ".png",
        ".webp",
        ".bmp",
        ".gif",
        ".tiff",
        ".tif",
        ".svg",
    )
)
|
|
26
|
+
|
|
27
|
+
|
|
15
28
|
def _ensure_auth() -> Dict[str, Any]:
|
|
16
29
|
if not HAS_KAGGLE:
|
|
17
30
|
return {
|
|
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
|
135
148
|
return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
136
149
|
|
|
137
150
|
|
|
138
|
-
def
|
|
151
|
+
def _find_image_files(root: str) -> List[str]:
    """Return the sorted paths of all image files found by walking ``root``.

    A file is kept when its lower-cased extension is a member of
    ``IMAGE_EXTENSIONS``. Each returned path is the walked directory joined
    with the file name.
    """
    matches: List[str] = [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(root)
        for filename in filenames
        if os.path.splitext(filename)[1].lower() in IMAGE_EXTENSIONS
    ]
    return sorted(matches)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
|
|
163
|
+
relative_path = os.path.relpath(full_path, root).replace("\\", "/")
|
|
164
|
+
parent_dir = os.path.dirname(relative_path)
|
|
165
|
+
parts = [part for part in parent_dir.split("/") if part and part != "."]
|
|
166
|
+
|
|
167
|
+
split = None
|
|
168
|
+
label = None
|
|
169
|
+
if parts:
|
|
170
|
+
first = parts[0].lower()
|
|
171
|
+
if first in {"train", "test", "val", "valid", "validation"}:
|
|
172
|
+
split = parts[0]
|
|
173
|
+
if len(parts) > 1:
|
|
174
|
+
label = parts[-1]
|
|
175
|
+
else:
|
|
176
|
+
label = parts[-1]
|
|
177
|
+
|
|
178
|
+
record: Dict[str, Any] = {
|
|
179
|
+
"id": index,
|
|
180
|
+
"image_path": os.path.abspath(full_path),
|
|
181
|
+
"relative_path": relative_path,
|
|
182
|
+
"file_name": os.path.basename(full_path),
|
|
183
|
+
"extension": os.path.splitext(full_path)[1].lower().lstrip("."),
|
|
184
|
+
}
|
|
185
|
+
if split:
|
|
186
|
+
record["split"] = split
|
|
187
|
+
if label:
|
|
188
|
+
record["label"] = label
|
|
189
|
+
return record
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _write_image_manifest(root: str, image_files: List[str]) -> str:
    """Write a JSON-lines manifest describing ``image_files`` and return its path.

    The manifest is created (or overwritten) at
    ``<root>/_vesper_image_manifest.jsonl``. Each line is the JSON form of
    ``_infer_image_record(root, path, position)``, where ``position`` is the
    file's index in ``image_files``. Non-ASCII characters are written as-is.
    """
    manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
    with open(manifest_path, "w", encoding="utf-8") as out:
        for position, image_path in enumerate(image_files):
            entry = _infer_image_record(root, image_path, position)
            out.write(json.dumps(entry, ensure_ascii=False))
            out.write("\n")
    return manifest_path
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _pick_best_file(root: str) -> Dict[str, Any]:
|
|
139
201
|
candidates: List[str] = []
|
|
140
202
|
for base, _, files in os.walk(root):
|
|
141
203
|
for name in files:
|
|
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
|
|
|
145
207
|
candidates.append(full)
|
|
146
208
|
|
|
147
209
|
if not candidates:
|
|
210
|
+
image_files = _find_image_files(root)
|
|
211
|
+
if image_files:
|
|
212
|
+
manifest_path = _write_image_manifest(root, image_files)
|
|
213
|
+
return {
|
|
214
|
+
"local_path": manifest_path,
|
|
215
|
+
"dataset_kind": "image-manifest",
|
|
216
|
+
"image_count": len(image_files),
|
|
217
|
+
}
|
|
148
218
|
raise RuntimeError("No suitable data file found after download")
|
|
149
219
|
|
|
150
220
|
# prioritize common tabular formats
|
|
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
|
|
|
152
222
|
for ext in priorities:
|
|
153
223
|
for c in candidates:
|
|
154
224
|
if c.lower().endswith(ext):
|
|
155
|
-
return c
|
|
156
|
-
return candidates[0]
|
|
225
|
+
return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
|
|
226
|
+
return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
|
|
157
227
|
|
|
158
228
|
|
|
159
229
|
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
|
174
244
|
|
|
175
245
|
# unzip in place, remove zip for convenience
|
|
176
246
|
api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
|
|
177
|
-
|
|
247
|
+
artifact = _pick_best_file(target_dir)
|
|
178
248
|
return {
|
|
179
249
|
"ok": True,
|
|
180
250
|
"dataset_id": dataset_ref,
|
|
181
251
|
"target_dir": target_dir,
|
|
182
|
-
"local_path":
|
|
252
|
+
"local_path": artifact["local_path"],
|
|
253
|
+
"dataset_kind": artifact["dataset_kind"],
|
|
254
|
+
"image_count": artifact.get("image_count", 0),
|
|
183
255
|
}
|
|
184
256
|
except Exception as e:
|
|
185
257
|
msg = str(e)
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize any supported dataset file to parquet format.
|
|
3
|
+
Usage: normalize_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Read the dataset file at ``src`` into a polars DataFrame.

    The reader is picked from the lower-cased file extension:

    * ``.tsv`` / ``.tab``: tab-separated CSV
    * ``.parquet`` / ``.pq``: parquet
    * ``.feather`` / ``.ftr`` / ``.arrow`` / ``.ipc``: Arrow IPC
    * ``.jsonl`` / ``.ndjson``: newline-delimited JSON
    * ``.json``: a JSON array, NDJSON content, or a wrapper object holding the
      rows under a well-known key (see the inline comments)
    * anything else (including ``.csv`` and ``.txt``): comma-separated CSV

    Args:
        src: Path of the file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Whatever the underlying polars reader, ``open`` or
            ``json.loads`` raises for unreadable or malformed input.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager so the handle is closed even if read() raises.
        with open(src, "r", encoding="utf-8") as handle:
            raw = handle.read().strip()

        if raw.startswith("["):
            return pl.read_json(src)

        # NDJSON stored with a .json suffix: only accept it when the first line
        # is a complete JSON object on its own. Checking just for a leading "{"
        # would also match a pretty-printed wrapper object, which must fall
        # through to the wrapper handling below.
        first_line, newline, _rest = raw.partition("\n")
        if newline and first_line.strip().startswith("{"):
            try:
                first_line_is_object = isinstance(json.loads(first_line), dict)
            except ValueError:
                first_line_is_object = False
            if first_line_is_object:
                return pl.read_ndjson(src)

        # Wrapper object: rows live under a conventional key.
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if isinstance(obj.get(key), list):
                    return pl.DataFrame(obj[key])
            # Last resort: the first non-empty list of dicts found among the values.
            for value in obj.values():
                if isinstance(value, list) and value and isinstance(value[0], dict):
                    return pl.DataFrame(value)
        return pl.read_json(src)

    # .csv, .txt and every unrecognised extension are parsed as comma-separated text.
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize(input_path: str, output_path: str) -> int:
    """Load ``input_path`` and write it to ``output_path`` as parquet.

    Args:
        input_path: Dataset file to read; it is loaded through ``_load``.
        output_path: Destination parquet file. Missing parent directories are
            created.

    Returns:
        The number of rows in the loaded frame (``df.height``).
    """
    df = _load(input_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main():
    """Command-line entry point: ``normalize_engine.py <input> <output>``.

    Prints exactly one JSON object to stdout. On success it is
    ``{"ok": true, "output_path": ..., "rows": N}``. On a usage error, a
    missing input file, or any exception raised while normalizing, it is
    ``{"ok": false, "error": ...}`` and the process exits with status 1.
    """

    def _report(payload, exit_code=None):
        # Single place that emits the JSON result and optionally terminates.
        print(json.dumps(payload))
        if exit_code is not None:
            sys.exit(exit_code)

    if len(sys.argv) < 3:
        _report({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}, 1)

    input_path, output_path = sys.argv[1], sys.argv[2]

    if not os.path.exists(input_path):
        _report({"ok": False, "error": f"File not found: {input_path}"}, 1)

    try:
        rows = normalize(input_path, output_path)
        _report({"ok": True, "output_path": output_path, "rows": rows})
    except Exception as e:
        _report({"ok": False, "error": str(e)}, 1)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
main()
|
|
@@ -191,6 +191,7 @@ class AssetDownloader:
|
|
|
191
191
|
kaggle_ref: Optional[str] = None,
|
|
192
192
|
urls: Optional[List[str]] = None,
|
|
193
193
|
output_format: str = "webdataset",
|
|
194
|
+
output_dir: Optional[str] = None,
|
|
194
195
|
max_items: Optional[int] = None,
|
|
195
196
|
image_column: Optional[str] = None,
|
|
196
197
|
) -> Dict[str, Any]:
|
|
@@ -231,7 +232,10 @@ class AssetDownloader:
|
|
|
231
232
|
raise ValueError("urls are required for source=url")
|
|
232
233
|
|
|
233
234
|
# --- Now safe to create directories ---
|
|
234
|
-
|
|
235
|
+
if output_dir:
|
|
236
|
+
dataset_dir = Path(output_dir).expanduser().resolve()
|
|
237
|
+
else:
|
|
238
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
235
239
|
images_dir = dataset_dir / "images"
|
|
236
240
|
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
237
241
|
images_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { CacheService, MockRedisProvider } from "../cache/service.js";
|
|
2
|
+
import { ArxivSource } from "../metadata/arxiv-source.js";
|
|
3
|
+
import { GithubSource } from "../metadata/github-source.js";
|
|
4
|
+
import { SemanticScholarSource } from "../metadata/semantic-scholar-source.js";
|
|
5
|
+
import { HackerNewsSource } from "../metadata/hackernews-source.js";
|
|
6
|
+
import { WebCoreEngine } from "../web/web-core.js";
|
|
7
|
+
/**
 * Nearest-rank percentile of a list of numbers.
 *
 * @param {number[]} values Samples; the input array is not modified.
 * @param {number} p Fraction in [0, 1], e.g. 0.95 for p95.
 * @returns {number} The sample at rank ceil(p * n), clamped to the first
 *   element; 0 when `values` is empty or the slot is null/undefined.
 */
function percentile(values, p) {
    if (values.length === 0) {
        return 0;
    }
    const ascending = [...values];
    ascending.sort((left, right) => left - right);
    // Convert the 1-based nearest rank to a 0-based index, never below 0.
    const rank = Math.ceil(p * ascending.length) - 1;
    const position = Math.max(0, rank);
    const picked = ascending[position];
    return picked ?? 0;
}
|
|
14
|
+
/**
 * Build the fixed list of 100 test queries.
 *
 * Query i combines topic (i mod #topics) with suffix
 * (floor(i / #topics) mod #suffixes), so the topics cycle fastest and the
 * suffix changes once per full pass over the topics.
 *
 * @returns {string[]} 100 strings of the form "<topic> <suffix>".
 */
function makeQueries() {
    const topics = [
        "agentic RAG evaluation",
        "tool-augmented retrieval",
        "LLM dataset quality scoring",
        "semantic deduplication embeddings",
        "cross-source dataset fusion",
        "retrieval augmented generation metrics",
        "dataset export parquet arrow jsonl",
        "data safety dataset provenance",
        "synthetic data generation alignment",
        "multi-source corpus building",
    ];
    const suffixes = [
        "paper",
        "benchmark",
        "latency",
        "quality",
        "dedup",
        "provenance",
        "evaluation",
        "pipeline",
        "MCP",
        "agents",
    ];
    const total = 100;
    return Array.from({ length: total }, (_unused, i) => {
        const topic = topics[i % topics.length];
        const suffix = suffixes[Math.floor(i / topics.length) % suffixes.length];
        return `${topic} ${suffix}`.trim();
    });
}
|
|
47
|
+
/**
 * Phase 1 Web Core validation run.
 *
 * 1. Issues the same query twice against a WebCoreEngine backed by a
 *    MockRedisProvider cache and prints the second run's per-source
 *    cache_hit / latency_ms telemetry for arxiv and github.
 * 2. Runs every query from makeQueries(), collecting each result's
 *    quality_score and the arxiv pdf_extract_ms_total telemetry.
 * 3. Prints the p95 of both series and PASS/FAIL against the thresholds
 *    (quality p95 > 0.9, PDF extract p95 < 5000 ms).
 *
 * The function only logs; it does not change the process exit code on FAIL.
 */
async function main() {
    const cache = new CacheService(new MockRedisProvider());
    const engine = new WebCoreEngine({
        arxivSource: new ArxivSource(cache),
        githubSource: new GithubSource(cache),
        semanticScholarSource: new SemanticScholarSource(cache),
        hackerNewsSource: new HackerNewsSource(cache),
    });
    const baseQuery = "agentic RAG evaluation";
    const flags = { arxiv_full_text: true, github_include_readme: true };
    const sources = ["arxiv", "github"];
    console.log("Phase 1 Web Core validation:");
    console.log("- Checking cached latency regression on a single query...");
    // First call's result is not inspected; presumably it populates the cache
    // so the second call can report cache hits (confirm against WebCoreEngine).
    await engine.find({ query: baseQuery, sources: [...sources], limit: 2, ...flags });
    const run2 = await engine.find({ query: baseQuery, sources: [...sources], limit: 2, ...flags });
    const run2Arxiv = run2.telemetry?.per_source.find((t) => t.source === "arxiv");
    const run2Github = run2.telemetry?.per_source.find((t) => t.source === "github");
    console.log("Cached telemetry (run2):", {
        arxiv: run2Arxiv ? { cache_hit: run2Arxiv.cache_hit, latency_ms: run2Arxiv.latency_ms } : null,
        github: run2Github ? { cache_hit: run2Github.cache_hit, latency_ms: run2Github.latency_ms } : null,
    });
    // 100 query quality distribution test
    console.log("- Running 100 test queries (quality distribution + extraction latency)...");
    const queries = makeQueries();
    const qualityScores = [];
    const pdfExtractMs = [];
    let skippedScores = 0;
    for (let i = 0; i < queries.length; i++) {
        const q = queries[i];
        const res = await engine.find({ query: q, sources: [...sources], limit: 2, ...flags });
        // Tolerate a response without a results array instead of throwing mid-run.
        for (const doc of res.results ?? []) {
            const score = Number(doc.quality_score);
            // A missing/non-numeric score becomes NaN; NaN makes the numeric
            // comparator used by percentile() inconsistent, so the resulting
            // order (and therefore the p95) would be unreliable. Skip and count it.
            if (Number.isFinite(score)) {
                qualityScores.push(score);
            }
            else {
                skippedScores++;
            }
        }
        const arxivTelemetry = res.telemetry?.per_source.find((t) => t.source === "arxiv");
        if (arxivTelemetry) {
            pdfExtractMs.push(Number(arxivTelemetry.pdf_extract_ms_total || 0));
        }
        if ((i + 1) % 10 === 0) {
            // Denominator follows the actual query count rather than a literal.
            console.log(`  progress: ${i + 1}/${queries.length}`);
        }
    }
    const q95 = percentile(qualityScores, 0.95);
    const pdfP95 = percentile(pdfExtractMs, 0.95);
    console.log("\nResults:");
    console.log(`- Quality score p95: ${q95}`);
    console.log(`- PDF extract ms p95 (arxiv): ${pdfP95}`);
    if (skippedScores > 0) {
        console.log(`- Skipped ${skippedScores} result(s) without a numeric quality_score`);
    }
    const okQuality = q95 > 0.9;
    const okPdfLatency = pdfP95 < 5000;
    console.log(`\nSuccess criteria:`);
    console.log(`- Quality p95 > 0.9: ${okQuality ? "PASS" : "FAIL"}`);
    console.log(`- PDF extract p95 < 5000ms additional: ${okPdfLatency ? "PASS" : "FAIL"}`);
    if (!okQuality) {
        console.log("Tip: adjust estimateQualityScore() weights/thresholds in src/metadata/quality.ts then rerun.");
    }
}
|
|
101
|
+
main().catch((e) => {
|
|
102
|
+
console.error(e);
|
|
103
|
+
process.exit(1);
|
|
104
|
+
});
|
package/build/search/engine.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { JITOrchestrator } from "./jit-orchestrator.js";
|
|
2
|
+
import { analyzeDatasetQuery, scoreDatasetAgainstIntent, shouldExcludeByLanguage } from "./query-intent.js";
|
|
2
3
|
import fs from "fs";
|
|
3
4
|
function log(msg) {
|
|
4
5
|
fs.appendFileSync("debug.log", new Date().toISOString() + " " + msg + "\n");
|
|
@@ -17,9 +18,10 @@ export class SearchEngine {
|
|
|
17
18
|
async search(query, options = {}) {
|
|
18
19
|
const limit = options.limit || 5;
|
|
19
20
|
const enableJIT = options.enableJIT !== false; // Default: true
|
|
21
|
+
const intent = await analyzeDatasetQuery(query);
|
|
20
22
|
log(`Search request: "${query}" Limit=${limit} Safe=${options.safeOnly} JIT=${enableJIT}`);
|
|
21
23
|
// 1. Perform local search
|
|
22
|
-
const localResults = await this.localSearch(query, options);
|
|
24
|
+
const localResults = await this.localSearch(query, options, intent);
|
|
23
25
|
// 2. Check if JIT should be triggered
|
|
24
26
|
const shouldTrigger = enableJIT && this.shouldTriggerJIT(localResults, query);
|
|
25
27
|
if (!shouldTrigger) {
|
|
@@ -28,10 +30,10 @@ export class SearchEngine {
|
|
|
28
30
|
}
|
|
29
31
|
// 3. Trigger JIT fallback
|
|
30
32
|
console.error(`\nWARNING: Low confidence results (${localResults.length} found, top score: ${localResults[0]?.relevance_score || 0})`);
|
|
31
|
-
await this.jitOrchestrator.fetchAndIngest(query, 10);
|
|
33
|
+
await this.jitOrchestrator.fetchAndIngest(query, 10, intent);
|
|
32
34
|
// 4. Re-run local search with updated index
|
|
33
35
|
console.error(`Re-searching with updated library...`);
|
|
34
|
-
const enhancedResults = await this.localSearch(query, options);
|
|
36
|
+
const enhancedResults = await this.localSearch(query, options, intent);
|
|
35
37
|
const newCount = enhancedResults.length - localResults.length;
|
|
36
38
|
if (newCount > 0) {
|
|
37
39
|
console.error(`Found ${newCount} additional results\n`);
|
|
@@ -41,7 +43,7 @@ export class SearchEngine {
|
|
|
41
43
|
/**
|
|
42
44
|
* Perform hybrid search (Vector + Lexical + Penalties)
|
|
43
45
|
*/
|
|
44
|
-
async localSearch(query, options) {
|
|
46
|
+
async localSearch(query, options, intent) {
|
|
45
47
|
const limit = options.limit || 5;
|
|
46
48
|
// 1. Parse Query
|
|
47
49
|
const words = query.toLowerCase().split(/\s+/);
|
|
@@ -66,6 +68,12 @@ export class SearchEngine {
|
|
|
66
68
|
// Filter: Safe only
|
|
67
69
|
if (options.safeOnly && metadata.license.category === "restricted")
|
|
68
70
|
continue;
|
|
71
|
+
// Filter: Hard language exclusion — completely drop bilingual/multilingual datasets
|
|
72
|
+
// when user explicitly requested a single language
|
|
73
|
+
if (shouldExcludeByLanguage(metadata, intent)) {
|
|
74
|
+
log(`Language exclusion: Dropped ${match.id} (bilingual/multilingual for single-language query)`);
|
|
75
|
+
continue;
|
|
76
|
+
}
|
|
69
77
|
const text = `${metadata.name} ${metadata.description} ${metadata.tags.join(" ")}`.toLowerCase();
|
|
70
78
|
// Filter: Explicit Negative Keywords
|
|
71
79
|
if (negativeKeywords.some(neg => text.includes(neg))) {
|
|
@@ -96,6 +104,34 @@ export class SearchEngine {
|
|
|
96
104
|
if (lexicalScore === 0 && positiveKeywords.length > 1) {
|
|
97
105
|
penalty += 0.2;
|
|
98
106
|
}
|
|
107
|
+
// Penalty: Modality Mismatch
|
|
108
|
+
// Infer the expected modality from the query and penalize mismatches.
|
|
109
|
+
// e.g., "anime quotes" is text, so image-classification datasets get penalized.
|
|
110
|
+
const textIndicators = ["quotes", "text", "nlp", "sentiment", "review", "comment", "caption", "dialogue", "chat", "translation", "summarization", "classification"];
|
|
111
|
+
const imageIndicators = ["image", "photo", "picture", "vision", "detection", "segmentation", "face", "background"];
|
|
112
|
+
const queryLower = query.toLowerCase();
|
|
113
|
+
const queryLooksText = textIndicators.some(t => queryLower.includes(t));
|
|
114
|
+
const queryLooksImage = imageIndicators.some(t => queryLower.includes(t));
|
|
115
|
+
if (queryLooksText && !queryLooksImage) {
|
|
116
|
+
const resultTask = (metadata.task || "").toLowerCase();
|
|
117
|
+
const isImageResult = resultTask.includes("image") || resultTask.includes("object-detection") ||
|
|
118
|
+
text.includes("image classification") || text.includes("image-classification") ||
|
|
119
|
+
text.includes("object detection") || text.includes("image segmentation");
|
|
120
|
+
if (isImageResult) {
|
|
121
|
+
log(`Modality penalty: text query but image dataset ${match.id}`);
|
|
122
|
+
penalty += 0.35;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
if (queryLooksImage && !queryLooksText) {
|
|
126
|
+
const resultTask = (metadata.task || "").toLowerCase();
|
|
127
|
+
const isTextResult = resultTask.includes("text-classification") || resultTask.includes("text-generation") ||
|
|
128
|
+
resultTask.includes("translation") || resultTask.includes("summarization") ||
|
|
129
|
+
resultTask.includes("question-answering");
|
|
130
|
+
if (isTextResult) {
|
|
131
|
+
log(`Modality penalty: image query but text dataset ${match.id}`);
|
|
132
|
+
penalty += 0.35;
|
|
133
|
+
}
|
|
134
|
+
}
|
|
99
135
|
// D. Accessibility Bonuses (Prioritize low-friction sources)
|
|
100
136
|
let bonus = 0;
|
|
101
137
|
const sourceBonuses = {
|
|
@@ -103,16 +139,19 @@ export class SearchEngine {
|
|
|
103
139
|
"uci": 0.1,
|
|
104
140
|
"github": 0.1,
|
|
105
141
|
"worldbank": 0.1,
|
|
106
|
-
"nasa": 0.1
|
|
142
|
+
"nasa": 0.1,
|
|
143
|
+
"arxiv": 0.1
|
|
107
144
|
};
|
|
108
145
|
bonus = sourceBonuses[metadata.source] || 0;
|
|
109
146
|
// Final Combined Score
|
|
110
147
|
// 70% Vector, 30% Lexical, minus Penalties, plus Bonuses
|
|
111
|
-
const
|
|
148
|
+
const intentScore = scoreDatasetAgainstIntent(metadata, intent);
|
|
149
|
+
const finalScore = (vectorScore * 0.7) + (lexicalScore * 0.3) - penalty + bonus + intentScore;
|
|
112
150
|
metadata.relevance_score = Math.round(finalScore * 100) / 100;
|
|
113
151
|
metadata.vector_score = Math.round(vectorScore * 100) / 100;
|
|
114
152
|
metadata.lexical_score = Math.round(lexicalScore * 100) / 100;
|
|
115
153
|
metadata.accessibility_bonus = bonus;
|
|
154
|
+
metadata.intent_score = intentScore;
|
|
116
155
|
results.push(metadata);
|
|
117
156
|
}
|
|
118
157
|
// Sort by final score and limit
|
|
@@ -2,6 +2,7 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
|
2
2
|
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
3
3
|
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
4
4
|
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
5
|
+
import { analyzeDatasetQuery, buildIntentSearchQuery } from "./query-intent.js";
|
|
5
6
|
// Common stop words to filter out for better search
|
|
6
7
|
const STOP_WORDS = new Set([
|
|
7
8
|
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
|
@@ -61,7 +62,7 @@ export class JITOrchestrator {
|
|
|
61
62
|
/**
|
|
62
63
|
* Main JIT workflow: fetch, save, index, return new datasets
|
|
63
64
|
*/
|
|
64
|
-
async fetchAndIngest(query, limit = 10) {
|
|
65
|
+
async fetchAndIngest(query, limit = 10, providedIntent) {
|
|
65
66
|
// Rate limiting check
|
|
66
67
|
if (!this.canTrigger(query)) {
|
|
67
68
|
console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
|
|
@@ -69,9 +70,12 @@ export class JITOrchestrator {
|
|
|
69
70
|
}
|
|
70
71
|
console.error(`\n[JIT] Searching live sources for: "${query}"`);
|
|
71
72
|
this.lastTriggerTime.set(query, Date.now());
|
|
72
|
-
|
|
73
|
-
const keywords = this.simplifyQuery(
|
|
74
|
-
if (
|
|
73
|
+
const intent = providedIntent || await analyzeDatasetQuery(query);
|
|
74
|
+
const keywords = this.simplifyQuery(buildIntentSearchQuery(intent));
|
|
75
|
+
if (intent.llmBacked || intent.language || intent.task || intent.domain || intent.minRows) {
|
|
76
|
+
console.error(`[JIT] Intent: ${JSON.stringify({ language: intent.language, task: intent.task, domain: intent.domain, minRows: intent.minRows, searchQuery: intent.searchQuery })}`);
|
|
77
|
+
}
|
|
78
|
+
else if (keywords.length > 0) {
|
|
75
79
|
console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
|
|
76
80
|
}
|
|
77
81
|
const newDatasets = [];
|
|
@@ -81,15 +85,16 @@ export class JITOrchestrator {
|
|
|
81
85
|
// Get existing dataset IDs to avoid duplicates
|
|
82
86
|
const existing = this.metadataStore.getAllDatasets();
|
|
83
87
|
existing.forEach(ds => existingIds.add(ds.id));
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
88
|
+
let hfResults = await this.scrapeHuggingFace(intent, limit);
|
|
89
|
+
if (hfResults.length < Math.max(3, Math.floor(limit / 2))) {
|
|
90
|
+
for (const keyword of keywords) {
|
|
91
|
+
if (hfResults.length >= limit)
|
|
92
|
+
break;
|
|
93
|
+
const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / Math.max(keywords.length, 1)));
|
|
94
|
+
for (const ds of results) {
|
|
95
|
+
if (!hfResults.some(existing => existing.id === ds.id)) {
|
|
96
|
+
hfResults.push(ds);
|
|
97
|
+
}
|
|
93
98
|
}
|
|
94
99
|
}
|
|
95
100
|
}
|
|
@@ -170,7 +175,6 @@ export class JITOrchestrator {
|
|
|
170
175
|
async scrapeHuggingFace(query, limit) {
|
|
171
176
|
const scraper = new HuggingFaceScraper();
|
|
172
177
|
try {
|
|
173
|
-
// Pass the query as a general search term
|
|
174
178
|
return await scraper.scrape(limit, true, query);
|
|
175
179
|
}
|
|
176
180
|
catch (error) {
|