@vespermcp/mcp-server 1.1.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/fusion/engine.js +69 -0
- package/build/index.js +900 -50
- package/build/ingestion/hf-downloader.js +12 -3
- package/build/ingestion/ingestor.js +33 -9
- package/build/ingestion/kaggle-downloader.js +2 -2
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/scraper.js +34 -10
- package/build/python/config.py +259 -0
- package/build/python/export_engine.py +148 -52
- package/build/python/fusion_engine.py +368 -0
- package/build/python/kaggle_engine.py +204 -0
- package/build/python/row_count.py +54 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/scripts/build-index.js +5 -5
- package/build/search/jit-orchestrator.js +72 -12
- package/build/tools/formatter.js +14 -14
- package/package.json +9 -3
- package/scripts/refresh-index.cjs +87 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/config.py +259 -0
- package/src/python/export_engine.py +148 -52
- package/src/python/fusion_engine.py +368 -0
- package/src/python/kaggle_engine.py +204 -0
- package/src/python/row_count.py +54 -0
- package/src/python/test_fusion_engine.py +89 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import sys
import os
import json
import tempfile
from typing import Dict, Any, List

# Project-local config store; get_all() returns saved credential entries
# (used below as a fallback when KAGGLE_* env vars are absent).
from config import get_all

# kaggle is an optional dependency: record availability instead of crashing
# at import time so callers get a friendly JSON error from _ensure_auth().
try:
    from kaggle.api.kaggle_api_extended import KaggleApi
    HAS_KAGGLE = True
except Exception:
    HAS_KAGGLE = False
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _ensure_auth() -> Dict[str, Any]:
    """Authenticate against the Kaggle API, resolving credentials lazily.

    Credential priority:
      1) existing KAGGLE_USERNAME / KAGGLE_KEY environment variables,
      2) the secure local store (keyring or ~/.vesper/config.toml),
      3) ~/.kaggle/kaggle.json, which KaggleApi.authenticate() reads itself.

    Returns:
        {"ok": True, "api": KaggleApi} on success, otherwise
        {"ok": False, "error": ..., "details"?: ...}.
    """
    if not HAS_KAGGLE:
        return {
            "ok": False,
            "error": "kaggle package not installed. Install with: pip install kaggle",
        }

    # Only consult the secure store when the env vars are not already set.
    creds_present = os.getenv("KAGGLE_USERNAME") and os.getenv("KAGGLE_KEY")
    if not creds_present:
        stored = get_all()
        username = stored.get("kaggle_username")
        key = stored.get("kaggle_key")
        if username and key:
            os.environ["KAGGLE_USERNAME"] = username
            os.environ["KAGGLE_KEY"] = key

    client = KaggleApi()
    try:
        client.authenticate()
    except Exception as exc:
        return {
            "ok": False,
            "error": "Kaggle requires API key — run 'vespermcp config keys' (30 seconds) or provide ~/.kaggle/kaggle.json",
            "details": str(exc),
        }

    return {"ok": True, "api": client}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _dataset_to_dict(ds) -> Dict[str, Any]:
    """Normalize a Kaggle dataset object into Vesper's common metadata dict.

    Kaggle API object attribute names differ by client version, so every
    field is read defensively with getattr and a safe fallback.

    Args:
        ds: A dataset object returned by KaggleApi.dataset_list().

    Returns:
        A dict matching the schema used by the other metadata sources,
        with conservative defaults for fields Kaggle does not expose
        (license, splits, column info).
    """
    # Fix: the original also computed an owner name (creatorName/ownerName)
    # into a local that was never used anywhere; the dead lookup is removed.
    ref = getattr(ds, "ref", None) or getattr(ds, "datasetRef", None) or ""
    title = getattr(ds, "title", None) or ref
    subtitle = getattr(ds, "subtitle", None) or ""
    votes = int(getattr(ds, "voteCount", 0) or 0)
    downloads = int(getattr(ds, "downloadCount", 0) or 0)
    size = int(getattr(ds, "totalBytes", 0) or 0)
    last_updated = str(getattr(ds, "lastUpdated", ""))

    # Tags may be objects carrying a .name or plain strings depending on
    # the kaggle client version.
    tags = [getattr(t, "name", str(t)) for t in (getattr(ds, "tags", None) or [])]

    return {
        "id": ref,
        "source": "kaggle",
        "name": title,
        "description": subtitle or title,
        "downloads": downloads,
        "likes": votes,
        "stars": 0,
        "tags": tags,
        "last_updated": last_updated,
        "task": "unknown",
        "domain": "unknown",
        "languages": [],
        # Kaggle does not report row counts up front; only total bytes.
        "splits": [{"name": "data", "num_examples": 0, "size_bytes": size}],
        "license": {
            "id": "unknown",
            "name": "unknown",
            "category": "unknown",
            "usage_restrictions": [],
            "warnings": ["Kaggle license details may vary by dataset"],
        },
        # Conservative default score until the dataset card is inspected.
        "quality_score": 40,
        "quality_warnings": ["Review dataset card and competition rules before use"],
        "download_url": f"https://www.kaggle.com/datasets/{ref}",
        "format": None,
        "total_examples": 0,
        "total_size_bytes": size,
        "total_size_mb": round(size / (1024 * 1024), 2) if size else 0,
        "columns": [],
        "is_structured": False,
        "has_target_column": False,
        "is_safe_source": True,
        "has_personal_data": False,
        "is_paywalled": False,
        "is_scraped_web_data": False,
        "uses_https": True,
        "has_train_split": False,
        "has_test_split": False,
        "has_validation_split": False,
        "description_length": len(subtitle or title),
        "has_readme": True,
    }
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search Kaggle for datasets matching *query*.

    Returns {"ok": True, "results": [...], "count": n} with normalized
    metadata dicts, or an error envelope from authentication / the
    Kaggle client.
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]
    try:
        # Clamp to Kaggle's accepted page-size window (1..100).
        page_size = max(1, min(limit, 100))
        found = api.dataset_list(search=query, page_size=page_size)
        normalized = [_dataset_to_dict(entry) for entry in found[:limit]]
        return {"ok": True, "results": normalized, "count": len(normalized)}
    except Exception as e:
        return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _pick_best_file(root: str) -> str:
    """Return the most suitable data file found under *root*.

    Walks the directory tree, keeps files with a recognized tabular/data
    extension, and returns the best match by format preference (parquet
    first, arrow last). Ties within the same format resolve to os.walk
    discovery order, matching the original scan behavior.

    Fix: the original kept two separately-maintained copies of the
    extension list (one for filtering, one for ranking) plus an
    unreachable fallback return; both are collapsed into a single
    priority tuple.

    Raises:
        RuntimeError: if no file with a recognized extension exists.
    """
    # Single source of truth: this tuple both filters candidates and
    # defines their preference order (best format first).
    priorities = (".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow")

    candidates: List[str] = []
    for base, _, files in os.walk(root):
        for name in files:
            if name.lower().endswith(priorities):
                candidates.append(os.path.join(base, name))

    if not candidates:
        raise RuntimeError("No suitable data file found after download")

    def rank(path: str) -> int:
        # Every candidate matched one of the priority suffixes above.
        lower = path.lower()
        return next(i for i, ext in enumerate(priorities) if lower.endswith(ext))

    # min() is stable: the first-discovered candidate of the best format wins.
    return min(candidates, key=rank)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download a Kaggle dataset and locate its primary data file.

    Accepts either an "owner/name" ref or a full kaggle.com URL. When
    *target_dir* is empty, a fresh temp directory is created. The archive
    is unzipped in place and the best tabular file inside is selected.

    Returns an {"ok": ...} envelope; 401 and 429 failures are mapped to
    actionable error messages.
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]

    destination = target_dir or tempfile.mkdtemp(prefix="vesper_kaggle_")
    os.makedirs(destination, exist_ok=True)

    try:
        # Accept full URLs by stripping everything up to the owner/name ref.
        marker = "kaggle.com/datasets/"
        if marker in dataset_ref:
            dataset_ref = dataset_ref.split(marker)[1].lstrip("/")

        # unzip in place, remove zip for convenience
        api.dataset_download_files(dataset_ref, path=destination, unzip=True, quiet=True)
        return {
            "ok": True,
            "dataset_id": dataset_ref,
            "target_dir": destination,
            "local_path": _pick_best_file(destination),
        }
    except Exception as e:
        msg = str(e)
        if "401" in msg or "Unauthorized" in msg:
            return {"ok": False, "error": "Invalid Kaggle credentials (401). Run 'vespermcp config kaggle' again."}
        if "429" in msg or "Too Many Requests" in msg:
            return {"ok": False, "error": "Kaggle rate limit reached. Please retry later."}
        return {"ok": False, "error": f"Kaggle download failed: {msg}"}
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _fail(message: str) -> None:
    """Print a JSON error envelope on stdout and exit with status 1."""
    print(json.dumps({"ok": False, "error": message}))
    sys.exit(1)


def main():
    """CLI entry point: dispatch the `discover` / `download` subcommands.

    Results and errors are emitted as single-line JSON on stdout so the
    Node.js host process can parse them.
    """
    argv = sys.argv
    if len(argv) < 2:
        _fail("Usage: kaggle_engine.py <discover|download> ...")

    command = argv[1]

    if command == "discover":
        if len(argv) < 3:
            _fail("Usage: kaggle_engine.py discover <query> [limit]")
        limit = int(argv[3]) if len(argv) > 3 else 20
        print(json.dumps(discover(argv[2], limit)))
    elif command == "download":
        if len(argv) < 3:
            _fail("Usage: kaggle_engine.py download <dataset_ref> [target_dir]")
        target_dir = argv[3] if len(argv) > 3 else ""
        print(json.dumps(download(argv[2], target_dir)))
    else:
        _fail(f"Unknown command: {command}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import sys
import json
import os

# polars is a hard requirement for fast row counting; fail early with a
# machine-readable JSON error so the Node.js caller can surface it.
try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required"}))
    sys.exit(1)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def count_rows(path: str) -> int:
    """Return the number of rows in a tabular data file.

    Uses polars lazy scans where possible, so large files are counted
    without materializing their rows; unknown extensions fall back to an
    eager CSV read.
    """

    def lazy_count(frame) -> int:
        # Collapse a LazyFrame to its row count without loading the data.
        return int(frame.select(pl.len()).collect().item())

    ext = os.path.splitext(path)[1].lower()

    if ext == ".csv":
        # Faster than a full read for large csv
        return lazy_count(pl.scan_csv(path, ignore_errors=True))
    if ext in (".parquet", ".pq"):
        return lazy_count(pl.scan_parquet(path))
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return lazy_count(pl.scan_ipc(path))
    if ext in (".jsonl", ".ndjson"):
        return lazy_count(pl.scan_ndjson(path))
    if ext == ".json":
        # Plain JSON arrays need an eager read; retry as NDJSON if that fails.
        try:
            return int(pl.read_json(path).height)
        except Exception:
            return lazy_count(pl.scan_ndjson(path))

    # Unknown extension: best-effort eager CSV parse.
    return int(pl.read_csv(path, ignore_errors=True).height)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def main():
    """CLI entry point: print a JSON row count for the file given in argv.

    Output is a single JSON line ({"ok": true, "rows": n} or an error
    envelope) so the Node.js host can parse it; failures exit nonzero.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
        sys.exit(1)

    target = sys.argv[1]
    if not os.path.exists(target):
        print(json.dumps({"ok": False, "error": f"File not found: {target}"}))
        sys.exit(1)

    try:
        payload = {"ok": True, "rows": count_rows(target)}
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
    print(json.dumps(payload))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import polars as pl
|
|
4
|
+
from fusion_engine import fuse_datasets
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def run_basic_tests():
    """Smoke-test fuse_datasets: a concat with dedup, then a join with
    conflicting column names. Fixture CSVs are written to the system
    temp directory and left in place for inspection.
    """
    workdir = tempfile.gettempdir()

    def write_fixture(filename, columns):
        # Write a small fixture CSV into the temp dir and return its path.
        path = os.path.join(workdir, filename)
        pl.DataFrame(columns).write_csv(path)
        return path

    # ----- Test 1: concat -----
    src_a = write_fixture("fuse_test_a.csv", {
        "id": [1, 2, 3],
        "text": ["a", "b", "c"],
        "price": [10.0, 20.0, 30.0],
    })
    # Second source shares one row (id=3) to exercise dedup, and adds an
    # extra column to exercise schema alignment.
    src_b = write_fixture("fuse_test_b.csv", {
        "id": [4, 5, 3],
        "text": ["d", "e", "c"],
        "price": [40.0, 50.0, 30.0],
        "image_path": ["img1.jpg", "img2.jpg", "img3.jpg"],
    })
    out_concat = os.path.join(workdir, "fuse_test_concat.feather")

    concat_res = fuse_datasets(
        sources=[src_a, src_b],
        strategy="concat",
        dedup=True,
        run_quality_after=False,
        leakage_check=True,
        output_path=out_concat,
        output_format="feather",
        compression="lz4",
        preview=True,
        id_column="id",
    )

    assert concat_res.get("success") is True, f"Concat failed: {concat_res}"
    assert os.path.exists(out_concat), "Concat output file missing"

    # ----- Test 2: join with conflicting column names -----
    # Both sides carry a "price" column so the engine must rename one.
    src_left = write_fixture("fuse_test_c.csv", {
        "id": [1, 2, 3],
        "price": [100, 200, 300],
        "text": ["x", "y", "z"],
    })
    src_right = write_fixture("fuse_test_d.csv", {
        "id": [2, 3, 4],
        "price": [999, 888, 777],
        "caption": ["two", "three", "four"],
    })
    out_join = os.path.join(workdir, "fuse_test_join.parquet")

    join_res = fuse_datasets(
        sources=[src_left, src_right],
        strategy="join",
        join_on="id",
        how="inner",
        dedup=True,
        run_quality_after=False,
        leakage_check=False,
        output_path=out_join,
        output_format="parquet",
        compression="snappy",
        preview=True,
    )

    assert join_res.get("success") is True, f"Join failed: {join_res}"
    assert os.path.exists(out_join), "Join output file missing"
    assert len(join_res.get("stats", {}).get("conflict_renames", [])) >= 1, "Expected conflict rename for price column"

    print("✅ Fusion tests passed")
    print("Concat:", concat_res["stats"])
    print("Join:", join_res["stats"])


if __name__ == "__main__":
    run_basic_tests()
|
|
@@ -13,7 +13,7 @@ async function main() {
|
|
|
13
13
|
// Filter to only new datasets
|
|
14
14
|
const toIndex = datasets.filter(ds => !indexedIds.has(ds.id));
|
|
15
15
|
console.error(`Total datasets: ${datasets.length}, Already indexed: ${indexedIds.size}, To index: ${toIndex.length}`);
|
|
16
|
-
const BATCH_SIZE =
|
|
16
|
+
const BATCH_SIZE = 20;
|
|
17
17
|
let processed = 0;
|
|
18
18
|
for (let i = 0; i < toIndex.length; i += BATCH_SIZE) {
|
|
19
19
|
const batch = toIndex.slice(i, i + BATCH_SIZE);
|
|
@@ -26,9 +26,9 @@ async function main() {
|
|
|
26
26
|
`Languages: ${ds.languages?.join(", ") || ""}`,
|
|
27
27
|
`Tags: ${ds.tags?.join(" ") || ""}`
|
|
28
28
|
].join(" ").slice(0, 1500));
|
|
29
|
-
//
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
// Memory-safe sequential embedding (avoids OOM on large libraries)
|
|
30
|
+
for (let idx = 0; idx < batch.length; idx++) {
|
|
31
|
+
const ds = batch[idx];
|
|
32
32
|
try {
|
|
33
33
|
const vector = await embedder.embed(texts[idx]);
|
|
34
34
|
vectorStore.add(ds.id, vector);
|
|
@@ -36,7 +36,7 @@ async function main() {
|
|
|
36
36
|
catch (err) {
|
|
37
37
|
console.error(`Failed to index ${ds.id}:`, err);
|
|
38
38
|
}
|
|
39
|
-
}
|
|
39
|
+
}
|
|
40
40
|
processed += batch.length;
|
|
41
41
|
if (processed % 100 === 0 || i + BATCH_SIZE >= toIndex.length) {
|
|
42
42
|
console.error(`Indexed ${processed}/${toIndex.length} new datasets...`);
|
|
@@ -2,6 +2,19 @@ import { HuggingFaceScraper } from "../metadata/scraper.js";
|
|
|
2
2
|
import { UCIScraper } from "../metadata/uci-scraper.js";
|
|
3
3
|
import { GitHubScraper } from "../metadata/github-scraper.js";
|
|
4
4
|
import { WorldBankScraper, NASAScraper } from "../metadata/institutional-scrapers.js";
|
|
5
|
+
// Common stop words to filter out for better search
|
|
6
|
+
const STOP_WORDS = new Set([
|
|
7
|
+
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
|
8
|
+
"of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
|
|
9
|
+
"be", "have", "has", "had", "do", "does", "did", "will", "would",
|
|
10
|
+
"could", "should", "may", "might", "must", "shall", "can", "need",
|
|
11
|
+
"about", "into", "through", "during", "before", "after", "above",
|
|
12
|
+
"below", "between", "under", "again", "further", "then", "once",
|
|
13
|
+
"here", "there", "when", "where", "why", "how", "all", "each",
|
|
14
|
+
"few", "more", "most", "other", "some", "such", "no", "nor", "not",
|
|
15
|
+
"only", "own", "same", "so", "than", "too", "very", "just", "also",
|
|
16
|
+
"dataset", "datasets", "data", "find", "search", "get", "looking"
|
|
17
|
+
]);
|
|
5
18
|
/**
|
|
6
19
|
* Just-In-Time Orchestrator
|
|
7
20
|
* Automatically fetches and indexes new datasets when local search is insufficient
|
|
@@ -17,6 +30,24 @@ export class JITOrchestrator {
|
|
|
17
30
|
this.vectorStore = vectorStore;
|
|
18
31
|
this.embedder = embedder;
|
|
19
32
|
}
|
|
33
|
+
/**
|
|
34
|
+
* Simplify a complex user query into keywords that work better with APIs
|
|
35
|
+
* HuggingFace and other APIs often fail on long multi-word queries
|
|
36
|
+
*/
|
|
37
|
+
simplifyQuery(query) {
|
|
38
|
+
// Split into words, lowercase, remove punctuation
|
|
39
|
+
const words = query.toLowerCase()
|
|
40
|
+
.replace(/[^\w\s-]/g, "")
|
|
41
|
+
.split(/\s+/)
|
|
42
|
+
.filter(w => w.length > 2 && !STOP_WORDS.has(w));
|
|
43
|
+
// Return unique keywords (max 3 for API-friendly queries)
|
|
44
|
+
const unique = [...new Set(words)];
|
|
45
|
+
// If we have a lot of words, prioritize longer/more specific ones
|
|
46
|
+
if (unique.length > 3) {
|
|
47
|
+
unique.sort((a, b) => b.length - a.length);
|
|
48
|
+
}
|
|
49
|
+
return unique.slice(0, 3);
|
|
50
|
+
}
|
|
20
51
|
/**
|
|
21
52
|
* Check if JIT should be triggered based on rate limiting
|
|
22
53
|
*/
|
|
@@ -33,20 +64,37 @@ export class JITOrchestrator {
|
|
|
33
64
|
async fetchAndIngest(query, limit = 10) {
|
|
34
65
|
// Rate limiting check
|
|
35
66
|
if (!this.canTrigger(query)) {
|
|
36
|
-
console.error(`[JIT]
|
|
67
|
+
console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
|
|
37
68
|
return [];
|
|
38
69
|
}
|
|
39
70
|
console.error(`\n[JIT] Searching live sources for: "${query}"`);
|
|
40
71
|
this.lastTriggerTime.set(query, Date.now());
|
|
72
|
+
// Simplify query for better API results
|
|
73
|
+
const keywords = this.simplifyQuery(query);
|
|
74
|
+
if (keywords.length > 0) {
|
|
75
|
+
console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
|
|
76
|
+
}
|
|
41
77
|
const newDatasets = [];
|
|
42
78
|
const existingIds = new Set();
|
|
79
|
+
const sourceResults = {};
|
|
43
80
|
try {
|
|
44
81
|
// Get existing dataset IDs to avoid duplicates
|
|
45
82
|
const existing = this.metadataStore.getAllDatasets();
|
|
46
83
|
existing.forEach(ds => existingIds.add(ds.id));
|
|
47
|
-
// 1. Scrape HuggingFace
|
|
48
|
-
|
|
49
|
-
|
|
84
|
+
// 1. Scrape HuggingFace - try each keyword separately for better results
|
|
85
|
+
let hfResults = [];
|
|
86
|
+
for (const keyword of keywords) {
|
|
87
|
+
if (hfResults.length >= limit)
|
|
88
|
+
break;
|
|
89
|
+
const results = await this.scrapeHuggingFace(keyword, Math.ceil(limit / keywords.length));
|
|
90
|
+
for (const ds of results) {
|
|
91
|
+
if (!hfResults.some(existing => existing.id === ds.id)) {
|
|
92
|
+
hfResults.push(ds);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
sourceResults["HuggingFace"] = hfResults.length;
|
|
97
|
+
console.error(` [source] HuggingFace: ${hfResults.length} datasets`);
|
|
50
98
|
for (const ds of hfResults) {
|
|
51
99
|
if (!existingIds.has(ds.id)) {
|
|
52
100
|
newDatasets.push(ds);
|
|
@@ -55,7 +103,8 @@ export class JITOrchestrator {
|
|
|
55
103
|
}
|
|
56
104
|
// 2. Scrape UCI (Open Access)
|
|
57
105
|
const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
|
|
58
|
-
|
|
106
|
+
sourceResults["UCI"] = uciResults.length;
|
|
107
|
+
console.error(` [source] UCI: ${uciResults.length} datasets`);
|
|
59
108
|
for (const ds of uciResults) {
|
|
60
109
|
if (!existingIds.has(ds.id)) {
|
|
61
110
|
newDatasets.push(ds);
|
|
@@ -64,38 +113,49 @@ export class JITOrchestrator {
|
|
|
64
113
|
}
|
|
65
114
|
// 3. Scrape GitHub (Open Access)
|
|
66
115
|
const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
|
|
67
|
-
|
|
116
|
+
sourceResults["GitHub"] = githubResults.length;
|
|
117
|
+
console.error(` [source] GitHub: ${githubResults.length} datasets`);
|
|
68
118
|
for (const ds of githubResults) {
|
|
69
119
|
if (!existingIds.has(ds.id)) {
|
|
70
120
|
newDatasets.push(ds);
|
|
71
121
|
existingIds.add(ds.id);
|
|
72
122
|
}
|
|
73
123
|
}
|
|
74
|
-
// 4. Scrape World Bank (Open Access)
|
|
124
|
+
// 4. Scrape World Bank (Open Access) - Economic/demographic data
|
|
75
125
|
const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
|
|
76
|
-
|
|
126
|
+
sourceResults["WorldBank"] = wbResults.length;
|
|
127
|
+
console.error(` [source] World Bank: ${wbResults.length} datasets`);
|
|
77
128
|
for (const ds of wbResults) {
|
|
78
129
|
if (!existingIds.has(ds.id)) {
|
|
79
130
|
newDatasets.push(ds);
|
|
80
131
|
existingIds.add(ds.id);
|
|
81
132
|
}
|
|
82
133
|
}
|
|
83
|
-
// 5. Scrape NASA (Open Access)
|
|
134
|
+
// 5. Scrape NASA (Open Access) - Scientific/space data
|
|
84
135
|
const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
|
|
85
|
-
|
|
136
|
+
sourceResults["NASA"] = nasaResults.length;
|
|
137
|
+
console.error(` [source] NASA: ${nasaResults.length} datasets`);
|
|
86
138
|
for (const ds of nasaResults) {
|
|
87
139
|
if (!existingIds.has(ds.id)) {
|
|
88
140
|
newDatasets.push(ds);
|
|
89
141
|
existingIds.add(ds.id);
|
|
90
142
|
}
|
|
91
143
|
}
|
|
92
|
-
//
|
|
144
|
+
// Save and index new datasets
|
|
93
145
|
if (newDatasets.length > 0) {
|
|
94
146
|
await this.saveAndIndex(newDatasets);
|
|
95
147
|
console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
|
|
96
148
|
}
|
|
97
149
|
else {
|
|
98
|
-
|
|
150
|
+
// Provide helpful feedback when no results found
|
|
151
|
+
const allZero = Object.values(sourceResults).every(v => v === 0);
|
|
152
|
+
if (allZero) {
|
|
153
|
+
console.error(`[JIT] No datasets found across all sources.`);
|
|
154
|
+
console.error(`[JIT] Try: broader keywords, or set HF_TOKEN for better HuggingFace access`);
|
|
155
|
+
}
|
|
156
|
+
else {
|
|
157
|
+
console.error(`[JIT] All found datasets already in index`);
|
|
158
|
+
}
|
|
99
159
|
}
|
|
100
160
|
return newDatasets;
|
|
101
161
|
}
|
package/build/tools/formatter.js
CHANGED
|
@@ -2,29 +2,29 @@
|
|
|
2
2
|
* Format job status for visual representation
|
|
3
3
|
*/
|
|
4
4
|
export function formatJobStatus(job) {
|
|
5
|
-
const
|
|
6
|
-
"pending": "
|
|
7
|
-
"queued": "
|
|
8
|
-
"running": "
|
|
9
|
-
"completed": "
|
|
10
|
-
"failed": "
|
|
11
|
-
"retrying": "
|
|
5
|
+
const statusMap = {
|
|
6
|
+
"pending": "PENDING",
|
|
7
|
+
"queued": "QUEUED",
|
|
8
|
+
"running": "RUNNING",
|
|
9
|
+
"completed": "COMPLETED",
|
|
10
|
+
"failed": "FAILED",
|
|
11
|
+
"retrying": "RETRYING"
|
|
12
12
|
};
|
|
13
|
-
const
|
|
13
|
+
const statusText = statusMap[job.status] || "UNKNOWN";
|
|
14
14
|
const barWidth = 20;
|
|
15
15
|
const filledWidth = Math.round((job.progress / 100) * barWidth);
|
|
16
16
|
const emptyWidth = barWidth - filledWidth;
|
|
17
17
|
const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
|
|
18
18
|
let output = `═ Job Status: ${job.type.toUpperCase()} ═\n`;
|
|
19
19
|
output += `ID: ${job.id}\n`;
|
|
20
|
-
output += `Status: ${
|
|
20
|
+
output += `Status: ${statusText}\n`;
|
|
21
21
|
output += `Progress: ${bar} ${job.progress}%\n`;
|
|
22
22
|
output += `Activity: ${job.status_text}\n`;
|
|
23
23
|
if (job.result_url) {
|
|
24
|
-
output += `\
|
|
24
|
+
output += `\nResult: ${job.result_url}\n`;
|
|
25
25
|
}
|
|
26
26
|
if (job.error) {
|
|
27
|
-
output += `\
|
|
27
|
+
output += `\nERROR:\n`;
|
|
28
28
|
// Format multi-line errors nicely
|
|
29
29
|
const errorLines = job.error.split('\n');
|
|
30
30
|
errorLines.forEach(line => {
|
|
@@ -51,7 +51,7 @@ export function formatSearchResults(results) {
|
|
|
51
51
|
const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
|
|
52
52
|
const isOpen = openSources.includes(ds.source);
|
|
53
53
|
const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
|
|
54
|
-
const accessBadge = isOpen ? "
|
|
54
|
+
const accessBadge = isOpen ? "Open Access" : "Requires API Key";
|
|
55
55
|
// Safety indicator
|
|
56
56
|
let safetyIndicator = "";
|
|
57
57
|
if (ds.license.category === "safe") {
|
|
@@ -128,7 +128,7 @@ export function formatDatasetInfo(ds) {
|
|
|
128
128
|
const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
|
|
129
129
|
const isOpen = openSources.includes(ds.source);
|
|
130
130
|
const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
|
|
131
|
-
const accessBadge = isOpen ? "
|
|
131
|
+
const accessBadge = isOpen ? "Open Access" : "Requires API Key";
|
|
132
132
|
let safetyIndicator = "";
|
|
133
133
|
if (ds.license.category === "safe") {
|
|
134
134
|
safetyIndicator = "Safe for use";
|
|
@@ -143,7 +143,7 @@ export function formatDatasetInfo(ds) {
|
|
|
143
143
|
output += `Safety: ${safetyIndicator}\n`;
|
|
144
144
|
output += `ID: ${ds.id}\n\n`;
|
|
145
145
|
if (!isOpen && ds.source === "kaggle") {
|
|
146
|
-
output +=
|
|
146
|
+
output += `NOTE: This dataset requires a Kaggle API key (KAGGE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
|
|
147
147
|
}
|
|
148
148
|
// Description
|
|
149
149
|
if (ds.description) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.1
|
|
3
|
+
"version": "1.2.1",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"mcp-config-template.json"
|
|
18
18
|
],
|
|
19
19
|
"scripts": {
|
|
20
|
-
"build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('
|
|
20
|
+
"build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('Copied Python scripts to build/python');\"",
|
|
21
21
|
"dev": "tsx watch src/index.ts",
|
|
22
22
|
"postinstall": "node scripts/postinstall.cjs",
|
|
23
23
|
"scrape": "tsx src/scripts/scrape-metadata.ts",
|
|
@@ -27,8 +27,14 @@
|
|
|
27
27
|
"check-db": "tsx src/scripts/check-db.ts",
|
|
28
28
|
"test-jit": "tsx src/scripts/test-jit.ts",
|
|
29
29
|
"demo-ui": "tsx src/scripts/demo-ui.ts",
|
|
30
|
+
"fuse": "node build/index.js fuse",
|
|
31
|
+
"discover": "node build/index.js discover",
|
|
32
|
+
"download": "node build/index.js download",
|
|
33
|
+
"config": "node build/index.js config",
|
|
34
|
+
"test-fusion-engine": "py src/python/test_fusion_engine.py",
|
|
30
35
|
"setup": "node build/index.js --setup",
|
|
31
36
|
"setup:silent": "node build/index.js --setup --silent",
|
|
37
|
+
"refresh-index": "node scripts/refresh-index.cjs",
|
|
32
38
|
"test": "vitest",
|
|
33
39
|
"start": "node build/index.js"
|
|
34
40
|
},
|
|
@@ -86,4 +92,4 @@
|
|
|
86
92
|
"typescript": "^5.9.3",
|
|
87
93
|
"vitest": "^4.0.17"
|
|
88
94
|
}
|
|
89
|
-
}
|
|
95
|
+
}
|