@vespermcp/mcp-server 1.2.21 → 1.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cache/service.js +7 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +441 -0
- package/build/index.js +1815 -839
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/arxiv-source.js +229 -0
- package/build/metadata/circuit-breaker.js +62 -0
- package/build/metadata/github-source.js +203 -0
- package/build/metadata/hackernews-source.js +123 -0
- package/build/metadata/quality.js +27 -0
- package/build/metadata/scraper.js +85 -14
- package/build/metadata/semantic-scholar-source.js +138 -0
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/scripts/test-phase1-webcore-quality.js +104 -0
- package/build/search/engine.js +45 -6
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/build/web/extract-web.js +297 -0
- package/build/web/fusion-engine.js +457 -0
- package/build/web/types.js +1 -0
- package/build/web/web-core.js +242 -0
- package/package.json +12 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +652 -0
- package/scripts/wizard.js +338 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
|
@@ -12,6 +12,19 @@ except Exception:
|
|
|
12
12
|
HAS_KAGGLE = False
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
# File extensions treated as images when scanning a downloaded dataset tree.
IMAGE_EXTENSIONS = {
    ".jpg", ".jpeg", ".png", ".webp", ".bmp",
    ".gif", ".tiff", ".tif", ".svg",
}
|
|
26
|
+
|
|
27
|
+
|
|
15
28
|
def _ensure_auth() -> Dict[str, Any]:
|
|
16
29
|
if not HAS_KAGGLE:
|
|
17
30
|
return {
|
|
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
|
135
148
|
return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
136
149
|
|
|
137
150
|
|
|
138
|
-
def
|
|
151
|
+
def _find_image_files(root: str) -> List[str]:
    """Recursively collect paths of all image files under *root*, sorted.

    A file counts as an image when its lowercased extension is listed in
    ``IMAGE_EXTENSIONS``.
    """
    found: List[str] = []
    for directory, _, filenames in os.walk(root):
        for filename in filenames:
            extension = os.path.splitext(filename)[1].lower()
            if extension in IMAGE_EXTENSIONS:
                found.append(os.path.join(directory, filename))
    found.sort()
    return found
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
|
|
163
|
+
relative_path = os.path.relpath(full_path, root).replace("\\", "/")
|
|
164
|
+
parent_dir = os.path.dirname(relative_path)
|
|
165
|
+
parts = [part for part in parent_dir.split("/") if part and part != "."]
|
|
166
|
+
|
|
167
|
+
split = None
|
|
168
|
+
label = None
|
|
169
|
+
if parts:
|
|
170
|
+
first = parts[0].lower()
|
|
171
|
+
if first in {"train", "test", "val", "valid", "validation"}:
|
|
172
|
+
split = parts[0]
|
|
173
|
+
if len(parts) > 1:
|
|
174
|
+
label = parts[-1]
|
|
175
|
+
else:
|
|
176
|
+
label = parts[-1]
|
|
177
|
+
|
|
178
|
+
record: Dict[str, Any] = {
|
|
179
|
+
"id": index,
|
|
180
|
+
"image_path": os.path.abspath(full_path),
|
|
181
|
+
"relative_path": relative_path,
|
|
182
|
+
"file_name": os.path.basename(full_path),
|
|
183
|
+
"extension": os.path.splitext(full_path)[1].lower().lstrip("."),
|
|
184
|
+
}
|
|
185
|
+
if split:
|
|
186
|
+
record["split"] = split
|
|
187
|
+
if label:
|
|
188
|
+
record["label"] = label
|
|
189
|
+
return record
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _write_image_manifest(root: str, image_files: List[str]) -> str:
    """Write a JSONL manifest describing *image_files* into *root*.

    One ``_infer_image_record`` row per image, in list order. Returns the
    manifest's path.
    """
    manifest = os.path.join(root, "_vesper_image_manifest.jsonl")
    with open(manifest, "w", encoding="utf-8") as out:
        for idx, path in enumerate(image_files):
            row = _infer_image_record(root, path, idx)
            out.write(json.dumps(row, ensure_ascii=False) + "\n")
    return manifest
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _pick_best_file(root: str) -> Dict[str, Any]:
|
|
139
201
|
candidates: List[str] = []
|
|
140
202
|
for base, _, files in os.walk(root):
|
|
141
203
|
for name in files:
|
|
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
|
|
|
145
207
|
candidates.append(full)
|
|
146
208
|
|
|
147
209
|
if not candidates:
|
|
210
|
+
image_files = _find_image_files(root)
|
|
211
|
+
if image_files:
|
|
212
|
+
manifest_path = _write_image_manifest(root, image_files)
|
|
213
|
+
return {
|
|
214
|
+
"local_path": manifest_path,
|
|
215
|
+
"dataset_kind": "image-manifest",
|
|
216
|
+
"image_count": len(image_files),
|
|
217
|
+
}
|
|
148
218
|
raise RuntimeError("No suitable data file found after download")
|
|
149
219
|
|
|
150
220
|
# prioritize common tabular formats
|
|
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
|
|
|
152
222
|
for ext in priorities:
|
|
153
223
|
for c in candidates:
|
|
154
224
|
if c.lower().endswith(ext):
|
|
155
|
-
return c
|
|
156
|
-
return candidates[0]
|
|
225
|
+
return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
|
|
226
|
+
return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
|
|
157
227
|
|
|
158
228
|
|
|
159
229
|
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
|
174
244
|
|
|
175
245
|
# unzip in place, remove zip for convenience
|
|
176
246
|
api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
|
|
177
|
-
|
|
247
|
+
artifact = _pick_best_file(target_dir)
|
|
178
248
|
return {
|
|
179
249
|
"ok": True,
|
|
180
250
|
"dataset_id": dataset_ref,
|
|
181
251
|
"target_dir": target_dir,
|
|
182
|
-
"local_path":
|
|
252
|
+
"local_path": artifact["local_path"],
|
|
253
|
+
"dataset_kind": artifact["dataset_kind"],
|
|
254
|
+
"image_count": artifact.get("image_count", 0),
|
|
183
255
|
}
|
|
184
256
|
except Exception as e:
|
|
185
257
|
msg = str(e)
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize any supported dataset file to parquet format.
|
|
3
|
+
Usage: normalize_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Load *src* into a polars DataFrame, dispatching on file extension.

    Supported: csv, tsv/tab, parquet/pq, feather/ftr/arrow/ipc,
    jsonl/ndjson, json (array, NDJSON, or wrapper-object forms), txt.
    Unknown extensions fall back to CSV parsing. Propagates whatever the
    underlying polars reader raises on malformed input.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Peek at the payload to distinguish JSON array / NDJSON / wrapper object.
        # Fix: close the file handle (was `open(src, ...).read()`, a leaked handle).
        with open(src, "r", encoding="utf-8") as handle:
            raw = handle.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        # Try NDJSON: multi-line payload whose first line is an object.
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        # Try wrapper object: {"data": [...]} and similar conventions.
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort - take the first list-of-dicts value found.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    if ext == ".txt":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)

    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize(input_path: str, output_path: str):
    """Load *input_path* via ``_load`` and write it as parquet to *output_path*.

    Returns the number of rows written. Creates the output directory when the
    path has one.
    """
    df = _load(input_path)
    # Fix: os.path.dirname is "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError — only create a directory when there is one.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main():
    """CLI entry point: ``normalize_engine.py <input> <output>``.

    Emits a single JSON status line on stdout and exits non-zero on any
    failure (bad usage, missing input, or a conversion error).
    """
    args = sys.argv[1:]
    if len(args) < 2:
        print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
        sys.exit(1)

    input_path, output_path = args[0], args[1]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        rows = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": rows}))
    except Exception as exc:
        print(json.dumps({"ok": False, "error": str(exc)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
polars==1.2.0
|
|
2
|
+
pandas==2.2.0
|
|
3
|
+
numpy==1.26.0
|
|
4
|
+
scikit-learn==1.4.0
|
|
5
|
+
# Optional source/download extras:
|
|
6
|
+
kaggle>=1.6.17
|
|
7
|
+
aiohttp>=3.9.0
|
|
8
|
+
aiofiles>=24.1.0
|
|
9
|
+
datasets>=2.20.0
|
|
10
|
+
webdataset>=0.2.86
|
|
11
|
+
# Optional for secure key storage (preferred over file fallback):
|
|
12
|
+
# keyring>=24.0.0
|
|
@@ -191,6 +191,7 @@ class AssetDownloader:
|
|
|
191
191
|
kaggle_ref: Optional[str] = None,
|
|
192
192
|
urls: Optional[List[str]] = None,
|
|
193
193
|
output_format: str = "webdataset",
|
|
194
|
+
output_dir: Optional[str] = None,
|
|
194
195
|
max_items: Optional[int] = None,
|
|
195
196
|
image_column: Optional[str] = None,
|
|
196
197
|
) -> Dict[str, Any]:
|
|
@@ -231,7 +232,10 @@ class AssetDownloader:
|
|
|
231
232
|
raise ValueError("urls are required for source=url")
|
|
232
233
|
|
|
233
234
|
# --- Now safe to create directories ---
|
|
234
|
-
|
|
235
|
+
if output_dir:
|
|
236
|
+
dataset_dir = Path(output_dir).expanduser().resolve()
|
|
237
|
+
else:
|
|
238
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
235
239
|
images_dir = dataset_dir / "images"
|
|
236
240
|
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
237
241
|
images_dir.mkdir(parents=True, exist_ok=True)
|
package/wizard.cjs
ADDED