@vespermcp/mcp-server 1.2.21 → 1.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/README.md +49 -0
  2. package/build/cache/service.js +7 -0
  3. package/build/cloud/adapters/supabase.js +49 -0
  4. package/build/cloud/storage-manager.js +6 -0
  5. package/build/export/exporter.js +22 -9
  6. package/build/gateway/unified-dataset-gateway.js +441 -0
  7. package/build/index.js +1815 -839
  8. package/build/ingestion/ingestor.js +7 -4
  9. package/build/install/install-service.js +11 -6
  10. package/build/lib/supabase.js +3 -0
  11. package/build/metadata/arxiv-source.js +229 -0
  12. package/build/metadata/circuit-breaker.js +62 -0
  13. package/build/metadata/github-source.js +203 -0
  14. package/build/metadata/hackernews-source.js +123 -0
  15. package/build/metadata/quality.js +27 -0
  16. package/build/metadata/scraper.js +85 -14
  17. package/build/metadata/semantic-scholar-source.js +138 -0
  18. package/build/python/asset_downloader_engine.py +2 -0
  19. package/build/python/convert_engine.py +92 -0
  20. package/build/python/export_engine.py +45 -0
  21. package/build/python/kaggle_engine.py +77 -5
  22. package/build/python/normalize_engine.py +83 -0
  23. package/build/python/vesper/core/asset_downloader.py +5 -1
  24. package/build/scripts/test-phase1-webcore-quality.js +104 -0
  25. package/build/search/engine.js +45 -6
  26. package/build/search/jit-orchestrator.js +18 -14
  27. package/build/search/query-intent.js +509 -0
  28. package/build/tools/formatter.js +6 -3
  29. package/build/utils/python-runtime.js +130 -0
  30. package/build/web/extract-web.js +297 -0
  31. package/build/web/fusion-engine.js +457 -0
  32. package/build/web/types.js +1 -0
  33. package/build/web/web-core.js +242 -0
  34. package/package.json +12 -5
  35. package/scripts/postinstall.cjs +87 -31
  36. package/scripts/wizard.cjs +652 -0
  37. package/scripts/wizard.js +338 -12
  38. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  39. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  40. package/src/python/asset_downloader_engine.py +2 -0
  41. package/src/python/convert_engine.py +92 -0
  42. package/src/python/export_engine.py +45 -0
  43. package/src/python/kaggle_engine.py +77 -5
  44. package/src/python/normalize_engine.py +83 -0
  45. package/src/python/requirements.txt +12 -0
  46. package/src/python/vesper/core/asset_downloader.py +5 -1
  47. package/wizard.cjs +3 -0
@@ -12,6 +12,19 @@ except Exception:
12
12
  HAS_KAGGLE = False
13
13
 
14
14
 
15
+ IMAGE_EXTENSIONS = {
16
+ ".jpg",
17
+ ".jpeg",
18
+ ".png",
19
+ ".webp",
20
+ ".bmp",
21
+ ".gif",
22
+ ".tiff",
23
+ ".tif",
24
+ ".svg",
25
+ }
26
+
27
+
15
28
  def _ensure_auth() -> Dict[str, Any]:
16
29
  if not HAS_KAGGLE:
17
30
  return {
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
135
148
  return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
136
149
 
137
150
 
138
- def _pick_best_file(root: str) -> str:
151
+ def _find_image_files(root: str) -> List[str]:
152
+ image_files: List[str] = []
153
+ for base, _, files in os.walk(root):
154
+ for name in files:
155
+ full = os.path.join(base, name)
156
+ if os.path.splitext(name)[1].lower() in IMAGE_EXTENSIONS:
157
+ image_files.append(full)
158
+ image_files.sort()
159
+ return image_files
160
+
161
+
162
+ def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
163
+ relative_path = os.path.relpath(full_path, root).replace("\\", "/")
164
+ parent_dir = os.path.dirname(relative_path)
165
+ parts = [part for part in parent_dir.split("/") if part and part != "."]
166
+
167
+ split = None
168
+ label = None
169
+ if parts:
170
+ first = parts[0].lower()
171
+ if first in {"train", "test", "val", "valid", "validation"}:
172
+ split = parts[0]
173
+ if len(parts) > 1:
174
+ label = parts[-1]
175
+ else:
176
+ label = parts[-1]
177
+
178
+ record: Dict[str, Any] = {
179
+ "id": index,
180
+ "image_path": os.path.abspath(full_path),
181
+ "relative_path": relative_path,
182
+ "file_name": os.path.basename(full_path),
183
+ "extension": os.path.splitext(full_path)[1].lower().lstrip("."),
184
+ }
185
+ if split:
186
+ record["split"] = split
187
+ if label:
188
+ record["label"] = label
189
+ return record
190
+
191
+
192
def _write_image_manifest(root: str, image_files: List[str]) -> str:
    """Write a JSONL manifest describing *image_files* into *root*; return the manifest path."""
    manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
    records = (
        json.dumps(_infer_image_record(root, path, position), ensure_ascii=False)
        for position, path in enumerate(image_files)
    )
    with open(manifest_path, "w", encoding="utf-8") as out:
        for line in records:
            out.write(line + "\n")
    return manifest_path
198
+
199
+
200
+ def _pick_best_file(root: str) -> Dict[str, Any]:
139
201
  candidates: List[str] = []
140
202
  for base, _, files in os.walk(root):
141
203
  for name in files:
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
145
207
  candidates.append(full)
146
208
 
147
209
  if not candidates:
210
+ image_files = _find_image_files(root)
211
+ if image_files:
212
+ manifest_path = _write_image_manifest(root, image_files)
213
+ return {
214
+ "local_path": manifest_path,
215
+ "dataset_kind": "image-manifest",
216
+ "image_count": len(image_files),
217
+ }
148
218
  raise RuntimeError("No suitable data file found after download")
149
219
 
150
220
  # prioritize common tabular formats
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
152
222
  for ext in priorities:
153
223
  for c in candidates:
154
224
  if c.lower().endswith(ext):
155
- return c
156
- return candidates[0]
225
+ return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
226
+ return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
157
227
 
158
228
 
159
229
  def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
174
244
 
175
245
  # unzip in place, remove zip for convenience
176
246
  api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
177
- best_file = _pick_best_file(target_dir)
247
+ artifact = _pick_best_file(target_dir)
178
248
  return {
179
249
  "ok": True,
180
250
  "dataset_id": dataset_ref,
181
251
  "target_dir": target_dir,
182
- "local_path": best_file,
252
+ "local_path": artifact["local_path"],
253
+ "dataset_kind": artifact["dataset_kind"],
254
+ "image_count": artifact.get("image_count", 0),
183
255
  }
184
256
  except Exception as e:
185
257
  msg = str(e)
@@ -0,0 +1,83 @@
1
"""
Normalize any supported dataset file to parquet format.
Usage: normalize_engine.py <input_path> <output_path>
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
"""
import sys
import json
import os

# polars is a hard requirement for every conversion path; fail fast with a
# machine-readable JSON error on stdout (matching the script's result format)
# so the calling process gets a clear message instead of a traceback.
try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required"}))
    sys.exit(1)
15
+
16
+
17
def _load(src: str) -> pl.DataFrame:
    """Load *src* into a polars DataFrame, dispatching on its file extension.

    Supported: csv/tsv/tab, parquet/pq, feather/arrow/ipc, jsonl/ndjson, json
    (array, NDJSON, or a wrapper object holding a list of records), txt.
    Unknown extensions fall back to CSV parsing. Raises whatever the
    underlying polars reader raises on unreadable input.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Peek at the raw text to distinguish array-JSON, NDJSON, and wrapper objects.
        # Fixed: use a context manager so the file handle is not leaked.
        with open(src, "r", encoding="utf-8") as handle:
            raw = handle.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        # Try NDJSON: the first non-empty line is itself a JSON object.
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        # Try wrapper object with a well-known list-valued key.
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort - take the first list-of-dicts value.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    if ext == ".txt":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)

    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
53
+
54
+
55
def normalize(input_path: str, output_path: str):
    """Convert *input_path* to a parquet file at *output_path*; return the row count."""
    df = _load(input_path)
    parent = os.path.dirname(output_path)
    # Guard: os.makedirs("") raises FileNotFoundError when output_path is a
    # bare filename with no directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
60
+
61
+
62
def main():
    """CLI entry point: validate argv, run the conversion, emit a JSON result line."""
    argv = sys.argv
    if len(argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
        sys.exit(1)

    input_path, output_path = argv[1], argv[2]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        row_count = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": row_count}))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,12 @@
1
+ polars==1.2.0
2
+ pandas==2.2.0
3
+ numpy==1.26.0
4
+ scikit-learn==1.4.0
5
+ # Optional source/download extras:
6
+ kaggle>=1.6.17
7
+ aiohttp>=3.9.0
8
+ aiofiles>=24.1.0
9
+ datasets>=2.20.0
10
+ webdataset>=0.2.86
11
+ # Optional for secure key storage (preferred over file fallback):
12
+ # keyring>=24.0.0
@@ -191,6 +191,7 @@ class AssetDownloader:
191
191
  kaggle_ref: Optional[str] = None,
192
192
  urls: Optional[List[str]] = None,
193
193
  output_format: str = "webdataset",
194
+ output_dir: Optional[str] = None,
194
195
  max_items: Optional[int] = None,
195
196
  image_column: Optional[str] = None,
196
197
  ) -> Dict[str, Any]:
@@ -231,7 +232,10 @@ class AssetDownloader:
231
232
  raise ValueError("urls are required for source=url")
232
233
 
233
234
  # --- Now safe to create directories ---
234
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
235
+ if output_dir:
236
+ dataset_dir = Path(output_dir).expanduser().resolve()
237
+ else:
238
+ dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
235
239
  images_dir = dataset_dir / "images"
236
240
  dataset_dir.mkdir(parents=True, exist_ok=True)
237
241
  images_dir.mkdir(parents=True, exist_ok=True)
package/wizard.cjs ADDED
@@ -0,0 +1,3 @@
1
#!/usr/bin/env node

// Thin launcher shim: the actual wizard implementation lives in scripts/wizard.cjs.
require('./scripts/wizard.cjs');