@vespermcp/mcp-server 1.2.20 → 1.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +49 -0
  2. package/build/cloud/adapters/supabase.js +49 -0
  3. package/build/cloud/storage-manager.js +6 -0
  4. package/build/export/exporter.js +22 -9
  5. package/build/gateway/unified-dataset-gateway.js +410 -0
  6. package/build/index.js +1592 -837
  7. package/build/ingestion/hf-downloader.js +12 -2
  8. package/build/ingestion/ingestor.js +19 -9
  9. package/build/install/install-service.js +11 -6
  10. package/build/lib/supabase.js +3 -0
  11. package/build/metadata/scraper.js +85 -14
  12. package/build/python/asset_downloader_engine.py +22 -1
  13. package/build/python/convert_engine.py +92 -0
  14. package/build/python/export_engine.py +45 -0
  15. package/build/python/hf_fallback.py +196 -45
  16. package/build/python/kaggle_engine.py +77 -5
  17. package/build/python/normalize_engine.py +83 -0
  18. package/build/python/vesper/core/asset_downloader.py +238 -48
  19. package/build/search/engine.js +43 -5
  20. package/build/search/jit-orchestrator.js +18 -14
  21. package/build/search/query-intent.js +509 -0
  22. package/build/tools/formatter.js +6 -3
  23. package/build/utils/python-runtime.js +130 -0
  24. package/package.json +7 -5
  25. package/scripts/postinstall.cjs +87 -31
  26. package/scripts/wizard.cjs +601 -0
  27. package/scripts/wizard.js +306 -12
  28. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  29. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  30. package/src/python/asset_downloader_engine.py +22 -1
  31. package/src/python/convert_engine.py +92 -0
  32. package/src/python/export_engine.py +45 -0
  33. package/src/python/hf_fallback.py +196 -45
  34. package/src/python/kaggle_engine.py +77 -5
  35. package/src/python/normalize_engine.py +83 -0
  36. package/src/python/requirements.txt +12 -0
  37. package/src/python/vesper/core/asset_downloader.py +238 -48
  38. package/wizard.cjs +3 -0
@@ -5,6 +5,12 @@ Used when the HF Hub file listing finds no suitable data files
5
5
  (e.g. script-based datasets, gated datasets, datasets that use
6
6
  the `datasets` library format).
7
7
 
8
+ Handles:
9
+ - Legacy script-based datasets (trust_remote_code)
10
+ - Gated/private datasets (token auth)
11
+ - Image datasets (PIL Image columns → stripped for tabular export)
12
+ - Various split formats (DatasetDict, single split)
13
+
8
14
  Usage:
9
15
  python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
10
16
 
@@ -15,6 +21,127 @@ Output: JSON to stdout
15
21
  import sys
16
22
  import json
17
23
  import os
24
+ import warnings
25
+
26
+ # Suppress noisy HF warnings about trust_remote_code etc.
27
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
28
+ warnings.filterwarnings("ignore", message=".*legacy.*")
29
+
30
+
31
def _detect_image_columns(ds):
    """Return the names of columns whose feature type is an HF Image.

    Detection is duck-typed: a column counts as an image column when its
    feature's class name is "image" (case-insensitive) or its string
    representation contains "image(".
    """
    features = getattr(ds, "features", None)
    if not features:
        return []
    detected = []
    for col_name, feature in features.items():
        cls_name = type(feature).__name__.lower()
        as_text = str(feature).lower()
        if cls_name == "image" or "image(" in as_text:
            detected.append(col_name)
    return detected
42
+
43
+
44
def _strip_image_columns(ds, image_cols):
    """Drop image columns so the dataset can be exported as tabular data.

    PIL Image objects cannot be serialized to Parquet/CSV, so the columns
    holding them are removed before export. If the dataset consists of
    nothing but image columns it is returned unchanged.
    """
    if not image_cols:
        return ds

    drop = set(image_cols)
    remaining = [name for name in ds.column_names if name not in drop]

    # Every column is an image column: nothing tabular would remain.
    if not remaining:
        return ds

    return ds.select_columns(remaining)
61
+
62
+
63
def _load_dataset_robust(repo_id, token, split):
    """Load a HuggingFace dataset with multiple fallback strategies.

    Strategy order:
    1. Normal load with trust_remote_code=True (handles legacy script datasets)
    2. Load without trust_remote_code (newer datasets that reject it)
    3. Load with streaming=True then materialize (handles very large datasets)

    Returns:
        (dataset, split_name) on success — split_name may be None when the
        dataset loaded without an explicit split (a DatasetDict) —
        or (None, error_summary_string) when every strategy fails.

    Raises:
        Any auth-looking error (401/403/gated/Unauthorized/access) is
        re-raised immediately instead of being retried, so the caller can
        surface a token hint.
    """
    from datasets import load_dataset

    errors = []
    splits_to_try = [split] if split else ["train", "test", "validation", None]

    # Sentinel: (ds, None) is a legitimate success value (split-less load),
    # so a plain None cannot signal "this strategy failed".
    _FAILED = object()

    def _attempt(trust_remote_code):
        # One pass over the candidate splits with a fixed trust setting.
        for s in splits_to_try:
            try:
                kwargs = {"path": repo_id}
                if trust_remote_code:
                    kwargs["trust_remote_code"] = True
                if token:
                    kwargs["token"] = token
                if s:
                    kwargs["split"] = s
                return load_dataset(**kwargs), s
            except (ValueError, KeyError):
                continue  # split doesn't exist; try the next one
            except Exception as e:
                msg = str(e)
                # Auth errors should be raised immediately, not retried.
                if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
                    raise
                if "split" in msg.lower() or "key" in msg.lower():
                    continue
                errors.append(f"trust_remote_code={trust_remote_code}, split={s}: {msg}")
        return _FAILED

    # Strategies 1 and 2 differ only in the trust_remote_code kwarg, so run
    # the same attempt loop twice instead of duplicating it.
    for trust in (True, False):
        result = _attempt(trust)
        if result is not _FAILED:
            return result

    # Strategy 3: streaming fallback (very large / oddly structured datasets).
    for s in splits_to_try:
        if s is None:
            continue  # streaming requires an explicit split
        try:
            kwargs = {"path": repo_id, "streaming": True, "trust_remote_code": True, "split": s}
            if token:
                kwargs["token"] = token
            ds_stream = load_dataset(**kwargs)
            # Materialize from the streaming iterator, capped to bound memory.
            from datasets import Dataset as HFDataset
            rows = []
            for i, row in enumerate(ds_stream):
                if i >= 500000:
                    break
                rows.append(row)
            if rows:
                return HFDataset.from_list(rows), s
        except Exception:
            continue

    # All strategies failed; summarize the first few collected errors.
    error_summary = "; ".join(errors[:3]) if errors else "No valid configuration found"
    return None, error_summary
144
+
18
145
 
19
146
  def main():
20
147
  if len(sys.argv) < 2:
@@ -53,37 +180,10 @@ def main():
53
180
  pl = None
54
181
 
55
182
  try:
56
- # Try loading with streaming first (memory-efficient)
57
- # If split is not specified, try common ones
58
- splits_to_try = [split] if split else ["train", "test", "validation", None]
59
-
60
- ds = None
61
- used_split = None
62
-
63
- for s in splits_to_try:
64
- try:
65
- kwargs = {
66
- "path": repo_id,
67
- "trust_remote_code": True,
68
- }
69
- if token:
70
- kwargs["token"] = token
71
- if s:
72
- kwargs["split"] = s
73
-
74
- ds = load_dataset(**kwargs)
75
- used_split = s
76
- break
77
- except (ValueError, KeyError):
78
- # Split doesn't exist, try next
79
- continue
80
- except Exception as e:
81
- if "split" in str(e).lower() or "key" in str(e).lower():
82
- continue
83
- raise
183
+ ds, used_split = _load_dataset_robust(repo_id, token, split)
84
184
 
85
185
  if ds is None:
86
- print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}'. No valid splits found."}))
186
+ print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}': {used_split}"}))
87
187
  sys.exit(1)
88
188
 
89
189
  # Handle DatasetDict (when no split specified)
@@ -107,37 +207,88 @@ def main():
107
207
  ds = ds.select(range(max_rows))
108
208
  total_rows = max_rows
109
209
 
210
+ # Detect and handle image columns (PIL Image objects can't be exported to Parquet)
211
+ image_cols = _detect_image_columns(ds)
212
+ has_images = len(image_cols) > 0
213
+
214
+ if has_images:
215
+ # Strip image columns for tabular export, note them in output
216
+ export_ds = _strip_image_columns(ds, image_cols)
217
+ else:
218
+ export_ds = ds
219
+
110
220
  # Ensure output directory exists
111
221
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
112
222
 
113
223
  # Export to parquet
114
- columns = ds.column_names
224
+ columns = export_ds.column_names
115
225
 
116
- if output_path.endswith(".parquet"):
117
- ds.to_parquet(output_path)
118
- elif output_path.endswith(".csv"):
119
- ds.to_csv(output_path)
120
- else:
121
- # Default to parquet
122
- if not output_path.endswith(".parquet"):
123
- output_path = output_path + ".parquet"
124
- ds.to_parquet(output_path)
226
+ try:
227
+ if output_path.endswith(".parquet"):
228
+ export_ds.to_parquet(output_path)
229
+ elif output_path.endswith(".csv"):
230
+ export_ds.to_csv(output_path)
231
+ else:
232
+ # Default to parquet
233
+ if not output_path.endswith(".parquet"):
234
+ output_path = output_path + ".parquet"
235
+ export_ds.to_parquet(output_path)
236
+ except Exception as export_err:
237
+ # If parquet export fails (e.g. complex nested types), try CSV
238
+ csv_path = output_path.replace(".parquet", ".csv")
239
+ try:
240
+ export_ds.to_csv(csv_path)
241
+ output_path = csv_path
242
+ except Exception:
243
+ raise export_err # Re-raise original error
125
244
 
126
- print(json.dumps({
245
+ result = {
127
246
  "ok": True,
128
247
  "path": output_path,
129
248
  "rows": total_rows,
130
249
  "columns": columns,
131
250
  "split": used_split
132
- }))
251
+ }
252
+
253
+ if has_images:
254
+ result["image_columns"] = image_cols
255
+ result["note"] = (
256
+ f"This dataset contains image columns ({', '.join(image_cols)}). "
257
+ "Image data was stripped for tabular export. "
258
+ "Use vesper_download_assets with source='huggingface' to download the actual images."
259
+ )
260
+
261
+ print(json.dumps(result))
133
262
 
134
263
  except Exception as e:
135
264
  error_msg = str(e)
136
- # Provide helpful hints
137
- if "401" in error_msg or "403" in error_msg or "gated" in error_msg.lower():
138
- error_msg += " (This dataset may be gated/private. Set HF_TOKEN via configure_keys tool.)"
139
- elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower():
265
+ # Provide helpful, actionable hints
266
+ if "401" in error_msg or "Unauthorized" in error_msg:
267
+ error_msg = (
268
+ f"Authentication required for dataset '{repo_id}'. "
269
+ "This dataset may be gated or private. "
270
+ "Use the configure_keys tool to set your HF_TOKEN, then retry."
271
+ )
272
+ elif "403" in error_msg or "Forbidden" in error_msg:
273
+ error_msg = (
274
+ f"Access denied for dataset '{repo_id}'. "
275
+ "You may need to accept the dataset's usage agreement on huggingface.co, "
276
+ "then set HF_TOKEN via configure_keys tool."
277
+ )
278
+ elif "gated" in error_msg.lower():
279
+ error_msg = (
280
+ f"Dataset '{repo_id}' is gated. "
281
+ "Visit https://huggingface.co/datasets/{repo_id} to request access, "
282
+ "then set HF_TOKEN via configure_keys tool."
283
+ ).format(repo_id=repo_id)
284
+ elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower() or "doesn't exist" in error_msg.lower():
140
285
  error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
286
+ elif "script" in error_msg.lower() and "no longer supported" in error_msg.lower():
287
+ error_msg = (
288
+ f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
289
+ "by the current version of the datasets library. "
290
+ "Try: pip install datasets --upgrade, or use an older datasets version."
291
+ )
141
292
 
142
293
  print(json.dumps({"ok": False, "error": error_msg}))
143
294
  sys.exit(1)
@@ -12,6 +12,19 @@ except Exception:
12
12
  HAS_KAGGLE = False
13
13
 
14
14
 
15
# Lowercase file extensions treated as image assets when scanning a
# downloaded Kaggle dataset (membership is tested against
# os.path.splitext(name)[1].lower() in _find_image_files, so matching is
# case-insensitive).
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".bmp",
    ".gif",
    ".tiff",
    ".tif",
    ".svg",
}
26
+
27
+
15
28
  def _ensure_auth() -> Dict[str, Any]:
16
29
  if not HAS_KAGGLE:
17
30
  return {
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
135
148
  return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
136
149
 
137
150
 
138
- def _pick_best_file(root: str) -> str:
151
+ def _find_image_files(root: str) -> List[str]:
152
+ image_files: List[str] = []
153
+ for base, _, files in os.walk(root):
154
+ for name in files:
155
+ full = os.path.join(base, name)
156
+ if os.path.splitext(name)[1].lower() in IMAGE_EXTENSIONS:
157
+ image_files.append(full)
158
+ image_files.sort()
159
+ return image_files
160
+
161
+
162
+ def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
163
+ relative_path = os.path.relpath(full_path, root).replace("\\", "/")
164
+ parent_dir = os.path.dirname(relative_path)
165
+ parts = [part for part in parent_dir.split("/") if part and part != "."]
166
+
167
+ split = None
168
+ label = None
169
+ if parts:
170
+ first = parts[0].lower()
171
+ if first in {"train", "test", "val", "valid", "validation"}:
172
+ split = parts[0]
173
+ if len(parts) > 1:
174
+ label = parts[-1]
175
+ else:
176
+ label = parts[-1]
177
+
178
+ record: Dict[str, Any] = {
179
+ "id": index,
180
+ "image_path": os.path.abspath(full_path),
181
+ "relative_path": relative_path,
182
+ "file_name": os.path.basename(full_path),
183
+ "extension": os.path.splitext(full_path)[1].lower().lstrip("."),
184
+ }
185
+ if split:
186
+ record["split"] = split
187
+ if label:
188
+ record["label"] = label
189
+ return record
190
+
191
+
192
+ def _write_image_manifest(root: str, image_files: List[str]) -> str:
193
+ manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
194
+ with open(manifest_path, "w", encoding="utf-8") as handle:
195
+ for index, full_path in enumerate(image_files):
196
+ handle.write(json.dumps(_infer_image_record(root, full_path, index), ensure_ascii=False) + "\n")
197
+ return manifest_path
198
+
199
+
200
+ def _pick_best_file(root: str) -> Dict[str, Any]:
139
201
  candidates: List[str] = []
140
202
  for base, _, files in os.walk(root):
141
203
  for name in files:
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
145
207
  candidates.append(full)
146
208
 
147
209
  if not candidates:
210
+ image_files = _find_image_files(root)
211
+ if image_files:
212
+ manifest_path = _write_image_manifest(root, image_files)
213
+ return {
214
+ "local_path": manifest_path,
215
+ "dataset_kind": "image-manifest",
216
+ "image_count": len(image_files),
217
+ }
148
218
  raise RuntimeError("No suitable data file found after download")
149
219
 
150
220
  # prioritize common tabular formats
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
152
222
  for ext in priorities:
153
223
  for c in candidates:
154
224
  if c.lower().endswith(ext):
155
- return c
156
- return candidates[0]
225
+ return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
226
+ return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
157
227
 
158
228
 
159
229
  def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
174
244
 
175
245
  # unzip in place, remove zip for convenience
176
246
  api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
177
- best_file = _pick_best_file(target_dir)
247
+ artifact = _pick_best_file(target_dir)
178
248
  return {
179
249
  "ok": True,
180
250
  "dataset_id": dataset_ref,
181
251
  "target_dir": target_dir,
182
- "local_path": best_file,
252
+ "local_path": artifact["local_path"],
253
+ "dataset_kind": artifact["dataset_kind"],
254
+ "image_count": artifact.get("image_count", 0),
183
255
  }
184
256
  except Exception as e:
185
257
  msg = str(e)
@@ -0,0 +1,83 @@
1
+ """
2
+ Normalize any supported dataset file to parquet format.
3
+ Usage: normalize_engine.py <input_path> <output_path>
4
+ Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
5
+ """
6
+ import sys
7
+ import json
8
+ import os
9
+
10
+ try:
11
+ import polars as pl
12
+ except Exception:
13
+ print(json.dumps({"ok": False, "error": "polars is required"}))
14
+ sys.exit(1)
15
+
16
+
17
def _load(src: str) -> pl.DataFrame:
    """Read *src* into a polars DataFrame, dispatching on file extension.

    Unknown extensions fall back to CSV parsing. A .json file is probed in
    order: top-level array, NDJSON disguised as .json, a wrapper object with
    a record list under a well-known key, then any dict value that is a list
    of records; as a last resort it is handed to pl.read_json.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Use a context manager so the handle is closed promptly (the
        # original bare open(...).read() leaked the file object).
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        # Try NDJSON: first line looks like a standalone JSON object.
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        # Try wrapper object with the record list under a common key.
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort - first list-of-dicts value found.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    if ext == ".txt":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)

    # Fallback: try csv for anything unrecognized.
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
53
+
54
+
55
def normalize(input_path: str, output_path: str) -> int:
    """Load *input_path* via _load, write it to *output_path* as Parquet.

    Returns the number of rows written.
    """
    df = _load(input_path)
    out_dir = os.path.dirname(output_path)
    # A bare filename has no directory component; the original called
    # os.makedirs("") unconditionally, which raises FileNotFoundError.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
60
+
61
+
62
def main():
    """CLI entry point: normalize <input> <output>, emitting one JSON line.

    Prints {"ok": true, "output_path": ..., "rows": N} on success, or
    {"ok": false, "error": ...} and exits 1 on any failure.
    """
    args = sys.argv[1:]
    if len(args) < 2:
        print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
        sys.exit(1)

    input_path, output_path = args[0], args[1]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        row_count = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": row_count}))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
@@ -0,0 +1,12 @@
1
+ polars==1.2.0
2
+ pandas==2.2.0
3
+ numpy==1.26.0
4
+ scikit-learn==1.4.0
5
+ # Optional source/download extras:
6
+ kaggle>=1.6.17
7
+ aiohttp>=3.9.0
8
+ aiofiles>=24.1.0
9
+ datasets>=2.20.0
10
+ webdataset>=0.2.86
11
+ # Optional for secure key storage (preferred over file fallback):
12
+ # keyring>=24.0.0