@vespermcp/mcp-server 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/fusion/engine.js +69 -0
- package/build/index.js +813 -25
- package/build/ingestion/hf-downloader.js +34 -5
- package/build/ingestion/ingestor.js +33 -9
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/scraper.js +34 -10
- package/build/python/config.py +259 -0
- package/build/python/export_engine.py +148 -52
- package/build/python/fusion_engine.py +368 -0
- package/build/python/kaggle_engine.py +204 -0
- package/build/python/row_count.py +54 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/scripts/build-index.js +5 -5
- package/build/search/jit-orchestrator.js +74 -14
- package/package.json +8 -2
- package/scripts/refresh-index.cjs +87 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/config.py +259 -0
- package/src/python/export_engine.py +148 -52
- package/src/python/fusion_engine.py +368 -0
- package/src/python/kaggle_engine.py +204 -0
- package/src/python/row_count.py +54 -0
- package/src/python/test_fusion_engine.py +89 -0
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import hashlib
|
|
5
|
+
import subprocess
|
|
6
|
+
from typing import List, Optional, Union, Dict, Any
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import polars as pl
|
|
10
|
+
HAS_POLARS = True
|
|
11
|
+
except ImportError:
|
|
12
|
+
HAS_POLARS = False
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _safe_suffix(source_path: str, idx: int) -> str:
|
|
16
|
+
base = os.path.basename(source_path)
|
|
17
|
+
base = os.path.splitext(base)[0].replace(" ", "_")
|
|
18
|
+
if not base:
|
|
19
|
+
base = f"source{idx+1}"
|
|
20
|
+
return base
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _load_with_polars(path: str):
    """Load *path* into a polars DataFrame, picking the reader by file extension.

    Raises ValueError for extensions that none of the polars readers handle.
    """
    ext = os.path.splitext(path)[1].lower()

    readers = {
        ".csv": lambda p: pl.read_csv(p, ignore_errors=True),
        ".parquet": pl.read_parquet,
        ".pq": pl.read_parquet,
        ".feather": pl.read_ipc,
        ".ftr": pl.read_ipc,
        ".arrow": pl.read_ipc,
        ".ipc": pl.read_ipc,
        ".jsonl": pl.read_ndjson,
        ".ndjson": pl.read_ndjson,
    }
    if ext in readers:
        return readers[ext](path)

    if ext == ".json":
        # Try a plain JSON document first; fall back to newline-delimited JSON.
        try:
            return pl.read_json(path)
        except Exception:
            return pl.read_ndjson(path)

    raise ValueError(f"Unsupported source format: {ext} ({path})")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _write_with_polars(df, output_path: str, fmt: str, compression: Optional[str]):
|
|
42
|
+
fmt = fmt.lower()
|
|
43
|
+
comp = compression
|
|
44
|
+
if comp in ["none", "None", "uncompressed"]:
|
|
45
|
+
comp = None
|
|
46
|
+
|
|
47
|
+
if fmt == "csv":
|
|
48
|
+
df.write_csv(output_path)
|
|
49
|
+
elif fmt == "parquet":
|
|
50
|
+
df.write_parquet(output_path, compression=(compression or "snappy"))
|
|
51
|
+
elif fmt == "feather":
|
|
52
|
+
ipc_comp = "lz4" if compression is None else compression
|
|
53
|
+
if ipc_comp == "uncompressed":
|
|
54
|
+
ipc_comp = None
|
|
55
|
+
df.write_ipc(output_path, compression=ipc_comp)
|
|
56
|
+
elif fmt in ["arrow", "ipc"]:
|
|
57
|
+
df.write_ipc(output_path, compression=comp)
|
|
58
|
+
elif fmt == "jsonl":
|
|
59
|
+
df.write_ndjson(output_path)
|
|
60
|
+
else:
|
|
61
|
+
raise ValueError(f"Unsupported output format: {fmt}")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _ensure_parent(path: str):
|
|
65
|
+
parent = os.path.dirname(path)
|
|
66
|
+
if parent and not os.path.exists(parent):
|
|
67
|
+
os.makedirs(parent, exist_ok=True)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _compute_null_ratio(df) -> float:
|
|
71
|
+
if len(df) == 0 or len(df.columns) == 0:
|
|
72
|
+
return 0.0
|
|
73
|
+
nulls = 0
|
|
74
|
+
for col in df.columns:
|
|
75
|
+
nulls += int(df[col].null_count())
|
|
76
|
+
total_cells = len(df) * len(df.columns)
|
|
77
|
+
return (nulls / total_cells) * 100 if total_cells else 0.0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _leakage_report(df, source_col: str = "_vesper_source", id_col: Optional[str] = None) -> Dict[str, Any]:
    """Check the fused frame for rows/IDs that appear in more than one source.

    Args:
        df: fused polars DataFrame, expected to carry a *source_col* marker.
        source_col: column naming each row's originating source.
        id_col: optional identifier column; when present in *df* the check is
            done per ID, otherwise it falls back to whole-row signatures.

    Returns:
        Dict with "leakage_detected", "leakage_count", "id_column", "warnings".
        Never raises; a missing marker column only adds a warning.
    """
    report = {
        "leakage_detected": False,
        "leakage_count": 0,
        "id_column": id_col,
        "warnings": []
    }

    # Without the source marker there is nothing to compare against.
    if source_col not in df.columns:
        report["warnings"].append("Source marker column missing; leakage check skipped.")
        return report

    if id_col and id_col in df.columns:
        # An ID seen under more than one distinct source value is leakage.
        overlap = (
            df.group_by(id_col)
            .agg(pl.col(source_col).n_unique().alias("source_count"))
            .filter(pl.col("source_count") > 1)
        )
        overlap_count = len(overlap)
        if overlap_count > 0:
            report["leakage_detected"] = True
            report["leakage_count"] = overlap_count
            report["warnings"].append(f"Found {overlap_count} IDs appearing across multiple sources")
    else:
        # Fallback: hash rows (excluding source marker) and check if same row appears in multiple sources
        compare_cols = [c for c in df.columns if c != source_col]
        if not compare_cols:
            return report

        # Build a string signature per row by concatenating all comparable
        # columns (cast to Utf8, non-strict so uncastable values become null).
        row_sig = df.select(compare_cols).with_columns(
            pl.concat_str([pl.col(c).cast(pl.Utf8, strict=False) for c in compare_cols], separator="||").alias("_row_sig")
        )
        # Re-attach the source column so signatures can be grouped per source.
        tmp = row_sig.with_columns(df[source_col]).select(["_row_sig", source_col])

        overlap = (
            tmp.group_by("_row_sig")
            .agg(pl.col(source_col).n_unique().alias("source_count"))
            .filter(pl.col("source_count") > 1)
        )
        overlap_count = len(overlap)
        if overlap_count > 0:
            report["leakage_detected"] = True
            report["leakage_count"] = overlap_count
            report["warnings"].append(f"Found {overlap_count} duplicate rows across multiple sources")

    return report
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _run_quality_engine(output_path: str) -> Dict[str, Any]:
|
|
129
|
+
# Reuse existing quality engine script (same folder)
|
|
130
|
+
try:
|
|
131
|
+
script_dir = os.path.dirname(os.path.abspath(__file__))
|
|
132
|
+
quality_script = os.path.join(script_dir, "quality_engine.py")
|
|
133
|
+
cmd = [sys.executable, quality_script, output_path]
|
|
134
|
+
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
|
|
135
|
+
if proc.returncode != 0:
|
|
136
|
+
return {"error": f"quality_engine failed: {proc.stderr.strip()}"}
|
|
137
|
+
return json.loads(proc.stdout)
|
|
138
|
+
except Exception as e:
|
|
139
|
+
return {"error": str(e)}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _concat_polars(dfs, source_names: List[str]):
    """Vertically concatenate *dfs* after aligning them to the union of columns.

    Each frame lacking a "_vesper_source" marker gets one filled with its
    source name; columns missing from a frame are added as nulls so every
    frame shares the same column order before concatenation.
    """
    # Union of columns, preserving first-seen order across frames.
    ordered_cols: List[str] = []
    seen = set()
    for frame in dfs:
        for name in frame.columns:
            if name not in seen:
                seen.add(name)
                ordered_cols.append(name)
    if "_vesper_source" not in seen:
        ordered_cols.append("_vesper_source")

    aligned = []
    for idx, frame in enumerate(dfs):
        out = frame
        if "_vesper_source" not in out.columns:
            out = out.with_columns(pl.lit(source_names[idx]).alias("_vesper_source"))
        for absent in [c for c in ordered_cols if c not in out.columns]:
            out = out.with_columns(pl.lit(None).alias(absent))
        aligned.append(out.select(ordered_cols))

    # vertical_relaxed lets polars reconcile mildly differing dtypes.
    return pl.concat(aligned, how="vertical_relaxed")
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _join_polars(dfs, source_names: List[str], join_on: List[str], how: str):
    """Sequentially join *dfs* on *join_on* using join type *how*.

    Returns:
        (joined_frame, conflict_renames): *conflict_renames* records every
        non-key column renamed (suffixed with a name derived from its source)
        to avoid colliding with a column already present on the left side.

    Raises:
        ValueError: if fewer than two frames are given.
    """
    if len(dfs) < 2:
        raise ValueError("Join strategy requires at least 2 sources")

    conflict_renames = []
    left = dfs[0]
    # Tag the first frame with its originating source if not already tagged.
    if "_vesper_source" not in left.columns:
        left = left.with_columns(pl.lit(source_names[0]).alias("_vesper_source"))

    for i in range(1, len(dfs)):
        right = dfs[i]
        if "_vesper_source" not in right.columns:
            right = right.with_columns(pl.lit(source_names[i]).alias("_vesper_source"))

        # Non-key columns present on both sides would collide after the join.
        # NOTE(review): "_vesper_source" is always such an overlap from the
        # second frame onward, so each joined source keeps a suffixed copy of
        # its marker column — presumably intentional; confirm with callers.
        overlaps = [
            c for c in right.columns
            if c in left.columns and c not in join_on
        ]
        rename_map = {}
        suffix = _safe_suffix(source_names[i], i)
        for c in overlaps:
            new_name = f"{c}_{suffix}"
            rename_map[c] = new_name
            conflict_renames.append({"source": source_names[i], "from": c, "to": new_name})

        if rename_map:
            right = right.rename(rename_map)

        # coalesce=True merges left/right key columns into a single column.
        left = left.join(right, on=join_on, how=how, coalesce=True)

    return left, conflict_renames
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def fuse_datasets(
    sources: List[Union[str, Any]],
    strategy: str = "concat",
    join_on: Optional[Union[str, List[str]]] = None,
    how: str = "inner",
    dedup: bool = True,
    run_quality_after: bool = True,
    leakage_check: bool = True,
    output_path: Optional[str] = None,
    output_format: str = "feather",
    compression: Optional[str] = "lz4",
    preview: bool = True,
    preview_rows: int = 500,
    id_column: Optional[str] = None,
    **kwargs,
):
    """Fuse two or more tabular sources into a single output file.

    Args:
        sources: file paths, or dicts with "path"/"local_path" (and optional "name").
        strategy: "concat" (vertical, schema-aligned) or "join" (sequential joins).
        join_on: key column(s); required when strategy == "join".
        how: join type passed through to polars when joining.
        dedup: drop exact duplicate rows after fusing.
        run_quality_after: run the sibling quality_engine.py on the output.
        leakage_check: run a cross-source leakage check on the fused frame.
        output_path: destination file; defaults to ./fused_dataset.<ext>.
        output_format: "feather", "parquet", "csv", "jsonl", "arrow"/"ipc".
        compression: codec passed to the writer.
        preview: also write the first *preview_rows* rows as a CSV preview.
        id_column: optional ID column used by the leakage check.
        **kwargs: ignored; absorbs unknown config keys from callers.

    Returns:
        On success, a dict with "success", "output_path", "preview_path",
        "stats", "quality_report" and "leakage_report". Every failure mode
        returns an {"error": ...} dict instead of raising.
    """
    if not HAS_POLARS:
        return {"error": "Polars is required for dataset fusion. Install with: pip install polars"}

    if not sources or len(sources) < 2:
        return {"error": "Need at least 2 sources to fuse"}

    source_paths: List[str] = []
    source_names: List[str] = []

    # Normalize every source entry into a (path, display name) pair.
    for i, src in enumerate(sources):
        if isinstance(src, str):
            source_paths.append(src)
            source_names.append(src)
        elif isinstance(src, dict):
            p = src.get("path") or src.get("local_path")
            if not p:
                return {"error": f"Source {i} missing path"}
            source_paths.append(p)
            source_names.append(src.get("name") or p)
        else:
            return {"error": f"Unsupported source type at index {i}"}

    for p in source_paths:
        if not os.path.exists(p):
            return {"error": f"Source not found: {p}"}

    # Default output location: ./fused_dataset with a format-matching extension.
    if output_path is None:
        ext_map = {
            "feather": ".feather",
            "parquet": ".parquet",
            "csv": ".csv",
            "jsonl": ".jsonl",
            "arrow": ".arrow",
            "ipc": ".arrow"
        }
        ext = ext_map.get(output_format, ".feather")
        output_path = os.path.abspath(f"fused_dataset{ext}")

    _ensure_parent(output_path)

    try:
        dfs = [_load_with_polars(p) for p in source_paths]
    except Exception as e:
        return {"error": f"Failed to load sources: {str(e)}"}

    # Pre-fusion stats for the summary.
    # NOTE(review): this is the unweighted mean of per-source null percentages,
    # not a cell-weighted ratio — small sources count as much as large ones.
    rows_before = sum(len(df) for df in dfs)
    null_before = sum(_compute_null_ratio(df) for df in dfs) / len(dfs)

    strategy = (strategy or "concat").lower()
    how = (how or "inner").lower()
    conflict_renames = []

    try:
        if strategy == "concat":
            fused = _concat_polars(dfs, source_names)
        elif strategy == "join":
            if not join_on:
                return {"error": "join_on is required when strategy='join'"}
            join_keys = [join_on] if isinstance(join_on, str) else list(join_on)
            # Every key must exist in every source before attempting the join.
            for key in join_keys:
                for idx, df in enumerate(dfs):
                    if key not in df.columns:
                        return {"error": f"Join key '{key}' missing in source {source_paths[idx]}"}
            fused, conflict_renames = _join_polars(dfs, source_names, join_keys, how)
        else:
            return {"error": f"Unknown strategy: {strategy}. Use concat or join."}

        duplicates_removed = 0
        if dedup:
            before = len(fused)
            fused = fused.unique(maintain_order=True)
            duplicates_removed = before - len(fused)

        leakage = None
        if leakage_check:
            leakage = _leakage_report(fused, source_col="_vesper_source", id_col=id_column)

        _write_with_polars(fused, output_path, output_format, compression)

        preview_path = None
        if preview:
            preview_path = os.path.splitext(output_path)[0] + "_preview.csv"
            fused.head(min(preview_rows, len(fused))).write_csv(preview_path)

        quality_report = None
        if run_quality_after:
            quality_report = _run_quality_engine(output_path)

        rows_after = len(fused)
        null_after = _compute_null_ratio(fused)

        return {
            "success": True,
            "output_path": output_path,
            "preview_path": preview_path,
            "stats": {
                "sources_count": len(source_paths),
                "rows_before": rows_before,
                "rows_after": rows_after,
                "columns_after": len(fused.columns),
                "duplicates_removed": duplicates_removed,
                "null_ratio_before": round(null_before, 3),
                "null_ratio_after": round(null_after, 3),
                "null_delta": round(null_after - null_before, 3),
                "conflict_renames": conflict_renames,
            },
            "quality_report": quality_report,
            "leakage_report": leakage,
        }

    except Exception as e:
        # Broad catch is deliberate: the CLI contract is JSON-out, never a traceback.
        return {"error": f"Fusion failed: {str(e)}"}
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def main():
    """CLI entry point: fusion_engine.py <sources_json> <output_path> [config_json]."""
    argv = sys.argv
    if len(argv) < 3:
        print(json.dumps({
            "error": "Usage: fusion_engine.py <sources_json> <output_path> [config_json]"
        }))
        sys.exit(1)

    try:
        sources = json.loads(argv[1])
        output_path = argv[2]
        config = json.loads(argv[3]) if len(argv) > 3 else {}

        result = fuse_datasets(
            sources=sources,
            output_path=output_path,
            strategy=config.get("strategy", "concat"),
            join_on=config.get("join_on"),
            how=config.get("how", "inner"),
            dedup=config.get("dedup", True),
            run_quality_after=config.get("run_quality_after", True),
            leakage_check=config.get("leakage_check", True),
            output_format=config.get("output_format", "feather"),
            compression=config.get("compression", "lz4"),
            preview=config.get("preview", True),
            preview_rows=config.get("preview_rows", 500),
            id_column=config.get("id_column"),
        )

        print(json.dumps(result))
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import json
|
|
4
|
+
import tempfile
|
|
5
|
+
from typing import Dict, Any, List
|
|
6
|
+
from config import get_all
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
10
|
+
HAS_KAGGLE = True
|
|
11
|
+
except Exception:
|
|
12
|
+
HAS_KAGGLE = False
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _ensure_auth() -> Dict[str, Any]:
    """Authenticate against Kaggle; return {"ok": True, "api": ...} or an error dict.

    Credential priority:
      1. KAGGLE_USERNAME / KAGGLE_KEY already present in the environment
      2. the secure local store (keyring or ~/.vesper/config.toml)
      3. ~/.kaggle/kaggle.json, which KaggleApi.authenticate() reads itself
    """
    if not HAS_KAGGLE:
        return {
            "ok": False,
            "error": "kaggle package not installed. Install with: pip install kaggle",
        }

    env_ready = os.getenv("KAGGLE_USERNAME") and os.getenv("KAGGLE_KEY")
    if not env_ready:
        stored = get_all()
        username = stored.get("kaggle_username")
        key = stored.get("kaggle_key")
        if username and key:
            os.environ["KAGGLE_USERNAME"] = username
            os.environ["KAGGLE_KEY"] = key

    api = KaggleApi()
    try:
        api.authenticate()
    except Exception as e:
        return {
            "ok": False,
            "error": "Kaggle requires API key — run 'vespermcp config keys' (30 seconds) or provide ~/.kaggle/kaggle.json",
            "details": str(e),
        }

    return {"ok": True, "api": api}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _dataset_to_dict(ds) -> Dict[str, Any]:
|
|
46
|
+
# kaggle API object fields differ by version; use getattr defensively
|
|
47
|
+
ref = getattr(ds, "ref", None) or getattr(ds, "datasetRef", None) or ""
|
|
48
|
+
title = getattr(ds, "title", None) or ref
|
|
49
|
+
subtitle = getattr(ds, "subtitle", None) or ""
|
|
50
|
+
owner = getattr(ds, "creatorName", None) or getattr(ds, "ownerName", None) or ""
|
|
51
|
+
votes = int(getattr(ds, "voteCount", 0) or 0)
|
|
52
|
+
downloads = int(getattr(ds, "downloadCount", 0) or 0)
|
|
53
|
+
size = int(getattr(ds, "totalBytes", 0) or 0)
|
|
54
|
+
last_updated = str(getattr(ds, "lastUpdated", ""))
|
|
55
|
+
tags = []
|
|
56
|
+
raw_tags = getattr(ds, "tags", None)
|
|
57
|
+
if raw_tags:
|
|
58
|
+
for t in raw_tags:
|
|
59
|
+
tags.append(getattr(t, "name", str(t)))
|
|
60
|
+
|
|
61
|
+
return {
|
|
62
|
+
"id": ref,
|
|
63
|
+
"source": "kaggle",
|
|
64
|
+
"name": title,
|
|
65
|
+
"description": subtitle or title,
|
|
66
|
+
"downloads": downloads,
|
|
67
|
+
"likes": votes,
|
|
68
|
+
"stars": 0,
|
|
69
|
+
"tags": tags,
|
|
70
|
+
"last_updated": last_updated,
|
|
71
|
+
"task": "unknown",
|
|
72
|
+
"domain": "unknown",
|
|
73
|
+
"languages": [],
|
|
74
|
+
"splits": [{"name": "data", "num_examples": 0, "size_bytes": size}],
|
|
75
|
+
"license": {
|
|
76
|
+
"id": "unknown",
|
|
77
|
+
"name": "unknown",
|
|
78
|
+
"category": "unknown",
|
|
79
|
+
"usage_restrictions": [],
|
|
80
|
+
"warnings": ["Kaggle license details may vary by dataset"],
|
|
81
|
+
},
|
|
82
|
+
"quality_score": 40,
|
|
83
|
+
"quality_warnings": ["Review dataset card and competition rules before use"],
|
|
84
|
+
"download_url": f"https://www.kaggle.com/datasets/{ref}",
|
|
85
|
+
"format": None,
|
|
86
|
+
"total_examples": 0,
|
|
87
|
+
"total_size_bytes": size,
|
|
88
|
+
"total_size_mb": round(size / (1024 * 1024), 2) if size else 0,
|
|
89
|
+
"columns": [],
|
|
90
|
+
"is_structured": False,
|
|
91
|
+
"has_target_column": False,
|
|
92
|
+
"is_safe_source": True,
|
|
93
|
+
"has_personal_data": False,
|
|
94
|
+
"is_paywalled": False,
|
|
95
|
+
"is_scraped_web_data": False,
|
|
96
|
+
"uses_https": True,
|
|
97
|
+
"has_train_split": False,
|
|
98
|
+
"has_test_split": False,
|
|
99
|
+
"has_validation_split": False,
|
|
100
|
+
"description_length": len(subtitle or title),
|
|
101
|
+
"has_readme": True,
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search Kaggle datasets for *query*, returning at most *limit* normalized results."""
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]
    try:
        # Kaggle caps page_size; clamp to [1, 100] regardless of limit.
        page_size = max(1, min(limit, 100))
        listing = api.dataset_list(search=query, page_size=page_size)
        results = [_dataset_to_dict(entry) for entry in listing[:limit]]
        return {"ok": True, "results": results, "count": len(results)}
    except Exception as e:
        return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _pick_best_file(root: str) -> str:
|
|
120
|
+
candidates: List[str] = []
|
|
121
|
+
for base, _, files in os.walk(root):
|
|
122
|
+
for name in files:
|
|
123
|
+
full = os.path.join(base, name)
|
|
124
|
+
lower = name.lower()
|
|
125
|
+
if lower.endswith((".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow")):
|
|
126
|
+
candidates.append(full)
|
|
127
|
+
|
|
128
|
+
if not candidates:
|
|
129
|
+
raise RuntimeError("No suitable data file found after download")
|
|
130
|
+
|
|
131
|
+
# prioritize common tabular formats
|
|
132
|
+
priorities = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow"]
|
|
133
|
+
for ext in priorities:
|
|
134
|
+
for c in candidates:
|
|
135
|
+
if c.lower().endswith(ext):
|
|
136
|
+
return c
|
|
137
|
+
return candidates[0]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download a Kaggle dataset (ref or full URL) and locate its best data file.

    Returns an {"ok": True, ...} dict with the chosen local file, or an
    {"ok": False, "error": ...} dict on any failure.
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]

    if not target_dir:
        target_dir = tempfile.mkdtemp(prefix="vesper_kaggle_")
    os.makedirs(target_dir, exist_ok=True)

    try:
        # Accept a full kaggle.com URL as well as an "owner/name" ref.
        marker = "kaggle.com/datasets/"
        if marker in dataset_ref:
            dataset_ref = dataset_ref.split(marker)[1].lstrip("/")

        # unzip in place, remove zip for convenience
        api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
        return {
            "ok": True,
            "dataset_id": dataset_ref,
            "target_dir": target_dir,
            "local_path": _pick_best_file(target_dir),
        }
    except Exception as e:
        msg = str(e)
        if "401" in msg or "Unauthorized" in msg:
            return {"ok": False, "error": "Invalid Kaggle credentials (401). Run 'vespermcp config kaggle' again."}
        if "429" in msg or "Too Many Requests" in msg:
            return {"ok": False, "error": "Kaggle rate limit reached. Please retry later."}
        return {"ok": False, "error": f"Kaggle download failed: {msg}"}
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def main():
    """CLI dispatcher: kaggle_engine.py <discover|download> ..."""
    argv = sys.argv
    if len(argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py <discover|download> ..."}))
        sys.exit(1)

    command = argv[1]

    if command == "discover":
        if len(argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py discover <query> [limit]"}))
            sys.exit(1)
        limit = int(argv[3]) if len(argv) > 3 else 20
        print(json.dumps(discover(argv[2], limit)))
        return

    if command == "download":
        if len(argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py download <dataset_ref> [target_dir]"}))
            sys.exit(1)
        target_dir = argv[3] if len(argv) > 3 else ""
        print(json.dumps(download(argv[2], target_dir)))
        return

    print(json.dumps({"ok": False, "error": f"Unknown command: {command}"}))
    sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import polars as pl
|
|
7
|
+
except Exception:
|
|
8
|
+
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
9
|
+
sys.exit(1)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def count_rows(path: str) -> int:
    """Count the rows of a tabular file, using lazy scans where polars supports them."""
    ext = os.path.splitext(path)[1].lower()

    lazy_scanners = {
        ".parquet": pl.scan_parquet,
        ".pq": pl.scan_parquet,
        ".feather": pl.scan_ipc,
        ".ftr": pl.scan_ipc,
        ".arrow": pl.scan_ipc,
        ".ipc": pl.scan_ipc,
        ".jsonl": pl.scan_ndjson,
        ".ndjson": pl.scan_ndjson,
    }

    if ext == ".csv":
        # Lazy scan avoids materializing the whole frame for large CSVs.
        lazy = pl.scan_csv(path, ignore_errors=True)
        return int(lazy.select(pl.len()).collect().item())
    if ext in lazy_scanners:
        return int(lazy_scanners[ext](path).select(pl.len()).collect().item())
    if ext == ".json":
        # Plain JSON arrays need an eager read; fall back to NDJSON on failure.
        try:
            return int(pl.read_json(path).height)
        except Exception:
            return int(pl.scan_ndjson(path).select(pl.len()).collect().item())

    # Unknown extension: assume CSV-like text.
    return int(pl.read_csv(path, ignore_errors=True).height)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def main():
    """CLI entry point: row_count.py <file_path> -> JSON with "ok" and "rows"."""
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
        sys.exit(1)

    path = sys.argv[1]
    if not os.path.exists(path):
        print(json.dumps({"ok": False, "error": f"File not found: {path}"}))
        sys.exit(1)

    try:
        print(json.dumps({"ok": True, "rows": count_rows(path)}))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
|