vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,298 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
HuggingFace Datasets Library Fallback Downloader.
|
|
3
|
-
|
|
4
|
-
Used when the HF Hub file listing finds no suitable data files
|
|
5
|
-
(e.g. script-based datasets, gated datasets, datasets that use
|
|
6
|
-
the `datasets` library format).
|
|
7
|
-
|
|
8
|
-
Handles:
|
|
9
|
-
- Legacy script-based datasets (trust_remote_code)
|
|
10
|
-
- Gated/private datasets (token auth)
|
|
11
|
-
- Image datasets (PIL Image columns → stripped for tabular export)
|
|
12
|
-
- Various split formats (DatasetDict, single split)
|
|
13
|
-
|
|
14
|
-
Usage:
|
|
15
|
-
python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
|
|
16
|
-
|
|
17
|
-
Output: JSON to stdout
|
|
18
|
-
{"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
|
|
19
|
-
{"ok": false, "error": "..."}
|
|
20
|
-
"""
|
|
21
|
-
import sys
|
|
22
|
-
import json
|
|
23
|
-
import os
|
|
24
|
-
import warnings
|
|
25
|
-
|
|
26
|
-
# Suppress noisy HF warnings about trust_remote_code etc.
|
|
27
|
-
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
28
|
-
warnings.filterwarnings("ignore", message=".*legacy.*")
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def _detect_image_columns(ds):
|
|
32
|
-
"""Detect columns that contain HF Image features or PIL Image objects."""
|
|
33
|
-
image_cols = []
|
|
34
|
-
features = getattr(ds, "features", None)
|
|
35
|
-
if features:
|
|
36
|
-
for name, feat in features.items():
|
|
37
|
-
feat_cls = feat.__class__.__name__.lower()
|
|
38
|
-
feat_str = str(feat).lower()
|
|
39
|
-
if feat_cls == "image" or "image(" in feat_str:
|
|
40
|
-
image_cols.append(name)
|
|
41
|
-
return image_cols
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def _strip_image_columns(ds, image_cols):
|
|
45
|
-
"""Remove image columns from dataset so it can be exported to Parquet/CSV.
|
|
46
|
-
|
|
47
|
-
Image columns contain PIL Image objects that can't be serialized to tabular
|
|
48
|
-
formats. We replace them with a placeholder string indicating the column
|
|
49
|
-
was an image column.
|
|
50
|
-
"""
|
|
51
|
-
if not image_cols:
|
|
52
|
-
return ds
|
|
53
|
-
|
|
54
|
-
# Remove the image columns entirely for tabular export
|
|
55
|
-
cols_to_keep = [c for c in ds.column_names if c not in image_cols]
|
|
56
|
-
if not cols_to_keep:
|
|
57
|
-
# Dataset is ALL image columns — keep them but cast to path strings if possible
|
|
58
|
-
return ds
|
|
59
|
-
|
|
60
|
-
return ds.select_columns(cols_to_keep)
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def _load_dataset_robust(repo_id, token, split):
|
|
64
|
-
"""Load a HuggingFace dataset with multiple fallback strategies.
|
|
65
|
-
|
|
66
|
-
Strategy order:
|
|
67
|
-
1. Normal load with trust_remote_code=True (handles legacy script datasets)
|
|
68
|
-
2. Load without trust_remote_code (newer datasets that reject it)
|
|
69
|
-
3. Load with streaming=True then materialize (handles very large datasets)
|
|
70
|
-
"""
|
|
71
|
-
from datasets import load_dataset, DatasetDict
|
|
72
|
-
|
|
73
|
-
errors = []
|
|
74
|
-
splits_to_try = [split] if split else ["train", "test", "validation", None]
|
|
75
|
-
|
|
76
|
-
# Strategy 1: Normal load with trust_remote_code
|
|
77
|
-
for s in splits_to_try:
|
|
78
|
-
try:
|
|
79
|
-
kwargs = {"path": repo_id, "trust_remote_code": True}
|
|
80
|
-
if token:
|
|
81
|
-
kwargs["token"] = token
|
|
82
|
-
if s:
|
|
83
|
-
kwargs["split"] = s
|
|
84
|
-
ds = load_dataset(**kwargs)
|
|
85
|
-
return ds, s
|
|
86
|
-
except (ValueError, KeyError):
|
|
87
|
-
continue
|
|
88
|
-
except Exception as e:
|
|
89
|
-
msg = str(e)
|
|
90
|
-
# Auth errors should be raised immediately, not retried
|
|
91
|
-
if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
|
|
92
|
-
raise
|
|
93
|
-
if "split" in msg.lower() or "key" in msg.lower():
|
|
94
|
-
continue
|
|
95
|
-
errors.append(f"trust_remote_code=True, split={s}: {msg}")
|
|
96
|
-
|
|
97
|
-
# Strategy 2: Load WITHOUT trust_remote_code (some repos reject it)
|
|
98
|
-
for s in splits_to_try:
|
|
99
|
-
try:
|
|
100
|
-
kwargs = {"path": repo_id}
|
|
101
|
-
if token:
|
|
102
|
-
kwargs["token"] = token
|
|
103
|
-
if s:
|
|
104
|
-
kwargs["split"] = s
|
|
105
|
-
ds = load_dataset(**kwargs)
|
|
106
|
-
return ds, s
|
|
107
|
-
except (ValueError, KeyError):
|
|
108
|
-
continue
|
|
109
|
-
except Exception as e:
|
|
110
|
-
msg = str(e)
|
|
111
|
-
if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
|
|
112
|
-
raise
|
|
113
|
-
if "split" in msg.lower() or "key" in msg.lower():
|
|
114
|
-
continue
|
|
115
|
-
errors.append(f"trust_remote_code=False, split={s}: {msg}")
|
|
116
|
-
|
|
117
|
-
# Strategy 3: Streaming fallback (for very large / oddly structured datasets)
|
|
118
|
-
for s in splits_to_try:
|
|
119
|
-
if s is None:
|
|
120
|
-
continue # streaming requires a split
|
|
121
|
-
try:
|
|
122
|
-
kwargs = {"path": repo_id, "streaming": True, "trust_remote_code": True}
|
|
123
|
-
if token:
|
|
124
|
-
kwargs["token"] = token
|
|
125
|
-
if s:
|
|
126
|
-
kwargs["split"] = s
|
|
127
|
-
ds_stream = load_dataset(**kwargs)
|
|
128
|
-
# Materialize from streaming iterator
|
|
129
|
-
from datasets import Dataset as HFDataset
|
|
130
|
-
rows = []
|
|
131
|
-
for i, row in enumerate(ds_stream):
|
|
132
|
-
if i >= 500000:
|
|
133
|
-
break
|
|
134
|
-
rows.append(row)
|
|
135
|
-
if rows:
|
|
136
|
-
ds = HFDataset.from_list(rows)
|
|
137
|
-
return ds, s
|
|
138
|
-
except Exception:
|
|
139
|
-
continue
|
|
140
|
-
|
|
141
|
-
# All strategies failed
|
|
142
|
-
error_summary = "; ".join(errors[:3]) if errors else "No valid configuration found"
|
|
143
|
-
return None, error_summary
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
def main():
|
|
147
|
-
if len(sys.argv) < 2:
|
|
148
|
-
print(json.dumps({"ok": False, "error": "Missing payload argument"}))
|
|
149
|
-
sys.exit(1)
|
|
150
|
-
|
|
151
|
-
try:
|
|
152
|
-
payload = json.loads(sys.argv[1])
|
|
153
|
-
except json.JSONDecodeError as e:
|
|
154
|
-
print(json.dumps({"ok": False, "error": f"Invalid JSON payload: {e}"}))
|
|
155
|
-
sys.exit(1)
|
|
156
|
-
|
|
157
|
-
repo_id = payload.get("repo_id", "").strip()
|
|
158
|
-
output_path = payload.get("output_path", "").strip()
|
|
159
|
-
token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
|
|
160
|
-
max_rows = payload.get("max_rows", 500000)
|
|
161
|
-
split = payload.get("split") # None = auto-detect
|
|
162
|
-
|
|
163
|
-
if not repo_id:
|
|
164
|
-
print(json.dumps({"ok": False, "error": "repo_id is required"}))
|
|
165
|
-
sys.exit(1)
|
|
166
|
-
|
|
167
|
-
if not output_path:
|
|
168
|
-
print(json.dumps({"ok": False, "error": "output_path is required"}))
|
|
169
|
-
sys.exit(1)
|
|
170
|
-
|
|
171
|
-
try:
|
|
172
|
-
from datasets import load_dataset
|
|
173
|
-
except ImportError:
|
|
174
|
-
print(json.dumps({"ok": False, "error": "Python 'datasets' library not installed. Install with: pip install datasets"}))
|
|
175
|
-
sys.exit(1)
|
|
176
|
-
|
|
177
|
-
try:
|
|
178
|
-
import polars as pl
|
|
179
|
-
except ImportError:
|
|
180
|
-
pl = None
|
|
181
|
-
|
|
182
|
-
try:
|
|
183
|
-
ds, used_split = _load_dataset_robust(repo_id, token, split)
|
|
184
|
-
|
|
185
|
-
if ds is None:
|
|
186
|
-
print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}': {used_split}"}))
|
|
187
|
-
sys.exit(1)
|
|
188
|
-
|
|
189
|
-
# Handle DatasetDict (when no split specified)
|
|
190
|
-
from datasets import DatasetDict, Dataset
|
|
191
|
-
if isinstance(ds, DatasetDict):
|
|
192
|
-
# Pick the best split
|
|
193
|
-
for preferred in ["train", "test", "validation"]:
|
|
194
|
-
if preferred in ds:
|
|
195
|
-
ds = ds[preferred]
|
|
196
|
-
used_split = preferred
|
|
197
|
-
break
|
|
198
|
-
else:
|
|
199
|
-
# Just pick the first available split
|
|
200
|
-
first_key = list(ds.keys())[0]
|
|
201
|
-
ds = ds[first_key]
|
|
202
|
-
used_split = first_key
|
|
203
|
-
|
|
204
|
-
# Limit rows if needed
|
|
205
|
-
total_rows = len(ds)
|
|
206
|
-
if max_rows and total_rows > max_rows:
|
|
207
|
-
ds = ds.select(range(max_rows))
|
|
208
|
-
total_rows = max_rows
|
|
209
|
-
|
|
210
|
-
# Detect and handle image columns (PIL Image objects can't be exported to Parquet)
|
|
211
|
-
image_cols = _detect_image_columns(ds)
|
|
212
|
-
has_images = len(image_cols) > 0
|
|
213
|
-
|
|
214
|
-
if has_images:
|
|
215
|
-
# Strip image columns for tabular export, note them in output
|
|
216
|
-
export_ds = _strip_image_columns(ds, image_cols)
|
|
217
|
-
else:
|
|
218
|
-
export_ds = ds
|
|
219
|
-
|
|
220
|
-
# Ensure output directory exists
|
|
221
|
-
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
222
|
-
|
|
223
|
-
# Export to parquet
|
|
224
|
-
columns = export_ds.column_names
|
|
225
|
-
|
|
226
|
-
try:
|
|
227
|
-
if output_path.endswith(".parquet"):
|
|
228
|
-
export_ds.to_parquet(output_path)
|
|
229
|
-
elif output_path.endswith(".csv"):
|
|
230
|
-
export_ds.to_csv(output_path)
|
|
231
|
-
else:
|
|
232
|
-
# Default to parquet
|
|
233
|
-
if not output_path.endswith(".parquet"):
|
|
234
|
-
output_path = output_path + ".parquet"
|
|
235
|
-
export_ds.to_parquet(output_path)
|
|
236
|
-
except Exception as export_err:
|
|
237
|
-
# If parquet export fails (e.g. complex nested types), try CSV
|
|
238
|
-
csv_path = output_path.replace(".parquet", ".csv")
|
|
239
|
-
try:
|
|
240
|
-
export_ds.to_csv(csv_path)
|
|
241
|
-
output_path = csv_path
|
|
242
|
-
except Exception:
|
|
243
|
-
raise export_err # Re-raise original error
|
|
244
|
-
|
|
245
|
-
result = {
|
|
246
|
-
"ok": True,
|
|
247
|
-
"path": output_path,
|
|
248
|
-
"rows": total_rows,
|
|
249
|
-
"columns": columns,
|
|
250
|
-
"split": used_split
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
if has_images:
|
|
254
|
-
result["image_columns"] = image_cols
|
|
255
|
-
result["note"] = (
|
|
256
|
-
f"This dataset contains image columns ({', '.join(image_cols)}). "
|
|
257
|
-
"Image data was stripped for tabular export. "
|
|
258
|
-
"Use vesper_download_assets with source='huggingface' to download the actual images."
|
|
259
|
-
)
|
|
260
|
-
|
|
261
|
-
print(json.dumps(result))
|
|
262
|
-
|
|
263
|
-
except Exception as e:
|
|
264
|
-
error_msg = str(e)
|
|
265
|
-
# Provide helpful, actionable hints
|
|
266
|
-
if "401" in error_msg or "Unauthorized" in error_msg:
|
|
267
|
-
error_msg = (
|
|
268
|
-
f"Authentication required for dataset '{repo_id}'. "
|
|
269
|
-
"This dataset may be gated or private. "
|
|
270
|
-
"Use the configure_keys tool to set your HF_TOKEN, then retry."
|
|
271
|
-
)
|
|
272
|
-
elif "403" in error_msg or "Forbidden" in error_msg:
|
|
273
|
-
error_msg = (
|
|
274
|
-
f"Access denied for dataset '{repo_id}'. "
|
|
275
|
-
"You may need to accept the dataset's usage agreement on huggingface.co, "
|
|
276
|
-
"then set HF_TOKEN via configure_keys tool."
|
|
277
|
-
)
|
|
278
|
-
elif "gated" in error_msg.lower():
|
|
279
|
-
error_msg = (
|
|
280
|
-
f"Dataset '{repo_id}' is gated. "
|
|
281
|
-
"Visit https://huggingface.co/datasets/{repo_id} to request access, "
|
|
282
|
-
"then set HF_TOKEN via configure_keys tool."
|
|
283
|
-
).format(repo_id=repo_id)
|
|
284
|
-
elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower() or "doesn't exist" in error_msg.lower():
|
|
285
|
-
error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
|
|
286
|
-
elif "script" in error_msg.lower() and "no longer supported" in error_msg.lower():
|
|
287
|
-
error_msg = (
|
|
288
|
-
f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
|
|
289
|
-
"by the current version of the datasets library. "
|
|
290
|
-
"Try: pip install datasets --upgrade, or use an older datasets version."
|
|
291
|
-
)
|
|
292
|
-
|
|
293
|
-
print(json.dumps({"ok": False, "error": error_msg}))
|
|
294
|
-
sys.exit(1)
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
if __name__ == "__main__":
|
|
298
|
-
main()
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
from PIL import Image
|
|
5
|
-
import cv2
|
|
6
|
-
import numpy as np
|
|
7
|
-
|
|
8
|
-
def analyze_image(image_path):
|
|
9
|
-
stats = {
|
|
10
|
-
"path": image_path,
|
|
11
|
-
"filename": os.path.basename(image_path),
|
|
12
|
-
"status": "ok",
|
|
13
|
-
"error": None
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
try:
|
|
17
|
-
# 1. Basic Metadata with Pillow
|
|
18
|
-
img = Image.open(image_path)
|
|
19
|
-
stats["width"], stats["height"] = img.size
|
|
20
|
-
stats["format"] = img.format
|
|
21
|
-
stats["mode"] = img.mode
|
|
22
|
-
|
|
23
|
-
# 2. Advanced Analysis with OpenCV
|
|
24
|
-
cv_img = cv2.imread(image_path)
|
|
25
|
-
if cv_img is None:
|
|
26
|
-
stats["status"] = "corrupted"
|
|
27
|
-
stats["error"] = "OpenCV failed to decode image"
|
|
28
|
-
return stats
|
|
29
|
-
|
|
30
|
-
# Blur detection (Laplacian variance)
|
|
31
|
-
gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
|
|
32
|
-
laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
|
|
33
|
-
stats["blur_score"] = laplacian_var
|
|
34
|
-
stats["is_blurry"] = laplacian_var < 100 # Rule of thumb threshold
|
|
35
|
-
|
|
36
|
-
# Brightness
|
|
37
|
-
stats["brightness"] = np.mean(gray)
|
|
38
|
-
|
|
39
|
-
# Aspect Ratio
|
|
40
|
-
stats["aspect_ratio"] = stats["width"] / stats["height"]
|
|
41
|
-
|
|
42
|
-
except Exception as e:
|
|
43
|
-
stats["status"] = "failed"
|
|
44
|
-
stats["error"] = str(e)
|
|
45
|
-
|
|
46
|
-
return stats
|
|
47
|
-
|
|
48
|
-
def main():
|
|
49
|
-
if len(sys.argv) < 2:
|
|
50
|
-
print(json.dumps({"error": "No path provided"}))
|
|
51
|
-
sys.exit(1)
|
|
52
|
-
|
|
53
|
-
input_path = sys.argv[1]
|
|
54
|
-
results = []
|
|
55
|
-
|
|
56
|
-
if os.path.isfile(input_path):
|
|
57
|
-
results.append(analyze_image(input_path))
|
|
58
|
-
elif os.path.isdir(input_path):
|
|
59
|
-
# Analyze first 50 images for performance in this demo
|
|
60
|
-
valid_exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
|
|
61
|
-
files = [os.path.join(input_path, f) for f in os.listdir(input_path) if f.lower().endswith(valid_exts)]
|
|
62
|
-
for f in files[:50]:
|
|
63
|
-
results.append(analyze_image(f))
|
|
64
|
-
else:
|
|
65
|
-
print(json.dumps({"error": "Invalid path"}))
|
|
66
|
-
sys.exit(1)
|
|
67
|
-
|
|
68
|
-
# Aggregate stats
|
|
69
|
-
if not results:
|
|
70
|
-
print(json.dumps({"error": "No images found"}))
|
|
71
|
-
sys.exit(1)
|
|
72
|
-
|
|
73
|
-
report = {
|
|
74
|
-
"total_images": len(results),
|
|
75
|
-
"corrupted_count": len([r for r in results if r["status"] == "corrupted"]),
|
|
76
|
-
"failed_count": len([r for r in results if r["status"] == "failed"]),
|
|
77
|
-
"average_width": np.mean([r["width"] for r in results if "width" in r]),
|
|
78
|
-
"average_height": np.mean([r["height"] for r in results if "height" in r]),
|
|
79
|
-
"blurry_count": len([r for r in results if r.get("is_blurry")]),
|
|
80
|
-
"individual_results": results
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
print(json.dumps(report))
|
|
84
|
-
|
|
85
|
-
if __name__ == "__main__":
|
|
86
|
-
main()
|
|
@@ -1,295 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import os
|
|
3
|
-
import json
|
|
4
|
-
import tempfile
|
|
5
|
-
from typing import Dict, Any, List
|
|
6
|
-
from config import get_all
|
|
7
|
-
|
|
8
|
-
try:
|
|
9
|
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
10
|
-
HAS_KAGGLE = True
|
|
11
|
-
except Exception:
|
|
12
|
-
HAS_KAGGLE = False
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
IMAGE_EXTENSIONS = {
|
|
16
|
-
".jpg",
|
|
17
|
-
".jpeg",
|
|
18
|
-
".png",
|
|
19
|
-
".webp",
|
|
20
|
-
".bmp",
|
|
21
|
-
".gif",
|
|
22
|
-
".tiff",
|
|
23
|
-
".tif",
|
|
24
|
-
".svg",
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def _ensure_auth() -> Dict[str, Any]:
|
|
29
|
-
if not HAS_KAGGLE:
|
|
30
|
-
return {
|
|
31
|
-
"ok": False,
|
|
32
|
-
"error": "kaggle package not installed. Install with: pip install kaggle",
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
# Priority:
|
|
36
|
-
# 1) secure local store (keyring or ~/.vesper/config.toml)
|
|
37
|
-
# 2) existing env vars
|
|
38
|
-
# 3) ~/.kaggle/kaggle.json handled by KaggleApi.authenticate()
|
|
39
|
-
keys = get_all()
|
|
40
|
-
if keys.get("kaggle_username") and keys.get("kaggle_key"):
|
|
41
|
-
os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"]
|
|
42
|
-
os.environ["KAGGLE_KEY"] = keys["kaggle_key"]
|
|
43
|
-
|
|
44
|
-
api = KaggleApi()
|
|
45
|
-
try:
|
|
46
|
-
api.authenticate()
|
|
47
|
-
except Exception as e:
|
|
48
|
-
return {
|
|
49
|
-
"ok": False,
|
|
50
|
-
"error": "Kaggle requires API key — run 'vespermcp config keys' (30 seconds) or provide ~/.kaggle/kaggle.json",
|
|
51
|
-
"details": str(e),
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
return {"ok": True, "api": api}
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def _dataset_to_dict(ds) -> Dict[str, Any]:
|
|
58
|
-
# kaggle API object fields differ by version; use getattr defensively
|
|
59
|
-
ref = getattr(ds, "ref", None) or getattr(ds, "datasetRef", None) or ""
|
|
60
|
-
title = getattr(ds, "title", None) or ref
|
|
61
|
-
subtitle = getattr(ds, "subtitle", None) or ""
|
|
62
|
-
owner = getattr(ds, "creatorName", None) or getattr(ds, "ownerName", None) or ""
|
|
63
|
-
votes = int(getattr(ds, "voteCount", 0) or 0)
|
|
64
|
-
downloads = int(getattr(ds, "downloadCount", 0) or 0)
|
|
65
|
-
size = int(getattr(ds, "totalBytes", 0) or 0)
|
|
66
|
-
last_updated = str(getattr(ds, "lastUpdated", ""))
|
|
67
|
-
tags = []
|
|
68
|
-
raw_tags = getattr(ds, "tags", None)
|
|
69
|
-
if raw_tags:
|
|
70
|
-
for t in raw_tags:
|
|
71
|
-
tags.append(getattr(t, "name", str(t)))
|
|
72
|
-
|
|
73
|
-
return {
|
|
74
|
-
"id": ref,
|
|
75
|
-
"source": "kaggle",
|
|
76
|
-
"name": title,
|
|
77
|
-
"description": subtitle or title,
|
|
78
|
-
"downloads": downloads,
|
|
79
|
-
"likes": votes,
|
|
80
|
-
"stars": 0,
|
|
81
|
-
"tags": tags,
|
|
82
|
-
"last_updated": last_updated,
|
|
83
|
-
"task": "unknown",
|
|
84
|
-
"domain": "unknown",
|
|
85
|
-
"languages": [],
|
|
86
|
-
"splits": [{"name": "data", "num_examples": 0, "size_bytes": size}],
|
|
87
|
-
"license": {
|
|
88
|
-
"id": "unknown",
|
|
89
|
-
"name": "unknown",
|
|
90
|
-
"category": "unknown",
|
|
91
|
-
"usage_restrictions": [],
|
|
92
|
-
"warnings": ["Kaggle license details may vary by dataset"],
|
|
93
|
-
},
|
|
94
|
-
"quality_score": 40,
|
|
95
|
-
"quality_warnings": ["Review dataset card and competition rules before use"],
|
|
96
|
-
"download_url": f"https://www.kaggle.com/datasets/{ref}",
|
|
97
|
-
"format": None,
|
|
98
|
-
"total_examples": 0,
|
|
99
|
-
"total_size_bytes": size,
|
|
100
|
-
"total_size_mb": round(size / (1024 * 1024), 2) if size else 0,
|
|
101
|
-
"columns": [],
|
|
102
|
-
"is_structured": False,
|
|
103
|
-
"has_target_column": False,
|
|
104
|
-
"is_safe_source": True,
|
|
105
|
-
"has_personal_data": False,
|
|
106
|
-
"is_paywalled": False,
|
|
107
|
-
"is_scraped_web_data": False,
|
|
108
|
-
"uses_https": True,
|
|
109
|
-
"has_train_split": False,
|
|
110
|
-
"has_test_split": False,
|
|
111
|
-
"has_validation_split": False,
|
|
112
|
-
"description_length": len(subtitle or title),
|
|
113
|
-
"has_readme": True,
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
118
|
-
auth = _ensure_auth()
|
|
119
|
-
if not auth.get("ok"):
|
|
120
|
-
return auth
|
|
121
|
-
|
|
122
|
-
api: KaggleApi = auth["api"]
|
|
123
|
-
try:
|
|
124
|
-
desired = max(1, min(limit, 100))
|
|
125
|
-
|
|
126
|
-
try:
|
|
127
|
-
datasets = api.dataset_list(search=query, page_size=desired)
|
|
128
|
-
items = [_dataset_to_dict(ds) for ds in datasets[:limit]]
|
|
129
|
-
return {"ok": True, "results": items, "count": len(items)}
|
|
130
|
-
except TypeError:
|
|
131
|
-
pass
|
|
132
|
-
|
|
133
|
-
collected = []
|
|
134
|
-
page = 1
|
|
135
|
-
while len(collected) < limit:
|
|
136
|
-
page_items = api.dataset_list(search=query, page=page)
|
|
137
|
-
if not page_items:
|
|
138
|
-
break
|
|
139
|
-
|
|
140
|
-
collected.extend(page_items)
|
|
141
|
-
if len(page_items) < 20:
|
|
142
|
-
break
|
|
143
|
-
page += 1
|
|
144
|
-
|
|
145
|
-
items = [_dataset_to_dict(ds) for ds in collected[:limit]]
|
|
146
|
-
return {"ok": True, "results": items, "count": len(items)}
|
|
147
|
-
except Exception as e:
|
|
148
|
-
return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def _find_image_files(root: str) -> List[str]:
|
|
152
|
-
image_files: List[str] = []
|
|
153
|
-
for base, _, files in os.walk(root):
|
|
154
|
-
for name in files:
|
|
155
|
-
full = os.path.join(base, name)
|
|
156
|
-
if os.path.splitext(name)[1].lower() in IMAGE_EXTENSIONS:
|
|
157
|
-
image_files.append(full)
|
|
158
|
-
image_files.sort()
|
|
159
|
-
return image_files
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
|
|
163
|
-
relative_path = os.path.relpath(full_path, root).replace("\\", "/")
|
|
164
|
-
parent_dir = os.path.dirname(relative_path)
|
|
165
|
-
parts = [part for part in parent_dir.split("/") if part and part != "."]
|
|
166
|
-
|
|
167
|
-
split = None
|
|
168
|
-
label = None
|
|
169
|
-
if parts:
|
|
170
|
-
first = parts[0].lower()
|
|
171
|
-
if first in {"train", "test", "val", "valid", "validation"}:
|
|
172
|
-
split = parts[0]
|
|
173
|
-
if len(parts) > 1:
|
|
174
|
-
label = parts[-1]
|
|
175
|
-
else:
|
|
176
|
-
label = parts[-1]
|
|
177
|
-
|
|
178
|
-
record: Dict[str, Any] = {
|
|
179
|
-
"id": index,
|
|
180
|
-
"image_path": os.path.abspath(full_path),
|
|
181
|
-
"relative_path": relative_path,
|
|
182
|
-
"file_name": os.path.basename(full_path),
|
|
183
|
-
"extension": os.path.splitext(full_path)[1].lower().lstrip("."),
|
|
184
|
-
}
|
|
185
|
-
if split:
|
|
186
|
-
record["split"] = split
|
|
187
|
-
if label:
|
|
188
|
-
record["label"] = label
|
|
189
|
-
return record
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def _write_image_manifest(root: str, image_files: List[str]) -> str:
    """Write a JSONL manifest (one record per image) under *root*; return its path."""
    manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
    with open(manifest_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(_infer_image_record(root, path, idx), ensure_ascii=False) + "\n"
            for idx, path in enumerate(image_files)
        )
    return manifest_path
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
def _pick_best_file(root: str) -> Dict[str, Any]:
    """Choose the primary data artifact inside a downloaded dataset directory.

    Prefers tabular files (parquet > csv > jsonl > json > feather > arrow);
    when none exist, falls back to writing an image manifest for image-only
    datasets.

    Raises:
        RuntimeError: if neither a tabular file nor any images are found.
    """
    priorities = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow"]
    data_exts = tuple(priorities)
    candidates: List[str] = []
    for dirpath, _, filenames in os.walk(root):
        candidates.extend(
            os.path.join(dirpath, fname)
            for fname in filenames
            if fname.lower().endswith(data_exts)
        )

    if not candidates:
        images = _find_image_files(root)
        if not images:
            raise RuntimeError("No suitable data file found after download")
        manifest_path = _write_image_manifest(root, images)
        return {
            "local_path": manifest_path,
            "dataset_kind": "image-manifest",
            "image_count": len(images),
        }

    # Pick by extension priority; fall back to the first candidate found.
    for ext in priorities:
        for path in candidates:
            if path.lower().endswith(ext):
                return {"local_path": path, "dataset_kind": "tabular", "image_count": 0}
    return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download a Kaggle dataset and locate its primary data artifact.

    Args:
        dataset_ref: "owner/dataset" slug, or a full kaggle.com/datasets URL.
        target_dir: destination directory; a temp dir is created when empty.

    Returns:
        On success: dict with ok, dataset_id, target_dir, local_path,
        dataset_kind and image_count. On failure: ok=False plus an error
        message.
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]

    if not target_dir:
        target_dir = tempfile.mkdtemp(prefix="vesper_kaggle_")

    os.makedirs(target_dir, exist_ok=True)

    try:
        # Accept full dataset URLs: reduce to the "owner/dataset" slug.
        # Fix: also drop any query string, fragment, or trailing slash —
        # e.g. ".../datasets/owner/name?select=file.csv" previously produced
        # an invalid ref that the API rejected.
        if "kaggle.com/datasets/" in dataset_ref:
            dataset_ref = dataset_ref.split("kaggle.com/datasets/")[1].lstrip("/")
            dataset_ref = dataset_ref.split("?")[0].split("#")[0].rstrip("/")

        # unzip in place, remove zip for convenience
        api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
        artifact = _pick_best_file(target_dir)
        return {
            "ok": True,
            "dataset_id": dataset_ref,
            "target_dir": target_dir,
            "local_path": artifact["local_path"],
            "dataset_kind": artifact["dataset_kind"],
            "image_count": artifact.get("image_count", 0),
        }
    except Exception as e:
        msg = str(e)
        # Map the most common API failures to actionable messages.
        if "401" in msg or "Unauthorized" in msg:
            return {"ok": False, "error": "Invalid Kaggle credentials (401). Run 'vespermcp config kaggle' again."}
        if "429" in msg or "Too Many Requests" in msg:
            return {"ok": False, "error": "Kaggle rate limit reached. Please retry later."}
        return {"ok": False, "error": f"Kaggle download failed: {msg}"}
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
def main():
    """CLI entry point: dispatch 'discover' or 'download', print a JSON result.

    Exits with status 1 (after printing a JSON error object) on usage errors
    or unknown commands.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py <discover|download> ..."}))
        sys.exit(1)

    command = sys.argv[1]

    if command == "discover":
        if len(sys.argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py discover <query> [limit]"}))
            sys.exit(1)
        query = sys.argv[2]
        # Fix: a malformed limit should yield a JSON error like every other
        # failure in this script, not an uncaught ValueError traceback.
        try:
            limit = int(sys.argv[3]) if len(sys.argv) > 3 else 20
        except ValueError:
            print(json.dumps({"ok": False, "error": f"Invalid limit: {sys.argv[3]}"}))
            sys.exit(1)
        print(json.dumps(discover(query, limit)))
        return

    if command == "download":
        if len(sys.argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py download <dataset_ref> [target_dir]"}))
            sys.exit(1)
        dataset_ref = sys.argv[2]
        target_dir = sys.argv[3] if len(sys.argv) > 3 else ""
        print(json.dumps(download(dataset_ref, target_dir)))
        return

    print(json.dumps({"ok": False, "error": f"Unknown command: {command}"}))
    sys.exit(1)
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
# Script entry point: run the CLI dispatcher only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|