vesper-wizard 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,679 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import json
|
|
5
|
-
import mimetypes
|
|
6
|
-
import os
|
|
7
|
-
import shutil
|
|
8
|
-
import tempfile
|
|
9
|
-
from dataclasses import dataclass
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
|
|
12
|
-
from urllib.parse import urlparse
|
|
13
|
-
|
|
14
|
-
import aiohttp
|
|
15
|
-
|
|
16
|
-
from vesper.core.download_recipe import get_download_recipe
|
|
17
|
-
|
|
18
|
-
try:
|
|
19
|
-
import aiofiles
|
|
20
|
-
except Exception: # pragma: no cover
|
|
21
|
-
aiofiles = None
|
|
22
|
-
|
|
23
|
-
try:
|
|
24
|
-
import webdataset as wds
|
|
25
|
-
except Exception: # pragma: no cover
|
|
26
|
-
wds = None
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@dataclass
|
|
33
|
-
class DownloadResult:
|
|
34
|
-
dataset_id: str
|
|
35
|
-
source: str
|
|
36
|
-
output_dir: str
|
|
37
|
-
downloaded_assets: int
|
|
38
|
-
failed_assets: int
|
|
39
|
-
errors_file: str
|
|
40
|
-
metadata_file: str
|
|
41
|
-
output_format: str
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class AssetDownloader:
|
|
45
|
-
def __init__(
|
|
46
|
-
self,
|
|
47
|
-
output_root: str,
|
|
48
|
-
workers: int = 8,
|
|
49
|
-
recipes_dir: Optional[str] = None,
|
|
50
|
-
progress_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None] | None]] = None,
|
|
51
|
-
) -> None:
|
|
52
|
-
self.output_root = Path(output_root)
|
|
53
|
-
self.workers = max(1, min(workers, 32))
|
|
54
|
-
self.recipes_dir = recipes_dir
|
|
55
|
-
self.progress_callback = progress_callback
|
|
56
|
-
|
|
57
|
-
async def _emit(self, stage: str, payload: Dict[str, Any]) -> None:
|
|
58
|
-
if not self.progress_callback:
|
|
59
|
-
return
|
|
60
|
-
maybe = self.progress_callback(stage, payload)
|
|
61
|
-
if asyncio.iscoroutine(maybe):
|
|
62
|
-
await maybe
|
|
63
|
-
|
|
64
|
-
@staticmethod
|
|
65
|
-
def _hydrate_kaggle_credentials() -> None:
|
|
66
|
-
try:
|
|
67
|
-
from config import get_all # type: ignore
|
|
68
|
-
keys = get_all() or {}
|
|
69
|
-
except Exception:
|
|
70
|
-
keys = {}
|
|
71
|
-
|
|
72
|
-
username = keys.get("kaggle_username") or os.getenv("KAGGLE_USERNAME")
|
|
73
|
-
key = keys.get("kaggle_key") or os.getenv("KAGGLE_KEY")
|
|
74
|
-
|
|
75
|
-
if username:
|
|
76
|
-
os.environ["KAGGLE_USERNAME"] = str(username)
|
|
77
|
-
if key:
|
|
78
|
-
os.environ["KAGGLE_KEY"] = str(key)
|
|
79
|
-
|
|
80
|
-
username = os.getenv("KAGGLE_USERNAME")
|
|
81
|
-
key = os.getenv("KAGGLE_KEY")
|
|
82
|
-
if not username or not key:
|
|
83
|
-
return
|
|
84
|
-
|
|
85
|
-
kaggle_dir = Path.home() / ".kaggle"
|
|
86
|
-
kaggle_file = kaggle_dir / "kaggle.json"
|
|
87
|
-
try:
|
|
88
|
-
kaggle_dir.mkdir(parents=True, exist_ok=True)
|
|
89
|
-
kaggle_file.write_text(
|
|
90
|
-
json.dumps({"username": username, "key": key}, ensure_ascii=False),
|
|
91
|
-
encoding="utf-8",
|
|
92
|
-
)
|
|
93
|
-
try:
|
|
94
|
-
os.chmod(kaggle_file, 0o600)
|
|
95
|
-
except Exception:
|
|
96
|
-
pass
|
|
97
|
-
except Exception:
|
|
98
|
-
pass
|
|
99
|
-
|
|
100
|
-
@staticmethod
|
|
101
|
-
def find_image_column(dataset: Any) -> Optional[str]:
|
|
102
|
-
"""Auto-detect the image column in a HuggingFace dataset.
|
|
103
|
-
|
|
104
|
-
Detection strategy (in priority order):
|
|
105
|
-
1. HF Feature type: columns with Image() feature type
|
|
106
|
-
2. Known column names: 'image', 'img', 'photo', 'image_url', etc.
|
|
107
|
-
3. URL pattern detection: columns containing image URLs (http(s)://...jpg)
|
|
108
|
-
4. Path pattern detection: columns with file paths ending in image extensions
|
|
109
|
-
"""
|
|
110
|
-
# Strategy 1: Check HF Feature types (most reliable)
|
|
111
|
-
features = getattr(dataset, "features", None)
|
|
112
|
-
if features:
|
|
113
|
-
for name, feature in features.items():
|
|
114
|
-
feat_cls = feature.__class__.__name__.lower()
|
|
115
|
-
feat_str = str(feature).lower()
|
|
116
|
-
if feat_cls == "image" or "image(" in feat_str:
|
|
117
|
-
return str(name)
|
|
118
|
-
|
|
119
|
-
# Strategy 2: Check known column names
|
|
120
|
-
cols = getattr(dataset, "column_names", []) or []
|
|
121
|
-
|
|
122
|
-
# Exact match first (highest priority names)
|
|
123
|
-
priority_exact = ["image", "img", "photo", "picture", "images"]
|
|
124
|
-
for c in priority_exact:
|
|
125
|
-
if c in cols:
|
|
126
|
-
return c
|
|
127
|
-
|
|
128
|
-
# Partial match (column names containing image-related keywords)
|
|
129
|
-
priority_partial = [
|
|
130
|
-
"image_path", "image_url", "img_path", "img_url",
|
|
131
|
-
"image_file", "file_name", "filepath", "filename",
|
|
132
|
-
"photo_url", "picture_url", "thumbnail",
|
|
133
|
-
"url", "path", "file",
|
|
134
|
-
]
|
|
135
|
-
for target in priority_partial:
|
|
136
|
-
for c in cols:
|
|
137
|
-
if c.lower() == target:
|
|
138
|
-
return c
|
|
139
|
-
|
|
140
|
-
# Strategy 3: Sample values to detect URL/path patterns
|
|
141
|
-
try:
|
|
142
|
-
sample_size = min(5, len(dataset)) if hasattr(dataset, "__len__") else 5
|
|
143
|
-
if sample_size > 0:
|
|
144
|
-
for c in cols:
|
|
145
|
-
is_image_col = False
|
|
146
|
-
for i in range(sample_size):
|
|
147
|
-
try:
|
|
148
|
-
val = dataset[i][c]
|
|
149
|
-
except Exception:
|
|
150
|
-
break
|
|
151
|
-
|
|
152
|
-
if val is None:
|
|
153
|
-
continue
|
|
154
|
-
|
|
155
|
-
# PIL Image object
|
|
156
|
-
if hasattr(val, "save") and hasattr(val, "size"):
|
|
157
|
-
is_image_col = True
|
|
158
|
-
break
|
|
159
|
-
|
|
160
|
-
# Dict with image data
|
|
161
|
-
if isinstance(val, dict) and any(k in val for k in ("bytes", "path", "url")):
|
|
162
|
-
is_image_col = True
|
|
163
|
-
break
|
|
164
|
-
|
|
165
|
-
# String: URL or file path
|
|
166
|
-
if isinstance(val, str):
|
|
167
|
-
val_lower = val.lower()
|
|
168
|
-
# Check for image URLs
|
|
169
|
-
if val_lower.startswith(("http://", "https://")) and any(
|
|
170
|
-
ext in val_lower.split("?")[0] for ext in IMAGE_EXTENSIONS
|
|
171
|
-
):
|
|
172
|
-
is_image_col = True
|
|
173
|
-
break
|
|
174
|
-
# Check for file paths with image extensions
|
|
175
|
-
if any(val_lower.endswith(ext) for ext in IMAGE_EXTENSIONS):
|
|
176
|
-
is_image_col = True
|
|
177
|
-
break
|
|
178
|
-
|
|
179
|
-
if is_image_col:
|
|
180
|
-
return c
|
|
181
|
-
except Exception:
|
|
182
|
-
pass
|
|
183
|
-
|
|
184
|
-
return None
|
|
185
|
-
|
|
186
|
-
async def download_assets(
|
|
187
|
-
self,
|
|
188
|
-
dataset_id: str,
|
|
189
|
-
source: Optional[str] = None,
|
|
190
|
-
repo_id: Optional[str] = None,
|
|
191
|
-
kaggle_ref: Optional[str] = None,
|
|
192
|
-
urls: Optional[List[str]] = None,
|
|
193
|
-
output_format: str = "webdataset",
|
|
194
|
-
output_dir: Optional[str] = None,
|
|
195
|
-
max_items: Optional[int] = None,
|
|
196
|
-
image_column: Optional[str] = None,
|
|
197
|
-
) -> Dict[str, Any]:
|
|
198
|
-
recipe = get_download_recipe(dataset_id, self.recipes_dir)
|
|
199
|
-
if recipe:
|
|
200
|
-
source = source or recipe.get("source")
|
|
201
|
-
repo_id = repo_id or recipe.get("repo_id")
|
|
202
|
-
image_column = image_column or recipe.get("image_column")
|
|
203
|
-
|
|
204
|
-
source = (source or "").lower()
|
|
205
|
-
if source not in {"huggingface", "kaggle", "url"}:
|
|
206
|
-
raise ValueError("source must be one of: huggingface, kaggle, url")
|
|
207
|
-
|
|
208
|
-
# --- Validate imports and args BEFORE creating any directories ---
|
|
209
|
-
if source == "huggingface":
|
|
210
|
-
if not repo_id:
|
|
211
|
-
raise ValueError("repo_id is required for source=huggingface")
|
|
212
|
-
try:
|
|
213
|
-
from datasets import load_dataset as _ld # noqa: F401
|
|
214
|
-
except Exception as e:
|
|
215
|
-
raise RuntimeError(
|
|
216
|
-
f"datasets package is required for HuggingFace downloads. "
|
|
217
|
-
f"Install with: pip install datasets. Details: {e}"
|
|
218
|
-
)
|
|
219
|
-
elif source == "kaggle":
|
|
220
|
-
ref = kaggle_ref or repo_id
|
|
221
|
-
if not ref:
|
|
222
|
-
raise ValueError("kaggle_ref is required for source=kaggle")
|
|
223
|
-
try:
|
|
224
|
-
from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
|
|
225
|
-
except Exception as e:
|
|
226
|
-
raise RuntimeError(
|
|
227
|
-
f"kaggle package is required for Kaggle downloads. "
|
|
228
|
-
f"Install with: pip install kaggle. Details: {e}"
|
|
229
|
-
)
|
|
230
|
-
else:
|
|
231
|
-
if not urls:
|
|
232
|
-
raise ValueError("urls are required for source=url")
|
|
233
|
-
|
|
234
|
-
# --- Now safe to create directories ---
|
|
235
|
-
if output_dir:
|
|
236
|
-
dataset_dir = Path(output_dir).expanduser().resolve()
|
|
237
|
-
else:
|
|
238
|
-
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
239
|
-
images_dir = dataset_dir / "images"
|
|
240
|
-
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
241
|
-
images_dir.mkdir(parents=True, exist_ok=True)
|
|
242
|
-
|
|
243
|
-
errors_file = dataset_dir / "errors.jsonl"
|
|
244
|
-
metadata_file = dataset_dir / "metadata.jsonl"
|
|
245
|
-
|
|
246
|
-
try:
|
|
247
|
-
if source == "huggingface":
|
|
248
|
-
summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
|
|
249
|
-
elif source == "kaggle":
|
|
250
|
-
ref = kaggle_ref or repo_id
|
|
251
|
-
summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
252
|
-
else:
|
|
253
|
-
summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
254
|
-
except Exception:
|
|
255
|
-
# Clean up empty directories on failure so we don't leave ghost artifacts
|
|
256
|
-
if images_dir.exists() and not any(images_dir.iterdir()):
|
|
257
|
-
shutil.rmtree(dataset_dir, ignore_errors=True)
|
|
258
|
-
raise
|
|
259
|
-
|
|
260
|
-
if output_format == "webdataset":
|
|
261
|
-
await self._write_webdataset(dataset_dir, images_dir, metadata_file)
|
|
262
|
-
elif output_format == "parquet":
|
|
263
|
-
await self._write_parquet(dataset_dir, metadata_file)
|
|
264
|
-
|
|
265
|
-
result = DownloadResult(
|
|
266
|
-
dataset_id=dataset_id,
|
|
267
|
-
source=source,
|
|
268
|
-
output_dir=str(dataset_dir),
|
|
269
|
-
downloaded_assets=summary["downloaded"],
|
|
270
|
-
failed_assets=summary["failed"],
|
|
271
|
-
errors_file=str(errors_file),
|
|
272
|
-
metadata_file=str(metadata_file),
|
|
273
|
-
output_format=output_format,
|
|
274
|
-
)
|
|
275
|
-
return result.__dict__
|
|
276
|
-
|
|
277
|
-
async def _download_huggingface(
|
|
278
|
-
self,
|
|
279
|
-
repo_id: str,
|
|
280
|
-
dataset_id: str,
|
|
281
|
-
images_dir: Path,
|
|
282
|
-
metadata_file: Path,
|
|
283
|
-
errors_file: Path,
|
|
284
|
-
max_items: Optional[int],
|
|
285
|
-
image_column: Optional[str],
|
|
286
|
-
) -> Dict[str, int]:
|
|
287
|
-
from datasets import load_dataset # validated in download_assets()
|
|
288
|
-
import warnings
|
|
289
|
-
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
290
|
-
|
|
291
|
-
await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
|
|
292
|
-
|
|
293
|
-
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
|
|
294
|
-
|
|
295
|
-
# Try loading with multiple strategies
|
|
296
|
-
ds = None
|
|
297
|
-
load_errors = []
|
|
298
|
-
|
|
299
|
-
for trust_rc in [True, False]:
|
|
300
|
-
for split_name in ["train", "test", "validation"]:
|
|
301
|
-
try:
|
|
302
|
-
kwargs = {"path": repo_id, "split": split_name}
|
|
303
|
-
if trust_rc:
|
|
304
|
-
kwargs["trust_remote_code"] = True
|
|
305
|
-
if token:
|
|
306
|
-
kwargs["token"] = token
|
|
307
|
-
ds = load_dataset(**kwargs)
|
|
308
|
-
break
|
|
309
|
-
except Exception as e:
|
|
310
|
-
msg = str(e)
|
|
311
|
-
# Immediately raise auth errors
|
|
312
|
-
if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
|
|
313
|
-
raise RuntimeError(
|
|
314
|
-
f"Authentication required for '{repo_id}'. "
|
|
315
|
-
"This dataset may be gated or private. "
|
|
316
|
-
"Use the configure_keys tool to set HF_TOKEN, then retry."
|
|
317
|
-
)
|
|
318
|
-
load_errors.append(msg)
|
|
319
|
-
continue
|
|
320
|
-
if ds is not None:
|
|
321
|
-
break
|
|
322
|
-
|
|
323
|
-
# Fallback: load without split
|
|
324
|
-
if ds is None:
|
|
325
|
-
try:
|
|
326
|
-
kwargs = {"path": repo_id, "trust_remote_code": True}
|
|
327
|
-
if token:
|
|
328
|
-
kwargs["token"] = token
|
|
329
|
-
dd = load_dataset(**kwargs)
|
|
330
|
-
from datasets import DatasetDict
|
|
331
|
-
if isinstance(dd, DatasetDict):
|
|
332
|
-
first_split = list(dd.keys())[0]
|
|
333
|
-
ds = dd[first_split]
|
|
334
|
-
else:
|
|
335
|
-
ds = dd
|
|
336
|
-
except Exception as e:
|
|
337
|
-
msg = str(e)
|
|
338
|
-
if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
|
|
339
|
-
raise RuntimeError(
|
|
340
|
-
f"Authentication required for '{repo_id}'. "
|
|
341
|
-
"Use the configure_keys tool to set HF_TOKEN, then retry."
|
|
342
|
-
)
|
|
343
|
-
combined = "; ".join(load_errors[:3])
|
|
344
|
-
raise RuntimeError(
|
|
345
|
-
f"Failed to load HuggingFace dataset '{repo_id}': {msg}. "
|
|
346
|
-
f"Previous attempts: {combined}"
|
|
347
|
-
)
|
|
348
|
-
|
|
349
|
-
col = image_column or self.find_image_column(ds)
|
|
350
|
-
if not col:
|
|
351
|
-
raise RuntimeError(
|
|
352
|
-
f"No image column detected in HuggingFace dataset '{repo_id}'. "
|
|
353
|
-
"Available columns: " + ", ".join(getattr(ds, "column_names", [])) + ". "
|
|
354
|
-
"Provide image_column parameter explicitly."
|
|
355
|
-
)
|
|
356
|
-
|
|
357
|
-
total = len(ds) if hasattr(ds, "__len__") else 0
|
|
358
|
-
target = min(total, max_items) if max_items and total else (max_items or total or 0)
|
|
359
|
-
|
|
360
|
-
downloaded = 0
|
|
361
|
-
failed = 0
|
|
362
|
-
|
|
363
|
-
# Create an aiohttp session for URL-based images
|
|
364
|
-
session = None
|
|
365
|
-
|
|
366
|
-
try:
|
|
367
|
-
with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
|
|
368
|
-
for idx, row in enumerate(ds):
|
|
369
|
-
if max_items and idx >= max_items:
|
|
370
|
-
break
|
|
371
|
-
try:
|
|
372
|
-
out_name = f"{idx:08d}.jpg"
|
|
373
|
-
out_path = images_dir / out_name
|
|
374
|
-
value = row.get(col)
|
|
375
|
-
|
|
376
|
-
# Handle URL-based images inline
|
|
377
|
-
if isinstance(value, dict) and value.get("url") and not value.get("bytes") and not value.get("path"):
|
|
378
|
-
url = value["url"]
|
|
379
|
-
if session is None:
|
|
380
|
-
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
|
|
381
|
-
await self._download_image_from_url(session, url, out_path)
|
|
382
|
-
elif isinstance(value, str) and value.startswith(("http://", "https://")):
|
|
383
|
-
if session is None:
|
|
384
|
-
session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
|
|
385
|
-
await self._download_image_from_url(session, value, out_path)
|
|
386
|
-
else:
|
|
387
|
-
self._save_image_value(value, out_path)
|
|
388
|
-
|
|
389
|
-
record = {
|
|
390
|
-
"dataset_id": dataset_id,
|
|
391
|
-
"index": idx,
|
|
392
|
-
"image_path": str(out_path),
|
|
393
|
-
"source": "huggingface",
|
|
394
|
-
"repo_id": repo_id,
|
|
395
|
-
}
|
|
396
|
-
mf.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
397
|
-
downloaded += 1
|
|
398
|
-
if downloaded % 50 == 0:
|
|
399
|
-
await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
|
|
400
|
-
except Exception as e:
|
|
401
|
-
failed += 1
|
|
402
|
-
ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
|
|
403
|
-
finally:
|
|
404
|
-
if session is not None:
|
|
405
|
-
await session.close()
|
|
406
|
-
|
|
407
|
-
await self._emit("done", {"downloaded": downloaded, "failed": failed})
|
|
408
|
-
return {"downloaded": downloaded, "failed": failed}
|
|
409
|
-
|
|
410
|
-
async def _download_image_from_url(self, session: aiohttp.ClientSession, url: str, out_path: Path) -> None:
|
|
411
|
-
"""Download an image from a URL to a local path."""
|
|
412
|
-
async with session.get(url) as response:
|
|
413
|
-
if response.status != 200:
|
|
414
|
-
raise RuntimeError(f"HTTP {response.status} downloading {url}")
|
|
415
|
-
data = await response.read()
|
|
416
|
-
if not data:
|
|
417
|
-
raise RuntimeError(f"Empty response from {url}")
|
|
418
|
-
out_path.write_bytes(data)
|
|
419
|
-
|
|
420
|
-
async def _download_kaggle(
|
|
421
|
-
self,
|
|
422
|
-
kaggle_ref: str,
|
|
423
|
-
dataset_id: str,
|
|
424
|
-
images_dir: Path,
|
|
425
|
-
metadata_file: Path,
|
|
426
|
-
errors_file: Path,
|
|
427
|
-
max_items: Optional[int],
|
|
428
|
-
) -> Dict[str, int]:
|
|
429
|
-
from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
|
|
430
|
-
|
|
431
|
-
await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
|
|
432
|
-
|
|
433
|
-
self._hydrate_kaggle_credentials()
|
|
434
|
-
|
|
435
|
-
api = KaggleApi()
|
|
436
|
-
try:
|
|
437
|
-
api.authenticate()
|
|
438
|
-
except Exception as e:
|
|
439
|
-
raise RuntimeError(
|
|
440
|
-
"Kaggle authentication failed. Run 'configure_kaggle' or 'configure_keys' with "
|
|
441
|
-
"kaggle_username and kaggle_key, then retry. "
|
|
442
|
-
f"Details: {e}"
|
|
443
|
-
)
|
|
444
|
-
|
|
445
|
-
tmp_dir = Path(tempfile.mkdtemp(prefix="vesper_kaggle_assets_"))
|
|
446
|
-
downloaded = 0
|
|
447
|
-
failed = 0
|
|
448
|
-
|
|
449
|
-
try:
|
|
450
|
-
api.dataset_download_files(kaggle_ref, path=str(tmp_dir), unzip=True, quiet=True)
|
|
451
|
-
candidates = [p for p in tmp_dir.rglob("*") if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS]
|
|
452
|
-
if max_items:
|
|
453
|
-
candidates = candidates[:max_items]
|
|
454
|
-
|
|
455
|
-
with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
|
|
456
|
-
for idx, src_path in enumerate(candidates):
|
|
457
|
-
try:
|
|
458
|
-
out_name = f"{idx:08d}{src_path.suffix.lower()}"
|
|
459
|
-
out_path = images_dir / out_name
|
|
460
|
-
shutil.copy2(src_path, out_path)
|
|
461
|
-
record = {
|
|
462
|
-
"dataset_id": dataset_id,
|
|
463
|
-
"index": idx,
|
|
464
|
-
"image_path": str(out_path),
|
|
465
|
-
"source": "kaggle",
|
|
466
|
-
"repo_id": kaggle_ref,
|
|
467
|
-
}
|
|
468
|
-
mf.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
469
|
-
downloaded += 1
|
|
470
|
-
except Exception as e:
|
|
471
|
-
failed += 1
|
|
472
|
-
ef.write(json.dumps({"file": str(src_path), "error": str(e)}, ensure_ascii=False) + "\n")
|
|
473
|
-
finally:
|
|
474
|
-
shutil.rmtree(tmp_dir, ignore_errors=True)
|
|
475
|
-
|
|
476
|
-
await self._emit("done", {"downloaded": downloaded, "failed": failed})
|
|
477
|
-
return {"downloaded": downloaded, "failed": failed}
|
|
478
|
-
|
|
479
|
-
async def _download_urls(
|
|
480
|
-
self,
|
|
481
|
-
urls: List[str],
|
|
482
|
-
dataset_id: str,
|
|
483
|
-
images_dir: Path,
|
|
484
|
-
metadata_file: Path,
|
|
485
|
-
errors_file: Path,
|
|
486
|
-
max_items: Optional[int],
|
|
487
|
-
) -> Dict[str, int]:
|
|
488
|
-
if aiofiles is None:
|
|
489
|
-
raise RuntimeError("aiofiles is required for URL downloads. Install with: pip install aiofiles")
|
|
490
|
-
|
|
491
|
-
selected = urls[:max_items] if max_items else urls
|
|
492
|
-
sem = asyncio.Semaphore(self.workers)
|
|
493
|
-
|
|
494
|
-
downloaded = 0
|
|
495
|
-
failed = 0
|
|
496
|
-
metadata_lock = asyncio.Lock()
|
|
497
|
-
|
|
498
|
-
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=180)) as session:
|
|
499
|
-
async def worker(idx: int, url: str) -> None:
|
|
500
|
-
nonlocal downloaded, failed
|
|
501
|
-
async with sem:
|
|
502
|
-
try:
|
|
503
|
-
local_path = await self._download_one_url(session, idx, url, images_dir)
|
|
504
|
-
async with metadata_lock:
|
|
505
|
-
async with aiofiles.open(metadata_file, "a", encoding="utf-8") as mf:
|
|
506
|
-
await mf.write(json.dumps({
|
|
507
|
-
"dataset_id": dataset_id,
|
|
508
|
-
"index": idx,
|
|
509
|
-
"image_path": str(local_path),
|
|
510
|
-
"source": "url",
|
|
511
|
-
"url": url,
|
|
512
|
-
}, ensure_ascii=False) + "\n")
|
|
513
|
-
downloaded += 1
|
|
514
|
-
except Exception as e:
|
|
515
|
-
failed += 1
|
|
516
|
-
async with metadata_lock:
|
|
517
|
-
async with aiofiles.open(errors_file, "a", encoding="utf-8") as ef:
|
|
518
|
-
await ef.write(json.dumps({"index": idx, "url": url, "error": str(e)}, ensure_ascii=False) + "\n")
|
|
519
|
-
|
|
520
|
-
tasks = [asyncio.create_task(worker(i, u)) for i, u in enumerate(selected)]
|
|
521
|
-
await asyncio.gather(*tasks)
|
|
522
|
-
|
|
523
|
-
await self._emit("done", {"downloaded": downloaded, "failed": failed})
|
|
524
|
-
return {"downloaded": downloaded, "failed": failed}
|
|
525
|
-
|
|
526
|
-
async def _download_one_url(self, session: aiohttp.ClientSession, idx: int, url: str, images_dir: Path) -> Path:
    """Download one image URL into *images_dir*, resuming partial files.

    The output filename is the zero-padded index plus the URL's extension
    (falling back to ``.jpg`` when the extension is not a known image
    type). When a partial file already exists a ``Range`` request is sent
    and, if the server honours it (HTTP 206), the download appends to the
    existing bytes instead of restarting.

    Args:
        session: shared aiohttp session used for the request.
        idx: sample index, used to build the output filename.
        url: source image URL.
        images_dir: directory the image is written into.

    Returns:
        Path of the downloaded image file.

    Raises:
        RuntimeError: if the server replies with an unexpected HTTP status.
    """
    # Derive the extension from the URL path, ignoring both the query
    # string and any fragment (the original only stripped the query).
    ext = Path(url.split("#")[0].split("?")[0]).suffix.lower()
    if ext not in IMAGE_EXTENSIONS:
        ext = ".jpg"
    out_path = images_dir / f"{idx:08d}{ext}"

    existing_size = out_path.stat().st_size if out_path.exists() else 0
    headers: Dict[str, str] = {}
    if existing_size > 0:
        headers["Range"] = f"bytes={existing_size}-"

    async with session.get(url, headers=headers) as response:
        # 416 Range Not Satisfiable in response to our resume request means
        # the on-disk file already covers the whole resource — previously
        # this raised and the completed download was counted as a failure.
        if response.status == 416 and existing_size > 0:
            return out_path
        if response.status not in (200, 206):
            raise RuntimeError(f"HTTP {response.status}")

        # Append only when the server actually honoured the Range request;
        # a plain 200 means it sent the full file, so start from scratch.
        mode = "ab" if response.status == 206 and existing_size > 0 else "wb"
        async with aiofiles.open(out_path, mode) as f:
            async for chunk in response.content.iter_chunked(1024 * 256):
                await f.write(chunk)

    return out_path
|
|
547
|
-
|
|
548
|
-
@staticmethod
|
|
549
|
-
def _save_image_value(value: Any, out_path: Path) -> None:
|
|
550
|
-
"""Save an image value to disk. Handles multiple image representations:
|
|
551
|
-
- PIL Image objects (have .save method)
|
|
552
|
-
- dict with 'bytes' key (raw image bytes)
|
|
553
|
-
- dict with 'path' key (local file path)
|
|
554
|
-
- bytes/bytearray (raw image data)
|
|
555
|
-
- str (local file path)
|
|
556
|
-
"""
|
|
557
|
-
if value is None:
|
|
558
|
-
raise ValueError("empty image value")
|
|
559
|
-
|
|
560
|
-
# PIL Image object
|
|
561
|
-
if hasattr(value, "save") and hasattr(value, "size"):
|
|
562
|
-
value.save(out_path)
|
|
563
|
-
return
|
|
564
|
-
|
|
565
|
-
# Raw bytes
|
|
566
|
-
if isinstance(value, (bytes, bytearray)):
|
|
567
|
-
out_path.write_bytes(value)
|
|
568
|
-
return
|
|
569
|
-
|
|
570
|
-
# Dict with image data
|
|
571
|
-
if isinstance(value, dict):
|
|
572
|
-
if value.get("bytes"):
|
|
573
|
-
raw = value["bytes"]
|
|
574
|
-
if isinstance(raw, (bytes, bytearray)):
|
|
575
|
-
out_path.write_bytes(raw)
|
|
576
|
-
else:
|
|
577
|
-
# Could be a list of ints
|
|
578
|
-
out_path.write_bytes(bytes(raw))
|
|
579
|
-
return
|
|
580
|
-
if value.get("path"):
|
|
581
|
-
p = str(value["path"])
|
|
582
|
-
if os.path.exists(p):
|
|
583
|
-
shutil.copy2(p, out_path)
|
|
584
|
-
return
|
|
585
|
-
raise ValueError(f"Image path not found: {p}")
|
|
586
|
-
if value.get("url"):
|
|
587
|
-
raise ValueError("image URL detected — use async URL downloader")
|
|
588
|
-
|
|
589
|
-
# String: local file path
|
|
590
|
-
if isinstance(value, str):
|
|
591
|
-
if os.path.exists(value):
|
|
592
|
-
shutil.copy2(value, out_path)
|
|
593
|
-
return
|
|
594
|
-
if value.startswith(("http://", "https://")):
|
|
595
|
-
raise ValueError("image URL detected — use async URL downloader")
|
|
596
|
-
raise ValueError(f"Image path not found: {value}")
|
|
597
|
-
|
|
598
|
-
# numpy array (common in some datasets)
|
|
599
|
-
try:
|
|
600
|
-
import numpy as np
|
|
601
|
-
if isinstance(value, np.ndarray):
|
|
602
|
-
from PIL import Image
|
|
603
|
-
img = Image.fromarray(value)
|
|
604
|
-
img.save(out_path)
|
|
605
|
-
return
|
|
606
|
-
except (ImportError, Exception):
|
|
607
|
-
pass
|
|
608
|
-
|
|
609
|
-
raise ValueError(f"Unsupported image value type: {type(value).__name__}")
|
|
610
|
-
|
|
611
|
-
async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
|
|
612
|
-
"""Write a webdataset-compatible tar archive.
|
|
613
|
-
|
|
614
|
-
Uses Python's built-in tarfile module instead of wds.ShardWriter to
|
|
615
|
-
avoid the gopen() handler issue on Windows (backslash paths).
|
|
616
|
-
The resulting .tar files are fully compatible with webdataset readers.
|
|
617
|
-
"""
|
|
618
|
-
import io
|
|
619
|
-
import tarfile as _tarfile
|
|
620
|
-
|
|
621
|
-
max_per_shard = 5000
|
|
622
|
-
shard_idx = 0
|
|
623
|
-
count_in_shard = 0
|
|
624
|
-
current_tar: _tarfile.TarFile | None = None
|
|
625
|
-
|
|
626
|
-
def _open_shard() -> _tarfile.TarFile:
|
|
627
|
-
nonlocal shard_idx
|
|
628
|
-
shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
|
|
629
|
-
shard_idx += 1
|
|
630
|
-
return _tarfile.open(str(shard_path), "w")
|
|
631
|
-
|
|
632
|
-
try:
|
|
633
|
-
current_tar = _open_shard()
|
|
634
|
-
|
|
635
|
-
with metadata_file.open("r", encoding="utf-8") as mf:
|
|
636
|
-
for line in mf:
|
|
637
|
-
row = json.loads(line)
|
|
638
|
-
image_path = Path(row["image_path"])
|
|
639
|
-
if not image_path.exists():
|
|
640
|
-
continue
|
|
641
|
-
|
|
642
|
-
key = image_path.stem
|
|
643
|
-
ext = image_path.suffix.lstrip(".") or "jpg"
|
|
644
|
-
|
|
645
|
-
# Add image file
|
|
646
|
-
img_data = image_path.read_bytes()
|
|
647
|
-
img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
|
|
648
|
-
img_info.size = len(img_data)
|
|
649
|
-
current_tar.addfile(img_info, io.BytesIO(img_data))
|
|
650
|
-
|
|
651
|
-
# Add JSON metadata sidecar
|
|
652
|
-
json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
|
|
653
|
-
json_info = _tarfile.TarInfo(name=f"{key}.json")
|
|
654
|
-
json_info.size = len(json_data)
|
|
655
|
-
current_tar.addfile(json_info, io.BytesIO(json_data))
|
|
656
|
-
|
|
657
|
-
count_in_shard += 1
|
|
658
|
-
if count_in_shard >= max_per_shard:
|
|
659
|
-
current_tar.close()
|
|
660
|
-
current_tar = _open_shard()
|
|
661
|
-
count_in_shard = 0
|
|
662
|
-
finally:
|
|
663
|
-
if current_tar is not None:
|
|
664
|
-
current_tar.close()
|
|
665
|
-
|
|
666
|
-
async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
    """Convert the JSONL metadata file into ``metadata.parquet``.

    Loads every metadata row into memory and writes a single parquet
    file into *dataset_dir*. Requires the optional ``pyarrow`` package.

    Raises:
        RuntimeError: if pyarrow is not importable; the original import
            error is chained (``from e``) so the root cause stays visible
            in the traceback (the original raise dropped the chain).
    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except Exception as e:
        raise RuntimeError(f"pyarrow is required for parquet output: {e}") from e

    with metadata_file.open("r", encoding="utf-8") as mf:
        rows: List[Dict[str, Any]] = [json.loads(line) for line in mf]

    table = pa.Table.from_pylist(rows)
    pq.write_table(table, str(dataset_dir / "metadata.parquet"))
|