vesper-wizard 2.3.1 → 2.3.2

This diff shows the content changes between publicly released versions of this package as they appear in the supported public registries. The information is provided for informational purposes only.
Files changed (214)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,679 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import json
5
- import mimetypes
6
- import os
7
- import shutil
8
- import tempfile
9
- from dataclasses import dataclass
10
- from pathlib import Path
11
- from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional
12
- from urllib.parse import urlparse
13
-
14
- import aiohttp
15
-
16
- from vesper.core.download_recipe import get_download_recipe
17
-
18
- try:
19
- import aiofiles
20
- except Exception: # pragma: no cover
21
- aiofiles = None
22
-
23
- try:
24
- import webdataset as wds
25
- except Exception: # pragma: no cover
26
- wds = None
27
-
28
-
29
- IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"}
30
-
31
-
32
- @dataclass
33
- class DownloadResult:
34
- dataset_id: str
35
- source: str
36
- output_dir: str
37
- downloaded_assets: int
38
- failed_assets: int
39
- errors_file: str
40
- metadata_file: str
41
- output_format: str
42
-
43
-
44
- class AssetDownloader:
45
- def __init__(
46
- self,
47
- output_root: str,
48
- workers: int = 8,
49
- recipes_dir: Optional[str] = None,
50
- progress_callback: Optional[Callable[[str, Dict[str, Any]], Awaitable[None] | None]] = None,
51
- ) -> None:
52
- self.output_root = Path(output_root)
53
- self.workers = max(1, min(workers, 32))
54
- self.recipes_dir = recipes_dir
55
- self.progress_callback = progress_callback
56
-
57
- async def _emit(self, stage: str, payload: Dict[str, Any]) -> None:
58
- if not self.progress_callback:
59
- return
60
- maybe = self.progress_callback(stage, payload)
61
- if asyncio.iscoroutine(maybe):
62
- await maybe
63
-
64
- @staticmethod
65
- def _hydrate_kaggle_credentials() -> None:
66
- try:
67
- from config import get_all # type: ignore
68
- keys = get_all() or {}
69
- except Exception:
70
- keys = {}
71
-
72
- username = keys.get("kaggle_username") or os.getenv("KAGGLE_USERNAME")
73
- key = keys.get("kaggle_key") or os.getenv("KAGGLE_KEY")
74
-
75
- if username:
76
- os.environ["KAGGLE_USERNAME"] = str(username)
77
- if key:
78
- os.environ["KAGGLE_KEY"] = str(key)
79
-
80
- username = os.getenv("KAGGLE_USERNAME")
81
- key = os.getenv("KAGGLE_KEY")
82
- if not username or not key:
83
- return
84
-
85
- kaggle_dir = Path.home() / ".kaggle"
86
- kaggle_file = kaggle_dir / "kaggle.json"
87
- try:
88
- kaggle_dir.mkdir(parents=True, exist_ok=True)
89
- kaggle_file.write_text(
90
- json.dumps({"username": username, "key": key}, ensure_ascii=False),
91
- encoding="utf-8",
92
- )
93
- try:
94
- os.chmod(kaggle_file, 0o600)
95
- except Exception:
96
- pass
97
- except Exception:
98
- pass
99
-
100
- @staticmethod
101
- def find_image_column(dataset: Any) -> Optional[str]:
102
- """Auto-detect the image column in a HuggingFace dataset.
103
-
104
- Detection strategy (in priority order):
105
- 1. HF Feature type: columns with Image() feature type
106
- 2. Known column names: 'image', 'img', 'photo', 'image_url', etc.
107
- 3. URL pattern detection: columns containing image URLs (http(s)://...jpg)
108
- 4. Path pattern detection: columns with file paths ending in image extensions
109
- """
110
- # Strategy 1: Check HF Feature types (most reliable)
111
- features = getattr(dataset, "features", None)
112
- if features:
113
- for name, feature in features.items():
114
- feat_cls = feature.__class__.__name__.lower()
115
- feat_str = str(feature).lower()
116
- if feat_cls == "image" or "image(" in feat_str:
117
- return str(name)
118
-
119
- # Strategy 2: Check known column names
120
- cols = getattr(dataset, "column_names", []) or []
121
-
122
- # Exact match first (highest priority names)
123
- priority_exact = ["image", "img", "photo", "picture", "images"]
124
- for c in priority_exact:
125
- if c in cols:
126
- return c
127
-
128
- # Partial match (column names containing image-related keywords)
129
- priority_partial = [
130
- "image_path", "image_url", "img_path", "img_url",
131
- "image_file", "file_name", "filepath", "filename",
132
- "photo_url", "picture_url", "thumbnail",
133
- "url", "path", "file",
134
- ]
135
- for target in priority_partial:
136
- for c in cols:
137
- if c.lower() == target:
138
- return c
139
-
140
- # Strategy 3: Sample values to detect URL/path patterns
141
- try:
142
- sample_size = min(5, len(dataset)) if hasattr(dataset, "__len__") else 5
143
- if sample_size > 0:
144
- for c in cols:
145
- is_image_col = False
146
- for i in range(sample_size):
147
- try:
148
- val = dataset[i][c]
149
- except Exception:
150
- break
151
-
152
- if val is None:
153
- continue
154
-
155
- # PIL Image object
156
- if hasattr(val, "save") and hasattr(val, "size"):
157
- is_image_col = True
158
- break
159
-
160
- # Dict with image data
161
- if isinstance(val, dict) and any(k in val for k in ("bytes", "path", "url")):
162
- is_image_col = True
163
- break
164
-
165
- # String: URL or file path
166
- if isinstance(val, str):
167
- val_lower = val.lower()
168
- # Check for image URLs
169
- if val_lower.startswith(("http://", "https://")) and any(
170
- ext in val_lower.split("?")[0] for ext in IMAGE_EXTENSIONS
171
- ):
172
- is_image_col = True
173
- break
174
- # Check for file paths with image extensions
175
- if any(val_lower.endswith(ext) for ext in IMAGE_EXTENSIONS):
176
- is_image_col = True
177
- break
178
-
179
- if is_image_col:
180
- return c
181
- except Exception:
182
- pass
183
-
184
- return None
185
-
186
- async def download_assets(
187
- self,
188
- dataset_id: str,
189
- source: Optional[str] = None,
190
- repo_id: Optional[str] = None,
191
- kaggle_ref: Optional[str] = None,
192
- urls: Optional[List[str]] = None,
193
- output_format: str = "webdataset",
194
- output_dir: Optional[str] = None,
195
- max_items: Optional[int] = None,
196
- image_column: Optional[str] = None,
197
- ) -> Dict[str, Any]:
198
- recipe = get_download_recipe(dataset_id, self.recipes_dir)
199
- if recipe:
200
- source = source or recipe.get("source")
201
- repo_id = repo_id or recipe.get("repo_id")
202
- image_column = image_column or recipe.get("image_column")
203
-
204
- source = (source or "").lower()
205
- if source not in {"huggingface", "kaggle", "url"}:
206
- raise ValueError("source must be one of: huggingface, kaggle, url")
207
-
208
- # --- Validate imports and args BEFORE creating any directories ---
209
- if source == "huggingface":
210
- if not repo_id:
211
- raise ValueError("repo_id is required for source=huggingface")
212
- try:
213
- from datasets import load_dataset as _ld # noqa: F401
214
- except Exception as e:
215
- raise RuntimeError(
216
- f"datasets package is required for HuggingFace downloads. "
217
- f"Install with: pip install datasets. Details: {e}"
218
- )
219
- elif source == "kaggle":
220
- ref = kaggle_ref or repo_id
221
- if not ref:
222
- raise ValueError("kaggle_ref is required for source=kaggle")
223
- try:
224
- from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
225
- except Exception as e:
226
- raise RuntimeError(
227
- f"kaggle package is required for Kaggle downloads. "
228
- f"Install with: pip install kaggle. Details: {e}"
229
- )
230
- else:
231
- if not urls:
232
- raise ValueError("urls are required for source=url")
233
-
234
- # --- Now safe to create directories ---
235
- if output_dir:
236
- dataset_dir = Path(output_dir).expanduser().resolve()
237
- else:
238
- dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
239
- images_dir = dataset_dir / "images"
240
- dataset_dir.mkdir(parents=True, exist_ok=True)
241
- images_dir.mkdir(parents=True, exist_ok=True)
242
-
243
- errors_file = dataset_dir / "errors.jsonl"
244
- metadata_file = dataset_dir / "metadata.jsonl"
245
-
246
- try:
247
- if source == "huggingface":
248
- summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
249
- elif source == "kaggle":
250
- ref = kaggle_ref or repo_id
251
- summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
252
- else:
253
- summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
254
- except Exception:
255
- # Clean up empty directories on failure so we don't leave ghost artifacts
256
- if images_dir.exists() and not any(images_dir.iterdir()):
257
- shutil.rmtree(dataset_dir, ignore_errors=True)
258
- raise
259
-
260
- if output_format == "webdataset":
261
- await self._write_webdataset(dataset_dir, images_dir, metadata_file)
262
- elif output_format == "parquet":
263
- await self._write_parquet(dataset_dir, metadata_file)
264
-
265
- result = DownloadResult(
266
- dataset_id=dataset_id,
267
- source=source,
268
- output_dir=str(dataset_dir),
269
- downloaded_assets=summary["downloaded"],
270
- failed_assets=summary["failed"],
271
- errors_file=str(errors_file),
272
- metadata_file=str(metadata_file),
273
- output_format=output_format,
274
- )
275
- return result.__dict__
276
-
277
- async def _download_huggingface(
278
- self,
279
- repo_id: str,
280
- dataset_id: str,
281
- images_dir: Path,
282
- metadata_file: Path,
283
- errors_file: Path,
284
- max_items: Optional[int],
285
- image_column: Optional[str],
286
- ) -> Dict[str, int]:
287
- from datasets import load_dataset # validated in download_assets()
288
- import warnings
289
- warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
290
-
291
- await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
292
-
293
- token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
294
-
295
- # Try loading with multiple strategies
296
- ds = None
297
- load_errors = []
298
-
299
- for trust_rc in [True, False]:
300
- for split_name in ["train", "test", "validation"]:
301
- try:
302
- kwargs = {"path": repo_id, "split": split_name}
303
- if trust_rc:
304
- kwargs["trust_remote_code"] = True
305
- if token:
306
- kwargs["token"] = token
307
- ds = load_dataset(**kwargs)
308
- break
309
- except Exception as e:
310
- msg = str(e)
311
- # Immediately raise auth errors
312
- if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
313
- raise RuntimeError(
314
- f"Authentication required for '{repo_id}'. "
315
- "This dataset may be gated or private. "
316
- "Use the configure_keys tool to set HF_TOKEN, then retry."
317
- )
318
- load_errors.append(msg)
319
- continue
320
- if ds is not None:
321
- break
322
-
323
- # Fallback: load without split
324
- if ds is None:
325
- try:
326
- kwargs = {"path": repo_id, "trust_remote_code": True}
327
- if token:
328
- kwargs["token"] = token
329
- dd = load_dataset(**kwargs)
330
- from datasets import DatasetDict
331
- if isinstance(dd, DatasetDict):
332
- first_split = list(dd.keys())[0]
333
- ds = dd[first_split]
334
- else:
335
- ds = dd
336
- except Exception as e:
337
- msg = str(e)
338
- if any(x in msg for x in ["401", "403", "gated", "Unauthorized"]):
339
- raise RuntimeError(
340
- f"Authentication required for '{repo_id}'. "
341
- "Use the configure_keys tool to set HF_TOKEN, then retry."
342
- )
343
- combined = "; ".join(load_errors[:3])
344
- raise RuntimeError(
345
- f"Failed to load HuggingFace dataset '{repo_id}': {msg}. "
346
- f"Previous attempts: {combined}"
347
- )
348
-
349
- col = image_column or self.find_image_column(ds)
350
- if not col:
351
- raise RuntimeError(
352
- f"No image column detected in HuggingFace dataset '{repo_id}'. "
353
- "Available columns: " + ", ".join(getattr(ds, "column_names", [])) + ". "
354
- "Provide image_column parameter explicitly."
355
- )
356
-
357
- total = len(ds) if hasattr(ds, "__len__") else 0
358
- target = min(total, max_items) if max_items and total else (max_items or total or 0)
359
-
360
- downloaded = 0
361
- failed = 0
362
-
363
- # Create an aiohttp session for URL-based images
364
- session = None
365
-
366
- try:
367
- with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
368
- for idx, row in enumerate(ds):
369
- if max_items and idx >= max_items:
370
- break
371
- try:
372
- out_name = f"{idx:08d}.jpg"
373
- out_path = images_dir / out_name
374
- value = row.get(col)
375
-
376
- # Handle URL-based images inline
377
- if isinstance(value, dict) and value.get("url") and not value.get("bytes") and not value.get("path"):
378
- url = value["url"]
379
- if session is None:
380
- session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
381
- await self._download_image_from_url(session, url, out_path)
382
- elif isinstance(value, str) and value.startswith(("http://", "https://")):
383
- if session is None:
384
- session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))
385
- await self._download_image_from_url(session, value, out_path)
386
- else:
387
- self._save_image_value(value, out_path)
388
-
389
- record = {
390
- "dataset_id": dataset_id,
391
- "index": idx,
392
- "image_path": str(out_path),
393
- "source": "huggingface",
394
- "repo_id": repo_id,
395
- }
396
- mf.write(json.dumps(record, ensure_ascii=False) + "\n")
397
- downloaded += 1
398
- if downloaded % 50 == 0:
399
- await self._emit("progress", {"downloaded": downloaded, "failed": failed, "target": target})
400
- except Exception as e:
401
- failed += 1
402
- ef.write(json.dumps({"index": idx, "error": str(e)}, ensure_ascii=False) + "\n")
403
- finally:
404
- if session is not None:
405
- await session.close()
406
-
407
- await self._emit("done", {"downloaded": downloaded, "failed": failed})
408
- return {"downloaded": downloaded, "failed": failed}
409
-
410
- async def _download_image_from_url(self, session: aiohttp.ClientSession, url: str, out_path: Path) -> None:
411
- """Download an image from a URL to a local path."""
412
- async with session.get(url) as response:
413
- if response.status != 200:
414
- raise RuntimeError(f"HTTP {response.status} downloading {url}")
415
- data = await response.read()
416
- if not data:
417
- raise RuntimeError(f"Empty response from {url}")
418
- out_path.write_bytes(data)
419
-
420
- async def _download_kaggle(
421
- self,
422
- kaggle_ref: str,
423
- dataset_id: str,
424
- images_dir: Path,
425
- metadata_file: Path,
426
- errors_file: Path,
427
- max_items: Optional[int],
428
- ) -> Dict[str, int]:
429
- from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
430
-
431
- await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
432
-
433
- self._hydrate_kaggle_credentials()
434
-
435
- api = KaggleApi()
436
- try:
437
- api.authenticate()
438
- except Exception as e:
439
- raise RuntimeError(
440
- "Kaggle authentication failed. Run 'configure_kaggle' or 'configure_keys' with "
441
- "kaggle_username and kaggle_key, then retry. "
442
- f"Details: {e}"
443
- )
444
-
445
- tmp_dir = Path(tempfile.mkdtemp(prefix="vesper_kaggle_assets_"))
446
- downloaded = 0
447
- failed = 0
448
-
449
- try:
450
- api.dataset_download_files(kaggle_ref, path=str(tmp_dir), unzip=True, quiet=True)
451
- candidates = [p for p in tmp_dir.rglob("*") if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS]
452
- if max_items:
453
- candidates = candidates[:max_items]
454
-
455
- with metadata_file.open("w", encoding="utf-8") as mf, errors_file.open("w", encoding="utf-8") as ef:
456
- for idx, src_path in enumerate(candidates):
457
- try:
458
- out_name = f"{idx:08d}{src_path.suffix.lower()}"
459
- out_path = images_dir / out_name
460
- shutil.copy2(src_path, out_path)
461
- record = {
462
- "dataset_id": dataset_id,
463
- "index": idx,
464
- "image_path": str(out_path),
465
- "source": "kaggle",
466
- "repo_id": kaggle_ref,
467
- }
468
- mf.write(json.dumps(record, ensure_ascii=False) + "\n")
469
- downloaded += 1
470
- except Exception as e:
471
- failed += 1
472
- ef.write(json.dumps({"file": str(src_path), "error": str(e)}, ensure_ascii=False) + "\n")
473
- finally:
474
- shutil.rmtree(tmp_dir, ignore_errors=True)
475
-
476
- await self._emit("done", {"downloaded": downloaded, "failed": failed})
477
- return {"downloaded": downloaded, "failed": failed}
478
-
479
- async def _download_urls(
480
- self,
481
- urls: List[str],
482
- dataset_id: str,
483
- images_dir: Path,
484
- metadata_file: Path,
485
- errors_file: Path,
486
- max_items: Optional[int],
487
- ) -> Dict[str, int]:
488
- if aiofiles is None:
489
- raise RuntimeError("aiofiles is required for URL downloads. Install with: pip install aiofiles")
490
-
491
- selected = urls[:max_items] if max_items else urls
492
- sem = asyncio.Semaphore(self.workers)
493
-
494
- downloaded = 0
495
- failed = 0
496
- metadata_lock = asyncio.Lock()
497
-
498
- async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=180)) as session:
499
- async def worker(idx: int, url: str) -> None:
500
- nonlocal downloaded, failed
501
- async with sem:
502
- try:
503
- local_path = await self._download_one_url(session, idx, url, images_dir)
504
- async with metadata_lock:
505
- async with aiofiles.open(metadata_file, "a", encoding="utf-8") as mf:
506
- await mf.write(json.dumps({
507
- "dataset_id": dataset_id,
508
- "index": idx,
509
- "image_path": str(local_path),
510
- "source": "url",
511
- "url": url,
512
- }, ensure_ascii=False) + "\n")
513
- downloaded += 1
514
- except Exception as e:
515
- failed += 1
516
- async with metadata_lock:
517
- async with aiofiles.open(errors_file, "a", encoding="utf-8") as ef:
518
- await ef.write(json.dumps({"index": idx, "url": url, "error": str(e)}, ensure_ascii=False) + "\n")
519
-
520
- tasks = [asyncio.create_task(worker(i, u)) for i, u in enumerate(selected)]
521
- await asyncio.gather(*tasks)
522
-
523
- await self._emit("done", {"downloaded": downloaded, "failed": failed})
524
- return {"downloaded": downloaded, "failed": failed}
525
-
526
- async def _download_one_url(self, session: aiohttp.ClientSession, idx: int, url: str, images_dir: Path) -> Path:
527
- ext = Path(url.split("?")[0]).suffix.lower()
528
- if ext not in IMAGE_EXTENSIONS:
529
- ext = ".jpg"
530
- out_path = images_dir / f"{idx:08d}{ext}"
531
-
532
- existing_size = out_path.stat().st_size if out_path.exists() else 0
533
- headers: Dict[str, str] = {}
534
- if existing_size > 0:
535
- headers["Range"] = f"bytes={existing_size}-"
536
-
537
- async with session.get(url, headers=headers) as response:
538
- if response.status not in (200, 206):
539
- raise RuntimeError(f"HTTP {response.status}")
540
-
541
- mode = "ab" if response.status == 206 and existing_size > 0 else "wb"
542
- async with aiofiles.open(out_path, mode) as f:
543
- async for chunk in response.content.iter_chunked(1024 * 256):
544
- await f.write(chunk)
545
-
546
- return out_path
547
-
548
- @staticmethod
549
- def _save_image_value(value: Any, out_path: Path) -> None:
550
- """Save an image value to disk. Handles multiple image representations:
551
- - PIL Image objects (have .save method)
552
- - dict with 'bytes' key (raw image bytes)
553
- - dict with 'path' key (local file path)
554
- - bytes/bytearray (raw image data)
555
- - str (local file path)
556
- """
557
- if value is None:
558
- raise ValueError("empty image value")
559
-
560
- # PIL Image object
561
- if hasattr(value, "save") and hasattr(value, "size"):
562
- value.save(out_path)
563
- return
564
-
565
- # Raw bytes
566
- if isinstance(value, (bytes, bytearray)):
567
- out_path.write_bytes(value)
568
- return
569
-
570
- # Dict with image data
571
- if isinstance(value, dict):
572
- if value.get("bytes"):
573
- raw = value["bytes"]
574
- if isinstance(raw, (bytes, bytearray)):
575
- out_path.write_bytes(raw)
576
- else:
577
- # Could be a list of ints
578
- out_path.write_bytes(bytes(raw))
579
- return
580
- if value.get("path"):
581
- p = str(value["path"])
582
- if os.path.exists(p):
583
- shutil.copy2(p, out_path)
584
- return
585
- raise ValueError(f"Image path not found: {p}")
586
- if value.get("url"):
587
- raise ValueError("image URL detected — use async URL downloader")
588
-
589
- # String: local file path
590
- if isinstance(value, str):
591
- if os.path.exists(value):
592
- shutil.copy2(value, out_path)
593
- return
594
- if value.startswith(("http://", "https://")):
595
- raise ValueError("image URL detected — use async URL downloader")
596
- raise ValueError(f"Image path not found: {value}")
597
-
598
- # numpy array (common in some datasets)
599
- try:
600
- import numpy as np
601
- if isinstance(value, np.ndarray):
602
- from PIL import Image
603
- img = Image.fromarray(value)
604
- img.save(out_path)
605
- return
606
- except (ImportError, Exception):
607
- pass
608
-
609
- raise ValueError(f"Unsupported image value type: {type(value).__name__}")
610
-
611
- async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
612
- """Write a webdataset-compatible tar archive.
613
-
614
- Uses Python's built-in tarfile module instead of wds.ShardWriter to
615
- avoid the gopen() handler issue on Windows (backslash paths).
616
- The resulting .tar files are fully compatible with webdataset readers.
617
- """
618
- import io
619
- import tarfile as _tarfile
620
-
621
- max_per_shard = 5000
622
- shard_idx = 0
623
- count_in_shard = 0
624
- current_tar: _tarfile.TarFile | None = None
625
-
626
- def _open_shard() -> _tarfile.TarFile:
627
- nonlocal shard_idx
628
- shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
629
- shard_idx += 1
630
- return _tarfile.open(str(shard_path), "w")
631
-
632
- try:
633
- current_tar = _open_shard()
634
-
635
- with metadata_file.open("r", encoding="utf-8") as mf:
636
- for line in mf:
637
- row = json.loads(line)
638
- image_path = Path(row["image_path"])
639
- if not image_path.exists():
640
- continue
641
-
642
- key = image_path.stem
643
- ext = image_path.suffix.lstrip(".") or "jpg"
644
-
645
- # Add image file
646
- img_data = image_path.read_bytes()
647
- img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
648
- img_info.size = len(img_data)
649
- current_tar.addfile(img_info, io.BytesIO(img_data))
650
-
651
- # Add JSON metadata sidecar
652
- json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
653
- json_info = _tarfile.TarInfo(name=f"{key}.json")
654
- json_info.size = len(json_data)
655
- current_tar.addfile(json_info, io.BytesIO(json_data))
656
-
657
- count_in_shard += 1
658
- if count_in_shard >= max_per_shard:
659
- current_tar.close()
660
- current_tar = _open_shard()
661
- count_in_shard = 0
662
- finally:
663
- if current_tar is not None:
664
- current_tar.close()
665
-
666
- async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
667
- try:
668
- import pyarrow as pa
669
- import pyarrow.parquet as pq
670
- except Exception as e:
671
- raise RuntimeError(f"pyarrow is required for parquet output: {e}")
672
-
673
- rows: List[Dict[str, Any]] = []
674
- with metadata_file.open("r", encoding="utf-8") as mf:
675
- for line in mf:
676
- rows.append(json.loads(line))
677
-
678
- table = pa.Table.from_pylist(rows)
679
- pq.write_table(table, str(dataset_dir / "metadata.parquet"))