vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213) hide show
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,298 +0,0 @@
1
- """
2
- HuggingFace Datasets Library Fallback Downloader.
3
-
4
- Used when the HF Hub file listing finds no suitable data files
5
- (e.g. script-based datasets, gated datasets, datasets that use
6
- the `datasets` library format).
7
-
8
- Handles:
9
- - Legacy script-based datasets (trust_remote_code)
10
- - Gated/private datasets (token auth)
11
- - Image datasets (PIL Image columns → stripped for tabular export)
12
- - Various split formats (DatasetDict, single split)
13
-
14
- Usage:
15
- python hf_fallback.py '{"repo_id": "user/dataset", "output_path": "/path/to/output.parquet", "token": "optional", "max_rows": 100000, "split": "train"}'
16
-
17
- Output: JSON to stdout
18
- {"ok": true, "path": "/path/to/output.parquet", "rows": 12345, "columns": ["col1", "col2"]}
19
- {"ok": false, "error": "..."}
20
- """
21
- import sys
22
- import json
23
- import os
24
- import warnings
25
-
26
- # Suppress noisy HF warnings about trust_remote_code etc.
27
- warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
28
- warnings.filterwarnings("ignore", message=".*legacy.*")
29
-
30
-
31
- def _detect_image_columns(ds):
32
- """Detect columns that contain HF Image features or PIL Image objects."""
33
- image_cols = []
34
- features = getattr(ds, "features", None)
35
- if features:
36
- for name, feat in features.items():
37
- feat_cls = feat.__class__.__name__.lower()
38
- feat_str = str(feat).lower()
39
- if feat_cls == "image" or "image(" in feat_str:
40
- image_cols.append(name)
41
- return image_cols
42
-
43
-
44
- def _strip_image_columns(ds, image_cols):
45
- """Remove image columns from dataset so it can be exported to Parquet/CSV.
46
-
47
- Image columns contain PIL Image objects that can't be serialized to tabular
48
- formats. We replace them with a placeholder string indicating the column
49
- was an image column.
50
- """
51
- if not image_cols:
52
- return ds
53
-
54
- # Remove the image columns entirely for tabular export
55
- cols_to_keep = [c for c in ds.column_names if c not in image_cols]
56
- if not cols_to_keep:
57
- # Dataset is ALL image columns — keep them but cast to path strings if possible
58
- return ds
59
-
60
- return ds.select_columns(cols_to_keep)
61
-
62
-
63
def _load_dataset_robust(repo_id, token, split):
    """Load a HuggingFace dataset with multiple fallback strategies.

    Strategy order:
      1. Normal load with trust_remote_code=True (legacy script datasets)
      2. Load without trust_remote_code (newer datasets that reject it)
      3. Load with streaming=True then materialize (very large datasets)

    Returns ``(dataset, split_name)`` on success, or ``(None, error_summary)``
    when every strategy fails. Authentication failures (401/403/gated) are
    re-raised immediately so the caller can surface an actionable message.

    Strategies 1 and 2 were previously duplicated loops; they are factored
    into a single helper parameterized on the trust_remote_code flag.
    """
    from datasets import load_dataset, DatasetDict

    errors = []
    splits_to_try = [split] if split else ["train", "test", "validation", None]

    def _try_plain(trust_remote_code):
        """Attempt a non-streaming load over every candidate split.

        Returns (dataset, split) on success, None when every split failed.
        Appends human-readable failure summaries to the shared ``errors`` list.
        """
        for s in splits_to_try:
            try:
                kwargs = {"path": repo_id}
                if trust_remote_code:
                    kwargs["trust_remote_code"] = True
                if token:
                    kwargs["token"] = token
                if s:
                    kwargs["split"] = s
                return load_dataset(**kwargs), s
            except (ValueError, KeyError):
                continue
            except Exception as e:
                msg = str(e)
                # Auth errors should be raised immediately, not retried
                if any(x in msg for x in ["401", "403", "gated", "Unauthorized", "access"]):
                    raise
                # Wrong-split / missing-key complaints: just try the next split.
                if "split" in msg.lower() or "key" in msg.lower():
                    continue
                errors.append(f"trust_remote_code={trust_remote_code}, split={s}: {msg}")
        return None

    # Strategies 1 and 2 differ only in the trust_remote_code flag.
    for trust in (True, False):
        loaded = _try_plain(trust)
        if loaded is not None:
            return loaded

    # Strategy 3: Streaming fallback (for very large / oddly structured datasets)
    for s in splits_to_try:
        if s is None:
            continue  # streaming requires a split
        try:
            kwargs = {"path": repo_id, "streaming": True, "trust_remote_code": True}
            if token:
                kwargs["token"] = token
            kwargs["split"] = s
            ds_stream = load_dataset(**kwargs)
            # Materialize from the streaming iterator, capped at 500k rows.
            from datasets import Dataset as HFDataset
            rows = []
            for i, row in enumerate(ds_stream):
                if i >= 500000:
                    break
                rows.append(row)
            if rows:
                return HFDataset.from_list(rows), s
        except Exception:
            continue

    # All strategies failed
    error_summary = "; ".join(errors[:3]) if errors else "No valid configuration found"
    return None, error_summary
144
-
145
-
146
def main():
    """CLI entry point: parse the JSON payload, load the dataset, export it.

    Payload keys: ``repo_id`` (required), ``output_path`` (required),
    ``token`` (optional; falls back to HF_TOKEN / HUGGINGFACE_TOKEN env
    vars), ``max_rows`` (default 500000), ``split`` (optional; auto-detected
    when omitted).

    Prints exactly one JSON object to stdout:
      {"ok": true, "path": ..., "rows": ..., "columns": [...], "split": ...}
      {"ok": false, "error": "..."}
    and exits non-zero on any failure.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Missing payload argument"}))
        sys.exit(1)

    try:
        payload = json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        print(json.dumps({"ok": False, "error": f"Invalid JSON payload: {e}"}))
        sys.exit(1)

    repo_id = payload.get("repo_id", "").strip()
    output_path = payload.get("output_path", "").strip()
    token = payload.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or None
    max_rows = payload.get("max_rows", 500000)
    split = payload.get("split")  # None = auto-detect

    if not repo_id:
        print(json.dumps({"ok": False, "error": "repo_id is required"}))
        sys.exit(1)

    if not output_path:
        print(json.dumps({"ok": False, "error": "output_path is required"}))
        sys.exit(1)

    try:
        from datasets import load_dataset
    except ImportError:
        print(json.dumps({"ok": False, "error": "Python 'datasets' library not installed. Install with: pip install datasets"}))
        sys.exit(1)

    try:
        ds, used_split = _load_dataset_robust(repo_id, token, split)

        if ds is None:
            # On failure, the second return value carries the error summary.
            print(json.dumps({"ok": False, "error": f"Could not load dataset '{repo_id}': {used_split}"}))
            sys.exit(1)

        # Handle DatasetDict (when no split specified)
        from datasets import DatasetDict, Dataset
        if isinstance(ds, DatasetDict):
            # Pick the best split
            for preferred in ["train", "test", "validation"]:
                if preferred in ds:
                    ds = ds[preferred]
                    used_split = preferred
                    break
            else:
                # Just pick the first available split
                first_key = list(ds.keys())[0]
                ds = ds[first_key]
                used_split = first_key

        # Limit rows if needed
        total_rows = len(ds)
        if max_rows and total_rows > max_rows:
            ds = ds.select(range(max_rows))
            total_rows = max_rows

        # Detect and handle image columns (PIL Image objects can't be exported to Parquet)
        image_cols = _detect_image_columns(ds)
        has_images = len(image_cols) > 0
        export_ds = _strip_image_columns(ds, image_cols) if has_images else ds

        # Ensure the output directory exists.
        # BUGFIX: os.makedirs("") raises FileNotFoundError when output_path
        # has no directory component (e.g. "out.parquet"); guard the dirname.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        columns = export_ds.column_names

        try:
            if output_path.endswith(".csv"):
                export_ds.to_csv(output_path)
            else:
                # Default to parquet, appending the extension when missing.
                if not output_path.endswith(".parquet"):
                    output_path = output_path + ".parquet"
                export_ds.to_parquet(output_path)
        except Exception as export_err:
            # Parquet export can fail on complex nested types; fall back to CSV.
            # BUGFIX: only rewrite the suffix (str.replace could also corrupt a
            # directory name containing ".parquet"), and don't pointlessly
            # retry an already-failed CSV export.
            if output_path.endswith(".parquet"):
                csv_path = output_path[: -len(".parquet")] + ".csv"
                try:
                    export_ds.to_csv(csv_path)
                    output_path = csv_path
                except Exception:
                    raise export_err  # Re-raise original error
            else:
                raise

        result = {
            "ok": True,
            "path": output_path,
            "rows": total_rows,
            "columns": columns,
            "split": used_split
        }

        if has_images:
            result["image_columns"] = image_cols
            result["note"] = (
                f"This dataset contains image columns ({', '.join(image_cols)}). "
                "Image data was stripped for tabular export. "
                "Use vesper_download_assets with source='huggingface' to download the actual images."
            )

        print(json.dumps(result))

    except Exception as e:
        error_msg = str(e)
        # Provide helpful, actionable hints
        if "401" in error_msg or "Unauthorized" in error_msg:
            error_msg = (
                f"Authentication required for dataset '{repo_id}'. "
                "This dataset may be gated or private. "
                "Use the configure_keys tool to set your HF_TOKEN, then retry."
            )
        elif "403" in error_msg or "Forbidden" in error_msg:
            error_msg = (
                f"Access denied for dataset '{repo_id}'. "
                "You may need to accept the dataset's usage agreement on huggingface.co, "
                "then set HF_TOKEN via configure_keys tool."
            )
        elif "gated" in error_msg.lower():
            # BUGFIX: this message previously chained .format() onto an
            # f-string, which raised KeyError/ValueError whenever repo_id
            # contained brace characters; use plain f-strings throughout.
            error_msg = (
                f"Dataset '{repo_id}' is gated. "
                f"Visit https://huggingface.co/datasets/{repo_id} to request access, "
                "then set HF_TOKEN via configure_keys tool."
            )
        elif "FileNotFoundError" in error_msg or "does not exist" in error_msg.lower() or "doesn't exist" in error_msg.lower():
            error_msg = f"Dataset '{repo_id}' not found on HuggingFace. Check the dataset ID."
        elif "script" in error_msg.lower() and "no longer supported" in error_msg.lower():
            error_msg = (
                f"Dataset '{repo_id}' uses a legacy loading script that is no longer supported "
                "by the current version of the datasets library. "
                "Try: pip install datasets --upgrade, or use an older datasets version."
            )

        print(json.dumps({"ok": False, "error": error_msg}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -1,86 +0,0 @@
1
- import sys
2
- import json
3
- import os
4
- from PIL import Image
5
- import cv2
6
- import numpy as np
7
-
8
def analyze_image(image_path):
    """Collect quality metrics for a single image file.

    Returns a dict with basic metadata (width/height/format/mode from Pillow),
    a Laplacian-variance blur score, mean brightness and aspect ratio from
    OpenCV. When OpenCV cannot decode the file the dict's "status" is
    "corrupted"; on any other exception it is "failed" with the error message
    preserved in "error".
    """
    stats = {
        "path": image_path,
        "filename": os.path.basename(image_path),
        "status": "ok",
        "error": None
    }

    try:
        # 1. Basic Metadata with Pillow
        # BUGFIX: Image.open is lazy and keeps the file descriptor open; use a
        # context manager so the handle is always closed.
        with Image.open(image_path) as img:
            stats["width"], stats["height"] = img.size
            stats["format"] = img.format
            stats["mode"] = img.mode

        # 2. Advanced Analysis with OpenCV
        cv_img = cv2.imread(image_path)
        if cv_img is None:
            stats["status"] = "corrupted"
            stats["error"] = "OpenCV failed to decode image"
            return stats

        # Blur detection (Laplacian variance; low variance == blurry)
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
        laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
        stats["blur_score"] = laplacian_var
        stats["is_blurry"] = laplacian_var < 100  # Rule of thumb threshold

        # Brightness (mean grayscale intensity)
        stats["brightness"] = np.mean(gray)

        # Aspect Ratio
        stats["aspect_ratio"] = stats["width"] / stats["height"]

    except Exception as e:
        stats["status"] = "failed"
        stats["error"] = str(e)

    return stats
47
-
48
def main():
    """CLI entry point: analyze one image file or a directory of images.

    Prints a JSON report containing per-image results plus aggregate counts
    and average dimensions. Exits non-zero on bad input or when no images
    are found.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No path provided"}))
        sys.exit(1)

    input_path = sys.argv[1]

    if os.path.isfile(input_path):
        results = [analyze_image(input_path)]
    elif os.path.isdir(input_path):
        # Analyze first 50 images for performance in this demo
        valid_exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
        candidates = [
            os.path.join(input_path, name)
            for name in os.listdir(input_path)
            if name.lower().endswith(valid_exts)
        ]
        results = [analyze_image(path) for path in candidates[:50]]
    else:
        print(json.dumps({"error": "Invalid path"}))
        sys.exit(1)

    if not results:
        print(json.dumps({"error": "No images found"}))
        sys.exit(1)

    def _count_status(wanted):
        # Number of per-image results that ended in the given status.
        return len([entry for entry in results if entry["status"] == wanted])

    report = {
        "total_images": len(results),
        "corrupted_count": _count_status("corrupted"),
        "failed_count": _count_status("failed"),
        "average_width": np.mean([entry["width"] for entry in results if "width" in entry]),
        "average_height": np.mean([entry["height"] for entry in results if "height" in entry]),
        "blurry_count": len([entry for entry in results if entry.get("is_blurry")]),
        "individual_results": results
    }

    print(json.dumps(report))


if __name__ == "__main__":
    main()
@@ -1,295 +0,0 @@
1
- import sys
2
- import os
3
- import json
4
- import tempfile
5
- from typing import Dict, Any, List
6
- from config import get_all
7
-
8
- try:
9
- from kaggle.api.kaggle_api_extended import KaggleApi
10
- HAS_KAGGLE = True
11
- except Exception:
12
- HAS_KAGGLE = False
13
-
14
-
15
# File extensions treated as images when scanning a downloaded dataset.
IMAGE_EXTENSIONS = {
    ".jpg", ".jpeg", ".png", ".webp", ".bmp",
    ".gif", ".tiff", ".tif", ".svg",
}
26
-
27
-
28
def _ensure_auth() -> Dict[str, Any]:
    """Authenticate against the Kaggle API.

    Returns {"ok": True, "api": <KaggleApi>} on success, or
    {"ok": False, "error": ..., "details"?: ...} when the kaggle package is
    missing or authentication fails. Credentials from the secure local store
    are exported into the process environment so KaggleApi.authenticate()
    picks them up.
    """
    if not HAS_KAGGLE:
        # Import of kaggle_api_extended failed at module load time.
        return {
            "ok": False,
            "error": "kaggle package not installed. Install with: pip install kaggle",
        }

    # Priority:
    # 1) secure local store (keyring or ~/.vesper/config.toml)
    # 2) existing env vars
    # 3) ~/.kaggle/kaggle.json handled by KaggleApi.authenticate()
    keys = get_all()
    if keys.get("kaggle_username") and keys.get("kaggle_key"):
        # Env vars take precedence inside KaggleApi, so setting them here
        # makes the secure store win over ~/.kaggle/kaggle.json.
        os.environ["KAGGLE_USERNAME"] = keys["kaggle_username"]
        os.environ["KAGGLE_KEY"] = keys["kaggle_key"]

    api = KaggleApi()
    try:
        api.authenticate()
    except Exception as e:
        # Surface a short actionable message; keep the raw error in "details".
        return {
            "ok": False,
            "error": "Kaggle requires API key — run 'vespermcp config keys' (30 seconds) or provide ~/.kaggle/kaggle.json",
            "details": str(e),
        }

    return {"ok": True, "api": api}
55
-
56
-
57
- def _dataset_to_dict(ds) -> Dict[str, Any]:
58
- # kaggle API object fields differ by version; use getattr defensively
59
- ref = getattr(ds, "ref", None) or getattr(ds, "datasetRef", None) or ""
60
- title = getattr(ds, "title", None) or ref
61
- subtitle = getattr(ds, "subtitle", None) or ""
62
- owner = getattr(ds, "creatorName", None) or getattr(ds, "ownerName", None) or ""
63
- votes = int(getattr(ds, "voteCount", 0) or 0)
64
- downloads = int(getattr(ds, "downloadCount", 0) or 0)
65
- size = int(getattr(ds, "totalBytes", 0) or 0)
66
- last_updated = str(getattr(ds, "lastUpdated", ""))
67
- tags = []
68
- raw_tags = getattr(ds, "tags", None)
69
- if raw_tags:
70
- for t in raw_tags:
71
- tags.append(getattr(t, "name", str(t)))
72
-
73
- return {
74
- "id": ref,
75
- "source": "kaggle",
76
- "name": title,
77
- "description": subtitle or title,
78
- "downloads": downloads,
79
- "likes": votes,
80
- "stars": 0,
81
- "tags": tags,
82
- "last_updated": last_updated,
83
- "task": "unknown",
84
- "domain": "unknown",
85
- "languages": [],
86
- "splits": [{"name": "data", "num_examples": 0, "size_bytes": size}],
87
- "license": {
88
- "id": "unknown",
89
- "name": "unknown",
90
- "category": "unknown",
91
- "usage_restrictions": [],
92
- "warnings": ["Kaggle license details may vary by dataset"],
93
- },
94
- "quality_score": 40,
95
- "quality_warnings": ["Review dataset card and competition rules before use"],
96
- "download_url": f"https://www.kaggle.com/datasets/{ref}",
97
- "format": None,
98
- "total_examples": 0,
99
- "total_size_bytes": size,
100
- "total_size_mb": round(size / (1024 * 1024), 2) if size else 0,
101
- "columns": [],
102
- "is_structured": False,
103
- "has_target_column": False,
104
- "is_safe_source": True,
105
- "has_personal_data": False,
106
- "is_paywalled": False,
107
- "is_scraped_web_data": False,
108
- "uses_https": True,
109
- "has_train_split": False,
110
- "has_test_split": False,
111
- "has_validation_split": False,
112
- "description_length": len(subtitle or title),
113
- "has_readme": True,
114
- }
115
-
116
-
117
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search Kaggle datasets matching *query* and return normalized metadata.

    Returns {"ok": True, "results": [...], "count": N} on success, or an
    {"ok": False, "error": ...} dict on auth/search failure. At most *limit*
    results are returned (clamped to 100 for the page_size request).
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]
    try:
        desired = max(1, min(limit, 100))

        # Newer kaggle clients accept page_size; older ones raise TypeError,
        # in which case we fall through to manual pagination below.
        try:
            datasets = api.dataset_list(search=query, page_size=desired)
            items = [_dataset_to_dict(ds) for ds in datasets[:limit]]
            return {"ok": True, "results": items, "count": len(items)}
        except TypeError:
            pass

        # Manual pagination fallback for old clients (fixed 20-per-page API).
        collected = []
        page = 1
        while len(collected) < limit:
            page_items = api.dataset_list(search=query, page=page)
            if not page_items:
                break

            collected.extend(page_items)
            # A short page means we reached the last page of results.
            if len(page_items) < 20:
                break
            page += 1

        items = [_dataset_to_dict(ds) for ds in collected[:limit]]
        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
149
-
150
-
151
def _find_image_files(root: str) -> List[str]:
    """Recursively collect image files under *root*, sorted by full path."""
    found: List[str] = []
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            # Extension check is case-insensitive against IMAGE_EXTENSIONS.
            if os.path.splitext(filename)[1].lower() in IMAGE_EXTENSIONS:
                found.append(os.path.join(dirpath, filename))
    return sorted(found)
160
-
161
-
162
- def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
163
- relative_path = os.path.relpath(full_path, root).replace("\\", "/")
164
- parent_dir = os.path.dirname(relative_path)
165
- parts = [part for part in parent_dir.split("/") if part and part != "."]
166
-
167
- split = None
168
- label = None
169
- if parts:
170
- first = parts[0].lower()
171
- if first in {"train", "test", "val", "valid", "validation"}:
172
- split = parts[0]
173
- if len(parts) > 1:
174
- label = parts[-1]
175
- else:
176
- label = parts[-1]
177
-
178
- record: Dict[str, Any] = {
179
- "id": index,
180
- "image_path": os.path.abspath(full_path),
181
- "relative_path": relative_path,
182
- "file_name": os.path.basename(full_path),
183
- "extension": os.path.splitext(full_path)[1].lower().lstrip("."),
184
- }
185
- if split:
186
- record["split"] = split
187
- if label:
188
- record["label"] = label
189
- return record
190
-
191
-
192
- def _write_image_manifest(root: str, image_files: List[str]) -> str:
193
- manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
194
- with open(manifest_path, "w", encoding="utf-8") as handle:
195
- for index, full_path in enumerate(image_files):
196
- handle.write(json.dumps(_infer_image_record(root, full_path, index), ensure_ascii=False) + "\n")
197
- return manifest_path
198
-
199
-
200
- def _pick_best_file(root: str) -> Dict[str, Any]:
201
- candidates: List[str] = []
202
- for base, _, files in os.walk(root):
203
- for name in files:
204
- full = os.path.join(base, name)
205
- lower = name.lower()
206
- if lower.endswith((".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow")):
207
- candidates.append(full)
208
-
209
- if not candidates:
210
- image_files = _find_image_files(root)
211
- if image_files:
212
- manifest_path = _write_image_manifest(root, image_files)
213
- return {
214
- "local_path": manifest_path,
215
- "dataset_kind": "image-manifest",
216
- "image_count": len(image_files),
217
- }
218
- raise RuntimeError("No suitable data file found after download")
219
-
220
- # prioritize common tabular formats
221
- priorities = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow"]
222
- for ext in priorities:
223
- for c in candidates:
224
- if c.lower().endswith(ext):
225
- return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
226
- return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
227
-
228
-
229
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download and unpack a Kaggle dataset, then locate its best data file.

    Accepts either a bare ``owner/name`` ref or a full kaggle.com URL. When
    *target_dir* is empty a temporary directory is created. Returns an
    {"ok": True, ...} artifact description, or {"ok": False, "error": ...}
    with a friendlier message for auth (401) and rate-limit (429) failures.
    """
    auth = _ensure_auth()
    if not auth.get("ok"):
        return auth

    api: KaggleApi = auth["api"]

    if not target_dir:
        target_dir = tempfile.mkdtemp(prefix="vesper_kaggle_")
    os.makedirs(target_dir, exist_ok=True)

    try:
        # Normalize full URLs down to the owner/name reference.
        marker = "kaggle.com/datasets/"
        if marker in dataset_ref:
            dataset_ref = dataset_ref.split(marker)[1].lstrip("/")

        # unzip in place, remove zip for convenience
        api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
        artifact = _pick_best_file(target_dir)
        return {
            "ok": True,
            "dataset_id": dataset_ref,
            "target_dir": target_dir,
            "local_path": artifact["local_path"],
            "dataset_kind": artifact["dataset_kind"],
            "image_count": artifact.get("image_count", 0),
        }
    except Exception as e:
        message = str(e)
        if "401" in message or "Unauthorized" in message:
            return {"ok": False, "error": "Invalid Kaggle credentials (401). Run 'vespermcp config kaggle' again."}
        if "429" in message or "Too Many Requests" in message:
            return {"ok": False, "error": "Kaggle rate limit reached. Please retry later."}
        return {"ok": False, "error": f"Kaggle download failed: {message}"}
263
-
264
-
265
def main():
    """CLI dispatcher: ``discover <query> [limit]`` or ``download <ref> [dir]``.

    Prints a single JSON result to stdout; exits non-zero on usage errors or
    an unknown command.
    """
    argv = sys.argv

    if len(argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py <discover|download> ..."}))
        sys.exit(1)

    command = argv[1]

    if command == "discover":
        if len(argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py discover <query> [limit]"}))
            sys.exit(1)
        # Optional third positional arg is the result limit (default 20).
        limit = int(argv[3]) if len(argv) > 3 else 20
        print(json.dumps(discover(argv[2], limit)))
        return

    if command == "download":
        if len(argv) < 3:
            print(json.dumps({"ok": False, "error": "Usage: kaggle_engine.py download <dataset_ref> [target_dir]"}))
            sys.exit(1)
        # Optional third positional arg is the target directory ("" = temp dir).
        target_dir = argv[3] if len(argv) > 3 else ""
        print(json.dumps(download(argv[2], target_dir)))
        return

    print(json.dumps({"ok": False, "error": f"Unknown command: {command}"}))
    sys.exit(1)


if __name__ == "__main__":
    main()