vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,133 +0,0 @@
1
- import sys
2
- import json
3
- import os
4
- import cv2
5
- import numpy as np
6
-
7
- # Audio analysis depends on librosa/soundfile. Fallback if not available.
8
- try:
9
- import librosa
10
- AUDIO_SUPPORT = True
11
- except ImportError:
12
- AUDIO_SUPPORT = False
13
-
14
def analyze_audio(path):
    """Inspect one audio file and report basic quality metrics.

    Returns a dict with status "ok" plus sample rate / duration / loudness,
    or status "error" when librosa is unavailable or decoding fails.
    """
    if not AUDIO_SUPPORT:
        return {"status": "error", "error": "librosa not installed"}

    try:
        # sr=None keeps the file's native sample rate instead of resampling.
        samples, rate = librosa.load(path, sr=None)
        seconds = librosa.get_duration(y=samples, sr=rate)

        # Mean root-mean-square energy as a rough loudness estimate.
        mean_rms = float(np.mean(librosa.feature.rms(y=samples)))

        report = {
            "status": "ok",
            "type": "audio",
            "filename": os.path.basename(path),
            "sample_rate": int(rate),
            "duration": float(seconds),
            "avg_volume_rms": mean_rms,
            "is_silent": mean_rms < 0.001,
        }
        return report
    except Exception as e:
        return {"status": "error", "error": str(e)}
38
-
39
def analyze_video(path):
    """Probe a video file with OpenCV: geometry, fps, duration, integrity."""
    try:
        cap = cv2.VideoCapture(path)
        if not cap.isOpened():
            return {"status": "error", "error": "Could not open video file"}

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps > 0 else 0

        # Spot-check decodability at the start, middle and end of the stream.
        probe_points = [0, frame_count // 2, frame_count - 1] if frame_count > 0 else []
        bad_reads = 0
        for frame_idx in probe_points:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ok, frame = cap.read()
            if not ok or frame is None:
                bad_reads += 1
        cap.release()

        return {
            "status": "ok",
            "type": "video",
            "filename": os.path.basename(path),
            "width": width,
            "height": height,
            "fps": float(fps),
            "duration": float(duration),
            "frame_count": frame_count,
            "corruption_risk": "high" if bad_reads > 0 else "low",
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
76
-
77
def main():
    """CLI entry point: analyze one media file or a directory, print JSON."""
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No path provided"}))
        sys.exit(1)

    input_path = sys.argv[1]
    results = []

    # Extensions routed to each analyzer.
    AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
    VIDEO_EXTS = (".mp4", ".avi", ".mkv", ".mov", ".wmv")

    if os.path.isfile(input_path):
        ext = os.path.splitext(input_path.lower())[1]
        if ext in AUDIO_EXTS:
            results.append(analyze_audio(input_path))
        elif ext in VIDEO_EXTS:
            results.append(analyze_video(input_path))
        else:
            results.append({"status": "error", "error": f"Unsupported file type: {ext}"})
    elif os.path.isdir(input_path):
        candidates = [os.path.join(input_path, name) for name in os.listdir(input_path)]
        for candidate in candidates[:50]:  # limit for demo
            ext = os.path.splitext(candidate.lower())[1]
            if ext in AUDIO_EXTS:
                results.append(analyze_audio(candidate))
            elif ext in VIDEO_EXTS:
                results.append(analyze_video(candidate))
    else:
        print(json.dumps({"error": "Invalid path"}))
        sys.exit(1)

    # Aggregate: successful analyses drive the summary averages below.
    ok_results = [r for r in results if r.get("status") == "ok"]
    report = {
        "total_files": len(results),
        "ok_files": len(ok_results),
        "failed_files": len(results) - len(ok_results),
        "details": results,
    }

    if ok_results:
        audio_hits = [r for r in ok_results if r["type"] == "audio"]
        video_hits = [r for r in ok_results if r["type"] == "video"]

        if audio_hits:
            report["avg_audio_duration"] = float(np.mean([r["duration"] for r in audio_hits]))
        if video_hits:
            report["avg_video_duration"] = float(np.mean([r["duration"] for r in video_hits]))
            report["avg_fps"] = float(np.mean([r["fps"] for r in video_hits]))

    print(json.dumps(report))


if __name__ == "__main__":
    main()
@@ -1,82 +0,0 @@
1
- import sys
2
- import json
3
- import argparse
4
- import urllib.request
5
- import urllib.parse
6
- from datetime import datetime
7
-
8
- # NASA Data Portal uses Socrata
9
- NASA_API_URL = "https://api.us.socrata.com/api/catalog/v1"
10
- NASA_DOMAIN = "data.nasa.gov"
11
-
12
def search_nasa(query: str, limit: int = 10):
    """Search the NASA open-data portal via the Socrata catalog API.

    Args:
        query: Free-text search string.
        limit: Maximum number of catalog entries to return.

    Returns:
        A list of Vesper-style metadata dicts on success, or a single
        ``{"error": "..."}`` dict on failure — callers must check the type.
    """
    # Local import so the module-level import surface stays unchanged.
    from datetime import timezone

    try:
        params = {
            "q": query,
            "limit": limit,
            "domains": NASA_DOMAIN,
            "search_context": NASA_DOMAIN
        }

        query_string = urllib.parse.urlencode(params)
        url = f"{NASA_API_URL}?{query_string}"

        req = urllib.request.Request(url)
        with urllib.request.urlopen(req) as response:
            data = json.load(response)

        results = []
        # Socrata catalog results are in 'results'
        items = data.get('results', [])

        for item in items:
            ds = item.get('resource', {})

            # datetime.utcnow() is deprecated since Python 3.12; an aware
            # UTC timestamp formats to the same "...Z" string.
            fallback_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

            metadata = {
                "id": f"nasa:{ds.get('id')}",
                "source": "nasa",
                "name": ds.get('name'),
                "description": ds.get('description') or "No description available.",
                "downloads": ds.get('download_count', 0),
                # Rough popularity proxy: one "like" per ten views.
                "likes": ds.get('view_count', 0) // 10,
                "last_updated": ds.get('updatedAt') or fallback_ts,
                "quality_score": 90,
                "license": {
                    "id": "public_domain",
                    "name": "Public Domain",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": []
                },
                "tags": ds.get('tags', []),
                "total_examples": 0,
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.nasa.gov/d/{ds.get('id')}",
                "domain": "science"
            }

            results.append(metadata)

        return results

    except Exception as e:
        return {"error": str(e)}
68
-
69
def main():
    """Parse CLI arguments and dispatch the requested action."""
    parser = argparse.ArgumentParser(description="NASA Adapter")
    parser.add_argument("--action", required=True, choices=["search"])
    parser.add_argument("--query", required=True)
    parser.add_argument("--limit", type=int, default=10)
    args = parser.parse_args()

    if args.action == "search":
        print(json.dumps(search_nasa(args.query, args.limit)))


if __name__ == "__main__":
    main()
@@ -1,83 +0,0 @@
1
- """
2
- Normalize any supported dataset file to parquet format.
3
- Usage: normalize_engine.py <input_path> <output_path>
4
- Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
5
- """
6
- import sys
7
- import json
8
- import os
9
-
10
- try:
11
- import polars as pl
12
- except Exception:
13
- print(json.dumps({"ok": False, "error": "polars is required"}))
14
- sys.exit(1)
15
-
16
-
17
def _load(src: str) -> pl.DataFrame:
    """Read *src* into a polars DataFrame, choosing a reader by extension.

    Unknown extensions fall back to CSV parsing. ``.json`` inputs are tried
    in order as: a JSON array, NDJSON, then a wrapper object holding a list
    of records under a well-known key.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager closes the handle deterministically (the original
        # bare open(...).read() leaked it until garbage collection).
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        # Try NDJSON: first line looks like a standalone JSON object.
        if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        # Try wrapper object with records under a conventional key.
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort - take first list-of-dicts value.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    if ext == ".txt":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)

    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
53
-
54
-
55
def normalize(input_path: str, output_path: str):
    """Convert *input_path* to parquet at *output_path*; return the row count."""
    df = _load(input_path)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the output path actually contains one.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
60
-
61
-
62
def main():
    """CLI wrapper: validate argv, run normalize(), emit one JSON status line."""
    if len(sys.argv) < 3:
        print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
        sys.exit(1)

    input_path, output_path = sys.argv[1], sys.argv[2]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        row_count = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": row_count}))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -1,146 +0,0 @@
1
- import sys
2
- import json
3
- import argparse
4
- import tempfile
5
- import os
6
- from typing import Dict, Any, List
7
-
8
- try:
9
- import openml
10
- except ImportError:
11
- openml = None
12
-
13
def _ensure_openml() -> Dict[str, Any]:
    """Report whether the optional ``openml`` dependency imported successfully."""
    if openml is not None:
        return {"ok": True}
    return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
17
-
18
- def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
19
- # OpenML dataset dict from list_datasets
20
- did = ds.get("did", "")
21
- name = ds.get("name", f"dataset_{did}")
22
- version = ds.get("version", "1")
23
- status = ds.get("status", "active")
24
- format = ds.get("format", "unknown")
25
-
26
- # Map to Vesper DatasetMetadata format
27
- return {
28
- "id": f"openml:{did}",
29
- "name": name,
30
- "source": "openml",
31
- "description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
32
- "author": "OpenML Community",
33
- "license": "Public",
34
- "tags": ["openml", format.lower()],
35
- "downloads": ds.get("NumberOfDownloads", 0),
36
- "likes": ds.get("NumberOfLikes", 0),
37
- "created_at": ds.get("upload_date", ""),
38
- "updated_at": ds.get("upload_date", ""),
39
- "size_bytes": 0, # Not always available in list
40
- "quality_score": 0.8, # Default good score for OpenML
41
- "domain": "machine_learning",
42
- "is_gated": False,
43
- "is_nsfw": False,
44
- "description_length": 100,
45
- "has_readme": False,
46
- }
47
-
48
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search OpenML datasets by name and return up to *limit* matches.

    The OpenML python API exposes no convenient server-side text search
    here, so the full listing is fetched and filtered locally by name.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        catalog = openml.datasets.list_datasets(output_format='dataframe')

        if query:
            # Case-insensitive substring match on the dataset name.
            catalog = catalog[catalog['name'].str.contains(query, case=False, na=False)]

        # Rank by popularity when the column is available.
        if 'NumberOfDownloads' in catalog.columns:
            catalog = catalog.sort_values('NumberOfDownloads', ascending=False)

        rows = catalog.head(limit).to_dict(orient='records')
        items = [_dataset_to_dict(row) for row in rows]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
82
-
83
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download an OpenML dataset and persist it locally as parquet.

    Args:
        dataset_ref: "openml:<id>" or a bare numeric id string.
        target_dir: Destination directory; a temp dir is created when falsy.

    Returns:
        ``{"ok": True, "local_path": ..., "target_dir": ...}`` on success,
        ``{"ok": False, "error": ...}`` on failure.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # Accept both "openml:123" and plain "123".
        if dataset_ref.startswith("openml:"):
            did_str = dataset_ref.split(":")[1]
        else:
            did_str = dataset_ref
        did = int(did_str)

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_openml_")
        os.makedirs(target_dir, exist_ok=True)

        # Get the dataset
        dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)

        # Fetch the full table once; target=None keeps every column together.
        # (The original called get_data twice and discarded the first result.)
        df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")

        # Sanitize the dataset name for use in a filename.
        safe_name = "".join([c if c.isalnum() else "_" for c in dataset.name])
        file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")

        df.to_parquet(file_path, index=False)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir
        }
    except Exception as e:
        return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
128
-
129
def main():
    """CLI entry point: positional action plus one or two loose arguments."""
    parser = argparse.ArgumentParser(description="Vesper OpenML Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
    args = parser.parse_args()

    if args.action == "discover":
        # arg2 doubles as the result limit; default to 20 when omitted.
        print(json.dumps(discover(args.arg1, int(args.arg2) if args.arg2 else 20)))
    elif args.action == "download":
        print(json.dumps(download(args.arg1, args.arg2)))


if __name__ == "__main__":
    main()