vesper-wizard 2.0.5 → 2.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/scripts/wizard.cjs +625 -0
  174. package/{wizard.js → scripts/wizard.js} +99 -21
  175. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  179. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  180. package/src/python/asset_downloader_engine.py +92 -0
  181. package/src/python/cleaner.py +226 -0
  182. package/src/python/config.py +263 -0
  183. package/src/python/dataworld_engine.py +208 -0
  184. package/src/python/export_engine.py +243 -0
  185. package/src/python/framework_adapters.py +100 -0
  186. package/src/python/fusion_engine.py +368 -0
  187. package/src/python/github_adapter.py +106 -0
  188. package/src/python/hf_fallback.py +298 -0
  189. package/src/python/image_engine.py +86 -0
  190. package/src/python/kaggle_engine.py +295 -0
  191. package/src/python/media_engine.py +133 -0
  192. package/src/python/nasa_adapter.py +82 -0
  193. package/src/python/openml_engine.py +146 -0
  194. package/src/python/quality_engine.py +267 -0
  195. package/src/python/row_count.py +54 -0
  196. package/src/python/splitter_engine.py +283 -0
  197. package/src/python/target_engine.py +154 -0
  198. package/src/python/test_framework_adapters.py +61 -0
  199. package/src/python/test_fusion_engine.py +89 -0
  200. package/src/python/uci_adapter.py +94 -0
  201. package/src/python/vesper/__init__.py +1 -0
  202. package/src/python/vesper/core/__init__.py +1 -0
  203. package/src/python/vesper/core/asset_downloader.py +675 -0
  204. package/src/python/vesper/core/download_recipe.py +104 -0
  205. package/src/python/worldbank_adapter.py +99 -0
  206. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,133 @@
1
+ import sys
2
+ import json
3
+ import os
4
+ import cv2
5
+ import numpy as np
6
+
7
+ # Audio analysis depends on librosa/soundfile. Fallback if not available.
8
+ try:
9
+ import librosa
10
+ AUDIO_SUPPORT = True
11
+ except ImportError:
12
+ AUDIO_SUPPORT = False
13
+
14
def analyze_audio(path):
    """Analyze a single audio file and return basic volume/duration metrics.

    Returns a dict with status "ok" plus metrics, or status "error" with a
    message (including when the optional librosa dependency is missing).
    """
    if not AUDIO_SUPPORT:
        return {"status": "error", "error": "librosa not installed"}

    try:
        # Load audio (mono; sr=None keeps the file's native sample rate)
        y, sr = librosa.load(path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)

        # Audio metrics: frame-wise RMS energy averaged over the whole clip
        rms = librosa.feature.rms(y=y)
        avg_rms = float(np.mean(rms))

        return {
            "status": "ok",
            "type": "audio",
            "filename": os.path.basename(path),
            "sample_rate": int(sr),
            "duration": float(duration),
            "avg_volume_rms": avg_rms,
            # Heuristic threshold: flags near-silent recordings
            "is_silent": avg_rms < 0.001
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
38
+
39
def analyze_video(path):
    """Analyze a single video file with OpenCV.

    Reports resolution, fps, duration, and a coarse corruption risk derived
    from decoding a few probe frames (first, middle, last).

    Returns a dict with status "ok" plus metrics, or status "error".
    """
    try:
        cap = cv2.VideoCapture(path)
        if not cap.isOpened():
            return {"status": "error", "error": "Could not open video file"}

        try:
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = frame_count / fps if fps > 0 else 0

            # Integrity probe: decode first/middle/last frames. Deduplicate the
            # indices so tiny clips (1-2 frames) are not probed twice.
            if frame_count > 0:
                test_frame_indices = sorted({0, frame_count // 2, frame_count - 1})
            else:
                test_frame_indices = []
            failed_frames = 0

            for idx in test_frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if not ret or frame is None:
                    failed_frames += 1
        finally:
            # Always release the capture handle, even if a read raises.
            cap.release()

        return {
            "status": "ok",
            "type": "video",
            "filename": os.path.basename(path),
            "width": width,
            "height": height,
            "fps": float(fps),
            "duration": float(duration),
            "frame_count": frame_count,
            "corruption_risk": "high" if failed_frames > 0 else "low"
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
76
+
77
def main():
    """CLI: analyze one media file or (a sample of) a directory of media.

    Prints a JSON report containing per-file details plus aggregate counts
    and average duration/fps metrics.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No path provided"}))
        sys.exit(1)

    input_path = sys.argv[1]
    results = []

    # Supported extensions
    AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
    VIDEO_EXTS = (".mp4", ".avi", ".mkv", ".mov", ".wmv")

    if os.path.isfile(input_path):
        ext = os.path.splitext(input_path.lower())[1]
        if ext in AUDIO_EXTS:
            results.append(analyze_audio(input_path))
        elif ext in VIDEO_EXTS:
            results.append(analyze_video(input_path))
        else:
            results.append({"status": "error", "error": f"Unsupported file type: {ext}"})
    elif os.path.isdir(input_path):
        # NOTE(review): non-recursive listing; subdirectories and files with
        # unsupported extensions are silently skipped here.
        files = [os.path.join(input_path, f) for f in os.listdir(input_path)]
        for f in files[:50]:  # Limit for demo
            ext = os.path.splitext(f.lower())[1]
            if ext in AUDIO_EXTS:
                results.append(analyze_audio(f))
            elif ext in VIDEO_EXTS:
                results.append(analyze_video(f))
    else:
        print(json.dumps({"error": "Invalid path"}))
        sys.exit(1)

    # Filtering failed results for report aggregation
    ok_results = [r for r in results if r.get("status") == "ok"]

    report = {
        "total_files": len(results),
        "ok_files": len(ok_results),
        "failed_files": len(results) - len(ok_results),
        "details": results
    }

    # Calculate some averages if files were found
    if ok_results:
        audio_files = [r for r in ok_results if r["type"] == "audio"]
        video_files = [r for r in ok_results if r["type"] == "video"]

        if audio_files:
            report["avg_audio_duration"] = float(np.mean([r["duration"] for r in audio_files]))
        if video_files:
            report["avg_video_duration"] = float(np.mean([r["duration"] for r in video_files]))
            report["avg_fps"] = float(np.mean([r["fps"] for r in video_files]))

    print(json.dumps(report))
131
+
132
+ if __name__ == "__main__":
133
+ main()
@@ -0,0 +1,82 @@
1
+ import sys
2
+ import json
3
+ import argparse
4
+ import urllib.request
5
+ import urllib.parse
6
+ from datetime import datetime
7
+
8
+ # NASA Data Portal uses Socrata
9
+ NASA_API_URL = "https://api.us.socrata.com/api/catalog/v1"
10
+ NASA_DOMAIN = "data.nasa.gov"
11
+
12
def search_nasa(query: str, limit: int = 10):
    """Search the NASA open-data portal via the Socrata discovery API.

    Returns a list of Vesper-style metadata dicts on success, or a dict
    {"error": ...} on failure — callers must check which shape they got.
    """
    from datetime import timezone  # stdlib; local import keeps module header unchanged

    try:
        params = {
            "q": query,
            "limit": limit,
            "domains": NASA_DOMAIN,
            "search_context": NASA_DOMAIN
        }

        query_string = urllib.parse.urlencode(params)
        url = f"{NASA_API_URL}?{query_string}"

        req = urllib.request.Request(url)
        # Bounded timeout: without one, a stalled endpoint hangs the CLI forever.
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.load(response)

        results = []
        # Socrata catalog results are in 'results'
        items = data.get('results', [])

        for item in items:
            ds = item.get('resource', {})

            metadata = {
                "id": f"nasa:{ds.get('id')}",
                "source": "nasa",
                "name": ds.get('name'),
                "description": ds.get('description') or "No description available.",
                "downloads": ds.get('download_count', 0),
                # Heuristic: roughly one "like" per 10 views
                "likes": ds.get('view_count', 0) // 10,
                # Timezone-aware now() replaces the deprecated datetime.utcnow()
                "last_updated": ds.get('updatedAt') or datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
                "quality_score": 90,
                "license": {
                    "id": "public_domain",
                    "name": "Public Domain",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": []
                },
                "tags": ds.get('tags', []),
                "total_examples": 0,
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.nasa.gov/d/{ds.get('id')}",
                "domain": "science"
            }

            results.append(metadata)

        return results

    except Exception as e:
        return {"error": str(e)}
68
+
69
def main():
    """CLI entry point for the NASA adapter (currently only `search`)."""
    arg_parser = argparse.ArgumentParser(description="NASA Adapter")
    arg_parser.add_argument("--action", required=True, choices=["search"])
    arg_parser.add_argument("--query", required=True)
    arg_parser.add_argument("--limit", type=int, default=10)

    parsed = arg_parser.parse_args()

    if parsed.action == "search":
        # Emit results as a single JSON document on stdout.
        print(json.dumps(search_nasa(parsed.query, parsed.limit)))
80
+
81
+ if __name__ == "__main__":
82
+ main()
@@ -0,0 +1,146 @@
1
+ import sys
2
+ import json
3
+ import argparse
4
+ import tempfile
5
+ import os
6
+ from typing import Dict, Any, List
7
+
8
+ try:
9
+ import openml
10
+ except ImportError:
11
+ openml = None
12
+
13
def _ensure_openml() -> Dict[str, Any]:
    """Report whether the optional `openml` dependency is importable."""
    if openml is not None:
        return {"ok": True}
    return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
17
+
18
+ def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
19
+ # OpenML dataset dict from list_datasets
20
+ did = ds.get("did", "")
21
+ name = ds.get("name", f"dataset_{did}")
22
+ version = ds.get("version", "1")
23
+ status = ds.get("status", "active")
24
+ format = ds.get("format", "unknown")
25
+
26
+ # Map to Vesper DatasetMetadata format
27
+ return {
28
+ "id": f"openml:{did}",
29
+ "name": name,
30
+ "source": "openml",
31
+ "description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
32
+ "author": "OpenML Community",
33
+ "license": "Public",
34
+ "tags": ["openml", format.lower()],
35
+ "downloads": ds.get("NumberOfDownloads", 0),
36
+ "likes": ds.get("NumberOfLikes", 0),
37
+ "created_at": ds.get("upload_date", ""),
38
+ "updated_at": ds.get("upload_date", ""),
39
+ "size_bytes": 0, # Not always available in list
40
+ "quality_score": 0.8, # Default good score for OpenML
41
+ "domain": "machine_learning",
42
+ "is_gated": False,
43
+ "is_nsfw": False,
44
+ "description_length": 100,
45
+ "has_readme": False,
46
+ }
47
+
48
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search OpenML dataset listings by case-insensitive name substring.

    Returns {"ok": True, "results": [...], "count": n} on success, or
    {"ok": False, "error": ...} when openml is missing or the query fails.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # The python API exposes no server-side text search, so fetch the
        # full listing as a dataframe and filter locally by name.
        listing = openml.datasets.list_datasets(output_format='dataframe')

        if query:
            name_match = listing['name'].str.contains(query, case=False, na=False)
            candidates = listing[name_match]
        else:
            candidates = listing

        # Rank by popularity when the column exists in this listing.
        if 'NumberOfDownloads' in candidates.columns:
            candidates = candidates.sort_values('NumberOfDownloads', ascending=False)

        records = candidates.head(limit).to_dict(orient='records')
        items = [_dataset_to_dict(rec) for rec in records]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
82
+
83
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download an OpenML dataset and save it as parquet under target_dir.

    dataset_ref is "openml:ID" (bare numeric IDs also accepted). An empty
    target_dir gets a fresh temp directory. Returns {"ok": True,
    "local_path": ..., "target_dir": ...} or {"ok": False, "error": ...}.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # dataset_ref is expected to be "openml:ID"
        if dataset_ref.startswith("openml:"):
            did_str = dataset_ref.split(":")[1]
        else:
            did_str = dataset_ref

        did = int(did_str)

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_openml_")

        os.makedirs(target_dir, exist_ok=True)

        # Get the dataset
        dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)

        # Single get_data call with target=None returns the full table as a
        # dataframe. (The previous code materialized the dataframe twice and
        # threw the first result away.)
        df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")

        # Sanitize the dataset name for use as a filename
        safe_name = "".join([c if c.isalnum() else "_" for c in dataset.name])
        file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")

        df.to_parquet(file_path, index=False)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir
        }
    except Exception as e:
        return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
128
+
129
def main():
    """CLI entry point: positional action with overloaded arg1/arg2 slots."""
    parser = argparse.ArgumentParser(description="Vesper OpenML Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")

    opts = parser.parse_args()

    if opts.action == "discover":
        # arg2 is an optional result limit for discover.
        result_limit = int(opts.arg2) if opts.arg2 else 20
        print(json.dumps(discover(opts.arg1, result_limit)))
    elif opts.action == "download":
        # arg2 is an optional target directory for download.
        print(json.dumps(download(opts.arg1, opts.arg2)))
144
+
145
+ if __name__ == "__main__":
146
+ main()
@@ -0,0 +1,267 @@
1
+ import sys
2
+ import json
3
+ import polars as pl
4
+ import numpy as np
5
+
6
def analyze_column(df, col_name, dtype):
    """Compute per-column quality stats for one Polars dataframe column.

    Returns a dict with missing/unique counts, an inferred type for strings
    that are mostly numeric, a numeric distribution where applicable, and the
    top categorical values. Failures are captured into stats["error"] rather
    than raised.
    """
    stats = {
        "name": col_name,
        "type": str(dtype),
        "inferred_type": str(dtype),  # Default to the stored dtype
        "missing_count": 0,
        "missing_percentage": 0.0,
        "unique_count": 0,
        "is_constant": False,
        "is_mixed_type": False
    }

    try:
        col = df[col_name]
        null_count = col.null_count()
        row_count = len(col)

        stats["missing_count"] = null_count
        stats["missing_percentage"] = (null_count / row_count) * 100 if row_count > 0 else 0
        stats["unique_count"] = col.n_unique()
        stats["is_constant"] = stats["unique_count"] <= 1 and row_count > 0

        # Schema inference: string columns that are mostly castable to float
        # are flagged as numeric-stored-as-string.
        is_string = dtype == pl.Utf8 or dtype == pl.Object

        if is_string and row_count > 0:
            try:
                # strict=False turns non-numeric values into nulls
                numeric_cast = col.str.strip_chars().cast(pl.Float64, strict=False)
                numeric_nulls = numeric_cast.null_count()

                valid_numbers = row_count - numeric_nulls
                original_valid = row_count - null_count

                # Guard: an all-null column makes original_valid zero, which
                # previously raised ZeroDivisionError (masked by bare except).
                if valid_numbers > 0 and original_valid > 0:
                    numeric_ratio = valid_numbers / original_valid
                    if numeric_ratio > 0.9:
                        stats["inferred_type"] = "Numeric (Stored as String)"
                    elif numeric_ratio < 0.9:
                        # Some values parse as numbers, but many do not.
                        stats["is_mixed_type"] = True
            except Exception:
                pass

        # Numeric distribution (also for inferred-numeric string columns)
        if dtype in [pl.Int64, pl.Int32, pl.Float64, pl.Float32] or stats["inferred_type"].startswith("Numeric"):
            clean_col = col
            if is_string:
                # Cast for analysis if the column was inferred numeric
                clean_col = col.str.strip_chars().cast(pl.Float64, strict=False)

            clean_col = clean_col.drop_nulls()

            if len(clean_col) > 0:
                stats["distribution"] = {
                    "min": float(clean_col.min()),
                    "max": float(clean_col.max()),
                    "mean": float(clean_col.mean()),
                    "std": float(clean_col.std()) if len(clean_col) > 1 else 0,
                    "p25": float(clean_col.quantile(0.25)),
                    "p50": float(clean_col.median()),
                    "p75": float(clean_col.quantile(0.75))
                }

        # Top-5 categorical values. The value_counts layout differs across
        # Polars versions, so read positional rows defensively.
        if dtype == pl.Utf8 or dtype == pl.Categorical:
            value_counts = col.value_counts(sort=True).head(5)
            try:
                rows = value_counts.rows()
                top_values = {}
                for row in rows:
                    val = str(row[0]) if row[0] is not None else "null"
                    top_values[val] = int(row[1])
                stats["top_values"] = top_values
            except Exception:
                pass

    except Exception as e:
        stats["error"] = str(e)

    return stats
+
93
+ def main():
94
+ if len(sys.argv) < 2:
95
+ print(json.dumps({"error": "No file path provided"}))
96
+ sys.exit(1)
97
+
98
+ file_path = sys.argv[1]
99
+
100
+ try:
101
+ # Robust file reading with extension detection
102
+ file_path_lower = file_path.lower()
103
+ if file_path_lower.endswith(".csv"):
104
+ df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
105
+ elif file_path_lower.endswith(".tsv"):
106
+ df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
107
+ elif file_path_lower.endswith(".txt"):
108
+ sep = ","
109
+ try:
110
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
111
+ first_line = fh.readline()
112
+ if "\t" in first_line:
113
+ sep = "\t"
114
+ except Exception:
115
+ sep = ","
116
+ df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
117
+ elif file_path_lower.endswith(".parquet"):
118
+ try:
119
+ # Try scanning first (faster for large files)
120
+ df = pl.scan_parquet(file_path).limit(10000).collect()
121
+ except:
122
+ df = pl.read_parquet(file_path)
123
+ if len(df) > 10000: df = df.head(10000)
124
+ elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
125
+ # Explicit NDJSON
126
+ df = pl.scan_ndjson(file_path).limit(10000).collect()
127
+ elif file_path_lower.endswith(".json"):
128
+ # Ambiguous .json: Try standard JSON first, then NDJSON fallback
129
+ try:
130
+ # read_json reads standard JSON array [{}, {}]
131
+ df = pl.read_json(file_path)
132
+ if len(df) > 10000: df = df.head(10000)
133
+ except Exception:
134
+ try:
135
+ # Fallback to NDJSON (common for large datasets mislabeled as .json)
136
+ df = pl.scan_ndjson(file_path).limit(10000).collect()
137
+ except Exception as e:
138
+ print(json.dumps({"error": f"Failed to read JSON: {str(e)}"}))
139
+ sys.exit(1)
140
+ else:
141
+ print(json.dumps({"error": f"Unsupported file extension: {file_path}"}))
142
+ sys.exit(1)
143
+
144
+ row_count = len(df)
145
+ column_count = len(df.columns)
146
+
147
+ # Duplicate detection (exact)
148
+ # NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
149
+ # Use a Python fallback that is slower but robust for the 10k sampled rows.
150
+ duplicate_count = 0
151
+ try:
152
+ seen = set()
153
+ for row in df.to_dicts():
154
+ row_key = json.dumps(row, sort_keys=True, default=str)
155
+ if row_key in seen:
156
+ duplicate_count += 1
157
+ else:
158
+ seen.add(row_key)
159
+ except Exception:
160
+ duplicate_count = 0
161
+
162
+ columns_stats = []
163
+ text_cols = []
164
+ for col in df.columns:
165
+ stats = analyze_column(df, col, df.schema[col])
166
+ columns_stats.append(stats)
167
+ # Check for String type (Polars can return 'String' or 'Utf8' depending on version)
168
+ dtype_str = stats["type"]
169
+ if ("String" in dtype_str or "Utf8" in dtype_str) and stats["unique_count"] > 1:
170
+ text_cols.append(col)
171
+
172
+ report = {
173
+ "row_count": row_count,
174
+ "column_count": column_count,
175
+ "duplicate_rows": int(duplicate_count),
176
+ "duplicate_percentage": (duplicate_count / row_count * 100) if row_count > 0 else 0,
177
+ "columns": columns_stats,
178
+ "warnings": [],
179
+ "schema_warnings": [],
180
+ "overall_score": 100
181
+ }
182
+
183
+ # Integrity Check 1: Text Duplicates (Fuzzyish Proxy)
184
+ # If duplicated rows are 0, check if main text content is duplicated
185
+ if duplicate_count == 0 and len(text_cols) > 0:
186
+ # Pick longest text column as likely "content"
187
+ # In real impl, we'd use heuristics. For now, first text col.
188
+ target_col = text_cols[0]
189
+ try:
190
+ text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
191
+ if text_dupes > 0:
192
+ report["text_duplicates"] = int(text_dupes)
193
+ if text_dupes > (row_count * 0.2):
194
+ report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
195
+ except Exception:
196
+ # Skip text duplicate warning if backend cannot compute duplicates for this dtype
197
+ pass
198
+
199
+ # Integrity Check 2: Contamination / Leakage (Basic)
200
+ # (Skipping correlation for now)
201
+
202
+ report["class_imbalance_warnings"] = []
203
+ report["pii_warnings"] = []
204
+
205
+ # PII Patterns (Regex)
206
+ import re
207
+ pii_patterns = {
208
+ "Email": r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
209
+ "Phone": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', # Basic US-ish pattern
210
+ "SSN": r'\d{3}-\d{2}-\d{4}',
211
+ "IPv4": r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
212
+ }
213
+
214
+ # Bias & PII Analysis
215
+ for col_name, stats in zip(df.columns, columns_stats):
216
+ # Class Imbalance
217
+ if stats["unique_count"] > 1 and stats["unique_count"] < 50:
218
+ try:
219
+ col = df[col_name]
220
+ top_val_count = col.value_counts().sort("count", descending=True).row(0)[1]
221
+ total = len(col)
222
+ if total > 0:
223
+ ratio = top_val_count / total
224
+ if ratio > 0.9:
225
+ report["class_imbalance_warnings"].append(f"Severe imbalance in '{col_name}': Top class is {(ratio*100):.1f}% of data")
226
+ except:
227
+ pass
228
+
229
+ # PII Detection (on Text Columns only)
230
+ if ("String" in stats["type"] or "Utf8" in stats["type"]):
231
+ try:
232
+ # Sample for performance (check first 1000 non-null values)
233
+ sample_text = df[col_name].drop_nulls().head(1000).to_list()
234
+ # Join a subset to regex against (faster than row-by-row for simple checks)
235
+ combined_text = " ".join([str(x) for x in sample_text])
236
+
237
+ for pii_type, pattern in pii_patterns.items():
238
+ if re.search(pattern, combined_text):
239
+ # Ensure we don't flag column names like "email_address" but actual content
240
+ # Double check with a strict count if trigger found
241
+ matches = len(re.findall(pattern, combined_text))
242
+ if matches > 0:
243
+ report["pii_warnings"].append(f"Potential {pii_type} detected in column '{col_name}' ({matches} matches in sample)")
244
+ except:
245
+ pass
246
+
247
+ # Basic warnings
248
+ if report["duplicate_percentage"] > 10:
249
+ report["warnings"].append("High duplication rate (>10%)")
250
+ if row_count < 50:
251
+ report["warnings"].append("Dataset is very small (<50 rows)")
252
+
253
+ # Schema warnings
254
+ for col in columns_stats:
255
+ if "Numeric" in col.get("inferred_type", "") and "Utf8" in col.get("type", ""):
256
+ report["schema_warnings"].append(f"Column '{col['name']}' looks Numeric but is stored as String")
257
+ if col.get("is_mixed_type"):
258
+ report["schema_warnings"].append(f"Column '{col['name']}' likely contains mixed types (numbers and strings)")
259
+
260
+ print(json.dumps(report))
261
+
262
+ except Exception as e:
263
+ print(json.dumps({"error": f"Analysis failed: {str(e)}"}))
264
+ sys.exit(1)
265
+
266
+ if __name__ == "__main__":
267
+ main()
@@ -0,0 +1,54 @@
1
+ import sys
2
+ import json
3
+ import os
4
+
5
+ try:
6
+ import polars as pl
7
+ except Exception:
8
+ print(json.dumps({"ok": False, "error": "polars is required"}))
9
+ sys.exit(1)
10
+
11
+
12
def count_rows(path: str) -> int:
    """Return the number of data rows in a tabular file.

    Uses lazy Polars scans where available so large files are not fully
    materialized; unknown extensions fall back to a permissive CSV read.
    """
    suffix = os.path.splitext(path)[1].lower()

    def _lazy_len(lf) -> int:
        # Count rows from a lazy frame without loading the full table.
        return int(lf.select(pl.len()).collect().item())

    if suffix == ".csv":
        # Faster than a full read for large CSVs
        return _lazy_len(pl.scan_csv(path, ignore_errors=True))
    if suffix in (".parquet", ".pq"):
        return _lazy_len(pl.scan_parquet(path))
    if suffix in (".feather", ".ftr", ".arrow", ".ipc"):
        return _lazy_len(pl.scan_ipc(path))
    if suffix in (".jsonl", ".ndjson"):
        return _lazy_len(pl.scan_ndjson(path))
    if suffix == ".json":
        # Plain JSON arrays need an eager read; fall back to NDJSON on failure
        try:
            return int(pl.read_json(path).height)
        except Exception:
            return _lazy_len(pl.scan_ndjson(path))

    # Unknown extension: best-effort CSV parse
    return int(pl.read_csv(path, ignore_errors=True).height)
33
+
34
+
35
def main():
    """CLI: print a JSON row-count result for the file given as argv[1]."""
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
        sys.exit(1)

    p = sys.argv[1]
    if not os.path.exists(p):
        print(json.dumps({"ok": False, "error": f"File not found: {p}"}))
        sys.exit(1)

    try:
        print(json.dumps({"ok": True, "rows": count_rows(p)}))
    except Exception as e:
        print(json.dumps({"ok": False, "error": str(e)}))
        sys.exit(1)
51
+
52
+
53
+ if __name__ == "__main__":
54
+ main()