vesper-wizard 2.0.5 → 2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +300 -37
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +81 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +62 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +127 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +26 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/config/config-manager.js +221 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +69 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/engine.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/gateway/unified-dataset-gateway.js +409 -0
- package/build/index.js +2704 -0
- package/build/ingestion/hf-downloader.js +171 -0
- package/build/ingestion/ingestor.js +271 -0
- package/build/ingestion/kaggle-downloader.js +102 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +136 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/lib/supabase.js +3 -0
- package/build/metadata/dataworld-source.js +89 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/openml-source.js +87 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +377 -0
- package/build/metadata/store.js +340 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/preparation/target-detector.js +75 -0
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +92 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/config.py +263 -0
- package/build/python/dataworld_engine.py +208 -0
- package/build/python/export_engine.py +243 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/fusion_engine.py +368 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/hf_fallback.py +298 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/kaggle_engine.py +295 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/openml_engine.py +146 -0
- package/build/python/quality_engine.py +267 -0
- package/build/python/row_count.py +54 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +675 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +93 -0
- package/build/quality/image-analyzer.js +114 -0
- package/build/quality/media-analyzer.js +115 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +74 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +152 -0
- package/build/search/jit-orchestrator.js +258 -0
- package/build/search/vector-store.js +123 -0
- package/build/splitting/splitter.js +82 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +251 -0
- package/build/utils/downloader.js +52 -0
- package/build/utils/selector.js +69 -0
- package/mcp-config-template.json +18 -0
- package/package.json +101 -29
- package/scripts/postinstall.cjs +114 -0
- package/scripts/preindex_registry.cjs +157 -0
- package/scripts/refresh-index.cjs +87 -0
- package/scripts/wizard.cjs +625 -0
- package/{wizard.js → scripts/wizard.js} +99 -21
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +92 -0
- package/src/python/cleaner.py +226 -0
- package/src/python/config.py +263 -0
- package/src/python/dataworld_engine.py +208 -0
- package/src/python/export_engine.py +243 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/fusion_engine.py +368 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/hf_fallback.py +298 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/kaggle_engine.py +295 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/openml_engine.py +146 -0
- package/src/python/quality_engine.py +267 -0
- package/src/python/row_count.py +54 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/target_engine.py +154 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/test_fusion_engine.py +89 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +675 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
- package/src/python/worldbank_adapter.py +99 -0
- package/vesper-mcp-config.json +0 -6
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import cv2
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
# Audio analysis depends on librosa/soundfile. Fallback if not available.
|
|
8
|
+
try:
|
|
9
|
+
import librosa
|
|
10
|
+
AUDIO_SUPPORT = True
|
|
11
|
+
except ImportError:
|
|
12
|
+
AUDIO_SUPPORT = False
|
|
13
|
+
|
|
14
|
+
def analyze_audio(path):
    """Inspect one audio file and return basic quality metrics.

    Produces a dict with status "ok" plus sample rate, duration, average
    RMS volume and a silence flag, or status "error" when librosa is
    unavailable or the file cannot be decoded.
    """
    if not AUDIO_SUPPORT:
        return {"status": "error", "error": "librosa not installed"}

    try:
        # Decode at the file's native sample rate (sr=None leaves it unchanged).
        samples, sample_rate = librosa.load(path, sr=None)
        total_seconds = librosa.get_duration(y=samples, sr=sample_rate)

        # Root-mean-square energy as a crude loudness measure.
        frame_rms = librosa.feature.rms(y=samples)
        mean_rms = float(np.mean(frame_rms))

        return {
            "status": "ok",
            "type": "audio",
            "filename": os.path.basename(path),
            "sample_rate": int(sample_rate),
            "duration": float(total_seconds),
            "avg_volume_rms": mean_rms,
            "is_silent": mean_rms < 0.001
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
|
|
38
|
+
|
|
39
|
+
def analyze_video(path):
    """Inspect a video file with OpenCV and report geometry plus a basic
    corruption check.

    Decodes the first, middle and last frames; any failed decode flags the
    file with "high" corruption risk. Returns a dict with status "ok" on
    success, or status "error" with a message when the file cannot be
    opened or analysis raises.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(path)
        if not cap.isOpened():
            return {"status": "error", "error": "Could not open video file"}

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Guard against fps == 0 (some containers report no frame rate).
        duration = frame_count / fps if fps > 0 else 0

        # Integrity probe: decode a few frames spread across the file.
        test_frame_indices = [0, frame_count // 2, frame_count - 1] if frame_count > 0 else []
        failed_frames = 0

        for idx in test_frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret or frame is None:
                failed_frames += 1

        return {
            "status": "ok",
            "type": "video",
            "filename": os.path.basename(path),
            "width": width,
            "height": height,
            "fps": float(fps),
            "duration": float(duration),
            "frame_count": frame_count,
            "corruption_risk": "high" if failed_frames > 0 else "low"
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
    finally:
        # BUGFIX: the original released the capture only on the success
        # path, leaking the OS handle whenever an exception fired between
        # open and release. Releasing in finally closes it on every path.
        if cap is not None:
            cap.release()
|
|
76
|
+
|
|
77
|
+
def main():
    """CLI entry point: analyze a media file or directory, print a JSON report.

    Accepts a single path argument. A file is routed by extension to the
    audio or video analyzer; a directory has its first 50 entries scanned,
    silently skipping unsupported types. The report aggregates pass/fail
    counts and duration/fps averages for the files that analyzed cleanly.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No path provided"}))
        sys.exit(1)

    input_path = sys.argv[1]
    results = []

    # Extensions we know how to analyze.
    AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
    VIDEO_EXTS = (".mp4", ".avi", ".mkv", ".mov", ".wmv")

    def _dispatch(target):
        # Route one path to the matching analyzer; None means unsupported type.
        ext = os.path.splitext(target.lower())[1]
        if ext in AUDIO_EXTS:
            return analyze_audio(target)
        if ext in VIDEO_EXTS:
            return analyze_video(target)
        return None

    if os.path.isfile(input_path):
        outcome = _dispatch(input_path)
        if outcome is None:
            ext = os.path.splitext(input_path.lower())[1]
            results.append({"status": "error", "error": f"Unsupported file type: {ext}"})
        else:
            results.append(outcome)
    elif os.path.isdir(input_path):
        candidates = [os.path.join(input_path, entry) for entry in os.listdir(input_path)]
        for candidate in candidates[:50]:  # Limit for demo
            outcome = _dispatch(candidate)
            if outcome is not None:
                results.append(outcome)
    else:
        print(json.dumps({"error": "Invalid path"}))
        sys.exit(1)

    # Split out the files that analyzed successfully for aggregation.
    ok_results = [r for r in results if r.get("status") == "ok"]

    report = {
        "total_files": len(results),
        "ok_files": len(ok_results),
        "failed_files": len(results) - len(ok_results),
        "details": results
    }

    # Aggregate averages only when at least one file analyzed cleanly.
    if ok_results:
        audio_files = [r for r in ok_results if r["type"] == "audio"]
        video_files = [r for r in ok_results if r["type"] == "video"]

        if audio_files:
            report["avg_audio_duration"] = float(np.mean([r["duration"] for r in audio_files]))
        if video_files:
            report["avg_video_duration"] = float(np.mean([r["duration"] for r in video_files]))
            report["avg_fps"] = float(np.mean([r["fps"] for r in video_files]))

    print(json.dumps(report))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import urllib.request
|
|
5
|
+
import urllib.parse
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
|
|
8
|
+
# NASA Data Portal uses Socrata
|
|
9
|
+
NASA_API_URL = "https://api.us.socrata.com/api/catalog/v1"
|
|
10
|
+
NASA_DOMAIN = "data.nasa.gov"
|
|
11
|
+
|
|
12
|
+
def search_nasa(query: str, limit: int = 10):
    """Search the NASA open-data portal via the Socrata catalog API.

    Parameters
    ----------
    query : free-text search string.
    limit : maximum number of results to request.

    Returns a list of Vesper-style metadata dicts on success, or a dict
    {"error": ...} on failure (note the asymmetric return shape — existing
    callers depend on it).
    """
    try:
        params = {
            "q": query,
            "limit": limit,
            "domains": NASA_DOMAIN,
            "search_context": NASA_DOMAIN
        }

        query_string = urllib.parse.urlencode(params)
        url = f"{NASA_API_URL}?{query_string}"

        req = urllib.request.Request(url)
        # BUGFIX: the original call had no timeout, so a stalled socket
        # could hang the adapter indefinitely; cap the wait at 30s.
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.load(response)

        results = []
        # Socrata catalog responses nest datasets under 'results'.
        items = data.get('results', [])

        for item in items:
            ds = item.get('resource', {})

            metadata = {
                "id": f"nasa:{ds.get('id')}",
                "source": "nasa",
                "name": ds.get('name'),
                "description": ds.get('description') or "No description available.",
                "downloads": ds.get('download_count', 0),
                # Rough proxy: 1 "like" per 10 views.
                "likes": ds.get('view_count', 0) // 10,
                # NOTE(review): utcnow() is naive and deprecated in 3.12;
                # kept for output-format compatibility ("...Z" suffix).
                "last_updated": ds.get('updatedAt') or datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
                "quality_score": 90,
                # NASA data is US-government work: public domain.
                "license": {
                    "id": "public_domain",
                    "name": "Public Domain",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": []
                },
                "tags": ds.get('tags', []),
                "total_examples": 0,
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.nasa.gov/d/{ds.get('id')}",
                "domain": "science"
            }

            results.append(metadata)

        return results

    except Exception as e:
        return {"error": str(e)}
|
|
68
|
+
|
|
69
|
+
def main():
    """Command-line interface for the NASA adapter (search only)."""
    parser = argparse.ArgumentParser(description="NASA Adapter")
    parser.add_argument("--action", required=True, choices=["search"])
    parser.add_argument("--query", required=True)
    parser.add_argument("--limit", type=int, default=10)

    args = parser.parse_args()

    if args.action == "search":
        print(json.dumps(search_nasa(args.query, args.limit)))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import tempfile
|
|
5
|
+
import os
|
|
6
|
+
from typing import Dict, Any, List
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import openml
|
|
10
|
+
except ImportError:
|
|
11
|
+
openml = None
|
|
12
|
+
|
|
13
|
+
def _ensure_openml() -> Dict[str, Any]:
    """Check that the optional 'openml' dependency imported successfully.

    Returns {"ok": True} when available; otherwise an error payload that
    tells the user how to install the package.
    """
    if openml is not None:
        return {"ok": True}
    return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
|
|
17
|
+
|
|
18
|
+
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
|
|
19
|
+
# OpenML dataset dict from list_datasets
|
|
20
|
+
did = ds.get("did", "")
|
|
21
|
+
name = ds.get("name", f"dataset_{did}")
|
|
22
|
+
version = ds.get("version", "1")
|
|
23
|
+
status = ds.get("status", "active")
|
|
24
|
+
format = ds.get("format", "unknown")
|
|
25
|
+
|
|
26
|
+
# Map to Vesper DatasetMetadata format
|
|
27
|
+
return {
|
|
28
|
+
"id": f"openml:{did}",
|
|
29
|
+
"name": name,
|
|
30
|
+
"source": "openml",
|
|
31
|
+
"description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
|
|
32
|
+
"author": "OpenML Community",
|
|
33
|
+
"license": "Public",
|
|
34
|
+
"tags": ["openml", format.lower()],
|
|
35
|
+
"downloads": ds.get("NumberOfDownloads", 0),
|
|
36
|
+
"likes": ds.get("NumberOfLikes", 0),
|
|
37
|
+
"created_at": ds.get("upload_date", ""),
|
|
38
|
+
"updated_at": ds.get("upload_date", ""),
|
|
39
|
+
"size_bytes": 0, # Not always available in list
|
|
40
|
+
"quality_score": 0.8, # Default good score for OpenML
|
|
41
|
+
"domain": "machine_learning",
|
|
42
|
+
"is_gated": False,
|
|
43
|
+
"is_nsfw": False,
|
|
44
|
+
"description_length": 100,
|
|
45
|
+
"has_readme": False,
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search OpenML datasets by name substring.

    The OpenML python API exposes no server-side text search, so the
    catalog is fetched as a dataframe and filtered locally, preferring
    the most-downloaded datasets.

    Returns {"ok": True, "results": [...], "count": N} or an error payload.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # Pull the full catalog listing as a dataframe.
        catalog = openml.datasets.list_datasets(output_format='dataframe')

        if query:
            # Case-insensitive substring match on the dataset name.
            name_hits = catalog['name'].str.contains(query, case=False, na=False)
            matches = catalog[name_hits]
        else:
            matches = catalog

        # Prefer popular datasets when the download counts are present.
        if 'NumberOfDownloads' in matches.columns:
            matches = matches.sort_values('NumberOfDownloads', ascending=False)

        records = matches.head(limit).to_dict(orient='records')
        items = [_dataset_to_dict(rec) for rec in records]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
|
|
82
|
+
|
|
83
|
+
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download an OpenML dataset and persist it locally as parquet.

    Parameters
    ----------
    dataset_ref : "openml:<id>" or a bare numeric id string.
    target_dir  : destination directory; a temp dir is created when falsy.

    Returns {"ok": True, "local_path": ..., "target_dir": ...} on success,
    or {"ok": False, "error": ...} on failure.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # Accept both "openml:123" and plain "123" (split once, so a stray
        # extra colon in the suffix cannot truncate the id).
        if dataset_ref.startswith("openml:"):
            did_str = dataset_ref.split(":", 1)[1]
        else:
            did_str = dataset_ref

        did = int(did_str)

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_openml_")

        os.makedirs(target_dir, exist_ok=True)

        # Fetch dataset metadata and its data files.
        dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)

        # BUGFIX: the original called dataset.get_data() twice — first
        # splitting X/y and discarding the result, then again for the full
        # frame — parsing the data twice for nothing. Fetch once with
        # target=None so the frame keeps every column.
        df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")

        # Sanitize the dataset name for safe use inside a filename.
        safe_name = "".join([c if c.isalnum() else "_" for c in dataset.name])
        file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")

        df.to_parquet(file_path, index=False)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir
        }
    except Exception as e:
        return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
|
|
128
|
+
|
|
129
|
+
def main():
    """CLI: 'discover <query> [limit]' or 'download <ref> [target_dir]'."""
    parser = argparse.ArgumentParser(description="Vesper OpenML Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")

    args = parser.parse_args()

    if args.action == "discover":
        # arg2 is the optional result cap for discover.
        max_results = int(args.arg2) if args.arg2 else 20
        print(json.dumps(discover(args.arg1, max_results)))
    elif args.action == "download":
        # arg2 is the optional target directory for download.
        print(json.dumps(download(args.arg1, args.arg2)))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import polars as pl
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
def analyze_column(df, col_name, dtype):
    """Profile one dataframe column for the quality report.

    Parameters
    ----------
    df : polars DataFrame holding the (sampled) data.
    col_name : name of the column to profile.
    dtype : the column's polars dtype (as taken from df.schema).

    Returns a stats dict covering missingness, cardinality, an inferred
    type for numeric-looking string columns, a numeric distribution when
    applicable, and the top categorical values. Any analysis failure is
    captured into stats["error"] rather than raised.
    """
    stats = {
        "name": col_name,
        "type": str(dtype),
        "inferred_type": str(dtype),  # Default to actual
        "missing_count": 0,
        "missing_percentage": 0.0,
        "unique_count": 0,
        "is_constant": False,
        "is_mixed_type": False
    }

    try:
        col = df[col_name]
        null_count = col.null_count()
        row_count = len(col)

        stats["missing_count"] = null_count
        stats["missing_percentage"] = (null_count / row_count) * 100 if row_count > 0 else 0
        stats["unique_count"] = col.n_unique()
        stats["is_constant"] = stats["unique_count"] <= 1 and row_count > 0

        # Schema inference: does a string column actually hold numbers?
        is_string = dtype == pl.Utf8 or dtype == pl.Object

        if is_string and row_count > 0:
            try:
                # strict=False turns non-numeric values into nulls.
                numeric_cast = col.str.strip_chars().cast(pl.Float64, strict=False)
                numeric_nulls = numeric_cast.null_count()

                # Compare parseable values against the originally non-null ones.
                valid_numbers = row_count - numeric_nulls
                original_valid = row_count - null_count

                # >90% parseable => effectively numeric stored as text.
                if valid_numbers > 0 and (valid_numbers / original_valid) > 0.9:
                    stats["inferred_type"] = "Numeric (Stored as String)"

                # Some numbers but plenty of non-numeric strings too.
                elif valid_numbers > 0 and (valid_numbers / original_valid) < 0.9:
                    stats["is_mixed_type"] = True
            except Exception:
                # IDIOM FIX: was a bare 'except:'. Keep the best-effort
                # behavior but stop swallowing SystemExit/KeyboardInterrupt.
                pass

        # Numeric distribution (native numerics or inferred-numeric strings).
        if dtype in [pl.Int64, pl.Int32, pl.Float64, pl.Float32] or stats["inferred_type"].startswith("Numeric"):
            clean_col = col
            if is_string:
                # Cast for analysis if numeric-ness was only inferred.
                clean_col = col.str.strip_chars().cast(pl.Float64, strict=False)

            clean_col = clean_col.drop_nulls()

            if len(clean_col) > 0:
                stats["distribution"] = {
                    "min": float(clean_col.min()),
                    "max": float(clean_col.max()),
                    "mean": float(clean_col.mean()),
                    # std of a single value is undefined; report 0.
                    "std": float(clean_col.std()) if len(clean_col) > 1 else 0,
                    "p25": float(clean_col.quantile(0.25)),
                    "p50": float(clean_col.median()),
                    "p75": float(clean_col.quantile(0.75))
                }

        # Top categorical values.
        if dtype == pl.Utf8 or dtype == pl.Categorical:
            value_counts = col.value_counts(sort=True).head(5)
            # Handle different polars versions' return structure for
            # value_counts by going through generic .rows() tuples.
            try:
                rows = value_counts.rows()
                top_values = {}
                for row in rows:
                    val = str(row[0]) if row[0] is not None else "null"
                    count = int(row[1])
                    top_values[val] = count
                stats["top_values"] = top_values
            except Exception:
                # IDIOM FIX: was a bare 'except:'; best effort only — the
                # value_counts layout differs between polars versions.
                pass

    except Exception as e:
        stats["error"] = str(e)

    return stats
|
|
92
|
+
|
|
93
|
+
def main():
|
|
94
|
+
if len(sys.argv) < 2:
|
|
95
|
+
print(json.dumps({"error": "No file path provided"}))
|
|
96
|
+
sys.exit(1)
|
|
97
|
+
|
|
98
|
+
file_path = sys.argv[1]
|
|
99
|
+
|
|
100
|
+
try:
|
|
101
|
+
# Robust file reading with extension detection
|
|
102
|
+
file_path_lower = file_path.lower()
|
|
103
|
+
if file_path_lower.endswith(".csv"):
|
|
104
|
+
df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
|
|
105
|
+
elif file_path_lower.endswith(".tsv"):
|
|
106
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
|
|
107
|
+
elif file_path_lower.endswith(".txt"):
|
|
108
|
+
sep = ","
|
|
109
|
+
try:
|
|
110
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
111
|
+
first_line = fh.readline()
|
|
112
|
+
if "\t" in first_line:
|
|
113
|
+
sep = "\t"
|
|
114
|
+
except Exception:
|
|
115
|
+
sep = ","
|
|
116
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
|
|
117
|
+
elif file_path_lower.endswith(".parquet"):
|
|
118
|
+
try:
|
|
119
|
+
# Try scanning first (faster for large files)
|
|
120
|
+
df = pl.scan_parquet(file_path).limit(10000).collect()
|
|
121
|
+
except:
|
|
122
|
+
df = pl.read_parquet(file_path)
|
|
123
|
+
if len(df) > 10000: df = df.head(10000)
|
|
124
|
+
elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
|
|
125
|
+
# Explicit NDJSON
|
|
126
|
+
df = pl.scan_ndjson(file_path).limit(10000).collect()
|
|
127
|
+
elif file_path_lower.endswith(".json"):
|
|
128
|
+
# Ambiguous .json: Try standard JSON first, then NDJSON fallback
|
|
129
|
+
try:
|
|
130
|
+
# read_json reads standard JSON array [{}, {}]
|
|
131
|
+
df = pl.read_json(file_path)
|
|
132
|
+
if len(df) > 10000: df = df.head(10000)
|
|
133
|
+
except Exception:
|
|
134
|
+
try:
|
|
135
|
+
# Fallback to NDJSON (common for large datasets mislabeled as .json)
|
|
136
|
+
df = pl.scan_ndjson(file_path).limit(10000).collect()
|
|
137
|
+
except Exception as e:
|
|
138
|
+
print(json.dumps({"error": f"Failed to read JSON: {str(e)}"}))
|
|
139
|
+
sys.exit(1)
|
|
140
|
+
else:
|
|
141
|
+
print(json.dumps({"error": f"Unsupported file extension: {file_path}"}))
|
|
142
|
+
sys.exit(1)
|
|
143
|
+
|
|
144
|
+
row_count = len(df)
|
|
145
|
+
column_count = len(df.columns)
|
|
146
|
+
|
|
147
|
+
# Duplicate detection (exact)
|
|
148
|
+
# NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
|
|
149
|
+
# Use a Python fallback that is slower but robust for the 10k sampled rows.
|
|
150
|
+
duplicate_count = 0
|
|
151
|
+
try:
|
|
152
|
+
seen = set()
|
|
153
|
+
for row in df.to_dicts():
|
|
154
|
+
row_key = json.dumps(row, sort_keys=True, default=str)
|
|
155
|
+
if row_key in seen:
|
|
156
|
+
duplicate_count += 1
|
|
157
|
+
else:
|
|
158
|
+
seen.add(row_key)
|
|
159
|
+
except Exception:
|
|
160
|
+
duplicate_count = 0
|
|
161
|
+
|
|
162
|
+
columns_stats = []
|
|
163
|
+
text_cols = []
|
|
164
|
+
for col in df.columns:
|
|
165
|
+
stats = analyze_column(df, col, df.schema[col])
|
|
166
|
+
columns_stats.append(stats)
|
|
167
|
+
# Check for String type (Polars can return 'String' or 'Utf8' depending on version)
|
|
168
|
+
dtype_str = stats["type"]
|
|
169
|
+
if ("String" in dtype_str or "Utf8" in dtype_str) and stats["unique_count"] > 1:
|
|
170
|
+
text_cols.append(col)
|
|
171
|
+
|
|
172
|
+
report = {
|
|
173
|
+
"row_count": row_count,
|
|
174
|
+
"column_count": column_count,
|
|
175
|
+
"duplicate_rows": int(duplicate_count),
|
|
176
|
+
"duplicate_percentage": (duplicate_count / row_count * 100) if row_count > 0 else 0,
|
|
177
|
+
"columns": columns_stats,
|
|
178
|
+
"warnings": [],
|
|
179
|
+
"schema_warnings": [],
|
|
180
|
+
"overall_score": 100
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
# Integrity Check 1: Text Duplicates (Fuzzyish Proxy)
|
|
184
|
+
# If duplicated rows are 0, check if main text content is duplicated
|
|
185
|
+
if duplicate_count == 0 and len(text_cols) > 0:
|
|
186
|
+
# Pick longest text column as likely "content"
|
|
187
|
+
# In real impl, we'd use heuristics. For now, first text col.
|
|
188
|
+
target_col = text_cols[0]
|
|
189
|
+
try:
|
|
190
|
+
text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
|
|
191
|
+
if text_dupes > 0:
|
|
192
|
+
report["text_duplicates"] = int(text_dupes)
|
|
193
|
+
if text_dupes > (row_count * 0.2):
|
|
194
|
+
report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
|
|
195
|
+
except Exception:
|
|
196
|
+
# Skip text duplicate warning if backend cannot compute duplicates for this dtype
|
|
197
|
+
pass
|
|
198
|
+
|
|
199
|
+
# Integrity Check 2: Contamination / Leakage (Basic)
|
|
200
|
+
# (Skipping correlation for now)
|
|
201
|
+
|
|
202
|
+
report["class_imbalance_warnings"] = []
|
|
203
|
+
report["pii_warnings"] = []
|
|
204
|
+
|
|
205
|
+
# PII Patterns (Regex)
|
|
206
|
+
import re
|
|
207
|
+
pii_patterns = {
|
|
208
|
+
"Email": r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
|
|
209
|
+
"Phone": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', # Basic US-ish pattern
|
|
210
|
+
"SSN": r'\d{3}-\d{2}-\d{4}',
|
|
211
|
+
"IPv4": r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
# Bias & PII Analysis
|
|
215
|
+
for col_name, stats in zip(df.columns, columns_stats):
|
|
216
|
+
# Class Imbalance
|
|
217
|
+
if stats["unique_count"] > 1 and stats["unique_count"] < 50:
|
|
218
|
+
try:
|
|
219
|
+
col = df[col_name]
|
|
220
|
+
top_val_count = col.value_counts().sort("count", descending=True).row(0)[1]
|
|
221
|
+
total = len(col)
|
|
222
|
+
if total > 0:
|
|
223
|
+
ratio = top_val_count / total
|
|
224
|
+
if ratio > 0.9:
|
|
225
|
+
report["class_imbalance_warnings"].append(f"Severe imbalance in '{col_name}': Top class is {(ratio*100):.1f}% of data")
|
|
226
|
+
except:
|
|
227
|
+
pass
|
|
228
|
+
|
|
229
|
+
# PII Detection (on Text Columns only)
|
|
230
|
+
if ("String" in stats["type"] or "Utf8" in stats["type"]):
|
|
231
|
+
try:
|
|
232
|
+
# Sample for performance (check first 1000 non-null values)
|
|
233
|
+
sample_text = df[col_name].drop_nulls().head(1000).to_list()
|
|
234
|
+
# Join a subset to regex against (faster than row-by-row for simple checks)
|
|
235
|
+
combined_text = " ".join([str(x) for x in sample_text])
|
|
236
|
+
|
|
237
|
+
for pii_type, pattern in pii_patterns.items():
|
|
238
|
+
if re.search(pattern, combined_text):
|
|
239
|
+
# Ensure we don't flag column names like "email_address" but actual content
|
|
240
|
+
# Double check with a strict count if trigger found
|
|
241
|
+
matches = len(re.findall(pattern, combined_text))
|
|
242
|
+
if matches > 0:
|
|
243
|
+
report["pii_warnings"].append(f"Potential {pii_type} detected in column '{col_name}' ({matches} matches in sample)")
|
|
244
|
+
except:
|
|
245
|
+
pass
|
|
246
|
+
|
|
247
|
+
# Basic warnings
|
|
248
|
+
if report["duplicate_percentage"] > 10:
|
|
249
|
+
report["warnings"].append("High duplication rate (>10%)")
|
|
250
|
+
if row_count < 50:
|
|
251
|
+
report["warnings"].append("Dataset is very small (<50 rows)")
|
|
252
|
+
|
|
253
|
+
# Schema warnings
|
|
254
|
+
for col in columns_stats:
|
|
255
|
+
if "Numeric" in col.get("inferred_type", "") and "Utf8" in col.get("type", ""):
|
|
256
|
+
report["schema_warnings"].append(f"Column '{col['name']}' looks Numeric but is stored as String")
|
|
257
|
+
if col.get("is_mixed_type"):
|
|
258
|
+
report["schema_warnings"].append(f"Column '{col['name']}' likely contains mixed types (numbers and strings)")
|
|
259
|
+
|
|
260
|
+
print(json.dumps(report))
|
|
261
|
+
|
|
262
|
+
except Exception as e:
|
|
263
|
+
print(json.dumps({"error": f"Analysis failed: {str(e)}"}))
|
|
264
|
+
sys.exit(1)
|
|
265
|
+
|
|
266
|
+
# Script entry point: run the full dataset-quality analysis when executed directly.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import sys
import json
import os

# polars does all the heavy lifting below. Fail fast with a machine-readable
# JSON error (matching the {"ok": False, "error": ...} contract used by the
# rest of this script) so the calling process can surface a clear
# "dependency missing" message instead of a raw traceback.
try:
    import polars as pl
except Exception:
    print(json.dumps({"ok": False, "error": "polars is required"}))
    sys.exit(1)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def count_rows(path: str) -> int:
    """Return the number of data rows in *path*, picking a reader by extension.

    Lazy scans are used wherever polars supports them so large files are not
    materialised just to count rows; unknown extensions fall back to a
    best-effort eager CSV read.
    """
    suffix = os.path.splitext(path)[1].lower()

    if suffix == ".csv":
        # scan_csv is faster than a full read for large CSVs.
        lazy = pl.scan_csv(path, ignore_errors=True)
    elif suffix in (".parquet", ".pq"):
        lazy = pl.scan_parquet(path)
    elif suffix in (".feather", ".ftr", ".arrow", ".ipc"):
        lazy = pl.scan_ipc(path)
    elif suffix in (".jsonl", ".ndjson"):
        lazy = pl.scan_ndjson(path)
    elif suffix == ".json":
        # Plain JSON arrays need an eager read; if that fails, the file may
        # actually be newline-delimited, so retry with an NDJSON scan.
        try:
            return int(pl.read_json(path).height)
        except Exception:
            lazy = pl.scan_ndjson(path)
    else:
        # Unknown extension: best-effort eager CSV parse.
        return int(pl.read_csv(path, ignore_errors=True).height)

    return int(lazy.select(pl.len()).collect().item())
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def main():
    """CLI entry point: print a JSON row count for the file given as argv[1].

    Always emits a single JSON object on stdout ({"ok": True, "rows": n} on
    success, {"ok": False, "error": msg} otherwise) and exits non-zero on
    any failure.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
        sys.exit(1)

    path = sys.argv[1]
    if not os.path.exists(path):
        print(json.dumps({"ok": False, "error": f"File not found: {path}"}))
        sys.exit(1)

    try:
        print(json.dumps({"ok": True, "rows": count_rows(path)}))
    except Exception as exc:
        # Surface any reader failure as a structured error for the caller.
        print(json.dumps({"ok": False, "error": str(exc)}))
        sys.exit(1)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Script entry point: count rows of the file passed on the command line.
if __name__ == "__main__":
    main()
|