vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
import cv2
|
|
5
|
-
import numpy as np
|
|
6
|
-
|
|
7
|
-
# Audio analysis depends on librosa/soundfile. Fallback if not available.
|
|
8
|
-
try:
|
|
9
|
-
import librosa
|
|
10
|
-
AUDIO_SUPPORT = True
|
|
11
|
-
except ImportError:
|
|
12
|
-
AUDIO_SUPPORT = False
|
|
13
|
-
|
|
14
|
-
def analyze_audio(path):
    """Summarise a single audio file as a JSON-serialisable dict.

    Args:
        path: Location of the audio file handed to ``librosa.load``.

    Returns:
        On success a dict with ``status`` set to ``"ok"``, ``type`` set to
        ``"audio"``, the file's base name, its sample rate, its duration as
        reported by ``librosa.get_duration``, the mean RMS level and an
        ``is_silent`` flag (mean RMS below 0.001).  When librosa is missing
        or anything raises while decoding, a dict with ``status`` set to
        ``"error"`` and the message under ``error``.
    """
    if not AUDIO_SUPPORT:
        return {"status": "error", "error": "librosa not installed"}

    try:
        # sr=None is passed so the sample rate comes from the file rather
        # than from librosa's resampling default.
        samples, sample_rate = librosa.load(path, sr=None)
        length = librosa.get_duration(y=samples, sr=sample_rate)

        # Average the RMS feature into a single loudness figure.
        mean_rms = float(np.mean(librosa.feature.rms(y=samples)))

        summary = {
            "status": "ok",
            "type": "audio",
            "filename": os.path.basename(path),
            "sample_rate": int(sample_rate),
            "duration": float(length),
            "avg_volume_rms": mean_rms,
            "is_silent": mean_rms < 0.001,
        }
        return summary
    except Exception as e:
        # Best-effort analysis: report the failure instead of raising.
        return {"status": "error", "error": str(e)}
|
|
38
|
-
|
|
39
|
-
def analyze_video(path):
    """Summarise a single video file as a JSON-serialisable dict.

    The capture handle is always released, including when the file cannot
    be opened or when probing raises part-way through.

    Args:
        path: Location of the video file handed to ``cv2.VideoCapture``.

    Returns:
        On success a dict with ``status`` set to ``"ok"``, ``type`` set to
        ``"video"``, the file's base name, frame width/height, fps, duration
        (``frame_count / fps``, or 0 when fps is not positive), frame count
        and a ``corruption_risk`` of ``"high"`` if any probed frame could
        not be read, else ``"low"``.  On failure a dict with ``status`` set
        to ``"error"`` and the message under ``error``.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(path)
        if not cap.isOpened():
            return {"status": "error", "error": "Could not open video file"}

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps > 0 else 0

        # Integrity probe: try to decode the first, middle and last frame.
        # A set collapses duplicates for clips of one or two frames so the
        # same frame is not decoded repeatedly.
        if frame_count > 0:
            probe_indices = sorted({0, frame_count // 2, frame_count - 1})
        else:
            probe_indices = []

        failed_frames = 0
        for idx in probe_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret or frame is None:
                failed_frames += 1

        return {
            "status": "ok",
            "type": "video",
            "filename": os.path.basename(path),
            "width": width,
            "height": height,
            "fps": float(fps),
            "duration": float(duration),
            "frame_count": frame_count,
            "corruption_risk": "high" if failed_frames > 0 else "low",
        }
    except Exception as e:
        # Best-effort analysis: report the failure instead of raising.
        return {"status": "error", "error": str(e)}
    finally:
        # Release on every exit path so the handle is never leaked.
        if cap is not None:
            cap.release()
|
|
76
|
-
|
|
77
|
-
def main():
    """Analyse one media file or a directory of media files and print a JSON report.

    Reads the input path from ``sys.argv[1]``.  A single file is dispatched
    by extension to ``analyze_audio`` or ``analyze_video``; an unsupported
    extension is recorded as a failed result.  For a directory, entries are
    visited in sorted name order, entries without a supported extension are
    skipped, and at most 50 media files are analysed.

    The printed report contains ``total_files``, ``ok_files``,
    ``failed_files`` and ``details``, plus ``avg_audio_duration``,
    ``avg_video_duration`` and ``avg_fps`` when matching successful results
    exist.  Prints an ``{"error": ...}`` object and exits with status 1 when
    no path is given or the path is neither a file nor a directory.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No path provided"}))
        sys.exit(1)

    input_path = sys.argv[1]
    results = []

    # Supported extensions
    AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
    VIDEO_EXTS = (".mp4", ".avi", ".mkv", ".mov", ".wmv")
    MAX_FILES = 50  # Limit for demo

    def pick_analyzer(file_path):
        """Return (extension, analyzer) where analyzer is None if unsupported."""
        ext = os.path.splitext(file_path.lower())[1]
        if ext in AUDIO_EXTS:
            return ext, analyze_audio
        if ext in VIDEO_EXTS:
            return ext, analyze_video
        return ext, None

    if os.path.isfile(input_path):
        ext, analyzer = pick_analyzer(input_path)
        if analyzer is None:
            results.append({"status": "error", "error": f"Unsupported file type: {ext}"})
        else:
            results.append(analyzer(input_path))
    elif os.path.isdir(input_path):
        # Sort for a reproducible selection, and apply the cap to media
        # files only so unrelated entries cannot crowd them out.
        for entry in sorted(os.listdir(input_path)):
            if len(results) >= MAX_FILES:
                break
            file_path = os.path.join(input_path, entry)
            _, analyzer = pick_analyzer(file_path)
            if analyzer is not None:
                results.append(analyzer(file_path))
    else:
        print(json.dumps({"error": "Invalid path"}))
        sys.exit(1)

    # Only successful results feed the aggregate figures.
    ok_results = [r for r in results if r.get("status") == "ok"]

    report = {
        "total_files": len(results),
        "ok_files": len(ok_results),
        "failed_files": len(results) - len(ok_results),
        "details": results,
    }

    # Averages are added only for media kinds that produced a result.
    audio_files = [r for r in ok_results if r["type"] == "audio"]
    video_files = [r for r in ok_results if r["type"] == "video"]

    if audio_files:
        report["avg_audio_duration"] = float(np.mean([r["duration"] for r in audio_files]))
    if video_files:
        report["avg_video_duration"] = float(np.mean([r["duration"] for r in video_files]))
        report["avg_fps"] = float(np.mean([r["fps"] for r in video_files]))

    print(json.dumps(report))
|
|
131
|
-
|
|
132
|
-
if __name__ == "__main__":
|
|
133
|
-
main()
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import argparse
|
|
4
|
-
import urllib.request
|
|
5
|
-
import urllib.parse
|
|
6
|
-
from datetime import datetime
|
|
7
|
-
|
|
8
|
-
# NASA Data Portal uses Socrata
|
|
9
|
-
NASA_API_URL = "https://api.us.socrata.com/api/catalog/v1"
|
|
10
|
-
NASA_DOMAIN = "data.nasa.gov"
|
|
11
|
-
|
|
12
|
-
def search_nasa(query: str, limit: int = 10):
    """Search the NASA data portal through the Socrata catalog endpoint.

    Args:
        query: Free-text search string sent as the ``q`` parameter.
        limit: Maximum number of catalog entries to request.

    Returns:
        A list of metadata dicts (one per catalog result) on success, or a
        dict of the form ``{"error": "<message>"}`` if the request or the
        parsing fails.  Callers must handle both shapes.
    """
    # Local import: the module header only imports the `datetime` class.
    from datetime import timezone

    try:
        params = {
            "q": query,
            "limit": limit,
            "domains": NASA_DOMAIN,
            "search_context": NASA_DOMAIN,
        }
        url = f"{NASA_API_URL}?{urllib.parse.urlencode(params)}"

        req = urllib.request.Request(url)
        # A timeout keeps a stalled connection from hanging the caller.
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.load(response)

        # Fallback timestamp for entries without `updatedAt`; computed once
        # so every entry in one response shares the same value.
        now_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        results = []
        # Catalog entries are read from the 'results' key of the response.
        for item in data.get('results') or []:
            ds = item.get('resource') or {}

            # `or 0` also covers keys that are present but null, which would
            # otherwise raise on the integer division below.
            view_count = ds.get('view_count') or 0

            metadata = {
                "id": f"nasa:{ds.get('id')}",
                "source": "nasa",
                "name": ds.get('name'),
                "description": ds.get('description') or "No description available.",
                "downloads": ds.get('download_count') or 0,
                "likes": view_count // 10,
                "last_updated": ds.get('updatedAt') or now_stamp,
                "quality_score": 90,
                "license": {
                    "id": "public_domain",
                    "name": "Public Domain",
                    "category": "safe",
                    "usage_restrictions": [],
                    "warnings": [],
                },
                "tags": ds.get('tags') or [],
                "total_examples": 0,
                "is_safe_source": True,
                "is_structured": True,
                "metadata_url": f"https://data.nasa.gov/d/{ds.get('id')}",
                "domain": "science",
            }

            results.append(metadata)

        return results

    except Exception as e:
        # Errors are reported as data, not raised.
        return {"error": str(e)}
|
|
68
|
-
|
|
69
|
-
def main():
    """Command-line entry point: parse the arguments and print search results as JSON."""
    cli = argparse.ArgumentParser(description="NASA Adapter")
    cli.add_argument("--action", required=True, choices=["search"])
    cli.add_argument("--query", required=True)
    cli.add_argument("--limit", type=int, default=10)
    opts = cli.parse_args()

    # "search" is the only action argparse accepts; anything else is a no-op.
    if opts.action != "search":
        return

    print(json.dumps(search_nasa(opts.query, opts.limit)))
|
|
80
|
-
|
|
81
|
-
if __name__ == "__main__":
|
|
82
|
-
main()
|
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Normalize any supported dataset file to parquet format.
|
|
3
|
-
Usage: normalize_engine.py <input_path> <output_path>
|
|
4
|
-
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
|
|
5
|
-
"""
|
|
6
|
-
import sys
|
|
7
|
-
import json
|
|
8
|
-
import os
|
|
9
|
-
|
|
10
|
-
try:
|
|
11
|
-
import polars as pl
|
|
12
|
-
except Exception:
|
|
13
|
-
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
14
|
-
sys.exit(1)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def _load(src: str) -> pl.DataFrame:
    """Read a dataset file into a polars DataFrame, choosing a reader by extension.

    Handled extensions: CSV, TSV/TAB, Parquet, Feather/Arrow/IPC,
    JSONL/NDJSON and JSON.  Any other extension (including ``.txt``) is
    parsed as CSV.

    For ``.json`` the content decides the reader:
      * a top-level array goes to ``pl.read_json``;
      * text that is not one JSON document (several documents, one per
        line) goes to ``pl.read_ndjson``;
      * a top-level object is searched for a list under a well-known key
        (``data``, ``rows``, ...), then for the first non-empty list of
        dicts; otherwise the file goes to ``pl.read_json``.

    Args:
        src: Path of the file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Whatever the selected polars reader or ``open`` raises.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Context manager so the handle is closed even if reading fails.
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        # Parse as one document first.  Looking only at the first line would
        # mistake a pretty-printed object (first line "{") for NDJSON.
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            # Not a single document: treat as newline-delimited JSON.
            return pl.read_ndjson(src)
        if isinstance(obj, dict):
            # Wrapper object: look for the records under a conventional key.
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort - take the first non-empty list of dicts.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)

    # Fallback (also covers .txt): try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def normalize(input_path: str, output_path: str):
    """Load ``input_path`` and write it to ``output_path`` as parquet.

    Args:
        input_path: Source dataset file, read via ``_load``.
        output_path: Destination parquet file.  Its parent directory is
            created if missing.

    Returns:
        The number of rows written (``df.height``).
    """
    df = _load(input_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises,
    # so only create the parent when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def main():
    """Command-line entry point: convert ``argv[1]`` to parquet at ``argv[2]``.

    Prints one JSON object: ``{"ok": true, "output_path": ..., "rows": N}``
    on success, or ``{"ok": false, "error": ...}`` followed by exit status 1
    on a usage error, a missing input file, or a conversion failure.
    """

    def fail(message):
        # Emit the error payload and stop with a non-zero status.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    argv = sys.argv
    if len(argv) < 3:
        fail("Usage: normalize_engine.py <input> <output>")

    input_path, output_path = argv[1], argv[2]

    if not os.path.exists(input_path):
        fail(f"File not found: {input_path}")

    try:
        row_total = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": row_total}))
    except Exception as e:
        fail(str(e))
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
if __name__ == "__main__":
|
|
83
|
-
main()
|
|
@@ -1,146 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import argparse
|
|
4
|
-
import tempfile
|
|
5
|
-
import os
|
|
6
|
-
from typing import Dict, Any, List
|
|
7
|
-
|
|
8
|
-
try:
|
|
9
|
-
import openml
|
|
10
|
-
except ImportError:
|
|
11
|
-
openml = None
|
|
12
|
-
|
|
13
|
-
def _ensure_openml() -> Dict[str, Any]:
    """Report whether the optional ``openml`` package was importable.

    Returns:
        ``{"ok": True}`` when the module-level ``openml`` is set, otherwise
        an ``{"ok": False, "error": ...}`` dict with an install hint.
    """
    if openml is not None:
        return {"ok": True}
    return {"ok": False, "error": "openml package is not installed. Run 'pip install openml'"}
|
|
17
|
-
|
|
18
|
-
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
|
|
19
|
-
# OpenML dataset dict from list_datasets
|
|
20
|
-
did = ds.get("did", "")
|
|
21
|
-
name = ds.get("name", f"dataset_{did}")
|
|
22
|
-
version = ds.get("version", "1")
|
|
23
|
-
status = ds.get("status", "active")
|
|
24
|
-
format = ds.get("format", "unknown")
|
|
25
|
-
|
|
26
|
-
# Map to Vesper DatasetMetadata format
|
|
27
|
-
return {
|
|
28
|
-
"id": f"openml:{did}",
|
|
29
|
-
"name": name,
|
|
30
|
-
"source": "openml",
|
|
31
|
-
"description": f"OpenML Dataset {name} (ID: {did}, Version: {version}, Format: {format}, Status: {status})",
|
|
32
|
-
"author": "OpenML Community",
|
|
33
|
-
"license": "Public",
|
|
34
|
-
"tags": ["openml", format.lower()],
|
|
35
|
-
"downloads": ds.get("NumberOfDownloads", 0),
|
|
36
|
-
"likes": ds.get("NumberOfLikes", 0),
|
|
37
|
-
"created_at": ds.get("upload_date", ""),
|
|
38
|
-
"updated_at": ds.get("upload_date", ""),
|
|
39
|
-
"size_bytes": 0, # Not always available in list
|
|
40
|
-
"quality_score": 0.8, # Default good score for OpenML
|
|
41
|
-
"domain": "machine_learning",
|
|
42
|
-
"is_gated": False,
|
|
43
|
-
"is_nsfw": False,
|
|
44
|
-
"description_length": 100,
|
|
45
|
-
"has_readme": False,
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """List OpenML datasets whose name contains ``query``.

    The full dataset listing is fetched as a DataFrame and filtered locally
    with a case-insensitive, literal substring match on the ``name`` column.
    Matches are sorted by ``NumberOfDownloads`` (descending) when that
    column exists, and the first ``limit`` rows are returned.

    Args:
        query: Substring to look for; an empty query keeps every dataset.
        limit: Maximum number of results to return.

    Returns:
        ``{"ok": True, "results": [...], "count": N}`` on success, or an
        ``{"ok": False, "error": ...}`` dict when openml is missing or the
        listing/filtering raises.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # The listing is fetched whole and searched locally.
        datasets = openml.datasets.list_datasets(output_format='dataframe')

        if query:
            # regex=False: the query is user text, not a pattern, so inputs
            # such as "c++" or "(" must match literally rather than raise.
            mask = datasets['name'].str.contains(query, case=False, na=False, regex=False)
            filtered = datasets[mask]
        else:
            filtered = datasets

        # Sort by NumberOfDownloads if available, else keep listing order.
        if 'NumberOfDownloads' in filtered.columns:
            filtered = filtered.sort_values('NumberOfDownloads', ascending=False)

        top_k = filtered.head(limit)

        # Convert each row to the engine's metadata dict.
        records = top_k.to_dict(orient='records')
        items = [_dataset_to_dict(r) for r in records]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"OpenML discover failed: {str(e)}"}
|
|
82
|
-
|
|
83
|
-
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download one OpenML dataset and store it as a parquet file.

    Args:
        dataset_ref: Either ``"openml:<id>"`` or the bare numeric id.
        target_dir: Directory to write into.  When falsy, a fresh temporary
            directory prefixed ``vesper_openml_`` is created instead.

    Returns:
        ``{"ok": True, "local_path": <file>, "target_dir": <dir>}`` on
        success, or an ``{"ok": False, "error": ...}`` dict when openml is
        missing, the id is not an integer, or the download/write raises.
    """
    check = _ensure_openml()
    if not check.get("ok"):
        return check

    try:
        # dataset_ref is expected to be "openml:ID"; a bare id is accepted too.
        if dataset_ref.startswith("openml:"):
            did_str = dataset_ref.split(":", 1)[1]
        else:
            did_str = dataset_ref

        did = int(did_str)

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_openml_")

        os.makedirs(target_dir, exist_ok=True)

        # Get the dataset
        dataset = openml.datasets.get_dataset(did, download_data=True, download_qualities=False, download_features_meta_data=False)

        # target=None is passed so no column is split off as a target; the
        # data is loaded once (an earlier discarded second load is gone).
        df, _, _, _ = dataset.get_data(target=None, dataset_format="dataframe")

        # Build a filesystem-safe name: anything non-alphanumeric becomes "_".
        safe_name = "".join([c if c.isalnum() else "_" for c in dataset.name])
        file_path = os.path.join(target_dir, f"{safe_name}_{did}.parquet")

        df.to_parquet(file_path, index=False)

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir,
        }
    except Exception as e:
        return {"ok": False, "error": f"OpenML download failed: {str(e)}"}
|
|
128
|
-
|
|
129
|
-
def main():
    """Command-line entry point for the OpenML engine.

    ``discover <query> [limit]`` prints the discovery result as JSON (limit
    defaults to 20); ``download <dataset-id> [target-dir]`` prints the
    download result as JSON.
    """
    cli = argparse.ArgumentParser(description="Vesper OpenML Engine")
    cli.add_argument("action", choices=["discover", "download"])
    cli.add_argument("arg1", help="Query for discover, Dataset ID for download")
    cli.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")
    opts = cli.parse_args()

    if opts.action == "discover":
        # The optional second positional is the result limit here.
        max_results = int(opts.arg2) if opts.arg2 else 20
        outcome = discover(opts.arg1, max_results)
    elif opts.action == "download":
        # The optional second positional is the target directory here.
        outcome = download(opts.arg1, opts.arg2)
    else:
        # argparse `choices` rejects any other action before this point.
        return

    print(json.dumps(outcome))
|
|
144
|
-
|
|
145
|
-
if __name__ == "__main__":
|
|
146
|
-
main()
|