vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import argparse
|
|
4
|
-
import tempfile
|
|
5
|
-
import os
|
|
6
|
-
import urllib.request
|
|
7
|
-
import urllib.error
|
|
8
|
-
import urllib.parse
|
|
9
|
-
from typing import Dict, Any, List
|
|
10
|
-
|
|
11
|
-
def _get_token() -> str:
|
|
12
|
-
token = os.environ.get("DW_AUTH_TOKEN")
|
|
13
|
-
if not token:
|
|
14
|
-
raise ValueError("DW_AUTH_TOKEN environment variable is required for data.world")
|
|
15
|
-
return token
|
|
16
|
-
|
|
17
|
-
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
|
|
18
|
-
owner_field = ds.get("owner", "")
|
|
19
|
-
if isinstance(owner_field, dict):
|
|
20
|
-
owner = owner_field.get("id") or owner_field.get("name") or ""
|
|
21
|
-
else:
|
|
22
|
-
owner = owner_field or ""
|
|
23
|
-
|
|
24
|
-
id_str = ds.get("id", "")
|
|
25
|
-
title = ds.get("title", "")
|
|
26
|
-
|
|
27
|
-
if (not owner or not id_str) and isinstance(ds.get("resourceLink"), str):
|
|
28
|
-
# Expected format includes /<owner>/<dataset-id>
|
|
29
|
-
parts = ds["resourceLink"].strip("/").split("/")
|
|
30
|
-
if len(parts) >= 2:
|
|
31
|
-
owner = owner or parts[-2]
|
|
32
|
-
id_str = id_str or parts[-1]
|
|
33
|
-
|
|
34
|
-
if isinstance(id_str, str) and "/" in id_str and not owner:
|
|
35
|
-
split_ref = id_str.split("/", 1)
|
|
36
|
-
owner = split_ref[0]
|
|
37
|
-
id_str = split_ref[1]
|
|
38
|
-
|
|
39
|
-
if not owner and not id_str:
|
|
40
|
-
owner = "unknown"
|
|
41
|
-
id_str = "unknown"
|
|
42
|
-
|
|
43
|
-
if not title:
|
|
44
|
-
title = f"{owner}/{id_str}"
|
|
45
|
-
|
|
46
|
-
return {
|
|
47
|
-
"id": f"dataworld:{owner}/{id_str}",
|
|
48
|
-
"name": title,
|
|
49
|
-
"source": "dataworld",
|
|
50
|
-
"description": ds.get("description", f"data.world dataset {title}"),
|
|
51
|
-
"author": owner,
|
|
52
|
-
"license": {
|
|
53
|
-
"id": "Unknown",
|
|
54
|
-
"category": "unknown",
|
|
55
|
-
"commercial_use": None,
|
|
56
|
-
"warnings": []
|
|
57
|
-
},
|
|
58
|
-
"tags": ds.get("tags", []) + ["dataworld"],
|
|
59
|
-
"downloads": 0,
|
|
60
|
-
"likes": 0,
|
|
61
|
-
"created_at": ds.get("created", ""),
|
|
62
|
-
"updated_at": ds.get("updated", ""),
|
|
63
|
-
"size_bytes": 0,
|
|
64
|
-
"quality_score": 0.8,
|
|
65
|
-
"domain": "general",
|
|
66
|
-
"is_gated": False,
|
|
67
|
-
"is_nsfw": False,
|
|
68
|
-
"description_length": len(ds.get("description", "")),
|
|
69
|
-
"has_readme": False,
|
|
70
|
-
"download_url": f"https://data.world/{owner}/{id_str}",
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search data.world for datasets matching *query*.

    Returns {"ok": True, "results": [...], "count": N} on success, or
    {"ok": False, "error": "..."} on any failure (including missing token).
    """
    try:
        headers = {
            "Authorization": f"Bearer {_get_token()}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

        def _post_search(endpoint: str, payload: Dict[str, Any]) -> List[Dict[str, Any]]:
            request = urllib.request.Request(
                endpoint,
                data=json.dumps(payload).encode("utf-8"),
                headers=headers,
                method="POST",
            )
            with urllib.request.urlopen(request) as response:
                parsed = json.loads(response.read().decode("utf-8"))
            return parsed.get("records", [])

        # Primary: simple search, including community results to improve recall.
        records = _post_search(
            f"https://api.data.world/v0/search/resources?size={limit}",
            {"query": query, "category": ["dataset"], "includeCommunityResults": True},
        )

        # Fallback to the advanced endpoint if simple search returns nothing.
        if not records:
            records = _post_search(
                f"https://api.data.world/v0/search?size={limit}",
                {"query": query, "category": ["dataset"]},
            )

        items = [_dataset_to_dict(record) for record in records]
        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"data.world discover failed: {str(e)}"}
|
|
122
|
-
|
|
123
|
-
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download the most useful file of a data.world dataset into *target_dir*.

    *dataset_ref* may be "dataworld:owner/id" or plain "owner/id".  Prefers
    parquet, then csv, jsonl, json; falls back to the first listed file.
    When *target_dir* is falsy a fresh temp directory is created.
    """
    try:
        token = _get_token()

        # Strip the optional "dataworld:" prefix.
        if dataset_ref.startswith("dataworld:"):
            ref = dataset_ref.split(":", 1)[1]
        else:
            ref = dataset_ref

        segments = ref.split("/")
        if len(segments) != 2:
            return {"ok": False, "error": f"Invalid data.world dataset ID format. Expected owner/id, got {ref}"}
        owner, dataset_id = segments

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_dataworld_")
        os.makedirs(target_dir, exist_ok=True)

        headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/json",
        }

        # First, fetch the dataset metadata to enumerate its files.
        meta_url = f"https://api.data.world/v0/datasets/{owner}/{dataset_id}"
        meta_req = urllib.request.Request(meta_url, headers=headers)
        with urllib.request.urlopen(meta_req) as response:
            dataset_meta = json.loads(response.read().decode("utf-8"))

        files = dataset_meta.get("files", [])
        if not files:
            return {"ok": False, "error": "No files found in this dataset"}

        # Pick the first file matching a preferred extension (in priority
        # order), otherwise just take the first file listed.
        best_file = next(
            (
                entry
                for ext in (".parquet", ".csv", ".jsonl", ".json")
                for entry in files
                if entry.get("name", "").lower().endswith(ext)
            ),
            files[0],
        )

        filename = best_file.get("name")
        # NOTE(review): *filename* comes from the remote API and is joined
        # into target_dir unchanged — presumably it never contains path
        # separators, but worth confirming.
        file_path = os.path.join(target_dir, filename)

        download_url = f"https://api.data.world/v0/file_download/{owner}/{dataset_id}/{urllib.parse.quote(filename)}"
        dl_req = urllib.request.Request(download_url, headers=headers)
        with urllib.request.urlopen(dl_req) as response, open(file_path, "wb") as out_file:
            out_file.write(response.read())

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir,
        }
    except Exception as e:
        return {"ok": False, "error": f"data.world download failed: {str(e)}"}
|
|
190
|
-
|
|
191
|
-
def main():
    """CLI entry point: parse args, run the requested action, print JSON.

    Usage:
        dataworld_engine.py discover <query> [limit]
        dataworld_engine.py download <dataset_ref> [target_dir]
    """
    parser = argparse.ArgumentParser(description="Vesper data.world Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")

    args = parser.parse_args()

    if args.action == "discover":
        # Fix: a non-numeric limit used to raise an uncaught ValueError and
        # crash with a traceback instead of honoring the JSON error contract
        # used everywhere else in this tool.
        try:
            limit = int(args.arg2) if args.arg2 else 20
        except ValueError:
            print(json.dumps({"ok": False, "error": f"Invalid limit value: {args.arg2}"}))
            return
        result = discover(args.arg1, limit)
        print(json.dumps(result))
    elif args.action == "download":
        result = download(args.arg1, args.arg2)
        print(json.dumps(result))

if __name__ == "__main__":
    main()
|
|
@@ -1,288 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import polars as pl
|
|
4
|
-
import os
|
|
5
|
-
import time
|
|
6
|
-
|
|
7
|
-
# Optional imports for extra formats
|
|
8
|
-
try:
|
|
9
|
-
import pyarrow as pa
|
|
10
|
-
import pyarrow.feather as pf
|
|
11
|
-
HAS_PYARROW = True
|
|
12
|
-
except ImportError:
|
|
13
|
-
HAS_PYARROW = False
|
|
14
|
-
|
|
15
|
-
try:
|
|
16
|
-
import tensorflow as tf
|
|
17
|
-
HAS_TENSORFLOW = True
|
|
18
|
-
except ImportError:
|
|
19
|
-
HAS_TENSORFLOW = False
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
# ---------------------------------------------------------------------------
|
|
23
|
-
# Helpers
|
|
24
|
-
# ---------------------------------------------------------------------------
|
|
25
|
-
|
|
26
|
-
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Load any supported input format into a Polars DataFrame.

    Supported: csv/tsv/txt (delimiter sniffed from the first line), parquet,
    feather/arrow/ipc, jsonl, json (array, NDJSON, or common wrapper objects),
    and xlsx.

    options:
        sample_rows (int | None): randomly sample this many rows.
        columns (list[str] | None): keep only these columns (unknown names ignored).
        random_seed (int): seed used for sampling, default 42.

    Raises:
        ValueError: unsupported extension, unparsable JSON/Excel, or a file
            that loads to zero rows.
    """
    sample_rows = options.get("sample_rows")  # int | None
    columns = options.get("columns")  # list[str] | None

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pl.read_csv(file_path, ignore_errors=True)
    elif ext == ".tsv":
        df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
    elif ext == ".txt":
        # Heuristic delimiter detection for plain text tabular files.
        sep = ","
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                if "\t" in fh.readline():
                    sep = "\t"
        except Exception:
            sep = ","
        df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
    elif ext in (".parquet", ".pq"):
        df = pl.read_parquet(file_path)
    elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
        df = pl.read_ipc(file_path)
    elif ext == ".jsonl":
        df = pl.read_ndjson(file_path)
    elif ext == ".json":
        # Auto-detect: array-of-objects vs NDJSON vs nested wrapper objects.
        try:
            import json as _json
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                raw_text = fh.read(512)  # peek at the start only
            stripped = raw_text.lstrip()
            if stripped.startswith("["):
                # Array of objects — standard JSON.
                with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                    data = _json.load(fh)
                if isinstance(data, list) and len(data) > 0:
                    df = pl.DataFrame(data)
                else:
                    raise ValueError("JSON file is empty or not an array of objects")
            elif stripped.startswith("{"):
                # Could be NDJSON or a single object wrapping the rows.
                try:
                    df = pl.read_ndjson(file_path)
                except Exception:
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                        data = _json.load(fh)
                    # Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
                    rows = None
                    if isinstance(data, dict):
                        for key in ("data", "rows", "records", "items", "results", "entries"):
                            if key in data and isinstance(data[key], list):
                                rows = data[key]
                                break
                    if rows is None:
                        # Last resort: treat the dict itself as a single row.
                        rows = [data]
                    if rows and len(rows) > 0:
                        df = pl.DataFrame(rows)
                    else:
                        raise ValueError("Could not parse JSON structure into tabular data")
            else:
                raise ValueError("JSON file does not start with [ or {")
        except pl.exceptions.ComputeError as ce:
            raise ValueError(f"Failed to parse JSON: {ce}")
    elif ext == ".xlsx":
        try:
            df = pl.read_excel(file_path)
        except Exception as e:
            raise ValueError(f"Failed to read Excel file: {e}")
    else:
        raise ValueError(f"Unsupported input format: {ext}")

    if len(df) == 0:
        # Fix: the old message said "empty CSV" even when the input was
        # parquet, json, xlsx, etc. — misleading in error reports.
        raise ValueError(f"input file has no rows: {file_path}")

    # Column selection (before sampling, so sampling touches fewer columns).
    if columns:
        valid = [c for c in columns if c in df.columns]
        if valid:
            df = df.select(valid)

    # Optional deterministic sampling.
    if sample_rows and sample_rows < len(df):
        seed = options.get("random_seed", 42)
        df = df.sample(n=sample_rows, seed=seed)

    return df
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return *df* with non-primitive columns stringified so CSV export works.

    Numeric, temporal, string and boolean columns pass through unchanged;
    anything else (lists, structs, ...) is JSON-encoded, falling back to
    str() when serialization fails.
    """
    def _serialize(val):
        try:
            if hasattr(val, "to_list"):
                return json.dumps(val.to_list())
            if hasattr(val, "to_dict"):
                return json.dumps(val.to_dict())
            return json.dumps(val)
        except Exception:
            return str(val)

    for name in df.columns:
        dtype = df.schema[name]
        is_primitive = (
            dtype.is_numeric()
            or dtype.is_temporal()
            or str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
        )
        if not is_primitive:
            df = df.with_columns(
                pl.col(name).map_elements(_serialize, return_dtype=pl.Utf8)
            )
    return df
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Write a small CSV preview (up to *n* rows) next to the export; return its path."""
    preview_path = os.path.splitext(output_path)[0] + "_preview.csv"
    head = df.head(min(n, len(df)))
    _safe_csv_df(head).write_csv(preview_path)
    return preview_path
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
# ---------------------------------------------------------------------------
|
|
152
|
-
# Main export function
|
|
153
|
-
# ---------------------------------------------------------------------------
|
|
154
|
-
|
|
155
|
-
def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
    """Export *file_path* to *output_path* in the given *format*.

    format: one of "feather", "parquet", "csv", "jsonl", "arrow"/"ipc",
        "tfrecord".
    options:
        compression: codec name; default depends on format ("lz4" feather,
            "snappy" parquet, "uncompressed" arrow/ipc).
        preview (bool): also write a small CSV preview next to the output.
            NOTE(review): no preview is produced for csv or tfrecord exports —
            presumably intentional for csv (it is already CSV); confirm for
            tfrecord.
        plus loader options understood by _load (sample_rows, columns,
            random_seed).

    Returns: on success {"success": True, "output_path", "rows", "columns",
    "format", "compression", "file_size_mb", "elapsed_seconds"
    [, "preview_path"]}; on any failure {"error": "..."} — never raises.
    """
    options = options or {}
    t0 = time.perf_counter()

    # ---- Load ----
    try:
        df = _load(file_path, options)
    except Exception as e:
        return {"error": f"Failed to load input file: {str(e)}"}

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    preview_path = None
    generate_preview = options.get("preview", False)

    try:
        # ---- Feather (Arrow IPC) – fastest binary format ----
        if format == "feather":
            if not HAS_PYARROW:
                return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
            compression = options.get("compression", "lz4")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            # Polars write_ipc uses Arrow IPC (= Feather v2) under the hood
            arrow_table = df.to_arrow()
            pf.write_feather(arrow_table, output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Parquet – best compression, big-data friendly ----
        elif format == "parquet":
            compression = options.get("compression", "snappy")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            df.write_parquet(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- CSV – human-readable fallback ----
        elif format == "csv":
            # Nested columns are JSON-stringified first so write_csv succeeds.
            df = _safe_csv_df(df)
            df.write_csv(output_path)

        # ---- JSONL ----
        elif format == "jsonl":
            df.write_ndjson(output_path)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Arrow IPC (legacy name kept for compat) ----
        elif format in ("arrow", "ipc"):
            compression = options.get("compression", "uncompressed")
            if compression == "uncompressed":
                # write_ipc expects None (not the string) for no compression.
                compression = None
            df.write_ipc(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- TFRecord ----
        elif format == "tfrecord":
            if not HAS_TENSORFLOW:
                return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
            with tf.io.TFRecordWriter(output_path) as writer:
                # Row-wise iteration via pandas; per-row feature dicts are
                # built by Python type, so mixed columns become bytes.
                pdf = df.to_pandas()
                for _, row in pdf.iterrows():
                    feature = {}
                    for col, value in row.items():
                        if value is None:
                            # Null cells are simply omitted from the Example.
                            continue
                        if isinstance(value, int):
                            feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
                        elif isinstance(value, float):
                            feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
                        elif isinstance(value, str):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
                        elif isinstance(value, bytes):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
                        else:
                            # Fallback: stringify anything else (lists, timestamps, numpy scalars, ...).
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))
                    example = tf.train.Example(features=tf.train.Features(feature=feature))
                    writer.write(example.SerializeToString())

        else:
            return {"error": f"Unknown export format: {format}"}

        elapsed = round(time.perf_counter() - t0, 3)
        file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)

        result = {
            "success": True,
            "output_path": output_path,
            "rows": len(df),
            "columns": len(df.columns),
            "format": format,
            # Reports the *requested* compression, not the resolved codec.
            "compression": options.get("compression", "default"),
            "file_size_mb": file_size_mb,
            "elapsed_seconds": elapsed,
        }
        if preview_path:
            result["preview_path"] = preview_path

        return result

    except Exception as e:
        return {"error": f"Export failed: {str(e)}"}
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
def main():
    """CLI: export_engine.py <input_file> <output_file> <format> [options_json]."""
    argv = sys.argv
    if len(argv) < 4:
        print(
            json.dumps({"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}),
            file=sys.stderr,
        )
        sys.exit(1)

    input_file, output_file, fmt = argv[1], argv[2], argv[3]

    options = {}
    if len(argv) > 4:
        try:
            options = json.loads(argv[4])
        except Exception:
            pass  # malformed options JSON is ignored; defaults apply

    print(json.dumps(export_data(input_file, output_file, fmt, options)))

if __name__ == "__main__":
    main()
|
|
@@ -1,100 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import json
|
|
3
|
-
import warnings
|
|
4
|
-
|
|
5
|
-
# --- PyTorch Adapter ---
|
|
6
|
-
try:
    import torch
    from torch.utils.data import Dataset
    import polars as pl

    class VesperPyTorchDataset(Dataset):
        """
        PyTorch Dataset wrapper for Vesper exported files (Parquet/CSV/Arrow).
        Efficiently loads data using Polars and converts to Tensors on demand.

        Args:
            file_path: path to a .parquet, .csv or .arrow export.
            target_col: optional label column; when present in the row,
                __getitem__ yields (features, target), otherwise a single
                feature tensor.
            transform: optional callable applied to each sample before return.
        """
        def __init__(self, file_path, target_col=None, transform=None):
            self.file_path = file_path
            self.target_col = target_col
            self.transform = transform

            # Auto-detect format from the file extension.
            if file_path.endswith(".parquet"):
                self.df = pl.read_parquet(file_path)
            elif file_path.endswith(".csv"):
                self.df = pl.read_csv(file_path, ignore_errors=True)
            elif file_path.endswith(".arrow"):
                self.df = pl.read_ipc(file_path)
            else:
                raise ValueError(f"Unsupported file format for PyTorch loader: {file_path}")

            self.data = self.df.to_pandas()  # Convert to pandas for easier row access in __getitem__ (Polars slice can be slow row-wise)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]

            # Simple assumption: all numeric columns except target are features
            # In production, metadata would tell us which columns are features
            if self.target_col and self.target_col in row:
                y = row[self.target_col]
                x = row.drop(self.target_col).values

                # Convert to tensors.
                # NOTE(review): torch.tensor(..., float32) raises for
                # non-numeric feature columns — presumably inputs are
                # all-numeric here; confirm with callers.
                x = torch.tensor(x, dtype=torch.float32)
                # Auto-detect target type (scalar vs class index)
                if isinstance(y, (int, float)):
                    y = torch.tensor(y, dtype=torch.float32)  # Regression/Binary
                else:
                    # TODO: Label encoding if string — non-numeric targets are
                    # currently passed through unconverted.
                    pass

                sample = (x, y)
            else:
                # Unsupervised: the whole row becomes the feature vector.
                x = torch.tensor(row.values, dtype=torch.float32)
                sample = x

            if self.transform:
                sample = self.transform(sample)

            return sample

except ImportError:
    # Fallback stub so importing this module never fails outright; using the
    # class without torch/polars installed raises a clear error instead.
    class VesperPyTorchDataset:
        def __init__(self, *args, **kwargs):
            raise ImportError("PyTorch or Polars not installed.")
|
|
69
|
-
|
|
70
|
-
# --- HuggingFace Adapter ---
|
|
71
|
-
try:
    from datasets import load_dataset as hf_load_dataset

    def load_vesper_dataset(file_path):
        """
        Loads a Vesper export into a Hugging Face Dataset.
        Supported: Parquet, CSV, JSONL, Arrow.

        Returns the "train" split of the loaded dataset.
        """
        # Map the file extension to the loader name HF expects.
        output_format = "parquet"  # Default fallback
        if file_path.endswith(".csv"): output_format = "csv"
        elif file_path.endswith(".jsonl"): output_format = "json"
        elif file_path.endswith(".arrow"): output_format = "arrow"

        # 'arrow' format in HF might need custom script, but usually parquet/csv/json are native
        if output_format == "arrow":
            # Use pandas/polars to read then convert to HF dataset
            import polars as pl
            from datasets import Dataset
            df = pl.read_ipc(file_path).to_pandas()
            return Dataset.from_pandas(df)

        return hf_load_dataset(output_format, data_files=file_path, split="train")

except ImportError:
    # Stub keeps the module importable when 'datasets' is absent; calling it
    # raises a clear error instead.
    def load_vesper_dataset(*args, **kwargs):
        raise ImportError("HuggingFace 'datasets' library not installed.")
|
|
97
|
-
|
|
98
|
-
if __name__ == "__main__":
    # Informational banner only — this module is meant to be imported.
    for banner_line in (
        "Vesper Framework Adapters Library",
        "Usage: import this module in your training script.",
    ):
        print(banner_line)
|