vesper-wizard 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +300 -37
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +81 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +62 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +127 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +26 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/config/config-manager.js +221 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +69 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/engine.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/gateway/unified-dataset-gateway.js +409 -0
- package/build/index.js +2704 -0
- package/build/ingestion/hf-downloader.js +171 -0
- package/build/ingestion/ingestor.js +271 -0
- package/build/ingestion/kaggle-downloader.js +102 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +136 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/lib/supabase.js +3 -0
- package/build/metadata/dataworld-source.js +89 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/openml-source.js +87 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +377 -0
- package/build/metadata/store.js +340 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/preparation/target-detector.js +75 -0
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +92 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/config.py +263 -0
- package/build/python/dataworld_engine.py +208 -0
- package/build/python/export_engine.py +243 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/fusion_engine.py +368 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/hf_fallback.py +298 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/kaggle_engine.py +295 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/openml_engine.py +146 -0
- package/build/python/quality_engine.py +267 -0
- package/build/python/row_count.py +54 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +675 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +93 -0
- package/build/quality/image-analyzer.js +114 -0
- package/build/quality/media-analyzer.js +115 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +74 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +152 -0
- package/build/search/jit-orchestrator.js +258 -0
- package/build/search/vector-store.js +123 -0
- package/build/splitting/splitter.js +82 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +251 -0
- package/build/utils/downloader.js +52 -0
- package/build/utils/selector.js +69 -0
- package/mcp-config-template.json +18 -0
- package/package.json +101 -29
- package/scripts/postinstall.cjs +114 -0
- package/scripts/preindex_registry.cjs +157 -0
- package/scripts/refresh-index.cjs +87 -0
- package/{wizard.js → scripts/wizard.js} +148 -32
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +92 -0
- package/src/python/cleaner.py +226 -0
- package/src/python/config.py +263 -0
- package/src/python/dataworld_engine.py +208 -0
- package/src/python/export_engine.py +243 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/fusion_engine.py +368 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/hf_fallback.py +298 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/kaggle_engine.py +295 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/openml_engine.py +146 -0
- package/src/python/quality_engine.py +267 -0
- package/src/python/row_count.py +54 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/target_engine.py +154 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/test_fusion_engine.py +89 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +675 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
- package/src/python/worldbank_adapter.py +99 -0
- package/vesper-mcp-config.json +0 -6
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import polars as pl
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
# Optional imports for extra formats
|
|
8
|
+
try:
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
import pyarrow.feather as pf
|
|
11
|
+
HAS_PYARROW = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
HAS_PYARROW = False
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
import tensorflow as tf
|
|
17
|
+
HAS_TENSORFLOW = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
HAS_TENSORFLOW = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Helpers
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Load any supported input format into a Polars DataFrame.

    Args:
        file_path: Path to the input file; the format is inferred from the
            extension (.csv, .tsv, .txt, .parquet/.pq, .feather/.ftr/.arrow/.ipc,
            .jsonl).
        options: Optional keys:
            - "sample_rows" (int): randomly sample at most this many rows.
            - "columns" (list[str]): keep only these columns (unknown names
              are silently ignored).
            - "random_seed" (int): seed for sampling (default 42).

    Returns:
        The loaded (optionally column-projected and sampled) DataFrame.

    Raises:
        ValueError: for an unsupported extension or an empty dataset.
    """
    sample_rows = options.get("sample_rows")  # int | None
    columns = options.get("columns")  # list[str] | None

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pl.read_csv(file_path, ignore_errors=True)
    elif ext == ".tsv":
        df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
    elif ext == ".txt":
        # Heuristic delimiter detection for plain text tabular files:
        # a tab in the first line wins, otherwise fall back to comma.
        sep = ","
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                if "\t" in fh.readline():
                    sep = "\t"
        except Exception:
            sep = ","
        df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
    elif ext in (".parquet", ".pq"):
        df = pl.read_parquet(file_path)
    elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
        df = pl.read_ipc(file_path)
    elif ext == ".jsonl":
        df = pl.read_ndjson(file_path)
    else:
        raise ValueError(f"Unsupported input format: {ext}")

    if len(df) == 0:
        # BUG FIX: the old message said "empty CSV" even when the input was
        # parquet/feather/jsonl, which made the diagnostic misleading.
        raise ValueError(f"empty dataset: {file_path}")

    # Column selection (before sampling for speed)
    if columns:
        valid = [c for c in columns if c in df.columns]
        if valid:
            df = df.select(valid)

    # Optional sampling
    if sample_rows and sample_rows < len(df):
        seed = options.get("random_seed", 42)
        df = df.sample(n=sample_rows, seed=seed)

    return df
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Stringify complex columns so CSV doesn't choke.

    Numeric, temporal, string and boolean columns pass through unchanged;
    every other dtype (lists, structs, ...) is JSON-serialized, falling back
    to ``str()`` for values json cannot encode.
    """

    # Hoisted out of the column loop: the original re-defined this closure on
    # every iteration for no benefit.
    def _serialize(val):
        try:
            # Unwrap nested Polars/Arrow containers into plain Python first.
            if hasattr(val, "to_list"):
                return json.dumps(val.to_list())
            if hasattr(val, "to_dict"):
                return json.dumps(val.to_dict())
            return json.dumps(val)
        except Exception:
            return str(val)

    exprs = []
    for col in df.columns:
        dtype = df.schema[col]
        is_simple = (
            dtype.is_numeric()
            or dtype.is_temporal()
            or str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
        )
        if not is_simple:
            exprs.append(pl.col(col).map_elements(_serialize, return_dtype=pl.Utf8))

    # PERF FIX: one with_columns() pass for all complex columns instead of one
    # full-frame rewrite per column.
    if exprs:
        df = df.with_columns(exprs)
    return df
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Write a small CSV preview next to the exported file.

    At most *n* rows are taken from the head of *df*, complex columns are
    stringified so the CSV writer can cope, and the file is written to
    ``<output stem>_preview.csv``. Returns the preview file's path.
    """
    stem, _ = os.path.splitext(output_path)
    preview_path = stem + "_preview.csv"
    row_count = min(n, len(df))
    _safe_csv_df(df.head(row_count)).write_csv(preview_path)
    return preview_path
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ---------------------------------------------------------------------------
|
|
107
|
+
# Main export function
|
|
108
|
+
# ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
    """Export a dataset file to *format*, writing the result to *output_path*.

    Supported formats: "feather", "parquet", "csv", "jsonl", "arrow"/"ipc",
    "tfrecord".

    Args:
        file_path: Input file (any format ``_load`` understands).
        output_path: Destination path; missing parent directories are created.
        format: Target format name (see above).
        options: Optional keys: "compression", "preview" (bool, write a small
            CSV preview next to the export), plus the sampling/column options
            consumed by ``_load``.

    Returns:
        On success, a dict with "success", "output_path", "rows", "columns",
        "format", "compression", "file_size_mb", "elapsed_seconds" and, when
        a preview was written, "preview_path". On failure, a dict with a
        single "error" key — this function never raises.
    """
    options = options or {}
    t0 = time.perf_counter()

    # ---- Load ----
    try:
        df = _load(file_path, options)
    except Exception as e:
        return {"error": f"Failed to load input file: {str(e)}"}

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    preview_path = None
    generate_preview = options.get("preview", False)

    try:
        # ---- Feather (Arrow IPC) – fastest binary format ----
        if format == "feather":
            if not HAS_PYARROW:
                return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
            compression = options.get("compression", "lz4")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            # Polars write_ipc uses Arrow IPC (= Feather v2) under the hood
            arrow_table = df.to_arrow()
            pf.write_feather(arrow_table, output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Parquet – best compression, big-data friendly ----
        elif format == "parquet":
            compression = options.get("compression", "snappy")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            df.write_parquet(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- CSV – human-readable fallback (no separate preview needed) ----
        elif format == "csv":
            df = _safe_csv_df(df)
            df.write_csv(output_path)

        # ---- JSONL ----
        elif format == "jsonl":
            df.write_ndjson(output_path)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Arrow IPC (legacy name kept for compat) ----
        elif format in ("arrow", "ipc"):
            compression = options.get("compression", "uncompressed")
            if compression == "uncompressed":
                compression = None
            df.write_ipc(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- TFRecord ----
        elif format == "tfrecord":
            if not HAS_TENSORFLOW:
                return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
            _write_tfrecord(df, output_path)

        else:
            return {"error": f"Unknown export format: {format}"}

        elapsed = round(time.perf_counter() - t0, 3)
        file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)

        result = {
            "success": True,
            "output_path": output_path,
            "rows": len(df),
            "columns": len(df.columns),
            "format": format,
            "compression": options.get("compression", "default"),
            "file_size_mb": file_size_mb,
            "elapsed_seconds": elapsed,
        }
        if preview_path:
            result["preview_path"] = preview_path

        return result

    except Exception as e:
        return {"error": f"Export failed: {str(e)}"}


def _tf_feature(value):
    """Map one plain Python scalar to the matching tf.train.Feature.

    Anything that is not int/float/str/bytes is stringified and stored as
    UTF-8 bytes (same behavior as the original inline chain).
    """
    if isinstance(value, int):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    if isinstance(value, float):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    if isinstance(value, str):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
    if isinstance(value, bytes):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))


def _write_tfrecord(df, output_path: str) -> None:
    """Serialize every row of *df* as a tf.train.Example into a TFRecord file."""
    with tf.io.TFRecordWriter(output_path) as writer:
        pdf = df.to_pandas()
        for _, row in pdf.iterrows():
            feature = {}
            for col, value in row.items():
                if value is None:
                    continue
                # BUG FIX: pandas hands back numpy scalars (np.int64,
                # np.float64, np.bool_) here, which are NOT instances of
                # int/float, so numeric columns used to fall through to the
                # "stringify as bytes" branch. Unwrap to plain Python first.
                if hasattr(value, "item"):
                    try:
                        value = value.item()
                    except Exception:
                        pass
                feature[col] = _tf_feature(value)
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def main():
    """CLI entry point: export_engine.py <input> <output> <format> [options_json].

    Prints a JSON result document to stdout; usage errors go to stderr and
    exit with status 1.
    """
    argv = sys.argv
    if len(argv) < 4:
        usage = {"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}
        print(json.dumps(usage), file=sys.stderr)
        sys.exit(1)

    input_file, output_file, fmt = argv[1], argv[2], argv[3]

    # A malformed options blob is ignored rather than fatal.
    opts = {}
    if len(argv) > 4:
        try:
            opts = json.loads(argv[4])
        except Exception:
            opts = {}

    print(json.dumps(export_data(input_file, output_file, fmt, opts)))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import warnings
|
|
4
|
+
|
|
5
|
+
# --- PyTorch Adapter ---
# Module-level import guard: if torch or polars is missing, a stub class with
# the same name is defined instead, so importing this module never fails.
try:
    import torch
    from torch.utils.data import Dataset
    import polars as pl

    class VesperPyTorchDataset(Dataset):
        """
        PyTorch Dataset wrapper for Vesper exported files (Parquet/CSV/Arrow).
        Efficiently loads data using Polars and converts to Tensors on demand.
        """
        def __init__(self, file_path, target_col=None, transform=None):
            # file_path: path to a .parquet/.csv/.arrow export.
            # target_col: name of the label column; None means unsupervised.
            # transform: optional callable applied to each sample.
            self.file_path = file_path
            self.target_col = target_col
            self.transform = transform

            # Auto-detect format
            if file_path.endswith(".parquet"):
                self.df = pl.read_parquet(file_path)
            elif file_path.endswith(".csv"):
                self.df = pl.read_csv(file_path, ignore_errors=True)
            elif file_path.endswith(".arrow"):
                self.df = pl.read_ipc(file_path)
            else:
                raise ValueError(f"Unsupported file format for PyTorch loader: {file_path}")

            # The whole table is materialized in memory twice (Polars + pandas).
            self.data = self.df.to_pandas()  # Convert to pandas for easier row access in __getitem__ (Polars slice can be slow row-wise)

        def __len__(self):
            # Number of rows in the loaded table.
            return len(self.data)

        def __getitem__(self, idx):
            # Returns (features, target) when target_col is set and present,
            # otherwise a single feature tensor.
            row = self.data.iloc[idx]

            # Simple assumption: all numeric columns except target are features
            # In production, metadata would tell us which columns are features
            if self.target_col and self.target_col in row:
                y = row[self.target_col]
                x = row.drop(self.target_col).values

                # Convert to tensors
                # NOTE(review): torch.tensor(...) will raise here if any
                # remaining feature column is non-numeric — confirm callers
                # only pass numeric exports.
                x = torch.tensor(x, dtype=torch.float32)
                # Auto-detect target type (scalar vs class index)
                # NOTE(review): pandas typically yields numpy scalars
                # (np.int64/np.float64), which are not instances of
                # int/float — in that case y is returned un-tensorized.
                # Confirm whether that is intended.
                if isinstance(y, (int, float)):
                    y = torch.tensor(y, dtype=torch.float32)  # Regression/Binary
                else:
                    # TODO: Label encoding if string
                    pass

                sample = (x, y)
            else:
                # Unsupervised
                x = torch.tensor(row.values, dtype=torch.float32)
                sample = x

            if self.transform:
                sample = self.transform(sample)

            return sample

except ImportError:
    # Stub keeps the import surface stable; any instantiation fails loudly.
    class VesperPyTorchDataset:
        def __init__(self, *args, **kwargs):
            raise ImportError("PyTorch or Polars not installed.")
|
|
69
|
+
|
|
70
|
+
# --- HuggingFace Adapter ---
try:
    from datasets import load_dataset as hf_load_dataset

    def load_vesper_dataset(file_path):
        """
        Loads a Vesper export into a Hugging Face Dataset.
        Supported: Parquet, CSV, JSONL, Arrow.
        """
        # Map known suffixes to the HF builder name; anything unrecognized
        # is treated as parquet.
        suffix_to_builder = {".csv": "csv", ".jsonl": "json", ".arrow": "arrow"}
        output_format = "parquet"  # Default fallback
        for suffix, builder in suffix_to_builder.items():
            if file_path.endswith(suffix):
                output_format = builder
                break

        # 'arrow' format in HF might need custom script, but usually parquet/csv/json are native
        if output_format == "arrow":
            # Use pandas/polars to read then convert to HF dataset
            import polars as pl
            from datasets import Dataset
            frame = pl.read_ipc(file_path).to_pandas()
            return Dataset.from_pandas(frame)

        return hf_load_dataset(output_format, data_files=file_path, split="train")

except ImportError:
    # Stub keeps the function importable; any call fails loudly.
    def load_vesper_dataset(*args, **kwargs):
        raise ImportError("HuggingFace 'datasets' library not installed.")
|
|
97
|
+
|
|
98
|
+
if __name__ == "__main__":
|
|
99
|
+
print("Vesper Framework Adapters Library")
|
|
100
|
+
print("Usage: import this module in your training script.")
|