vesper-wizard 2.0.5 → 2.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/scripts/wizard.cjs +625 -0
  174. package/{wizard.js → scripts/wizard.js} +99 -21
  175. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  179. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  180. package/src/python/asset_downloader_engine.py +92 -0
  181. package/src/python/cleaner.py +226 -0
  182. package/src/python/config.py +263 -0
  183. package/src/python/dataworld_engine.py +208 -0
  184. package/src/python/export_engine.py +243 -0
  185. package/src/python/framework_adapters.py +100 -0
  186. package/src/python/fusion_engine.py +368 -0
  187. package/src/python/github_adapter.py +106 -0
  188. package/src/python/hf_fallback.py +298 -0
  189. package/src/python/image_engine.py +86 -0
  190. package/src/python/kaggle_engine.py +295 -0
  191. package/src/python/media_engine.py +133 -0
  192. package/src/python/nasa_adapter.py +82 -0
  193. package/src/python/openml_engine.py +146 -0
  194. package/src/python/quality_engine.py +267 -0
  195. package/src/python/row_count.py +54 -0
  196. package/src/python/splitter_engine.py +283 -0
  197. package/src/python/target_engine.py +154 -0
  198. package/src/python/test_framework_adapters.py +61 -0
  199. package/src/python/test_fusion_engine.py +89 -0
  200. package/src/python/uci_adapter.py +94 -0
  201. package/src/python/vesper/__init__.py +1 -0
  202. package/src/python/vesper/core/__init__.py +1 -0
  203. package/src/python/vesper/core/asset_downloader.py +675 -0
  204. package/src/python/vesper/core/download_recipe.py +104 -0
  205. package/src/python/worldbank_adapter.py +99 -0
  206. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,243 @@
1
+ import sys
2
+ import json
3
+ import polars as pl
4
+ import os
5
+ import time
6
+
7
+ # Optional imports for extra formats
8
+ try:
9
+ import pyarrow as pa
10
+ import pyarrow.feather as pf
11
+ HAS_PYARROW = True
12
+ except ImportError:
13
+ HAS_PYARROW = False
14
+
15
+ try:
16
+ import tensorflow as tf
17
+ HAS_TENSORFLOW = True
18
+ except ImportError:
19
+ HAS_TENSORFLOW = False
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Helpers
24
+ # ---------------------------------------------------------------------------
25
+
26
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Load any supported input format into a Polars DataFrame.

    Args:
        file_path: Input file; the format is inferred from the extension
            (.csv, .tsv, .txt, .parquet/.pq, .feather/.ftr/.arrow/.ipc,
            .jsonl/.ndjson).
        options: Optional settings:
            - "sample_rows" (int): randomly sample this many rows.
            - "columns" (list[str]): keep only these columns (names not
              present in the file are silently ignored).
            - "random_seed" (int): seed for sampling (default 42).

    Returns:
        A Polars DataFrame with at least one row.

    Raises:
        ValueError: If the extension is unsupported or the file has no rows.
    """
    sample_rows = options.get("sample_rows")  # int | None
    columns = options.get("columns")  # list[str] | None

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pl.read_csv(file_path, ignore_errors=True)
    elif ext == ".tsv":
        df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
    elif ext == ".txt":
        # Heuristic delimiter detection for plain text tabular files:
        # a tab in the first line wins, otherwise assume comma.
        sep = ","
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                if "\t" in fh.readline():
                    sep = "\t"
        except Exception:
            sep = ","
        df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
    elif ext in (".parquet", ".pq"):
        df = pl.read_parquet(file_path)
    elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
        df = pl.read_ipc(file_path)
    elif ext in (".jsonl", ".ndjson"):
        # .ndjson added: same newline-delimited JSON format as .jsonl.
        df = pl.read_ndjson(file_path)
    else:
        raise ValueError(f"Unsupported input format: {ext}")

    if len(df) == 0:
        # Bug fix: the old message said "empty CSV" even for parquet/jsonl/etc.
        raise ValueError(f"empty input file: {file_path}")

    # Column selection (before sampling for speed).
    if columns:
        valid = [c for c in columns if c in df.columns]
        if valid:
            df = df.select(valid)

    # Optional reproducible down-sampling.
    if sample_rows and sample_rows < len(df):
        seed = options.get("random_seed", 42)
        df = df.sample(n=sample_rows, seed=seed)

    return df
71
+
72
+
73
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return *df* with non-scalar columns serialized to strings for CSV output."""
    simple_names = {"string", "utf8", "boolean", "bool"}

    def _to_text(value):
        # Best-effort JSON serialization; fall back to str() for anything odd.
        try:
            if hasattr(value, "to_list"):
                return json.dumps(value.to_list())
            if hasattr(value, "to_dict"):
                return json.dumps(value.to_dict())
            return json.dumps(value)
        except Exception:
            return str(value)

    for name in df.columns:
        dtype = df.schema[name]
        if dtype.is_numeric() or dtype.is_temporal() or str(dtype).lower() in simple_names:
            continue
        df = df.with_columns(pl.col(name).map_elements(_to_text, return_dtype=pl.Utf8))
    return df
96
+
97
+
98
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Write a small CSV preview next to the exported file and return its path."""
    base, _ext = os.path.splitext(output_path)
    preview_path = base + "_preview.csv"
    rows = min(n, len(df))
    _safe_csv_df(df.head(rows)).write_csv(preview_path)
    return preview_path
104
+
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # Main export function
108
+ # ---------------------------------------------------------------------------
109
+
110
def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
    """Export a tabular file to another format.

    Args:
        file_path: Input file (csv/tsv/txt/parquet/feather/arrow/jsonl).
        output_path: Destination path; missing parent dirs are created.
        format: One of "feather", "parquet", "csv", "jsonl", "arrow"/"ipc",
            "tfrecord".
        options: Optional settings: "compression", "preview" (bool), plus
            anything `_load` understands ("sample_rows", "columns",
            "random_seed").

    Returns:
        A result dict with success metadata, or {"error": ...} on failure —
        this function reports errors through the return value, not exceptions.
    """
    options = options or {}
    t0 = time.perf_counter()

    # ---- Load ----
    try:
        df = _load(file_path, options)
    except Exception as e:
        return {"error": f"Failed to load input file: {str(e)}"}

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    preview_path = None
    generate_preview = options.get("preview", False)

    try:
        # ---- Feather (Arrow IPC) – fastest binary format ----
        if format == "feather":
            if not HAS_PYARROW:
                return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
            compression = options.get("compression", "lz4")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            # Feather v2 is the Arrow IPC format, so to_arrow() round-trips cleanly.
            arrow_table = df.to_arrow()
            pf.write_feather(arrow_table, output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Parquet – best compression, big-data friendly ----
        elif format == "parquet":
            compression = options.get("compression", "snappy")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            df.write_parquet(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- CSV – human-readable fallback ----
        elif format == "csv":
            df = _safe_csv_df(df)
            df.write_csv(output_path)
            # Bug fix: the CSV branch previously ignored options["preview"],
            # unlike every other format.
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- JSONL ----
        elif format == "jsonl":
            df.write_ndjson(output_path)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Arrow IPC (legacy name kept for compat) ----
        elif format in ("arrow", "ipc"):
            compression = options.get("compression", "uncompressed")
            if compression == "uncompressed":
                compression = None
            df.write_ipc(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- TFRecord ----
        elif format == "tfrecord":
            if not HAS_TENSORFLOW:
                return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
            with tf.io.TFRecordWriter(output_path) as writer:
                pdf = df.to_pandas()
                for _, row in pdf.iterrows():
                    feature = {}
                    for col, value in row.items():
                        if value is None:
                            continue
                        # Bug fix: pandas yields numpy scalars (np.int64 etc.)
                        # which are NOT isinstance(..., int), so integer
                        # columns used to fall through to the bytes branch.
                        # Unbox them to native Python scalars first.
                        if hasattr(value, "item"):
                            value = value.item()
                        if isinstance(value, int):
                            feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
                        elif isinstance(value, float):
                            feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
                        elif isinstance(value, str):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
                        elif isinstance(value, bytes):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
                        else:
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))
                    example = tf.train.Example(features=tf.train.Features(feature=feature))
                    writer.write(example.SerializeToString())

        else:
            return {"error": f"Unknown export format: {format}"}

        elapsed = round(time.perf_counter() - t0, 3)
        file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)

        result = {
            "success": True,
            "output_path": output_path,
            "rows": len(df),
            "columns": len(df.columns),
            "format": format,
            "compression": options.get("compression", "default"),
            "file_size_mb": file_size_mb,
            "elapsed_seconds": elapsed,
        }
        if preview_path:
            result["preview_path"] = preview_path

        return result

    except Exception as e:
        return {"error": f"Export failed: {str(e)}"}
217
+
218
+
219
def main():
    """CLI entry point: export_engine.py <input> <output> <format> [options_json].

    Prints a JSON result dict on stdout; usage errors and malformed
    options go to stderr.
    """
    if len(sys.argv) < 4:
        print(
            json.dumps({"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}),
            file=sys.stderr,
        )
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    fmt = sys.argv[3]

    options = {}
    if len(sys.argv) > 4:
        try:
            options = json.loads(sys.argv[4])
        except Exception as e:
            # Bug fix: invalid options JSON used to be swallowed silently;
            # keep the best-effort behaviour but tell the caller on stderr.
            print(json.dumps({"warning": f"Ignoring invalid options JSON: {e}"}), file=sys.stderr)

    result = export_data(input_file, output_file, fmt, options)
    print(json.dumps(result))


if __name__ == "__main__":
    main()
@@ -0,0 +1,100 @@
1
+ import os
2
+ import json
3
+ import warnings
4
+
5
# --- PyTorch Adapter ---
try:
    import torch
    from torch.utils.data import Dataset
    import polars as pl

    class VesperPyTorchDataset(Dataset):
        """
        PyTorch Dataset wrapper for Vesper exported files (Parquet/CSV/Arrow).

        Loads the whole file eagerly with Polars, keeps a pandas copy for
        cheap row access, and converts rows to tensors on demand.

        Args:
            file_path: Path to a .parquet, .csv or .arrow export.
            target_col: Optional label column; when given, __getitem__
                returns (features, target), otherwise just features.
            transform: Optional callable applied to each sample.
        """
        def __init__(self, file_path, target_col=None, transform=None):
            self.file_path = file_path
            self.target_col = target_col
            self.transform = transform

            # Auto-detect format from the extension.
            if file_path.endswith(".parquet"):
                self.df = pl.read_parquet(file_path)
            elif file_path.endswith(".csv"):
                self.df = pl.read_csv(file_path, ignore_errors=True)
            elif file_path.endswith(".arrow"):
                self.df = pl.read_ipc(file_path)
            else:
                raise ValueError(f"Unsupported file format for PyTorch loader: {file_path}")

            # Convert to pandas for easier row access in __getitem__
            # (Polars slice can be slow row-wise).
            self.data = self.df.to_pandas()

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]

            # Simple assumption: all numeric columns except target are features.
            # In production, metadata would tell us which columns are features.
            if self.target_col and self.target_col in row:
                y = row[self.target_col]
                x = row.drop(self.target_col).values

                # Convert features to a tensor.
                x = torch.tensor(x, dtype=torch.float32)
                # Bug fix: pandas yields numpy scalars, and np.int64 is NOT an
                # `int`, so integer targets previously skipped the tensor
                # conversion below. Unbox numpy scalars before the type check.
                if hasattr(y, "item"):
                    y = y.item()
                if isinstance(y, (int, float)):
                    y = torch.tensor(y, dtype=torch.float32)  # Regression/Binary
                else:
                    # TODO: Label encoding if string
                    pass

                sample = (x, y)
            else:
                # Unsupervised: every column is a feature.
                x = torch.tensor(row.values, dtype=torch.float32)
                sample = x

            if self.transform:
                sample = self.transform(sample)

            return sample

except ImportError:
    class VesperPyTorchDataset:
        # Import-time fallback so the module still imports without torch/polars.
        def __init__(self, *args, **kwargs):
            raise ImportError("PyTorch or Polars not installed.")
69
+
70
# --- HuggingFace Adapter ---
try:
    from datasets import load_dataset as hf_load_dataset

    def load_vesper_dataset(file_path):
        """
        Loads a Vesper export into a Hugging Face Dataset.
        Supported: Parquet, CSV, JSONL, Arrow.
        """
        # Map the extension onto an HF loader name; parquet is the fallback.
        if file_path.endswith(".csv"):
            output_format = "csv"
        elif file_path.endswith(".jsonl"):
            output_format = "json"
        elif file_path.endswith(".arrow"):
            output_format = "arrow"
        else:
            output_format = "parquet"

        # 'arrow' format in HF might need custom script, but usually parquet/csv/json are native
        if output_format == "arrow":
            # Use pandas/polars to read then convert to HF dataset
            import polars as pl
            from datasets import Dataset
            df = pl.read_ipc(file_path).to_pandas()
            return Dataset.from_pandas(df)

        return hf_load_dataset(output_format, data_files=file_path, split="train")

except ImportError:
    def load_vesper_dataset(*args, **kwargs):
        raise ImportError("HuggingFace 'datasets' library not installed.")
97
+
98
if __name__ == "__main__":
    # Informational banner when run directly; normal use is `import`.
    for _line in (
        "Vesper Framework Adapters Library",
        "Usage: import this module in your training script.",
    ):
        print(_line)