vesper-wizard 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +1 -1
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/requirements.txt +0 -12
  203. package/src/python/row_count.py +0 -54
  204. package/src/python/splitter_engine.py +0 -283
  205. package/src/python/target_engine.py +0 -154
  206. package/src/python/test_framework_adapters.py +0 -61
  207. package/src/python/test_fusion_engine.py +0 -89
  208. package/src/python/uci_adapter.py +0 -94
  209. package/src/python/vesper/__init__.py +0 -1
  210. package/src/python/vesper/core/__init__.py +0 -1
  211. package/src/python/vesper/core/asset_downloader.py +0 -679
  212. package/src/python/vesper/core/download_recipe.py +0 -104
  213. package/src/python/worldbank_adapter.py +0 -99
  214. package/wizard.cjs +0 -3
@@ -1,208 +0,0 @@
1
- import sys
2
- import json
3
- import argparse
4
- import tempfile
5
- import os
6
- import urllib.request
7
- import urllib.error
8
- import urllib.parse
9
- from typing import Dict, Any, List
10
-
11
def _get_token() -> str:
    """Return the data.world API token read from ``DW_AUTH_TOKEN``.

    Surrounding whitespace (for example a trailing newline picked up when the
    variable was populated from a file) is stripped, because the value is
    placed verbatim into an ``Authorization`` header by the callers.

    Raises:
        ValueError: if the variable is unset, empty, or whitespace only.
    """
    token = (os.environ.get("DW_AUTH_TOKEN") or "").strip()
    if not token:
        raise ValueError("DW_AUTH_TOKEN environment variable is required for data.world")
    return token
16
-
17
def _dataset_to_dict(ds: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one data.world search record into Vesper's dataset metadata dict.

    The owner and dataset id are resolved from, in order: the ``owner`` field
    (a string, or a dict carrying ``id``/``name``), the ``id`` field, the last
    two path segments of ``resourceLink``, and finally an ``owner/id`` style
    ``id``. When neither can be determined both fall back to ``"unknown"``.

    Fields that are present but null (``tags``, ``description``, ``id``,
    ``title``) are treated like missing fields instead of raising.

    Args:
        ds: A single record from the data.world search response.

    Returns:
        A dict with the fixed key set built below; ``id`` has the form
        ``dataworld:<owner>/<dataset-id>``.
    """
    owner_field = ds.get("owner", "")
    if isinstance(owner_field, dict):
        owner = owner_field.get("id") or owner_field.get("name") or ""
    else:
        owner = owner_field or ""

    # `or ""` so an explicit null does not end up rendered as the text "None".
    id_str = ds.get("id") or ""
    title = ds.get("title") or ""

    if (not owner or not id_str) and isinstance(ds.get("resourceLink"), str):
        # Expected format includes /<owner>/<dataset-id>
        parts = ds["resourceLink"].strip("/").split("/")
        if len(parts) >= 2:
            owner = owner or parts[-2]
            id_str = id_str or parts[-1]

    if isinstance(id_str, str) and "/" in id_str and not owner:
        owner, id_str = id_str.split("/", 1)

    if not owner and not id_str:
        owner = "unknown"
        id_str = "unknown"

    if not title:
        title = f"{owner}/{id_str}"

    # An empty-string description is kept as-is; only a missing/non-string
    # value is replaced by the generated default.
    raw_description = ds.get("description")
    description = raw_description if isinstance(raw_description, str) else None

    raw_tags = ds.get("tags")
    tags = list(raw_tags) if isinstance(raw_tags, (list, tuple)) else []

    return {
        "id": f"dataworld:{owner}/{id_str}",
        "name": title,
        "source": "dataworld",
        "description": description if description is not None else f"data.world dataset {title}",
        "author": owner,
        "license": {
            "id": "Unknown",
            "category": "unknown",
            "commercial_use": None,
            "warnings": [],
        },
        "tags": tags + ["dataworld"],
        "downloads": 0,
        "likes": 0,
        "created_at": ds.get("created", ""),
        "updated_at": ds.get("updated", ""),
        "size_bytes": 0,
        "quality_score": 0.8,
        "domain": "general",
        "is_gated": False,
        "is_nsfw": False,
        "description_length": len(description or ""),
        "has_readme": False,
        "download_url": f"https://data.world/{owner}/{id_str}",
    }
72
-
73
def _dw_post_json(url: str, payload: Dict[str, Any], headers: Dict[str, str], timeout: float = 30.0) -> Any:
    """POST *payload* as JSON to *url* and return the decoded JSON response."""
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers=headers,
        method="POST",
    )
    # A timeout keeps a stalled connection from hanging the engine forever.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.loads(response.read().decode("utf-8"))


def discover(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search data.world for datasets matching *query*.

    The simple ``/search/resources`` endpoint is tried first (including
    community results); if it yields no records the ``/search`` endpoint is
    queried as a fallback.

    Args:
        query: Free-text search string.
        limit: Page size sent to the API; coerced to an integer of at least 1.

    Returns:
        ``{"ok": True, "results": [...], "count": n}`` on success, or
        ``{"ok": False, "error": "..."}`` on any failure (missing token,
        network error, unexpected response). This function does not raise.
    """
    try:
        token = _get_token()
        size = max(1, int(limit))

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

        # Search datasets and include community results to improve recall
        data = _dw_post_json(
            f"https://api.data.world/v0/search/resources?size={size}",
            {
                "query": query,
                "category": ["dataset"],
                "includeCommunityResults": True,
            },
            headers,
        )
        # `or []` also covers a response where "records" is present but null.
        records = data.get("records") or []

        # Fallback to advanced endpoint if simple search returns nothing
        if not records:
            adv_data = _dw_post_json(
                f"https://api.data.world/v0/search?size={size}",
                {
                    "query": query,
                    "category": ["dataset"],
                },
                headers,
            )
            records = adv_data.get("records") or []

        # Skip anything that is not a record object rather than failing the
        # whole search on one malformed entry.
        items = [_dataset_to_dict(r) for r in records if isinstance(r, dict)]

        return {"ok": True, "results": items, "count": len(items)}
    except Exception as e:
        return {"ok": False, "error": f"data.world discover failed: {str(e)}"}
122
-
123
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
    """Download one file of a data.world dataset into *target_dir*.

    The dataset's file list is fetched first; the first file whose name ends
    in ``.parquet``, ``.csv``, ``.jsonl`` or ``.json`` (checked in that order
    of preference) is chosen, otherwise the first listed file.

    Args:
        dataset_ref: ``"dataworld:owner/id"`` or plain ``"owner/id"``.
        target_dir: Destination directory; a fresh temporary directory is
            created when this is empty/None.

    Returns:
        ``{"ok": True, "local_path": ..., "target_dir": ...}`` on success, or
        ``{"ok": False, "error": "..."}`` on any failure. This function does
        not raise.
    """
    import shutil  # function-scope: only this function streams to disk

    try:
        token = _get_token()

        # dataset_ref is expected to be "dataworld:owner/id"
        if dataset_ref.startswith("dataworld:"):
            ref = dataset_ref.split(":", 1)[1]
        else:
            ref = dataset_ref

        parts = ref.split("/")
        # Both segments must be non-empty ("owner/" or "/id" is not a valid ref).
        if len(parts) != 2 or not all(parts):
            return {"ok": False, "error": f"Invalid data.world dataset ID format. Expected owner/id, got {ref}"}

        owner, dataset_id = parts

        if not target_dir:
            target_dir = tempfile.mkdtemp(prefix="vesper_dataworld_")

        os.makedirs(target_dir, exist_ok=True)

        owner_q = urllib.parse.quote(owner, safe="")
        dataset_q = urllib.parse.quote(dataset_id, safe="")

        # First, get the dataset metadata to find the files
        url = f"https://api.data.world/v0/datasets/{owner_q}/{dataset_q}"
        headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/json",
        }

        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=60) as response:
            dataset_meta = json.loads(response.read().decode("utf-8"))

        files = dataset_meta.get("files") or []
        if not files:
            return {"ok": False, "error": "No files found in this dataset"}

        # Pick the best file: extension preference first, listing order second.
        preferred = (".parquet", ".csv", ".jsonl", ".json")
        best_file = next(
            (
                f
                for ext in preferred
                for f in files
                if (f.get("name") or "").lower().endswith(ext)
            ),
            files[0],  # Just take the first one if no preferred format
        )

        filename = best_file.get("name")
        if not isinstance(filename, str) or not filename:
            return {"ok": False, "error": "Selected dataset file has no name"}

        # The name comes from the remote API: refuse anything that would
        # resolve outside target_dir (absolute paths, "../" segments).
        file_path = os.path.join(target_dir, filename)
        root = os.path.realpath(target_dir)
        resolved = os.path.realpath(file_path)
        if os.path.commonpath([root, resolved]) != root:
            return {"ok": False, "error": f"Refusing to write outside target directory: {filename}"}

        # Names may contain sub-folders; make sure they exist before opening.
        os.makedirs(os.path.dirname(resolved), exist_ok=True)

        # Download the file
        download_url = (
            f"https://api.data.world/v0/file_download/{owner_q}/{dataset_q}/"
            f"{urllib.parse.quote(filename)}"
        )
        download_req = urllib.request.Request(download_url, headers=headers)
        try:
            with urllib.request.urlopen(download_req, timeout=60) as response, open(file_path, "wb") as out_file:
                # Stream in chunks instead of holding the whole file in memory.
                shutil.copyfileobj(response, out_file)
        except Exception:
            # Do not leave a truncated file behind that looks like a result.
            try:
                os.remove(file_path)
            except OSError:
                pass
            raise

        return {
            "ok": True,
            "local_path": file_path,
            "target_dir": target_dir,
        }
    except Exception as e:
        return {"ok": False, "error": f"data.world download failed: {str(e)}"}
190
-
191
def main():
    """Command-line entry point: run ``discover`` or ``download`` and print JSON.

    Usage:
        <script> discover <query> [limit]
        <script> download <dataset-id> [target-dir]

    Exactly one JSON object is written to stdout. A limit that is not an
    integer is reported through the same ``{"ok": False, "error": ...}``
    envelope instead of an uncaught traceback.
    """
    parser = argparse.ArgumentParser(description="Vesper data.world Engine")
    parser.add_argument("action", choices=["discover", "download"])
    parser.add_argument("arg1", help="Query for discover, Dataset ID for download")
    parser.add_argument("arg2", nargs="?", help="Limit for discover, Target Dir for download")

    args = parser.parse_args()

    if args.action == "discover":
        try:
            limit = int(args.arg2) if args.arg2 else 20
        except ValueError:
            result = {
                "ok": False,
                "error": f"Invalid limit {args.arg2!r}: expected an integer",
            }
        else:
            result = discover(args.arg1, limit)
    else:
        # argparse `choices` guarantees the only other action is "download".
        result = download(args.arg1, args.arg2)

    print(json.dumps(result))


if __name__ == "__main__":
    main()
@@ -1,288 +0,0 @@
1
- import sys
2
- import json
3
- import polars as pl
4
- import os
5
- import time
6
-
7
- # Optional imports for extra formats
8
- try:
9
- import pyarrow as pa
10
- import pyarrow.feather as pf
11
- HAS_PYARROW = True
12
- except ImportError:
13
- HAS_PYARROW = False
14
-
15
- try:
16
- import tensorflow as tf
17
- HAS_TENSORFLOW = True
18
- except ImportError:
19
- HAS_TENSORFLOW = False
20
-
21
-
22
- # ---------------------------------------------------------------------------
23
- # Helpers
24
- # ---------------------------------------------------------------------------
25
-
26
def _load_json(file_path: str) -> pl.DataFrame:
    """Load a ``.json`` file that is an array, NDJSON, or a single wrapper object.

    The first 512 characters are inspected to decide which shape to try.
    """
    with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
        stripped = fh.read(512).lstrip()  # peek only; the file may be large

    if stripped.startswith("["):
        # Array of objects — standard JSON
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            data = json.load(fh)
        if isinstance(data, list) and len(data) > 0:
            return pl.DataFrame(data)
        raise ValueError("JSON file is empty or not an array of objects")

    if stripped.startswith("{"):
        # Could be NDJSON or a single object wrapping rows
        try:
            return pl.read_ndjson(file_path)
        except Exception:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                data = json.load(fh)
            # Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
            rows = None
            if isinstance(data, dict):
                for key in ("data", "rows", "records", "items", "results", "entries"):
                    if key in data and isinstance(data[key], list):
                        rows = data[key]
                        break
            if rows is None:
                # Last resort: treat the whole object as a single row
                rows = [data]
            if rows and len(rows) > 0:
                return pl.DataFrame(rows)
            raise ValueError("Could not parse JSON structure into tabular data")

    raise ValueError("JSON file does not start with [ or {")


def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Load any supported input format into a Polars DataFrame.

    The format is chosen from the file extension. Recognised *options* keys:

    * ``columns`` – list of column names to keep; names not present in the
      file are ignored, and if none match the frame is left unchanged.
    * ``sample_rows`` – if set and smaller than the row count, a random
      sample of that many rows is returned.
    * ``random_seed`` – seed for the sampling (default 42).

    Raises:
        ValueError: unsupported extension, unparsable JSON/Excel input, or an
            input that contains no rows.
    """
    sample_rows = options.get("sample_rows")  # int | None
    columns = options.get("columns")          # list[str] | None

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".csv":
        df = pl.read_csv(file_path, ignore_errors=True)
    elif ext == ".tsv":
        df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
    elif ext == ".txt":
        # Heuristic delimiter detection for plain text tabular files.
        sep = ","
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                first_line = fh.readline()
            if "\t" in first_line:
                sep = "\t"
        except Exception:
            # Best effort: fall back to comma and let read_csv report real I/O errors.
            sep = ","
        df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
    elif ext in (".parquet", ".pq"):
        df = pl.read_parquet(file_path)
    elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
        df = pl.read_ipc(file_path)
    elif ext in (".jsonl", ".ndjson"):
        df = pl.read_ndjson(file_path)
    elif ext == ".json":
        # Auto-detect: array-of-objects vs NDJSON vs nested structures
        try:
            df = _load_json(file_path)
        except pl.exceptions.ComputeError as ce:
            raise ValueError(f"Failed to parse JSON: {ce}") from ce
    elif ext == ".xlsx":
        try:
            df = pl.read_excel(file_path)
        except Exception as e:
            raise ValueError(f"Failed to read Excel file: {e}") from e
    else:
        raise ValueError(f"Unsupported input format: {ext}")

    if len(df) == 0:
        # This check runs for every format, so the message must not claim CSV.
        raise ValueError("Input file contains no rows")

    # Column selection (before sampling for speed)
    if columns:
        valid = [c for c in columns if c in df.columns]
        if valid:
            df = df.select(valid)

    # Optional sampling
    if sample_rows and sample_rows < len(df):
        seed = options.get("random_seed", 42)
        df = df.sample(n=sample_rows, seed=seed)

    return df
116
-
117
-
118
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return *df* with every non-scalar column rendered as text.

    Numeric, temporal, string and boolean columns pass through untouched.
    Every other column is mapped element-wise to a JSON string (falling back
    to ``str()`` when JSON encoding fails) so that ``write_csv`` can emit it.
    """

    def _as_text(cell):
        # Unwrap container-like values first, then JSON-encode the result.
        try:
            if hasattr(cell, "to_list"):
                return json.dumps(cell.to_list())
            if hasattr(cell, "to_dict"):
                return json.dumps(cell.to_dict())
            return json.dumps(cell)
        except Exception:
            return str(cell)

    for name in df.columns:
        kind = df.schema[name]
        if (
            kind.is_numeric()
            or kind.is_temporal()
            or str(kind).lower() in ["string", "utf8", "boolean", "bool"]
        ):
            # Already representable in CSV as-is.
            continue
        stringified = pl.col(name).map_elements(_as_text, return_dtype=pl.Utf8)
        df = df.with_columns(stringified)

    return df
141
-
142
-
143
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Save the first *n* rows of *df* as ``<output stem>_preview.csv``.

    The preview sits beside *output_path* (same directory, same base name
    with the extension replaced). Returns the path of the preview file.
    """
    stem, _extension = os.path.splitext(output_path)
    preview_path = stem + "_preview.csv"

    row_count = min(n, len(df))
    head_rows = df.head(row_count)
    _safe_csv_df(head_rows).write_csv(preview_path)

    return preview_path
149
-
150
-
151
- # ---------------------------------------------------------------------------
152
- # Main export function
153
- # ---------------------------------------------------------------------------
154
-
155
def _tf_feature(value):
    """Encode one non-null Python scalar as a ``tf.train.Feature``.

    ints (including bools) become int64, floats become float, str/bytes
    become bytes; anything else is stored as its ``str()`` form in UTF-8.
    """
    if isinstance(value, int):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    if isinstance(value, float):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    if isinstance(value, str):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
    if isinstance(value, bytes):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))


def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
    """Load *file_path* and write it to *output_path* in the requested format.

    Args:
        file_path: Input file; any format understood by ``_load``.
        output_path: Destination file. Its parent directory is created.
        format: One of ``feather``, ``parquet``, ``csv``, ``jsonl``,
            ``arrow``/``ipc`` or ``tfrecord``.
        options: Optional settings. Keys read here: ``compression``
            (``"uncompressed"``, ``"none"``, ``"None"`` or ``None`` all mean
            no compression) and ``preview`` (write a CSV preview next to the
            output; not produced for ``csv`` and ``tfrecord``). The dict is
            also passed to ``_load``.

    Returns:
        On success a dict with ``success``, ``output_path``, ``rows``,
        ``columns``, ``format``, ``compression``, ``file_size_mb``,
        ``elapsed_seconds`` and, when generated, ``preview_path``.
        On failure ``{"error": "..."}``. Load and export errors are reported
        this way rather than raised.
    """
    options = options or {}
    t0 = time.perf_counter()

    # ---- Load ----
    try:
        df = _load(file_path, options)
    except Exception as e:
        return {"error": f"Failed to load input file: {str(e)}"}

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    preview_path = None
    generate_preview = options.get("preview", False)
    # Every spelling callers may use for "no compression".
    no_compression = ("uncompressed", "none", "None", None)

    try:
        # ---- Feather (Arrow IPC) – fastest binary format ----
        if format == "feather":
            if not HAS_PYARROW:
                return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
            compression = options.get("compression", "lz4")
            if compression in no_compression:
                compression = "uncompressed"
            # Written through pyarrow so the Feather compression can be chosen.
            arrow_table = df.to_arrow()
            pf.write_feather(arrow_table, output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Parquet – best compression, big-data friendly ----
        elif format == "parquet":
            compression = options.get("compression", "snappy")
            if compression in no_compression:
                compression = "uncompressed"
            df.write_parquet(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- CSV – human-readable fallback ----
        elif format == "csv":
            df = _safe_csv_df(df)
            df.write_csv(output_path)

        # ---- JSONL ----
        elif format == "jsonl":
            df.write_ndjson(output_path)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Arrow IPC (legacy name kept for compat) ----
        elif format in ("arrow", "ipc"):
            compression = options.get("compression", "uncompressed")
            # Accept the same "no compression" spellings as the branches above.
            if compression in no_compression:
                compression = None
            df.write_ipc(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- TFRecord ----
        elif format == "tfrecord":
            if not HAS_TENSORFLOW:
                return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
            with tf.io.TFRecordWriter(output_path) as writer:
                # Iterate with Polars so cells arrive as plain Python values.
                # Going through pandas yields numpy scalars (numpy integers
                # fail an isinstance(int) check) and turns nulls into NaN.
                for row in df.iter_rows(named=True):
                    feature = {
                        col: _tf_feature(value)
                        for col, value in row.items()
                        if value is not None  # nulls are simply left out
                    }
                    example = tf.train.Example(features=tf.train.Features(feature=feature))
                    writer.write(example.SerializeToString())

        else:
            return {"error": f"Unknown export format: {format}"}

        elapsed = round(time.perf_counter() - t0, 3)
        file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)

        result = {
            "success": True,
            "output_path": output_path,
            "rows": len(df),
            "columns": len(df.columns),
            "format": format,
            "compression": options.get("compression", "default"),
            "file_size_mb": file_size_mb,
            "elapsed_seconds": elapsed,
        }
        if preview_path:
            result["preview_path"] = preview_path

        return result

    except Exception as e:
        return {"error": f"Export failed: {str(e)}"}
262
-
263
-
264
def main():
    """Command-line entry point for the export engine.

    Usage:
        export_engine.py <input_file> <output_file> <format> [options_json]

    With fewer than three arguments a JSON usage error is written to stderr
    and the process exits with status 1. Otherwise the result of
    ``export_data`` is printed to stdout as one JSON object.

    ``options_json`` must be a JSON object. If it cannot be parsed, or is not
    an object, a warning is written to stderr and the export proceeds with
    default options instead of the problem being silently ignored.
    """
    if len(sys.argv) < 4:
        print(
            json.dumps({"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}),
            file=sys.stderr,
        )
        sys.exit(1)

    input_file, output_file, fmt = sys.argv[1:4]

    options = {}
    if len(sys.argv) > 4:
        try:
            parsed = json.loads(sys.argv[4])
        except ValueError as exc:  # json.JSONDecodeError is a ValueError
            print(f"Warning: ignoring malformed options JSON: {exc}", file=sys.stderr)
        else:
            if isinstance(parsed, dict):
                options = parsed
            elif parsed is not None:
                # export_data calls options.get(...); a list or scalar would crash there.
                print(
                    f"Warning: ignoring options JSON of type {type(parsed).__name__}; expected an object",
                    file=sys.stderr,
                )

    result = export_data(input_file, output_file, fmt, options)
    print(json.dumps(result))


if __name__ == "__main__":
    main()
@@ -1,100 +0,0 @@
1
- import os
2
- import json
3
- import warnings
4
-
5
- # --- PyTorch Adapter ---
6
try:
    import torch
    from torch.utils.data import Dataset
    import polars as pl

    class VesperPyTorchDataset(Dataset):
        """PyTorch ``Dataset`` over a Vesper export (Parquet, CSV or Arrow IPC).

        The file is read once with Polars in ``__init__`` and converted to a
        pandas DataFrame; rows are turned into tensors on access.

        Args:
            file_path: Path ending in ``.parquet``, ``.csv``, ``.arrow``,
                ``.feather`` or ``.ipc`` (matched case-insensitively).
            target_col: Optional label column. When set and present, items
                are ``(features, target)`` pairs; otherwise items are a
                single feature tensor built from every column.
            transform: Optional callable applied to each item before return.

        Raises:
            ValueError: if the file extension is not supported.
        """

        def __init__(self, file_path, target_col=None, transform=None):
            self.file_path = file_path
            self.target_col = target_col
            self.transform = transform

            # Auto-detect format
            lowered = file_path.lower()
            if lowered.endswith(".parquet"):
                self.df = pl.read_parquet(file_path)
            elif lowered.endswith(".csv"):
                self.df = pl.read_csv(file_path, ignore_errors=True)
            elif lowered.endswith((".arrow", ".feather", ".ipc")):
                # All three are Arrow IPC files.
                self.df = pl.read_ipc(file_path)
            else:
                raise ValueError(f"Unsupported file format for PyTorch loader: {file_path}")

            # Convert to pandas for easier row access in __getitem__
            # (Polars slice can be slow row-wise)
            self.data = self.df.to_pandas()

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]

            # Simple assumption: all columns except target are numeric features
            # In production, metadata would tell us which columns are features
            if self.target_col and self.target_col in row:
                y = row[self.target_col]
                # Cast explicitly: when the frame mixes dtypes (e.g. a string
                # label next to numeric features) the row is an object array,
                # which torch.tensor() cannot convert on its own.
                features = row.drop(self.target_col).to_numpy(dtype="float32")

                # Convert to tensors
                x = torch.tensor(features, dtype=torch.float32)
                # Auto-detect target type (scalar vs class index)
                if isinstance(y, (int, float)):
                    y = torch.tensor(y, dtype=torch.float32)  # Regression/Binary
                else:
                    # TODO: Label encoding if string; other targets are returned unchanged
                    pass

                sample = (x, y)
            else:
                # Unsupervised
                x = torch.tensor(row.to_numpy(dtype="float32"), dtype=torch.float32)
                sample = x

            if self.transform:
                sample = self.transform(sample)

            return sample

except ImportError:
    class VesperPyTorchDataset:
        """Placeholder used when PyTorch or Polars cannot be imported."""

        def __init__(self, *args, **kwargs):
            raise ImportError("PyTorch or Polars not installed.")
69
-
70
- # --- HuggingFace Adapter ---
71
try:
    from datasets import load_dataset as hf_load_dataset

    def load_vesper_dataset(file_path):
        """Load a Vesper export into a Hugging Face ``Dataset``.

        The loader is picked from the file extension (case-insensitive):
        ``.csv`` -> ``csv``; ``.jsonl``/``.ndjson``/``.json`` -> ``json``;
        ``.arrow``/``.feather``/``.ipc`` are read with Polars and converted
        via pandas; anything else is treated as Parquet.

        Returns:
            The ``train`` split produced by ``datasets.load_dataset``, or a
            ``Dataset`` built with ``Dataset.from_pandas`` for Arrow IPC input.
        """
        lowered = file_path.lower()

        # Arrow IPC files are read with Polars and handed over via pandas
        # rather than going through a load_dataset builder.
        if lowered.endswith((".arrow", ".feather", ".ipc")):
            import polars as pl
            from datasets import Dataset
            df = pl.read_ipc(file_path).to_pandas()
            return Dataset.from_pandas(df)

        if lowered.endswith(".csv"):
            output_format = "csv"
        elif lowered.endswith((".jsonl", ".ndjson", ".json")):
            # Plain .json used to fall through to the parquet builder.
            output_format = "json"
        else:
            output_format = "parquet"  # Default fallback

        return hf_load_dataset(output_format, data_files=file_path, split="train")

except ImportError:
    def load_vesper_dataset(*args, **kwargs):
        """Placeholder used when the ``datasets`` package cannot be imported."""
        raise ImportError("HuggingFace 'datasets' library not installed.")
97
-
98
if __name__ == "__main__":
    # This module is a library; running it directly only prints a usage hint.
    for banner_line in (
        "Vesper Framework Adapters Library",
        "Usage: import this module in your training script.",
    ):
        print(banner_line)