vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,226 +0,0 @@
1
- import sys
2
- import json
3
- import polars as pl
4
- import numpy as np
5
-
6
- # --- Operations Library ---
7
-
8
- def op_remove_duplicates(df, params):
9
- subset = params.get("subset", None) # List of cols or None
10
- before = len(df)
11
- if subset:
12
- df = df.unique(subset=subset)
13
- else:
14
- df = df.unique()
15
- return df, {"rows_removed": before - len(df)}
16
-
17
- def op_drop_columns(df, params):
18
- cols = params.get("columns", [])
19
- before = len(df.columns)
20
- # Filter only existing cols to avoid errors
21
- cols_to_drop = [c for c in cols if c in df.columns]
22
- df = df.drop(cols_to_drop)
23
- return df, {"columns_dropped": len(cols_to_drop)}
24
-
25
- def op_fill_missing(df, params):
26
- col = params["column"]
27
- method = params.get("method", "mean") # mean, median, mode, constant
28
- value = params.get("value", None)
29
-
30
- if col not in df.columns:
31
- return df, {"error": f"Column {col} not found"}
32
-
33
- affected = df[col].null_count()
34
-
35
- if method == "constant":
36
- df = df.with_columns(pl.col(col).fill_null(value))
37
- elif method == "mean":
38
- mean_val = df[col].mean()
39
- df = df.with_columns(pl.col(col).fill_null(mean_val))
40
- elif method == "median":
41
- median_val = df[col].median()
42
- df = df.with_columns(pl.col(col).fill_null(median_val))
43
-
44
- return df, {"rows_imputed": affected}
45
-
46
- def op_fix_types(df, params):
47
- col = params["column"]
48
- target_type = params["type"] # "int", "float", "string", "date"
49
-
50
- if col not in df.columns:
51
- return df, {"error": f"Column {col} not found"}
52
-
53
- try:
54
- if target_type == "int":
55
- df = df.with_columns(pl.col(col).cast(pl.Int64, strict=False))
56
- elif target_type == "float":
57
- df = df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
58
- elif target_type == "string":
59
- df = df.with_columns(pl.col(col).cast(pl.Utf8))
60
- elif target_type == "date":
61
- df = df.with_columns(pl.col(col).str.to_date(strict=False))
62
-
63
- return df, {"status": "Converted"}
64
- except Exception as e:
65
- return df, {"error": str(e)}
66
-
67
- def op_remove_outliers(df, params):
68
- col = params["column"]
69
- method = params.get("method", "iqr")
70
- threshold = params.get("threshold", 1.5)
71
-
72
- if col not in df.columns:
73
- return df, {"error": f"Column {col} not found"}
74
-
75
- before = len(df)
76
-
77
- if method == "iqr":
78
- q1 = df[col].quantile(0.25)
79
- q3 = df[col].quantile(0.75)
80
- iqr = q3 - q1
81
- lower = q1 - (threshold * iqr)
82
- upper = q3 + (threshold * iqr)
83
-
84
- df = df.filter((pl.col(col) >= lower) & (pl.col(col) <= upper))
85
-
86
- return df, {"rows_removed": before - len(df)}
87
-
88
- def op_encode_categories(df, params):
89
- col = params["column"]
90
- method = params.get("method", "label") # label, onehot
91
-
92
- if col not in df.columns:
93
- return df, {"error": f"Column {col} not found"}
94
-
95
- if method == "label":
96
- # Polars dense_rank acts similar to label encoding
97
- df = df.with_columns(pl.col(col).rank("dense").alias(f"{col}_encoded"))
98
- elif method == "onehot":
99
- dummies = df[col].to_dummies()
100
- df = pl.concat([df, dummies], how="horizontal")
101
-
102
- return df, {"status": f"Encoded using {method}"}
103
-
104
- # --- Registry ---
105
-
106
- OPERATIONS = {
107
- "RemoveDuplicates": op_remove_duplicates,
108
- "DropColumns": op_drop_columns,
109
- "FillMissing": op_fill_missing,
110
- "FixTypes": op_fix_types,
111
- "RemoveOutliers": op_remove_outliers,
112
- "EncodeCategories": op_encode_categories
113
- }
114
-
115
- def main():
116
- if len(sys.argv) < 3:
117
- print(json.dumps({"error": "Usage: cleaner.py <file_path> <operations_json>"}), file=sys.stderr)
118
- sys.exit(1)
119
-
120
- file_path = sys.argv[1]
121
- ops_json = sys.argv[2]
122
-
123
- try:
124
- operations = json.loads(ops_json)
125
-
126
- # Load Data
127
- file_path_lower = file_path.lower()
128
- if file_path_lower.endswith(".csv"):
129
- df = pl.read_csv(file_path, ignore_errors=True)
130
- elif file_path_lower.endswith(".parquet"):
131
- df = pl.read_parquet(file_path)
132
- elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
133
- # Explicit NDJSON
134
- df = pl.read_ndjson(file_path)
135
- elif file_path_lower.endswith(".json"):
136
- # Ambiguous .json
137
- try:
138
- df = pl.read_json(file_path)
139
- except Exception:
140
- try:
141
- df = pl.read_ndjson(file_path)
142
- except Exception as e:
143
- raise ValueError(f"Failed to read JSON: {str(e)}")
144
- else:
145
- raise ValueError(f"Unsupported format: {file_path}")
146
-
147
- logs = []
148
- total_rows_affected = 0
149
-
150
- # Execute Pipeline
151
- for op in operations:
152
- op_type = op["type"]
153
- params = op.get("params", {})
154
-
155
- if op_type == "RenameTarget":
156
- old_name = params.get("old_name")
157
- new_name = params.get("new_name", "target")
158
- if old_name and old_name in df.columns:
159
- df = df.rename({old_name: new_name})
160
- logs.append(f"Renamed column '{old_name}' to '{new_name}'")
161
- else:
162
- logs.append(f"Failed RenameTarget: Column '{old_name}' not found or not specified.")
163
- elif op_type in OPERATIONS:
164
- try:
165
- df, stats = OPERATIONS[op_type](df, params)
166
- logs.append(f"Executed {op_type}: {stats}")
167
- total_rows_affected += stats.get("rows_removed", 0)
168
- except Exception as e:
169
- logs.append(f"Failed {op_type}: {str(e)}")
170
- else:
171
- logs.append(f"Unknown operation: {op_type}")
172
-
173
- # Save Result (overwrite or new file)
174
- # Save Result (overwrite or new file)
175
- output_format = sys.argv[3] if len(sys.argv) > 3 else None
176
-
177
- if not output_format:
178
- # Legacy logic: preserve CSV or default to parquet
179
- if file_path_lower.endswith(".csv"):
180
- output_format = "csv"
181
- else:
182
- output_format = "parquet"
183
-
184
- base_name = file_path.rsplit(".", 1)[0]
185
- if output_format == "csv":
186
- output_path = f"{base_name}_cleaned.csv"
187
- # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
188
- for col in df.columns:
189
- dtype = df.schema[col]
190
- # Only keep simple types; stringify everything else for CSV
191
- is_simple = (
192
- dtype.is_numeric() or
193
- dtype.is_temporal() or
194
- str(dtype).lower() in ["string", "utf8", "boolean", "bool"]
195
- )
196
- if not is_simple:
197
- # Use a robust helper for clean JSON serialization
198
- def safe_serialize(val):
199
- try:
200
- # Handle Polars nested types (convert to Python list/dict first)
201
- if hasattr(val, "to_list"):
202
- return json.dumps(val.to_list())
203
- if hasattr(val, "to_dict"):
204
- return json.dumps(val.to_dict())
205
- return json.dumps(val)
206
- except:
207
- return str(val)
208
- df = df.with_columns(pl.col(col).map_elements(safe_serialize, return_dtype=pl.Utf8))
209
- df.write_csv(output_path)
210
- else:
211
- output_path = f"{base_name}_cleaned.parquet"
212
- df.write_parquet(output_path)
213
-
214
- print(json.dumps({
215
- "success": True,
216
- "output_path": output_path,
217
- "rows_affected": total_rows_affected,
218
- "logs": logs
219
- }, default=str))
220
-
221
- except Exception as e:
222
- print(json.dumps({"success": False, "error": str(e)}, default=str))
223
- sys.exit(1)
224
-
225
- if __name__ == "__main__":
226
- main()
@@ -1,263 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
- import base64
5
- import hashlib
6
- import secrets
7
- from pathlib import Path
8
- from typing import Dict, Optional
9
-
10
- SERVICE_NAME = "vesper"
11
-
12
- KEY_ALIASES = {
13
- "hf_token": ["HF_TOKEN", "HUGGINGFACE_TOKEN"],
14
- "kaggle_username": ["KAGGLE_USERNAME"],
15
- "kaggle_key": ["KAGGLE_KEY"],
16
- "dataworld_token": ["DW_AUTH_TOKEN"],
17
- }
18
-
19
- try:
20
- import keyring # type: ignore
21
- HAS_KEYRING = True
22
- except Exception:
23
- HAS_KEYRING = False
24
-
25
- try:
26
- from cryptography.fernet import Fernet, InvalidToken # type: ignore
27
- HAS_FERNET = True
28
- except Exception:
29
- HAS_FERNET = False
30
-
31
-
32
- def _config_path() -> Path:
33
- return Path.home() / ".vesper" / "config.toml"
34
-
35
-
36
- def _secret_path() -> Path:
37
- return Path.home() / ".vesper" / ".config_key"
38
-
39
-
40
- def _ensure_parent(path: Path) -> None:
41
- path.parent.mkdir(parents=True, exist_ok=True)
42
-
43
-
44
- def _read_fallback_toml() -> Dict[str, str]:
45
- path = _config_path()
46
- if not path.exists():
47
- return {}
48
-
49
- values: Dict[str, str] = {}
50
- in_keys = False
51
- method = ""
52
-
53
- for raw in path.read_text(encoding="utf-8").splitlines():
54
- line = raw.strip()
55
- if not line or line.startswith("#"):
56
- continue
57
- if line.startswith("[") and line.endswith("]"):
58
- in_keys = (line == "[keys]")
59
- continue
60
- if line.startswith("method") and "=" in line:
61
- method = line.split("=", 1)[1].strip().strip('"').strip("'")
62
- continue
63
- if not in_keys or "=" not in line:
64
- continue
65
-
66
- key, val = line.split("=", 1)
67
- key = key.strip()
68
- val = val.strip().strip('"').strip("'")
69
- values[key] = val
70
-
71
- if method:
72
- values["__method__"] = method
73
-
74
- return values
75
-
76
-
77
- def _get_or_create_local_secret() -> str:
78
- secret_file = _secret_path()
79
- _ensure_parent(secret_file)
80
-
81
- if secret_file.exists():
82
- return secret_file.read_text(encoding="utf-8").strip()
83
-
84
- secret = base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8")
85
- secret_file.write_text(secret, encoding="utf-8")
86
- try:
87
- os.chmod(secret_file, 0o600)
88
- except Exception:
89
- pass
90
- return secret
91
-
92
-
93
- def _xor_encrypt(plain: str, secret: str) -> str:
94
- key = hashlib.sha256(secret.encode("utf-8")).digest()
95
- data = plain.encode("utf-8")
96
- out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
97
- return base64.urlsafe_b64encode(out).decode("utf-8")
98
-
99
-
100
- def _xor_decrypt(cipher_text: str, secret: str) -> str:
101
- key = hashlib.sha256(secret.encode("utf-8")).digest()
102
- data = base64.urlsafe_b64decode(cipher_text.encode("utf-8"))
103
- out = bytes([data[i] ^ key[i % len(key)] for i in range(len(data))])
104
- return out.decode("utf-8")
105
-
106
-
107
- def _encrypt_value(value: str, secret: str) -> Dict[str, str]:
108
- if HAS_FERNET:
109
- token = Fernet(secret.encode("utf-8")).encrypt(value.encode("utf-8")).decode("utf-8")
110
- return {"method": "fernet", "value": token}
111
- # fallback encryption (weaker than fernet, but still not plaintext)
112
- return {"method": "xor", "value": _xor_encrypt(value, secret)}
113
-
114
-
115
- def _decrypt_value(value: str, method: str, secret: str) -> Optional[str]:
116
- try:
117
- if method == "fernet" and HAS_FERNET:
118
- return Fernet(secret.encode("utf-8")).decrypt(value.encode("utf-8")).decode("utf-8")
119
- if method == "xor":
120
- return _xor_decrypt(value, secret)
121
- return None
122
- except InvalidToken:
123
- return None
124
- except Exception:
125
- return None
126
-
127
-
128
- def _write_fallback_toml(values: Dict[str, str]) -> None:
129
- path = _config_path()
130
- _ensure_parent(path)
131
-
132
- method = values.get("__method__", "fernet" if HAS_FERNET else "xor")
133
- lines = [
134
- "# Vesper optional API keys fallback storage",
135
- "# Encrypted fallback (keyring is preferred)",
136
- "[meta]",
137
- f'method = "{method}"',
138
- "[keys]",
139
- ]
140
- for key in sorted(values.keys()):
141
- if key.startswith("__"):
142
- continue
143
- val = str(values[key]).replace('"', '\\"')
144
- lines.append(f'{key} = "{val}"')
145
-
146
- path.write_text("\n".join(lines) + "\n", encoding="utf-8")
147
-
148
-
149
- def _get_from_env(name: str) -> Optional[str]:
150
- for env_key in KEY_ALIASES.get(name, []):
151
- val = os.getenv(env_key)
152
- if val:
153
- return val
154
- return None
155
-
156
-
157
- def get_key(name: str) -> Optional[str]:
158
- # 1) keyring (secure)
159
- if HAS_KEYRING:
160
- try:
161
- val = keyring.get_password(SERVICE_NAME, name)
162
- if val:
163
- return val
164
- except Exception:
165
- pass
166
-
167
- # 2) encrypted fallback config.toml
168
- fallback = _read_fallback_toml()
169
- enc = fallback.get(name)
170
- if enc:
171
- secret = _get_or_create_local_secret()
172
- method = fallback.get("__method__", "fernet" if HAS_FERNET else "xor")
173
- dec = _decrypt_value(enc, method, secret)
174
- if dec:
175
- return dec
176
-
177
- # 3) env vars (fallback only)
178
- env_val = _get_from_env(name)
179
- if env_val:
180
- return env_val
181
- return None
182
-
183
-
184
- def set_key(name: str, value: str) -> Dict[str, str]:
185
- if not value:
186
- return {"ok": "false", "method": "none", "error": "Empty value"}
187
-
188
- if HAS_KEYRING:
189
- try:
190
- keyring.set_password(SERVICE_NAME, name, value)
191
- return {"ok": "true", "method": "keyring"}
192
- except Exception:
193
- pass
194
-
195
- fallback = _read_fallback_toml()
196
- secret = _get_or_create_local_secret()
197
- enc = _encrypt_value(value, secret)
198
- fallback["__method__"] = enc["method"]
199
- fallback[name] = enc["value"]
200
- _write_fallback_toml(fallback)
201
- return {"ok": "true", "method": f'toml:{enc["method"]}'}
202
-
203
-
204
- def has_key(name: str) -> bool:
205
- return bool(get_key(name))
206
-
207
-
208
- def get_all() -> Dict[str, Optional[str]]:
209
- return {
210
- "hf_token": get_key("hf_token"),
211
- "kaggle_username": get_key("kaggle_username"),
212
- "kaggle_key": get_key("kaggle_key"),
213
- "dataworld_token": get_key("dataworld_token"),
214
- }
215
-
216
-
217
- def _print_json(data):
218
- print(json.dumps(data))
219
-
220
-
221
- def main() -> None:
222
- if len(sys.argv) < 2:
223
- _print_json({
224
- "ok": False,
225
- "error": "Usage: config.py <get|set|has|all> [name] [value]",
226
- })
227
- sys.exit(1)
228
-
229
- cmd = sys.argv[1].lower()
230
-
231
- if cmd == "all":
232
- _print_json({"ok": True, "data": get_all()})
233
- return
234
-
235
- if len(sys.argv) < 3:
236
- _print_json({"ok": False, "error": "Missing key name"})
237
- sys.exit(1)
238
-
239
- name = sys.argv[2]
240
-
241
- if cmd == "get":
242
- _print_json({"ok": True, "name": name, "value": get_key(name)})
243
- return
244
-
245
- if cmd == "has":
246
- _print_json({"ok": True, "name": name, "value": has_key(name)})
247
- return
248
-
249
- if cmd == "set":
250
- if len(sys.argv) < 4:
251
- _print_json({"ok": False, "error": "Missing value for set"})
252
- sys.exit(1)
253
- value = sys.argv[3]
254
- result = set_key(name, value)
255
- _print_json({"ok": result.get("ok") == "true", "name": name, "method": result.get("method"), "error": result.get("error")})
256
- return
257
-
258
- _print_json({"ok": False, "error": f"Unknown command: {cmd}"})
259
- sys.exit(1)
260
-
261
-
262
- if __name__ == "__main__":
263
- main()
@@ -1,92 +0,0 @@
1
- """
2
- Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
3
- Usage: convert_engine.py <input_path> <output_path>
4
- Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
5
- """
6
- import sys
7
- import json
8
- import os
9
-
10
- try:
11
- import polars as pl
12
- except Exception:
13
- print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
14
- sys.exit(1)
15
-
16
-
17
- def _load(src: str) -> pl.DataFrame:
18
- ext = os.path.splitext(src)[1].lower()
19
- if ext == ".csv":
20
- return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
21
- if ext in (".tsv", ".tab"):
22
- return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
23
- if ext in (".parquet", ".pq"):
24
- return pl.read_parquet(src)
25
- if ext in (".feather", ".ftr", ".arrow", ".ipc"):
26
- return pl.read_ipc(src)
27
- if ext in (".jsonl", ".ndjson"):
28
- return pl.read_ndjson(src)
29
- if ext == ".json":
30
- raw = open(src, "r", encoding="utf-8").read().strip()
31
- if raw.startswith("["):
32
- return pl.read_json(src)
33
- if "\n" in raw and raw.split("\n")[0].strip().startswith("{"):
34
- return pl.read_ndjson(src)
35
- obj = json.loads(raw)
36
- if isinstance(obj, dict):
37
- for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
38
- if key in obj and isinstance(obj[key], list):
39
- return pl.DataFrame(obj[key])
40
- for v in obj.values():
41
- if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
42
- return pl.DataFrame(v)
43
- return pl.read_json(src)
44
- # Fallback: try csv
45
- return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
46
-
47
-
48
- def _write(df: pl.DataFrame, dst: str) -> None:
49
- ext = os.path.splitext(dst)[1].lower()
50
- os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
51
- if ext in (".parquet", ".pq"):
52
- df.write_parquet(dst)
53
- elif ext == ".csv":
54
- df.write_csv(dst)
55
- elif ext == ".json":
56
- df.write_json(dst, row_oriented=True)
57
- elif ext in (".jsonl", ".ndjson"):
58
- df.write_ndjson(dst)
59
- else:
60
- raise ValueError(f"Unsupported output format: {ext}")
61
-
62
-
63
- def main():
64
- if len(sys.argv) < 3:
65
- print(json.dumps({"ok": False, "error": "Usage: convert_engine.py <input> <output>"}))
66
- sys.exit(1)
67
-
68
- input_path = sys.argv[1]
69
- output_path = sys.argv[2]
70
-
71
- if not os.path.exists(input_path):
72
- print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
73
- sys.exit(1)
74
-
75
- try:
76
- df = _load(input_path)
77
- _write(df, output_path)
78
- size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)
79
- print(json.dumps({
80
- "ok": True,
81
- "output_path": output_path,
82
- "rows": df.height,
83
- "columns": df.width,
84
- "size_mb": size_mb,
85
- }))
86
- except Exception as e:
87
- print(json.dumps({"ok": False, "error": str(e)}))
88
- sys.exit(1)
89
-
90
-
91
- if __name__ == "__main__":
92
- main()