vesper-wizard 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. package/README.md +37 -322
  2. package/package.json +34 -100
  3. package/vesper-mcp-config.json +6 -0
  4. package/{scripts/wizard.js → wizard.js} +34 -10
  5. package/LICENSE +0 -21
  6. package/build/cache/cdn.js +0 -34
  7. package/build/cache/service.js +0 -63
  8. package/build/cleaning/cleaner.js +0 -81
  9. package/build/cleaning/evaluator.js +0 -89
  10. package/build/cleaning/executor.js +0 -62
  11. package/build/cleaning/exporter.js +0 -87
  12. package/build/cleaning/planner.js +0 -127
  13. package/build/cleaning/rules.js +0 -57
  14. package/build/cleaning/types.js +0 -1
  15. package/build/cloud/adapters/local.js +0 -37
  16. package/build/cloud/adapters/s3.js +0 -24
  17. package/build/cloud/adapters/supabase.js +0 -49
  18. package/build/cloud/storage-manager.js +0 -26
  19. package/build/cloud/types.js +0 -1
  20. package/build/compliance/service.js +0 -73
  21. package/build/compliance/store.js +0 -80
  22. package/build/compliance/types.js +0 -1
  23. package/build/config/config-manager.js +0 -221
  24. package/build/config/secure-keys.js +0 -51
  25. package/build/config/user-config.js +0 -48
  26. package/build/data/processing-worker.js +0 -23
  27. package/build/data/streaming.js +0 -38
  28. package/build/data/worker-pool.js +0 -39
  29. package/build/export/exporter.js +0 -82
  30. package/build/export/packager.js +0 -100
  31. package/build/export/types.js +0 -1
  32. package/build/fusion/aligner.js +0 -56
  33. package/build/fusion/deduplicator.js +0 -69
  34. package/build/fusion/engine.js +0 -69
  35. package/build/fusion/harmonizer.js +0 -39
  36. package/build/fusion/orchestrator.js +0 -86
  37. package/build/fusion/types.js +0 -1
  38. package/build/gateway/unified-dataset-gateway.js +0 -410
  39. package/build/index.js +0 -3068
  40. package/build/ingestion/hf-downloader.js +0 -171
  41. package/build/ingestion/ingestor.js +0 -271
  42. package/build/ingestion/kaggle-downloader.js +0 -102
  43. package/build/install/install-service.js +0 -46
  44. package/build/jobs/manager.js +0 -136
  45. package/build/jobs/queue.js +0 -59
  46. package/build/jobs/types.js +0 -1
  47. package/build/lib/supabase.js +0 -3
  48. package/build/metadata/dataworld-source.js +0 -89
  49. package/build/metadata/domain.js +0 -147
  50. package/build/metadata/github-scraper.js +0 -47
  51. package/build/metadata/institutional-scrapers.js +0 -49
  52. package/build/metadata/kaggle-scraper.js +0 -182
  53. package/build/metadata/kaggle-source.js +0 -70
  54. package/build/metadata/license.js +0 -68
  55. package/build/metadata/monitoring-service.js +0 -107
  56. package/build/metadata/monitoring-store.js +0 -78
  57. package/build/metadata/monitoring-types.js +0 -1
  58. package/build/metadata/openml-source.js +0 -87
  59. package/build/metadata/quality.js +0 -48
  60. package/build/metadata/rate-limiter.js +0 -128
  61. package/build/metadata/scraper.js +0 -448
  62. package/build/metadata/store.js +0 -340
  63. package/build/metadata/types.js +0 -1
  64. package/build/metadata/uci-scraper.js +0 -49
  65. package/build/monitoring/observability.js +0 -76
  66. package/build/preparation/target-detector.js +0 -75
  67. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  68. package/build/python/asset_downloader_engine.py +0 -94
  69. package/build/python/cleaner.py +0 -226
  70. package/build/python/config.py +0 -263
  71. package/build/python/convert_engine.py +0 -92
  72. package/build/python/dataworld_engine.py +0 -208
  73. package/build/python/export_engine.py +0 -288
  74. package/build/python/framework_adapters.py +0 -100
  75. package/build/python/fusion_engine.py +0 -368
  76. package/build/python/github_adapter.py +0 -106
  77. package/build/python/hf_fallback.py +0 -298
  78. package/build/python/image_engine.py +0 -86
  79. package/build/python/kaggle_engine.py +0 -295
  80. package/build/python/media_engine.py +0 -133
  81. package/build/python/nasa_adapter.py +0 -82
  82. package/build/python/normalize_engine.py +0 -83
  83. package/build/python/openml_engine.py +0 -146
  84. package/build/python/quality_engine.py +0 -267
  85. package/build/python/row_count.py +0 -54
  86. package/build/python/splitter_engine.py +0 -283
  87. package/build/python/target_engine.py +0 -154
  88. package/build/python/test_framework_adapters.py +0 -61
  89. package/build/python/test_fusion_engine.py +0 -89
  90. package/build/python/uci_adapter.py +0 -94
  91. package/build/python/vesper/__init__.py +0 -1
  92. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  93. package/build/python/vesper/core/__init__.py +0 -1
  94. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  95. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  96. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  97. package/build/python/vesper/core/asset_downloader.py +0 -679
  98. package/build/python/vesper/core/download_recipe.py +0 -104
  99. package/build/python/worldbank_adapter.py +0 -99
  100. package/build/quality/analyzer.js +0 -93
  101. package/build/quality/image-analyzer.js +0 -114
  102. package/build/quality/media-analyzer.js +0 -115
  103. package/build/quality/quality-orchestrator.js +0 -162
  104. package/build/quality/types.js +0 -1
  105. package/build/scripts/build-index.js +0 -54
  106. package/build/scripts/check-db.js +0 -73
  107. package/build/scripts/check-jobs.js +0 -24
  108. package/build/scripts/check-naruto.js +0 -17
  109. package/build/scripts/cleanup-kaggle.js +0 -41
  110. package/build/scripts/demo-full-pipeline.js +0 -62
  111. package/build/scripts/demo-ui.js +0 -58
  112. package/build/scripts/e2e-demo.js +0 -72
  113. package/build/scripts/massive-scrape.js +0 -103
  114. package/build/scripts/ops-dashboard.js +0 -33
  115. package/build/scripts/repro-bug.js +0 -37
  116. package/build/scripts/repro-export-bug.js +0 -56
  117. package/build/scripts/scrape-metadata.js +0 -100
  118. package/build/scripts/search-cli.js +0 -26
  119. package/build/scripts/test-bias.js +0 -45
  120. package/build/scripts/test-caching.js +0 -51
  121. package/build/scripts/test-cleaning.js +0 -76
  122. package/build/scripts/test-cloud-storage.js +0 -48
  123. package/build/scripts/test-compliance.js +0 -58
  124. package/build/scripts/test-conversion.js +0 -64
  125. package/build/scripts/test-custom-rules.js +0 -58
  126. package/build/scripts/test-db-opt.js +0 -63
  127. package/build/scripts/test-export-custom.js +0 -33
  128. package/build/scripts/test-exporter.js +0 -53
  129. package/build/scripts/test-fusion.js +0 -61
  130. package/build/scripts/test-github.js +0 -27
  131. package/build/scripts/test-group-split.js +0 -52
  132. package/build/scripts/test-hf-download.js +0 -29
  133. package/build/scripts/test-holdout-manager.js +0 -61
  134. package/build/scripts/test-hybrid-search.js +0 -41
  135. package/build/scripts/test-image-analysis.js +0 -50
  136. package/build/scripts/test-ingestion-infra.js +0 -39
  137. package/build/scripts/test-install.js +0 -40
  138. package/build/scripts/test-institutional.js +0 -26
  139. package/build/scripts/test-integrity.js +0 -41
  140. package/build/scripts/test-jit.js +0 -42
  141. package/build/scripts/test-job-queue.js +0 -62
  142. package/build/scripts/test-kaggle-download.js +0 -34
  143. package/build/scripts/test-large-data.js +0 -50
  144. package/build/scripts/test-mcp-v5.js +0 -74
  145. package/build/scripts/test-media-analysis.js +0 -61
  146. package/build/scripts/test-monitoring.js +0 -91
  147. package/build/scripts/test-observability.js +0 -106
  148. package/build/scripts/test-packager.js +0 -55
  149. package/build/scripts/test-pipeline.js +0 -50
  150. package/build/scripts/test-planning.js +0 -64
  151. package/build/scripts/test-privacy.js +0 -38
  152. package/build/scripts/test-production-sync.js +0 -36
  153. package/build/scripts/test-quality.js +0 -43
  154. package/build/scripts/test-robust-ingestion.js +0 -41
  155. package/build/scripts/test-schema.js +0 -45
  156. package/build/scripts/test-split-validation.js +0 -40
  157. package/build/scripts/test-splitter.js +0 -93
  158. package/build/scripts/test-target-detector.js +0 -29
  159. package/build/scripts/test-uci.js +0 -27
  160. package/build/scripts/test-unified-quality.js +0 -86
  161. package/build/scripts/test-write.js +0 -14
  162. package/build/scripts/verify-integration.js +0 -57
  163. package/build/scripts/verify-priority.js +0 -33
  164. package/build/search/embedder.js +0 -34
  165. package/build/search/engine.js +0 -190
  166. package/build/search/jit-orchestrator.js +0 -262
  167. package/build/search/query-intent.js +0 -509
  168. package/build/search/vector-store.js +0 -123
  169. package/build/splitting/splitter.js +0 -82
  170. package/build/splitting/types.js +0 -1
  171. package/build/tools/formatter.js +0 -251
  172. package/build/utils/downloader.js +0 -52
  173. package/build/utils/python-runtime.js +0 -130
  174. package/build/utils/selector.js +0 -69
  175. package/mcp-config-template.json +0 -18
  176. package/scripts/postinstall.cjs +0 -170
  177. package/scripts/preindex_registry.cjs +0 -157
  178. package/scripts/refresh-index.cjs +0 -87
  179. package/scripts/wizard.cjs +0 -601
  180. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  181. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  182. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  183. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  184. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  185. package/src/python/asset_downloader_engine.py +0 -94
  186. package/src/python/cleaner.py +0 -226
  187. package/src/python/config.py +0 -263
  188. package/src/python/convert_engine.py +0 -92
  189. package/src/python/dataworld_engine.py +0 -208
  190. package/src/python/export_engine.py +0 -288
  191. package/src/python/framework_adapters.py +0 -100
  192. package/src/python/fusion_engine.py +0 -368
  193. package/src/python/github_adapter.py +0 -106
  194. package/src/python/hf_fallback.py +0 -298
  195. package/src/python/image_engine.py +0 -86
  196. package/src/python/kaggle_engine.py +0 -295
  197. package/src/python/media_engine.py +0 -133
  198. package/src/python/nasa_adapter.py +0 -82
  199. package/src/python/normalize_engine.py +0 -83
  200. package/src/python/openml_engine.py +0 -146
  201. package/src/python/quality_engine.py +0 -267
  202. package/src/python/row_count.py +0 -54
  203. package/src/python/splitter_engine.py +0 -283
  204. package/src/python/target_engine.py +0 -154
  205. package/src/python/test_framework_adapters.py +0 -61
  206. package/src/python/test_fusion_engine.py +0 -89
  207. package/src/python/uci_adapter.py +0 -94
  208. package/src/python/vesper/__init__.py +0 -1
  209. package/src/python/vesper/core/__init__.py +0 -1
  210. package/src/python/vesper/core/asset_downloader.py +0 -679
  211. package/src/python/vesper/core/download_recipe.py +0 -104
  212. package/src/python/worldbank_adapter.py +0 -99
  213. package/wizard.cjs +0 -3
@@ -1,267 +0,0 @@
1
- import sys
2
- import json
3
- import polars as pl
4
- import numpy as np
5
-
6
- def analyze_column(df, col_name, dtype):
7
- stats = {
8
- "name": col_name,
9
- "type": str(dtype),
10
- "inferred_type": str(dtype), # Default to actual
11
- "missing_count": 0,
12
- "missing_percentage": 0.0,
13
- "unique_count": 0,
14
- "is_constant": False,
15
- "is_mixed_type": False
16
- }
17
-
18
- try:
19
- col = df[col_name]
20
- null_count = col.null_count()
21
- row_count = len(col)
22
-
23
- stats["missing_count"] = null_count
24
- stats["missing_percentage"] = (null_count / row_count) * 100 if row_count > 0 else 0
25
- stats["unique_count"] = col.n_unique()
26
- stats["is_constant"] = stats["unique_count"] <= 1 and row_count > 0
27
-
28
- # Schema Inference & Validation
29
- is_string = dtype == pl.Utf8 or dtype == pl.Object
30
-
31
- if is_string and row_count > 0:
32
- # Try inferring Numeric
33
- # Check if majority can be cast to float
34
- try:
35
- # Use strict=False to turn non-numbers into nulls
36
- numeric_cast = col.str.strip_chars().cast(pl.Float64, strict=False)
37
- numeric_nulls = numeric_cast.null_count()
38
-
39
- # If valid numbers are significantly more than original nulls, it might be numeric
40
- valid_numbers = row_count - numeric_nulls
41
- original_valid = row_count - null_count
42
-
43
- if valid_numbers > 0 and (valid_numbers / original_valid) > 0.9:
44
- stats["inferred_type"] = "Numeric (Stored as String)"
45
-
46
- # Mixed type check: If valid numbers exist but plenty of strings too
47
- elif valid_numbers > 0 and (valid_numbers / original_valid) < 0.9:
48
- stats["is_mixed_type"] = True
49
- except:
50
- pass
51
-
52
- # Numeric Analysis
53
- if dtype in [pl.Int64, pl.Int32, pl.Float64, pl.Float32] or stats["inferred_type"].startswith("Numeric"):
54
- clean_col = col
55
- if is_string:
56
- # Cast for analysis if it was inferred
57
- clean_col = col.str.strip_chars().cast(pl.Float64, strict=False)
58
-
59
- clean_col = clean_col.drop_nulls()
60
-
61
- if len(clean_col) > 0:
62
- stats["distribution"] = {
63
- "min": float(clean_col.min()),
64
- "max": float(clean_col.max()),
65
- "mean": float(clean_col.mean()),
66
- "std": float(clean_col.std()) if len(clean_col) > 1 else 0,
67
- "p25": float(clean_col.quantile(0.25)),
68
- "p50": float(clean_col.median()),
69
- "p75": float(clean_col.quantile(0.75))
70
- }
71
-
72
- # Categorical Analysis
73
- if dtype == pl.Utf8 or dtype == pl.Categorical:
74
- value_counts = col.value_counts(sort=True).head(5)
75
- # Handle different polars versions return structure for value_counts
76
- try:
77
- # Format: struct with name/counts or columns
78
- rows = value_counts.rows()
79
- top_values = {}
80
- for row in rows:
81
- val = str(row[0]) if row[0] is not None else "null"
82
- count = int(row[1])
83
- top_values[val] = count
84
- stats["top_values"] = top_values
85
- except:
86
- pass
87
-
88
- except Exception as e:
89
- stats["error"] = str(e)
90
-
91
- return stats
92
-
93
- def main():
94
- if len(sys.argv) < 2:
95
- print(json.dumps({"error": "No file path provided"}))
96
- sys.exit(1)
97
-
98
- file_path = sys.argv[1]
99
-
100
- try:
101
- # Robust file reading with extension detection
102
- file_path_lower = file_path.lower()
103
- if file_path_lower.endswith(".csv"):
104
- df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
105
- elif file_path_lower.endswith(".tsv"):
106
- df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
107
- elif file_path_lower.endswith(".txt"):
108
- sep = ","
109
- try:
110
- with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
111
- first_line = fh.readline()
112
- if "\t" in first_line:
113
- sep = "\t"
114
- except Exception:
115
- sep = ","
116
- df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
117
- elif file_path_lower.endswith(".parquet"):
118
- try:
119
- # Try scanning first (faster for large files)
120
- df = pl.scan_parquet(file_path).limit(10000).collect()
121
- except:
122
- df = pl.read_parquet(file_path)
123
- if len(df) > 10000: df = df.head(10000)
124
- elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
125
- # Explicit NDJSON
126
- df = pl.scan_ndjson(file_path).limit(10000).collect()
127
- elif file_path_lower.endswith(".json"):
128
- # Ambiguous .json: Try standard JSON first, then NDJSON fallback
129
- try:
130
- # read_json reads standard JSON array [{}, {}]
131
- df = pl.read_json(file_path)
132
- if len(df) > 10000: df = df.head(10000)
133
- except Exception:
134
- try:
135
- # Fallback to NDJSON (common for large datasets mislabeled as .json)
136
- df = pl.scan_ndjson(file_path).limit(10000).collect()
137
- except Exception as e:
138
- print(json.dumps({"error": f"Failed to read JSON: {str(e)}"}))
139
- sys.exit(1)
140
- else:
141
- print(json.dumps({"error": f"Unsupported file extension: {file_path}"}))
142
- sys.exit(1)
143
-
144
- row_count = len(df)
145
- column_count = len(df.columns)
146
-
147
- # Duplicate detection (exact)
148
- # NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
149
- # Use a Python fallback that is slower but robust for the 10k sampled rows.
150
- duplicate_count = 0
151
- try:
152
- seen = set()
153
- for row in df.to_dicts():
154
- row_key = json.dumps(row, sort_keys=True, default=str)
155
- if row_key in seen:
156
- duplicate_count += 1
157
- else:
158
- seen.add(row_key)
159
- except Exception:
160
- duplicate_count = 0
161
-
162
- columns_stats = []
163
- text_cols = []
164
- for col in df.columns:
165
- stats = analyze_column(df, col, df.schema[col])
166
- columns_stats.append(stats)
167
- # Check for String type (Polars can return 'String' or 'Utf8' depending on version)
168
- dtype_str = stats["type"]
169
- if ("String" in dtype_str or "Utf8" in dtype_str) and stats["unique_count"] > 1:
170
- text_cols.append(col)
171
-
172
- report = {
173
- "row_count": row_count,
174
- "column_count": column_count,
175
- "duplicate_rows": int(duplicate_count),
176
- "duplicate_percentage": (duplicate_count / row_count * 100) if row_count > 0 else 0,
177
- "columns": columns_stats,
178
- "warnings": [],
179
- "schema_warnings": [],
180
- "overall_score": 100
181
- }
182
-
183
- # Integrity Check 1: Text Duplicates (Fuzzyish Proxy)
184
- # If duplicated rows are 0, check if main text content is duplicated
185
- if duplicate_count == 0 and len(text_cols) > 0:
186
- # Pick longest text column as likely "content"
187
- # In real impl, we'd use heuristics. For now, first text col.
188
- target_col = text_cols[0]
189
- try:
190
- text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
191
- if text_dupes > 0:
192
- report["text_duplicates"] = int(text_dupes)
193
- if text_dupes > (row_count * 0.2):
194
- report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
195
- except Exception:
196
- # Skip text duplicate warning if backend cannot compute duplicates for this dtype
197
- pass
198
-
199
- # Integrity Check 2: Contamination / Leakage (Basic)
200
- # (Skipping correlation for now)
201
-
202
- report["class_imbalance_warnings"] = []
203
- report["pii_warnings"] = []
204
-
205
- # PII Patterns (Regex)
206
- import re
207
- pii_patterns = {
208
- "Email": r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
209
- "Phone": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', # Basic US-ish pattern
210
- "SSN": r'\d{3}-\d{2}-\d{4}',
211
- "IPv4": r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
212
- }
213
-
214
- # Bias & PII Analysis
215
- for col_name, stats in zip(df.columns, columns_stats):
216
- # Class Imbalance
217
- if stats["unique_count"] > 1 and stats["unique_count"] < 50:
218
- try:
219
- col = df[col_name]
220
- top_val_count = col.value_counts().sort("count", descending=True).row(0)[1]
221
- total = len(col)
222
- if total > 0:
223
- ratio = top_val_count / total
224
- if ratio > 0.9:
225
- report["class_imbalance_warnings"].append(f"Severe imbalance in '{col_name}': Top class is {(ratio*100):.1f}% of data")
226
- except:
227
- pass
228
-
229
- # PII Detection (on Text Columns only)
230
- if ("String" in stats["type"] or "Utf8" in stats["type"]):
231
- try:
232
- # Sample for performance (check first 1000 non-null values)
233
- sample_text = df[col_name].drop_nulls().head(1000).to_list()
234
- # Join a subset to regex against (faster than row-by-row for simple checks)
235
- combined_text = " ".join([str(x) for x in sample_text])
236
-
237
- for pii_type, pattern in pii_patterns.items():
238
- if re.search(pattern, combined_text):
239
- # Ensure we don't flag column names like "email_address" but actual content
240
- # Double check with a strict count if trigger found
241
- matches = len(re.findall(pattern, combined_text))
242
- if matches > 0:
243
- report["pii_warnings"].append(f"Potential {pii_type} detected in column '{col_name}' ({matches} matches in sample)")
244
- except:
245
- pass
246
-
247
- # Basic warnings
248
- if report["duplicate_percentage"] > 10:
249
- report["warnings"].append("High duplication rate (>10%)")
250
- if row_count < 50:
251
- report["warnings"].append("Dataset is very small (<50 rows)")
252
-
253
- # Schema warnings
254
- for col in columns_stats:
255
- if "Numeric" in col.get("inferred_type", "") and "Utf8" in col.get("type", ""):
256
- report["schema_warnings"].append(f"Column '{col['name']}' looks Numeric but is stored as String")
257
- if col.get("is_mixed_type"):
258
- report["schema_warnings"].append(f"Column '{col['name']}' likely contains mixed types (numbers and strings)")
259
-
260
- print(json.dumps(report))
261
-
262
- except Exception as e:
263
- print(json.dumps({"error": f"Analysis failed: {str(e)}"}))
264
- sys.exit(1)
265
-
266
- if __name__ == "__main__":
267
- main()
@@ -1,54 +0,0 @@
1
- import sys
2
- import json
3
- import os
4
-
5
- try:
6
- import polars as pl
7
- except Exception:
8
- print(json.dumps({"ok": False, "error": "polars is required"}))
9
- sys.exit(1)
10
-
11
-
12
- def count_rows(path: str) -> int:
13
- ext = os.path.splitext(path)[1].lower()
14
-
15
- if ext == ".csv":
16
- # Faster than full read for large csv
17
- return int(pl.scan_csv(path, ignore_errors=True).select(pl.len()).collect().item())
18
- if ext in [".parquet", ".pq"]:
19
- return int(pl.scan_parquet(path).select(pl.len()).collect().item())
20
- if ext in [".feather", ".ftr", ".arrow", ".ipc"]:
21
- return int(pl.scan_ipc(path).select(pl.len()).collect().item())
22
- if ext in [".jsonl", ".ndjson"]:
23
- return int(pl.scan_ndjson(path).select(pl.len()).collect().item())
24
- if ext == ".json":
25
- # fallback to eager for plain JSON arrays
26
- try:
27
- return int(pl.read_json(path).height)
28
- except Exception:
29
- return int(pl.scan_ndjson(path).select(pl.len()).collect().item())
30
-
31
- # unknown extension fallback
32
- return int(pl.read_csv(path, ignore_errors=True).height)
33
-
34
-
35
- def main():
36
- if len(sys.argv) < 2:
37
- print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
38
- sys.exit(1)
39
-
40
- p = sys.argv[1]
41
- if not os.path.exists(p):
42
- print(json.dumps({"ok": False, "error": f"File not found: {p}"}))
43
- sys.exit(1)
44
-
45
- try:
46
- rows = count_rows(p)
47
- print(json.dumps({"ok": True, "rows": rows}))
48
- except Exception as e:
49
- print(json.dumps({"ok": False, "error": str(e)}))
50
- sys.exit(1)
51
-
52
-
53
- if __name__ == "__main__":
54
- main()
@@ -1,283 +0,0 @@
1
- import sys
2
- import json
3
- import polars as pl
4
- import numpy as np
5
- from sklearn.model_selection import train_test_split
6
-
7
- def execute_split(file_path, config):
8
- # Load Data
9
- if file_path.endswith(".csv"):
10
- df = pl.read_csv(file_path, ignore_errors=True)
11
- elif file_path.endswith(".parquet"):
12
- df = pl.read_parquet(file_path)
13
- else:
14
- raise ValueError("Unsupported format")
15
-
16
- train_ratio = config["ratios"]["train"]
17
- val_ratio = config["ratios"]["val"]
18
- test_ratio = config["ratios"]["test"]
19
- holdout_ratio = config["ratios"].get("holdout", 0)
20
- seed = config.get("random_seed", 42)
21
- shuffle = config.get("shuffle", True)
22
-
23
- # Strategy
24
- strategy = config["type"]
25
- target_col = config.get("target_column", None)
26
- time_col = config.get("time_column", None)
27
-
28
- train_df, val_df, test_df, holdout_df = None, None, None, None
29
-
30
- # --- 1. RANDOM / STRATIFIED SPLIT ---
31
- if strategy in ["random", "stratified"]:
32
- if strategy == "random":
33
- if shuffle:
34
- df = df.sample(fraction=1.0, seed=seed, shuffle=True)
35
-
36
- n = len(df)
37
- n_train = int(n * train_ratio)
38
- n_val = int(n * val_ratio)
39
- n_test = int(n * test_ratio)
40
-
41
- train_df = df.slice(0, n_train)
42
- val_df = df.slice(n_train, n_val)
43
- test_df = df.slice(n_train + n_val, n_test)
44
- holdout_df = df.slice(n_train + n_val + n_test, n - (n_train + n_val + n_test))
45
-
46
- elif strategy == "stratified":
47
- if not target_col or target_col not in df.columns:
48
- return {"error": f"Target column '{target_col}' not found needed for stratification"}
49
-
50
- y = df[target_col].to_list()
51
- indices = np.arange(len(df))
52
-
53
- # Split 1: Train vs Others
54
- others_ratio = val_ratio + test_ratio + holdout_ratio
55
- if others_ratio == 0:
56
- train_idx, others_idx = indices, []
57
- else:
58
- train_idx, others_idx = train_test_split(indices, test_size=others_ratio, stratify=y, random_state=seed, shuffle=True)
59
-
60
- train_df = df[train_idx]
61
-
62
- if len(others_idx) > 0:
63
- y_others = [y[i] for i in others_idx]
64
-
65
- # Split 2: Val vs (Test + Holdout)
66
- test_holdout_ratio = (test_ratio + holdout_ratio) / others_ratio
67
- if test_holdout_ratio > 0 and test_holdout_ratio < 1:
68
- val_idx, test_holdout_idx = train_test_split(others_idx, test_size=test_holdout_ratio, stratify=y_others, random_state=seed, shuffle=True)
69
- val_df = df[val_idx]
70
-
71
- if len(test_holdout_idx) > 0:
72
- y_th = [y[i] for i in test_holdout_idx]
73
- relative_holdout_ratio = holdout_ratio / (test_ratio + holdout_ratio)
74
-
75
- if relative_holdout_ratio > 0 and relative_holdout_ratio < 1:
76
- test_idx, holdout_idx = train_test_split(test_holdout_idx, test_size=relative_holdout_ratio, stratify=y_th, random_state=seed, shuffle=True)
77
- test_df = df[test_idx]
78
- holdout_df = df[holdout_idx]
79
- elif relative_holdout_ratio >= 1:
80
- test_df = df.slice(0, 0)
81
- holdout_df = df[test_holdout_idx]
82
- else:
83
- test_df = df[test_holdout_idx]
84
- holdout_df = df.slice(0, 0)
85
- elif test_holdout_ratio >= 1:
86
- val_df = df.slice(0, 0)
87
- # Chained split for Test/Holdout
88
- y_th = y_others
89
- relative_holdout_ratio = holdout_ratio / (test_ratio + holdout_ratio)
90
- if relative_holdout_ratio > 0 and relative_holdout_ratio < 1:
91
- test_idx, holdout_idx = train_test_split(others_idx, test_size=relative_holdout_ratio, stratify=y_th, random_state=seed, shuffle=True)
92
- test_df = df[test_idx]
93
- holdout_df = df[holdout_idx]
94
- else:
95
- test_df = df[others_idx]
96
- holdout_df = df.slice(0, 0)
97
- else:
98
- val_df = df[others_idx]
99
- test_df = df.slice(0, 0)
100
- holdout_df = df.slice(0, 0)
101
-
102
- # --- 2. TIME-BASED SPLIT ---
103
- elif strategy == "time":
104
- if not time_col or time_col not in df.columns:
105
- return {"error": f"Time column '{time_col}' not found"}
106
-
107
- df = df.sort(time_col)
108
-
109
- n = len(df)
110
- n_train = int(n * train_ratio)
111
- n_val = int(n * val_ratio)
112
- n_test = int(n * test_ratio)
113
-
114
- train_df = df.slice(0, n_train)
115
- val_df = df.slice(n_train, n_val)
116
- test_df = df.slice(n_train + n_val, n_test)
117
- holdout_df = df.slice(n_train + n_val + n_test, n - (n_train + n_val + n_test))
118
-
119
- # --- 3. GROUP-BASED SPLIT ---
120
- elif strategy == "group":
121
- if not config.get("group_column") or config["group_column"] not in df.columns:
122
- return {"error": f"Group column '{config.get('group_column')}' not found"}
123
-
124
- group_col = config["group_column"]
125
- groups = df[group_col].unique().to_list()
126
-
127
- # Split groups first to ensure zero leakage
128
- n_grps = len(groups)
129
- n_train = int(n_grps * train_ratio)
130
- n_val = int(n_grps * val_ratio)
131
- n_test = int(n_grps * test_ratio)
132
-
133
- if shuffle:
134
- np.random.seed(seed)
135
- np.random.shuffle(groups)
136
-
137
- train_grps = set(groups[:n_train])
138
- val_grps = set(groups[n_train:n_train+n_val])
139
- test_grps = set(groups[n_train+n_val:n_train+n_val+n_test])
140
- holdout_grps = set(groups[n_train+n_val+n_test:])
141
-
142
- train_df = df.filter(pl.col(group_col).is_in(train_grps))
143
- val_df = df.filter(pl.col(group_col).is_in(val_grps))
144
- test_df = df.filter(pl.col(group_col).is_in(test_grps))
145
- holdout_df = df.filter(pl.col(group_col).is_in(holdout_grps))
146
-
147
- else:
148
- return {"error": f"Strategy {strategy} not implemented yet"}
149
-
150
- # Save outputs
151
- base_name = file_path.replace(".csv", "").replace(".parquet", "")
152
- train_path = f"{base_name}_train.csv"
153
- val_path = f"{base_name}_val.csv"
154
- test_path = f"{base_name}_test.csv"
155
- holdout_path = f"{base_name}_holdout.csv"
156
-
157
- train_df.write_csv(train_path)
158
- val_df.write_csv(val_path)
159
- test_df.write_csv(test_path)
160
- holdout_df.write_csv(holdout_path)
161
-
162
- return {
163
- "success": True,
164
- "paths": { "train": train_path, "val": val_path, "test": test_path, "holdout": holdout_path },
165
- "stats": {
166
- "train_rows": len(train_df),
167
- "val_rows": len(val_df),
168
- "test_rows": len(test_df),
169
- "holdout_rows": len(holdout_df)
170
- }
171
- }
172
-
173
- def validate_split(config):
174
- # Config contains paths to check and optional ID column
175
- train_path = config["paths"]["train"]
176
- val_path = config["paths"]["val"]
177
- test_path = config["paths"]["test"]
178
- holdout_path = config["paths"].get("holdout")
179
- id_col = config.get("id_column", "id") # Default to 'id' if exists
180
- target_col = config.get("target_column", None)
181
-
182
- # Load dfs
183
- try:
184
- train_df = pl.read_csv(train_path) if train_path.endswith(".csv") else pl.read_parquet(train_path)
185
- val_df = pl.read_csv(val_path) if val_path.endswith(".csv") else pl.read_parquet(val_path)
186
- test_df = pl.read_csv(test_path) if test_path.endswith(".csv") else pl.read_parquet(test_path)
187
- holdout_df = None
188
- if holdout_path:
189
- holdout_df = pl.read_csv(holdout_path) if holdout_path.endswith(".csv") else pl.read_parquet(holdout_path)
190
- except:
191
- return {"error": "Failed to load split files for validation"}
192
-
193
- report = {
194
- "leakage_detected": False,
195
- "leakage_count": 0,
196
- "distribution_mismatch": False,
197
- "warnings": []
198
- }
199
-
200
- # 1. Leakage Check (ID intersection)
201
- if id_col in train_df.columns:
202
- train_ids = set(train_df[id_col].to_list())
203
- val_ids = set(val_df[id_col].to_list())
204
- test_ids = set(test_df[id_col].to_list())
205
- holdout_ids = set(holdout_df[id_col].to_list()) if holdout_df is not None else set()
206
-
207
- leakage_tv = len(train_ids.intersection(val_ids))
208
- leakage_tt = len(train_ids.intersection(test_ids))
209
- leakage_th = len(train_ids.intersection(holdout_ids))
210
- leakage_vt = len(val_ids.intersection(test_ids))
211
- leakage_vh = len(val_ids.intersection(holdout_ids))
212
- leakage_th_val = len(test_ids.intersection(holdout_ids))
213
-
214
- total_leakage = leakage_tv + leakage_tt + leakage_th + leakage_vt + leakage_vh + leakage_th_val
215
-
216
- if total_leakage > 0:
217
- report["leakage_detected"] = True
218
- report["leakage_count"] = total_leakage
219
- report["warnings"].append(f"Found {total_leakage} overlapping IDs between splits.")
220
- else:
221
- report["warnings"].append(f"ID column '{id_col}' not found. Skipping exact leakage check.")
222
-
223
- # 2. Distribution Check (Target Distribution)
224
- if target_col and target_col in train_df.columns:
225
- try:
226
- def get_ratios(df, col):
227
- counts = df[col].value_counts()
228
- total = len(df)
229
- ratios = {}
230
- for row in counts.rows():
231
- ratios[str(row[0])] = row[1] / total
232
- return ratios
233
-
234
- train_metrics = get_ratios(train_df, target_col)
235
- val_metrics = get_ratios(val_df, target_col)
236
- # test_metrics = get_ratios(test_df, target_col) # Optional: could check all
237
-
238
- for cls in train_metrics:
239
- train_r = train_metrics[cls]
240
- val_r = val_metrics.get(cls, 0)
241
- diff = abs(train_r - val_r)
242
- if diff > 0.1: # 10% drift
243
- report["distribution_mismatch"] = True
244
- report["warnings"].append(f"Class '{cls}' drift: Train={train_r:.2f}, Val={val_r:.2f}")
245
- except:
246
- pass
247
-
248
- return report
249
-
250
- def main():
251
- # Usage:
252
- # split: python splitter_engine.py split <file_path> <config_json>
253
- # validate: python splitter_engine.py validate <config_json> (dummy file arg ignored)
254
-
255
- if len(sys.argv) < 3:
256
- print(json.dumps({"error": "Usage: splitter_engine.py <action> <arg1> [arg2]"}), file=sys.stderr)
257
- sys.exit(1)
258
-
259
- action = sys.argv[1]
260
-
261
- try:
262
- if action == "split":
263
- file_path = sys.argv[2]
264
- config = json.loads(sys.argv[3])
265
- result = execute_split(file_path, config)
266
- print(json.dumps(result))
267
-
268
- elif action == "validate":
269
- config = json.loads(sys.argv[2])
270
- result = validate_split(config)
271
- print(json.dumps(result))
272
-
273
- else:
274
- # Fallback for old calls (implicit split) - if users used old signature
275
- # But since we control the caller, we can just update the caller (DataSplitter.ts).
276
- raise ValueError(f"Unknown action: {action}")
277
-
278
- except Exception as e:
279
- print(json.dumps({"success": False, "error": str(e)}))
280
- sys.exit(1)
281
-
282
- if __name__ == "__main__":
283
- main()