vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import polars as pl
|
|
4
|
-
import numpy as np
|
|
5
|
-
|
|
6
|
-
def analyze_column(df, col_name, dtype):
|
|
7
|
-
stats = {
|
|
8
|
-
"name": col_name,
|
|
9
|
-
"type": str(dtype),
|
|
10
|
-
"inferred_type": str(dtype), # Default to actual
|
|
11
|
-
"missing_count": 0,
|
|
12
|
-
"missing_percentage": 0.0,
|
|
13
|
-
"unique_count": 0,
|
|
14
|
-
"is_constant": False,
|
|
15
|
-
"is_mixed_type": False
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
try:
|
|
19
|
-
col = df[col_name]
|
|
20
|
-
null_count = col.null_count()
|
|
21
|
-
row_count = len(col)
|
|
22
|
-
|
|
23
|
-
stats["missing_count"] = null_count
|
|
24
|
-
stats["missing_percentage"] = (null_count / row_count) * 100 if row_count > 0 else 0
|
|
25
|
-
stats["unique_count"] = col.n_unique()
|
|
26
|
-
stats["is_constant"] = stats["unique_count"] <= 1 and row_count > 0
|
|
27
|
-
|
|
28
|
-
# Schema Inference & Validation
|
|
29
|
-
is_string = dtype == pl.Utf8 or dtype == pl.Object
|
|
30
|
-
|
|
31
|
-
if is_string and row_count > 0:
|
|
32
|
-
# Try inferring Numeric
|
|
33
|
-
# Check if majority can be cast to float
|
|
34
|
-
try:
|
|
35
|
-
# Use strict=False to turn non-numbers into nulls
|
|
36
|
-
numeric_cast = col.str.strip_chars().cast(pl.Float64, strict=False)
|
|
37
|
-
numeric_nulls = numeric_cast.null_count()
|
|
38
|
-
|
|
39
|
-
# If valid numbers are significantly more than original nulls, it might be numeric
|
|
40
|
-
valid_numbers = row_count - numeric_nulls
|
|
41
|
-
original_valid = row_count - null_count
|
|
42
|
-
|
|
43
|
-
if valid_numbers > 0 and (valid_numbers / original_valid) > 0.9:
|
|
44
|
-
stats["inferred_type"] = "Numeric (Stored as String)"
|
|
45
|
-
|
|
46
|
-
# Mixed type check: If valid numbers exist but plenty of strings too
|
|
47
|
-
elif valid_numbers > 0 and (valid_numbers / original_valid) < 0.9:
|
|
48
|
-
stats["is_mixed_type"] = True
|
|
49
|
-
except:
|
|
50
|
-
pass
|
|
51
|
-
|
|
52
|
-
# Numeric Analysis
|
|
53
|
-
if dtype in [pl.Int64, pl.Int32, pl.Float64, pl.Float32] or stats["inferred_type"].startswith("Numeric"):
|
|
54
|
-
clean_col = col
|
|
55
|
-
if is_string:
|
|
56
|
-
# Cast for analysis if it was inferred
|
|
57
|
-
clean_col = col.str.strip_chars().cast(pl.Float64, strict=False)
|
|
58
|
-
|
|
59
|
-
clean_col = clean_col.drop_nulls()
|
|
60
|
-
|
|
61
|
-
if len(clean_col) > 0:
|
|
62
|
-
stats["distribution"] = {
|
|
63
|
-
"min": float(clean_col.min()),
|
|
64
|
-
"max": float(clean_col.max()),
|
|
65
|
-
"mean": float(clean_col.mean()),
|
|
66
|
-
"std": float(clean_col.std()) if len(clean_col) > 1 else 0,
|
|
67
|
-
"p25": float(clean_col.quantile(0.25)),
|
|
68
|
-
"p50": float(clean_col.median()),
|
|
69
|
-
"p75": float(clean_col.quantile(0.75))
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
# Categorical Analysis
|
|
73
|
-
if dtype == pl.Utf8 or dtype == pl.Categorical:
|
|
74
|
-
value_counts = col.value_counts(sort=True).head(5)
|
|
75
|
-
# Handle different polars versions return structure for value_counts
|
|
76
|
-
try:
|
|
77
|
-
# Format: struct with name/counts or columns
|
|
78
|
-
rows = value_counts.rows()
|
|
79
|
-
top_values = {}
|
|
80
|
-
for row in rows:
|
|
81
|
-
val = str(row[0]) if row[0] is not None else "null"
|
|
82
|
-
count = int(row[1])
|
|
83
|
-
top_values[val] = count
|
|
84
|
-
stats["top_values"] = top_values
|
|
85
|
-
except:
|
|
86
|
-
pass
|
|
87
|
-
|
|
88
|
-
except Exception as e:
|
|
89
|
-
stats["error"] = str(e)
|
|
90
|
-
|
|
91
|
-
return stats
|
|
92
|
-
|
|
93
|
-
def main():
|
|
94
|
-
if len(sys.argv) < 2:
|
|
95
|
-
print(json.dumps({"error": "No file path provided"}))
|
|
96
|
-
sys.exit(1)
|
|
97
|
-
|
|
98
|
-
file_path = sys.argv[1]
|
|
99
|
-
|
|
100
|
-
try:
|
|
101
|
-
# Robust file reading with extension detection
|
|
102
|
-
file_path_lower = file_path.lower()
|
|
103
|
-
if file_path_lower.endswith(".csv"):
|
|
104
|
-
df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
|
|
105
|
-
elif file_path_lower.endswith(".tsv"):
|
|
106
|
-
df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
|
|
107
|
-
elif file_path_lower.endswith(".txt"):
|
|
108
|
-
sep = ","
|
|
109
|
-
try:
|
|
110
|
-
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
111
|
-
first_line = fh.readline()
|
|
112
|
-
if "\t" in first_line:
|
|
113
|
-
sep = "\t"
|
|
114
|
-
except Exception:
|
|
115
|
-
sep = ","
|
|
116
|
-
df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
|
|
117
|
-
elif file_path_lower.endswith(".parquet"):
|
|
118
|
-
try:
|
|
119
|
-
# Try scanning first (faster for large files)
|
|
120
|
-
df = pl.scan_parquet(file_path).limit(10000).collect()
|
|
121
|
-
except:
|
|
122
|
-
df = pl.read_parquet(file_path)
|
|
123
|
-
if len(df) > 10000: df = df.head(10000)
|
|
124
|
-
elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
|
|
125
|
-
# Explicit NDJSON
|
|
126
|
-
df = pl.scan_ndjson(file_path).limit(10000).collect()
|
|
127
|
-
elif file_path_lower.endswith(".json"):
|
|
128
|
-
# Ambiguous .json: Try standard JSON first, then NDJSON fallback
|
|
129
|
-
try:
|
|
130
|
-
# read_json reads standard JSON array [{}, {}]
|
|
131
|
-
df = pl.read_json(file_path)
|
|
132
|
-
if len(df) > 10000: df = df.head(10000)
|
|
133
|
-
except Exception:
|
|
134
|
-
try:
|
|
135
|
-
# Fallback to NDJSON (common for large datasets mislabeled as .json)
|
|
136
|
-
df = pl.scan_ndjson(file_path).limit(10000).collect()
|
|
137
|
-
except Exception as e:
|
|
138
|
-
print(json.dumps({"error": f"Failed to read JSON: {str(e)}"}))
|
|
139
|
-
sys.exit(1)
|
|
140
|
-
else:
|
|
141
|
-
print(json.dumps({"error": f"Unsupported file extension: {file_path}"}))
|
|
142
|
-
sys.exit(1)
|
|
143
|
-
|
|
144
|
-
row_count = len(df)
|
|
145
|
-
column_count = len(df.columns)
|
|
146
|
-
|
|
147
|
-
# Duplicate detection (exact)
|
|
148
|
-
# NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
|
|
149
|
-
# Use a Python fallback that is slower but robust for the 10k sampled rows.
|
|
150
|
-
duplicate_count = 0
|
|
151
|
-
try:
|
|
152
|
-
seen = set()
|
|
153
|
-
for row in df.to_dicts():
|
|
154
|
-
row_key = json.dumps(row, sort_keys=True, default=str)
|
|
155
|
-
if row_key in seen:
|
|
156
|
-
duplicate_count += 1
|
|
157
|
-
else:
|
|
158
|
-
seen.add(row_key)
|
|
159
|
-
except Exception:
|
|
160
|
-
duplicate_count = 0
|
|
161
|
-
|
|
162
|
-
columns_stats = []
|
|
163
|
-
text_cols = []
|
|
164
|
-
for col in df.columns:
|
|
165
|
-
stats = analyze_column(df, col, df.schema[col])
|
|
166
|
-
columns_stats.append(stats)
|
|
167
|
-
# Check for String type (Polars can return 'String' or 'Utf8' depending on version)
|
|
168
|
-
dtype_str = stats["type"]
|
|
169
|
-
if ("String" in dtype_str or "Utf8" in dtype_str) and stats["unique_count"] > 1:
|
|
170
|
-
text_cols.append(col)
|
|
171
|
-
|
|
172
|
-
report = {
|
|
173
|
-
"row_count": row_count,
|
|
174
|
-
"column_count": column_count,
|
|
175
|
-
"duplicate_rows": int(duplicate_count),
|
|
176
|
-
"duplicate_percentage": (duplicate_count / row_count * 100) if row_count > 0 else 0,
|
|
177
|
-
"columns": columns_stats,
|
|
178
|
-
"warnings": [],
|
|
179
|
-
"schema_warnings": [],
|
|
180
|
-
"overall_score": 100
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
# Integrity Check 1: Text Duplicates (Fuzzyish Proxy)
|
|
184
|
-
# If duplicated rows are 0, check if main text content is duplicated
|
|
185
|
-
if duplicate_count == 0 and len(text_cols) > 0:
|
|
186
|
-
# Pick longest text column as likely "content"
|
|
187
|
-
# In real impl, we'd use heuristics. For now, first text col.
|
|
188
|
-
target_col = text_cols[0]
|
|
189
|
-
try:
|
|
190
|
-
text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
|
|
191
|
-
if text_dupes > 0:
|
|
192
|
-
report["text_duplicates"] = int(text_dupes)
|
|
193
|
-
if text_dupes > (row_count * 0.2):
|
|
194
|
-
report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
|
|
195
|
-
except Exception:
|
|
196
|
-
# Skip text duplicate warning if backend cannot compute duplicates for this dtype
|
|
197
|
-
pass
|
|
198
|
-
|
|
199
|
-
# Integrity Check 2: Contamination / Leakage (Basic)
|
|
200
|
-
# (Skipping correlation for now)
|
|
201
|
-
|
|
202
|
-
report["class_imbalance_warnings"] = []
|
|
203
|
-
report["pii_warnings"] = []
|
|
204
|
-
|
|
205
|
-
# PII Patterns (Regex)
|
|
206
|
-
import re
|
|
207
|
-
pii_patterns = {
|
|
208
|
-
"Email": r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
|
|
209
|
-
"Phone": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', # Basic US-ish pattern
|
|
210
|
-
"SSN": r'\d{3}-\d{2}-\d{4}',
|
|
211
|
-
"IPv4": r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
# Bias & PII Analysis
|
|
215
|
-
for col_name, stats in zip(df.columns, columns_stats):
|
|
216
|
-
# Class Imbalance
|
|
217
|
-
if stats["unique_count"] > 1 and stats["unique_count"] < 50:
|
|
218
|
-
try:
|
|
219
|
-
col = df[col_name]
|
|
220
|
-
top_val_count = col.value_counts().sort("count", descending=True).row(0)[1]
|
|
221
|
-
total = len(col)
|
|
222
|
-
if total > 0:
|
|
223
|
-
ratio = top_val_count / total
|
|
224
|
-
if ratio > 0.9:
|
|
225
|
-
report["class_imbalance_warnings"].append(f"Severe imbalance in '{col_name}': Top class is {(ratio*100):.1f}% of data")
|
|
226
|
-
except:
|
|
227
|
-
pass
|
|
228
|
-
|
|
229
|
-
# PII Detection (on Text Columns only)
|
|
230
|
-
if ("String" in stats["type"] or "Utf8" in stats["type"]):
|
|
231
|
-
try:
|
|
232
|
-
# Sample for performance (check first 1000 non-null values)
|
|
233
|
-
sample_text = df[col_name].drop_nulls().head(1000).to_list()
|
|
234
|
-
# Join a subset to regex against (faster than row-by-row for simple checks)
|
|
235
|
-
combined_text = " ".join([str(x) for x in sample_text])
|
|
236
|
-
|
|
237
|
-
for pii_type, pattern in pii_patterns.items():
|
|
238
|
-
if re.search(pattern, combined_text):
|
|
239
|
-
# Ensure we don't flag column names like "email_address" but actual content
|
|
240
|
-
# Double check with a strict count if trigger found
|
|
241
|
-
matches = len(re.findall(pattern, combined_text))
|
|
242
|
-
if matches > 0:
|
|
243
|
-
report["pii_warnings"].append(f"Potential {pii_type} detected in column '{col_name}' ({matches} matches in sample)")
|
|
244
|
-
except:
|
|
245
|
-
pass
|
|
246
|
-
|
|
247
|
-
# Basic warnings
|
|
248
|
-
if report["duplicate_percentage"] > 10:
|
|
249
|
-
report["warnings"].append("High duplication rate (>10%)")
|
|
250
|
-
if row_count < 50:
|
|
251
|
-
report["warnings"].append("Dataset is very small (<50 rows)")
|
|
252
|
-
|
|
253
|
-
# Schema warnings
|
|
254
|
-
for col in columns_stats:
|
|
255
|
-
if "Numeric" in col.get("inferred_type", "") and "Utf8" in col.get("type", ""):
|
|
256
|
-
report["schema_warnings"].append(f"Column '{col['name']}' looks Numeric but is stored as String")
|
|
257
|
-
if col.get("is_mixed_type"):
|
|
258
|
-
report["schema_warnings"].append(f"Column '{col['name']}' likely contains mixed types (numbers and strings)")
|
|
259
|
-
|
|
260
|
-
print(json.dumps(report))
|
|
261
|
-
|
|
262
|
-
except Exception as e:
|
|
263
|
-
print(json.dumps({"error": f"Analysis failed: {str(e)}"}))
|
|
264
|
-
sys.exit(1)
|
|
265
|
-
|
|
266
|
-
if __name__ == "__main__":
|
|
267
|
-
main()
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
polars==1.2.0
|
|
2
|
-
pandas==2.2.0
|
|
3
|
-
numpy==1.26.0
|
|
4
|
-
scikit-learn==1.4.0
|
|
5
|
-
# Optional source/download extras:
|
|
6
|
-
kaggle>=1.6.17
|
|
7
|
-
aiohttp>=3.9.0
|
|
8
|
-
aiofiles>=24.1.0
|
|
9
|
-
datasets>=2.20.0
|
|
10
|
-
webdataset>=0.2.86
|
|
11
|
-
# Optional for secure key storage (preferred over file fallback):
|
|
12
|
-
# keyring>=24.0.0
|
package/src/python/row_count.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import os
|
|
4
|
-
|
|
5
|
-
try:
|
|
6
|
-
import polars as pl
|
|
7
|
-
except Exception:
|
|
8
|
-
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
9
|
-
sys.exit(1)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def count_rows(path: str) -> int:
|
|
13
|
-
ext = os.path.splitext(path)[1].lower()
|
|
14
|
-
|
|
15
|
-
if ext == ".csv":
|
|
16
|
-
# Faster than full read for large csv
|
|
17
|
-
return int(pl.scan_csv(path, ignore_errors=True).select(pl.len()).collect().item())
|
|
18
|
-
if ext in [".parquet", ".pq"]:
|
|
19
|
-
return int(pl.scan_parquet(path).select(pl.len()).collect().item())
|
|
20
|
-
if ext in [".feather", ".ftr", ".arrow", ".ipc"]:
|
|
21
|
-
return int(pl.scan_ipc(path).select(pl.len()).collect().item())
|
|
22
|
-
if ext in [".jsonl", ".ndjson"]:
|
|
23
|
-
return int(pl.scan_ndjson(path).select(pl.len()).collect().item())
|
|
24
|
-
if ext == ".json":
|
|
25
|
-
# fallback to eager for plain JSON arrays
|
|
26
|
-
try:
|
|
27
|
-
return int(pl.read_json(path).height)
|
|
28
|
-
except Exception:
|
|
29
|
-
return int(pl.scan_ndjson(path).select(pl.len()).collect().item())
|
|
30
|
-
|
|
31
|
-
# unknown extension fallback
|
|
32
|
-
return int(pl.read_csv(path, ignore_errors=True).height)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def main():
|
|
36
|
-
if len(sys.argv) < 2:
|
|
37
|
-
print(json.dumps({"ok": False, "error": "Usage: row_count.py <file_path>"}))
|
|
38
|
-
sys.exit(1)
|
|
39
|
-
|
|
40
|
-
p = sys.argv[1]
|
|
41
|
-
if not os.path.exists(p):
|
|
42
|
-
print(json.dumps({"ok": False, "error": f"File not found: {p}"}))
|
|
43
|
-
sys.exit(1)
|
|
44
|
-
|
|
45
|
-
try:
|
|
46
|
-
rows = count_rows(p)
|
|
47
|
-
print(json.dumps({"ok": True, "rows": rows}))
|
|
48
|
-
except Exception as e:
|
|
49
|
-
print(json.dumps({"ok": False, "error": str(e)}))
|
|
50
|
-
sys.exit(1)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
if __name__ == "__main__":
|
|
54
|
-
main()
|
|
@@ -1,283 +0,0 @@
|
|
|
1
|
-
import sys
|
|
2
|
-
import json
|
|
3
|
-
import polars as pl
|
|
4
|
-
import numpy as np
|
|
5
|
-
from sklearn.model_selection import train_test_split
|
|
6
|
-
|
|
7
|
-
def execute_split(file_path, config):
|
|
8
|
-
# Load Data
|
|
9
|
-
if file_path.endswith(".csv"):
|
|
10
|
-
df = pl.read_csv(file_path, ignore_errors=True)
|
|
11
|
-
elif file_path.endswith(".parquet"):
|
|
12
|
-
df = pl.read_parquet(file_path)
|
|
13
|
-
else:
|
|
14
|
-
raise ValueError("Unsupported format")
|
|
15
|
-
|
|
16
|
-
train_ratio = config["ratios"]["train"]
|
|
17
|
-
val_ratio = config["ratios"]["val"]
|
|
18
|
-
test_ratio = config["ratios"]["test"]
|
|
19
|
-
holdout_ratio = config["ratios"].get("holdout", 0)
|
|
20
|
-
seed = config.get("random_seed", 42)
|
|
21
|
-
shuffle = config.get("shuffle", True)
|
|
22
|
-
|
|
23
|
-
# Strategy
|
|
24
|
-
strategy = config["type"]
|
|
25
|
-
target_col = config.get("target_column", None)
|
|
26
|
-
time_col = config.get("time_column", None)
|
|
27
|
-
|
|
28
|
-
train_df, val_df, test_df, holdout_df = None, None, None, None
|
|
29
|
-
|
|
30
|
-
# --- 1. RANDOM / STRATIFIED SPLIT ---
|
|
31
|
-
if strategy in ["random", "stratified"]:
|
|
32
|
-
if strategy == "random":
|
|
33
|
-
if shuffle:
|
|
34
|
-
df = df.sample(fraction=1.0, seed=seed, shuffle=True)
|
|
35
|
-
|
|
36
|
-
n = len(df)
|
|
37
|
-
n_train = int(n * train_ratio)
|
|
38
|
-
n_val = int(n * val_ratio)
|
|
39
|
-
n_test = int(n * test_ratio)
|
|
40
|
-
|
|
41
|
-
train_df = df.slice(0, n_train)
|
|
42
|
-
val_df = df.slice(n_train, n_val)
|
|
43
|
-
test_df = df.slice(n_train + n_val, n_test)
|
|
44
|
-
holdout_df = df.slice(n_train + n_val + n_test, n - (n_train + n_val + n_test))
|
|
45
|
-
|
|
46
|
-
elif strategy == "stratified":
|
|
47
|
-
if not target_col or target_col not in df.columns:
|
|
48
|
-
return {"error": f"Target column '{target_col}' not found needed for stratification"}
|
|
49
|
-
|
|
50
|
-
y = df[target_col].to_list()
|
|
51
|
-
indices = np.arange(len(df))
|
|
52
|
-
|
|
53
|
-
# Split 1: Train vs Others
|
|
54
|
-
others_ratio = val_ratio + test_ratio + holdout_ratio
|
|
55
|
-
if others_ratio == 0:
|
|
56
|
-
train_idx, others_idx = indices, []
|
|
57
|
-
else:
|
|
58
|
-
train_idx, others_idx = train_test_split(indices, test_size=others_ratio, stratify=y, random_state=seed, shuffle=True)
|
|
59
|
-
|
|
60
|
-
train_df = df[train_idx]
|
|
61
|
-
|
|
62
|
-
if len(others_idx) > 0:
|
|
63
|
-
y_others = [y[i] for i in others_idx]
|
|
64
|
-
|
|
65
|
-
# Split 2: Val vs (Test + Holdout)
|
|
66
|
-
test_holdout_ratio = (test_ratio + holdout_ratio) / others_ratio
|
|
67
|
-
if test_holdout_ratio > 0 and test_holdout_ratio < 1:
|
|
68
|
-
val_idx, test_holdout_idx = train_test_split(others_idx, test_size=test_holdout_ratio, stratify=y_others, random_state=seed, shuffle=True)
|
|
69
|
-
val_df = df[val_idx]
|
|
70
|
-
|
|
71
|
-
if len(test_holdout_idx) > 0:
|
|
72
|
-
y_th = [y[i] for i in test_holdout_idx]
|
|
73
|
-
relative_holdout_ratio = holdout_ratio / (test_ratio + holdout_ratio)
|
|
74
|
-
|
|
75
|
-
if relative_holdout_ratio > 0 and relative_holdout_ratio < 1:
|
|
76
|
-
test_idx, holdout_idx = train_test_split(test_holdout_idx, test_size=relative_holdout_ratio, stratify=y_th, random_state=seed, shuffle=True)
|
|
77
|
-
test_df = df[test_idx]
|
|
78
|
-
holdout_df = df[holdout_idx]
|
|
79
|
-
elif relative_holdout_ratio >= 1:
|
|
80
|
-
test_df = df.slice(0, 0)
|
|
81
|
-
holdout_df = df[test_holdout_idx]
|
|
82
|
-
else:
|
|
83
|
-
test_df = df[test_holdout_idx]
|
|
84
|
-
holdout_df = df.slice(0, 0)
|
|
85
|
-
elif test_holdout_ratio >= 1:
|
|
86
|
-
val_df = df.slice(0, 0)
|
|
87
|
-
# Chained split for Test/Holdout
|
|
88
|
-
y_th = y_others
|
|
89
|
-
relative_holdout_ratio = holdout_ratio / (test_ratio + holdout_ratio)
|
|
90
|
-
if relative_holdout_ratio > 0 and relative_holdout_ratio < 1:
|
|
91
|
-
test_idx, holdout_idx = train_test_split(others_idx, test_size=relative_holdout_ratio, stratify=y_th, random_state=seed, shuffle=True)
|
|
92
|
-
test_df = df[test_idx]
|
|
93
|
-
holdout_df = df[holdout_idx]
|
|
94
|
-
else:
|
|
95
|
-
test_df = df[others_idx]
|
|
96
|
-
holdout_df = df.slice(0, 0)
|
|
97
|
-
else:
|
|
98
|
-
val_df = df[others_idx]
|
|
99
|
-
test_df = df.slice(0, 0)
|
|
100
|
-
holdout_df = df.slice(0, 0)
|
|
101
|
-
|
|
102
|
-
# --- 2. TIME-BASED SPLIT ---
|
|
103
|
-
elif strategy == "time":
|
|
104
|
-
if not time_col or time_col not in df.columns:
|
|
105
|
-
return {"error": f"Time column '{time_col}' not found"}
|
|
106
|
-
|
|
107
|
-
df = df.sort(time_col)
|
|
108
|
-
|
|
109
|
-
n = len(df)
|
|
110
|
-
n_train = int(n * train_ratio)
|
|
111
|
-
n_val = int(n * val_ratio)
|
|
112
|
-
n_test = int(n * test_ratio)
|
|
113
|
-
|
|
114
|
-
train_df = df.slice(0, n_train)
|
|
115
|
-
val_df = df.slice(n_train, n_val)
|
|
116
|
-
test_df = df.slice(n_train + n_val, n_test)
|
|
117
|
-
holdout_df = df.slice(n_train + n_val + n_test, n - (n_train + n_val + n_test))
|
|
118
|
-
|
|
119
|
-
# --- 3. GROUP-BASED SPLIT ---
|
|
120
|
-
elif strategy == "group":
|
|
121
|
-
if not config.get("group_column") or config["group_column"] not in df.columns:
|
|
122
|
-
return {"error": f"Group column '{config.get('group_column')}' not found"}
|
|
123
|
-
|
|
124
|
-
group_col = config["group_column"]
|
|
125
|
-
groups = df[group_col].unique().to_list()
|
|
126
|
-
|
|
127
|
-
# Split groups first to ensure zero leakage
|
|
128
|
-
n_grps = len(groups)
|
|
129
|
-
n_train = int(n_grps * train_ratio)
|
|
130
|
-
n_val = int(n_grps * val_ratio)
|
|
131
|
-
n_test = int(n_grps * test_ratio)
|
|
132
|
-
|
|
133
|
-
if shuffle:
|
|
134
|
-
np.random.seed(seed)
|
|
135
|
-
np.random.shuffle(groups)
|
|
136
|
-
|
|
137
|
-
train_grps = set(groups[:n_train])
|
|
138
|
-
val_grps = set(groups[n_train:n_train+n_val])
|
|
139
|
-
test_grps = set(groups[n_train+n_val:n_train+n_val+n_test])
|
|
140
|
-
holdout_grps = set(groups[n_train+n_val+n_test:])
|
|
141
|
-
|
|
142
|
-
train_df = df.filter(pl.col(group_col).is_in(train_grps))
|
|
143
|
-
val_df = df.filter(pl.col(group_col).is_in(val_grps))
|
|
144
|
-
test_df = df.filter(pl.col(group_col).is_in(test_grps))
|
|
145
|
-
holdout_df = df.filter(pl.col(group_col).is_in(holdout_grps))
|
|
146
|
-
|
|
147
|
-
else:
|
|
148
|
-
return {"error": f"Strategy {strategy} not implemented yet"}
|
|
149
|
-
|
|
150
|
-
# Save outputs
|
|
151
|
-
base_name = file_path.replace(".csv", "").replace(".parquet", "")
|
|
152
|
-
train_path = f"{base_name}_train.csv"
|
|
153
|
-
val_path = f"{base_name}_val.csv"
|
|
154
|
-
test_path = f"{base_name}_test.csv"
|
|
155
|
-
holdout_path = f"{base_name}_holdout.csv"
|
|
156
|
-
|
|
157
|
-
train_df.write_csv(train_path)
|
|
158
|
-
val_df.write_csv(val_path)
|
|
159
|
-
test_df.write_csv(test_path)
|
|
160
|
-
holdout_df.write_csv(holdout_path)
|
|
161
|
-
|
|
162
|
-
return {
|
|
163
|
-
"success": True,
|
|
164
|
-
"paths": { "train": train_path, "val": val_path, "test": test_path, "holdout": holdout_path },
|
|
165
|
-
"stats": {
|
|
166
|
-
"train_rows": len(train_df),
|
|
167
|
-
"val_rows": len(val_df),
|
|
168
|
-
"test_rows": len(test_df),
|
|
169
|
-
"holdout_rows": len(holdout_df)
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
def validate_split(config):
    """Validate previously written split files for leakage and target drift.

    Args:
        config: dict with:
            paths: {"train": str, "val": str, "test": str, "holdout": str (optional)}
            id_column: column used for exact-overlap leakage checks (default "id")
            target_column: optional label column for train/val distribution comparison

    Returns:
        Report dict: {"leakage_detected", "leakage_count",
        "distribution_mismatch", "warnings"}; or {"error": ...} when the
        split files cannot be loaded.
    """
    train_path = config["paths"]["train"]
    val_path = config["paths"]["val"]
    test_path = config["paths"]["test"]
    holdout_path = config["paths"].get("holdout")
    id_col = config.get("id_column", "id")  # Default to 'id' if exists
    target_col = config.get("target_column", None)

    def _read(path):
        # Format is inferred from the extension; anything non-.csv is parquet.
        return pl.read_csv(path) if path.endswith(".csv") else pl.read_parquet(path)

    # Load dfs. Narrow except (was a bare `except:`) so Ctrl-C / SystemExit
    # propagate, and keep the underlying error detail in the report.
    try:
        train_df = _read(train_path)
        val_df = _read(val_path)
        test_df = _read(test_path)
        holdout_df = _read(holdout_path) if holdout_path else None
    except Exception as e:
        return {"error": f"Failed to load split files for validation: {e}"}

    report = {
        "leakage_detected": False,
        "leakage_count": 0,
        "distribution_mismatch": False,
        "warnings": []
    }

    # 1. Leakage Check (ID intersection) — count overlapping IDs across every
    # pair of splits. Generated pairwise instead of six hand-written
    # intersections so no pair can be missed.
    if id_col in train_df.columns:
        id_sets = [
            set(train_df[id_col].to_list()),
            set(val_df[id_col].to_list()),
            set(test_df[id_col].to_list()),
            set(holdout_df[id_col].to_list()) if holdout_df is not None else set(),
        ]
        total_leakage = sum(
            len(id_sets[i] & id_sets[j])
            for i in range(len(id_sets))
            for j in range(i + 1, len(id_sets))
        )
        if total_leakage > 0:
            report["leakage_detected"] = True
            report["leakage_count"] = total_leakage
            report["warnings"].append(f"Found {total_leakage} overlapping IDs between splits.")
    else:
        report["warnings"].append(f"ID column '{id_col}' not found. Skipping exact leakage check.")

    # 2. Distribution Check (Target Distribution) — compare per-class ratios
    # between train and val; flag any class whose share drifts by more than
    # 10 percentage points.
    if target_col and target_col in train_df.columns:
        try:
            def get_ratios(df, col):
                # Map each class label (stringified) to its fraction of rows.
                counts = df[col].value_counts()
                total = len(df)
                return {str(row[0]): row[1] / total for row in counts.rows()}

            train_metrics = get_ratios(train_df, target_col)
            val_metrics = get_ratios(val_df, target_col)
            # test_metrics = get_ratios(test_df, target_col) # Optional: could check all

            for cls, train_r in train_metrics.items():
                val_r = val_metrics.get(cls, 0)
                if abs(train_r - val_r) > 0.1:  # 10% drift
                    report["distribution_mismatch"] = True
                    report["warnings"].append(f"Class '{cls}' drift: Train={train_r:.2f}, Val={val_r:.2f}")
        except Exception:
            # Best-effort check: a failed distribution comparison must never
            # fail the whole validation (was a bare `except: pass`).
            pass

    return report
|
|
249
|
-
|
|
250
|
-
def main():
    """CLI entry point for the splitter engine.

    Usage:
        split:    python splitter_engine.py split <file_path> <config_json>
        validate: python splitter_engine.py validate <config_json>

    Prints a single JSON object to stdout. Any failure is emitted as
    {"success": False, "error": ...} followed by exit code 1, so the
    Node.js caller (DataSplitter.ts) can parse results uniformly.
    """
    if len(sys.argv) < 3:
        print(json.dumps({"error": "Usage: splitter_engine.py <action> <arg1> [arg2]"}), file=sys.stderr)
        sys.exit(1)

    action = sys.argv[1]

    try:
        if action == "split":
            # The global argv check above only guarantees argv[2]; `split`
            # also needs argv[3], so validate explicitly instead of letting
            # a raw IndexError leak into the error report.
            if len(sys.argv) < 4:
                raise ValueError("split requires <file_path> and <config_json>")
            file_path = sys.argv[2]
            config = json.loads(sys.argv[3])
            result = execute_split(file_path, config)
            print(json.dumps(result))

        elif action == "validate":
            config = json.loads(sys.argv[2])
            result = validate_split(config)
            print(json.dumps(result))

        else:
            # Fallback for old calls (implicit split) - if users used old signature
            # But since we control the caller, we can just update the caller (DataSplitter.ts).
            raise ValueError(f"Unknown action: {action}")

    except Exception as e:
        print(json.dumps({"success": False, "error": str(e)}))
        sys.exit(1)
|
|
281
|
-
|
|
282
|
-
# Script entry guard: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|