@vespermcp/mcp-server 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/fusion/engine.js +69 -0
- package/build/index.js +813 -25
- package/build/ingestion/hf-downloader.js +34 -5
- package/build/ingestion/ingestor.js +33 -9
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/scraper.js +34 -10
- package/build/python/config.py +259 -0
- package/build/python/export_engine.py +148 -52
- package/build/python/fusion_engine.py +368 -0
- package/build/python/kaggle_engine.py +204 -0
- package/build/python/row_count.py +54 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/scripts/build-index.js +5 -5
- package/build/search/jit-orchestrator.js +74 -14
- package/package.json +8 -2
- package/scripts/refresh-index.cjs +87 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/config.py +259 -0
- package/src/python/export_engine.py +148 -52
- package/src/python/fusion_engine.py +368 -0
- package/src/python/kaggle_engine.py +204 -0
- package/src/python/row_count.py +54 -0
- package/src/python/test_fusion_engine.py +89 -0
|
@@ -2,25 +2,102 @@ import sys
|
|
|
2
2
|
import json
|
|
3
3
|
import polars as pl
|
|
4
4
|
import os
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
# Optional imports for extra formats
|
|
8
|
+
try:
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
import pyarrow.feather as pf
|
|
11
|
+
HAS_PYARROW = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
HAS_PYARROW = False
|
|
5
14
|
|
|
6
|
-
# Optional TensorFlow import for TFRecord support
|
|
7
15
|
try:
|
|
8
16
|
import tensorflow as tf
|
|
9
17
|
HAS_TENSORFLOW = True
|
|
10
18
|
except ImportError:
|
|
11
19
|
HAS_TENSORFLOW = False
|
|
12
20
|
|
|
13
|
-
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Helpers
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Read a supported file into a Polars DataFrame.

    Honours two optional keys in *options*:
      - "columns":     list[str] | None — keep only these columns (unknown names ignored)
      - "sample_rows": int | None       — randomly sample down to this many rows
      - "random_seed": int              — seed for the sampling (default 42)

    Raises ValueError for an unrecognised file extension.
    """
    extension = os.path.splitext(file_path)[1].lower()

    readers = {
        ".csv": lambda p: pl.read_csv(p, ignore_errors=True),
        ".parquet": pl.read_parquet,
        ".pq": pl.read_parquet,
        ".feather": pl.read_ipc,
        ".ftr": pl.read_ipc,
        ".arrow": pl.read_ipc,
        ".ipc": pl.read_ipc,
        ".jsonl": pl.read_ndjson,
    }
    reader = readers.get(extension)
    if reader is None:
        raise ValueError(f"Unsupported input format: {extension}")
    frame = reader(file_path)

    # Narrow to the requested columns first so any sampling touches less data.
    wanted = options.get("columns")
    if wanted:
        present = [name for name in wanted if name in frame.columns]
        if present:
            frame = frame.select(present)

    # Optional random down-sampling (deterministic via seed).
    limit = options.get("sample_rows")
    if limit and limit < len(frame):
        frame = frame.sample(n=limit, seed=options.get("random_seed", 42))

    return frame
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return *df* with non-scalar columns JSON-stringified so CSV export works."""

    def _stringify(value):
        # Best-effort JSON; fall back to str() for anything non-serialisable.
        try:
            if hasattr(value, "to_list"):
                return json.dumps(value.to_list())
            if hasattr(value, "to_dict"):
                return json.dumps(value.to_dict())
            return json.dumps(value)
        except Exception:
            return str(value)

    simple_names = {"string", "utf8", "boolean", "bool"}
    for name in df.columns:
        dtype = df.schema[name]
        if dtype.is_numeric() or dtype.is_temporal() or str(dtype).lower() in simple_names:
            continue
        df = df.with_columns(pl.col(name).map_elements(_stringify, return_dtype=pl.Utf8))
    return df
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Dump the first *n* rows as a CSV next to *output_path*; return its path."""
    root, _ = os.path.splitext(output_path)
    preview_file = root + "_preview.csv"
    head = df.head(min(n, len(df)))
    _safe_csv_df(head).write_csv(preview_file)
    return preview_file
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Main export function
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
    """Export a dataset file into another format.

    Parameters
    ----------
    file_path : input file (csv / parquet / feather / arrow / jsonl).
    output_path : destination file; parent directories are created as needed.
    format : "feather", "parquet", "csv", "jsonl", "arrow"/"ipc" or "tfrecord".
    options : optional dict — "compression", "preview" (bool: also write a
        small CSV preview beside the output), plus the _load() keys
        ("columns", "sample_rows", "random_seed").

    Returns a result dict ({"success": True, ...} or {"error": ...}); never raises.
    """
    options = options or {}
    t0 = time.perf_counter()

    # ---- Load ----
    try:
        df = _load(file_path, options)
    except Exception as e:
        return {"error": f"Failed to load input file: {str(e)}"}

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    preview_path = None
    generate_preview = options.get("preview", False)

    try:
        # ---- Feather (Arrow IPC) – fastest binary format ----
        if format == "feather":
            if not HAS_PYARROW:
                return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
            compression = options.get("compression", "lz4")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            # Polars write_ipc uses Arrow IPC (= Feather v2) under the hood
            arrow_table = df.to_arrow()
            pf.write_feather(arrow_table, output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Parquet – best compression, big-data friendly ----
        elif format == "parquet":
            compression = options.get("compression", "snappy")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            df.write_parquet(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- CSV – human-readable fallback ----
        elif format == "csv":
            df = _safe_csv_df(df)
            df.write_csv(output_path)

        # ---- JSONL ----
        elif format == "jsonl":
            df.write_ndjson(output_path)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Arrow IPC (legacy name kept for compat) ----
        elif format in ("arrow", "ipc"):
            compression = options.get("compression", "uncompressed")
            if compression == "uncompressed":
                compression = None
            df.write_ipc(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- TFRecord ----
        elif format == "tfrecord":
            if not HAS_TENSORFLOW:
                return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
            with tf.io.TFRecordWriter(output_path) as writer:
                pdf = df.to_pandas()
                for _, row in pdf.iterrows():
                    feature = {}
                    for col, value in row.items():
                        if value is None:
                            continue
                        # BUG FIX: pandas iteration yields numpy scalars
                        # (np.int64, np.float64, ...) which fail the
                        # isinstance(int/float) checks below and previously
                        # fell through to the str() branch. Unbox them first;
                        # multi-element arrays raise and keep their old path.
                        if hasattr(value, "item"):
                            try:
                                value = value.item()
                            except Exception:
                                pass
                        if isinstance(value, int):
                            feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
                        elif isinstance(value, float):
                            feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
                        elif isinstance(value, str):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
                        elif isinstance(value, bytes):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
                        else:
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))
                    example = tf.train.Example(features=tf.train.Features(feature=feature))
                    writer.write(example.SerializeToString())

        else:
            return {"error": f"Unknown export format: {format}"}

        elapsed = round(time.perf_counter() - t0, 3)
        file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)

        result = {
            "success": True,
            "output_path": output_path,
            "rows": len(df),
            "columns": len(df.columns),
            "format": format,
            "compression": options.get("compression", "default"),
            "file_size_mb": file_size_mb,
            "elapsed_seconds": elapsed,
        }
        if preview_path:
            result["preview_path"] = preview_path

        return result

    except Exception as e:
        return {"error": f"Export failed: {str(e)}"}
|
110
201
|
|
|
202
|
+
|
|
111
203
|
def main():
    """CLI entry: export_engine.py <input_file> <output_file> <format> [options_json]."""
    argv = sys.argv
    if len(argv) < 4:
        usage = {"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}
        print(json.dumps(usage), file=sys.stderr)
        sys.exit(1)

    src, dst, fmt = argv[1], argv[2], argv[3]

    opts = {}
    if len(argv) > 4:
        try:
            opts = json.loads(argv[4])
        except Exception:
            pass  # malformed options JSON → proceed with defaults

    print(json.dumps(export_data(src, dst, fmt, opts)))
|
129
224
|
|
|
225
|
+
|
|
130
226
|
if __name__ == "__main__":
|
|
131
227
|
main()
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import hashlib
|
|
5
|
+
import subprocess
|
|
6
|
+
from typing import List, Optional, Union, Dict, Any
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import polars as pl
|
|
10
|
+
HAS_POLARS = True
|
|
11
|
+
except ImportError:
|
|
12
|
+
HAS_POLARS = False
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _safe_suffix(source_path: str, idx: int) -> str:
|
|
16
|
+
base = os.path.basename(source_path)
|
|
17
|
+
base = os.path.splitext(base)[0].replace(" ", "_")
|
|
18
|
+
if not base:
|
|
19
|
+
base = f"source{idx+1}"
|
|
20
|
+
return base
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _load_with_polars(path: str):
    """Load *path* into a Polars DataFrame, picking a reader by extension."""

    def _read_json_lenient(p):
        # Plain JSON first; fall back to newline-delimited on parse failure.
        try:
            return pl.read_json(p)
        except Exception:
            return pl.read_ndjson(p)

    dispatch = {
        ".csv": lambda p: pl.read_csv(p, ignore_errors=True),
        ".parquet": pl.read_parquet,
        ".pq": pl.read_parquet,
        ".feather": pl.read_ipc,
        ".ftr": pl.read_ipc,
        ".arrow": pl.read_ipc,
        ".ipc": pl.read_ipc,
        ".jsonl": pl.read_ndjson,
        ".ndjson": pl.read_ndjson,
        ".json": _read_json_lenient,
    }
    suffix = os.path.splitext(path)[1].lower()
    reader = dispatch.get(suffix)
    if reader is None:
        raise ValueError(f"Unsupported source format: {suffix} ({path})")
    return reader(path)
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _write_with_polars(df, output_path: str, fmt: str, compression: Optional[str]):
|
|
42
|
+
fmt = fmt.lower()
|
|
43
|
+
comp = compression
|
|
44
|
+
if comp in ["none", "None", "uncompressed"]:
|
|
45
|
+
comp = None
|
|
46
|
+
|
|
47
|
+
if fmt == "csv":
|
|
48
|
+
df.write_csv(output_path)
|
|
49
|
+
elif fmt == "parquet":
|
|
50
|
+
df.write_parquet(output_path, compression=(compression or "snappy"))
|
|
51
|
+
elif fmt == "feather":
|
|
52
|
+
ipc_comp = "lz4" if compression is None else compression
|
|
53
|
+
if ipc_comp == "uncompressed":
|
|
54
|
+
ipc_comp = None
|
|
55
|
+
df.write_ipc(output_path, compression=ipc_comp)
|
|
56
|
+
elif fmt in ["arrow", "ipc"]:
|
|
57
|
+
df.write_ipc(output_path, compression=comp)
|
|
58
|
+
elif fmt == "jsonl":
|
|
59
|
+
df.write_ndjson(output_path)
|
|
60
|
+
else:
|
|
61
|
+
raise ValueError(f"Unsupported output format: {fmt}")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _ensure_parent(path: str):
|
|
65
|
+
parent = os.path.dirname(path)
|
|
66
|
+
if parent and not os.path.exists(parent):
|
|
67
|
+
os.makedirs(parent, exist_ok=True)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _compute_null_ratio(df) -> float:
|
|
71
|
+
if len(df) == 0 or len(df.columns) == 0:
|
|
72
|
+
return 0.0
|
|
73
|
+
nulls = 0
|
|
74
|
+
for col in df.columns:
|
|
75
|
+
nulls += int(df[col].null_count())
|
|
76
|
+
total_cells = len(df) * len(df.columns)
|
|
77
|
+
return (nulls / total_cells) * 100 if total_cells else 0.0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _leakage_report(df, source_col: str = "_vesper_source", id_col: Optional[str] = None) -> Dict[str, Any]:
    """Check whether identical records appear in more than one fused source.

    When *id_col* is present, groups by that ID column; otherwise every row
    (minus the source marker) is serialised into a signature string and
    signatures seen under multiple sources are counted.
    """
    report: Dict[str, Any] = {
        "leakage_detected": False,
        "leakage_count": 0,
        "id_column": id_col,
        "warnings": []
    }

    if source_col not in df.columns:
        report["warnings"].append("Source marker column missing; leakage check skipped.")
        return report

    def _cross_source_count(frame, key):
        # How many *key* values occur under more than one distinct source.
        spill = (
            frame.group_by(key)
            .agg(pl.col(source_col).n_unique().alias("source_count"))
            .filter(pl.col("source_count") > 1)
        )
        return len(spill)

    if id_col and id_col in df.columns:
        hits = _cross_source_count(df, id_col)
        if hits > 0:
            report["leakage_detected"] = True
            report["leakage_count"] = hits
            report["warnings"].append(f"Found {hits} IDs appearing across multiple sources")
        return report

    # Fallback: hash rows (excluding source marker) and check if same row appears in multiple sources
    compare_cols = [c for c in df.columns if c != source_col]
    if not compare_cols:
        return report

    row_sig = df.select(compare_cols).with_columns(
        pl.concat_str([pl.col(c).cast(pl.Utf8, strict=False) for c in compare_cols], separator="||").alias("_row_sig")
    )
    tmp = row_sig.with_columns(df[source_col]).select(["_row_sig", source_col])

    hits = _cross_source_count(tmp, "_row_sig")
    if hits > 0:
        report["leakage_detected"] = True
        report["leakage_count"] = hits
        report["warnings"].append(f"Found {hits} duplicate rows across multiple sources")

    return report
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _run_quality_engine(output_path: str) -> Dict[str, Any]:
|
|
129
|
+
# Reuse existing quality engine script (same folder)
|
|
130
|
+
try:
|
|
131
|
+
script_dir = os.path.dirname(os.path.abspath(__file__))
|
|
132
|
+
quality_script = os.path.join(script_dir, "quality_engine.py")
|
|
133
|
+
cmd = [sys.executable, quality_script, output_path]
|
|
134
|
+
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
|
|
135
|
+
if proc.returncode != 0:
|
|
136
|
+
return {"error": f"quality_engine failed: {proc.stderr.strip()}"}
|
|
137
|
+
return json.loads(proc.stdout)
|
|
138
|
+
except Exception as e:
|
|
139
|
+
return {"error": str(e)}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _concat_polars(dfs, source_names: List[str]):
    """Vertically stack frames after aligning them to the union of all columns.

    Each frame gets a "_vesper_source" marker column (its source name) when
    missing; columns absent from a frame are filled with nulls so schemas
    line up before the relaxed vertical concat.
    """
    # Union of columns, preserving first-seen order.
    union_cols: List[str] = []
    seen = set()
    for frame in dfs:
        for name in frame.columns:
            if name not in seen:
                seen.add(name)
                union_cols.append(name)
    if "_vesper_source" not in seen:
        union_cols.append("_vesper_source")

    aligned = []
    for idx, frame in enumerate(dfs):
        out = frame
        if "_vesper_source" not in out.columns:
            out = out.with_columns(pl.lit(source_names[idx]).alias("_vesper_source"))
        for absent in (c for c in union_cols if c not in out.columns):
            out = out.with_columns(pl.lit(None).alias(absent))
        aligned.append(out.select(union_cols))

    return pl.concat(aligned, how="vertical_relaxed")
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _join_polars(dfs, source_names: List[str], join_on: List[str], how: str):
    """Left-to-right chained join of *dfs* on the *join_on* key columns.

    Non-key columns that collide with the accumulated result are renamed
    with a per-source suffix. Returns (joined_frame, conflict_renames) where
    conflict_renames records every rename performed.
    """
    if len(dfs) < 2:
        raise ValueError("Join strategy requires at least 2 sources")

    def _tagged(frame, name):
        # Guarantee the source-marker column exists on every frame.
        if "_vesper_source" in frame.columns:
            return frame
        return frame.with_columns(pl.lit(name).alias("_vesper_source"))

    renames_log = []
    acc = _tagged(dfs[0], source_names[0])

    for idx in range(1, len(dfs)):
        right = _tagged(dfs[idx], source_names[idx])

        suffix = _safe_suffix(source_names[idx], idx)
        rename_map = {}
        for col in right.columns:
            if col in acc.columns and col not in join_on:
                renamed = f"{col}_{suffix}"
                rename_map[col] = renamed
                renames_log.append({"source": source_names[idx], "from": col, "to": renamed})

        if rename_map:
            right = right.rename(rename_map)

        acc = acc.join(right, on=join_on, how=how, coalesce=True)

    return acc, renames_log
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def fuse_datasets(
    sources: List[Union[str, Any]],
    strategy: str = "concat",
    join_on: Optional[Union[str, List[str]]] = None,
    how: str = "inner",
    dedup: bool = True,
    run_quality_after: bool = True,
    leakage_check: bool = True,
    output_path: Optional[str] = None,
    output_format: str = "feather",
    compression: Optional[str] = "lz4",
    preview: bool = True,
    preview_rows: int = 500,
    id_column: Optional[str] = None,
    **kwargs,
):
    """Fuse several dataset files into a single output file.

    sources: >=2 entries, each a path string or a {"path"/"local_path",
        optional "name"} dict.
    strategy: "concat" (align schemas & stack) or "join" (chained join on
        *join_on* with *how* semantics).
    Extra **kwargs are accepted and ignored for forward compatibility.

    Returns {"success": True, "output_path", "preview_path", "stats",
    "quality_report", "leakage_report"} on success or {"error": ...};
    never raises.
    """
    if not HAS_POLARS:
        return {"error": "Polars is required for dataset fusion. Install with: pip install polars"}

    if not sources or len(sources) < 2:
        return {"error": "Need at least 2 sources to fuse"}

    # Normalise sources into parallel path / display-name lists.
    source_paths: List[str] = []
    source_names: List[str] = []

    for i, src in enumerate(sources):
        if isinstance(src, str):
            source_paths.append(src)
            source_names.append(src)
        elif isinstance(src, dict):
            p = src.get("path") or src.get("local_path")
            if not p:
                return {"error": f"Source {i} missing path"}
            source_paths.append(p)
            source_names.append(src.get("name") or p)
        else:
            return {"error": f"Unsupported source type at index {i}"}

    for p in source_paths:
        if not os.path.exists(p):
            return {"error": f"Source not found: {p}"}

    if output_path is None:
        ext_map = {
            "feather": ".feather",
            "parquet": ".parquet",
            "csv": ".csv",
            "jsonl": ".jsonl",
            "arrow": ".arrow",
            "ipc": ".arrow"
        }
        ext = ext_map.get(output_format, ".feather")
        output_path = os.path.abspath(f"fused_dataset{ext}")

    _ensure_parent(output_path)

    try:
        dfs = [_load_with_polars(p) for p in source_paths]
    except Exception as e:
        return {"error": f"Failed to load sources: {str(e)}"}

    rows_before = sum(len(df) for df in dfs)
    null_before = sum(_compute_null_ratio(df) for df in dfs) / len(dfs)

    strategy = (strategy or "concat").lower()
    how = (how or "inner").lower()
    conflict_renames = []

    try:
        if strategy == "concat":
            fused = _concat_polars(dfs, source_names)
        elif strategy == "join":
            if not join_on:
                return {"error": "join_on is required when strategy='join'"}
            join_keys = [join_on] if isinstance(join_on, str) else list(join_on)
            for key in join_keys:
                for idx, df in enumerate(dfs):
                    if key not in df.columns:
                        return {"error": f"Join key '{key}' missing in source {source_paths[idx]}"}
            fused, conflict_renames = _join_polars(dfs, source_names, join_keys, how)
        else:
            return {"error": f"Unknown strategy: {strategy}. Use concat or join."}

        duplicates_removed = 0
        if dedup:
            before = len(fused)
            fused = fused.unique(maintain_order=True)
            duplicates_removed = before - len(fused)

        leakage = None
        if leakage_check:
            leakage = _leakage_report(fused, source_col="_vesper_source", id_col=id_column)

        _write_with_polars(fused, output_path, output_format, compression)

        preview_path = None
        if preview:
            # ROBUSTNESS FIX: the preview is a convenience artefact. It was
            # previously written inside the main try with a plain write_csv,
            # so a frame with nested columns made the whole call report
            # "Fusion failed" even though the output file had already been
            # written successfully. Make the preview best-effort instead.
            try:
                candidate = os.path.splitext(output_path)[0] + "_preview.csv"
                fused.head(min(preview_rows, len(fused))).write_csv(candidate)
                preview_path = candidate
            except Exception:
                preview_path = None

        quality_report = None
        if run_quality_after:
            quality_report = _run_quality_engine(output_path)

        rows_after = len(fused)
        null_after = _compute_null_ratio(fused)

        return {
            "success": True,
            "output_path": output_path,
            "preview_path": preview_path,
            "stats": {
                "sources_count": len(source_paths),
                "rows_before": rows_before,
                "rows_after": rows_after,
                "columns_after": len(fused.columns),
                "duplicates_removed": duplicates_removed,
                "null_ratio_before": round(null_before, 3),
                "null_ratio_after": round(null_after, 3),
                "null_delta": round(null_after - null_before, 3),
                "conflict_renames": conflict_renames,
            },
            "quality_report": quality_report,
            "leakage_report": leakage,
        }

    except Exception as e:
        return {"error": f"Fusion failed: {str(e)}"}
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def main():
    """CLI entry: fusion_engine.py <sources_json> <output_path> [config_json]."""
    if len(sys.argv) < 3:
        print(json.dumps({
            "error": "Usage: fusion_engine.py <sources_json> <output_path> [config_json]"
        }))
        sys.exit(1)

    try:
        srcs = json.loads(sys.argv[1])
        dest = sys.argv[2]
        cfg = json.loads(sys.argv[3]) if len(sys.argv) > 3 else {}

        outcome = fuse_datasets(
            sources=srcs,
            output_path=dest,
            strategy=cfg.get("strategy", "concat"),
            join_on=cfg.get("join_on"),
            how=cfg.get("how", "inner"),
            dedup=cfg.get("dedup", True),
            run_quality_after=cfg.get("run_quality_after", True),
            leakage_check=cfg.get("leakage_check", True),
            output_format=cfg.get("output_format", "feather"),
            compression=cfg.get("compression", "lz4"),
            preview=cfg.get("preview", True),
            preview_rows=cfg.get("preview_rows", 500),
            id_column=cfg.get("id_column"),
        )
        print(json.dumps(outcome))
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(1)
|
365
|
+
|
|
366
|
+
|
|
367
|
+
if __name__ == "__main__":
|
|
368
|
+
main()
|