@vespermcp/mcp-server 1.1.3 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,25 +2,102 @@ import sys
2
2
  import json
3
3
  import polars as pl
4
4
  import os
5
+ import time
6
+
7
+ # Optional imports for extra formats
8
+ try:
9
+ import pyarrow as pa
10
+ import pyarrow.feather as pf
11
+ HAS_PYARROW = True
12
+ except ImportError:
13
+ HAS_PYARROW = False
5
14
 
6
- # Optional TensorFlow import for TFRecord support
7
15
  try:
8
16
  import tensorflow as tf
9
17
  HAS_TENSORFLOW = True
10
18
  except ImportError:
11
19
  HAS_TENSORFLOW = False
12
20
 
13
- def export_data(file_path, output_path, format, options=None):
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Helpers
24
+ # ---------------------------------------------------------------------------
25
+
26
def _load(file_path: str, options: dict) -> pl.DataFrame:
    """Read a supported input file into a Polars DataFrame.

    Honours three optional keys in *options*:
      - "columns": list of column names to keep (unknown names are ignored)
      - "sample_rows": cap on row count via random sampling
      - "random_seed": seed for the sampler (default 42)

    Raises ValueError for an unrecognised file extension.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".csv":
        frame = pl.read_csv(file_path, ignore_errors=True)
    elif extension in (".parquet", ".pq"):
        frame = pl.read_parquet(file_path)
    elif extension in (".feather", ".ftr", ".arrow", ".ipc"):
        frame = pl.read_ipc(file_path)
    elif extension == ".jsonl":
        frame = pl.read_ndjson(file_path)
    else:
        raise ValueError(f"Unsupported input format: {extension}")

    # Narrow to the requested columns first so any sampling touches less data.
    requested = options.get("columns")
    if requested:
        keep = [name for name in requested if name in frame.columns]
        if keep:  # when nothing matches, silently fall back to all columns
            frame = frame.select(keep)

    # Randomly downsample only when the cap is below the actual row count.
    limit = options.get("sample_rows")
    if limit and limit < len(frame):
        frame = frame.sample(n=limit, seed=options.get("random_seed", 42))

    return frame
55
+
56
+
57
def _safe_csv_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return *df* with non-scalar columns JSON-stringified for CSV output.

    Numeric, temporal, string, and boolean columns pass through untouched;
    everything else (List, Struct, Object, ...) is serialised per-element.
    """
    simple_names = ("string", "utf8", "boolean", "bool")

    def _to_text(value):
        # Best effort: nested container values expose to_list/to_dict;
        # anything JSON refuses is stringified as a last resort.
        try:
            if hasattr(value, "to_list"):
                return json.dumps(value.to_list())
            if hasattr(value, "to_dict"):
                return json.dumps(value.to_dict())
            return json.dumps(value)
        except Exception:
            return str(value)

    for name in df.columns:
        dtype = df.schema[name]
        if dtype.is_numeric() or dtype.is_temporal():
            continue
        if str(dtype).lower() in simple_names:
            continue
        df = df.with_columns(
            pl.col(name).map_elements(_to_text, return_dtype=pl.Utf8)
        )
    return df
80
+
81
+
82
def _write_preview(df: pl.DataFrame, output_path: str, n: int = 500):
    """Write a small CSV preview (first *n* rows) next to *output_path*.

    Returns the path of the preview file ("<stem>_preview.csv").
    """
    target = os.path.splitext(output_path)[0] + "_preview.csv"
    head = df.head(min(n, len(df)))
    _safe_csv_df(head).write_csv(target)
    return target
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # Main export function
92
+ # ---------------------------------------------------------------------------
93
+
94
def export_data(file_path: str, output_path: str, format: str, options: dict | None = None):
    """Export a dataset file to another format.

    Parameters
    ----------
    file_path : input file (.csv, .parquet/.pq, .feather/.ftr/.arrow/.ipc, .jsonl).
    output_path : destination path; missing parent directories are created.
    format : "feather", "parquet", "csv", "jsonl", "arrow"/"ipc", or "tfrecord".
    options : optional dict — keys used here: "compression", "preview",
        plus the loader keys "columns", "sample_rows", "random_seed".

    Returns a JSON-serialisable dict: on success, stats about the written file
    (plus "preview_path" when a preview was requested); on failure,
    {"error": ...}. This function never raises.
    """
    options = options or {}
    t0 = time.perf_counter()

    # ---- Load ----
    try:
        df = _load(file_path, options)
    except Exception as e:
        return {"error": f"Failed to load input file: {str(e)}"}

    # ---- Ensure output directory exists ----
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    preview_path = None
    generate_preview = options.get("preview", False)

    try:
        # ---- Feather (Arrow IPC) – fastest binary format ----
        if format == "feather":
            if not HAS_PYARROW:
                return {"error": "pyarrow is not installed. Run: pip install pyarrow"}
            compression = options.get("compression", "lz4")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            # write_feather emits Feather v2 (= Arrow IPC)
            arrow_table = df.to_arrow()
            pf.write_feather(arrow_table, output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Parquet – best compression, big-data friendly ----
        elif format == "parquet":
            compression = options.get("compression", "snappy")
            if compression in ("uncompressed", "none", "None", None):
                compression = "uncompressed"
            df.write_parquet(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- CSV – human-readable fallback (no preview: output IS csv) ----
        elif format == "csv":
            df = _safe_csv_df(df)
            df.write_csv(output_path)

        # ---- JSONL ----
        elif format == "jsonl":
            df.write_ndjson(output_path)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- Arrow IPC (legacy name kept for compat) ----
        elif format in ("arrow", "ipc"):
            compression = options.get("compression", "uncompressed")
            if compression == "uncompressed":
                compression = None
            df.write_ipc(output_path, compression=compression)
            if generate_preview:
                preview_path = _write_preview(df, output_path)

        # ---- TFRecord ----
        elif format == "tfrecord":
            if not HAS_TENSORFLOW:
                return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
            with tf.io.TFRecordWriter(output_path) as writer:
                pdf = df.to_pandas()
                for _, row in pdf.iterrows():
                    feature = {}
                    for col, value in row.items():
                        if value is None:
                            continue
                        # BUG FIX: pandas yields numpy scalars (np.int64 etc.),
                        # which are NOT instances of the builtin int, so integer
                        # columns previously fell through to the string fallback.
                        # Unwrap numpy scalars to Python natives before typing.
                        if hasattr(value, "item"):
                            try:
                                value = value.item()
                            except Exception:
                                pass
                        if isinstance(value, int):
                            feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
                        elif isinstance(value, float):
                            feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
                        elif isinstance(value, str):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode("utf-8")]))
                        elif isinstance(value, bytes):
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
                        else:
                            # Fallback: stringify unknown types.
                            feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode("utf-8")]))
                    example = tf.train.Example(features=tf.train.Features(feature=feature))
                    writer.write(example.SerializeToString())

        else:
            return {"error": f"Unknown export format: {format}"}

        elapsed = round(time.perf_counter() - t0, 3)
        file_size_mb = round(os.path.getsize(output_path) / (1024 * 1024), 2)

        result = {
            "success": True,
            "output_path": output_path,
            "rows": len(df),
            "columns": len(df.columns),
            "format": format,
            "compression": options.get("compression", "default"),
            "file_size_mb": file_size_mb,
            "elapsed_seconds": elapsed,
        }
        if preview_path:
            result["preview_path"] = preview_path

        return result

    except Exception as e:
        return {"error": f"Export failed: {str(e)}"}
110
201
 
202
+
111
203
def main():
    """CLI entry point: export_engine.py <input> <output> <format> [options_json]."""
    if len(sys.argv) < 4:
        usage = {"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}
        print(json.dumps(usage), file=sys.stderr)
        sys.exit(1)

    input_file, output_file, export_format = sys.argv[1:4]

    # Optional 4th argument: JSON-encoded options; malformed JSON is ignored.
    opts = {}
    if len(sys.argv) > 4:
        try:
            opts = json.loads(sys.argv[4])
        except Exception:
            pass

    print(json.dumps(export_data(input_file, output_file, export_format, opts)))
129
224
 
225
+
130
226
  if __name__ == "__main__":
131
227
  main()
@@ -0,0 +1,368 @@
1
+ import sys
2
+ import json
3
+ import os
4
+ import hashlib
5
+ import subprocess
6
+ from typing import List, Optional, Union, Dict, Any
7
+
8
+ try:
9
+ import polars as pl
10
+ HAS_POLARS = True
11
+ except ImportError:
12
+ HAS_POLARS = False
13
+
14
+
15
+ def _safe_suffix(source_path: str, idx: int) -> str:
16
+ base = os.path.basename(source_path)
17
+ base = os.path.splitext(base)[0].replace(" ", "_")
18
+ if not base:
19
+ base = f"source{idx+1}"
20
+ return base
21
+
22
+
23
+ def _load_with_polars(path: str):
24
+ ext = os.path.splitext(path)[1].lower()
25
+ if ext == ".csv":
26
+ return pl.read_csv(path, ignore_errors=True)
27
+ if ext in [".parquet", ".pq"]:
28
+ return pl.read_parquet(path)
29
+ if ext in [".feather", ".ftr", ".arrow", ".ipc"]:
30
+ return pl.read_ipc(path)
31
+ if ext in [".jsonl", ".ndjson"]:
32
+ return pl.read_ndjson(path)
33
+ if ext == ".json":
34
+ try:
35
+ return pl.read_json(path)
36
+ except Exception:
37
+ return pl.read_ndjson(path)
38
+ raise ValueError(f"Unsupported source format: {ext} ({path})")
39
+
40
+
41
+ def _write_with_polars(df, output_path: str, fmt: str, compression: Optional[str]):
42
+ fmt = fmt.lower()
43
+ comp = compression
44
+ if comp in ["none", "None", "uncompressed"]:
45
+ comp = None
46
+
47
+ if fmt == "csv":
48
+ df.write_csv(output_path)
49
+ elif fmt == "parquet":
50
+ df.write_parquet(output_path, compression=(compression or "snappy"))
51
+ elif fmt == "feather":
52
+ ipc_comp = "lz4" if compression is None else compression
53
+ if ipc_comp == "uncompressed":
54
+ ipc_comp = None
55
+ df.write_ipc(output_path, compression=ipc_comp)
56
+ elif fmt in ["arrow", "ipc"]:
57
+ df.write_ipc(output_path, compression=comp)
58
+ elif fmt == "jsonl":
59
+ df.write_ndjson(output_path)
60
+ else:
61
+ raise ValueError(f"Unsupported output format: {fmt}")
62
+
63
+
64
+ def _ensure_parent(path: str):
65
+ parent = os.path.dirname(path)
66
+ if parent and not os.path.exists(parent):
67
+ os.makedirs(parent, exist_ok=True)
68
+
69
+
70
+ def _compute_null_ratio(df) -> float:
71
+ if len(df) == 0 or len(df.columns) == 0:
72
+ return 0.0
73
+ nulls = 0
74
+ for col in df.columns:
75
+ nulls += int(df[col].null_count())
76
+ total_cells = len(df) * len(df.columns)
77
+ return (nulls / total_cells) * 100 if total_cells else 0.0
78
+
79
+
80
+ def _leakage_report(df, source_col: str = "_vesper_source", id_col: Optional[str] = None) -> Dict[str, Any]:
81
+ report = {
82
+ "leakage_detected": False,
83
+ "leakage_count": 0,
84
+ "id_column": id_col,
85
+ "warnings": []
86
+ }
87
+
88
+ if source_col not in df.columns:
89
+ report["warnings"].append("Source marker column missing; leakage check skipped.")
90
+ return report
91
+
92
+ if id_col and id_col in df.columns:
93
+ overlap = (
94
+ df.group_by(id_col)
95
+ .agg(pl.col(source_col).n_unique().alias("source_count"))
96
+ .filter(pl.col("source_count") > 1)
97
+ )
98
+ overlap_count = len(overlap)
99
+ if overlap_count > 0:
100
+ report["leakage_detected"] = True
101
+ report["leakage_count"] = overlap_count
102
+ report["warnings"].append(f"Found {overlap_count} IDs appearing across multiple sources")
103
+ else:
104
+ # Fallback: hash rows (excluding source marker) and check if same row appears in multiple sources
105
+ compare_cols = [c for c in df.columns if c != source_col]
106
+ if not compare_cols:
107
+ return report
108
+
109
+ row_sig = df.select(compare_cols).with_columns(
110
+ pl.concat_str([pl.col(c).cast(pl.Utf8, strict=False) for c in compare_cols], separator="||").alias("_row_sig")
111
+ )
112
+ tmp = row_sig.with_columns(df[source_col]).select(["_row_sig", source_col])
113
+
114
+ overlap = (
115
+ tmp.group_by("_row_sig")
116
+ .agg(pl.col(source_col).n_unique().alias("source_count"))
117
+ .filter(pl.col("source_count") > 1)
118
+ )
119
+ overlap_count = len(overlap)
120
+ if overlap_count > 0:
121
+ report["leakage_detected"] = True
122
+ report["leakage_count"] = overlap_count
123
+ report["warnings"].append(f"Found {overlap_count} duplicate rows across multiple sources")
124
+
125
+ return report
126
+
127
+
128
+ def _run_quality_engine(output_path: str) -> Dict[str, Any]:
129
+ # Reuse existing quality engine script (same folder)
130
+ try:
131
+ script_dir = os.path.dirname(os.path.abspath(__file__))
132
+ quality_script = os.path.join(script_dir, "quality_engine.py")
133
+ cmd = [sys.executable, quality_script, output_path]
134
+ proc = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
135
+ if proc.returncode != 0:
136
+ return {"error": f"quality_engine failed: {proc.stderr.strip()}"}
137
+ return json.loads(proc.stdout)
138
+ except Exception as e:
139
+ return {"error": str(e)}
140
+
141
+
142
def _concat_polars(dfs, source_names: List[str]):
    """Vertically stack frames after aligning them to the union schema.

    Column order follows first appearance across the inputs; a
    "_vesper_source" marker column records each row's origin, and columns
    absent from a frame are filled with nulls.
    """
    # Union of columns, preserving first-seen order.
    ordered_cols: List[str] = []
    seen = set()
    for frame in dfs:
        for name in frame.columns:
            if name not in seen:
                seen.add(name)
                ordered_cols.append(name)
    if "_vesper_source" not in seen:
        ordered_cols.append("_vesper_source")

    aligned = []
    for idx, frame in enumerate(dfs):
        out = frame
        if "_vesper_source" not in out.columns:
            out = out.with_columns(pl.lit(source_names[idx]).alias("_vesper_source"))
        for absent in [c for c in ordered_cols if c not in out.columns]:
            out = out.with_columns(pl.lit(None).alias(absent))
        aligned.append(out.select(ordered_cols))

    # vertical_relaxed lets Polars coerce mismatched dtypes where possible.
    return pl.concat(aligned, how="vertical_relaxed")
166
+
167
+
168
+ def _join_polars(dfs, source_names: List[str], join_on: List[str], how: str):
169
+ if len(dfs) < 2:
170
+ raise ValueError("Join strategy requires at least 2 sources")
171
+
172
+ conflict_renames = []
173
+ left = dfs[0]
174
+ if "_vesper_source" not in left.columns:
175
+ left = left.with_columns(pl.lit(source_names[0]).alias("_vesper_source"))
176
+
177
+ for i in range(1, len(dfs)):
178
+ right = dfs[i]
179
+ if "_vesper_source" not in right.columns:
180
+ right = right.with_columns(pl.lit(source_names[i]).alias("_vesper_source"))
181
+
182
+ overlaps = [
183
+ c for c in right.columns
184
+ if c in left.columns and c not in join_on
185
+ ]
186
+ rename_map = {}
187
+ suffix = _safe_suffix(source_names[i], i)
188
+ for c in overlaps:
189
+ new_name = f"{c}_{suffix}"
190
+ rename_map[c] = new_name
191
+ conflict_renames.append({"source": source_names[i], "from": c, "to": new_name})
192
+
193
+ if rename_map:
194
+ right = right.rename(rename_map)
195
+
196
+ left = left.join(right, on=join_on, how=how, coalesce=True)
197
+
198
+ return left, conflict_renames
199
+
200
+
201
def fuse_datasets(
    sources: List[Union[str, Any]],
    strategy: str = "concat",
    join_on: Optional[Union[str, List[str]]] = None,
    how: str = "inner",
    dedup: bool = True,
    run_quality_after: bool = True,
    leakage_check: bool = True,
    output_path: Optional[str] = None,
    output_format: str = "feather",
    compression: Optional[str] = "lz4",
    preview: bool = True,
    preview_rows: int = 500,
    id_column: Optional[str] = None,
    **kwargs,
):
    """Fuse several dataset files into one output file.

    *sources* accepts plain paths or dicts with "path"/"local_path" and an
    optional "name". Fusion is either a schema-aligned vertical "concat" or
    a key-based "join". Optionally deduplicates rows, runs a leakage check,
    writes a CSV preview, and invokes the external quality engine.

    Returns {"success": True, ...stats...} or {"error": ...}; never raises.
    Extra keyword arguments are accepted and ignored.
    """
    if not HAS_POLARS:
        return {"error": "Polars is required for dataset fusion. Install with: pip install polars"}

    if not sources or len(sources) < 2:
        return {"error": "Need at least 2 sources to fuse"}

    # ---- Resolve source specs into parallel path/name lists ----
    source_paths: List[str] = []
    source_names: List[str] = []
    for i, src in enumerate(sources):
        if isinstance(src, str):
            source_paths.append(src)
            source_names.append(src)
        elif isinstance(src, dict):
            p = src.get("path") or src.get("local_path")
            if not p:
                return {"error": f"Source {i} missing path"}
            source_paths.append(p)
            source_names.append(src.get("name") or p)
        else:
            return {"error": f"Unsupported source type at index {i}"}

    for p in source_paths:
        if not os.path.exists(p):
            return {"error": f"Source not found: {p}"}

    # ---- Pick a default output path from the requested format ----
    if output_path is None:
        ext_map = {
            "feather": ".feather",
            "parquet": ".parquet",
            "csv": ".csv",
            "jsonl": ".jsonl",
            "arrow": ".arrow",
            "ipc": ".arrow"
        }
        output_path = os.path.abspath("fused_dataset" + ext_map.get(output_format, ".feather"))

    _ensure_parent(output_path)

    try:
        frames = [_load_with_polars(p) for p in source_paths]
    except Exception as e:
        return {"error": f"Failed to load sources: {str(e)}"}

    rows_before = sum(len(frame) for frame in frames)
    # NOTE(review): unweighted mean of per-source null percentages — small
    # sources count as much as large ones; confirm this is intended.
    null_before = sum(_compute_null_ratio(frame) for frame in frames) / len(frames)

    strategy = (strategy or "concat").lower()
    how = (how or "inner").lower()
    conflict_renames = []

    try:
        if strategy == "concat":
            fused = _concat_polars(frames, source_names)
        elif strategy == "join":
            if not join_on:
                return {"error": "join_on is required when strategy='join'"}
            join_keys = [join_on] if isinstance(join_on, str) else list(join_on)
            # Every key must exist in every source before we attempt the join.
            for key in join_keys:
                for idx, frame in enumerate(frames):
                    if key not in frame.columns:
                        return {"error": f"Join key '{key}' missing in source {source_paths[idx]}"}
            fused, conflict_renames = _join_polars(frames, source_names, join_keys, how)
        else:
            return {"error": f"Unknown strategy: {strategy}. Use concat or join."}

        duplicates_removed = 0
        if dedup:
            pre_dedup = len(fused)
            fused = fused.unique(maintain_order=True)
            duplicates_removed = pre_dedup - len(fused)

        leakage = None
        if leakage_check:
            leakage = _leakage_report(fused, source_col="_vesper_source", id_col=id_column)

        _write_with_polars(fused, output_path, output_format, compression)

        preview_path = None
        if preview:
            preview_path = os.path.splitext(output_path)[0] + "_preview.csv"
            fused.head(min(preview_rows, len(fused))).write_csv(preview_path)

        quality_report = None
        if run_quality_after:
            quality_report = _run_quality_engine(output_path)

        null_after = _compute_null_ratio(fused)

        return {
            "success": True,
            "output_path": output_path,
            "preview_path": preview_path,
            "stats": {
                "sources_count": len(source_paths),
                "rows_before": rows_before,
                "rows_after": len(fused),
                "columns_after": len(fused.columns),
                "duplicates_removed": duplicates_removed,
                "null_ratio_before": round(null_before, 3),
                "null_ratio_after": round(null_after, 3),
                "null_delta": round(null_after - null_before, 3),
                "conflict_renames": conflict_renames,
            },
            "quality_report": quality_report,
            "leakage_report": leakage,
        }

    except Exception as e:
        return {"error": f"Fusion failed: {str(e)}"}
329
+
330
+
331
def main():
    """CLI entry point: fusion_engine.py <sources_json> <output_path> [config_json]."""
    if len(sys.argv) < 3:
        print(json.dumps({
            "error": "Usage: fusion_engine.py <sources_json> <output_path> [config_json]"
        }))
        sys.exit(1)

    try:
        sources = json.loads(sys.argv[1])
        output_path = sys.argv[2]
        config = json.loads(sys.argv[3]) if len(sys.argv) > 3 else {}

        outcome = fuse_datasets(
            sources=sources,
            output_path=output_path,
            strategy=config.get("strategy", "concat"),
            join_on=config.get("join_on"),
            how=config.get("how", "inner"),
            dedup=config.get("dedup", True),
            run_quality_after=config.get("run_quality_after", True),
            leakage_check=config.get("leakage_check", True),
            output_format=config.get("output_format", "feather"),
            compression=config.get("compression", "lz4"),
            preview=config.get("preview", True),
            preview_rows=config.get("preview_rows", 500),
            id_column=config.get("id_column"),
        )
        print(json.dumps(outcome))
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(1)
365
+
366
+
367
+ if __name__ == "__main__":
368
+ main()