mangleframes 0.1.3__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {mangleframes-0.1.3 → mangleframes-0.1.6}/PKG-INFO +5 -2
  2. {mangleframes-0.1.3 → mangleframes-0.1.6}/pyproject.toml +5 -2
  3. mangleframes-0.1.6/python/mangleframes/__init__.py +128 -0
  4. {mangleframes-0.1.3 → mangleframes-0.1.6}/python/mangleframes/launcher.py +7 -5
  5. mangleframes-0.1.6/python/mangleframes/protocol.py +421 -0
  6. mangleframes-0.1.6/viewer/src/arrow_reader.rs +99 -0
  7. mangleframes-0.1.6/viewer/src/handlers.rs +431 -0
  8. mangleframes-0.1.6/viewer/src/history_analysis.rs +298 -0
  9. mangleframes-0.1.6/viewer/src/history_handlers.rs +148 -0
  10. mangleframes-0.1.6/viewer/src/join_analysis.rs +324 -0
  11. mangleframes-0.1.6/viewer/src/join_handlers.rs +236 -0
  12. mangleframes-0.1.6/viewer/src/main.rs +142 -0
  13. mangleframes-0.1.6/viewer/src/perf.rs +178 -0
  14. {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/socket_client.rs +57 -3
  15. mangleframes-0.1.6/viewer/src/web_server.rs +130 -0
  16. mangleframes-0.1.6/viewer/static/app.js +1205 -0
  17. mangleframes-0.1.6/viewer/static/index.html +247 -0
  18. mangleframes-0.1.6/viewer/static/style.css +951 -0
  19. mangleframes-0.1.3/python/mangleframes/__init__.py +0 -70
  20. mangleframes-0.1.3/python/mangleframes/protocol.py +0 -163
  21. mangleframes-0.1.3/viewer/src/arrow_reader.rs +0 -132
  22. mangleframes-0.1.3/viewer/src/handlers.rs +0 -190
  23. mangleframes-0.1.3/viewer/src/main.rs +0 -69
  24. mangleframes-0.1.3/viewer/src/web_server.rs +0 -63
  25. mangleframes-0.1.3/viewer/static/app.js +0 -330
  26. mangleframes-0.1.3/viewer/static/index.html +0 -60
  27. mangleframes-0.1.3/viewer/static/style.css +0 -242
  28. {mangleframes-0.1.3 → mangleframes-0.1.6}/Cargo.lock +0 -0
  29. {mangleframes-0.1.3 → mangleframes-0.1.6}/Cargo.toml +0 -0
  30. {mangleframes-0.1.3 → mangleframes-0.1.6}/python/mangleframes/server.py +0 -0
  31. {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/Cargo.toml +0 -0
  32. {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/export.rs +0 -0
  33. {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/query_engine.rs +0 -0
  34. {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/stats.rs +0 -0
  35. {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/websocket.rs +0 -0
@@ -1,9 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mangleframes
3
- Version: 0.1.3
3
+ Version: 0.1.6
4
4
  Classifier: Programming Language :: Python :: 3
5
5
  Classifier: Programming Language :: Rust
6
6
  Classifier: License :: OSI Approved :: MIT License
7
+ Requires-Dist: databricks-connect>=16.1.7
8
+ Requires-Dist: loguru>=0.7.3
9
+ Requires-Dist: maturin>=1.11.2
7
10
  Requires-Dist: pyarrow>=11.0.0
8
11
  Requires-Dist: pytest>=7.0 ; extra == 'dev'
9
12
  Requires-Dist: maturin>=1.4 ; extra == 'dev'
@@ -13,4 +16,4 @@ Provides-Extra: spark
13
16
  License-File: LICENSE
14
17
  Summary: PySpark DataFrame viewer with modern web UI
15
18
  License: MIT
16
- Requires-Python: >=3.9
19
+ Requires-Python: >=3.11
@@ -4,9 +4,9 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "mangleframes"
7
- version = "0.1.3"
7
+ version = "0.1.6"
8
8
  description = "PySpark DataFrame viewer with modern web UI"
9
- requires-python = ">=3.9"
9
+ requires-python = ">=3.11"
10
10
  license = { text = "MIT" }
11
11
  classifiers = [
12
12
  "Programming Language :: Python :: 3",
@@ -14,6 +14,9 @@ classifiers = [
14
14
  "License :: OSI Approved :: MIT License",
15
15
  ]
16
16
  dependencies = [
17
+ "databricks-connect>=16.1.7",
18
+ "loguru>=0.7.3",
19
+ "maturin>=1.11.2",
17
20
  "pyarrow>=11.0.0",
18
21
  ]
19
22
 
@@ -0,0 +1,128 @@
1
+ """MangleFrames - PySpark DataFrame viewer with modern web UI."""
2
+ from __future__ import annotations
3
+
4
+ import atexit
5
+ import os
6
+ import subprocess
7
+ import threading
8
+ import time
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING
11
+
12
+ from .launcher import launch_viewer
13
+ from .protocol import clear_arrow_cache, clear_stats_cache, prefetch_frame
14
+ from .server import DataFrameServer
15
+
16
+ if TYPE_CHECKING:
17
+ from pyspark.sql import DataFrame
18
+
19
# Package version string.
__version__ = "0.1.6"
# Public API exported by ``from mangleframes import *``.
__all__ = ["register", "unregister", "show", "cleanup"]

# DataFrames registered for viewing, keyed by the name given to register().
_registry: dict[str, DataFrame] = {}
# Server created lazily by register()/show(); reset to None by cleanup().
_server: DataFrameServer | None = None
# Viewer subprocess started by show(); reset to None by cleanup().
_viewer_process: subprocess.Popen | None = None
# True once cleanup() has been registered with atexit (done on first register()).
_cleanup_registered = False
26
+
27
+
28
def _clean_stale_sockets() -> None:
    """Delete ``/tmp/mangleframes-<pid>.sock`` files whose PID is no longer alive.

    Best effort: file names whose PID part does not parse, and files that
    cannot be removed, are skipped silently.
    """
    for candidate in Path("/tmp").glob("mangleframes-*.sock"):
        try:
            # The PID is the second dash-separated component of the file stem.
            owner_pid = int(candidate.stem.split("-")[1])
            if _pid_exists(owner_pid):
                continue
            candidate.unlink()
        except (IndexError, ValueError, OSError):
            continue
39
+
40
+
41
def _pid_exists(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    Uses ``os.kill(pid, 0)``, which performs the existence/permission check
    without delivering a signal.

    Args:
        pid: Process id to probe. Values <= 0 are rejected because
            ``os.kill`` interprets them as process groups / "all processes"
            rather than as a single process.

    Returns:
        True if the process exists (even when it belongs to another user),
        False otherwise.
    """
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but we are not allowed to signal it.
        return True
    except OSError:
        return False
    return True
48
+
49
+
50
def cleanup() -> None:
    """Clean up all MangleFrames resources.

    Stops the viewer subprocess (terminate, then kill after a 2 second
    grace period), stops the DataFrame server, and empties the registry and
    both protocol caches. The registry and caches are cleared even if
    stopping the server raises.
    """
    global _server, _viewer_process, _registry

    viewer = _viewer_process
    _viewer_process = None
    if viewer is not None:
        viewer.terminate()
        try:
            viewer.wait(timeout=2.0)
        except subprocess.TimeoutExpired:
            viewer.kill()
            # Reap the killed child so it does not linger as a zombie.
            viewer.wait()

    server = _server
    _server = None
    try:
        if server is not None:
            server.stop()
    finally:
        _registry.clear()
        clear_stats_cache()
        clear_arrow_cache()
69
+
70
+
71
def register(name: str, df: DataFrame) -> None:
    """Register ``df`` under ``name`` so the viewer can serve it.

    On the first call this also installs ``cleanup`` as an atexit hook and
    removes stale socket files. Any cached stats / Arrow data for ``name``
    are dropped, a background prefetch of up to 10000 rows is started, and
    the DataFrame server is created and started if none exists yet.
    """
    global _server, _cleanup_registered

    first_registration = not _cleanup_registered
    if first_registration:
        atexit.register(cleanup)
        _cleanup_registered = True
        _clean_stale_sockets()

    _registry[name] = df
    # Anything cached under this name belongs to the previous DataFrame.
    clear_stats_cache(name)
    clear_arrow_cache(name)

    def do_prefetch() -> None:
        prefetch_frame(_registry, name, limit=10000)

    # Daemon thread: the prefetch must not keep the interpreter alive.
    worker = threading.Thread(target=do_prefetch, daemon=True)
    worker.start()

    if _server is not None:
        return
    _server = DataFrameServer(_registry)
    _server.start()
92
+
93
+
94
def unregister(name: str) -> None:
    """Drop ``name`` from the registry together with its cached data.

    Unknown names are ignored.
    """
    if name not in _registry:
        return
    _registry.pop(name)
    clear_stats_cache(name)
    clear_arrow_cache(name)
100
+
101
+
102
def show(port: int = 8765, block: bool = True) -> None:
    """Make sure the server and viewer process are running, optionally blocking.

    Starts the DataFrame server if it is missing or not running, and
    launches the viewer process if none is alive.
    NOTE(review): this function itself does not open a browser; whether the
    viewer binary does is not visible here - confirm.

    Args:
        port: Port for the viewer web server.
        block: If True, block until Ctrl+C (keeps server alive).

    Raises:
        RuntimeError: If the server must be started but nothing is registered.
    """
    global _viewer_process, _server

    server_alive = _server is not None and _server.is_running
    if not server_alive:
        if not _registry:
            raise RuntimeError("No DataFrames registered. Call register() first.")
        _server = DataFrameServer(_registry)
        _server.start()

    # poll() returns None while the child is still running.
    viewer_alive = _viewer_process is not None and _viewer_process.poll() is None
    if not viewer_alive:
        _viewer_process = launch_viewer(_server.socket_path, port)

    if not block:
        return

    try:
        print(f"MangleFrames viewer running at http://localhost:{port}")
        print("Press Ctrl+C to stop...")
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nStopping MangleFrames...")
        cleanup()
@@ -4,17 +4,22 @@ from __future__ import annotations
4
4
  import os
5
5
  import shutil
6
6
  import subprocess
7
- import webbrowser
8
7
  from pathlib import Path
9
8
 
10
9
 
11
10
  def find_viewer_binary() -> Path | None:
12
11
  """Find the mangleframes-viewer binary."""
12
+ import sys
13
+
13
14
  pkg_dir = Path(__file__).parent
14
15
  pkg_binary = pkg_dir / "bin" / "mangleframes-viewer"
15
16
  if pkg_binary.exists():
16
17
  return pkg_binary
17
18
 
19
+ venv_binary = Path(sys.executable).parent / "mangleframes-viewer"
20
+ if venv_binary.exists():
21
+ return venv_binary
22
+
18
23
  path_binary = shutil.which("mangleframes-viewer")
19
24
  if path_binary:
20
25
  return Path(path_binary)
@@ -35,13 +40,10 @@ def launch_viewer(socket_path: Path, port: int = 8765) -> subprocess.Popen:
35
40
  env["RUST_LOG"] = env.get("RUST_LOG", "info")
36
41
 
37
42
  return subprocess.Popen(
38
- [str(binary), "--socket", str(socket_path), "--port", str(port)],
43
+ [str(binary), "--socket", str(socket_path), "--port", str(port), "--no-browser"],
39
44
  env=env,
40
45
  stdout=subprocess.DEVNULL,
41
46
  stderr=subprocess.DEVNULL,
42
47
  )
43
48
 
44
49
 
45
- def open_browser(port: int = 8765) -> None:
46
- """Open the viewer in the default browser."""
47
- webbrowser.open(f"http://localhost:{port}")
@@ -0,0 +1,421 @@
1
+ """Protocol handlers for DataFrame server commands."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import struct
6
+ import threading
7
+ import time
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ import pyarrow as pa
11
+ from pyspark.sql import functions as F
12
+
13
+ if TYPE_CHECKING:
14
+ from pyspark.sql import DataFrame
15
+
16
# Status codes placed in the first header word by encode_response().
STATUS_OK = 0
STATUS_ERROR = 1

# Cache for computed stats (cleared when DataFrame is re-registered).
# Keyed by DataFrame name; values are the dicts built by handle_stats().
_stats_cache: dict[str, dict] = {}

# Cache for prefetched Arrow data: name -> (limit, payload_bytes).
# payload_bytes is the 24-byte timing header followed by the Arrow IPC stream.
_arrow_cache: dict[str, tuple[int, bytes]] = {}
# Guards every read and write of _arrow_cache.
_arrow_cache_lock = threading.Lock()
25
+
26
+
27
def clear_stats_cache(name: str | None = None) -> None:
    """Forget cached stats for ``name``, or for every DataFrame when ``name`` is None."""
    if name is not None:
        _stats_cache.pop(name, None)
        return
    _stats_cache.clear()
33
+
34
+
35
def clear_arrow_cache(name: str | None = None) -> None:
    """Forget cached Arrow payloads for ``name``, or for every DataFrame when ``name`` is None.

    The cache lock is held for the duration of the update.
    """
    with _arrow_cache_lock:
        if name is not None:
            _arrow_cache.pop(name, None)
        else:
            _arrow_cache.clear()
42
+
43
+
44
def _serialize_arrow_ipc(table: pa.Table) -> tuple[bytes, int]:
    """Write ``table`` as an Arrow IPC stream.

    Returns:
        ``(ipc_bytes, ipc_ms)`` where ``ipc_ms`` is the whole-millisecond
        time spent writing the record batches (the final copy to ``bytes``
        is not included in the timing).
    """
    began = time.perf_counter()
    buffer = pa.BufferOutputStream()
    with pa.ipc.RecordBatchStreamWriter(buffer, table.schema) as stream:
        for record_batch in table.to_batches():
            stream.write_batch(record_batch)
    elapsed_ms = int((time.perf_counter() - began) * 1000)
    ipc_bytes = buffer.getvalue().to_pybytes()
    return ipc_bytes, elapsed_ms
53
+
54
+
55
def prefetch_frame(registry: dict[str, DataFrame], name: str, limit: int = 10000) -> bool:
    """Materialize a registered DataFrame and store its Arrow payload in the cache.

    The cached payload is a 24-byte header (spark_ms, ipc_ms, total_rows as
    little-endian u64) followed by the Arrow IPC stream. A ``limit`` of 0 or
    less fetches the DataFrame without a row limit.

    Returns:
        True when the payload was cached; False when ``name`` is not
        registered or any step raised.
    """
    if name not in registry:
        return False

    try:
        frame = registry[name]
        began = time.perf_counter()
        source = frame.limit(limit) if limit > 0 else frame
        arrow_table = source.toArrow()
        spark_ms = int((time.perf_counter() - began) * 1000)
        row_total = arrow_table.num_rows

        ipc_bytes, ipc_ms = _serialize_arrow_ipc(arrow_table)
        header = struct.pack("<QQQ", spark_ms, ipc_ms, row_total)
        entry = (limit, header + ipc_bytes)

        with _arrow_cache_lock:
            _arrow_cache[name] = entry
    except Exception:
        # Prefetching is best effort; failure is reported via the return value.
        return False
    return True
81
+
82
+
83
def encode_response(status: int, payload: bytes) -> bytes:
    """Frame ``payload`` behind an 8-byte header.

    The header is two big-endian unsigned 32-bit integers: the status code
    followed by the payload length in bytes.
    """
    header = struct.pack(">II", status, len(payload))
    return header + payload
86
+
87
+
88
def encode_json_response(data: Any) -> bytes:
    """Serialize ``data`` to UTF-8 JSON and frame it as a STATUS_OK response."""
    body = json.dumps(data).encode("utf-8")
    return encode_response(STATUS_OK, body)
91
+
92
+
93
def encode_error(message: str) -> bytes:
    """Frame ``message`` (UTF-8 encoded) as a STATUS_ERROR response."""
    body = message.encode("utf-8")
    return encode_response(STATUS_ERROR, body)
96
+
97
+
98
def handle_list(registry: dict[str, DataFrame]) -> bytes:
    """Respond with the names of all registered DataFrames as a JSON array."""
    frame_names = [*registry]
    return encode_json_response(frame_names)
101
+
102
+
103
def handle_schema(registry: dict[str, DataFrame], name: str) -> bytes:
    """Respond with the column names, types and nullability of a DataFrame.

    Returns an error response when ``name`` is not registered.
    """
    if name not in registry:
        return encode_error(f"DataFrame '{name}' not found")

    columns = []
    for field in registry[name].schema.fields:
        columns.append(
            {"name": field.name, "type": str(field.dataType), "nullable": field.nullable}
        )
    return encode_json_response({"name": name, "columns": columns})
114
+
115
+
116
def handle_get(registry: dict[str, DataFrame], name: str, limit: int) -> bytes:
    """Return DataFrame data as Arrow IPC stream with timing info.

    Args:
        registry: Registered DataFrames by name.
        name: DataFrame to fetch.
        limit: Maximum number of rows; 0 or less means "no limit".

    Returns:
        A STATUS_OK response whose payload is a 24-byte header (spark_ms,
        ipc_ms, total_rows as little-endian u64) followed by the Arrow IPC
        stream, or an error response when ``name`` is not registered.
        A cached payload may contain more rows than requested.
    """
    if name not in registry:
        return encode_error(f"DataFrame '{name}' not found")

    wants_all_rows = limit <= 0

    # Check cache first - return cached data if it covers the request.
    with _arrow_cache_lock:
        cached = _arrow_cache.get(name)
    if cached is not None:
        cached_limit, cached_payload = cached
        cached_all_rows = cached_limit <= 0
        # A limit <= 0 means "unlimited", so it must not be compared
        # numerically: a row-limited entry can never satisfy an unlimited
        # request, while an unlimited entry satisfies every request.
        if cached_all_rows or (not wants_all_rows and cached_limit >= limit):
            return encode_response(STATUS_OK, cached_payload)

    # Cache miss or insufficient limit - materialize from Spark
    df = registry[name]

    start = time.perf_counter()
    limited_df = df if wants_all_rows else df.limit(limit)
    table = limited_df.toArrow()
    spark_ms = int((time.perf_counter() - start) * 1000)
    total_rows = table.num_rows

    arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
    # 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
    payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes

    # Cache result for future requests
    with _arrow_cache_lock:
        _arrow_cache[name] = (limit, payload)

    return encode_response(STATUS_OK, payload)
146
+
147
+
148
# Lower-case base names of scalar numeric Spark types, covering both the
# class-style spelling with "Type" removed ("integer") and the SQL spelling
# ("int", "bigint").
_NUMERIC_TYPE_NAMES = frozenset({
    "byte", "tinyint",
    "short", "smallint",
    "integer", "int",
    "long", "bigint",
    "float", "real",
    "double",
    "decimal",
})


def _is_numeric_type(dtype_str: str) -> bool:
    """Check if a Spark type string represents a scalar numeric type.

    Only the outermost type name is considered, so container types such as
    ``ArrayType(IntegerType(), True)`` or ``MapType(..., LongType(), ...)``
    are not numeric even though their text mentions a numeric type.

    Args:
        dtype_str: Type text such as ``"IntegerType()"``,
            ``"DecimalType(10,2)"`` or ``"bigint"``.
    """
    # Drop any "(...)" arguments, then an optional trailing "Type".
    base_name = dtype_str.strip().lower().split("(", 1)[0].strip()
    return base_name.removesuffix("type") in _NUMERIC_TYPE_NAMES
152
+
153
+
154
# Lower-case base names of scalar date/timestamp Spark types, covering both
# the class-style spelling with "Type" removed and the SQL spelling.
_TEMPORAL_TYPE_NAMES = frozenset({
    "date",
    "timestamp",
    "timestampntz",
    "timestamp_ntz",
    "timestamp_ltz",
})


def _is_temporal_type(dtype_str: str) -> bool:
    """Check if a Spark type string represents a scalar date/timestamp type.

    Only the outermost type name is considered, so container types such as
    ``ArrayType(DateType(), True)`` are not temporal even though their text
    mentions a temporal type.

    Args:
        dtype_str: Type text such as ``"DateType()"``, ``"TimestampType()"``
            or ``"timestamp"``.
    """
    # Drop any "(...)" arguments, then an optional trailing "Type".
    base_name = dtype_str.strip().lower().split("(", 1)[0].strip()
    return base_name.removesuffix("type") in _TEMPORAL_TYPE_NAMES
158
+
159
+
160
def handle_stats(registry: dict[str, DataFrame], name: str) -> bytes:
    """Return basic statistics for a DataFrame using single aggregation.

    For every column the null count is computed; numeric columns also get
    min and max (as strings). Results are cached per DataFrame name.

    Returns:
        A JSON response ``{"name", "row_count", "columns": [...]}``, or an
        error response when ``name`` is not registered.
    """
    if name not in registry:
        return encode_error(f"DataFrame '{name}' not found")

    # Return cached stats if available
    cached = _stats_cache.get(name)
    if cached is not None:
        return encode_json_response(cached)

    df = registry[name]
    fields = df.schema.fields

    def quoted(col_name: str):
        # Backtick-quote the name so dots and other special characters are
        # treated as part of the column name, not as nested-field access.
        return F.col("`" + col_name.replace("`", "``") + "`")

    # Build all aggregation expressions in one pass. Aliases are positional
    # so they cannot collide with (or be broken by) real column names.
    agg_exprs = [F.count(F.lit(1)).alias("__total")]
    numeric_flags = []
    for idx, field in enumerate(fields):
        col = quoted(field.name)
        agg_exprs.append(
            F.sum(F.when(col.isNull(), 1).otherwise(0)).alias(f"__nulls_{idx}")
        )
        is_numeric = _is_numeric_type(str(field.dataType))
        numeric_flags.append(is_numeric)
        if is_numeric:
            agg_exprs.append(F.min(col).alias(f"__min_{idx}"))
            agg_exprs.append(F.max(col).alias(f"__max_{idx}"))

    # Single Spark action
    result = df.agg(*agg_exprs).collect()[0]
    row_count = result["__total"]

    # Extract stats from result
    column_stats = []
    for idx, field in enumerate(fields):
        stats = {
            "name": field.name,
            "type": str(field.dataType),
            "nullable": field.nullable,
        }
        # The sum is NULL when the DataFrame has no rows.
        stats["null_count"] = result[f"__nulls_{idx}"] or 0

        if numeric_flags[idx]:
            min_val = result[f"__min_{idx}"]
            max_val = result[f"__max_{idx}"]
            stats["min"] = str(min_val) if min_val is not None else None
            stats["max"] = str(max_val) if max_val is not None else None

        column_stats.append(stats)

    stats_data = {"name": name, "row_count": row_count, "columns": column_stats}
    _stats_cache[name] = stats_data  # Cache for future requests

    return encode_json_response(stats_data)
207
+
208
+
209
def handle_count(registry: dict[str, DataFrame], name: str) -> bytes:
    """Respond with the DataFrame's row count and how long counting took.

    Returns an error response when ``name`` is not registered.
    """
    if name not in registry:
        return encode_error(f"DataFrame '{name}' not found")

    frame = registry[name]
    began = time.perf_counter()
    row_total = frame.count()
    elapsed_ms = int((time.perf_counter() - began) * 1000)

    reply = {"name": name, "count": row_total, "count_ms": elapsed_ms}
    return encode_json_response(reply)
220
+
221
+
222
def handle_join_keys(registry: dict[str, DataFrame], name: str, columns: list[str]) -> bytes:
    """Return key statistics for join analysis.

    Args:
        registry: Registered DataFrames by name.
        name: DataFrame to analyse.
        columns: One or more top-level column names forming the join key.

    Returns:
        A JSON response with the distinct key count (``cardinality``), the
        number of rows in which any key column is NULL (``null_count``),
        the total row count and the compute time; or an error response for
        an unknown frame/column or an empty column list.
    """
    if name not in registry:
        return encode_error(f"DataFrame '{name}' not found")
    if not columns:
        return encode_error("At least one column required")

    df = registry[name]
    available = {f.name for f in df.schema.fields}
    for col in columns:
        if col not in available:
            return encode_error(f"Column '{col}' not found in '{name}'")

    def quoted(col_name: str):
        # Backtick-quote so names containing dots are not parsed as
        # nested-field access.
        return F.col("`" + col_name.replace("`", "``") + "`")

    key_cols = [quoted(c) for c in columns]
    key_expr = key_cols[0] if len(key_cols) == 1 else F.struct(*key_cols)

    # A struct built from columns is itself never NULL, so a composite key
    # is treated as NULL when any of its component columns is NULL.
    null_cond = key_cols[0].isNull()
    for key_col in key_cols[1:]:
        null_cond = null_cond | key_col.isNull()

    start = time.perf_counter()
    result = df.agg(
        F.countDistinct(key_expr).alias("cardinality"),
        F.sum(F.when(null_cond, 1).otherwise(0)).alias("null_count"),
        F.count(F.lit(1)).alias("total_rows"),
    ).collect()[0]
    compute_ms = int((time.perf_counter() - start) * 1000)

    return encode_json_response({
        "frame": name,
        "columns": columns,
        "cardinality": result["cardinality"],
        # The sum is NULL when the DataFrame has no rows.
        "null_count": result["null_count"] or 0,
        "total_rows": result["total_rows"],
        "compute_ms": compute_ms,
    })
253
+
254
+
255
def handle_join_temporal(
    registry: dict[str, DataFrame], name: str, column: str, bucket: str
) -> bytes:
    """Return temporal distribution for join coverage analysis.

    Args:
        registry: Registered DataFrames by name.
        name: DataFrame to analyse.
        column: Top-level date/timestamp column to bucket.
        bucket: One of ``"day"``, ``"week"`` or ``"month"``.

    Returns:
        A JSON response with per-bucket row counts (keyed by the first 10
        characters of the truncated value's string form; NULL buckets are
        omitted), the column's min and max as strings, and ``compute_ms``
        (time of the bucket aggregation only); or an error response for
        invalid input.
    """
    if name not in registry:
        return encode_error(f"DataFrame '{name}' not found")

    df = registry[name]
    field = next((f for f in df.schema.fields if f.name == column), None)
    if field is None:
        return encode_error(f"Column '{column}' not found in '{name}'")

    if not _is_temporal_type(str(field.dataType)):
        return encode_error(f"Column '{column}' is not a temporal type")

    if bucket not in ("day", "week", "month"):
        return encode_error(f"Invalid bucket: {bucket}. Use day, week, or month")

    # Backtick-quote so a name containing dots is not parsed as nested-field
    # access.
    time_col = F.col("`" + column.replace("`", "``") + "`")

    start = time.perf_counter()
    truncated = df.withColumn("__bucket", F.date_trunc(bucket, time_col))
    buckets_df = truncated.groupBy("__bucket").agg(F.count(F.lit(1)).alias("count"))
    bucket_rows = buckets_df.orderBy("__bucket").collect()
    compute_ms = int((time.perf_counter() - start) * 1000)

    min_max = df.agg(
        F.min(time_col).alias("min_val"),
        F.max(time_col).alias("max_val"),
    ).collect()[0]
    min_val = min_max["min_val"]
    max_val = min_max["max_val"]

    buckets = {
        str(row["__bucket"])[:10]: row["count"]
        for row in bucket_rows
        if row["__bucket"] is not None
    }

    return encode_json_response({
        "frame": name,
        "column": column,
        "bucket_size": bucket,
        "min": str(min_val) if min_val is not None else None,
        "max": str(max_val) if max_val is not None else None,
        "buckets": buckets,
        "compute_ms": compute_ms,
    })
299
+
300
+
301
def handle_join_overlap(
    registry: dict[str, DataFrame],
    frame1: str, frame2: str,
    cols1: list[str], cols2: list[str]
) -> bytes:
    """Return key overlap statistics between two frames.

    Args:
        registry: Registered DataFrames by name.
        frame1: Name of the left DataFrame.
        frame2: Name of the right DataFrame.
        cols1: Key columns of ``frame1``.
        cols2: Key columns of ``frame2`` (same length as ``cols1``).

    Returns:
        A JSON response with the distinct key counts of each side, the
        number of keys found on both sides / only left / only right,
        ``overlap_pct`` (shared keys relative to the larger side, rounded
        to 2 decimals) and the compute time; or an error response for
        invalid input.
    """
    if frame1 not in registry:
        return encode_error(f"DataFrame '{frame1}' not found")
    if frame2 not in registry:
        return encode_error(f"DataFrame '{frame2}' not found")
    if len(cols1) != len(cols2):
        return encode_error("Column count mismatch")
    if not cols1:
        return encode_error("At least one column required per frame")

    df1 = registry[frame1]
    df2 = registry[frame2]

    names1 = {f.name for f in df1.schema.fields}
    for col in cols1:
        if col not in names1:
            return encode_error(f"Column '{col}' not found in '{frame1}'")
    names2 = {f.name for f in df2.schema.fields}
    for col in cols2:
        if col not in names2:
            return encode_error(f"Column '{col}' not found in '{frame2}'")

    def quoted(col_name: str):
        # Backtick-quote so names containing dots are not parsed as
        # nested-field access.
        return F.col("`" + col_name.replace("`", "``") + "`")

    def key_of(cols: list[str]):
        # Single-column keys are compared directly, composite keys as a struct.
        parts = [quoted(c) for c in cols]
        return parts[0] if len(parts) == 1 else F.struct(*parts)

    start = time.perf_counter()

    keys1 = df1.select(key_of(cols1).alias("key")).distinct()
    keys2 = df2.select(key_of(cols2).alias("key")).distinct()

    count1 = keys1.count()
    count2 = keys2.count()
    both = keys1.join(keys2, "key", "inner").count()
    left_only = count1 - both
    right_only = count2 - both

    compute_ms = int((time.perf_counter() - start) * 1000)

    larger_side = max(count1, count2)
    overlap_pct = (both / larger_side * 100) if larger_side > 0 else 0.0

    return encode_json_response({
        "frame1": frame1,
        "frame2": frame2,
        "cols1": cols1,
        "cols2": cols2,
        "left_total": count1,
        "right_total": count2,
        "left_only": left_only,
        "right_only": right_only,
        "both": both,
        "overlap_pct": round(overlap_pct, 2),
        "compute_ms": compute_ms,
    })
359
+
360
+
361
def dispatch_command(
    registry: dict[str, DataFrame], command: str
) -> bytes:
    """Route a textual command to its handler and return the encoded response.

    Recognized forms: ``LIST``, ``SCHEMA:name``, ``GET:name:limit``,
    ``STATS:name``, ``COUNT:name``, ``JOIN_KEYS:name:col1,col2,...``,
    ``JOIN_TEMPORAL:name:column:bucket`` and
    ``JOIN_OVERLAP:f1:f2:cols1:cols2``. Anything else, or a malformed
    argument list, yields an error response.
    """
    command = command.strip()

    if command == "LIST":
        return handle_list(registry)

    # Everything before the first ":" selects the handler; the rest is its
    # argument string.
    verb, colon, argument = command.partition(":")
    if not colon:
        return encode_error(f"Unknown command: {command}")

    if verb == "SCHEMA":
        return handle_schema(registry, argument)

    if verb == "STATS":
        return handle_stats(registry, argument)

    if verb == "COUNT":
        return handle_count(registry, argument)

    pieces = argument.split(":")

    if verb == "GET":
        if len(pieces) != 2:
            return encode_error("Invalid GET format. Use GET:name:limit")
        name, limit_str = pieces
        try:
            limit = int(limit_str)
        except ValueError:
            return encode_error(f"Invalid limit: {limit_str}")
        return handle_get(registry, name, limit)

    if verb == "JOIN_KEYS":
        if len(pieces) != 2:
            return encode_error("Invalid JOIN_KEYS format. Use JOIN_KEYS:name:col1,col2,...")
        name, cols_str = pieces
        columns = [part.strip() for part in cols_str.split(",") if part.strip()]
        if not columns:
            return encode_error("At least one column required")
        return handle_join_keys(registry, name, columns)

    if verb == "JOIN_TEMPORAL":
        if len(pieces) != 3:
            return encode_error("Invalid JOIN_TEMPORAL format. Use JOIN_TEMPORAL:name:column:bucket")
        name, column, bucket = pieces
        return handle_join_temporal(registry, name, column, bucket)

    if verb == "JOIN_OVERLAP":
        if len(pieces) != 4:
            return encode_error("Invalid JOIN_OVERLAP format. Use JOIN_OVERLAP:f1:f2:cols1:cols2")
        frame1, frame2, cols1_str, cols2_str = pieces
        cols1 = [part.strip() for part in cols1_str.split(",") if part.strip()]
        cols2 = [part.strip() for part in cols2_str.split(",") if part.strip()]
        if not cols1 or not cols2:
            return encode_error("At least one column required per frame")
        return handle_join_overlap(registry, frame1, frame2, cols1, cols2)

    return encode_error(f"Unknown command: {command}")