mangleframes 0.1.3__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mangleframes-0.1.3 → mangleframes-0.1.6}/PKG-INFO +5 -2
- {mangleframes-0.1.3 → mangleframes-0.1.6}/pyproject.toml +5 -2
- mangleframes-0.1.6/python/mangleframes/__init__.py +128 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/python/mangleframes/launcher.py +7 -5
- mangleframes-0.1.6/python/mangleframes/protocol.py +421 -0
- mangleframes-0.1.6/viewer/src/arrow_reader.rs +99 -0
- mangleframes-0.1.6/viewer/src/handlers.rs +431 -0
- mangleframes-0.1.6/viewer/src/history_analysis.rs +298 -0
- mangleframes-0.1.6/viewer/src/history_handlers.rs +148 -0
- mangleframes-0.1.6/viewer/src/join_analysis.rs +324 -0
- mangleframes-0.1.6/viewer/src/join_handlers.rs +236 -0
- mangleframes-0.1.6/viewer/src/main.rs +142 -0
- mangleframes-0.1.6/viewer/src/perf.rs +178 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/socket_client.rs +57 -3
- mangleframes-0.1.6/viewer/src/web_server.rs +130 -0
- mangleframes-0.1.6/viewer/static/app.js +1205 -0
- mangleframes-0.1.6/viewer/static/index.html +247 -0
- mangleframes-0.1.6/viewer/static/style.css +951 -0
- mangleframes-0.1.3/python/mangleframes/__init__.py +0 -70
- mangleframes-0.1.3/python/mangleframes/protocol.py +0 -163
- mangleframes-0.1.3/viewer/src/arrow_reader.rs +0 -132
- mangleframes-0.1.3/viewer/src/handlers.rs +0 -190
- mangleframes-0.1.3/viewer/src/main.rs +0 -69
- mangleframes-0.1.3/viewer/src/web_server.rs +0 -63
- mangleframes-0.1.3/viewer/static/app.js +0 -330
- mangleframes-0.1.3/viewer/static/index.html +0 -60
- mangleframes-0.1.3/viewer/static/style.css +0 -242
- {mangleframes-0.1.3 → mangleframes-0.1.6}/Cargo.lock +0 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/Cargo.toml +0 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/python/mangleframes/server.py +0 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/Cargo.toml +0 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/export.rs +0 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/query_engine.rs +0 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/stats.rs +0 -0
- {mangleframes-0.1.3 → mangleframes-0.1.6}/viewer/src/websocket.rs +0 -0
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mangleframes
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Classifier: Programming Language :: Python :: 3
|
|
5
5
|
Classifier: Programming Language :: Rust
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Requires-Dist: databricks-connect>=16.1.7
|
|
8
|
+
Requires-Dist: loguru>=0.7.3
|
|
9
|
+
Requires-Dist: maturin>=1.11.2
|
|
7
10
|
Requires-Dist: pyarrow>=11.0.0
|
|
8
11
|
Requires-Dist: pytest>=7.0 ; extra == 'dev'
|
|
9
12
|
Requires-Dist: maturin>=1.4 ; extra == 'dev'
|
|
@@ -13,4 +16,4 @@ Provides-Extra: spark
|
|
|
13
16
|
License-File: LICENSE
|
|
14
17
|
Summary: PySpark DataFrame viewer with modern web UI
|
|
15
18
|
License: MIT
|
|
16
|
-
Requires-Python: >=3.
|
|
19
|
+
Requires-Python: >=3.11
|
|
@@ -4,9 +4,9 @@ build-backend = "maturin"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mangleframes"
|
|
7
|
-
version = "0.1.3"
|
|
7
|
+
version = "0.1.6"
|
|
8
8
|
description = "PySpark DataFrame viewer with modern web UI"
|
|
9
|
-
requires-python = ">=3.
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
10
|
license = { text = "MIT" }
|
|
11
11
|
classifiers = [
|
|
12
12
|
"Programming Language :: Python :: 3",
|
|
@@ -14,6 +14,9 @@ classifiers = [
|
|
|
14
14
|
"License :: OSI Approved :: MIT License",
|
|
15
15
|
]
|
|
16
16
|
dependencies = [
|
|
17
|
+
"databricks-connect>=16.1.7",
|
|
18
|
+
"loguru>=0.7.3",
|
|
19
|
+
"maturin>=1.11.2",
|
|
17
20
|
"pyarrow>=11.0.0",
|
|
18
21
|
]
|
|
19
22
|
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""MangleFrames - PySpark DataFrame viewer with modern web UI."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import atexit
|
|
5
|
+
import os
|
|
6
|
+
import subprocess
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from .launcher import launch_viewer
|
|
13
|
+
from .protocol import clear_arrow_cache, clear_stats_cache, prefetch_frame
|
|
14
|
+
from .server import DataFrameServer
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from pyspark.sql import DataFrame
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.6"
|
|
20
|
+
__all__ = ["register", "unregister", "show", "cleanup"]
|
|
21
|
+
|
|
22
|
+
_registry: dict[str, DataFrame] = {}
|
|
23
|
+
_server: DataFrameServer | None = None
|
|
24
|
+
_viewer_process: subprocess.Popen | None = None
|
|
25
|
+
_cleanup_registered = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _clean_stale_sockets() -> None:
|
|
29
|
+
"""Remove socket files from PIDs that no longer exist."""
|
|
30
|
+
socket_dir = Path("/tmp")
|
|
31
|
+
for sock in socket_dir.glob("mangleframes-*.sock"):
|
|
32
|
+
try:
|
|
33
|
+
pid_str = sock.stem.split("-")[1]
|
|
34
|
+
pid = int(pid_str)
|
|
35
|
+
if not _pid_exists(pid):
|
|
36
|
+
sock.unlink()
|
|
37
|
+
except (IndexError, ValueError, OSError):
|
|
38
|
+
pass
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _pid_exists(pid: int) -> bool:
|
|
42
|
+
"""Check if a process with given PID exists."""
|
|
43
|
+
try:
|
|
44
|
+
os.kill(pid, 0)
|
|
45
|
+
return True
|
|
46
|
+
except OSError:
|
|
47
|
+
return False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def cleanup() -> None:
|
|
51
|
+
"""Clean up all MangleFrames resources."""
|
|
52
|
+
global _server, _viewer_process, _registry
|
|
53
|
+
|
|
54
|
+
if _viewer_process is not None:
|
|
55
|
+
_viewer_process.terminate()
|
|
56
|
+
try:
|
|
57
|
+
_viewer_process.wait(timeout=2.0)
|
|
58
|
+
except subprocess.TimeoutExpired:
|
|
59
|
+
_viewer_process.kill()
|
|
60
|
+
_viewer_process = None
|
|
61
|
+
|
|
62
|
+
if _server is not None:
|
|
63
|
+
_server.stop()
|
|
64
|
+
_server = None
|
|
65
|
+
|
|
66
|
+
_registry.clear()
|
|
67
|
+
clear_stats_cache()
|
|
68
|
+
clear_arrow_cache()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def register(name: str, df: DataFrame) -> None:
|
|
72
|
+
"""Register a DataFrame for viewing."""
|
|
73
|
+
global _server, _cleanup_registered
|
|
74
|
+
|
|
75
|
+
if not _cleanup_registered:
|
|
76
|
+
atexit.register(cleanup)
|
|
77
|
+
_cleanup_registered = True
|
|
78
|
+
_clean_stale_sockets()
|
|
79
|
+
|
|
80
|
+
_registry[name] = df
|
|
81
|
+
clear_stats_cache(name)
|
|
82
|
+
clear_arrow_cache(name)
|
|
83
|
+
|
|
84
|
+
def do_prefetch() -> None:
|
|
85
|
+
prefetch_frame(_registry, name, limit=10000)
|
|
86
|
+
|
|
87
|
+
threading.Thread(target=do_prefetch, daemon=True).start()
|
|
88
|
+
|
|
89
|
+
if _server is None:
|
|
90
|
+
_server = DataFrameServer(_registry)
|
|
91
|
+
_server.start()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def unregister(name: str) -> None:
|
|
95
|
+
"""Remove a DataFrame from the viewer."""
|
|
96
|
+
if name in _registry:
|
|
97
|
+
del _registry[name]
|
|
98
|
+
clear_stats_cache(name)
|
|
99
|
+
clear_arrow_cache(name)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def show(port: int = 8765, block: bool = True) -> None:
|
|
103
|
+
"""Open the viewer in a browser, launching the viewer if needed.
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
port: Port for the viewer web server.
|
|
107
|
+
block: If True, block until Ctrl+C (keeps server alive).
|
|
108
|
+
"""
|
|
109
|
+
global _viewer_process, _server
|
|
110
|
+
|
|
111
|
+
if _server is None or not _server.is_running:
|
|
112
|
+
if not _registry:
|
|
113
|
+
raise RuntimeError("No DataFrames registered. Call register() first.")
|
|
114
|
+
_server = DataFrameServer(_registry)
|
|
115
|
+
_server.start()
|
|
116
|
+
|
|
117
|
+
if _viewer_process is None or _viewer_process.poll() is not None:
|
|
118
|
+
_viewer_process = launch_viewer(_server.socket_path, port)
|
|
119
|
+
|
|
120
|
+
if block:
|
|
121
|
+
try:
|
|
122
|
+
print(f"MangleFrames viewer running at http://localhost:{port}")
|
|
123
|
+
print("Press Ctrl+C to stop...")
|
|
124
|
+
while True:
|
|
125
|
+
time.sleep(1)
|
|
126
|
+
except KeyboardInterrupt:
|
|
127
|
+
print("\nStopping MangleFrames...")
|
|
128
|
+
cleanup()
|
|
@@ -4,17 +4,22 @@ from __future__ import annotations
|
|
|
4
4
|
import os
|
|
5
5
|
import shutil
|
|
6
6
|
import subprocess
|
|
7
|
-
import webbrowser
|
|
8
7
|
from pathlib import Path
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def find_viewer_binary() -> Path | None:
|
|
12
11
|
"""Find the mangleframes-viewer binary."""
|
|
12
|
+
import sys
|
|
13
|
+
|
|
13
14
|
pkg_dir = Path(__file__).parent
|
|
14
15
|
pkg_binary = pkg_dir / "bin" / "mangleframes-viewer"
|
|
15
16
|
if pkg_binary.exists():
|
|
16
17
|
return pkg_binary
|
|
17
18
|
|
|
19
|
+
venv_binary = Path(sys.executable).parent / "mangleframes-viewer"
|
|
20
|
+
if venv_binary.exists():
|
|
21
|
+
return venv_binary
|
|
22
|
+
|
|
18
23
|
path_binary = shutil.which("mangleframes-viewer")
|
|
19
24
|
if path_binary:
|
|
20
25
|
return Path(path_binary)
|
|
@@ -35,13 +40,10 @@ def launch_viewer(socket_path: Path, port: int = 8765) -> subprocess.Popen:
|
|
|
35
40
|
env["RUST_LOG"] = env.get("RUST_LOG", "info")
|
|
36
41
|
|
|
37
42
|
return subprocess.Popen(
|
|
38
|
-
[str(binary), "--socket", str(socket_path), "--port", str(port)],
|
|
43
|
+
[str(binary), "--socket", str(socket_path), "--port", str(port), "--no-browser"],
|
|
39
44
|
env=env,
|
|
40
45
|
stdout=subprocess.DEVNULL,
|
|
41
46
|
stderr=subprocess.DEVNULL,
|
|
42
47
|
)
|
|
43
48
|
|
|
44
49
|
|
|
45
|
-
def open_browser(port: int = 8765) -> None:
|
|
46
|
-
"""Open the viewer in the default browser."""
|
|
47
|
-
webbrowser.open(f"http://localhost:{port}")
|
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
"""Protocol handlers for DataFrame server commands."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import struct
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
from pyspark.sql import functions as F
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from pyspark.sql import DataFrame
|
|
15
|
+
|
|
16
|
+
STATUS_OK = 0
|
|
17
|
+
STATUS_ERROR = 1
|
|
18
|
+
|
|
19
|
+
# Cache for computed stats (cleared when DataFrame is re-registered)
|
|
20
|
+
_stats_cache: dict[str, dict] = {}
|
|
21
|
+
|
|
22
|
+
# Cache for prefetched Arrow data: name -> (limit, payload_bytes)
|
|
23
|
+
_arrow_cache: dict[str, tuple[int, bytes]] = {}
|
|
24
|
+
_arrow_cache_lock = threading.Lock()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def clear_stats_cache(name: str | None = None) -> None:
|
|
28
|
+
"""Clear cached stats for a DataFrame or all DataFrames."""
|
|
29
|
+
if name is None:
|
|
30
|
+
_stats_cache.clear()
|
|
31
|
+
elif name in _stats_cache:
|
|
32
|
+
del _stats_cache[name]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def clear_arrow_cache(name: str | None = None) -> None:
|
|
36
|
+
"""Clear cached Arrow data for a DataFrame or all DataFrames."""
|
|
37
|
+
with _arrow_cache_lock:
|
|
38
|
+
if name is None:
|
|
39
|
+
_arrow_cache.clear()
|
|
40
|
+
elif name in _arrow_cache:
|
|
41
|
+
del _arrow_cache[name]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _serialize_arrow_ipc(table: pa.Table) -> tuple[bytes, int]:
|
|
45
|
+
"""Serialize Arrow table to IPC format, returning bytes and timing in ms."""
|
|
46
|
+
start = time.perf_counter()
|
|
47
|
+
sink = pa.BufferOutputStream()
|
|
48
|
+
with pa.ipc.RecordBatchStreamWriter(sink, table.schema) as writer:
|
|
49
|
+
for batch in table.to_batches():
|
|
50
|
+
writer.write_batch(batch)
|
|
51
|
+
ipc_ms = int((time.perf_counter() - start) * 1000)
|
|
52
|
+
return sink.getvalue().to_pybytes(), ipc_ms
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def prefetch_frame(registry: dict[str, DataFrame], name: str, limit: int = 10000) -> bool:
|
|
56
|
+
"""Prefetch DataFrame as Arrow IPC bytes in background.
|
|
57
|
+
|
|
58
|
+
Returns True if prefetch succeeded, False otherwise.
|
|
59
|
+
"""
|
|
60
|
+
if name not in registry:
|
|
61
|
+
return False
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
df = registry[name]
|
|
65
|
+
start = time.perf_counter()
|
|
66
|
+
limited_df = df.limit(limit) if limit > 0 else df
|
|
67
|
+
table = limited_df.toArrow()
|
|
68
|
+
spark_ms = int((time.perf_counter() - start) * 1000)
|
|
69
|
+
total_rows = table.num_rows
|
|
70
|
+
|
|
71
|
+
arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
|
|
72
|
+
# 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
|
|
73
|
+
payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes
|
|
74
|
+
|
|
75
|
+
with _arrow_cache_lock:
|
|
76
|
+
_arrow_cache[name] = (limit, payload)
|
|
77
|
+
|
|
78
|
+
return True
|
|
79
|
+
except Exception:
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def encode_response(status: int, payload: bytes) -> bytes:
|
|
84
|
+
"""Encode response with status and length prefix."""
|
|
85
|
+
return struct.pack(">II", status, len(payload)) + payload
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def encode_json_response(data: Any) -> bytes:
|
|
89
|
+
"""Encode JSON data as successful response."""
|
|
90
|
+
return encode_response(STATUS_OK, json.dumps(data).encode("utf-8"))
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def encode_error(message: str) -> bytes:
|
|
94
|
+
"""Encode error message response."""
|
|
95
|
+
return encode_response(STATUS_ERROR, message.encode("utf-8"))
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def handle_list(registry: dict[str, DataFrame]) -> bytes:
|
|
99
|
+
"""Return list of registered DataFrame names."""
|
|
100
|
+
return encode_json_response(list(registry.keys()))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def handle_schema(registry: dict[str, DataFrame], name: str) -> bytes:
|
|
104
|
+
"""Return schema of a DataFrame as JSON."""
|
|
105
|
+
if name not in registry:
|
|
106
|
+
return encode_error(f"DataFrame '{name}' not found")
|
|
107
|
+
|
|
108
|
+
df = registry[name]
|
|
109
|
+
columns = [
|
|
110
|
+
{"name": field.name, "type": str(field.dataType), "nullable": field.nullable}
|
|
111
|
+
for field in df.schema.fields
|
|
112
|
+
]
|
|
113
|
+
return encode_json_response({"name": name, "columns": columns})
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def handle_get(registry: dict[str, DataFrame], name: str, limit: int) -> bytes:
|
|
117
|
+
"""Return DataFrame data as Arrow IPC stream with timing info."""
|
|
118
|
+
if name not in registry:
|
|
119
|
+
return encode_error(f"DataFrame '{name}' not found")
|
|
120
|
+
|
|
121
|
+
# Check cache first - return cached data if limit is sufficient
|
|
122
|
+
with _arrow_cache_lock:
|
|
123
|
+
if name in _arrow_cache:
|
|
124
|
+
cached_limit, cached_payload = _arrow_cache[name]
|
|
125
|
+
if cached_limit >= limit:
|
|
126
|
+
return encode_response(STATUS_OK, cached_payload)
|
|
127
|
+
|
|
128
|
+
# Cache miss or insufficient limit - materialize from Spark
|
|
129
|
+
df = registry[name]
|
|
130
|
+
|
|
131
|
+
start = time.perf_counter()
|
|
132
|
+
limited_df = df.limit(limit) if limit > 0 else df
|
|
133
|
+
table = limited_df.toArrow()
|
|
134
|
+
spark_ms = int((time.perf_counter() - start) * 1000)
|
|
135
|
+
total_rows = table.num_rows
|
|
136
|
+
|
|
137
|
+
arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
|
|
138
|
+
# 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
|
|
139
|
+
payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes
|
|
140
|
+
|
|
141
|
+
# Cache result for future requests
|
|
142
|
+
with _arrow_cache_lock:
|
|
143
|
+
_arrow_cache[name] = (limit, payload)
|
|
144
|
+
|
|
145
|
+
return encode_response(STATUS_OK, payload)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _is_numeric_type(dtype_str: str) -> bool:
|
|
149
|
+
"""Check if a Spark type string represents a numeric type."""
|
|
150
|
+
dtype_lower = dtype_str.lower()
|
|
151
|
+
return any(t in dtype_lower for t in ["int", "long", "double", "float", "decimal"])
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _is_temporal_type(dtype_str: str) -> bool:
|
|
155
|
+
"""Check if a Spark type string represents a temporal type."""
|
|
156
|
+
dtype_lower = dtype_str.lower()
|
|
157
|
+
return any(t in dtype_lower for t in ["date", "timestamp"])
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def handle_stats(registry: dict[str, DataFrame], name: str) -> bytes:
|
|
161
|
+
"""Return basic statistics for a DataFrame using single aggregation."""
|
|
162
|
+
if name not in registry:
|
|
163
|
+
return encode_error(f"DataFrame '{name}' not found")
|
|
164
|
+
|
|
165
|
+
# Return cached stats if available
|
|
166
|
+
if name in _stats_cache:
|
|
167
|
+
return encode_json_response(_stats_cache[name])
|
|
168
|
+
|
|
169
|
+
df = registry[name]
|
|
170
|
+
fields = df.schema.fields
|
|
171
|
+
|
|
172
|
+
# Build all aggregation expressions in one pass
|
|
173
|
+
agg_exprs = [F.count(F.lit(1)).alias("__total")]
|
|
174
|
+
for field in fields:
|
|
175
|
+
col_name = field.name
|
|
176
|
+
agg_exprs.append(
|
|
177
|
+
F.sum(F.when(F.col(col_name).isNull(), 1).otherwise(0)).alias(f"{col_name}__nulls")
|
|
178
|
+
)
|
|
179
|
+
if _is_numeric_type(str(field.dataType)):
|
|
180
|
+
agg_exprs.append(F.min(col_name).alias(f"{col_name}__min"))
|
|
181
|
+
agg_exprs.append(F.max(col_name).alias(f"{col_name}__max"))
|
|
182
|
+
|
|
183
|
+
# Single Spark action
|
|
184
|
+
result = df.agg(*agg_exprs).collect()[0]
|
|
185
|
+
row_count = result["__total"]
|
|
186
|
+
|
|
187
|
+
# Extract stats from result
|
|
188
|
+
column_stats = []
|
|
189
|
+
for field in fields:
|
|
190
|
+
col_name = field.name
|
|
191
|
+
dtype_str = str(field.dataType)
|
|
192
|
+
stats = {"name": col_name, "type": dtype_str, "nullable": field.nullable}
|
|
193
|
+
stats["null_count"] = result[f"{col_name}__nulls"] or 0
|
|
194
|
+
|
|
195
|
+
if _is_numeric_type(dtype_str):
|
|
196
|
+
min_val = result[f"{col_name}__min"]
|
|
197
|
+
max_val = result[f"{col_name}__max"]
|
|
198
|
+
stats["min"] = str(min_val) if min_val is not None else None
|
|
199
|
+
stats["max"] = str(max_val) if max_val is not None else None
|
|
200
|
+
|
|
201
|
+
column_stats.append(stats)
|
|
202
|
+
|
|
203
|
+
stats_data = {"name": name, "row_count": row_count, "columns": column_stats}
|
|
204
|
+
_stats_cache[name] = stats_data # Cache for future requests
|
|
205
|
+
|
|
206
|
+
return encode_json_response(stats_data)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def handle_count(registry: dict[str, DataFrame], name: str) -> bytes:
|
|
210
|
+
"""Return total row count without transferring data."""
|
|
211
|
+
if name not in registry:
|
|
212
|
+
return encode_error(f"DataFrame '{name}' not found")
|
|
213
|
+
|
|
214
|
+
df = registry[name]
|
|
215
|
+
start = time.perf_counter()
|
|
216
|
+
count = df.count()
|
|
217
|
+
count_ms = int((time.perf_counter() - start) * 1000)
|
|
218
|
+
|
|
219
|
+
return encode_json_response({"name": name, "count": count, "count_ms": count_ms})
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def handle_join_keys(registry: dict[str, DataFrame], name: str, columns: list[str]) -> bytes:
|
|
223
|
+
"""Return key statistics for join analysis."""
|
|
224
|
+
if name not in registry:
|
|
225
|
+
return encode_error(f"DataFrame '{name}' not found")
|
|
226
|
+
|
|
227
|
+
df = registry[name]
|
|
228
|
+
for col in columns:
|
|
229
|
+
if col not in [f.name for f in df.schema.fields]:
|
|
230
|
+
return encode_error(f"Column '{col}' not found in '{name}'")
|
|
231
|
+
|
|
232
|
+
if len(columns) == 1:
|
|
233
|
+
key_expr = F.col(columns[0])
|
|
234
|
+
else:
|
|
235
|
+
key_expr = F.struct(*[F.col(c) for c in columns])
|
|
236
|
+
|
|
237
|
+
start = time.perf_counter()
|
|
238
|
+
result = df.agg(
|
|
239
|
+
F.countDistinct(key_expr).alias("cardinality"),
|
|
240
|
+
F.sum(F.when(key_expr.isNull(), 1).otherwise(0)).alias("null_count"),
|
|
241
|
+
F.count(F.lit(1)).alias("total_rows")
|
|
242
|
+
).collect()[0]
|
|
243
|
+
compute_ms = int((time.perf_counter() - start) * 1000)
|
|
244
|
+
|
|
245
|
+
return encode_json_response({
|
|
246
|
+
"frame": name,
|
|
247
|
+
"columns": columns,
|
|
248
|
+
"cardinality": result["cardinality"],
|
|
249
|
+
"null_count": result["null_count"] or 0,
|
|
250
|
+
"total_rows": result["total_rows"],
|
|
251
|
+
"compute_ms": compute_ms
|
|
252
|
+
})
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def handle_join_temporal(
|
|
256
|
+
registry: dict[str, DataFrame], name: str, column: str, bucket: str
|
|
257
|
+
) -> bytes:
|
|
258
|
+
"""Return temporal distribution for join coverage analysis."""
|
|
259
|
+
if name not in registry:
|
|
260
|
+
return encode_error(f"DataFrame '{name}' not found")
|
|
261
|
+
|
|
262
|
+
df = registry[name]
|
|
263
|
+
if column not in [f.name for f in df.schema.fields]:
|
|
264
|
+
return encode_error(f"Column '{column}' not found in '{name}'")
|
|
265
|
+
|
|
266
|
+
field = next(f for f in df.schema.fields if f.name == column)
|
|
267
|
+
if not _is_temporal_type(str(field.dataType)):
|
|
268
|
+
return encode_error(f"Column '{column}' is not a temporal type")
|
|
269
|
+
|
|
270
|
+
if bucket not in ("day", "week", "month"):
|
|
271
|
+
return encode_error(f"Invalid bucket: {bucket}. Use day, week, or month")
|
|
272
|
+
|
|
273
|
+
start = time.perf_counter()
|
|
274
|
+
truncated = df.withColumn("__bucket", F.date_trunc(bucket, F.col(column)))
|
|
275
|
+
buckets_df = truncated.groupBy("__bucket").agg(F.count(F.lit(1)).alias("count"))
|
|
276
|
+
bucket_rows = buckets_df.orderBy("__bucket").collect()
|
|
277
|
+
compute_ms = int((time.perf_counter() - start) * 1000)
|
|
278
|
+
|
|
279
|
+
min_max = df.agg(
|
|
280
|
+
F.min(column).alias("min_val"),
|
|
281
|
+
F.max(column).alias("max_val")
|
|
282
|
+
).collect()[0]
|
|
283
|
+
|
|
284
|
+
buckets = {}
|
|
285
|
+
for row in bucket_rows:
|
|
286
|
+
if row["__bucket"] is not None:
|
|
287
|
+
bucket_key = str(row["__bucket"])[:10]
|
|
288
|
+
buckets[bucket_key] = row["count"]
|
|
289
|
+
|
|
290
|
+
return encode_json_response({
|
|
291
|
+
"frame": name,
|
|
292
|
+
"column": column,
|
|
293
|
+
"bucket_size": bucket,
|
|
294
|
+
"min": str(min_max["min_val"]) if min_max["min_val"] else None,
|
|
295
|
+
"max": str(min_max["max_val"]) if min_max["max_val"] else None,
|
|
296
|
+
"buckets": buckets,
|
|
297
|
+
"compute_ms": compute_ms
|
|
298
|
+
})
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def handle_join_overlap(
|
|
302
|
+
registry: dict[str, DataFrame],
|
|
303
|
+
frame1: str, frame2: str,
|
|
304
|
+
cols1: list[str], cols2: list[str]
|
|
305
|
+
) -> bytes:
|
|
306
|
+
"""Return key overlap statistics between two frames."""
|
|
307
|
+
if frame1 not in registry:
|
|
308
|
+
return encode_error(f"DataFrame '{frame1}' not found")
|
|
309
|
+
if frame2 not in registry:
|
|
310
|
+
return encode_error(f"DataFrame '{frame2}' not found")
|
|
311
|
+
if len(cols1) != len(cols2):
|
|
312
|
+
return encode_error("Column count mismatch")
|
|
313
|
+
|
|
314
|
+
df1 = registry[frame1]
|
|
315
|
+
df2 = registry[frame2]
|
|
316
|
+
|
|
317
|
+
for col in cols1:
|
|
318
|
+
if col not in [f.name for f in df1.schema.fields]:
|
|
319
|
+
return encode_error(f"Column '{col}' not found in '{frame1}'")
|
|
320
|
+
for col in cols2:
|
|
321
|
+
if col not in [f.name for f in df2.schema.fields]:
|
|
322
|
+
return encode_error(f"Column '{col}' not found in '{frame2}'")
|
|
323
|
+
|
|
324
|
+
start = time.perf_counter()
|
|
325
|
+
|
|
326
|
+
if len(cols1) == 1:
|
|
327
|
+
key1 = F.col(cols1[0])
|
|
328
|
+
key2 = F.col(cols2[0])
|
|
329
|
+
else:
|
|
330
|
+
key1 = F.struct(*[F.col(c) for c in cols1])
|
|
331
|
+
key2 = F.struct(*[F.col(c) for c in cols2])
|
|
332
|
+
|
|
333
|
+
keys1 = df1.select(key1.alias("key")).distinct()
|
|
334
|
+
keys2 = df2.select(key2.alias("key")).distinct()
|
|
335
|
+
|
|
336
|
+
count1 = keys1.count()
|
|
337
|
+
count2 = keys2.count()
|
|
338
|
+
both = keys1.join(keys2, "key", "inner").count()
|
|
339
|
+
left_only = count1 - both
|
|
340
|
+
right_only = count2 - both
|
|
341
|
+
|
|
342
|
+
compute_ms = int((time.perf_counter() - start) * 1000)
|
|
343
|
+
|
|
344
|
+
overlap_pct = (both / max(count1, count2) * 100) if max(count1, count2) > 0 else 0.0
|
|
345
|
+
|
|
346
|
+
return encode_json_response({
|
|
347
|
+
"frame1": frame1,
|
|
348
|
+
"frame2": frame2,
|
|
349
|
+
"cols1": cols1,
|
|
350
|
+
"cols2": cols2,
|
|
351
|
+
"left_total": count1,
|
|
352
|
+
"right_total": count2,
|
|
353
|
+
"left_only": left_only,
|
|
354
|
+
"right_only": right_only,
|
|
355
|
+
"both": both,
|
|
356
|
+
"overlap_pct": round(overlap_pct, 2),
|
|
357
|
+
"compute_ms": compute_ms
|
|
358
|
+
})
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def dispatch_command(
|
|
362
|
+
registry: dict[str, DataFrame], command: str
|
|
363
|
+
) -> bytes:
|
|
364
|
+
"""Parse and dispatch a command to the appropriate handler."""
|
|
365
|
+
command = command.strip()
|
|
366
|
+
|
|
367
|
+
if command == "LIST":
|
|
368
|
+
return handle_list(registry)
|
|
369
|
+
|
|
370
|
+
if command.startswith("SCHEMA:"):
|
|
371
|
+
name = command[7:]
|
|
372
|
+
return handle_schema(registry, name)
|
|
373
|
+
|
|
374
|
+
if command.startswith("GET:"):
|
|
375
|
+
parts = command[4:].split(":")
|
|
376
|
+
if len(parts) != 2:
|
|
377
|
+
return encode_error("Invalid GET format. Use GET:name:limit")
|
|
378
|
+
name, limit_str = parts
|
|
379
|
+
try:
|
|
380
|
+
limit = int(limit_str)
|
|
381
|
+
except ValueError:
|
|
382
|
+
return encode_error(f"Invalid limit: {limit_str}")
|
|
383
|
+
return handle_get(registry, name, limit)
|
|
384
|
+
|
|
385
|
+
if command.startswith("STATS:"):
|
|
386
|
+
name = command[6:]
|
|
387
|
+
return handle_stats(registry, name)
|
|
388
|
+
|
|
389
|
+
if command.startswith("COUNT:"):
|
|
390
|
+
name = command[6:]
|
|
391
|
+
return handle_count(registry, name)
|
|
392
|
+
|
|
393
|
+
if command.startswith("JOIN_KEYS:"):
|
|
394
|
+
parts = command[10:].split(":")
|
|
395
|
+
if len(parts) != 2:
|
|
396
|
+
return encode_error("Invalid JOIN_KEYS format. Use JOIN_KEYS:name:col1,col2,...")
|
|
397
|
+
name, cols_str = parts
|
|
398
|
+
columns = [c.strip() for c in cols_str.split(",") if c.strip()]
|
|
399
|
+
if not columns:
|
|
400
|
+
return encode_error("At least one column required")
|
|
401
|
+
return handle_join_keys(registry, name, columns)
|
|
402
|
+
|
|
403
|
+
if command.startswith("JOIN_TEMPORAL:"):
|
|
404
|
+
parts = command[14:].split(":")
|
|
405
|
+
if len(parts) != 3:
|
|
406
|
+
return encode_error("Invalid JOIN_TEMPORAL format. Use JOIN_TEMPORAL:name:column:bucket")
|
|
407
|
+
name, column, bucket = parts
|
|
408
|
+
return handle_join_temporal(registry, name, column, bucket)
|
|
409
|
+
|
|
410
|
+
if command.startswith("JOIN_OVERLAP:"):
|
|
411
|
+
parts = command[13:].split(":")
|
|
412
|
+
if len(parts) != 4:
|
|
413
|
+
return encode_error("Invalid JOIN_OVERLAP format. Use JOIN_OVERLAP:f1:f2:cols1:cols2")
|
|
414
|
+
frame1, frame2, cols1_str, cols2_str = parts
|
|
415
|
+
cols1 = [c.strip() for c in cols1_str.split(",") if c.strip()]
|
|
416
|
+
cols2 = [c.strip() for c in cols2_str.split(",") if c.strip()]
|
|
417
|
+
if not cols1 or not cols2:
|
|
418
|
+
return encode_error("At least one column required per frame")
|
|
419
|
+
return handle_join_overlap(registry, frame1, frame2, cols1, cols2)
|
|
420
|
+
|
|
421
|
+
return encode_error(f"Unknown command: {command}")
|