mangleframes 0.1.2__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mangleframes-0.1.2 → mangleframes-0.1.5}/PKG-INFO +5 -2
- {mangleframes-0.1.2 → mangleframes-0.1.5}/pyproject.toml +5 -2
- {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/__init__.py +13 -6
- {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/launcher.py +7 -5
- {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/protocol.py +92 -8
- mangleframes-0.1.5/viewer/src/arrow_reader.rs +99 -0
- mangleframes-0.1.5/viewer/src/handlers.rs +423 -0
- mangleframes-0.1.5/viewer/src/main.rs +101 -0
- mangleframes-0.1.5/viewer/src/perf.rs +178 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/socket_client.rs +25 -3
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/web_server.rs +16 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/static/app.js +31 -2
- mangleframes-0.1.2/viewer/src/arrow_reader.rs +0 -132
- mangleframes-0.1.2/viewer/src/handlers.rs +0 -190
- mangleframes-0.1.2/viewer/src/main.rs +0 -69
- {mangleframes-0.1.2 → mangleframes-0.1.5}/Cargo.lock +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/Cargo.toml +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/server.py +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/Cargo.toml +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/export.rs +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/query_engine.rs +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/stats.rs +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/websocket.rs +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/static/index.html +0 -0
- {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/static/style.css +0 -0
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mangleframes
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Classifier: Programming Language :: Python :: 3
|
|
5
5
|
Classifier: Programming Language :: Rust
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Requires-Dist: databricks-connect>=16.1.7
|
|
8
|
+
Requires-Dist: loguru>=0.7.3
|
|
9
|
+
Requires-Dist: maturin>=1.11.2
|
|
7
10
|
Requires-Dist: pyarrow>=11.0.0
|
|
8
11
|
Requires-Dist: pytest>=7.0 ; extra == 'dev'
|
|
9
12
|
Requires-Dist: maturin>=1.4 ; extra == 'dev'
|
|
@@ -13,4 +16,4 @@ Provides-Extra: spark
|
|
|
13
16
|
License-File: LICENSE
|
|
14
17
|
Summary: PySpark DataFrame viewer with modern web UI
|
|
15
18
|
License: MIT
|
|
16
|
-
Requires-Python: >=3.
|
|
19
|
+
Requires-Python: >=3.11
|
|
@@ -4,9 +4,9 @@ build-backend = "maturin"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mangleframes"
|
|
7
|
-
version = "0.1.2"
|
|
7
|
+
version = "0.1.5"
|
|
8
8
|
description = "PySpark DataFrame viewer with modern web UI"
|
|
9
|
-
requires-python = ">=3.
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
10
|
license = { text = "MIT" }
|
|
11
11
|
classifiers = [
|
|
12
12
|
"Programming Language :: Python :: 3",
|
|
@@ -14,6 +14,9 @@ classifiers = [
|
|
|
14
14
|
"License :: OSI Approved :: MIT License",
|
|
15
15
|
]
|
|
16
16
|
dependencies = [
|
|
17
|
+
"databricks-connect>=16.1.7",
|
|
18
|
+
"loguru>=0.7.3",
|
|
19
|
+
"maturin>=1.11.2",
|
|
17
20
|
"pyarrow>=11.0.0",
|
|
18
21
|
]
|
|
19
22
|
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
"""MangleFrames - PySpark DataFrame viewer with modern web UI."""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
+
import threading
|
|
4
5
|
import time
|
|
5
6
|
from typing import TYPE_CHECKING
|
|
6
7
|
|
|
7
|
-
from .launcher import launch_viewer
|
|
8
|
-
from .protocol import clear_stats_cache
|
|
8
|
+
from .launcher import launch_viewer
|
|
9
|
+
from .protocol import clear_arrow_cache, clear_stats_cache, prefetch_frame
|
|
9
10
|
from .server import DataFrameServer
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
12
13
|
from pyspark.sql import DataFrame
|
|
13
14
|
|
|
14
|
-
__version__ = "0.1.2"
|
|
15
|
+
__version__ = "0.1.5"
|
|
15
16
|
__all__ = ["register", "unregister", "show"]
|
|
16
17
|
|
|
17
18
|
_registry: dict[str, DataFrame] = {}
|
|
@@ -24,7 +25,14 @@ def register(name: str, df: DataFrame) -> None:
|
|
|
24
25
|
global _server
|
|
25
26
|
|
|
26
27
|
_registry[name] = df
|
|
27
|
-
clear_stats_cache(name)
|
|
28
|
+
clear_stats_cache(name)
|
|
29
|
+
clear_arrow_cache(name)
|
|
30
|
+
|
|
31
|
+
# Start background prefetch immediately
|
|
32
|
+
def do_prefetch() -> None:
|
|
33
|
+
prefetch_frame(_registry, name, limit=10000)
|
|
34
|
+
|
|
35
|
+
threading.Thread(target=do_prefetch, daemon=True).start()
|
|
28
36
|
|
|
29
37
|
if _server is None:
|
|
30
38
|
_server = DataFrameServer(_registry)
|
|
@@ -36,6 +44,7 @@ def unregister(name: str) -> None:
|
|
|
36
44
|
if name in _registry:
|
|
37
45
|
del _registry[name]
|
|
38
46
|
clear_stats_cache(name)
|
|
47
|
+
clear_arrow_cache(name)
|
|
39
48
|
|
|
40
49
|
|
|
41
50
|
def show(port: int = 8765, block: bool = True) -> None:
|
|
@@ -57,8 +66,6 @@ def show(port: int = 8765, block: bool = True) -> None:
|
|
|
57
66
|
launch_viewer(_server.socket_path, port)
|
|
58
67
|
_viewer_launched = True
|
|
59
68
|
|
|
60
|
-
open_browser(port)
|
|
61
|
-
|
|
62
69
|
if block:
|
|
63
70
|
try:
|
|
64
71
|
print(f"MangleFrames viewer running at http://localhost:{port}")
|
|
@@ -4,17 +4,22 @@ from __future__ import annotations
|
|
|
4
4
|
import os
|
|
5
5
|
import shutil
|
|
6
6
|
import subprocess
|
|
7
|
-
import webbrowser
|
|
8
7
|
from pathlib import Path
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def find_viewer_binary() -> Path | None:
|
|
12
11
|
"""Find the mangleframes-viewer binary."""
|
|
12
|
+
import sys
|
|
13
|
+
|
|
13
14
|
pkg_dir = Path(__file__).parent
|
|
14
15
|
pkg_binary = pkg_dir / "bin" / "mangleframes-viewer"
|
|
15
16
|
if pkg_binary.exists():
|
|
16
17
|
return pkg_binary
|
|
17
18
|
|
|
19
|
+
venv_binary = Path(sys.executable).parent / "mangleframes-viewer"
|
|
20
|
+
if venv_binary.exists():
|
|
21
|
+
return venv_binary
|
|
22
|
+
|
|
18
23
|
path_binary = shutil.which("mangleframes-viewer")
|
|
19
24
|
if path_binary:
|
|
20
25
|
return Path(path_binary)
|
|
@@ -35,13 +40,10 @@ def launch_viewer(socket_path: Path, port: int = 8765) -> subprocess.Popen:
|
|
|
35
40
|
env["RUST_LOG"] = env.get("RUST_LOG", "info")
|
|
36
41
|
|
|
37
42
|
return subprocess.Popen(
|
|
38
|
-
[str(binary), "--socket", str(socket_path), "--port", str(port)],
|
|
43
|
+
[str(binary), "--socket", str(socket_path), "--port", str(port), "--no-browser"],
|
|
39
44
|
env=env,
|
|
40
45
|
stdout=subprocess.DEVNULL,
|
|
41
46
|
stderr=subprocess.DEVNULL,
|
|
42
47
|
)
|
|
43
48
|
|
|
44
49
|
|
|
45
|
-
def open_browser(port: int = 8765) -> None:
|
|
46
|
-
"""Open the viewer in the default browser."""
|
|
47
|
-
webbrowser.open(f"http://localhost:{port}")
|
|
@@ -3,6 +3,8 @@ from __future__ import annotations
|
|
|
3
3
|
|
|
4
4
|
import json
|
|
5
5
|
import struct
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
6
8
|
from typing import TYPE_CHECKING, Any
|
|
7
9
|
|
|
8
10
|
import pyarrow as pa
|
|
@@ -17,6 +19,10 @@ STATUS_ERROR = 1
|
|
|
17
19
|
# Cache for computed stats (cleared when DataFrame is re-registered)
|
|
18
20
|
_stats_cache: dict[str, dict] = {}
|
|
19
21
|
|
|
22
|
+
# Cache for prefetched Arrow data: name -> (limit, payload_bytes)
|
|
23
|
+
_arrow_cache: dict[str, tuple[int, bytes]] = {}
|
|
24
|
+
_arrow_cache_lock = threading.Lock()
|
|
25
|
+
|
|
20
26
|
|
|
21
27
|
def clear_stats_cache(name: str | None = None) -> None:
|
|
22
28
|
"""Clear cached stats for a DataFrame or all DataFrames."""
|
|
@@ -26,6 +32,54 @@ def clear_stats_cache(name: str | None = None) -> None:
|
|
|
26
32
|
del _stats_cache[name]
|
|
27
33
|
|
|
28
34
|
|
|
35
|
+
def clear_arrow_cache(name: str | None = None) -> None:
|
|
36
|
+
"""Clear cached Arrow data for a DataFrame or all DataFrames."""
|
|
37
|
+
with _arrow_cache_lock:
|
|
38
|
+
if name is None:
|
|
39
|
+
_arrow_cache.clear()
|
|
40
|
+
elif name in _arrow_cache:
|
|
41
|
+
del _arrow_cache[name]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _serialize_arrow_ipc(table: pa.Table) -> tuple[bytes, int]:
|
|
45
|
+
"""Serialize Arrow table to IPC format, returning bytes and timing in ms."""
|
|
46
|
+
start = time.perf_counter()
|
|
47
|
+
sink = pa.BufferOutputStream()
|
|
48
|
+
with pa.ipc.RecordBatchStreamWriter(sink, table.schema) as writer:
|
|
49
|
+
for batch in table.to_batches():
|
|
50
|
+
writer.write_batch(batch)
|
|
51
|
+
ipc_ms = int((time.perf_counter() - start) * 1000)
|
|
52
|
+
return sink.getvalue().to_pybytes(), ipc_ms
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def prefetch_frame(registry: dict[str, DataFrame], name: str, limit: int = 10000) -> bool:
|
|
56
|
+
"""Prefetch DataFrame as Arrow IPC bytes in background.
|
|
57
|
+
|
|
58
|
+
Returns True if prefetch succeeded, False otherwise.
|
|
59
|
+
"""
|
|
60
|
+
if name not in registry:
|
|
61
|
+
return False
|
|
62
|
+
|
|
63
|
+
try:
|
|
64
|
+
df = registry[name]
|
|
65
|
+
start = time.perf_counter()
|
|
66
|
+
limited_df = df.limit(limit) if limit > 0 else df
|
|
67
|
+
table = limited_df.toArrow()
|
|
68
|
+
spark_ms = int((time.perf_counter() - start) * 1000)
|
|
69
|
+
total_rows = table.num_rows
|
|
70
|
+
|
|
71
|
+
arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
|
|
72
|
+
# 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
|
|
73
|
+
payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes
|
|
74
|
+
|
|
75
|
+
with _arrow_cache_lock:
|
|
76
|
+
_arrow_cache[name] = (limit, payload)
|
|
77
|
+
|
|
78
|
+
return True
|
|
79
|
+
except Exception:
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
|
|
29
83
|
def encode_response(status: int, payload: bytes) -> bytes:
|
|
30
84
|
"""Encode response with status and length prefix."""
|
|
31
85
|
return struct.pack(">II", status, len(payload)) + payload
|
|
@@ -60,22 +114,35 @@ def handle_schema(registry: dict[str, DataFrame], name: str) -> bytes:
|
|
|
60
114
|
|
|
61
115
|
|
|
62
116
|
def handle_get(registry: dict[str, DataFrame], name: str, limit: int) -> bytes:
|
|
63
|
-
"""Return DataFrame data as Arrow IPC stream."""
|
|
117
|
+
"""Return DataFrame data as Arrow IPC stream with timing info."""
|
|
64
118
|
if name not in registry:
|
|
65
119
|
return encode_error(f"DataFrame '{name}' not found")
|
|
66
120
|
|
|
121
|
+
# Check cache first - return cached data if limit is sufficient
|
|
122
|
+
with _arrow_cache_lock:
|
|
123
|
+
if name in _arrow_cache:
|
|
124
|
+
cached_limit, cached_payload = _arrow_cache[name]
|
|
125
|
+
if cached_limit >= limit:
|
|
126
|
+
return encode_response(STATUS_OK, cached_payload)
|
|
127
|
+
|
|
128
|
+
# Cache miss or insufficient limit - materialize from Spark
|
|
67
129
|
df = registry[name]
|
|
130
|
+
|
|
131
|
+
start = time.perf_counter()
|
|
68
132
|
limited_df = df.limit(limit) if limit > 0 else df
|
|
133
|
+
table = limited_df.toArrow()
|
|
134
|
+
spark_ms = int((time.perf_counter() - start) * 1000)
|
|
135
|
+
total_rows = table.num_rows
|
|
69
136
|
|
|
70
|
-
|
|
71
|
-
|
|
137
|
+
arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
|
|
138
|
+
# 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
|
|
139
|
+
payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes
|
|
72
140
|
|
|
73
|
-
|
|
74
|
-
with
|
|
75
|
-
|
|
76
|
-
writer.write_batch(batch)
|
|
141
|
+
# Cache result for future requests
|
|
142
|
+
with _arrow_cache_lock:
|
|
143
|
+
_arrow_cache[name] = (limit, payload)
|
|
77
144
|
|
|
78
|
-
return encode_response(STATUS_OK,
|
|
145
|
+
return encode_response(STATUS_OK, payload)
|
|
79
146
|
|
|
80
147
|
|
|
81
148
|
def _is_numeric_type(dtype_str: str) -> bool:
|
|
@@ -133,6 +200,19 @@ def handle_stats(registry: dict[str, DataFrame], name: str) -> bytes:
|
|
|
133
200
|
return encode_json_response(stats_data)
|
|
134
201
|
|
|
135
202
|
|
|
203
|
+
def handle_count(registry: dict[str, DataFrame], name: str) -> bytes:
|
|
204
|
+
"""Return total row count without transferring data."""
|
|
205
|
+
if name not in registry:
|
|
206
|
+
return encode_error(f"DataFrame '{name}' not found")
|
|
207
|
+
|
|
208
|
+
df = registry[name]
|
|
209
|
+
start = time.perf_counter()
|
|
210
|
+
count = df.count()
|
|
211
|
+
count_ms = int((time.perf_counter() - start) * 1000)
|
|
212
|
+
|
|
213
|
+
return encode_json_response({"name": name, "count": count, "count_ms": count_ms})
|
|
214
|
+
|
|
215
|
+
|
|
136
216
|
def dispatch_command(
|
|
137
217
|
registry: dict[str, DataFrame], command: str
|
|
138
218
|
) -> bytes:
|
|
@@ -161,4 +241,8 @@ def dispatch_command(
|
|
|
161
241
|
name = command[6:]
|
|
162
242
|
return handle_stats(registry, name)
|
|
163
243
|
|
|
244
|
+
if command.startswith("COUNT:"):
|
|
245
|
+
name = command[6:]
|
|
246
|
+
return handle_count(registry, name)
|
|
247
|
+
|
|
164
248
|
return encode_error(f"Unknown command: {command}")
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
//! Arrow IPC stream parsing and JSON conversion.
|
|
2
|
+
|
|
3
|
+
use std::io::Cursor;
|
|
4
|
+
|
|
5
|
+
use arrow::array::RecordBatch;
|
|
6
|
+
use arrow_ipc::reader::StreamReader;
|
|
7
|
+
use arrow_json::ArrayWriter;
|
|
8
|
+
use serde_json::Value;
|
|
9
|
+
use thiserror::Error;
|
|
10
|
+
|
|
11
|
+
#[derive(Error, Debug)]
|
|
12
|
+
pub enum ArrowError {
|
|
13
|
+
#[error("Failed to parse Arrow IPC: {0}")]
|
|
14
|
+
ParseError(#[from] arrow::error::ArrowError),
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
pub fn parse_arrow_stream(data: &[u8]) -> Result<Vec<RecordBatch>, ArrowError> {
|
|
18
|
+
let cursor = Cursor::new(data);
|
|
19
|
+
let reader = StreamReader::try_new(cursor, None)?;
|
|
20
|
+
let batches: Result<Vec<_>, _> = reader.collect();
|
|
21
|
+
Ok(batches?)
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/// High-performance JSON conversion returning raw bytes.
|
|
25
|
+
/// Skips intermediate Value parsing for maximum speed.
|
|
26
|
+
pub fn batches_to_json_bytes(batches: &[RecordBatch], offset: usize, limit: usize) -> (Vec<u8>, usize) {
|
|
27
|
+
if batches.is_empty() {
|
|
28
|
+
return (b"[]".to_vec(), 0);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
|
|
32
|
+
let actual_limit = limit.min(total_rows.saturating_sub(offset));
|
|
33
|
+
if actual_limit == 0 {
|
|
34
|
+
return (b"[]".to_vec(), 0);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
let sliced = slice_batches(batches, offset, actual_limit);
|
|
38
|
+
if sliced.is_empty() {
|
|
39
|
+
return (b"[]".to_vec(), 0);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
let mut buf = Vec::with_capacity(actual_limit * 256);
|
|
43
|
+
{
|
|
44
|
+
let mut writer = ArrayWriter::new(&mut buf);
|
|
45
|
+
for batch in &sliced {
|
|
46
|
+
if writer.write(batch).is_err() {
|
|
47
|
+
return (b"[]".to_vec(), 0);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
if writer.finish().is_err() {
|
|
51
|
+
return (b"[]".to_vec(), 0);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
let row_count = sliced.iter().map(|b| b.num_rows()).sum();
|
|
56
|
+
(buf, row_count)
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/// Legacy function for compatibility - parses back to Value
|
|
60
|
+
pub fn batches_to_json(batches: &[RecordBatch], offset: usize, limit: usize) -> Value {
|
|
61
|
+
let (bytes, _) = batches_to_json_bytes(batches, offset, limit);
|
|
62
|
+
serde_json::from_slice(&bytes).unwrap_or(Value::Array(vec![]))
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/// Slice batches to extract rows in range [offset, offset+limit)
|
|
66
|
+
fn slice_batches(batches: &[RecordBatch], offset: usize, limit: usize) -> Vec<RecordBatch> {
|
|
67
|
+
let mut result = Vec::new();
|
|
68
|
+
let mut current_offset = 0;
|
|
69
|
+
let mut remaining = limit;
|
|
70
|
+
|
|
71
|
+
for batch in batches {
|
|
72
|
+
let batch_rows = batch.num_rows();
|
|
73
|
+
|
|
74
|
+
if current_offset + batch_rows <= offset {
|
|
75
|
+
current_offset += batch_rows;
|
|
76
|
+
continue;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
let start = if current_offset < offset { offset - current_offset } else { 0 };
|
|
80
|
+
let len = remaining.min(batch_rows - start);
|
|
81
|
+
|
|
82
|
+
if len > 0 {
|
|
83
|
+
let sliced = batch.slice(start, len);
|
|
84
|
+
result.push(sliced);
|
|
85
|
+
remaining -= len;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
if remaining == 0 {
|
|
89
|
+
break;
|
|
90
|
+
}
|
|
91
|
+
current_offset += batch_rows;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
result
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
pub fn total_row_count(batches: &[RecordBatch]) -> usize {
|
|
98
|
+
batches.iter().map(|b| b.num_rows()).sum()
|
|
99
|
+
}
|