mangleframes 0.1.2__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {mangleframes-0.1.2 → mangleframes-0.1.5}/PKG-INFO +5 -2
  2. {mangleframes-0.1.2 → mangleframes-0.1.5}/pyproject.toml +5 -2
  3. {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/__init__.py +13 -6
  4. {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/launcher.py +7 -5
  5. {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/protocol.py +92 -8
  6. mangleframes-0.1.5/viewer/src/arrow_reader.rs +99 -0
  7. mangleframes-0.1.5/viewer/src/handlers.rs +423 -0
  8. mangleframes-0.1.5/viewer/src/main.rs +101 -0
  9. mangleframes-0.1.5/viewer/src/perf.rs +178 -0
  10. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/socket_client.rs +25 -3
  11. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/web_server.rs +16 -0
  12. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/static/app.js +31 -2
  13. mangleframes-0.1.2/viewer/src/arrow_reader.rs +0 -132
  14. mangleframes-0.1.2/viewer/src/handlers.rs +0 -190
  15. mangleframes-0.1.2/viewer/src/main.rs +0 -69
  16. {mangleframes-0.1.2 → mangleframes-0.1.5}/Cargo.lock +0 -0
  17. {mangleframes-0.1.2 → mangleframes-0.1.5}/Cargo.toml +0 -0
  18. {mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/server.py +0 -0
  19. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/Cargo.toml +0 -0
  20. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/export.rs +0 -0
  21. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/query_engine.rs +0 -0
  22. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/stats.rs +0 -0
  23. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/src/websocket.rs +0 -0
  24. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/static/index.html +0 -0
  25. {mangleframes-0.1.2 → mangleframes-0.1.5}/viewer/static/style.css +0 -0
@@ -1,9 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mangleframes
3
- Version: 0.1.2
3
+ Version: 0.1.5
4
4
  Classifier: Programming Language :: Python :: 3
5
5
  Classifier: Programming Language :: Rust
6
6
  Classifier: License :: OSI Approved :: MIT License
7
+ Requires-Dist: databricks-connect>=16.1.7
8
+ Requires-Dist: loguru>=0.7.3
9
+ Requires-Dist: maturin>=1.11.2
7
10
  Requires-Dist: pyarrow>=11.0.0
8
11
  Requires-Dist: pytest>=7.0 ; extra == 'dev'
9
12
  Requires-Dist: maturin>=1.4 ; extra == 'dev'
@@ -13,4 +16,4 @@ Provides-Extra: spark
13
16
  License-File: LICENSE
14
17
  Summary: PySpark DataFrame viewer with modern web UI
15
18
  License: MIT
16
- Requires-Python: >=3.9
19
+ Requires-Python: >=3.11
@@ -4,9 +4,9 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "mangleframes"
7
- version = "0.1.2"
7
+ version = "0.1.5"
8
8
  description = "PySpark DataFrame viewer with modern web UI"
9
- requires-python = ">=3.9"
9
+ requires-python = ">=3.11"
10
10
  license = { text = "MIT" }
11
11
  classifiers = [
12
12
  "Programming Language :: Python :: 3",
@@ -14,6 +14,9 @@ classifiers = [
14
14
  "License :: OSI Approved :: MIT License",
15
15
  ]
16
16
  dependencies = [
17
+ "databricks-connect>=16.1.7",
18
+ "loguru>=0.7.3",
19
+ "maturin>=1.11.2",
17
20
  "pyarrow>=11.0.0",
18
21
  ]
19
22
 
@@ -1,17 +1,18 @@
1
1
  """MangleFrames - PySpark DataFrame viewer with modern web UI."""
2
2
  from __future__ import annotations
3
3
 
4
+ import threading
4
5
  import time
5
6
  from typing import TYPE_CHECKING
6
7
 
7
- from .launcher import launch_viewer, open_browser
8
- from .protocol import clear_stats_cache
8
+ from .launcher import launch_viewer
9
+ from .protocol import clear_arrow_cache, clear_stats_cache, prefetch_frame
9
10
  from .server import DataFrameServer
10
11
 
11
12
  if TYPE_CHECKING:
12
13
  from pyspark.sql import DataFrame
13
14
 
14
- __version__ = "0.1.2"
15
+ __version__ = "0.1.5"
15
16
  __all__ = ["register", "unregister", "show"]
16
17
 
17
18
  _registry: dict[str, DataFrame] = {}
@@ -24,7 +25,14 @@ def register(name: str, df: DataFrame) -> None:
24
25
  global _server
25
26
 
26
27
  _registry[name] = df
27
- clear_stats_cache(name) # Invalidate cached stats for this name
28
+ clear_stats_cache(name)
29
+ clear_arrow_cache(name)
30
+
31
+ # Start background prefetch immediately
32
+ def do_prefetch() -> None:
33
+ prefetch_frame(_registry, name, limit=10000)
34
+
35
+ threading.Thread(target=do_prefetch, daemon=True).start()
28
36
 
29
37
  if _server is None:
30
38
  _server = DataFrameServer(_registry)
@@ -36,6 +44,7 @@ def unregister(name: str) -> None:
36
44
  if name in _registry:
37
45
  del _registry[name]
38
46
  clear_stats_cache(name)
47
+ clear_arrow_cache(name)
39
48
 
40
49
 
41
50
  def show(port: int = 8765, block: bool = True) -> None:
@@ -57,8 +66,6 @@ def show(port: int = 8765, block: bool = True) -> None:
57
66
  launch_viewer(_server.socket_path, port)
58
67
  _viewer_launched = True
59
68
 
60
- open_browser(port)
61
-
62
69
  if block:
63
70
  try:
64
71
  print(f"MangleFrames viewer running at http://localhost:{port}")
@@ -4,17 +4,22 @@ from __future__ import annotations
4
4
  import os
5
5
  import shutil
6
6
  import subprocess
7
- import webbrowser
8
7
  from pathlib import Path
9
8
 
10
9
 
11
10
  def find_viewer_binary() -> Path | None:
12
11
  """Find the mangleframes-viewer binary."""
12
+ import sys
13
+
13
14
  pkg_dir = Path(__file__).parent
14
15
  pkg_binary = pkg_dir / "bin" / "mangleframes-viewer"
15
16
  if pkg_binary.exists():
16
17
  return pkg_binary
17
18
 
19
+ venv_binary = Path(sys.executable).parent / "mangleframes-viewer"
20
+ if venv_binary.exists():
21
+ return venv_binary
22
+
18
23
  path_binary = shutil.which("mangleframes-viewer")
19
24
  if path_binary:
20
25
  return Path(path_binary)
@@ -35,13 +40,10 @@ def launch_viewer(socket_path: Path, port: int = 8765) -> subprocess.Popen:
35
40
  env["RUST_LOG"] = env.get("RUST_LOG", "info")
36
41
 
37
42
  return subprocess.Popen(
38
- [str(binary), "--socket", str(socket_path), "--port", str(port)],
43
+ [str(binary), "--socket", str(socket_path), "--port", str(port), "--no-browser"],
39
44
  env=env,
40
45
  stdout=subprocess.DEVNULL,
41
46
  stderr=subprocess.DEVNULL,
42
47
  )
43
48
 
44
49
 
45
- def open_browser(port: int = 8765) -> None:
46
- """Open the viewer in the default browser."""
47
- webbrowser.open(f"http://localhost:{port}")
@@ -3,6 +3,8 @@ from __future__ import annotations
3
3
 
4
4
  import json
5
5
  import struct
6
+ import threading
7
+ import time
6
8
  from typing import TYPE_CHECKING, Any
7
9
 
8
10
  import pyarrow as pa
@@ -17,6 +19,10 @@ STATUS_ERROR = 1
17
19
  # Cache for computed stats (cleared when DataFrame is re-registered)
18
20
  _stats_cache: dict[str, dict] = {}
19
21
 
22
+ # Cache for prefetched Arrow data: name -> (limit, payload_bytes)
23
+ _arrow_cache: dict[str, tuple[int, bytes]] = {}
24
+ _arrow_cache_lock = threading.Lock()
25
+
20
26
 
21
27
  def clear_stats_cache(name: str | None = None) -> None:
22
28
  """Clear cached stats for a DataFrame or all DataFrames."""
@@ -26,6 +32,54 @@ def clear_stats_cache(name: str | None = None) -> None:
26
32
  del _stats_cache[name]
27
33
 
28
34
 
35
def clear_arrow_cache(name: str | None = None) -> None:
    """Discard prefetched Arrow payloads held in the module-level cache.

    Args:
        name: Registered DataFrame name whose cached payload should be
            dropped. ``None`` (the default) empties the entire cache.
            Names without a cached payload are ignored.
    """
    with _arrow_cache_lock:
        if name is not None:
            # pop() with a default is a no-op when nothing is cached for `name`.
            _arrow_cache.pop(name, None)
            return
        _arrow_cache.clear()
42
+
43
+
44
def _serialize_arrow_ipc(table: pa.Table) -> tuple[bytes, int]:
    """Encode ``table`` as an Arrow IPC stream.

    Returns:
        A ``(ipc_bytes, ipc_ms)`` pair: the serialized stream and the whole
        milliseconds spent writing the batches. The final copy of the
        buffer into a Python ``bytes`` object is not part of the timing.
    """
    started = time.perf_counter()
    buffer = pa.BufferOutputStream()
    stream_writer = pa.ipc.RecordBatchStreamWriter(buffer, table.schema)
    try:
        for record_batch in table.to_batches():
            stream_writer.write_batch(record_batch)
    finally:
        # Closing finalizes the stream, exactly as leaving a `with` block would.
        stream_writer.close()
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    return buffer.getvalue().to_pybytes(), elapsed_ms
53
+
54
+
55
def prefetch_frame(registry: dict[str, DataFrame], name: str, limit: int = 10000) -> bool:
    """Materialize a registered DataFrame and cache it as an Arrow IPC payload.

    Intended to run on a background thread right after registration so the
    first ``GET`` can be answered from the cache.

    Args:
        registry: Mapping of registered names to DataFrames.
        name: Name of the DataFrame to prefetch.
        limit: Maximum number of rows to fetch; ``limit <= 0`` fetches the
            DataFrame without a limit.

    Returns:
        True if a payload was stored in the cache. False if ``name`` is not
        registered, materialization failed, or the registration changed
        while the data was being fetched.
    """
    df = registry.get(name)
    if df is None:
        return False

    try:
        start = time.perf_counter()
        limited_df = df.limit(limit) if limit > 0 else df
        table = limited_df.toArrow()
        spark_ms = int((time.perf_counter() - start) * 1000)
        total_rows = table.num_rows

        arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
        # 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
        payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes
    except Exception:
        # Prefetching is a best-effort warm-up: on failure nothing is cached
        # and the caller only learns about it through the return value.
        return False

    with _arrow_cache_lock:
        # The fetch can take a long time. If the name was re-registered or
        # unregistered meanwhile, this payload describes a DataFrame that is
        # no longer current, so it must not be written into the cache.
        if registry.get(name) is not df:
            return False
        _arrow_cache[name] = (limit, payload)

    return True
81
+
82
+
29
83
def encode_response(status: int, payload: bytes) -> bytes:
    """Frame ``payload`` for the wire.

    The frame is an 8-byte header of two big-endian unsigned 32-bit
    integers (``status``, then the payload length) followed by the payload
    bytes themselves.
    """
    header = struct.pack(">II", status, len(payload))
    return header + payload
@@ -60,22 +114,35 @@ def handle_schema(registry: dict[str, DataFrame], name: str) -> bytes:
60
114
 
61
115
 
62
116
def _cached_payload_covers(cached_limit: int, cached_payload: bytes, limit: int) -> bool:
    """Return True if a cached payload holds every row a ``limit`` request needs.

    ``limit <= 0`` means "no limit", both for the request and for the limit
    the payload was cached under.
    """
    if cached_limit <= 0:
        # Cached without a limit: the payload is the whole DataFrame.
        return True

    # total_rows is the third little-endian u64 of the 24-byte payload header.
    (cached_rows,) = struct.unpack_from("<Q", cached_payload, 16)
    if cached_rows < cached_limit:
        # Fewer rows came back than were asked for, so the DataFrame was
        # exhausted and the payload is complete for any request.
        return True

    # A bounded payload can only serve a bounded request that fits inside it;
    # an unlimited request (limit <= 0) must go back to Spark.
    return 0 < limit <= cached_limit


def handle_get(registry: dict[str, DataFrame], name: str, limit: int) -> bytes:
    """Return DataFrame data as an Arrow IPC stream with timing info.

    The response payload is a 24-byte header (``spark_ms``, ``ipc_ms``,
    ``total_rows`` as little-endian u64) followed by the Arrow IPC stream.

    Args:
        registry: Mapping of registered names to DataFrames.
        name: Name of the DataFrame to return.
        limit: Maximum number of rows wanted; ``limit <= 0`` requests the
            DataFrame without a limit.

    Returns:
        An encoded OK response carrying the payload, or an encoded error
        response if ``name`` is not registered. A cache hit returns the
        cached payload unchanged, so it may contain more rows than ``limit``.
    """
    if name not in registry:
        return encode_error(f"DataFrame '{name}' not found")

    # Check cache first - reuse the cached payload only if it covers the request
    with _arrow_cache_lock:
        cached = _arrow_cache.get(name)
    if cached is not None:
        cached_limit, cached_payload = cached
        if _cached_payload_covers(cached_limit, cached_payload, limit):
            return encode_response(STATUS_OK, cached_payload)

    # Cache miss or insufficient coverage - materialize from Spark
    df = registry[name]

    start = time.perf_counter()
    limited_df = df.limit(limit) if limit > 0 else df
    table = limited_df.toArrow()
    spark_ms = int((time.perf_counter() - start) * 1000)
    total_rows = table.num_rows

    arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
    # 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
    payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes

    # Cache result for future requests, unless the name was re-registered or
    # unregistered while Spark was working (the payload would then be stale).
    with _arrow_cache_lock:
        if registry.get(name) is df:
            _arrow_cache[name] = (limit, payload)

    return encode_response(STATUS_OK, payload)
79
146
 
80
147
 
81
148
  def _is_numeric_type(dtype_str: str) -> bool:
@@ -133,6 +200,19 @@ def handle_stats(registry: dict[str, DataFrame], name: str) -> bytes:
133
200
  return encode_json_response(stats_data)
134
201
 
135
202
 
203
def handle_count(registry: dict[str, DataFrame], name: str) -> bytes:
    """Report how many rows a registered DataFrame has, without sending data.

    Returns:
        A JSON response with ``name``, ``count`` and ``count_ms`` (whole
        milliseconds spent in ``count()``), or an encoded error response if
        ``name`` is not registered.
    """
    if name in registry:
        frame = registry[name]
        started = time.perf_counter()
        row_count = frame.count()
        elapsed_ms = int((time.perf_counter() - started) * 1000)
        result = {"name": name, "count": row_count, "count_ms": elapsed_ms}
        return encode_json_response(result)

    return encode_error(f"DataFrame '{name}' not found")
214
+
215
+
136
216
  def dispatch_command(
137
217
  registry: dict[str, DataFrame], command: str
138
218
  ) -> bytes:
@@ -161,4 +241,8 @@ def dispatch_command(
161
241
  name = command[6:]
162
242
  return handle_stats(registry, name)
163
243
 
244
+ if command.startswith("COUNT:"):
245
+ name = command[6:]
246
+ return handle_count(registry, name)
247
+
164
248
  return encode_error(f"Unknown command: {command}")
@@ -0,0 +1,99 @@
1
+ //! Arrow IPC stream parsing and JSON conversion.
2
+
3
+ use std::io::Cursor;
4
+
5
+ use arrow::array::RecordBatch;
6
+ use arrow_ipc::reader::StreamReader;
7
+ use arrow_json::ArrayWriter;
8
+ use serde_json::Value;
9
+ use thiserror::Error;
10
+
11
+ #[derive(Error, Debug)]
12
+ pub enum ArrowError {
13
+ #[error("Failed to parse Arrow IPC: {0}")]
14
+ ParseError(#[from] arrow::error::ArrowError),
15
+ }
16
+
17
+ pub fn parse_arrow_stream(data: &[u8]) -> Result<Vec<RecordBatch>, ArrowError> {
18
+ let cursor = Cursor::new(data);
19
+ let reader = StreamReader::try_new(cursor, None)?;
20
+ let batches: Result<Vec<_>, _> = reader.collect();
21
+ Ok(batches?)
22
+ }
23
+
24
+ /// High-performance JSON conversion returning raw bytes.
25
+ /// Skips intermediate Value parsing for maximum speed.
26
+ pub fn batches_to_json_bytes(batches: &[RecordBatch], offset: usize, limit: usize) -> (Vec<u8>, usize) {
27
+ if batches.is_empty() {
28
+ return (b"[]".to_vec(), 0);
29
+ }
30
+
31
+ let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
32
+ let actual_limit = limit.min(total_rows.saturating_sub(offset));
33
+ if actual_limit == 0 {
34
+ return (b"[]".to_vec(), 0);
35
+ }
36
+
37
+ let sliced = slice_batches(batches, offset, actual_limit);
38
+ if sliced.is_empty() {
39
+ return (b"[]".to_vec(), 0);
40
+ }
41
+
42
+ let mut buf = Vec::with_capacity(actual_limit * 256);
43
+ {
44
+ let mut writer = ArrayWriter::new(&mut buf);
45
+ for batch in &sliced {
46
+ if writer.write(batch).is_err() {
47
+ return (b"[]".to_vec(), 0);
48
+ }
49
+ }
50
+ if writer.finish().is_err() {
51
+ return (b"[]".to_vec(), 0);
52
+ }
53
+ }
54
+
55
+ let row_count = sliced.iter().map(|b| b.num_rows()).sum();
56
+ (buf, row_count)
57
+ }
58
+
59
+ /// Legacy function for compatibility - parses back to Value
60
+ pub fn batches_to_json(batches: &[RecordBatch], offset: usize, limit: usize) -> Value {
61
+ let (bytes, _) = batches_to_json_bytes(batches, offset, limit);
62
+ serde_json::from_slice(&bytes).unwrap_or(Value::Array(vec![]))
63
+ }
64
+
65
+ /// Slice batches to extract rows in range [offset, offset+limit)
66
+ fn slice_batches(batches: &[RecordBatch], offset: usize, limit: usize) -> Vec<RecordBatch> {
67
+ let mut result = Vec::new();
68
+ let mut current_offset = 0;
69
+ let mut remaining = limit;
70
+
71
+ for batch in batches {
72
+ let batch_rows = batch.num_rows();
73
+
74
+ if current_offset + batch_rows <= offset {
75
+ current_offset += batch_rows;
76
+ continue;
77
+ }
78
+
79
+ let start = if current_offset < offset { offset - current_offset } else { 0 };
80
+ let len = remaining.min(batch_rows - start);
81
+
82
+ if len > 0 {
83
+ let sliced = batch.slice(start, len);
84
+ result.push(sliced);
85
+ remaining -= len;
86
+ }
87
+
88
+ if remaining == 0 {
89
+ break;
90
+ }
91
+ current_offset += batch_rows;
92
+ }
93
+
94
+ result
95
+ }
96
+
97
+ pub fn total_row_count(batches: &[RecordBatch]) -> usize {
98
+ batches.iter().map(|b| b.num_rows()).sum()
99
+ }