PyPI - mangleframes - Versions diffs - 0.1.2__tar.gz → 0.1.5__tar.gz - Mend

mangleframes 0.1.2.tar.gz → 0.1.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

{mangleframes-0.1.2 → mangleframes-0.1.5}/PKG-INFO RENAMED Viewed

@@ -1,9 +1,12 @@
 Metadata-Version: 2.4
 Name: mangleframes
-Version: 0.1.2
+Version: 0.1.5
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Rust
 Classifier: License :: OSI Approved :: MIT License
+Requires-Dist: databricks-connect>=16.1.7
+Requires-Dist: loguru>=0.7.3
+Requires-Dist: maturin>=1.11.2
 Requires-Dist: pyarrow>=11.0.0
 Requires-Dist: pytest>=7.0 ; extra == 'dev'
 Requires-Dist: maturin>=1.4 ; extra == 'dev'
@@ -13,4 +16,4 @@ Provides-Extra: spark
 License-File: LICENSE
 Summary: PySpark DataFrame viewer with modern web UI
 License: MIT
-Requires-Python: >=3.9
+Requires-Python: >=3.11

{mangleframes-0.1.2 → mangleframes-0.1.5}/pyproject.toml RENAMED Viewed

@@ -4,9 +4,9 @@ build-backend = "maturin"
 [project]
 name = "mangleframes"
-version = "0.1.2"
+version = "0.1.5"
 description = "PySpark DataFrame viewer with modern web UI"
-requires-python = ">=3.9"
+requires-python = ">=3.11"
 license = { text = "MIT" }
 classifiers = [
     "Programming Language :: Python :: 3",
@@ -14,6 +14,9 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
 ]
 dependencies = [
+    "databricks-connect>=16.1.7",
+    "loguru>=0.7.3",
+    "maturin>=1.11.2",
     "pyarrow>=11.0.0",
 ]

{mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/__init__.py RENAMED Viewed

@@ -1,17 +1,18 @@
 """MangleFrames - PySpark DataFrame viewer with modern web UI."""
 from __future__ import annotations
+import threading
 import time
 from typing import TYPE_CHECKING
-from .launcher import launch_viewer, open_browser
-from .protocol import clear_stats_cache
+from .launcher import launch_viewer
+from .protocol import clear_arrow_cache, clear_stats_cache, prefetch_frame
 from .server import DataFrameServer
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
-__version__ = "0.1.2"
+__version__ = "0.1.5"
 __all__ = ["register", "unregister", "show"]
 _registry: dict[str, DataFrame] = {}
@@ -24,7 +25,14 @@ def register(name: str, df: DataFrame) -> None:
     global _server
     _registry[name] = df
-    clear_stats_cache(name)  # Invalidate cached stats for this name
+    clear_stats_cache(name)
+    clear_arrow_cache(name)
+    # Start background prefetch immediately
+    def do_prefetch() -> None:
+        prefetch_frame(_registry, name, limit=10000)
+    threading.Thread(target=do_prefetch, daemon=True).start()
     if _server is None:
         _server = DataFrameServer(_registry)
@@ -36,6 +44,7 @@ def unregister(name: str) -> None:
     if name in _registry:
         del _registry[name]
         clear_stats_cache(name)
+        clear_arrow_cache(name)
 def show(port: int = 8765, block: bool = True) -> None:
@@ -57,8 +66,6 @@ def show(port: int = 8765, block: bool = True) -> None:
         launch_viewer(_server.socket_path, port)
         _viewer_launched = True
-    open_browser(port)
     if block:
         try:
             print(f"MangleFrames viewer running at http://localhost:{port}")

{mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/launcher.py RENAMED Viewed

@@ -4,17 +4,22 @@ from __future__ import annotations
 import os
 import shutil
 import subprocess
-import webbrowser
 from pathlib import Path
 def find_viewer_binary() -> Path | None:
     """Find the mangleframes-viewer binary."""
+    import sys
     pkg_dir = Path(__file__).parent
     pkg_binary = pkg_dir / "bin" / "mangleframes-viewer"
     if pkg_binary.exists():
         return pkg_binary
+    venv_binary = Path(sys.executable).parent / "mangleframes-viewer"
+    if venv_binary.exists():
+        return venv_binary
     path_binary = shutil.which("mangleframes-viewer")
     if path_binary:
         return Path(path_binary)
@@ -35,13 +40,10 @@ def launch_viewer(socket_path: Path, port: int = 8765) -> subprocess.Popen:
     env["RUST_LOG"] = env.get("RUST_LOG", "info")
     return subprocess.Popen(
-        [str(binary), "--socket", str(socket_path), "--port", str(port)],
+        [str(binary), "--socket", str(socket_path), "--port", str(port), "--no-browser"],
         env=env,
         stdout=subprocess.DEVNULL,
         stderr=subprocess.DEVNULL,
     )
-def open_browser(port: int = 8765) -> None:
-    """Open the viewer in the default browser."""
-    webbrowser.open(f"http://localhost:{port}")

{mangleframes-0.1.2 → mangleframes-0.1.5}/python/mangleframes/protocol.py RENAMED Viewed

@@ -3,6 +3,8 @@ from __future__ import annotations
 import json
 import struct
+import threading
+import time
 from typing import TYPE_CHECKING, Any
 import pyarrow as pa
@@ -17,6 +19,10 @@ STATUS_ERROR = 1
 # Cache for computed stats (cleared when DataFrame is re-registered)
 _stats_cache: dict[str, dict] = {}
+# Cache for prefetched Arrow data: name -> (limit, payload_bytes)
+_arrow_cache: dict[str, tuple[int, bytes]] = {}
+_arrow_cache_lock = threading.Lock()
 def clear_stats_cache(name: str | None = None) -> None:
     """Clear cached stats for a DataFrame or all DataFrames."""
@@ -26,6 +32,54 @@ def clear_stats_cache(name: str | None = None) -> None:
         del _stats_cache[name]
+def clear_arrow_cache(name: str | None = None) -> None:
+    """Clear cached Arrow data for a DataFrame or all DataFrames."""
+    with _arrow_cache_lock:
+        if name is None:
+            _arrow_cache.clear()
+        elif name in _arrow_cache:
+            del _arrow_cache[name]
+def _serialize_arrow_ipc(table: pa.Table) -> tuple[bytes, int]:
+    """Serialize Arrow table to IPC format, returning bytes and timing in ms."""
+    start = time.perf_counter()
+    sink = pa.BufferOutputStream()
+    with pa.ipc.RecordBatchStreamWriter(sink, table.schema) as writer:
+        for batch in table.to_batches():
+            writer.write_batch(batch)
+    ipc_ms = int((time.perf_counter() - start) * 1000)
+    return sink.getvalue().to_pybytes(), ipc_ms
+def prefetch_frame(registry: dict[str, DataFrame], name: str, limit: int = 10000) -> bool:
+    """Prefetch DataFrame as Arrow IPC bytes in background.
+    Returns True if prefetch succeeded, False otherwise.
+    """
+    if name not in registry:
+        return False
+    try:
+        df = registry[name]
+        start = time.perf_counter()
+        limited_df = df.limit(limit) if limit > 0 else df
+        table = limited_df.toArrow()
+        spark_ms = int((time.perf_counter() - start) * 1000)
+        total_rows = table.num_rows
+        arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
+        # 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
+        payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes
+        with _arrow_cache_lock:
+            _arrow_cache[name] = (limit, payload)
+        return True
+    except Exception:
+        return False
 def encode_response(status: int, payload: bytes) -> bytes:
     """Encode response with status and length prefix."""
     return struct.pack(">II", status, len(payload)) + payload
@@ -60,22 +114,35 @@ def handle_schema(registry: dict[str, DataFrame], name: str) -> bytes:
 def handle_get(registry: dict[str, DataFrame], name: str, limit: int) -> bytes:
-    """Return DataFrame data as Arrow IPC stream."""
+    """Return DataFrame data as Arrow IPC stream with timing info."""
     if name not in registry:
         return encode_error(f"DataFrame '{name}' not found")
+    # Check cache first - return cached data if limit is sufficient
+    with _arrow_cache_lock:
+        if name in _arrow_cache:
+            cached_limit, cached_payload = _arrow_cache[name]
+            if cached_limit >= limit:
+                return encode_response(STATUS_OK, cached_payload)
+    # Cache miss or insufficient limit - materialize from Spark
     df = registry[name]
+    start = time.perf_counter()
     limited_df = df.limit(limit) if limit > 0 else df
+    table = limited_df.toArrow()
+    spark_ms = int((time.perf_counter() - start) * 1000)
+    total_rows = table.num_rows
-    batches = limited_df._collect_as_arrow()
-    table = pa.Table.from_batches(batches) if batches else pa.table({})
+    arrow_bytes, ipc_ms = _serialize_arrow_ipc(table)
+    # 24-byte header: spark_ms, ipc_ms, total_rows (all little-endian u64)
+    payload = struct.pack("<QQQ", spark_ms, ipc_ms, total_rows) + arrow_bytes
-    sink = pa.BufferOutputStream()
-    with pa.ipc.RecordBatchStreamWriter(sink, table.schema) as writer:
-        for batch in table.to_batches():
-            writer.write_batch(batch)
+    # Cache result for future requests
+    with _arrow_cache_lock:
+        _arrow_cache[name] = (limit, payload)
-    return encode_response(STATUS_OK, sink.getvalue().to_pybytes())
+    return encode_response(STATUS_OK, payload)
 def _is_numeric_type(dtype_str: str) -> bool:
@@ -133,6 +200,19 @@ def handle_stats(registry: dict[str, DataFrame], name: str) -> bytes:
     return encode_json_response(stats_data)
+def handle_count(registry: dict[str, DataFrame], name: str) -> bytes:
+    """Return total row count without transferring data."""
+    if name not in registry:
+        return encode_error(f"DataFrame '{name}' not found")
+    df = registry[name]
+    start = time.perf_counter()
+    count = df.count()
+    count_ms = int((time.perf_counter() - start) * 1000)
+    return encode_json_response({"name": name, "count": count, "count_ms": count_ms})
 def dispatch_command(
     registry: dict[str, DataFrame], command: str
 ) -> bytes:
@@ -161,4 +241,8 @@ def dispatch_command(
         name = command[6:]
         return handle_stats(registry, name)
+    if command.startswith("COUNT:"):
+        name = command[6:]
+        return handle_count(registry, name)
     return encode_error(f"Unknown command: {command}")

mangleframes-0.1.5/viewer/src/arrow_reader.rs ADDED Viewed

@@ -0,0 +1,99 @@
+//! Arrow IPC stream parsing and JSON conversion.
+use std::io::Cursor;
+use arrow::array::RecordBatch;
+use arrow_ipc::reader::StreamReader;
+use arrow_json::ArrayWriter;
+use serde_json::Value;
+use thiserror::Error;
+#[derive(Error, Debug)]
+pub enum ArrowError {
+    #[error("Failed to parse Arrow IPC: {0}")]
+    ParseError(#[from] arrow::error::ArrowError),
+}
+pub fn parse_arrow_stream(data: &[u8]) -> Result<Vec<RecordBatch>, ArrowError> {
+    let cursor = Cursor::new(data);
+    let reader = StreamReader::try_new(cursor, None)?;
+    let batches: Result<Vec<_>, _> = reader.collect();
+    Ok(batches?)
+}
+/// High-performance JSON conversion returning raw bytes.
+/// Skips intermediate Value parsing for maximum speed.
+pub fn batches_to_json_bytes(batches: &[RecordBatch], offset: usize, limit: usize) -> (Vec<u8>, usize) {
+    if batches.is_empty() {
+        return (b"[]".to_vec(), 0);
+    }
+    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+    let actual_limit = limit.min(total_rows.saturating_sub(offset));
+    if actual_limit == 0 {
+        return (b"[]".to_vec(), 0);
+    }
+    let sliced = slice_batches(batches, offset, actual_limit);
+    if sliced.is_empty() {
+        return (b"[]".to_vec(), 0);
+    }
+    let mut buf = Vec::with_capacity(actual_limit * 256);
+    {
+        let mut writer = ArrayWriter::new(&mut buf);
+        for batch in &sliced {
+            if writer.write(batch).is_err() {
+                return (b"[]".to_vec(), 0);
+            }
+        }
+        if writer.finish().is_err() {
+            return (b"[]".to_vec(), 0);
+        }
+    }
+    let row_count = sliced.iter().map(|b| b.num_rows()).sum();
+    (buf, row_count)
+}
+/// Legacy function for compatibility - parses back to Value
+pub fn batches_to_json(batches: &[RecordBatch], offset: usize, limit: usize) -> Value {
+    let (bytes, _) = batches_to_json_bytes(batches, offset, limit);
+    serde_json::from_slice(&bytes).unwrap_or(Value::Array(vec![]))
+}
+/// Slice batches to extract rows in range [offset, offset+limit)
+fn slice_batches(batches: &[RecordBatch], offset: usize, limit: usize) -> Vec<RecordBatch> {
+    let mut result = Vec::new();
+    let mut current_offset = 0;
+    let mut remaining = limit;
+    for batch in batches {
+        let batch_rows = batch.num_rows();
+        if current_offset + batch_rows <= offset {
+            current_offset += batch_rows;
+            continue;
+        }
+        let start = if current_offset < offset { offset - current_offset } else { 0 };
+        let len = remaining.min(batch_rows - start);
+        if len > 0 {
+            let sliced = batch.slice(start, len);
+            result.push(sliced);
+            remaining -= len;
+        }
+        if remaining == 0 {
+            break;
+        }
+        current_offset += batch_rows;
+    }
+    result
+}
+pub fn total_row_count(batches: &[RecordBatch]) -> usize {
+    batches.iter().map(|b| b.num_rows()).sum()
+}

mangleframes 0.1.2__tar.gz → 0.1.5__tar.gz

mangleframes 0.1.2.tar.gz → 0.1.5.tar.gz