mangleframes 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mangleframes-0.1.1 → mangleframes-0.1.2}/PKG-INFO +1 -1
- {mangleframes-0.1.1 → mangleframes-0.1.2}/pyproject.toml +1 -1
- {mangleframes-0.1.1 → mangleframes-0.1.2}/python/mangleframes/__init__.py +4 -1
- {mangleframes-0.1.1 → mangleframes-0.1.2}/python/mangleframes/protocol.py +53 -19
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/main.rs +15 -2
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/static/app.js +3 -3
- {mangleframes-0.1.1 → mangleframes-0.1.2}/Cargo.lock +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/Cargo.toml +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/python/mangleframes/launcher.py +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/python/mangleframes/server.py +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/Cargo.toml +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/arrow_reader.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/export.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/handlers.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/query_engine.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/socket_client.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/stats.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/web_server.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/src/websocket.rs +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/static/index.html +0 -0
- {mangleframes-0.1.1 → mangleframes-0.1.2}/viewer/static/style.css +0 -0
|
@@ -5,12 +5,13 @@ import time
|
|
|
5
5
|
from typing import TYPE_CHECKING
|
|
6
6
|
|
|
7
7
|
from .launcher import launch_viewer, open_browser
|
|
8
|
+
from .protocol import clear_stats_cache
|
|
8
9
|
from .server import DataFrameServer
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
12
|
from pyspark.sql import DataFrame
|
|
12
13
|
|
|
13
|
-
__version__ = "0.1.1"
|
|
14
|
+
__version__ = "0.1.2"
|
|
14
15
|
__all__ = ["register", "unregister", "show"]
|
|
15
16
|
|
|
16
17
|
_registry: dict[str, DataFrame] = {}
|
|
@@ -23,6 +24,7 @@ def register(name: str, df: DataFrame) -> None:
|
|
|
23
24
|
global _server
|
|
24
25
|
|
|
25
26
|
_registry[name] = df
|
|
27
|
+
clear_stats_cache(name) # Invalidate cached stats for this name
|
|
26
28
|
|
|
27
29
|
if _server is None:
|
|
28
30
|
_server = DataFrameServer(_registry)
|
|
@@ -33,6 +35,7 @@ def unregister(name: str) -> None:
|
|
|
33
35
|
"""Remove a DataFrame from the viewer."""
|
|
34
36
|
if name in _registry:
|
|
35
37
|
del _registry[name]
|
|
38
|
+
clear_stats_cache(name)
|
|
36
39
|
|
|
37
40
|
|
|
38
41
|
def show(port: int = 8765, block: bool = True) -> None:
|
|
@@ -6,6 +6,7 @@ import struct
|
|
|
6
6
|
from typing import TYPE_CHECKING, Any
|
|
7
7
|
|
|
8
8
|
import pyarrow as pa
|
|
9
|
+
from pyspark.sql import functions as F
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
12
|
from pyspark.sql import DataFrame
|
|
@@ -13,6 +14,17 @@ if TYPE_CHECKING:
|
|
|
13
14
|
STATUS_OK = 0
|
|
14
15
|
STATUS_ERROR = 1
|
|
15
16
|
|
|
17
|
+
# Cache for computed stats (cleared when DataFrame is re-registered)
|
|
18
|
+
_stats_cache: dict[str, dict] = {}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def clear_stats_cache(name: str | None = None) -> None:
|
|
22
|
+
"""Clear cached stats for a DataFrame or all DataFrames."""
|
|
23
|
+
if name is None:
|
|
24
|
+
_stats_cache.clear()
|
|
25
|
+
elif name in _stats_cache:
|
|
26
|
+
del _stats_cache[name]
|
|
27
|
+
|
|
16
28
|
|
|
17
29
|
def encode_response(status: int, payload: bytes) -> bytes:
|
|
18
30
|
"""Encode response with status and length prefix."""
|
|
@@ -66,37 +78,59 @@ def handle_get(registry: dict[str, DataFrame], name: str, limit: int) -> bytes:
|
|
|
66
78
|
return encode_response(STATUS_OK, sink.getvalue().to_pybytes())
|
|
67
79
|
|
|
68
80
|
|
|
81
|
+
def _is_numeric_type(dtype_str: str) -> bool:
|
|
82
|
+
"""Check if a Spark type string represents a numeric type."""
|
|
83
|
+
dtype_lower = dtype_str.lower()
|
|
84
|
+
return any(t in dtype_lower for t in ["int", "long", "double", "float", "decimal"])
|
|
85
|
+
|
|
86
|
+
|
|
69
87
|
def handle_stats(registry: dict[str, DataFrame], name: str) -> bytes:
|
|
70
|
-
"""Return basic statistics for a DataFrame."""
|
|
88
|
+
"""Return basic statistics for a DataFrame using single aggregation."""
|
|
71
89
|
if name not in registry:
|
|
72
90
|
return encode_error(f"DataFrame '{name}' not found")
|
|
73
91
|
|
|
92
|
+
# Return cached stats if available
|
|
93
|
+
if name in _stats_cache:
|
|
94
|
+
return encode_json_response(_stats_cache[name])
|
|
95
|
+
|
|
74
96
|
df = registry[name]
|
|
75
|
-
|
|
97
|
+
fields = df.schema.fields
|
|
76
98
|
|
|
99
|
+
# Build all aggregation expressions in one pass
|
|
100
|
+
agg_exprs = [F.count(F.lit(1)).alias("__total")]
|
|
101
|
+
for field in fields:
|
|
102
|
+
col_name = field.name
|
|
103
|
+
agg_exprs.append(
|
|
104
|
+
F.sum(F.when(F.col(col_name).isNull(), 1).otherwise(0)).alias(f"{col_name}__nulls")
|
|
105
|
+
)
|
|
106
|
+
if _is_numeric_type(str(field.dataType)):
|
|
107
|
+
agg_exprs.append(F.min(col_name).alias(f"{col_name}__min"))
|
|
108
|
+
agg_exprs.append(F.max(col_name).alias(f"{col_name}__max"))
|
|
109
|
+
|
|
110
|
+
# Single Spark action
|
|
111
|
+
result = df.agg(*agg_exprs).collect()[0]
|
|
112
|
+
row_count = result["__total"]
|
|
113
|
+
|
|
114
|
+
# Extract stats from result
|
|
77
115
|
column_stats = []
|
|
78
|
-
for field in df.schema.fields:
|
|
116
|
+
for field in fields:
|
|
79
117
|
col_name = field.name
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
stats["null_count"] = null_count_row
|
|
118
|
+
dtype_str = str(field.dataType)
|
|
119
|
+
stats = {"name": col_name, "type": dtype_str, "nullable": field.nullable}
|
|
120
|
+
stats["null_count"] = result[f"{col_name}__nulls"] or 0
|
|
84
121
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
stats["min"] = str(agg_result) if agg_result is not None else None
|
|
89
|
-
|
|
90
|
-
agg_result = df.agg({col_name: "max"}).collect()[0][0]
|
|
91
|
-
stats["max"] = str(agg_result) if agg_result is not None else None
|
|
122
|
+
if _is_numeric_type(dtype_str):
|
|
123
|
+
min_val = result[f"{col_name}__min"]
|
|
124
|
+
max_val = result[f"{col_name}__max"]
|
|
125
|
+
stats["min"] = str(min_val) if min_val is not None else None
|
|
126
|
+
stats["max"] = str(max_val) if max_val is not None else None
|
|
92
127
|
|
|
93
128
|
column_stats.append(stats)
|
|
94
129
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
})
|
|
130
|
+
stats_data = {"name": name, "row_count": row_count, "columns": column_stats}
|
|
131
|
+
_stats_cache[name] = stats_data # Cache for future requests
|
|
132
|
+
|
|
133
|
+
return encode_json_response(stats_data)
|
|
100
134
|
|
|
101
135
|
|
|
102
136
|
def dispatch_command(
|
|
@@ -17,7 +17,7 @@ use tracing::info;
|
|
|
17
17
|
use tracing_subscriber::EnvFilter;
|
|
18
18
|
|
|
19
19
|
use crate::socket_client::SocketClient;
|
|
20
|
-
use crate::web_server::AppState;
|
|
20
|
+
use crate::web_server::{AppState, CachedFrame};
|
|
21
21
|
|
|
22
22
|
#[derive(Parser)]
|
|
23
23
|
#[command(name = "mangleframes-viewer")]
|
|
@@ -43,7 +43,20 @@ async fn main() -> anyhow::Result<()> {
|
|
|
43
43
|
|
|
44
44
|
info!("Connecting to Python server at {:?}", args.socket);
|
|
45
45
|
let client = Arc::new(SocketClient::new(&args.socket));
|
|
46
|
-
let state = AppState::new(client);
|
|
46
|
+
let state = AppState::new(client.clone());
|
|
47
|
+
|
|
48
|
+
// Preload first frame into cache for instant display
|
|
49
|
+
if let Ok(frames) = client.list_frames() {
|
|
50
|
+
if let Some(first) = frames.first() {
|
|
51
|
+
info!("Preloading frame: {}", first);
|
|
52
|
+
if let Ok(data) = client.get_frame(first, 1000) {
|
|
53
|
+
if let Ok(batches) = arrow_reader::parse_arrow_stream(&data) {
|
|
54
|
+
let mut cache = state.cache.write().await;
|
|
55
|
+
cache.insert(first.clone(), CachedFrame { batches, stats: None });
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
47
60
|
|
|
48
61
|
if !args.no_browser {
|
|
49
62
|
let url = format!("http://localhost:{}", args.port);
|
|
@@ -297,9 +297,9 @@ function init() {
|
|
|
297
297
|
state.offset = 0;
|
|
298
298
|
state.sortCol = null;
|
|
299
299
|
|
|
300
|
-
|
|
301
|
-
await loadData(name);
|
|
302
|
-
loadStats(name);
|
|
300
|
+
// Fetch schema and data in parallel for faster display
|
|
301
|
+
await Promise.all([loadSchema(name), loadData(name)]);
|
|
302
|
+
loadStats(name); // Fire-and-forget, updates UI when ready
|
|
303
303
|
};
|
|
304
304
|
|
|
305
305
|
$('refresh-btn').onclick = () => {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|