mangleframes 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mangleframes-0.2.2 → mangleframes-0.2.3}/PKG-INFO +6 -6
- {mangleframes-0.2.2 → mangleframes-0.2.3}/pyproject.toml +2 -2
- {mangleframes-0.2.2 → mangleframes-0.2.3}/python/mangleframes/__init__.py +1 -1
- mangleframes-0.2.3/viewer/src/arrow_reader.rs +176 -0
- mangleframes-0.2.2/viewer/src/arrow_reader.rs +0 -99
- {mangleframes-0.2.2 → mangleframes-0.2.3}/Cargo.lock +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/Cargo.toml +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/python/mangleframes/launcher.py +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/python/mangleframes/protocol.py +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/python/mangleframes/server.py +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/Cargo.toml +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/dashboard.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/dq_handlers.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/export.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/handlers.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/history_analysis.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/history_handlers.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/join_handlers.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/main.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/perf.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/reconcile_handlers.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/socket_client.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/stats.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/web_server.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/src/websocket.rs +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/static/app.js +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/static/index.html +0 -0
- {mangleframes-0.2.2 → mangleframes-0.2.3}/viewer/static/style.css +0 -0
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mangleframes
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Classifier: Programming Language :: Python :: 3
|
|
5
5
|
Classifier: Programming Language :: Rust
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
-
Requires-Dist: databricks-connect
|
|
7
|
+
Requires-Dist: databricks-connect==17.3.3
|
|
8
8
|
Requires-Dist: databricks-labs-dqx>=0.12.0
|
|
9
9
|
Requires-Dist: loguru>=0.7.3
|
|
10
10
|
Requires-Dist: maturin>=1.11.2
|
|
11
11
|
Requires-Dist: pyarrow>=11.0.0
|
|
12
12
|
Requires-Dist: python-dateutil>=2.8.0
|
|
13
|
+
Requires-Dist: pyspark>=3.4.0 ; extra == 'spark'
|
|
14
|
+
Requires-Dist: databricks-labs-dqx>=0.1.5 ; extra == 'dqx'
|
|
13
15
|
Requires-Dist: pytest>=7.0 ; extra == 'dev'
|
|
14
16
|
Requires-Dist: maturin>=1.4 ; extra == 'dev'
|
|
15
|
-
Requires-Dist: databricks-labs-dqx>=0.1.5 ; extra == 'dqx'
|
|
16
|
-
Requires-Dist: pyspark>=3.4.0 ; extra == 'spark'
|
|
17
|
-
Provides-Extra: dev
|
|
18
|
-
Provides-Extra: dqx
|
|
19
17
|
Provides-Extra: spark
|
|
18
|
+
Provides-Extra: dqx
|
|
19
|
+
Provides-Extra: dev
|
|
20
20
|
License-File: LICENSE
|
|
21
21
|
Summary: PySpark DataFrame viewer with modern web UI
|
|
22
22
|
License: MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "maturin"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mangleframes"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "PySpark DataFrame viewer with modern web UI"
|
|
9
9
|
requires-python = ">=3.11"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -14,7 +14,7 @@ classifiers = [
|
|
|
14
14
|
"License :: OSI Approved :: MIT License",
|
|
15
15
|
]
|
|
16
16
|
dependencies = [
|
|
17
|
-
"databricks-connect
|
|
17
|
+
"databricks-connect==17.3.3",
|
|
18
18
|
"databricks-labs-dqx>=0.12.0",
|
|
19
19
|
"loguru>=0.7.3",
|
|
20
20
|
"maturin>=1.11.2",
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
//! Arrow IPC stream parsing and JSON conversion.
|
|
2
|
+
|
|
3
|
+
use std::io::Cursor;
|
|
4
|
+
|
|
5
|
+
use arrow::array::{Array, AsArray, RecordBatch};
|
|
6
|
+
use arrow::datatypes::{DataType, Date32Type, Decimal128Type, DecimalType};
|
|
7
|
+
use arrow::temporal_conversions::date32_to_datetime;
|
|
8
|
+
use arrow_ipc::reader::StreamReader;
|
|
9
|
+
use serde_json::{json, Value};
|
|
10
|
+
use thiserror::Error;
|
|
11
|
+
|
|
12
|
+
#[derive(Error, Debug)]
|
|
13
|
+
pub enum ArrowError {
|
|
14
|
+
#[error("Failed to parse Arrow IPC: {0}")]
|
|
15
|
+
ParseError(#[from] arrow::error::ArrowError),
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
pub fn parse_arrow_stream(data: &[u8]) -> Result<Vec<RecordBatch>, ArrowError> {
|
|
19
|
+
let cursor = Cursor::new(data);
|
|
20
|
+
let reader = StreamReader::try_new(cursor, None)?;
|
|
21
|
+
let batches: Result<Vec<_>, _> = reader.collect();
|
|
22
|
+
Ok(batches?)
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/// Convert Arrow batches to JSON bytes with proper type handling.
|
|
26
|
+
/// Handles Decimal128, Date32, Timestamp, and all primitive types correctly.
|
|
27
|
+
pub fn batches_to_json_bytes(batches: &[RecordBatch], offset: usize, limit: usize) -> (Vec<u8>, usize) {
|
|
28
|
+
if batches.is_empty() {
|
|
29
|
+
return (b"[]".to_vec(), 0);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
|
|
33
|
+
let actual_limit = limit.min(total_rows.saturating_sub(offset));
|
|
34
|
+
if actual_limit == 0 {
|
|
35
|
+
return (b"[]".to_vec(), 0);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
let sliced = slice_batches(batches, offset, actual_limit);
|
|
39
|
+
if sliced.is_empty() {
|
|
40
|
+
return (b"[]".to_vec(), 0);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Build JSON array using custom type handling
|
|
44
|
+
let mut rows: Vec<Value> = Vec::with_capacity(actual_limit);
|
|
45
|
+
for batch in &sliced {
|
|
46
|
+
for row_idx in 0..batch.num_rows() {
|
|
47
|
+
let mut row = serde_json::Map::new();
|
|
48
|
+
for (col_idx, field) in batch.schema().fields().iter().enumerate() {
|
|
49
|
+
let col = batch.column(col_idx);
|
|
50
|
+
let value = array_value_to_json(col.as_ref(), row_idx);
|
|
51
|
+
row.insert(field.name().clone(), value);
|
|
52
|
+
}
|
|
53
|
+
rows.push(Value::Object(row));
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
let row_count = rows.len();
|
|
58
|
+
let bytes = serde_json::to_vec(&rows).unwrap_or_else(|_| b"[]".to_vec());
|
|
59
|
+
(bytes, row_count)
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
fn array_value_to_json(array: &dyn Array, index: usize) -> Value {
|
|
63
|
+
if array.is_null(index) {
|
|
64
|
+
return Value::Null;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
match array.data_type() {
|
|
68
|
+
DataType::Boolean => {
|
|
69
|
+
json!(array.as_boolean().value(index))
|
|
70
|
+
}
|
|
71
|
+
DataType::Int8 => {
|
|
72
|
+
json!(array.as_primitive::<arrow::datatypes::Int8Type>().value(index))
|
|
73
|
+
}
|
|
74
|
+
DataType::Int16 => {
|
|
75
|
+
json!(array.as_primitive::<arrow::datatypes::Int16Type>().value(index))
|
|
76
|
+
}
|
|
77
|
+
DataType::Int32 => {
|
|
78
|
+
json!(array.as_primitive::<arrow::datatypes::Int32Type>().value(index))
|
|
79
|
+
}
|
|
80
|
+
DataType::Int64 => {
|
|
81
|
+
json!(array.as_primitive::<arrow::datatypes::Int64Type>().value(index))
|
|
82
|
+
}
|
|
83
|
+
DataType::UInt8 => {
|
|
84
|
+
json!(array.as_primitive::<arrow::datatypes::UInt8Type>().value(index))
|
|
85
|
+
}
|
|
86
|
+
DataType::UInt16 => {
|
|
87
|
+
json!(array.as_primitive::<arrow::datatypes::UInt16Type>().value(index))
|
|
88
|
+
}
|
|
89
|
+
DataType::UInt32 => {
|
|
90
|
+
json!(array.as_primitive::<arrow::datatypes::UInt32Type>().value(index))
|
|
91
|
+
}
|
|
92
|
+
DataType::UInt64 => {
|
|
93
|
+
json!(array.as_primitive::<arrow::datatypes::UInt64Type>().value(index))
|
|
94
|
+
}
|
|
95
|
+
DataType::Float32 => {
|
|
96
|
+
json!(array.as_primitive::<arrow::datatypes::Float32Type>().value(index))
|
|
97
|
+
}
|
|
98
|
+
DataType::Float64 => {
|
|
99
|
+
json!(array.as_primitive::<arrow::datatypes::Float64Type>().value(index))
|
|
100
|
+
}
|
|
101
|
+
DataType::Utf8 => {
|
|
102
|
+
json!(array.as_string::<i32>().value(index))
|
|
103
|
+
}
|
|
104
|
+
DataType::LargeUtf8 => {
|
|
105
|
+
json!(array.as_string::<i64>().value(index))
|
|
106
|
+
}
|
|
107
|
+
DataType::Date32 => {
|
|
108
|
+
let days = array.as_primitive::<Date32Type>().value(index);
|
|
109
|
+
match date32_to_datetime(days) {
|
|
110
|
+
Some(dt) => json!(dt.format("%Y-%m-%d").to_string()),
|
|
111
|
+
None => Value::Null,
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
DataType::Decimal128(precision, scale) => {
|
|
115
|
+
let arr = array.as_primitive::<Decimal128Type>();
|
|
116
|
+
let value = arr.value(index);
|
|
117
|
+
json!(Decimal128Type::format_decimal(value, *precision, *scale))
|
|
118
|
+
}
|
|
119
|
+
DataType::Timestamp(_, _) => {
|
|
120
|
+
// Format timestamp as ISO string
|
|
121
|
+
use arrow::array::TimestampMicrosecondArray;
|
|
122
|
+
if let Some(ts_array) = array.as_any().downcast_ref::<TimestampMicrosecondArray>() {
|
|
123
|
+
let micros = ts_array.value(index);
|
|
124
|
+
let secs = micros / 1_000_000;
|
|
125
|
+
let nsecs = ((micros % 1_000_000) * 1000) as u32;
|
|
126
|
+
if let Some(dt) = chrono::DateTime::from_timestamp(secs, nsecs) {
|
|
127
|
+
return json!(dt.format("%Y-%m-%dT%H:%M:%S%.6f").to_string());
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
json!(format!("{:?}", array.data_type()))
|
|
131
|
+
}
|
|
132
|
+
_ => json!(format!("{:?}", array.data_type())),
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/// Legacy function for compatibility - parses back to Value
|
|
137
|
+
pub fn batches_to_json(batches: &[RecordBatch], offset: usize, limit: usize) -> Value {
|
|
138
|
+
let (bytes, _) = batches_to_json_bytes(batches, offset, limit);
|
|
139
|
+
serde_json::from_slice(&bytes).unwrap_or(Value::Array(vec![]))
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/// Slice batches to extract rows in range [offset, offset+limit)
|
|
143
|
+
fn slice_batches(batches: &[RecordBatch], offset: usize, limit: usize) -> Vec<RecordBatch> {
|
|
144
|
+
let mut result = Vec::new();
|
|
145
|
+
let mut current_offset = 0;
|
|
146
|
+
let mut remaining = limit;
|
|
147
|
+
|
|
148
|
+
for batch in batches {
|
|
149
|
+
let batch_rows = batch.num_rows();
|
|
150
|
+
|
|
151
|
+
if current_offset + batch_rows <= offset {
|
|
152
|
+
current_offset += batch_rows;
|
|
153
|
+
continue;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
let start = if current_offset < offset { offset - current_offset } else { 0 };
|
|
157
|
+
let len = remaining.min(batch_rows - start);
|
|
158
|
+
|
|
159
|
+
if len > 0 {
|
|
160
|
+
let sliced = batch.slice(start, len);
|
|
161
|
+
result.push(sliced);
|
|
162
|
+
remaining -= len;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
if remaining == 0 {
|
|
166
|
+
break;
|
|
167
|
+
}
|
|
168
|
+
current_offset += batch_rows;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
result
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
pub fn total_row_count(batches: &[RecordBatch]) -> usize {
|
|
175
|
+
batches.iter().map(|b| b.num_rows()).sum()
|
|
176
|
+
}
|
|
@@ -1,99 +0,0 @@
|
|
|
1
|
-
//! Arrow IPC stream parsing and JSON conversion.
|
|
2
|
-
|
|
3
|
-
use std::io::Cursor;
|
|
4
|
-
|
|
5
|
-
use arrow::array::RecordBatch;
|
|
6
|
-
use arrow_ipc::reader::StreamReader;
|
|
7
|
-
use arrow_json::ArrayWriter;
|
|
8
|
-
use serde_json::Value;
|
|
9
|
-
use thiserror::Error;
|
|
10
|
-
|
|
11
|
-
#[derive(Error, Debug)]
|
|
12
|
-
pub enum ArrowError {
|
|
13
|
-
#[error("Failed to parse Arrow IPC: {0}")]
|
|
14
|
-
ParseError(#[from] arrow::error::ArrowError),
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
pub fn parse_arrow_stream(data: &[u8]) -> Result<Vec<RecordBatch>, ArrowError> {
|
|
18
|
-
let cursor = Cursor::new(data);
|
|
19
|
-
let reader = StreamReader::try_new(cursor, None)?;
|
|
20
|
-
let batches: Result<Vec<_>, _> = reader.collect();
|
|
21
|
-
Ok(batches?)
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
/// High-performance JSON conversion returning raw bytes.
|
|
25
|
-
/// Skips intermediate Value parsing for maximum speed.
|
|
26
|
-
pub fn batches_to_json_bytes(batches: &[RecordBatch], offset: usize, limit: usize) -> (Vec<u8>, usize) {
|
|
27
|
-
if batches.is_empty() {
|
|
28
|
-
return (b"[]".to_vec(), 0);
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
|
|
32
|
-
let actual_limit = limit.min(total_rows.saturating_sub(offset));
|
|
33
|
-
if actual_limit == 0 {
|
|
34
|
-
return (b"[]".to_vec(), 0);
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
let sliced = slice_batches(batches, offset, actual_limit);
|
|
38
|
-
if sliced.is_empty() {
|
|
39
|
-
return (b"[]".to_vec(), 0);
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
let mut buf = Vec::with_capacity(actual_limit * 256);
|
|
43
|
-
{
|
|
44
|
-
let mut writer = ArrayWriter::new(&mut buf);
|
|
45
|
-
for batch in &sliced {
|
|
46
|
-
if writer.write(batch).is_err() {
|
|
47
|
-
return (b"[]".to_vec(), 0);
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
if writer.finish().is_err() {
|
|
51
|
-
return (b"[]".to_vec(), 0);
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
let row_count = sliced.iter().map(|b| b.num_rows()).sum();
|
|
56
|
-
(buf, row_count)
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
/// Legacy function for compatibility - parses back to Value
|
|
60
|
-
pub fn batches_to_json(batches: &[RecordBatch], offset: usize, limit: usize) -> Value {
|
|
61
|
-
let (bytes, _) = batches_to_json_bytes(batches, offset, limit);
|
|
62
|
-
serde_json::from_slice(&bytes).unwrap_or(Value::Array(vec![]))
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
/// Slice batches to extract rows in range [offset, offset+limit)
|
|
66
|
-
fn slice_batches(batches: &[RecordBatch], offset: usize, limit: usize) -> Vec<RecordBatch> {
|
|
67
|
-
let mut result = Vec::new();
|
|
68
|
-
let mut current_offset = 0;
|
|
69
|
-
let mut remaining = limit;
|
|
70
|
-
|
|
71
|
-
for batch in batches {
|
|
72
|
-
let batch_rows = batch.num_rows();
|
|
73
|
-
|
|
74
|
-
if current_offset + batch_rows <= offset {
|
|
75
|
-
current_offset += batch_rows;
|
|
76
|
-
continue;
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
let start = if current_offset < offset { offset - current_offset } else { 0 };
|
|
80
|
-
let len = remaining.min(batch_rows - start);
|
|
81
|
-
|
|
82
|
-
if len > 0 {
|
|
83
|
-
let sliced = batch.slice(start, len);
|
|
84
|
-
result.push(sliced);
|
|
85
|
-
remaining -= len;
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
if remaining == 0 {
|
|
89
|
-
break;
|
|
90
|
-
}
|
|
91
|
-
current_offset += batch_rows;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
result
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
pub fn total_row_count(batches: &[RecordBatch]) -> usize {
|
|
98
|
-
batches.iter().map(|b| b.num_rows()).sum()
|
|
99
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|