mangleframes-0.2.9.tar.gz → mangleframes-0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {mangleframes-0.2.9 → mangleframes-0.3.0}/PKG-INFO +1 -1
  2. {mangleframes-0.2.9 → mangleframes-0.3.0}/pyproject.toml +1 -1
  3. {mangleframes-0.2.9 → mangleframes-0.3.0}/python/mangleframes/__init__.py +1 -1
  4. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/src/client.rs +97 -0
  5. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/reconcile_handlers.rs +108 -13
  6. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/spark_client.rs +14 -0
  7. {mangleframes-0.2.9 → mangleframes-0.3.0}/Cargo.lock +0 -0
  8. {mangleframes-0.2.9 → mangleframes-0.3.0}/Cargo.toml +0 -0
  9. {mangleframes-0.2.9 → mangleframes-0.3.0}/python/mangleframes/alerts.py +0 -0
  10. {mangleframes-0.2.9 → mangleframes-0.3.0}/python/mangleframes/launcher.py +0 -0
  11. {mangleframes-0.2.9 → mangleframes-0.3.0}/python/mangleframes/session.py +0 -0
  12. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/Cargo.toml +0 -0
  13. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/build.rs +0 -0
  14. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/base.proto +0 -0
  15. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/catalog.proto +0 -0
  16. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/commands.proto +0 -0
  17. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/common.proto +0 -0
  18. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/expressions.proto +0 -0
  19. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/ml.proto +0 -0
  20. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/ml_common.proto +0 -0
  21. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/relations.proto +0 -0
  22. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/proto/spark/connect/types.proto +0 -0
  23. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/src/error.rs +0 -0
  24. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/src/lib.rs +0 -0
  25. {mangleframes-0.2.9 → mangleframes-0.3.0}/spark-connect/src/proto/spark.connect.rs +0 -0
  26. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/Cargo.toml +0 -0
  27. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/index.html +0 -0
  28. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/package-lock.json +0 -0
  29. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/package.json +0 -0
  30. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/postcss.config.js +0 -0
  31. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/App.tsx +0 -0
  32. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/analysis/JoinAnalyzer.tsx +0 -0
  33. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/analysis/Reconciliation.tsx +0 -0
  34. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/analysis/SQLEditor.tsx +0 -0
  35. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/data/ColumnDropdown.tsx +0 -0
  36. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/data/ColumnStats.tsx +0 -0
  37. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/data/DataGrid.tsx +0 -0
  38. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/data/SchemaView.tsx +0 -0
  39. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/layout/ContextPanel.tsx +0 -0
  40. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/layout/Layout.tsx +0 -0
  41. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/layout/MainContent.tsx +0 -0
  42. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/layout/Sidebar.tsx +0 -0
  43. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/layout/StatusBar.tsx +0 -0
  44. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/layout/TabBar.tsx +0 -0
  45. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/layout/TopBar.tsx +0 -0
  46. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/quality/AlertBuilder.tsx +0 -0
  47. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/components/quality/QualityDashboard.tsx +0 -0
  48. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/index.css +0 -0
  49. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/main.tsx +0 -0
  50. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/stores/dataStore.ts +0 -0
  51. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/src/stores/uiStore.ts +0 -0
  52. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/tailwind.config.js +0 -0
  53. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/tsconfig.json +0 -0
  54. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/tsconfig.node.json +0 -0
  55. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/frontend/vite.config.ts +0 -0
  56. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/alert_handlers.rs +0 -0
  57. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/arrow_reader.rs +0 -0
  58. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/dashboard.rs +0 -0
  59. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/export.rs +0 -0
  60. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/handlers.rs +0 -0
  61. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/history_analysis.rs +0 -0
  62. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/history_handlers.rs +0 -0
  63. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/join_handlers.rs +0 -0
  64. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/main.rs +0 -0
  65. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/perf.rs +0 -0
  66. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/sql_builder.rs +0 -0
  67. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/stats.rs +0 -0
  68. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/test_helpers.rs +0 -0
  69. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/web_server.rs +0 -0
  70. {mangleframes-0.2.9 → mangleframes-0.3.0}/viewer/src/websocket.rs +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mangleframes
3
- Version: 0.2.9
3
+ Version: 0.3.0
4
4
  Classifier: Programming Language :: Python :: 3
5
5
  Classifier: Programming Language :: Rust
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "mangleframes"
7
- version = "0.2.9"
7
+ version = "0.3.0"
8
8
  description = "PySpark DataFrame viewer with modern web UI"
9
9
  requires-python = ">=3.12"
10
10
  license = { text = "MIT" }
@@ -32,7 +32,7 @@ from .session import SparkSession, get_proxy_port, get_spark_session
32
32
  if TYPE_CHECKING:
33
33
  from pyspark.sql import DataFrame
34
34
 
35
- __version__ = "0.2.9"
35
+ __version__ = "0.3.0"
36
36
 
37
37
  # Import alert classes for convenience (optional dependency)
38
38
  try:
@@ -5,6 +5,7 @@ use std::time::Instant;
5
5
 
6
6
  use arrow::record_batch::RecordBatch;
7
7
  use arrow_ipc::reader::StreamReader;
8
+ use arrow_ipc::writer::StreamWriter;
8
9
  use tonic::metadata::MetadataValue;
9
10
  use tonic::service::Interceptor;
10
11
  use tonic::transport::{Channel, ClientTlsConfig};
@@ -364,6 +365,102 @@ impl SparkConnectClient {
364
365
  info!("Found {} tables", tables.len());
365
366
  Ok(tables)
366
367
  }
368
+
369
+ /// Register Arrow record batches as a temporary view in Spark.
370
+ pub async fn create_temp_view(
371
+ &self,
372
+ view_name: &str,
373
+ batches: &[RecordBatch],
374
+ ) -> Result<(), SparkConnectError> {
375
+ if batches.is_empty() {
376
+ return Err(SparkConnectError::Config("No batches to register".to_string()));
377
+ }
378
+
379
+ info!("Registering temp view '{}' with {} batches", view_name, batches.len());
380
+
381
+ // Serialize batches to Arrow IPC streaming format
382
+ let ipc_data = serialize_batches_to_ipc(batches)?;
383
+ info!("Serialized {} bytes of Arrow IPC data", ipc_data.len());
384
+
385
+ // Create LocalRelation with the IPC data
386
+ let local_relation = Relation {
387
+ common: None,
388
+ rel_type: Some(crate::proto::relation::RelType::LocalRelation(
389
+ crate::proto::LocalRelation {
390
+ data: Some(ipc_data),
391
+ schema: None,
392
+ },
393
+ )),
394
+ };
395
+
396
+ // Create the temp view command
397
+ let command = crate::proto::Command {
398
+ command_type: Some(crate::proto::command::CommandType::CreateDataframeView(
399
+ crate::proto::CreateDataFrameViewCommand {
400
+ input: Some(local_relation),
401
+ name: view_name.to_string(),
402
+ is_global: false,
403
+ replace: true,
404
+ },
405
+ )),
406
+ };
407
+
408
+ self.execute_command(command).await?;
409
+ info!("Successfully registered temp view '{}'", view_name);
410
+ Ok(())
411
+ }
412
+
413
+ /// Execute a Spark Connect command (non-query operation).
414
+ async fn execute_command(
415
+ &self,
416
+ command: crate::proto::Command,
417
+ ) -> Result<(), SparkConnectError> {
418
+ let plan = Plan {
419
+ op_type: Some(crate::proto::plan::OpType::Command(command)),
420
+ };
421
+
422
+ let request = ExecutePlanRequest {
423
+ session_id: self.session_id.clone(),
424
+ user_context: Some(UserContext {
425
+ user_id: "spark-connect-rs".to_string(),
426
+ user_name: "spark-connect-rs".to_string(),
427
+ extensions: vec![],
428
+ }),
429
+ operation_id: Some(Uuid::new_v4().to_string()),
430
+ plan: Some(plan),
431
+ client_type: Some("spark-connect-rs".to_string()),
432
+ request_options: vec![],
433
+ tags: vec![],
434
+ client_observed_server_side_session_id: None,
435
+ };
436
+
437
+ let response = match &self.inner {
438
+ ClientInner::Direct(c) => c.clone().execute_plan(request).await?,
439
+ ClientInner::Proxy(c) => c.clone().execute_plan(request).await?,
440
+ };
441
+
442
+ // Drain the response stream
443
+ let mut stream = response.into_inner();
444
+ while stream.message().await?.is_some() {}
445
+
446
+ Ok(())
447
+ }
448
+ }
449
+
450
+ /// Serialize Arrow RecordBatches to IPC streaming format.
451
+ fn serialize_batches_to_ipc(batches: &[RecordBatch]) -> Result<Vec<u8>, SparkConnectError> {
452
+ let schema = batches[0].schema();
453
+ let mut ipc_data = Vec::new();
454
+
455
+ {
456
+ let mut writer = StreamWriter::try_new(&mut ipc_data, &schema)?;
457
+ for batch in batches {
458
+ writer.write(batch)?;
459
+ }
460
+ writer.finish()?;
461
+ }
462
+
463
+ Ok(ipc_data)
367
464
  }
368
465
 
369
466
  /// Extract a JSON value from an Arrow array at a specific index.
@@ -23,6 +23,14 @@ use crate::web_server::{AppState, CachedFrame};
23
23
 
24
24
  const CSV_FRAME_PREFIX: &str = "__csv__";
25
25
 
26
+ /// Sanitize a name for use as a Spark temp view name.
27
+ /// Replaces non-alphanumeric characters (except underscores) with underscores.
28
+ fn sanitize_name(name: &str) -> String {
29
+ name.chars()
30
+ .map(|c| if c.is_alphanumeric() || c == '_' { c } else { '_' })
31
+ .collect()
32
+ }
33
+
26
34
  #[derive(Debug, Clone, Serialize, Deserialize)]
27
35
  #[serde(rename_all = "snake_case")]
28
36
  pub enum JoinType {
@@ -229,13 +237,44 @@ pub async fn reconcile(
229
237
  }
230
238
  };
231
239
 
240
+ // Check if source is a CSV frame and register it as a temp view in Spark
241
+ let source_frame_name = {
242
+ let csv_cache_key = format!("{}{}", CSV_FRAME_PREFIX, req.source_frame);
243
+ let cache = state.cache.read().await;
244
+
245
+ if let Some(cached) = cache.get(&csv_cache_key) {
246
+ let view_name = format!("__mf_csv_{}", sanitize_name(&req.source_frame));
247
+ tracing::info!(
248
+ "Registering CSV '{}' as temp view '{}' ({} batches)",
249
+ req.source_frame,
250
+ view_name,
251
+ cached.batches.len()
252
+ );
253
+
254
+ // Drop the cache lock before async operation
255
+ let batches = cached.batches.clone();
256
+ drop(cache);
257
+
258
+ if let Err(e) = dbx.create_temp_view(&view_name, &batches).await {
259
+ return error_response(
260
+ StatusCode::INTERNAL_SERVER_ERROR,
261
+ &format!("Failed to register CSV as temp view: {}", e),
262
+ );
263
+ }
264
+ view_name
265
+ } else {
266
+ drop(cache);
267
+ req.source_frame.clone()
268
+ }
269
+ };
270
+
232
271
  let config = build_reconcile_config(&req);
233
272
  let sample_limit = req.sample_limit.unwrap_or(100);
234
273
 
235
- tracing::info!("Executing reconciliation: {} vs {}", req.source_frame, req.target_frame);
274
+ tracing::info!("Executing reconciliation: {} vs {}", source_frame_name, req.target_frame);
236
275
 
237
- // Execute stats query
238
- let stats = match execute_stats_query(dbx, &req.source_frame, &req.target_frame, &config).await
276
+ // Execute stats query (use source_frame_name for CSV temp view)
277
+ let stats = match execute_stats_query(dbx, &source_frame_name, &req.target_frame, &config).await
239
278
  {
240
279
  Ok(s) => s,
241
280
  Err(e) => return error_response(StatusCode::INTERNAL_SERVER_ERROR, &e),
@@ -243,7 +282,7 @@ pub async fn reconcile(
243
282
 
244
283
  // Execute source-only query
245
284
  let source_only_rows =
246
- match execute_source_only_query(dbx, &req.source_frame, &req.target_frame, &config, sample_limit).await
285
+ match execute_source_only_query(dbx, &source_frame_name, &req.target_frame, &config, sample_limit).await
247
286
  {
248
287
  Ok(rows) => rows,
249
288
  Err(e) => return error_response(StatusCode::INTERNAL_SERVER_ERROR, &e),
@@ -251,7 +290,7 @@ pub async fn reconcile(
251
290
 
252
291
  // Execute target-only query
253
292
  let target_only_rows =
254
- match execute_target_only_query(dbx, &req.source_frame, &req.target_frame, &config, sample_limit).await
293
+ match execute_target_only_query(dbx, &source_frame_name, &req.target_frame, &config, sample_limit).await
255
294
  {
256
295
  Ok(rows) => rows,
257
296
  Err(e) => return error_response(StatusCode::INTERNAL_SERVER_ERROR, &e),
@@ -259,7 +298,7 @@ pub async fn reconcile(
259
298
 
260
299
  // Execute matched rows query
261
300
  let matched_rows =
262
- match execute_matched_query(dbx, &req.source_frame, &req.target_frame, &config, sample_limit).await
301
+ match execute_matched_query(dbx, &source_frame_name, &req.target_frame, &config, sample_limit).await
263
302
  {
264
303
  Ok(rows) => rows,
265
304
  Err(e) => return error_response(StatusCode::INTERNAL_SERVER_ERROR, &e),
@@ -267,7 +306,7 @@ pub async fn reconcile(
267
306
 
268
307
  // Execute totals query
269
308
  let column_totals =
270
- match execute_totals_query(dbx, &req.source_frame, &req.target_frame, &config).await {
309
+ match execute_totals_query(dbx, &source_frame_name, &req.target_frame, &config).await {
271
310
  Ok(totals) => totals,
272
311
  Err(e) => {
273
312
  tracing::warn!("Failed to get column totals: {}", e);
@@ -536,6 +575,35 @@ pub async fn export_reconciliation(
536
575
  }
537
576
  };
538
577
 
578
+ // Check if source is a CSV frame and register it as a temp view in Spark
579
+ let source_frame_name = {
580
+ let csv_cache_key = format!("{}{}", CSV_FRAME_PREFIX, req.source_frame);
581
+ let cache = state.cache.read().await;
582
+
583
+ if let Some(cached) = cache.get(&csv_cache_key) {
584
+ let view_name = format!("__mf_csv_{}", sanitize_name(&req.source_frame));
585
+ tracing::info!(
586
+ "Registering CSV '{}' as temp view '{}' for export",
587
+ req.source_frame,
588
+ view_name
589
+ );
590
+
591
+ let batches = cached.batches.clone();
592
+ drop(cache);
593
+
594
+ if let Err(e) = dbx.create_temp_view(&view_name, &batches).await {
595
+ return error_response(
596
+ StatusCode::INTERNAL_SERVER_ERROR,
597
+ &format!("Failed to register CSV as temp view: {}", e),
598
+ );
599
+ }
600
+ view_name
601
+ } else {
602
+ drop(cache);
603
+ req.source_frame.clone()
604
+ }
605
+ };
606
+
539
607
  let config = json!({
540
608
  "source_type": req.source_type,
541
609
  "source_group_by": req.source_group_by,
@@ -550,13 +618,13 @@ pub async fn export_reconciliation(
550
618
 
551
619
  tracing::info!(
552
620
  "Exporting reconciliation dashboard: {} vs {}",
553
- req.source_frame,
621
+ source_frame_name,
554
622
  req.target_frame
555
623
  );
556
624
 
557
625
  // Execute stats query
558
626
  let stats =
559
- match execute_stats_query(dbx, &req.source_frame, &req.target_frame, &config).await {
627
+ match execute_stats_query(dbx, &source_frame_name, &req.target_frame, &config).await {
560
628
  Ok(s) => s,
561
629
  Err(e) => return error_response(StatusCode::INTERNAL_SERVER_ERROR, &e),
562
630
  };
@@ -564,7 +632,7 @@ pub async fn export_reconciliation(
564
632
  // Execute source-only query
565
633
  let source_only_rows = match execute_source_only_query(
566
634
  dbx,
567
- &req.source_frame,
635
+ &source_frame_name,
568
636
  &req.target_frame,
569
637
  &config,
570
638
  sample_limit,
@@ -578,7 +646,7 @@ pub async fn export_reconciliation(
578
646
  // Execute target-only query
579
647
  let target_only_rows = match execute_target_only_query(
580
648
  dbx,
581
- &req.source_frame,
649
+ &source_frame_name,
582
650
  &req.target_frame,
583
651
  &config,
584
652
  sample_limit,
@@ -592,7 +660,7 @@ pub async fn export_reconciliation(
592
660
  // Execute matched rows query
593
661
  let matched_rows = match execute_matched_query(
594
662
  dbx,
595
- &req.source_frame,
663
+ &source_frame_name,
596
664
  &req.target_frame,
597
665
  &config,
598
666
  sample_limit,
@@ -605,7 +673,7 @@ pub async fn export_reconciliation(
605
673
 
606
674
  // Execute totals query
607
675
  let column_totals =
608
- match execute_totals_query(dbx, &req.source_frame, &req.target_frame, &config).await {
676
+ match execute_totals_query(dbx, &source_frame_name, &req.target_frame, &config).await {
609
677
  Ok(totals) => totals,
610
678
  Err(e) => {
611
679
  tracing::warn!("Failed to get column totals: {}", e);
@@ -905,4 +973,31 @@ mod tests {
905
973
  let row = json!({"count": "not a number"});
906
974
  assert_eq!(extract_i64(&row, "count"), 0);
907
975
  }
976
+
977
+ // ============ sanitize_name tests ============
978
+
979
+ #[test]
980
+ fn test_sanitize_name_spaces_and_parens() {
981
+ assert_eq!(sanitize_name("orders (1)"), "orders__1_");
982
+ }
983
+
984
+ #[test]
985
+ fn test_sanitize_name_dots_and_hyphens() {
986
+ assert_eq!(sanitize_name("my-table.csv"), "my_table_csv");
987
+ }
988
+
989
+ #[test]
990
+ fn test_sanitize_name_already_valid() {
991
+ assert_eq!(sanitize_name("normal_name"), "normal_name");
992
+ }
993
+
994
+ #[test]
995
+ fn test_sanitize_name_alphanumeric() {
996
+ assert_eq!(sanitize_name("table123"), "table123");
997
+ }
998
+
999
+ #[test]
1000
+ fn test_sanitize_name_special_chars() {
1001
+ assert_eq!(sanitize_name("data@2024#test!"), "data_2024_test_");
1002
+ }
908
1003
  }
@@ -102,6 +102,20 @@ impl DatabricksClient {
102
102
  execution_ms,
103
103
  })
104
104
  }
105
+
106
+ /// Register Arrow batches as a temporary view in Spark.
107
+ pub async fn create_temp_view(
108
+ &self,
109
+ view_name: &str,
110
+ batches: &[RecordBatch],
111
+ ) -> Result<(), SparkConnectError> {
112
+ let guard = self.client.read().await;
113
+ let client = guard
114
+ .as_ref()
115
+ .ok_or_else(|| SparkConnectError::Config("Not connected".to_string()))?;
116
+
117
+ client.create_temp_view(view_name, batches).await
118
+ }
105
119
  }
106
120
 
107
121
  impl Default for DatabricksClient {
File without changes
File without changes