mangleframes 0.3.4.tar.gz → 0.3.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. {mangleframes-0.3.4 → mangleframes-0.3.5}/PKG-INFO +1 -1
  2. {mangleframes-0.3.4 → mangleframes-0.3.5}/pyproject.toml +1 -1
  3. {mangleframes-0.3.4 → mangleframes-0.3.5}/python/mangleframes/__init__.py +1 -1
  4. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/src/client.rs +190 -7
  5. mangleframes-0.3.5/viewer/src/benchmark.rs +242 -0
  6. mangleframes-0.3.5/viewer/src/main.rs +194 -0
  7. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/spark_client.rs +30 -0
  8. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/sql_builder.rs +18 -3
  9. mangleframes-0.3.4/viewer/src/main.rs +0 -109
  10. {mangleframes-0.3.4 → mangleframes-0.3.5}/Cargo.lock +0 -0
  11. {mangleframes-0.3.4 → mangleframes-0.3.5}/Cargo.toml +0 -0
  12. {mangleframes-0.3.4 → mangleframes-0.3.5}/python/mangleframes/alerts.py +0 -0
  13. {mangleframes-0.3.4 → mangleframes-0.3.5}/python/mangleframes/launcher.py +0 -0
  14. {mangleframes-0.3.4 → mangleframes-0.3.5}/python/mangleframes/session.py +0 -0
  15. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/Cargo.toml +0 -0
  16. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/build.rs +0 -0
  17. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/base.proto +0 -0
  18. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/catalog.proto +0 -0
  19. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/commands.proto +0 -0
  20. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/common.proto +0 -0
  21. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/expressions.proto +0 -0
  22. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/ml.proto +0 -0
  23. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/ml_common.proto +0 -0
  24. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/relations.proto +0 -0
  25. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/proto/spark/connect/types.proto +0 -0
  26. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/src/error.rs +0 -0
  27. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/src/lib.rs +0 -0
  28. {mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/src/proto/spark.connect.rs +0 -0
  29. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/Cargo.toml +0 -0
  30. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/index.html +0 -0
  31. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/package-lock.json +0 -0
  32. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/package.json +0 -0
  33. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/postcss.config.js +0 -0
  34. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/App.tsx +0 -0
  35. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/analysis/JoinAnalyzer.tsx +0 -0
  36. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/analysis/Reconciliation.tsx +0 -0
  37. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/analysis/SQLEditor.tsx +0 -0
  38. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/analysis/__tests__/JoinAnalyzer.test.tsx +0 -0
  39. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/data/ColumnDropdown.tsx +0 -0
  40. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/data/ColumnStats.tsx +0 -0
  41. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/data/DataGrid.tsx +0 -0
  42. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/data/SchemaView.tsx +0 -0
  43. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/ERDBuilder.tsx +0 -0
  44. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/ERDCanvas.tsx +0 -0
  45. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/ERDConfigModal.tsx +0 -0
  46. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/ERDTableList.tsx +0 -0
  47. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/ERDToolbar.tsx +0 -0
  48. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/ERDValidationPanel.tsx +0 -0
  49. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/TableNode.tsx +0 -0
  50. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/__tests__/ERDDragDrop.test.tsx +0 -0
  51. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/erd/index.ts +0 -0
  52. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/layout/ContextPanel.tsx +0 -0
  53. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/layout/Layout.tsx +0 -0
  54. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/layout/MainContent.tsx +0 -0
  55. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/layout/Sidebar.tsx +0 -0
  56. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/layout/StatusBar.tsx +0 -0
  57. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/layout/TabBar.tsx +0 -0
  58. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/layout/TopBar.tsx +0 -0
  59. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/quality/AlertBuilder.tsx +0 -0
  60. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/components/quality/QualityDashboard.tsx +0 -0
  61. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/index.css +0 -0
  62. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/lib/api.ts +0 -0
  63. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/lib/erdValidation.ts +0 -0
  64. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/main.tsx +0 -0
  65. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/stores/dataStore.ts +0 -0
  66. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/stores/erdStore.ts +0 -0
  67. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/stores/uiStore.ts +0 -0
  68. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/src/test/setup.ts +0 -0
  69. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/tailwind.config.js +0 -0
  70. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/tsconfig.json +0 -0
  71. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/tsconfig.node.json +0 -0
  72. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/frontend/vite.config.ts +0 -0
  73. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/alert_handlers.rs +0 -0
  74. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/arrow_reader.rs +0 -0
  75. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/dashboard.rs +0 -0
  76. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/export.rs +0 -0
  77. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/handlers.rs +0 -0
  78. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/history_analysis.rs +0 -0
  79. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/history_handlers.rs +0 -0
  80. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/join_handlers.rs +0 -0
  81. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/perf.rs +0 -0
  82. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/reconcile_handlers.rs +0 -0
  83. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/stats.rs +0 -0
  84. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/test_helpers.rs +0 -0
  85. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/web_server.rs +0 -0
  86. {mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/websocket.rs +0 -0
{mangleframes-0.3.4 → mangleframes-0.3.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mangleframes
-Version: 0.3.4
+Version: 0.3.5
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Rust
 Classifier: License :: OSI Approved :: MIT License
{mangleframes-0.3.4 → mangleframes-0.3.5}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "mangleframes"
-version = "0.3.4"
+version = "0.3.5"
 description = "PySpark DataFrame viewer with modern web UI"
 requires-python = ">=3.12"
 license = { text = "MIT" }
{mangleframes-0.3.4 → mangleframes-0.3.5}/python/mangleframes/__init__.py
@@ -32,7 +32,7 @@ from .session import SparkSession, get_proxy_port, get_spark_session
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
 
 # Import alert classes for convenience (optional dependency)
 try:
{mangleframes-0.3.4 → mangleframes-0.3.5}/spark-connect/src/client.rs
@@ -16,7 +16,8 @@ use uuid::Uuid;
 use crate::error::SparkConnectError;
 use crate::proto::spark_connect_service_client::SparkConnectServiceClient;
 use crate::proto::{
-    ExecutePlanRequest, Plan, Relation, Sql, UserContext,
+    ExecutePlanRequest, Plan, ReattachExecuteRequest, ReattachOptions, Relation, Sql, UserContext,
+    execute_plan_request::{RequestOption, request_option},
     execute_plan_response::ResponseType,
 };
 
@@ -247,26 +248,140 @@ impl SparkConnectClient {
             ClientInner::Proxy(c) => c.clone().execute_plan(request).await?,
         };
         let mut stream = response.into_inner();
-
         let mut batches = Vec::new();
-        let mut arrow_data = Vec::new();
 
         while let Some(resp) = stream.message().await? {
             if let Some(response_type) = resp.response_type {
                 if let ResponseType::ArrowBatch(batch) = response_type {
-                    arrow_data.extend_from_slice(&batch.data);
+                    // Each ArrowBatch is a complete IPC stream - parse it separately
+                    let parsed = parse_arrow_ipc(&batch.data)?;
+                    batches.extend(parsed);
                 }
             }
         }
 
-        if !arrow_data.is_empty() {
-            batches = parse_arrow_ipc(&arrow_data)?;
+        let elapsed_ms = start.elapsed().as_millis();
+        let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
+        info!(
+            "SQL executed in {}ms, {} rows returned",
+            elapsed_ms, row_count
+        );
+
+        if batches.is_empty() {
+            return Err(SparkConnectError::NoData);
+        }
+
+        Ok(batches)
+    }
+
+    /// Execute SQL with reattachable execution for large result sets (>10K rows).
+    /// Uses ReattachExecute RPC to continue fetching when server sends partial results.
+    pub async fn sql_reattachable(
+        &self,
+        query: &str,
+        limit: u32,
+    ) -> Result<Vec<RecordBatch>, SparkConnectError> {
+        let start = Instant::now();
+        info!("Executing reattachable SQL via Spark Connect: {}", query);
+
+        let sql_relation = Relation {
+            common: None,
+            rel_type: Some(crate::proto::relation::RelType::Sql(Sql {
+                query: query.to_string(),
+                args: Default::default(),
+                pos_args: vec![],
+                named_arguments: Default::default(),
+                pos_arguments: vec![],
+            })),
+        };
+
+        let relation = if limit < u32::MAX {
+            Relation {
+                common: None,
+                rel_type: Some(crate::proto::relation::RelType::Limit(Box::new(
+                    crate::proto::Limit {
+                        input: Some(Box::new(sql_relation)),
+                        limit: limit as i32,
+                    },
+                ))),
+            }
+        } else {
+            sql_relation
+        };
+
+        let plan = Plan {
+            op_type: Some(crate::proto::plan::OpType::Root(relation)),
+        };
+
+        let operation_id = Uuid::new_v4().to_string();
+        let reattach_option = RequestOption {
+            request_option: Some(request_option::RequestOption::ReattachOptions(
+                ReattachOptions { reattachable: true },
+            )),
+        };
+
+        let request = ExecutePlanRequest {
+            session_id: self.session_id.clone(),
+            user_context: Some(UserContext {
+                user_id: "spark-connect-rs".to_string(),
+                user_name: "spark-connect-rs".to_string(),
+                extensions: vec![],
+            }),
+            operation_id: Some(operation_id.clone()),
+            plan: Some(plan),
+            client_type: Some("spark-connect-rs".to_string()),
+            request_options: vec![reattach_option],
+            tags: vec![],
+            client_observed_server_side_session_id: None,
+        };
+
+        let mut batches = Vec::new();
+        let mut last_response_id: Option<String> = None;
+        let mut result_complete = false;
+
+        // Initial execution
+        info!("Starting reattachable execution for operation {}", operation_id);
+        let (complete, resp_id) = self
+            .process_execute_stream(request, &mut batches)
+            .await?;
+        result_complete = complete;
+        last_response_id = resp_id;
+        info!(
+            "Initial stream ended: result_complete={}, last_response_id={:?}, batches_count={}",
+            result_complete, last_response_id, batches.len()
+        );
+
+        // Reattach loop: continue fetching if ResultComplete was not received
+        while !result_complete {
+            info!(
+                "Reattaching to operation {} from response {:?}",
+                operation_id, last_response_id
+            );
+
+            let reattach_request = ReattachExecuteRequest {
+                session_id: self.session_id.clone(),
+                user_context: Some(UserContext {
+                    user_id: "spark-connect-rs".to_string(),
+                    user_name: "spark-connect-rs".to_string(),
+                    extensions: vec![],
+                }),
+                operation_id: operation_id.clone(),
+                client_type: Some("spark-connect-rs".to_string()),
+                last_response_id: last_response_id.clone(),
+                client_observed_server_side_session_id: None,
+            };
+
+            let (complete, resp_id) = self
+                .process_reattach_stream(reattach_request, &mut batches)
+                .await?;
+            result_complete = complete;
+            last_response_id = resp_id;
         }
 
         let elapsed_ms = start.elapsed().as_millis();
         let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
         info!(
-            "SQL executed in {}ms, {} rows returned",
+            "Reattachable SQL executed in {}ms, {} rows returned",
             elapsed_ms, row_count
         );
 
@@ -277,6 +392,74 @@ impl SparkConnectClient {
         Ok(batches)
     }
 
+    /// Process ExecutePlan response stream, returns (result_complete, last_response_id).
+    async fn process_execute_stream(
+        &self,
+        request: ExecutePlanRequest,
+        batches: &mut Vec<RecordBatch>,
+    ) -> Result<(bool, Option<String>), SparkConnectError> {
+        let response = match &self.inner {
+            ClientInner::Direct(c) => c.clone().execute_plan(request).await?,
+            ClientInner::Proxy(c) => c.clone().execute_plan(request).await?,
+        };
+        self.process_response_stream(response.into_inner(), batches)
+            .await
+    }
+
+    /// Process ReattachExecute response stream, returns (result_complete, last_response_id).
+    async fn process_reattach_stream(
+        &self,
+        request: ReattachExecuteRequest,
+        batches: &mut Vec<RecordBatch>,
+    ) -> Result<(bool, Option<String>), SparkConnectError> {
+        let response = match &self.inner {
+            ClientInner::Direct(c) => c.clone().reattach_execute(request).await?,
+            ClientInner::Proxy(c) => c.clone().reattach_execute(request).await?,
+        };
+        self.process_response_stream(response.into_inner(), batches)
+            .await
+    }
+
+    /// Process a response stream, collecting Arrow record batches.
+    /// Returns (result_complete, last_response_id).
+    async fn process_response_stream(
+        &self,
+        mut stream: tonic::Streaming<crate::proto::ExecutePlanResponse>,
+        batches: &mut Vec<RecordBatch>,
+    ) -> Result<(bool, Option<String>), SparkConnectError> {
+        let mut result_complete = false;
+        let mut last_response_id: Option<String> = None;
+        let mut batch_count = 0;
+        let mut total_rows = 0i64;
+
+        while let Some(resp) = stream.message().await? {
+            last_response_id = Some(resp.response_id.clone());
+
+            if let Some(response_type) = resp.response_type {
+                match response_type {
+                    ResponseType::ArrowBatch(batch) => {
+                        batch_count += 1;
+                        total_rows += batch.row_count;
+                        // Each ArrowBatch is a complete IPC stream - parse it separately
+                        let parsed = parse_arrow_ipc(&batch.data)?;
+                        batches.extend(parsed);
+                    }
+                    ResponseType::ResultComplete(_) => {
+                        info!("Received ResultComplete after {} batches, {} rows", batch_count, total_rows);
+                        result_complete = true;
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        info!(
+            "Stream ended: {} batches, {} rows, result_complete={}",
+            batch_count, total_rows, result_complete
+        );
+        Ok((result_complete, last_response_id))
+    }
+
     /// Execute SQL and return a single row as JSON Value.
     pub async fn sql_single_row(
         &self,
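The reattachable path above is new in 0.3.5. For orientation, a minimal, hypothetical caller might look like the sketch below; it is not part of the package, it assumes the surrounding module already has SparkConnectClient, SparkConnectError and the Arrow RecordBatch type in scope, and the query, table name and limit are purely illustrative.

    // Illustrative sketch only - names outside this diff are assumptions.
    async fn fetch_large_result(
        client: &SparkConnectClient,
    ) -> Result<usize, SparkConnectError> {
        // sql_reattachable keeps reattaching to the operation until the server
        // reports ResultComplete, so large results are not silently truncated.
        let batches = client
            .sql_reattachable("SELECT * FROM some_catalog.some_schema.some_table", 100_000)
            .await?;
        // num_rows() comes from the Arrow RecordBatch values the client returns.
        Ok(batches.iter().map(|b| b.num_rows()).sum())
    }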
mangleframes-0.3.5/viewer/src/benchmark.rs
@@ -0,0 +1,242 @@
+//! Benchmark harness for measuring Spark Connect client performance.
+//!
+//! Measures query execution time across different row counts to compare
+//! against alternative approaches (Python subprocess, HTTP service).
+
+use std::time::Instant;
+
+use serde::{Deserialize, Serialize};
+use tracing::info;
+
+use crate::spark_client::DatabricksClient;
+use crate::sql_builder;
+
+/// Single benchmark iteration result.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct IterationResult {
+    pub iteration: usize,
+    pub rows_fetched: usize,
+    pub total_ms: u64,
+    pub rows_per_sec: f64,
+}
+
+/// Aggregated results for a single row count.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RowCountResult {
+    pub row_count: usize,
+    pub iterations: Vec<IterationResult>,
+    pub avg_ms: f64,
+    pub min_ms: u64,
+    pub max_ms: u64,
+    pub p50_ms: u64,
+    pub p95_ms: u64,
+    pub avg_rows_per_sec: f64,
+}
+
+/// Full benchmark suite results.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct BenchmarkResults {
+    pub scenario: String,
+    pub table_name: String,
+    pub timestamp: String,
+    pub row_counts: Vec<RowCountResult>,
+}
+
+/// Configuration for benchmark run.
+#[derive(Debug, Clone)]
+pub struct BenchmarkConfig {
+    pub table_name: String,
+    pub row_counts: Vec<usize>,
+    pub iterations: usize,
+    pub warmup_iterations: usize,
+}
+
+impl Default for BenchmarkConfig {
+    fn default() -> Self {
+        Self {
+            table_name: String::new(),
+            row_counts: vec![1000, 10_000, 100_000],
+            iterations: 10,
+            warmup_iterations: 2,
+        }
+    }
+}
+
+/// Run benchmark suite against Databricks via Rust Spark Connect client.
+pub async fn run_benchmark(
+    client: &DatabricksClient,
+    config: &BenchmarkConfig,
+) -> Result<BenchmarkResults, String> {
+    info!(
+        "Starting Rust client benchmark: table={}, row_counts={:?}, iterations={}",
+        config.table_name, config.row_counts, config.iterations
+    );
+
+    let mut row_count_results = Vec::with_capacity(config.row_counts.len());
+
+    for &row_count in &config.row_counts {
+        info!("Benchmarking {} rows...", row_count);
+
+        // Warmup runs (not counted)
+        for i in 0..config.warmup_iterations {
+            info!(" Warmup {}/{}", i + 1, config.warmup_iterations);
+            let sql = sql_builder::select_data_sql(&config.table_name, row_count, 0);
+            if let Err(e) = client.execute_sql_reattachable(&sql, row_count).await {
+                return Err(format!("Warmup query failed: {}", e));
+            }
+        }
+
+        // Actual benchmark iterations (use reattachable for >10K rows)
+        let mut iterations = Vec::with_capacity(config.iterations);
+
+        for i in 0..config.iterations {
+            let sql = sql_builder::select_data_sql(&config.table_name, row_count, 0);
+
+            let start = Instant::now();
+            let response = client
+                .execute_sql_reattachable(&sql, row_count)
+                .await
+                .map_err(|e| format!("Query failed: {}", e))?;
+            let total_ms = start.elapsed().as_millis() as u64;
+
+            let rows_fetched = response.row_count as usize;
+            let rows_per_sec = if total_ms > 0 {
+                (rows_fetched as f64) / (total_ms as f64 / 1000.0)
+            } else {
+                0.0
+            };
+
+            info!(
+                " Iteration {}/{}: {}ms, {} rows, {:.0} rows/sec",
+                i + 1,
+                config.iterations,
+                total_ms,
+                rows_fetched,
+                rows_per_sec
+            );
+
+            iterations.push(IterationResult {
+                iteration: i + 1,
+                rows_fetched,
+                total_ms,
+                rows_per_sec,
+            });
+        }
+
+        let result = aggregate_iterations(&iterations, row_count);
+        info!(
+            " Summary: avg={}ms, p50={}ms, p95={}ms, {:.0} rows/sec",
+            result.avg_ms as u64, result.p50_ms, result.p95_ms, result.avg_rows_per_sec
+        );
+
+        row_count_results.push(result);
+    }
+
+    Ok(BenchmarkResults {
+        scenario: "rust_direct".to_string(),
+        table_name: config.table_name.clone(),
+        timestamp: chrono::Utc::now().to_rfc3339(),
+        row_counts: row_count_results,
+    })
+}
+
+fn aggregate_iterations(iterations: &[IterationResult], row_count: usize) -> RowCountResult {
+    let mut timings: Vec<u64> = iterations.iter().map(|i| i.total_ms).collect();
+    timings.sort_unstable();
+
+    let sum_ms: u64 = timings.iter().sum();
+    let avg_ms = sum_ms as f64 / timings.len() as f64;
+    let min_ms = *timings.first().unwrap_or(&0);
+    let max_ms = *timings.last().unwrap_or(&0);
+    let p50_ms = percentile(&timings, 50);
+    let p95_ms = percentile(&timings, 95);
+
+    // Use actual rows_fetched, not the requested row_count
+    let total_rows_fetched: usize = iterations.iter().map(|i| i.rows_fetched).sum();
+    let avg_rows_fetched = total_rows_fetched as f64 / iterations.len() as f64;
+
+    let avg_rows_per_sec = if avg_ms > 0.0 {
+        avg_rows_fetched / (avg_ms / 1000.0)
+    } else {
+        0.0
+    };
+
+    RowCountResult {
+        row_count,
+        iterations: iterations.to_vec(),
+        avg_ms,
+        min_ms,
+        max_ms,
+        p50_ms,
+        p95_ms,
+        avg_rows_per_sec,
+    }
+}
+
+fn percentile(sorted: &[u64], p: usize) -> u64 {
+    if sorted.is_empty() {
+        return 0;
+    }
+    let idx = (p * sorted.len() / 100).min(sorted.len() - 1);
+    sorted[idx]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_percentile_empty() {
+        assert_eq!(percentile(&[], 50), 0);
+    }
+
+    #[test]
+    fn test_percentile_single() {
+        assert_eq!(percentile(&[100], 50), 100);
+    }
+
+    #[test]
+    fn test_percentile_multiple() {
+        let sorted: Vec<u64> = (1..=100).collect();
+        assert!(percentile(&sorted, 50) >= 50);
+        assert!(percentile(&sorted, 95) >= 95);
+    }
+
+    #[test]
+    fn test_aggregate_iterations() {
+        let iterations = vec![
+            IterationResult {
+                iteration: 1,
+                rows_fetched: 1000,
+                total_ms: 100,
+                rows_per_sec: 10000.0,
+            },
+            IterationResult {
+                iteration: 2,
+                rows_fetched: 1000,
+                total_ms: 120,
+                rows_per_sec: 8333.0,
+            },
+            IterationResult {
+                iteration: 3,
+                rows_fetched: 1000,
+                total_ms: 110,
+                rows_per_sec: 9090.0,
+            },
+        ];
+
+        let result = aggregate_iterations(&iterations, 1000);
+        assert_eq!(result.row_count, 1000);
+        assert_eq!(result.min_ms, 100);
+        assert_eq!(result.max_ms, 120);
+        assert!((result.avg_ms - 110.0).abs() < 0.1);
+    }
+
+    #[test]
+    fn test_benchmark_config_default() {
+        let config = BenchmarkConfig::default();
+        assert_eq!(config.row_counts, vec![1000, 10_000, 100_000]);
+        assert_eq!(config.iterations, 10);
+        assert_eq!(config.warmup_iterations, 2);
+    }
+}
mangleframes-0.3.5/viewer/src/main.rs
@@ -0,0 +1,194 @@
+//! MangleFrames Viewer - Web-based PySpark DataFrame viewer.
+
+mod alert_handlers;
+mod arrow_reader;
+mod benchmark;
+mod dashboard;
+mod export;
+mod handlers;
+mod history_analysis;
+mod history_handlers;
+mod join_handlers;
+mod perf;
+mod reconcile_handlers;
+mod spark_client;
+mod sql_builder;
+mod stats;
+#[cfg(test)]
+mod test_helpers;
+mod web_server;
+mod websocket;
+
+use std::sync::Arc;
+
+use clap::{Parser, Subcommand};
+use tracing::info;
+use tracing_subscriber::EnvFilter;
+
+use crate::web_server::AppState;
+
+#[derive(Parser)]
+#[command(name = "mangleframes-viewer")]
+#[command(about = "Web-based DataFrame viewer via Spark Connect")]
+struct Args {
+    /// Web server port
+    #[arg(short, long, default_value = "8765", global = true)]
+    port: u16,
+
+    /// Connect via Spark Connect proxy (e.g., sc://localhost:15002)
+    #[arg(long, global = true)]
+    proxy_url: Option<String>,
+
+    /// Databricks workspace host (not needed when using --proxy-url)
+    #[arg(long, env = "DATABRICKS_HOST", global = true)]
+    databricks_host: Option<String>,
+
+    /// Databricks personal access token (not needed when using --proxy-url)
+    #[arg(long, env = "DATABRICKS_TOKEN", global = true)]
+    databricks_token: Option<String>,
+
+    /// Databricks cluster ID (for cluster mode)
+    #[arg(long, env = "DATABRICKS_CLUSTER_ID", global = true)]
+    databricks_cluster_id: Option<String>,
+
+    /// Use Databricks serverless compute (no cluster ID needed)
+    #[arg(long, global = true)]
+    serverless: bool,
+
+    #[command(subcommand)]
+    command: Option<Command>,
+}
+
+#[derive(Subcommand)]
+enum Command {
+    /// Run performance benchmark against Databricks
+    Benchmark {
+        /// Table name to benchmark (e.g., catalog.schema.table)
+        #[arg(short, long)]
+        table: String,
+
+        /// Row counts to test (comma-separated)
+        #[arg(short, long, default_value = "1000,10000,100000")]
+        row_counts: String,
+
+        /// Number of iterations per row count
+        #[arg(short, long, default_value = "10")]
+        iterations: usize,
+
+        /// Number of warmup iterations (not counted)
+        #[arg(short, long, default_value = "2")]
+        warmup: usize,
+
+        /// Output file for JSON results
+        #[arg(short, long)]
+        output: Option<String>,
+    },
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    tracing_subscriber::fmt()
+        .with_env_filter(EnvFilter::from_default_env())
+        .init();
+
+    let args = Args::parse();
+
+    let client = Arc::new(spark_client::DatabricksClient::new());
+
+    // Connect via proxy or directly to Databricks
+    if let Some(ref proxy_url) = args.proxy_url {
+        info!("Connecting via Spark Connect proxy at {}", proxy_url);
+        client
+            .connect_via_proxy(proxy_url)
+            .await
+            .map_err(|e| anyhow::anyhow!("Proxy connection failed: {}", e))?;
+    } else {
+        // Direct Databricks connection requires host and token
+        let host = args
+            .databricks_host
+            .as_deref()
+            .ok_or_else(|| anyhow::anyhow!("DATABRICKS_HOST is required"))?;
+        let token = args
+            .databricks_token
+            .as_deref()
+            .ok_or_else(|| anyhow::anyhow!("DATABRICKS_TOKEN is required"))?;
+
+        // Determine cluster_id: None for serverless, Some for cluster mode
+        let cluster_id = if args.serverless {
+            info!("Initializing Databricks serverless mode");
+            None
+        } else if let Some(ref id) = args.databricks_cluster_id {
+            info!("Initializing Databricks cluster mode (cluster: {})", id);
+            Some(id.as_str())
+        } else {
+            info!("Initializing Databricks serverless mode (default)");
+            None
+        };
+
+        client
+            .connect(host, token, cluster_id)
+            .await
+            .map_err(|e| anyhow::anyhow!("Databricks connection failed: {}", e))?;
+    }
+
+    // Handle subcommand or default to web server
+    match args.command {
+        Some(Command::Benchmark {
+            table,
+            row_counts,
+            iterations,
+            warmup,
+            output,
+        }) => {
+            run_benchmark_command(&client, &table, &row_counts, iterations, warmup, output).await
+        }
+        None => {
+            let state = AppState::new(Some(client));
+            info!("Starting web server on http://localhost:{}", args.port);
+            info!("Open this URL in your browser to view the UI");
+            web_server::run(state, args.port).await
+        }
+    }
+}
+
+async fn run_benchmark_command(
+    client: &spark_client::DatabricksClient,
+    table: &str,
+    row_counts_str: &str,
+    iterations: usize,
+    warmup: usize,
+    output: Option<String>,
+) -> anyhow::Result<()> {
+    let row_counts: Vec<usize> = row_counts_str
+        .split(',')
+        .filter_map(|s| s.trim().parse().ok())
+        .collect();
+
+    if row_counts.is_empty() {
+        return Err(anyhow::anyhow!("No valid row counts provided"));
+    }
+
+    info!("Running benchmark: table={}, row_counts={:?}", table, row_counts);
+
+    let config = benchmark::BenchmarkConfig {
+        table_name: table.to_string(),
+        row_counts,
+        iterations,
+        warmup_iterations: warmup,
+    };
+
+    let results = benchmark::run_benchmark(client, &config)
+        .await
+        .map_err(|e| anyhow::anyhow!("Benchmark failed: {}", e))?;
+
+    let json = serde_json::to_string_pretty(&results)?;
+
+    if let Some(path) = output {
+        std::fs::write(&path, &json)?;
+        info!("Results written to {}", path);
+    } else {
+        println!("{}", json);
+    }
+
+    Ok(())
+}
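For orientation, the new subcommand would be invoked roughly as mangleframes-viewer benchmark --table catalog.schema.table --row-counts 1000,10000,100000 --iterations 10 --output results.json; this invocation is inferred from the clap definitions above rather than taken from the package, and the existing connection flags or DATABRICKS_* environment variables still apply.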
{mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/spark_client.rs
@@ -103,6 +103,36 @@ impl DatabricksClient {
         })
     }
 
+    /// Execute SQL with reattachable execution for large result sets (>10K rows).
+    /// Uses ReattachExecute RPC to ensure complete results.
+    pub async fn execute_sql_reattachable(
+        &self,
+        query: &str,
+        limit: usize,
+    ) -> Result<SqlResponse, SparkConnectError> {
+        let guard = self.client.read().await;
+        let client = guard
+            .as_ref()
+            .ok_or_else(|| SparkConnectError::Config("Not connected".to_string()))?;
+
+        let start = Instant::now();
+        let batches = client.sql_reattachable(query, limit as u32).await?;
+        let execution_ms = start.elapsed().as_millis() as u64;
+
+        let row_count: u64 = batches.iter().map(|b| b.num_rows() as u64).sum();
+
+        info!(
+            "Reattachable SQL executed via Spark Connect in {}ms, {} rows",
+            execution_ms, row_count
+        );
+
+        Ok(SqlResponse {
+            batches,
+            row_count,
+            execution_ms,
+        })
+    }
+
     /// Register Arrow batches as a temporary view in Spark.
     pub async fn create_temp_view(
         &self,
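A hypothetical caller of the viewer-side wrapper, for orientation (not part of the package: it assumes an already-connected DatabricksClient, and the query and limit are illustrative; execute_sql_reattachable and the SqlResponse fields come from this diff):

    // Illustrative sketch only - names outside this diff are assumptions.
    async fn print_query_timing(client: &DatabricksClient) -> Result<(), SparkConnectError> {
        let response = client
            .execute_sql_reattachable("SELECT * FROM example_table", 50_000)
            .await?;
        println!(
            "{} rows in {} ms across {} Arrow batches",
            response.row_count,
            response.execution_ms,
            response.batches.len()
        );
        Ok(())
    }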
{mangleframes-0.3.4 → mangleframes-0.3.5}/viewer/src/sql_builder.rs
@@ -3,8 +3,12 @@
 use serde_json::Value;
 
 /// Quote an identifier to prevent SQL injection.
+/// Handles multi-part names (catalog.schema.table) by quoting each part separately.
 pub fn quote_identifier(name: &str) -> String {
-    format!("`{}`", name.replace('`', "``"))
+    name.split('.')
+        .map(|part| format!("`{}`", part.replace('`', "``")))
+        .collect::<Vec<_>>()
+        .join(".")
 }
 
 /// Quote multiple identifiers and join with commas.
@@ -1279,7 +1283,18 @@ mod tests {
     #[test]
     fn test_quote_identifier_special_chars() {
         assert_eq!(quote_identifier("col-name"), "`col-name`");
-        assert_eq!(quote_identifier("col.name"), "`col.name`");
+    }
+
+    #[test]
+    fn test_quote_identifier_multipart_names() {
+        assert_eq!(quote_identifier("catalog.schema.table"), "`catalog`.`schema`.`table`");
+        assert_eq!(quote_identifier("schema.table"), "`schema`.`table`");
+        assert_eq!(quote_identifier("simple"), "`simple`");
+    }
+
+    #[test]
+    fn test_quote_identifier_multipart_with_backticks() {
+        assert_eq!(quote_identifier("cat`alog.sch`ema.tab`le"), "`cat``alog`.`sch``ema`.`tab``le`");
     }
 
     // ============ quote_identifiers tests ============
@@ -1326,7 +1341,7 @@
     #[test]
     fn test_describe_table_sql_qualified_name() {
         let sql = describe_table_sql("catalog.schema.table");
-        assert_eq!(sql, "DESCRIBE TABLE `catalog.schema.table`");
+        assert_eq!(sql, "DESCRIBE TABLE `catalog`.`schema`.`table`");
     }
 
     // ============ select_data_sql tests ============
mangleframes-0.3.4/viewer/src/main.rs
@@ -1,109 +0,0 @@
-//! MangleFrames Viewer - Web-based PySpark DataFrame viewer.
-
-mod alert_handlers;
-mod arrow_reader;
-mod dashboard;
-mod export;
-mod handlers;
-mod history_analysis;
-mod history_handlers;
-mod join_handlers;
-mod perf;
-mod reconcile_handlers;
-mod spark_client;
-mod sql_builder;
-mod stats;
-#[cfg(test)]
-mod test_helpers;
-mod web_server;
-mod websocket;
-
-use std::sync::Arc;
-
-use clap::Parser;
-use tracing::info;
-use tracing_subscriber::EnvFilter;
-
-use crate::web_server::AppState;
-
-#[derive(Parser)]
-#[command(name = "mangleframes-viewer")]
-#[command(about = "Web-based DataFrame viewer via Spark Connect")]
-struct Args {
-    /// Web server port
-    #[arg(short, long, default_value = "8765")]
-    port: u16,
-
-    /// Connect via Spark Connect proxy (e.g., sc://localhost:15002)
-    #[arg(long)]
-    proxy_url: Option<String>,
-
-    /// Databricks workspace host (not needed when using --proxy-url)
-    #[arg(long, env = "DATABRICKS_HOST")]
-    databricks_host: Option<String>,
-
-    /// Databricks personal access token (not needed when using --proxy-url)
-    #[arg(long, env = "DATABRICKS_TOKEN")]
-    databricks_token: Option<String>,
-
-    /// Databricks cluster ID (for cluster mode)
-    #[arg(long, env = "DATABRICKS_CLUSTER_ID")]
-    databricks_cluster_id: Option<String>,
-
-    /// Use Databricks serverless compute (no cluster ID needed)
-    #[arg(long)]
-    serverless: bool,
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    tracing_subscriber::fmt()
-        .with_env_filter(EnvFilter::from_default_env())
-        .init();
-
-    let args = Args::parse();
-
-    let client = Arc::new(spark_client::DatabricksClient::new());
-
-    // Connect via proxy or directly to Databricks
-    if let Some(ref proxy_url) = args.proxy_url {
-        info!("Connecting via Spark Connect proxy at {}", proxy_url);
-        client
-            .connect_via_proxy(proxy_url)
-            .await
-            .map_err(|e| anyhow::anyhow!("Proxy connection failed: {}", e))?;
-    } else {
-        // Direct Databricks connection requires host and token
-        let host = args
-            .databricks_host
-            .as_deref()
-            .ok_or_else(|| anyhow::anyhow!("DATABRICKS_HOST is required"))?;
-        let token = args
-            .databricks_token
-            .as_deref()
-            .ok_or_else(|| anyhow::anyhow!("DATABRICKS_TOKEN is required"))?;
-
-        // Determine cluster_id: None for serverless, Some for cluster mode
-        let cluster_id = if args.serverless {
-            info!("Initializing Databricks serverless mode");
-            None
-        } else if let Some(ref id) = args.databricks_cluster_id {
-            info!("Initializing Databricks cluster mode (cluster: {})", id);
-            Some(id.as_str())
-        } else {
-            info!("Initializing Databricks serverless mode (default)");
-            None
-        };
-
-        client
-            .connect(host, token, cluster_id)
-            .await
-            .map_err(|e| anyhow::anyhow!("Databricks connection failed: {}", e))?;
-    }
-
-    let state = AppState::new(Some(client));
-
-    info!("Starting web server on http://localhost:{}", args.port);
-    info!("Open this URL in your browser to view the UI");
-    web_server::run(state, args.port).await
-}