duckrun 0.2.19.dev5__tar.gz → 0.2.19.dev7__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (22):
  1. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/core.py +2 -1
  3. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/rle.py +83 -3
  4. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/PKG-INFO +1 -1
  5. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/pyproject.toml +1 -1
  6. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/tests/test_rle.py +2 -2
  7. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/LICENSE +0 -0
  8. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/README.md +0 -0
  9. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/__init__.py +0 -0
  10. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/auth.py +0 -0
  11. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/files.py +0 -0
  12. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/lakehouse.py +0 -0
  13. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/notebook.py +0 -0
  14. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/runner.py +0 -0
  15. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/semantic_model.py +0 -0
  16. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/stats.py +0 -0
  17. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/writer.py +0 -0
  18. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/SOURCES.txt +0 -0
  19. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev5
3
+ Version: 0.2.19.dev7
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -1425,7 +1425,8 @@ class Duckrun(WorkspaceOperationsMixin):
1425
1425
  max_cardinality_pct=max_cardinality_pct,
1426
1426
  max_ordering_depth=max_ordering_depth,
1427
1427
  schema_name=schema_name,
1428
- table_display_name=tbl
1428
+ table_display_name=tbl,
1429
+ duckrun_instance=self # Pass duckrun instance for detailed parquet stats
1429
1430
  )
1430
1431
 
1431
1432
  def close(self):
@@ -1,6 +1,7 @@
1
1
  import itertools
2
2
  from typing import List, Dict, Tuple, Optional
3
3
  import pandas as pd
4
+ from .stats import get_stats
4
5
 
5
6
  def analyze_parquet_row_groups(con, parquet_path: str) -> pd.DataFrame:
6
7
  """
@@ -392,7 +393,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
392
393
  max_cardinality_pct: float = 0.01,
393
394
  max_ordering_depth: int = 3,
394
395
  schema_name: str = None,
395
- table_display_name: str = None) -> pd.DataFrame:
396
+ table_display_name: str = None,
397
+ duckrun_instance = None) -> pd.DataFrame:
396
398
  """
397
399
  Test column orderings for RLE optimization.
398
400
 
@@ -412,6 +414,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
412
414
  max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
413
415
  schema_name: Optional schema name to include in results (default: None)
414
416
  table_display_name: Optional table name to include in results (default: None)
417
+ duckrun_instance: Optional Duckrun instance to fetch detailed parquet stats (default: None)
415
418
 
416
419
  Returns:
417
420
  DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
@@ -472,6 +475,35 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
472
475
  print(f"✓ Analysis complete!")
473
476
  print(f"{'='*60}")
474
477
 
478
+ # Get detailed parquet stats if duckrun_instance is provided
479
+ parquet_stats = None
480
+ vorder_status = False
481
+ table_size_mb = None
482
+ if duckrun_instance and table_display_name:
483
+ print("\nFetching detailed parquet metadata...")
484
+ try:
485
+ # For single-schema connections, just use the table name
486
+ # For multi-schema connections, use schema.table format
487
+ if hasattr(duckrun_instance, 'scan_all_schemas') and duckrun_instance.scan_all_schemas and schema_name:
488
+ source_param = f"{schema_name}.{table_display_name}"
489
+ else:
490
+ source_param = table_display_name
491
+
492
+ parquet_stats = get_stats(duckrun_instance, source=source_param, detailed=True)
493
+ print(f"✓ Retrieved parquet metadata for {len(parquet_stats)} row groups/columns")
494
+
495
+ # Get vorder status from the stats if available
496
+ if 'vorder' in parquet_stats.columns:
497
+ vorder_status = parquet_stats['vorder'].iloc[0] if len(parquet_stats) > 0 else False
498
+
499
+ # Calculate total table size from compressed sizes
500
+ if 'total_compressed_size' in parquet_stats.columns:
501
+ total_bytes = parquet_stats['total_compressed_size'].sum()
502
+ table_size_mb = round(total_bytes / (1024 * 1024), 2) if total_bytes else None
503
+ except Exception as e:
504
+ print(f"⚠️ Could not fetch parquet stats: {e}")
505
+ parquet_stats = None
506
+
475
507
  # Transform to long format
476
508
  long_format_results = []
477
509
 
@@ -489,6 +521,45 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
489
521
  # Get total rows and NDV from card_stats if available
490
522
  total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
491
523
 
524
+ # Aggregate parquet stats per column if available
525
+ parquet_by_column = {}
526
+ avg_row_group_size = None # Calculate once for the table
527
+
528
+ if parquet_stats is not None and not parquet_stats.empty:
529
+ # Calculate average row group size once (same for all columns)
530
+ if 'row_group_num_rows' in parquet_stats.columns:
531
+ # Get unique row groups to avoid counting duplicates (one entry per column per row group)
532
+ unique_rg_sizes = parquet_stats.drop_duplicates(subset=['row_group_id'])['row_group_num_rows']
533
+ avg_row_group_size = int(unique_rg_sizes.mean())
534
+
535
+ # Determine column name field - can be 'name' or 'path_in_schema'
536
+ col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
537
+
538
+ # Group by column name and aggregate
539
+ for col_name in data_columns:
540
+ col_stats = parquet_stats[parquet_stats[col_name_field] == col_name] if col_name_field in parquet_stats.columns else pd.DataFrame()
541
+
542
+ if not col_stats.empty:
543
+ # Aggregate stats across all row groups for this column
544
+ total_compressed_bytes = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
545
+ total_compressed_mb = round(total_compressed_bytes / (1024 * 1024), 2) if total_compressed_bytes else None
546
+ # Handle both 'encodings' (multiple) and 'encoding' (single) column names
547
+ if 'encodings' in col_stats.columns:
548
+ encodings = col_stats['encodings'].unique().tolist()
549
+ elif 'encoding' in col_stats.columns:
550
+ encodings = col_stats['encoding'].unique().tolist()
551
+ else:
552
+ encodings = []
553
+ compressions = col_stats['compression'].unique().tolist() if 'compression' in col_stats.columns else []
554
+ num_row_groups = col_stats['row_group_id'].nunique() if 'row_group_id' in col_stats.columns else len(col_stats)
555
+
556
+ parquet_by_column[col_name] = {
557
+ 'encoding': ', '.join([str(e) for e in encodings if e is not None]),
558
+ 'compression': ', '.join([str(c) for c in compressions if c is not None]),
559
+ 'total_compressed_size_mb': total_compressed_mb,
560
+ 'num_row_groups': num_row_groups
561
+ }
562
+
492
563
  # Create one row per data column
493
564
  for col in data_columns:
494
565
  rle_value = row[col]
@@ -496,16 +567,25 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
496
567
  # Get NDV from card_stats
497
568
  ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
498
569
 
570
+ # Get parquet stats for this column
571
+ col_parquet = parquet_by_column.get(col, {})
572
+
499
573
  long_format_results.append({
500
574
  'schema': schema_val,
501
575
  'table': table_val,
502
576
  'sort_type': sort_order,
503
- 'column': col,
577
+ 'name': col,
504
578
  'order': None,
505
579
  'RLE': rle_value,
506
580
  'NDV': ndv_value,
507
581
  'total_rows': total_rows,
508
582
  'total_RLE': total_rle_all_val,
583
+ 'encoding': col_parquet.get('encoding', ''),
584
+ 'compression': col_parquet.get('compression', ''),
585
+ 'column_size_mb': col_parquet.get('total_compressed_size_mb', None),
586
+ 'avg_row_group_size': avg_row_group_size,
587
+ 'table_size_mb': table_size_mb,
588
+ 'vorder': vorder_status,
509
589
  'comments': ''
510
590
  })
511
591
 
@@ -829,7 +909,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
829
909
  'schema': schema_val,
830
910
  'table': table_val,
831
911
  'sort_type': sort_order,
832
- 'column': col,
912
+ 'name': col,
833
913
  'order': order_position,
834
914
  'RLE': rle_value,
835
915
  'NDV': ndv_value,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev5
3
+ Version: 0.2.19.dev7
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev5"
7
+ version = "0.2.19.dev7"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7
7
  import duckrun
8
8
 
9
9
  # Analyze multiple schemas/tables
10
- conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
10
+ conn = duckrun.connect("tmp/data.lakehouse/spark_vorder")
11
11
 
12
12
  # Analyze tables - now returns long format automatically
13
- result = conn.rle("calendar",'auto')
13
+ result = conn.rle("summary")
14
14
  print(result)
15
15
  conn.close()
16
16
 
File without changes
File without changes
File without changes