duckrun 0.2.19.dev6__tar.gz → 0.2.19.dev7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/rle.py +15 -9
  3. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/PKG-INFO +1 -1
  4. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/pyproject.toml +1 -1
  5. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/tests/test_rle.py +2 -2
  6. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/LICENSE +0 -0
  7. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/README.md +0 -0
  8. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/__init__.py +0 -0
  9. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/auth.py +0 -0
  10. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/core.py +0 -0
  11. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/files.py +0 -0
  12. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/lakehouse.py +0 -0
  13. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/notebook.py +0 -0
  14. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/runner.py +0 -0
  15. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/semantic_model.py +0 -0
  16. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/stats.py +0 -0
  17. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/writer.py +0 -0
  18. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/SOURCES.txt +0 -0
  19. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev6
3
+ Version: 0.2.19.dev7
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -523,7 +523,15 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
523
523
 
524
524
  # Aggregate parquet stats per column if available
525
525
  parquet_by_column = {}
526
+ avg_row_group_size = None # Calculate once for the table
527
+
526
528
  if parquet_stats is not None and not parquet_stats.empty:
529
+ # Calculate average row group size once (same for all columns)
530
+ if 'row_group_num_rows' in parquet_stats.columns:
531
+ # Get unique row groups to avoid counting duplicates (one entry per column per row group)
532
+ unique_rg_sizes = parquet_stats.drop_duplicates(subset=['row_group_id'])['row_group_num_rows']
533
+ avg_row_group_size = int(unique_rg_sizes.mean())
534
+
527
535
  # Determine column name field - can be 'name' or 'path_in_schema'
528
536
  col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
529
537
 
@@ -533,8 +541,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
533
541
 
534
542
  if not col_stats.empty:
535
543
  # Aggregate stats across all row groups for this column
536
- total_compressed = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
537
- total_uncompressed = col_stats['total_uncompressed_size'].sum() if 'total_uncompressed_size' in col_stats.columns else None
544
+ total_compressed_bytes = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
545
+ total_compressed_mb = round(total_compressed_bytes / (1024 * 1024), 2) if total_compressed_bytes else None
538
546
  # Handle both 'encodings' (multiple) and 'encoding' (single) column names
539
547
  if 'encodings' in col_stats.columns:
540
548
  encodings = col_stats['encodings'].unique().tolist()
@@ -548,8 +556,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
548
556
  parquet_by_column[col_name] = {
549
557
  'encoding': ', '.join([str(e) for e in encodings if e is not None]),
550
558
  'compression': ', '.join([str(c) for c in compressions if c is not None]),
551
- 'total_compressed_size': total_compressed,
552
- 'total_uncompressed_size': total_uncompressed,
559
+ 'total_compressed_size_mb': total_compressed_mb,
553
560
  'num_row_groups': num_row_groups
554
561
  }
555
562
 
@@ -567,7 +574,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
567
574
  'schema': schema_val,
568
575
  'table': table_val,
569
576
  'sort_type': sort_order,
570
- 'column': col,
577
+ 'name': col,
571
578
  'order': None,
572
579
  'RLE': rle_value,
573
580
  'NDV': ndv_value,
@@ -575,9 +582,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
575
582
  'total_RLE': total_rle_all_val,
576
583
  'encoding': col_parquet.get('encoding', ''),
577
584
  'compression': col_parquet.get('compression', ''),
578
- 'total_compressed_size': col_parquet.get('total_compressed_size', None),
579
- 'total_uncompressed_size': col_parquet.get('total_uncompressed_size', None),
580
- 'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
585
+ 'column_size_mb': col_parquet.get('total_compressed_size_mb', None),
586
+ 'avg_row_group_size': avg_row_group_size,
581
587
  'table_size_mb': table_size_mb,
582
588
  'vorder': vorder_status,
583
589
  'comments': ''
@@ -903,7 +909,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
903
909
  'schema': schema_val,
904
910
  'table': table_val,
905
911
  'sort_type': sort_order,
906
- 'column': col,
912
+ 'name': col,
907
913
  'order': order_position,
908
914
  'RLE': rle_value,
909
915
  'NDV': ndv_value,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev6
3
+ Version: 0.2.19.dev7
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev6"
7
+ version = "0.2.19.dev7"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7
7
  import duckrun
8
8
 
9
9
  # Analyze multiple schemas/tables
10
- conn = duckrun.connect("tmp/data.lakehouse")
10
+ conn = duckrun.connect("tmp/data.lakehouse/spark_vorder")
11
11
 
12
12
  # Analyze tables - now returns long format automatically
13
- result = conn.rle("aemo.calendar")
13
+ result = conn.rle("summary")
14
14
  print(result)
15
15
  conn.close()
16
16
 
File without changes
File without changes
File without changes