duckrun 0.2.19.dev6__tar.gz → 0.2.19.dev7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/rle.py +15 -9
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/pyproject.toml +1 -1
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/tests/test_rle.py +2 -2
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/LICENSE +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/README.md +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/core.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/setup.cfg +0 -0
|
@@ -523,7 +523,15 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
523
523
|
|
|
524
524
|
# Aggregate parquet stats per column if available
|
|
525
525
|
parquet_by_column = {}
|
|
526
|
+
avg_row_group_size = None # Calculate once for the table
|
|
527
|
+
|
|
526
528
|
if parquet_stats is not None and not parquet_stats.empty:
|
|
529
|
+
# Calculate average row group size once (same for all columns)
|
|
530
|
+
if 'row_group_num_rows' in parquet_stats.columns:
|
|
531
|
+
# Get unique row groups to avoid counting duplicates (one entry per column per row group)
|
|
532
|
+
unique_rg_sizes = parquet_stats.drop_duplicates(subset=['row_group_id'])['row_group_num_rows']
|
|
533
|
+
avg_row_group_size = int(unique_rg_sizes.mean())
|
|
534
|
+
|
|
527
535
|
# Determine column name field - can be 'name' or 'path_in_schema'
|
|
528
536
|
col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
|
|
529
537
|
|
|
@@ -533,8 +541,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
533
541
|
|
|
534
542
|
if not col_stats.empty:
|
|
535
543
|
# Aggregate stats across all row groups for this column
|
|
536
|
-
|
|
537
|
-
|
|
544
|
+
total_compressed_bytes = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
|
|
545
|
+
total_compressed_mb = round(total_compressed_bytes / (1024 * 1024), 2) if total_compressed_bytes else None
|
|
538
546
|
# Handle both 'encodings' (multiple) and 'encoding' (single) column names
|
|
539
547
|
if 'encodings' in col_stats.columns:
|
|
540
548
|
encodings = col_stats['encodings'].unique().tolist()
|
|
@@ -548,8 +556,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
548
556
|
parquet_by_column[col_name] = {
|
|
549
557
|
'encoding': ', '.join([str(e) for e in encodings if e is not None]),
|
|
550
558
|
'compression': ', '.join([str(c) for c in compressions if c is not None]),
|
|
551
|
-
'
|
|
552
|
-
'total_uncompressed_size': total_uncompressed,
|
|
559
|
+
'total_compressed_size_mb': total_compressed_mb,
|
|
553
560
|
'num_row_groups': num_row_groups
|
|
554
561
|
}
|
|
555
562
|
|
|
@@ -567,7 +574,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
567
574
|
'schema': schema_val,
|
|
568
575
|
'table': table_val,
|
|
569
576
|
'sort_type': sort_order,
|
|
570
|
-
'
|
|
577
|
+
'name': col,
|
|
571
578
|
'order': None,
|
|
572
579
|
'RLE': rle_value,
|
|
573
580
|
'NDV': ndv_value,
|
|
@@ -575,9 +582,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
575
582
|
'total_RLE': total_rle_all_val,
|
|
576
583
|
'encoding': col_parquet.get('encoding', ''),
|
|
577
584
|
'compression': col_parquet.get('compression', ''),
|
|
578
|
-
'
|
|
579
|
-
'
|
|
580
|
-
'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
|
|
585
|
+
'column_size_mb': col_parquet.get('total_compressed_size_mb', None),
|
|
586
|
+
'avg_row_group_size': avg_row_group_size,
|
|
581
587
|
'table_size_mb': table_size_mb,
|
|
582
588
|
'vorder': vorder_status,
|
|
583
589
|
'comments': ''
|
|
@@ -903,7 +909,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
903
909
|
'schema': schema_val,
|
|
904
910
|
'table': table_val,
|
|
905
911
|
'sort_type': sort_order,
|
|
906
|
-
'
|
|
912
|
+
'name': col,
|
|
907
913
|
'order': order_position,
|
|
908
914
|
'RLE': rle_value,
|
|
909
915
|
'NDV': ndv_value,
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.dev6"
|
|
7
|
+
version = "0.2.19.dev7"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
7
7
|
import duckrun
|
|
8
8
|
|
|
9
9
|
# Analyze multiple schemas/tables
|
|
10
|
-
conn = duckrun.connect("tmp/data.lakehouse")
|
|
10
|
+
conn = duckrun.connect("tmp/data.lakehouse/spark_vorder")
|
|
11
11
|
|
|
12
12
|
# Analyze tables - now returns long format automatically
|
|
13
|
-
result = conn.rle("
|
|
13
|
+
result = conn.rle("summary")
|
|
14
14
|
print(result)
|
|
15
15
|
conn.close()
|
|
16
16
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|