PyPI - duckrun - Versions diffs - 0.2.19.dev6__tar.gz → 0.2.19.dev7__tar.gz - Mend

duckrun 0.2.19.dev6.tar.gz → 0.2.19.dev7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.19.dev6
+Version: 0.2.19.dev7
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT

{duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun/rle.py RENAMED Viewed

@@ -523,7 +523,15 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
             # Aggregate parquet stats per column if available
             parquet_by_column = {}
+            avg_row_group_size = None  # Calculate once for the table
             if parquet_stats is not None and not parquet_stats.empty:
+                # Calculate average row group size once (same for all columns)
+                if 'row_group_num_rows' in parquet_stats.columns:
+                    # Get unique row groups to avoid counting duplicates (one entry per column per row group)
+                    unique_rg_sizes = parquet_stats.drop_duplicates(subset=['row_group_id'])['row_group_num_rows']
+                    avg_row_group_size = int(unique_rg_sizes.mean())
                 # Determine column name field - can be 'name' or 'path_in_schema'
                 col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
@@ -533,8 +541,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                     if not col_stats.empty:
                         # Aggregate stats across all row groups for this column
-                        total_compressed = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
-                        total_uncompressed = col_stats['total_uncompressed_size'].sum() if 'total_uncompressed_size' in col_stats.columns else None
+                        total_compressed_bytes = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
+                        total_compressed_mb = round(total_compressed_bytes / (1024 * 1024), 2) if total_compressed_bytes else None
                         # Handle both 'encodings' (multiple) and 'encoding' (single) column names
                         if 'encodings' in col_stats.columns:
                             encodings = col_stats['encodings'].unique().tolist()
@@ -548,8 +556,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                         parquet_by_column[col_name] = {
                             'encoding': ', '.join([str(e) for e in encodings if e is not None]),
                             'compression': ', '.join([str(c) for c in compressions if c is not None]),
-                            'total_compressed_size': total_compressed,
-                            'total_uncompressed_size': total_uncompressed,
+                            'total_compressed_size_mb': total_compressed_mb,
                             'num_row_groups': num_row_groups
                         }
@@ -567,7 +574,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                     'schema': schema_val,
                     'table': table_val,
                     'sort_type': sort_order,
-                    'column': col,
+                    'name': col,
                     'order': None,
                     'RLE': rle_value,
                     'NDV': ndv_value,
@@ -575,9 +582,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                     'total_RLE': total_rle_all_val,
                     'encoding': col_parquet.get('encoding', ''),
                     'compression': col_parquet.get('compression', ''),
-                    'total_compressed_size': col_parquet.get('total_compressed_size', None),
-                    'total_uncompressed_size': col_parquet.get('total_uncompressed_size', None),
-                    'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
+                    'column_size_mb': col_parquet.get('total_compressed_size_mb', None),
+                    'avg_row_group_size': avg_row_group_size,
                     'table_size_mb': table_size_mb,
                     'vorder': vorder_status,
                     'comments': ''
@@ -903,7 +909,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                 'schema': schema_val,
                 'table': table_val,
                 'sort_type': sort_order,
-                'column': col,
+                'name': col,
                 'order': order_position,
                 'RLE': rle_value,
                 'NDV': ndv_value,

{duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/duckrun.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.19.dev6
+Version: 0.2.19.dev7
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT

{duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "duckrun"
-version = "0.2.19.dev6"
+version = "0.2.19.dev7"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}

{duckrun-0.2.19.dev6 → duckrun-0.2.19.dev7}/tests/test_rle.py RENAMED Viewed

@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import duckrun
 # Analyze multiple schemas/tables
-conn = duckrun.connect("tmp/data.lakehouse")
+conn = duckrun.connect("tmp/data.lakehouse/spark_vorder")
 # Analyze tables - now returns long format automatically
-result = conn.rle("aemo.calendar")
+result = conn.rle("summary")
 print(result)
 conn.close()