PyPI - duckrun - Versions diffs - 0.2.19.dev4__tar.gz → 0.2.19.dev6__tar.gz - Mend

duckrun 0.2.19.dev4.tar.gz → 0.2.19.dev6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

{duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.19.dev4
+Version: 0.2.19.dev6
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT

{duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/core.py RENAMED Viewed

@@ -1425,7 +1425,8 @@ class Duckrun(WorkspaceOperationsMixin):
             max_cardinality_pct=max_cardinality_pct,
             max_ordering_depth=max_ordering_depth,
             schema_name=schema_name,
-            table_display_name=tbl
+            table_display_name=tbl,
+            duckrun_instance=self  # Pass duckrun instance for detailed parquet stats
         )
     def close(self):

{duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun/rle.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import itertools
 from typing import List, Dict, Tuple, Optional
 import pandas as pd
+from .stats import get_stats
 def analyze_parquet_row_groups(con, parquet_path: str) -> pd.DataFrame:
     """
@@ -238,22 +239,21 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
     Calculate cardinality ratio for each column (distinct_values / total_rows).
     Lower ratio = better for RLE compression (more repetition).
-    NEVER uses sampling - always scans full dataset with exact or approximate distinct counts.
+    NEVER uses sampling - always scans full dataset with exact distinct counts.
     Args:
         con: DuckDB connection
         source: Either a table name (default) or parquet file path
         limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
         is_parquet: If True, source is a parquet file path; if False, source is a table name
-        use_approx: If True, use HyperLogLog (approx). If False, use exact COUNT(DISTINCT).
-                    If None (default), auto-decide based on table size threshold.
-        approx_threshold: Row count threshold for using HyperLogLog (default: 100M rows)
+        use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
+        approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
     Returns:
         Dictionary mapping column names to dict with keys:
         - 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
         - 'total_rows': total row count
-        - 'distinct_values': number of distinct values (exact or approximate)
+        - 'distinct_values': number of distinct values (exact)
     """
     # Build the FROM clause based on source type
     if is_parquet:
@@ -274,26 +274,15 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
     if not column_names:
         return {}
-    # Auto-decide whether to use approximate or exact based on table size
-    if use_approx is None:
-        # Quick row count check
-        total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
-        use_approx = total_rows > approx_threshold
-        if use_approx:
-            print(f"   Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
-        else:
-            print(f"   Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
-    else:
-        total_rows = None  # Will be calculated in main query
+    # Get row count
+    total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
+    print(f"   Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
-    # Build a single query that calculates all NFV scores in one pass
+    # Build a single query that calculates all cardinality in one pass
     # This scans the data only ONCE instead of once per column
     select_clauses = []
     for col in column_names:
-        if use_approx:
-            select_clauses.append(f"approx_count_distinct({col}) as distinct_{col}")
-        else:
-            select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
+        select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
     query = f"""
         SELECT
@@ -307,8 +296,7 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
     if not result:
         return {}
-    if total_rows is None:
-        total_rows = result[0]
+    total_rows = result[0]
     nfv_stats = {}
@@ -405,7 +393,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                                 max_cardinality_pct: float = 0.01,
                                 max_ordering_depth: int = 3,
                                 schema_name: str = None,
-                                table_display_name: str = None) -> pd.DataFrame:
+                                table_display_name: str = None,
+                                duckrun_instance = None) -> pd.DataFrame:
     """
     Test column orderings for RLE optimization.
@@ -425,6 +414,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
         max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
         schema_name: Optional schema name to include in results (default: None)
         table_display_name: Optional table name to include in results (default: None)
+        duckrun_instance: Optional Duckrun instance to fetch detailed parquet stats (default: None)
     Returns:
         DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
@@ -485,6 +475,35 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
         print(f"✓ Analysis complete!")
         print(f"{'='*60}")
+        # Get detailed parquet stats if duckrun_instance is provided
+        parquet_stats = None
+        vorder_status = False
+        table_size_mb = None
+        if duckrun_instance and table_display_name:
+            print("\nFetching detailed parquet metadata...")
+            try:
+                # For single-schema connections, just use the table name
+                # For multi-schema connections, use schema.table format
+                if hasattr(duckrun_instance, 'scan_all_schemas') and duckrun_instance.scan_all_schemas and schema_name:
+                    source_param = f"{schema_name}.{table_display_name}"
+                else:
+                    source_param = table_display_name
+                parquet_stats = get_stats(duckrun_instance, source=source_param, detailed=True)
+                print(f"✓ Retrieved parquet metadata for {len(parquet_stats)} row groups/columns")
+                # Get vorder status from the stats if available
+                if 'vorder' in parquet_stats.columns:
+                    vorder_status = parquet_stats['vorder'].iloc[0] if len(parquet_stats) > 0 else False
+                # Calculate total table size from compressed sizes
+                if 'total_compressed_size' in parquet_stats.columns:
+                    total_bytes = parquet_stats['total_compressed_size'].sum()
+                    table_size_mb = round(total_bytes / (1024 * 1024), 2) if total_bytes else None
+            except Exception as e:
+                print(f"⚠️  Could not fetch parquet stats: {e}")
+                parquet_stats = None
         # Transform to long format
         long_format_results = []
@@ -502,6 +521,38 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
             # Get total rows and NDV from card_stats if available
             total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
+            # Aggregate parquet stats per column if available
+            parquet_by_column = {}
+            if parquet_stats is not None and not parquet_stats.empty:
+                # Determine column name field - can be 'name' or 'path_in_schema'
+                col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
+                # Group by column name and aggregate
+                for col_name in data_columns:
+                    col_stats = parquet_stats[parquet_stats[col_name_field] == col_name] if col_name_field in parquet_stats.columns else pd.DataFrame()
+                    if not col_stats.empty:
+                        # Aggregate stats across all row groups for this column
+                        total_compressed = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
+                        total_uncompressed = col_stats['total_uncompressed_size'].sum() if 'total_uncompressed_size' in col_stats.columns else None
+                        # Handle both 'encodings' (multiple) and 'encoding' (single) column names
+                        if 'encodings' in col_stats.columns:
+                            encodings = col_stats['encodings'].unique().tolist()
+                        elif 'encoding' in col_stats.columns:
+                            encodings = col_stats['encoding'].unique().tolist()
+                        else:
+                            encodings = []
+                        compressions = col_stats['compression'].unique().tolist() if 'compression' in col_stats.columns else []
+                        num_row_groups = col_stats['row_group_id'].nunique() if 'row_group_id' in col_stats.columns else len(col_stats)
+                        parquet_by_column[col_name] = {
+                            'encoding': ', '.join([str(e) for e in encodings if e is not None]),
+                            'compression': ', '.join([str(c) for c in compressions if c is not None]),
+                            'total_compressed_size': total_compressed,
+                            'total_uncompressed_size': total_uncompressed,
+                            'num_row_groups': num_row_groups
+                        }
             # Create one row per data column
             for col in data_columns:
                 rle_value = row[col]
@@ -509,6 +560,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                 # Get NDV from card_stats
                 ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
+                # Get parquet stats for this column
+                col_parquet = parquet_by_column.get(col, {})
                 long_format_results.append({
                     'schema': schema_val,
                     'table': table_val,
@@ -519,6 +573,13 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
                     'NDV': ndv_value,
                     'total_rows': total_rows,
                     'total_RLE': total_rle_all_val,
+                    'encoding': col_parquet.get('encoding', ''),
+                    'compression': col_parquet.get('compression', ''),
+                    'total_compressed_size': col_parquet.get('total_compressed_size', None),
+                    'total_uncompressed_size': col_parquet.get('total_uncompressed_size', None),
+                    'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
+                    'table_size_mb': table_size_mb,
+                    'vorder': vorder_status,
                     'comments': ''
                 })

{duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/duckrun.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.19.dev4
+Version: 0.2.19.dev6
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT

{duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "duckrun"
-version = "0.2.19.dev4"
+version = "0.2.19.dev6"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}

{duckrun-0.2.19.dev4 → duckrun-0.2.19.dev6}/tests/test_rle.py RENAMED Viewed

@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import duckrun
 # Analyze multiple schemas/tables
-conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
+conn = duckrun.connect("tmp/data.lakehouse")
 # Analyze tables - now returns long format automatically
-result = conn.rle("calendar",'auto')
+result = conn.rle("aemo.calendar")
 print(result)
 conn.close()