PyPI - duckrun - Versions diffs - 0.2.19.dev2__tar.gz → 0.2.19.dev4__tar.gz - Mend

duckrun 0.2.19.dev2__tar.gz → 0.2.19.dev4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

{duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.19.dev2
+Version: 0.2.19.dev4
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT

{duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/core.py RENAMED Viewed

@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
             refresh=refresh
         )
-    def rle(self, table_name: str = None, mode: str = "summary", sort_columns: List[str] = None,
-            limit: int = None, max_combinations: int = 20, use_stratified_sampling: bool = True,
-            num_segments: int = 5, segment_size: int = 1000):
+    def rle(self, table_name: str = None, mode = "natural",
+            min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
+            max_ordering_depth: int = 3, limit: int = None):
         """
         Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
@@ -1254,38 +1254,48 @@ class Duckrun(WorkspaceOperationsMixin):
             table_name: Name of the table to analyze. Can be:
                 - 'table_name' (uses current schema)
                 - 'schema.table_name' (specific schema)
-                - None (analyzes all tables in current schema - summary only)
-            mode: Analysis mode:
-                - "summary": Quick NFV (Number of Distinct Values) analysis (default)
-                - "smart": Smart heuristic-based analysis (recommended)
-                - "full": Full RLE analysis with all column orderings
-            sort_columns: Optional list of columns to sort by for RLE calculation
-            limit: Optional limit on number of rows to analyze (ignored if using stratified sampling)
-            max_combinations: Maximum number of orderings to test (for smart mode)
-            use_stratified_sampling: If True, use stratified sampling across entire file (recommended)
-            num_segments: Number of segments for stratified sampling
-            segment_size: Size of each segment for sampling
+            mode: Analysis mode or column ordering:
+                - "natural": Calculate RLE for natural order only (fastest)
+                - "auto": Natural order + cardinality-based ordering (recommended)
+                - "advanced": Natural + cardinality + greedy incremental search (most thorough)
+                - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
+            min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
+            max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
+            max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
+            limit: Optional row limit for testing/development (default: None, analyzes all rows)
         Returns:
             DataFrame with RLE analysis results
         Examples:
-            # Quick summary of a specific table
+            # Natural order only (baseline)
             con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
-            con.rle("mytable")  # defaults to summary mode
-            con.rle("mytable", "summary")
+            con.rle("mytable")  # same as con.rle("mytable", "natural")
-            # Smart analysis (finds optimal column ordering)
-            con.rle("mytable", "smart")
+            # Auto optimization (natural + cardinality-based)
+            con.rle("mytable", "auto")
+            # Advanced optimization (greedy incremental search)
+            con.rle("mytable", "advanced")
+            # Test specific column ordering
+            con.rle("mytable", ["date", "duid"])
+            con.rle("mytable", ["cutoff", "time", "DUID", "date"])
+            # Advanced with custom depth
+            con.rle("mytable", "advanced", max_ordering_depth=4)
             # Analyze table from different schema
-            con.rle("otherschema.mytable", "smart")
+            con.rle("otherschema.mytable", "auto")
-            # Full analysis with custom parameters
-            con.rle("mytable", "full", use_stratified_sampling=True, num_segments=10)
+            # Custom thresholds for small tables
+            con.rle("mytable", "auto", max_cardinality_pct=0.05)
+            # Limit rows for testing
+            con.rle("mytable", "auto", limit=10000)
         """
         from .rle import (
-            calculate_nfv_score,
+            calculate_cardinality_ratio,
             test_column_orderings_smart,
             calculate_rle_for_columns
         )
@@ -1310,7 +1320,7 @@ class Duckrun(WorkspaceOperationsMixin):
         # Construct the full table path using the same logic as get_stats
         table_path = f"{self.table_base_url}{schema_name}/{tbl}"
-        # Get the actual parquet files from Delta table
+        # Verify table exists and is not empty
         print(f"📊 Analyzing table: {schema_name}.{tbl}")
         try:
@@ -1321,40 +1331,102 @@ class Duckrun(WorkspaceOperationsMixin):
                 print("⚠️  Table is empty (no files)")
                 return None
-            # Construct full paths for parquet files
-            parquet_paths = [table_path + "/" + f for f in delta_files]
         except Exception as e:
             print(f"❌ Error accessing Delta table: {e}")
             return None
-        # For now, analyze the first file (can be extended to analyze all files)
-        parquet_path = parquet_paths[0]
-        if mode == "summary":
-            # Quick NFV analysis
-            nfv_scores = calculate_nfv_score(self.con, parquet_path, limit)
+        # Check if mode is a list of columns (custom ordering)
+        if isinstance(mode, list):
+            # User wants to test a specific column ordering
+            print(f"Testing custom column ordering: {', '.join(mode)}")
+            # Calculate cardinality for NDV values
+            card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
+            # Calculate RLE for the specified ordering
+            rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
+            total_rle_all = sum(rle_counts.values())
+            print(f"\nResults:")
+            print(f"  Custom ordering: [{', '.join(mode)}]")
+            print(f"  Total RLE (all columns): {total_rle_all:,} runs")
+            # Return as DataFrame for consistency
             import pandas as pd
-            df = pd.DataFrame([
-                {"column": col, "nfv_score": score}
-                for col, score in sorted(nfv_scores.items(), key=lambda x: x[1])
-            ])
-            return df
-        elif mode in ["smart", "full"]:
-            # Smart or full RLE analysis
-            return test_column_orderings_smart(
-                self.con,
-                parquet_path,
-                limit=limit,
-                max_combinations=max_combinations,
-                use_stratified_sampling=use_stratified_sampling,
-                num_segments=num_segments,
-                segment_size=segment_size
-            )
-        else:
-            print(f"❌ Unknown mode: {mode}. Use 'summary', 'smart', or 'full'")
-            return None
+            results = [{
+                'schema': schema_name,
+                'table': tbl,
+                'sort_order': 'custom',
+                'columns_used': ', '.join(mode),
+                'total_rle_all': total_rle_all,
+                **rle_counts
+            }]
+            df = pd.DataFrame(results)
+            # Transform to long format
+            long_format_results = []
+            for _, row in df.iterrows():
+                schema_val = row['schema']
+                table_val = row['table']
+                sort_order = row['sort_order']
+                columns_used = row['columns_used']
+                total_rle_all_val = row['total_rle_all']
+                # Get all column names except metadata columns
+                metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
+                data_columns = [col for col in df.columns if col not in metadata_cols]
+                # Get total rows from card_stats if available
+                total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
+                # Parse the columns_used to get ordering
+                sort_columns_list = [c.strip() for c in columns_used.split(',')]
+                # Create one row per data column
+                for col in data_columns:
+                    rle_value = row[col]
+                    # Get NDV from card_stats
+                    ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
+                    # Determine if column was included in the sort and its position
+                    is_in_sort = col in sort_columns_list
+                    order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
+                    comment = '' if is_in_sort else 'not included in the sort'
+                    long_format_results.append({
+                        'schema': schema_val,
+                        'table': table_val,
+                        'sort_type': sort_order,
+                        'column': col,
+                        'order': order_position,
+                        'RLE': rle_value,
+                        'NDV': ndv_value,
+                        'total_rows': total_rows,
+                        'total_RLE': total_rle_all_val,
+                        'comments': comment
+                    })
+            long_df = pd.DataFrame(long_format_results)
+            return long_df
+        # All modes now use test_column_orderings_smart with the mode parameter
+        return test_column_orderings_smart(
+            self.con,
+            table_path,
+            table_name=table_name,  # Pass table name for cardinality calculation on full dataset
+            mode=mode,
+            limit=limit,
+            min_distinct_threshold=min_distinct_threshold,
+            max_cardinality_pct=max_cardinality_pct,
+            max_ordering_depth=max_ordering_depth,
+            schema_name=schema_name,
+            table_display_name=tbl
+        )
     def close(self):
         """Close DuckDB connection"""

duckrun 0.2.19.dev2__tar.gz → 0.2.19.dev4__tar.gz

duckrun 0.2.19.dev2__tar.gz → 0.2.19.dev4__tar.gz