PyPI - duckrun - Versions diffs - 0.2.18.dev2__py3-none-any.whl → 0.2.19.dev0__py3-none-any.whl - Mend

duckrun 0.2.18.dev2__py3-none-any.whl → 0.2.19.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of duckrun might be problematic. Click here for more details.

Files changed (11)

duckrun/__init__.py +1 -1
duckrun/core.py +28 -14
duckrun/notebook.py +2 -0
duckrun/semantic_model.py +115 -13
duckrun/stats.py +227 -67
{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/METADATA +1 -1
duckrun-0.2.19.dev0.dist-info/RECORD +15 -0
duckrun-0.2.18.dev2.dist-info/RECORD +0 -15
{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/WHEEL +0 -0
{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/licenses/LICENSE +0 -0
{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/top_level.txt +0 -0

duckrun/__init__.py CHANGED Viewed

@@ -3,7 +3,7 @@
 from duckrun.core import Duckrun
 from duckrun.notebook import import_notebook_from_web, import_notebook
-__version__ = "0.2.18.dev2"
+__version__ = "0.2.18"
 # Expose unified connect method at module level
 connect = Duckrun.connect

duckrun/core.py CHANGED Viewed

@@ -1035,7 +1035,7 @@ class Duckrun(WorkspaceOperationsMixin):
         """Get underlying DuckDB connection"""
         return self.con
-    def get_stats(self, source: str = None):
+    def get_stats(self, source: str = None, detailed = False):
         """
         Get comprehensive statistics for Delta Lake tables.
@@ -1045,27 +1045,34 @@ class Duckrun(WorkspaceOperationsMixin):
                    - Table name: 'table_name' (uses current schema)
                    - Schema.table: 'schema.table_name' (specific table in schema)
                    - Schema only: 'schema' (all tables in schema)
+            detailed: Optional. Controls the level of detail in statistics:
+                     - False (default): Aggregated table-level stats
+                     - True: Row group level statistics with compression details
         Returns:
-            Arrow table with statistics including total rows, file count, row groups,
-            average row group size, file sizes, VORDER status, and timestamp
+            DataFrame with statistics based on detailed parameter:
+            - If detailed=False: Aggregated table-level summary
+            - If detailed=True: Granular file and row group level stats
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")
-            # All tables in current schema (aemo)
+            # All tables in current schema (aemo) - aggregated
             stats = con.get_stats()
-            # Single table in current schema
+            # Single table in current schema - aggregated
             stats = con.get_stats('price')
+            # Single table with detailed row group statistics
+            stats_detailed = con.get_stats('price', detailed=True)
             # Specific table in different schema
             stats = con.get_stats('aemo.price')
             # All tables in a schema
             stats = con.get_stats('aemo')
         """
-        return _get_stats(self, source)
+        return _get_stats(self, source, detailed)
     def list_lakehouses(self) -> List[str]:
         """
@@ -1179,7 +1186,7 @@ class Duckrun(WorkspaceOperationsMixin):
             return False
     def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-               wait_seconds: int = 5) -> int:
+               wait_seconds: int = 5, refresh: str = "full") -> int:
         """
         Deploy a semantic model from a BIM file using DirectLake mode.
@@ -1188,8 +1195,11 @@ class Duckrun(WorkspaceOperationsMixin):
                 - URL: "https://raw.githubusercontent.com/.../model.bim"
                 - Local file: "model.bim"
                 - Workspace/Model: "workspace_name/model_name"
-            dataset_name: Name for the semantic model (default: source model name if workspace/model format, else lakehouse_schema)
+            dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
+            refresh: Refresh strategy:
+                - "full": Clear values and process full refresh (default)
+                - "ignore": Skip refresh entirely
         Returns:
             1 for success, 0 for failure
@@ -1197,14 +1207,17 @@ class Duckrun(WorkspaceOperationsMixin):
         Examples:
             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+            # Deploy with schema name as dataset name (dbo)
+            dr.deploy("https://github.com/.../model.bim")
             # Deploy from workspace/model (uses same name by default)
             dr.deploy("Source Workspace/Source Model")  # Creates "Source Model"
             # Deploy with custom name
-            dr.deploy("Source Workspace/Source Model", dataset_name="Sales Model Copy")
+            dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
-            # Deploy from URL or local file
-            dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
+            # Deploy without refresh
+            dr.deploy("https://github.com/.../model.bim", refresh="ignore")
         """
         from .semantic_model import deploy_semantic_model
@@ -1216,9 +1229,9 @@ class Duckrun(WorkspaceOperationsMixin):
                 if len(parts) == 2:
                     dataset_name = parts[1]  # Use the model name
                 else:
-                    dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                    dataset_name = self.schema  # Use schema name
             else:
-                dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                dataset_name = self.schema  # Use schema name
         # Call the deployment function (DirectLake only)
         return deploy_semantic_model(
@@ -1227,7 +1240,8 @@ class Duckrun(WorkspaceOperationsMixin):
             schema_name=self.schema,
             dataset_name=dataset_name,
             bim_url_or_path=bim_url,
-            wait_seconds=wait_seconds
+            wait_seconds=wait_seconds,
+            refresh=refresh
         )
     def close(self):

duckrun/notebook.py CHANGED Viewed

@@ -160,6 +160,7 @@ def import_notebook_from_web(
             update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
             payload = {
                 "definition": {
+                    "format": "ipynb",
                     "parts": [
                         {
                             "path": "notebook-content.py",
@@ -192,6 +193,7 @@ def import_notebook_from_web(
             payload = {
                 "displayName": notebook_name,
                 "definition": {
+                    "format": "ipynb",
                     "parts": [
                         {
                             "path": "notebook-content.py",

duckrun/semantic_model.py CHANGED Viewed

@@ -129,14 +129,21 @@ def check_dataset_exists(dataset_name, workspace_id, client):
         return False
-def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
+def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
     """Refresh a dataset and monitor progress using Power BI API
-    For DirectLake models, performs a two-step refresh:
-    1. clearValues - Purges data from memory
-    2. full - Reframes data from Delta tables
+    For DirectLake models, performs refresh based on refresh parameter:
+    - refresh="full": Two-step refresh (clearValues + full reframe)
+    - refresh="ignore": Skip refresh entirely
+    If a refresh is already in progress, waits for it to complete before starting a new one.
     """
+    # Skip refresh entirely if refresh is "ignore"
+    if refresh == "ignore":
+        print("   Ignoring refresh - skipping refresh")
+        return
     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
@@ -145,6 +152,46 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
     powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
     headers = client._get_headers()
+    # Check for in-progress refreshes
+    print("   Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f"   ⚠️  Found in-progress refresh (ID: {refresh_id})")
+                    print(f"   Waiting for current refresh to complete...")
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+                            if current_status == 'Completed':
+                                print(f"   ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f"   ⚠️  Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f"   ⚠️  Previous refresh was cancelled, continuing with new refresh")
+                                break
+                            if attempt % 6 == 0:
+                                print(f"   Still waiting... (status: {current_status})")
+                    else:
+                        print(f"   ⚠️  Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f"   ⚠️  Could not check refresh status: {e}")
+        print(f"   Continuing with refresh attempt...")
     # Step 1: clearValues - Purge data from memory
     print("   Step 1: Clearing values from memory...")
     clearvalues_payload = {
@@ -158,9 +205,45 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
     response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)
     if response.status_code in [200, 202]:
-        print("   ✓ Clear values completed")
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print("   ✓ Clear values initiated, monitoring progress...")
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+                    if status == 'Completed':
+                        print(f"   ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f"   Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print("   ✓ Clear values completed")
     else:
-        response.raise_for_status()
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
     # Step 2: full refresh - Reframe data from Delta tables
     print("   Step 2: Full refresh to reframe data...")
@@ -175,7 +258,7 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
     response = requests.post(powerbi_url, headers=headers, json=full_payload)
     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f"   ✓ Refresh initiated")
         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
@@ -207,7 +290,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
                 raise Exception(f"Refresh timed out")
     else:
-        response.raise_for_status()
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
 def download_bim_from_github(url_or_path):
@@ -455,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
 def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
-                         bim_url_or_path, wait_seconds=5):
+                         bim_url_or_path, wait_seconds=5, refresh="full"):
     """
     Deploy a semantic model using DirectLake mode.
@@ -466,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         dataset_name: Name for the semantic model
         bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
     Returns:
         1 for success, 0 for failure
@@ -478,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Using a local file
         dr.deploy("./my_model.bim")
         dr.deploy("C:/path/to/model.bim")
+        # Deploy without refresh
+        dr.deploy("./my_model.bim", refresh="ignore")
     """
     print("=" * 70)
     print("Semantic Model Deployment (DirectLake)")
@@ -502,7 +597,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
                 time.sleep(wait_seconds)
             print("\n[Step 3/3] Refreshing existing semantic model...")
-            refresh_dataset(dataset_name, workspace_id, client)
+            refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)
             print("\n" + "=" * 70)
             print("🎉 Refresh Completed!")
@@ -534,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Step 6: Refresh using the dataset ID returned from creation
         print("\n[Step 6/6] Refreshing semantic model...")
-        refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+        refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)
         print("\n" + "=" * 70)
         print("🎉 Deployment Completed!")
@@ -561,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         return 0
-def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
     """
     Copy a semantic model from one workspace to another.
@@ -574,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         destination: Destination in format "workspace/lakehouse.lakehouse/schema"
         new_model_name: Name for the new semantic model (default: same as source)
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
     Returns:
         1 for success, 0 for failure
@@ -586,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
                    new_model_name="Production Model - Copy")
+        # Copy without refresh
+        copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
         # Using the connect pattern
         import duckrun
         duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -712,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
             schema_name=schema,
             dataset_name=new_model_name,
             bim_url_or_path=temp_bim_path,
-            wait_seconds=wait_seconds
+            wait_seconds=wait_seconds,
+            refresh=refresh
         )
         # Clean up temp file

duckrun/stats.py CHANGED Viewed

@@ -60,7 +60,50 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []
-def get_stats(duckrun_instance, source: str = None):
+def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+    """Match tables across all schemas using a wildcard pattern.
+    Pattern can be:
+    - '*.summary' - matches 'summary' table in all schemas
+    - '*summary' - matches any table ending with 'summary'
+    - 'schema.*' - matches all tables in 'schema'
+    Returns a dict mapping schema names to lists of matching table names."""
+    import fnmatch
+    try:
+        # Query all schemas and tables in one go
+        query = """
+            SELECT table_schema, table_name
+            FROM information_schema.tables
+            WHERE table_schema NOT LIKE 'pg_%'
+            AND table_schema != 'information_schema'
+            AND table_name NOT LIKE 'tbl_%'
+        """
+        result = duckrun_instance.con.execute(query).fetchall()
+        matched = {}
+        # Check if pattern contains a dot (schema.table pattern)
+        if '.' in pattern:
+            schema_pattern, table_pattern = pattern.split('.', 1)
+            for schema, table in result:
+                if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        else:
+            # Pattern matches only table names
+            for schema, table in result:
+                if fnmatch.fnmatch(table, pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        return matched
+    except:
+        return {}
+def get_stats(duckrun_instance, source: str = None, detailed = False):
     """
     Get comprehensive statistics for Delta Lake tables.
@@ -71,25 +114,35 @@ def get_stats(duckrun_instance, source: str = None):
                - Table name: 'table_name' (uses main schema in DuckDB)
                - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
                - Schema only: 'schema' (all tables in schema, if multi-schema)
+               - Wildcard pattern: '*.summary' (matches tables across all schemas)
+        detailed: Optional. Controls the level of detail in statistics:
+                 - False (default): Aggregated table-level stats (total rows, file count,
+                   row groups, average row group size, file sizes, VORDER status)
+                 - True: Row group level statistics with compression details, row group sizes,
+                   and parquet metadata
     Returns:
-        Arrow table with statistics including total rows, file count, row groups,
-        average row group size, file sizes, VORDER status, and timestamp
+        DataFrame with statistics based on detailed parameter:
+        - If detailed=False: Aggregated table-level summary
+        - If detailed=True: Granular file and row group level stats
     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")
-        # All tables in the connection's schema
+        # All tables in the connection's schema (aggregated)
         stats = con.get_stats()
-        # Single table in main schema (DuckDB uses 'main', not 'test')
-        stats = con.get_stats('price_today')
+        # Single table with detailed row group statistics
+        stats_detailed = con.get_stats('price_today', detailed=True)
         # Specific table in different schema (only if multi-schema enabled)
         stats = con.get_stats('aemo.price')
         # All tables in a schema (only if multi-schema enabled)
         stats = con.get_stats('aemo')
+        # Wildcard pattern across all schemas (only if multi-schema enabled)
+        stats = con.get_stats('*.summary')
     """
     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
@@ -101,8 +154,27 @@ def get_stats(duckrun_instance, source: str = None):
     if source is None:
         source = url_schema
+    # Check if source contains wildcard characters
+    if '*' in source or '?' in source:
+        # Wildcard pattern mode - only valid if multi-schema is enabled
+        if not duckrun_instance.scan_all_schemas:
+            raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+        matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+        if not matched_tables:
+            raise ValueError(f"No tables found matching pattern '{source}'")
+        # Flatten the matched tables into a list with schema info
+        tables_with_schemas = []
+        for schema, tables in matched_tables.items():
+            for table in tables:
+                tables_with_schemas.append((schema, table))
+        print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
     # Parse the source and validate existence
-    if '.' in source:
+    elif '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
         schema_name, table_name = source.split('.', 1)
@@ -113,46 +185,45 @@ def get_stats(duckrun_instance, source: str = None):
         if not _table_exists(duckrun_instance, schema_name, table_name):
             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
-        list_tables = [table_name]
+        tables_with_schemas = [(schema_name, table_name)]
     else:
         # Could be just table name or schema name
         if duckrun_instance.scan_all_schemas:
             # Multi-schema mode: DuckDB has actual schemas
             # First check if it's a table in main schema
             if _table_exists(duckrun_instance, duckdb_schema, source):
-                list_tables = [source]
-                schema_name = duckdb_schema
+                tables_with_schemas = [(duckdb_schema, source)]
             # Otherwise, check if it's a schema name
             elif _schema_exists(duckrun_instance, source):
                 schema_name = source
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                 if not list_tables:
                     raise ValueError(f"Schema '{source}' exists but contains no tables")
+                tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
         else:
             # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
             if _table_exists(duckrun_instance, duckdb_schema, source):
                 # It's a table name
-                list_tables = [source]
-                schema_name = url_schema  # Use URL schema for file path construction
+                tables_with_schemas = [(url_schema, source)]
             elif source == url_schema:
                 # Special case: user asked for stats on the URL schema name - list all tables
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
-                schema_name = url_schema  # Use URL schema for file path construction
                 if not list_tables:
                     raise ValueError(f"No tables found in schema '{url_schema}'")
+                tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
     # Use the existing connection
     con = duckrun_instance.con
-    print(f"Processing {len(list_tables)} tables: {list_tables}")
+    print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")
     successful_tables = []
-    for idx, tbl in enumerate(list_tables):
-        print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
+    for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+        print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
         # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
         table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
@@ -179,8 +250,18 @@ def get_stats(duckrun_instance, source: str = None):
                     print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
                     xx = {}
-            # Check if VORDER exists
-            vorder = 'tags.VORDER' in xx.keys()
+            # Check if VORDER exists - handle both formats:
+            # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+            # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+            vorder = False
+            if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+                vorder = True
+            elif 'tags' in xx.keys() and xx['tags']:
+                # Check nested tags dictionary (tags is a list of dicts, one per file)
+                for tag_dict in xx['tags']:
+                    if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+                        vorder = True
+                        break
             # Calculate total size
             total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
@@ -195,6 +276,7 @@ def get_stats(duckrun_instance, source: str = None):
                 con.execute(f'''
                     CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                     SELECT
+                        '{schema_name}' as schema,
                         '{tbl}' as tbl,
                         'empty' as file_name,
                         0 as num_rows,
@@ -207,21 +289,45 @@ def get_stats(duckrun_instance, source: str = None):
                 ''')
             else:
                 # Get parquet metadata and create temp table with compression info
-                con.execute(f'''
-                    CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
-                    SELECT
-                        '{tbl}' as tbl,
-                        fm.file_name,
-                        fm.num_rows,
-                        fm.num_row_groups,
-                        CEIL({total_size}/(1024*1024)) as size,
-                        {vorder} as vorder,
-                        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
-                        '{timestamp}' as timestamp
-                    FROM parquet_file_metadata({delta}) fm
-                    LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
-                    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
-                ''')
+                if detailed == True:
+                    # Detailed mode: Include row group level statistics
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            pm.file_name,
+                            pm.row_group_id,
+                            pm.row_group_num_rows,
+                            pm.row_group_num_columns,
+                            pm.row_group_bytes,
+                            {vorder} as vorder,
+                            pm.compression,
+                            pm.total_compressed_size,
+                            pm.total_uncompressed_size,
+                            ROUND(pm.total_compressed_size::DOUBLE / NULLIF(pm.total_uncompressed_size, 0), 4) as compression_ratio,
+                            '{timestamp}' as timestamp
+                        FROM parquet_metadata({delta}) pm
+                        WHERE pm.column_id = 0  -- Only include first column to avoid duplication per column
+                    ''')
+                else:
+                    # Aggregated mode: Original summary statistics
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{schema_name}' as schema,
+                            '{tbl}' as tbl,
+                            fm.file_name,
+                            fm.num_rows,
+                            fm.num_row_groups,
+                            CEIL({total_size}/(1024*1024)) as size,
+                            {vorder} as vorder,
+                            COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                            '{timestamp}' as timestamp
+                        FROM parquet_file_metadata({delta}) fm
+                        LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+                        GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                    ''')
         except Exception as e:
             error_msg = str(e)
@@ -245,6 +351,7 @@ def get_stats(duckrun_instance, source: str = None):
                     con.execute(f'''
                         CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                         SELECT
+                            '{schema_name}' as schema,
                             '{tbl}' as tbl,
                             'empty' as file_name,
                             0 as num_rows,
@@ -269,21 +376,45 @@ def get_stats(duckrun_instance, source: str = None):
                         filenames.append(table_path + "/" + filename)
                     # Use parquet_file_metadata to get actual parquet stats with compression
-                    con.execute(f'''
-                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
-                        SELECT
-                            '{tbl}' as tbl,
-                            fm.file_name,
-                            fm.num_rows,
-                            fm.num_row_groups,
-                            0 as size,
-                            false as vorder,
-                            COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
-                            '{timestamp}' as timestamp
-                        FROM parquet_file_metadata({filenames}) fm
-                        LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
-                        GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
-                    ''')
+                    if detailed == True:
+                        # Detailed mode: Include row group level statistics
+                        con.execute(f'''
+                            CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                            SELECT
+                                '{schema_name}' as schema,
+                                '{tbl}' as tbl,
+                                pm.file_name,
+                                pm.row_group_id,
+                                pm.row_group_num_rows,
+                                pm.row_group_num_columns,
+                                pm.row_group_bytes,
+                                false as vorder,
+                                pm.compression,
+                                pm.total_compressed_size,
+                                pm.total_uncompressed_size,
+                                ROUND(pm.total_compressed_size::DOUBLE / NULLIF(pm.total_uncompressed_size, 0), 4) as compression_ratio,
+                                '{timestamp}' as timestamp
+                            FROM parquet_metadata({filenames}) pm
+                            WHERE pm.column_id = 0  -- Only include first column to avoid duplication per column
+                        ''')
+                    else:
+                        # Aggregated mode: Original summary statistics
+                        con.execute(f'''
+                            CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                            SELECT
+                                '{schema_name}' as schema,
+                                '{tbl}' as tbl,
+                                fm.file_name,
+                                fm.num_rows,
+                                fm.num_row_groups,
+                                0 as size,
+                                false as vorder,
+                                COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+                                '{timestamp}' as timestamp
+                            FROM parquet_file_metadata({filenames}) fm
+                            LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+                            GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+                        ''')
                 print(f"   ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
             except Exception as fallback_error:
@@ -299,30 +430,59 @@ def get_stats(duckrun_instance, source: str = None):
         # No tables were processed successfully - return empty dataframe
         print("⚠️  No tables could be processed successfully")
         import pandas as pd
-        return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
-                                     'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
+        if detailed == True:
+            return pd.DataFrame(columns=['schema', 'tbl', 'file_name', 'row_group_id', 'row_group_num_rows',
+                                         'row_group_num_columns', 'row_group_bytes', 'vorder', 'compression',
+                                         'total_compressed_size', 'total_uncompressed_size', 'compression_ratio', 'timestamp'])
+        else:
+            return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
+                                         'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
     # Union all successfully processed temp tables
     union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
     union_query = ' UNION ALL '.join(union_parts)
-    # Generate final summary
-    final_result = con.execute(f'''
-        SELECT
-            tbl,
-            SUM(num_rows) as total_rows,
-            COUNT(*) as num_files,
-            SUM(num_row_groups) as num_row_group,
-            CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
-            MIN(size) as file_size_MB,
-            ANY_VALUE(vorder) as vorder,
-            STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
-            ANY_VALUE(timestamp) as timestamp
-        FROM ({union_query})
-        WHERE tbl IS NOT NULL
-        GROUP BY tbl
-        ORDER BY total_rows DESC
-    ''').df()
+    # Generate final summary based on detailed flag
+    if detailed == True:
+        # Detailed mode: Return row group level data without aggregation
+        final_result = con.execute(f'''
+            SELECT
+                schema,
+                tbl,
+                file_name,
+                row_group_id,
+                row_group_num_rows,
+                row_group_num_columns,
+                row_group_bytes,
+                vorder,
+                compression,
+                total_compressed_size,
+                total_uncompressed_size,
+                compression_ratio,
+                timestamp
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            ORDER BY schema, tbl, file_name, row_group_id
+        ''').df()
+    else:
+        # Aggregated mode: Original summary statistics
+        final_result = con.execute(f'''
+            SELECT
+                schema,
+                tbl,
+                SUM(num_rows) as total_rows,
+                COUNT(*) as num_files,
+                SUM(num_row_groups) as num_row_group,
+                CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+                MIN(size) as file_size_MB,
+                ANY_VALUE(vorder) as vorder,
+                STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
+                ANY_VALUE(timestamp) as timestamp
+            FROM ({union_query})
+            WHERE tbl IS NOT NULL
+            GROUP BY schema, tbl
+            ORDER BY total_rows DESC
+        ''').df()
     return final_result

{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev2
+Version: 0.2.19.dev0
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT

duckrun-0.2.19.dev0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+duckrun/__init__.py,sha256=-DPOb_ETaBC0M7YqXj482FE1aZ-SxJeSeY6KB6hPgWU,350
+duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
+duckrun/core.py,sha256=jpg1okp6-Y4HubTJmSjyT9uhUc5pFr4A0tcNxNujSig,69086
+duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/notebook.py,sha256=lzDRBoWZ_lePF-_5BbA1_42BImLZC5yrq6nzlmlKglM,12183
+duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
+duckrun/semantic_model.py,sha256=shRPBN1II60K_PH8JOqke-_3hAwLspcx4Add0VJRwwU,35913
+duckrun/stats.py,sha256=HyzfDUGvYIxJ9QM8gbT_ISmVrVeEhhbxpxg1VLAgaRQ,23862
+duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
+duckrun-0.2.19.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.19.dev0.dist-info/METADATA,sha256=I2EXHQLP-Gr_O2Y3yYiAb7el4OTeuutB5P-SvisnO4g,20807
+duckrun-0.2.19.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.19.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.19.dev0.dist-info/RECORD,,

duckrun-0.2.18.dev2.dist-info/RECORD DELETED Viewed

@@ -1,15 +0,0 @@
-duckrun/__init__.py,sha256=vqv_bJjHjrrXGs8Zyxuy-GKTCyJlZ5z3npPQgE9ipBY,355
-duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
-duckrun/core.py,sha256=tWLFOSVZHoJ0r5YJaj0lG1s_kehiIrnxPMrQQIcyh94,68367
-duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
-duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
-duckrun/notebook.py,sha256=SzdKTpvzHiWMrvg7mCd3DN6R4gU_6Gm7gfkuETzylaE,12103
-duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
-duckrun/semantic_model.py,sha256=X3VKdo4BehAg681Ucq7fzB2KPY2mwPLbfIZqI5Gbqp4,30377
-duckrun/stats.py,sha256=qvWnPk2P8Ob_tzaiNfdQmUQqMVq2FWv3EgArE7hPl44,15482
-duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
-duckrun-0.2.18.dev2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.18.dev2.dist-info/METADATA,sha256=JpewTO7QqHrdUn_G3Lz-1jxFifVyBxj9lNX_Qodhe2A,20807
-duckrun-0.2.18.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.18.dev2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.18.dev2.dist-info/RECORD,,

{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/WHEEL RENAMED Viewed

File without changes

{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{duckrun-0.2.18.dev2.dist-info → duckrun-0.2.19.dev0.dist-info}/top_level.txt RENAMED Viewed

File without changes

duckrun 0.2.18.dev2__py3-none-any.whl → 0.2.19.dev0__py3-none-any.whl

Potentially problematic release.

duckrun 0.2.18.dev2-py3-none-any.whl → 0.2.19.dev0-py3-none-any.whl