PyPI - duckrun - Versions diffs - 0.2.14.dev3__py3-none-any.whl → 0.2.15__py3-none-any.whl - Mend

duckrun 0.2.14.dev3__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of duckrun might be problematic. Click here for more details.

Files changed (8)

duckrun/core.py +38 -14
duckrun/stats.py +77 -16
{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/METADATA +1 -1
duckrun-0.2.15.dist-info/RECORD +14 -0
duckrun-0.2.14.dev3.dist-info/RECORD +0 -14
{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/WHEEL +0 -0
{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/licenses/LICENSE +0 -0
{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/top_level.txt +0 -0

duckrun/core.py CHANGED Viewed

@@ -817,16 +817,16 @@ class Duckrun:
         # Fallback to original value
         return self.workspace_id
-    def get_lakehouse_id(self, force: bool = False) -> str:
+    def get_item_id(self, force: bool = False) -> str:
         """
-        Get the lakehouse ID (GUID or name).
-        Use this when passing lakehouse parameter to Python functions.
+        Get the item ID (GUID or name) - works for lakehouses, warehouses, databases, etc.
+        Use this when passing lakehouse/item parameter to Python functions.
         Args:
             force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
         Returns:
-            Lakehouse ID - either a GUID or lakehouse name
+            Item ID - either a GUID or item name (supports all OneLake item types)
         """
         if not force:
             return self.lakehouse_id
@@ -839,14 +839,24 @@ class Duckrun:
         if guid_pattern.match(self.lakehouse_id):
             return self.lakehouse_id
-        # Try to get from notebook context first (fastest)
-        try:
-            import notebookutils  # type: ignore
-            lakehouse_guid = notebookutils.lakehouse.get("id")
-            if lakehouse_guid:
-                return lakehouse_guid
-        except (ImportError, Exception):
-            pass
+        # Detect item type from lakehouse_id (e.g., "data.Lakehouse" -> Lakehouse)
+        item_type = None
+        item_name = self.lakehouse_id
+        for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
+            if self.lakehouse_id.endswith(suffix):
+                item_type = suffix[1:]  # Remove the leading dot
+                item_name = self.lakehouse_id[:-len(suffix)]
+                break
+        # Try to get from notebook context first (only works for lakehouses)
+        if item_type == 'Lakehouse' or item_type is None:
+            try:
+                import notebookutils  # type: ignore
+                lakehouse_guid = notebookutils.lakehouse.get("id")
+                if lakehouse_guid:
+                    return lakehouse_guid
+            except (ImportError, Exception):
+                pass
         # Resolve via API
         try:
@@ -855,8 +865,15 @@ class Duckrun:
             if token:
                 # First get workspace GUID
                 workspace_guid = self.get_workspace_id(force=True)
-                # Then resolve lakehouse name to ID
-                resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, self.lakehouse_id)
+                # Use appropriate resolver based on item type
+                if item_type == 'Lakehouse' or item_type is None:
+                    # Use lakehouse-specific API
+                    resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, item_name if item_name else self.lakehouse_id)
+                else:
+                    # Use generic items API for warehouses, databases, etc.
+                    resolved_id = self._resolve_item_id_by_name(token, workspace_guid, item_name, item_type)
                 if resolved_id:
                     return resolved_id
         except Exception:
@@ -864,6 +881,13 @@ class Duckrun:
         # Fallback to original value
         return self.lakehouse_id
+    def get_lakehouse_id(self, force: bool = False) -> str:
+        """
+        Deprecated: Use get_item_id() instead.
+        Backward compatibility alias for get_item_id().
+        """
+        return self.get_item_id(force)
     def run(self, pipeline: List[Tuple]) -> bool:
         """

duckrun/stats.py CHANGED Viewed

@@ -142,7 +142,9 @@ def get_stats(duckrun_instance, source: str):
     print(f"Processing {len(list_tables)} tables: {list_tables}")
+    successful_tables = []
     for idx, tbl in enumerate(list_tables):
+        print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
         # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
         table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
@@ -210,23 +212,82 @@ def get_stats(duckrun_instance, source: str):
                 ''')
         except Exception as e:
-            print(f"Warning: Could not process table '{tbl}': {e}")
-            # Create empty temp table for failed tables
-            con.execute(f'''
-                CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
-                SELECT
-                    '{tbl}' as tbl,
-                    'error' as file_name,
-                    0 as num_rows,
-                    0 as num_row_groups,
-                    0 as size,
-                    false as vorder,
-                    '{timestamp}' as timestamp
-                WHERE false
-            ''')
+            error_msg = str(e)
+            print(f"Warning: Could not process table '{tbl}' using DeltaTable API: {e}")
+            # Fallback: Use DuckDB's delta_scan with filename parameter
+            if "Invalid JSON" in error_msg or "MetadataValue" in error_msg:
+                print(f"   Detected JSON parsing issue - falling back to DuckDB delta_scan")
+            else:
+                print(f"   Falling back to DuckDB delta_scan")
+            try:
+                # First get the list of actual parquet files using delta_scan
+                file_list_result = con.execute(f'''
+                    SELECT DISTINCT filename
+                    FROM delta_scan('{table_path}', filename=1)
+                ''').fetchall()
+                if not file_list_result:
+                    # Empty table
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{tbl}' as tbl,
+                            'empty' as file_name,
+                            0 as num_rows,
+                            0 as num_row_groups,
+                            0 as size,
+                            false as vorder,
+                            '{timestamp}' as timestamp
+                        WHERE false
+                    ''')
+                else:
+                    # Extract just the filename (not the full path) from delta_scan results
+                    # delta_scan returns full ABFSS paths, we need to extract just the filename part
+                    filenames = []
+                    for row in file_list_result:
+                        full_path = row[0]
+                        # Extract just the filename from the full ABFSS path
+                        if '/' in full_path:
+                            filename = full_path.split('/')[-1]
+                        else:
+                            filename = full_path
+                        filenames.append(table_path + "/" + filename)
+                    # Use parquet_file_metadata to get actual parquet stats
+                    con.execute(f'''
+                        CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+                        SELECT
+                            '{tbl}' as tbl,
+                            file_name,
+                            num_rows,
+                            num_row_groups,
+                            0 as size,
+                            false as vorder,
+                            '{timestamp}' as timestamp
+                        FROM parquet_file_metadata({filenames})
+                    ''')
+                print(f"   ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
+            except Exception as fallback_error:
+                print(f"   ✗ DuckDB fallback also failed for '{tbl}': {fallback_error}")
+                print(f"   ⏭️  Skipping table '{tbl}'")
+                continue
+        # Mark this table as successfully processed
+        successful_tables.append(idx)
+    # Only union tables that were successfully processed
+    if not successful_tables:
+        # No tables were processed successfully - return empty dataframe
+        print("⚠️  No tables could be processed successfully")
+        import pandas as pd
+        return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
+                                     'average_row_group', 'file_size_MB', 'vorder', 'timestamp'])
-    # Union all temp tables
-    union_parts = [f'SELECT * FROM tbl_{i}' for i in range(len(list_tables))]
+    # Union all successfully processed temp tables
+    union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
     union_query = ' UNION ALL '.join(union_parts)
     # Generate final summary

{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.14.dev3
+Version: 0.2.15
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT

duckrun-0.2.15.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,14 @@
+duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
+duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
+duckrun/core.py,sha256=c98sASAWlq0DDIR9gYbj5ZaKOa6MoO8Z09qhRhG4JWI,67097
+duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
+duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
+duckrun/stats.py,sha256=xqgtW_HHAizom6E13_UjitNgmz6pzK10XdosPWJO1Ew,14282
+duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
+duckrun-0.2.15.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.15.dist-info/METADATA,sha256=xExTRo--bAjK6Ioq7O6F_641ZkVGgHj3_d-jHO9tadE,20766
+duckrun-0.2.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.15.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.15.dist-info/RECORD,,

duckrun-0.2.14.dev3.dist-info/RECORD DELETED Viewed

@@ -1,14 +0,0 @@
-duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
-duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
-duckrun/core.py,sha256=_D0CnaRNQm_wW4bSP__EAPHEt_VNgf9N-VXWYSZScL8,65829
-duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
-duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
-duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
-duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
-duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
-duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
-duckrun-0.2.14.dev3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.14.dev3.dist-info/METADATA,sha256=tOLtAIHcEJyXk93hvvgZNC3Cx7U2Dy7iatRutBnrU3Y,20771
-duckrun-0.2.14.dev3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.14.dev3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.14.dev3.dist-info/RECORD,,

{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/WHEEL RENAMED Viewed

File without changes

{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{duckrun-0.2.14.dev3.dist-info → duckrun-0.2.15.dist-info}/top_level.txt RENAMED Viewed

File without changes

duckrun 0.2.14.dev3__py3-none-any.whl → 0.2.15__py3-none-any.whl

Potentially problematic release.

duckrun 0.2.14.dev3__py3-none-any.whl → 0.2.15__py3-none-any.whl