duckrun 0.1.5.5-py3-none-any.whl → 0.1.5.6-py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
duckrun/core.py CHANGED
@@ -112,7 +112,7 @@ class Duckrun:
     Usage:
         # For pipelines:
         dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
-        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema
+        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema, lists all tables
         dr.run(pipeline)
 
         # For data exploration with Spark-style API:
@@ -122,12 +122,14 @@ class Duckrun:
     """
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
-                 sql_folder: Optional[str] = None, compaction_threshold: int = 10):
+                 sql_folder: Optional[str] = None, compaction_threshold: int = 10,
+                 scan_all_schemas: bool = False):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
         self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
+        self.scan_all_schemas = scan_all_schemas
         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
@@ -144,27 +146,31 @@ class Duckrun:
         1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
         2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
 
-        Schema defaults to "dbo" if not specified.
+        Schema defaults to "dbo" if not specified. When no schema is provided,
+        all tables across all schemas will be listed, but operations will use "dbo".
 
         Examples:
            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
-           dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # uses dbo
+           dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # lists all, uses dbo
            dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
-           dr = Duckrun.connect("myworkspace", "mylakehouse")  # uses dbo
+           dr = Duckrun.connect("myworkspace", "mylakehouse")  # lists all, uses dbo
            dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
         """
         print("Connecting to Lakehouse...")
 
+        scan_all_schemas = False
+
         # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
         if workspace and "/" in workspace and lakehouse_name is None:
             parts = workspace.split("/")
             if len(parts) == 2:
                 # Format: "ws/lh.lakehouse" (schema will use default)
                 workspace, lakehouse_name = parts
-                # schema already has default value "dbo"
-                print(f"ℹ️  No schema specified. Using default schema 'dbo'.")
-                print(f"   To specify a schema, use: {workspace}/{lakehouse_name}.lakehouse/schema")
-                print(f"   Note: Scanning all schemas will be added in a future update.\n")
+                scan_all_schemas = True
+                print(f"ℹ️  No schema specified. Using default schema 'dbo' for operations.")
+                print(f"   Scanning all schemas for table discovery...")
+                print(f"   ⚠️  WARNING: Scanning all schemas can be slow for large lakehouses!")
+                print(f"   💡 For better performance, specify a schema: {workspace}/{lakehouse_name}.lakehouse/schema\n")
             elif len(parts) == 3:
                 # Format: "ws/lh.lakehouse/schema"
                 workspace, lakehouse_name, schema = parts
@@ -177,18 +183,27 @@ class Duckrun:
             # Remove .lakehouse suffix if present
             if lakehouse_name.endswith(".lakehouse"):
                 lakehouse_name = lakehouse_name[:-10]
+        elif lakehouse_name is not None:
+            # Traditional format used, check if schema was explicitly provided
+            # If schema is still "dbo" (default), scan all schemas
+            if schema == "dbo":
+                scan_all_schemas = True
+                print(f"ℹ️  No schema specified. Using default schema 'dbo' for operations.")
+                print(f"   Scanning all schemas for table discovery...")
+                print(f"   ⚠️  WARNING: Scanning all schemas can be slow for large lakehouses!")
+                print(f"   💡 For better performance, specify a schema explicitly.\n")
 
         # Validate all required parameters are present
         if not workspace or not lakehouse_name:
             raise ValueError(
                 "Missing required parameters. Use either:\n"
                 "  connect('workspace/lakehouse.lakehouse/schema')\n"
-                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo\n"
+                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo, lists all\n"
                 "  connect('workspace', 'lakehouse', 'schema')\n"
-                "  connect('workspace', 'lakehouse')  # defaults to dbo"
+                "  connect('workspace', 'lakehouse')  # defaults to dbo, lists all"
             )
 
-        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
+        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
 
     def _get_storage_token(self):
         return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
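The `connect()` hunks above infer `scan_all_schemas` from the shape of the call instead of exposing it as a parameter of `connect()` itself. A condensed, standalone sketch of that parsing rule (the `spec` string is hypothetical; variable names mirror the diff):

```python
# Two path segments -> no schema given: default to "dbo" and scan all schemas.
# Three segments -> explicit schema: scan only that one.
spec = "myworkspace/mylakehouse.lakehouse/bronze"  # hypothetical input
parts = spec.split("/")
if len(parts) == 2:
    workspace, lakehouse_name = parts
    schema, scan_all_schemas = "dbo", True
else:
    workspace, lakehouse_name, schema = parts
    scan_all_schemas = False
if lakehouse_name.endswith(".lakehouse"):
    lakehouse_name = lakehouse_name[:-10]  # strip the ".lakehouse" suffix
print(workspace, lakehouse_name, schema, scan_all_schemas)
# -> myworkspace mylakehouse bronze False
```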
@@ -208,47 +223,77 @@ class Duckrun:
     def _attach_lakehouse(self):
         self._create_onelake_secret()
         try:
-            # Use expensive list operation but filter for _delta_log folders only
-            # This avoids parsing JSON content that causes Iceberg metadata issues
-            print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
-
-            list_tables_query = f"""
-                SELECT DISTINCT
-                    regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
-                FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
-                WHERE file LIKE '%/_delta_log/%'
-                    AND file NOT LIKE '%/metadata/%'
-                    AND file NOT LIKE '%/iceberg/%'
-                    AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
-            """
+            if self.scan_all_schemas:
+                # Scan all schemas
+                print(f"⚠️  Scanning for Delta tables across all schemas...")
+                print(f"   This may take a while for large lakehouses with many schemas/tables.")
+
+                list_tables_query = f"""
+                    SELECT DISTINCT
+                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) as schema_name,
+                        regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) as table_name
+                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/**")
+                    WHERE file LIKE '%/_delta_log/%'
+                        AND file NOT LIKE '%/metadata/%'
+                        AND file NOT LIKE '%/iceberg/%'
+                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) IS NOT NULL
+                        AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) IS NOT NULL
+                    ORDER BY schema_name, table_name
+                """
+            else:
+                # Scan specific schema only
+                print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
+
+                list_tables_query = f"""
+                    SELECT DISTINCT
+                        '{self.schema}' as schema_name,
+                        regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
+                    FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
+                    WHERE file LIKE '%/_delta_log/%'
+                        AND file NOT LIKE '%/metadata/%'
+                        AND file NOT LIKE '%/iceberg/%'
+                        AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
+                """
 
             list_tables_df = self.con.sql(list_tables_query).df()
 
             if list_tables_df.empty:
-                print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}.")
+                if self.scan_all_schemas:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
+                else:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                 return
 
-            table_names = list_tables_df['table_name'].tolist()
+            print(f"Found {len(list_tables_df)} Delta tables. Attaching as views...\n")
 
-            print(f"Found {len(table_names)} Delta tables. Attaching as views...")
-
-            for table in table_names:
+            for _, row in list_tables_df.iterrows():
+                schema_name = row['schema_name']
+                table_name = row['table_name']
+
                 # Skip Iceberg-related folders and empty names
-                if not table or table in ('metadata', 'iceberg'):
+                if not table_name or table_name in ('metadata', 'iceberg'):
                     continue
 
                 try:
+                    # Create view with schema prefix to avoid conflicts
+                    view_name = f"{schema_name}_{table_name}" if self.scan_all_schemas else table_name
+
                     self.con.sql(f"""
-                        CREATE OR REPLACE VIEW {table}
-                        AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
+                        CREATE OR REPLACE VIEW {view_name}
+                        AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
-                    print(f"  ✓ Attached: {table}")
+                    print(f"  ✓ Attached: {schema_name}.{table_name} → {view_name}")
                 except Exception as e:
-                    print(f"  ⚠ Skipped {table}: {str(e)[:100]}")
+                    print(f"  ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
                     continue
 
             print("\nAttached tables (views) in DuckDB:")
-            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
+            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
+
+            if self.scan_all_schemas:
+                print(f"\nNote: Tables are prefixed with schema (e.g., dbo_tablename)")
+                print(f"      Default schema for operations: {self.schema}")
+
         except Exception as e:
             print(f"Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
duckrun-0.1.5.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.5
+Version: 0.1.5.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License-Expression: MIT
@@ -15,7 +15,7 @@ Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
 Dynamic: license-file
 
-<img src="duckrun.png" width="400" alt="Duckrun">
+<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
 
@@ -38,10 +38,11 @@ pip install duckrun
 ```python
 import duckrun
 
-# Connect to your Fabric lakehouse
+# Connect to your Fabric lakehouse with a specific schema
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
 
-# Schema defaults to 'dbo' if not specified
+# Schema defaults to 'dbo' if not specified (scans all schemas)
+# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
 
 # Explore data
@@ -56,17 +57,37 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-# With schema
+# With schema (recommended for better performance)
 con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
 
-# Without schema (uses 'dbo' by default)
+# Without schema (defaults to 'dbo', scans all schemas)
+# ⚠️ This can be slow for large lakehouses!
 con = duckrun.connect("workspace/lakehouse.lakehouse")
 
 # With options
 con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
-**Note:** When schema is not specified, Duckrun defaults to `dbo`. Multi-schema scanning will be added in a future update.
+### Multi-Schema Support
+
+When you don't specify a schema, Duckrun will:
+- **Default to `dbo`** for write operations
+- **Scan all schemas** to discover and attach all Delta tables
+- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
+
+**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
+
+```python
+# Fast: scans only 'dbo' schema
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Slower: scans all schemas
+con = duckrun.connect("workspace/lakehouse.lakehouse")
+
+# Query tables from different schemas (when scanning all)
+con.sql("SELECT * FROM dbo_customers").show()
+con.sql("SELECT * FROM bronze_raw_data").show()
+```
 
 ## Two Ways to Use Duckrun
 
@@ -262,7 +283,7 @@ con = duckrun.connect(
 ```python
 import duckrun
 
-# Connect
+# Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
 # Pipeline with mixed tasks
@@ -297,7 +318,7 @@ con.sql("""
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
-2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views
 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
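Step 2 of the list above is where the new multi-schema behavior lives. A hypothetical, self-contained illustration of the naming rule the discovery loop applies when attaching views (mirroring the `view_name` expression in the core.py hunk):

```python
# When all schemas are scanned, views get a schema prefix so same-named
# tables in different schemas don't collide inside one DuckDB catalog.
def view_name(schema_name: str, table_name: str, scan_all_schemas: bool) -> str:
    return f"{schema_name}_{table_name}" if scan_all_schemas else table_name

assert view_name("bronze", "raw_data", True) == "bronze_raw_data"
assert view_name("dbo", "customers", True) == "dbo_customers"
assert view_name("dbo", "customers", False) == "customers"  # single-schema scan
```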
duckrun-0.1.5.6.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=AjaY3fkbO2S9rCejy-gF06UgQ13J1K6gBAp_AEwcyRs,23762
+duckrun-0.1.5.6.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.5.6.dist-info/METADATA,sha256=bGr8L2ZCLOqVtvUtcpBQPxtLgkiZAhy7lOq0U4KtTSI,9258
+duckrun-0.1.5.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.5.6.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.5.6.dist-info/RECORD,,
duckrun-0.1.5.5.dist-info/RECORD REMOVED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=0Jo7zkVuTvdPPt-ubUhy5996oAm4VffZrH6K1AUw7wE,20804
-duckrun-0.1.5.5.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.5.5.dist-info/METADATA,sha256=0fp-MgKtZuxYBxvXtGOpUsK4aJbaobLckzFfq-LMu4o,8201
-duckrun-0.1.5.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.5.5.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.5.5.dist-info/RECORD,,