duckrun-0.1.5.5-py3-none-any.whl → duckrun-0.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/core.py +112 -87
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/METADATA +34 -10
- duckrun-0.1.6.dist-info/RECORD +7 -0
- duckrun-0.1.5.5.dist-info/RECORD +0 -7
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/WHEEL +0 -0
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/top_level.txt +0 -0
duckrun/core.py
CHANGED
@@ -5,6 +5,8 @@ import importlib.util
 from deltalake import DeltaTable, write_deltalake
 from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
+import obstore as obs
+from obstore.store import AzureStore
 
 
 class DeltaWriter:
@@ -13,7 +15,7 @@ class DeltaWriter:
     def __init__(self, relation, duckrun_instance):
         self.relation = relation
         self.duckrun = duckrun_instance
-        self._format = "delta"
+        self._format = "delta"
         self._mode = "overwrite"
 
     def format(self, format_type: str):
@@ -32,46 +34,35 @@ class DeltaWriter:
 
     def saveAsTable(self, table_name: str):
         """Save query result as Delta table"""
-        # Format defaults to "delta", so no need to check
         if self._format != "delta":
             raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
 
-        # Parse schema.table or use default schema
         if "." in table_name:
             schema, table = table_name.split(".", 1)
         else:
             schema = self.duckrun.schema
             table = table_name
 
-        # Ensure OneLake secret is created
         self.duckrun._create_onelake_secret()
-
-        # Build path
         path = f"{self.duckrun.table_base_url}{schema}/{table}"
-
-        # Execute query and get result
         df = self.relation.record_batch()
 
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
-
-        # Write to Delta
         write_deltalake(path, df, mode=self._mode)
 
-        # Create or replace view in DuckDB
         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
         self.duckrun.con.sql(f"""
             CREATE OR REPLACE VIEW {table}
             AS SELECT * FROM delta_scan('{path}')
         """)
 
-        # Optimize if needed
         dt = DeltaTable(path)
 
         if self._mode == "overwrite":
             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
             dt.cleanup_metadata()
             print(f"✅ Table {schema}.{table} created/overwritten")
-        else:
+        else:
             file_count = len(dt.file_uris())
             if file_count > self.duckrun.compaction_threshold:
                 print(f"Compacting {schema}.{table} ({file_count} files)")
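For context, the writer this hunk streamlines is the Spark-style API reached through `dr.sql(...).write`. A minimal usage sketch, with placeholder workspace, lakehouse, and table names (only the `write.mode(...).saveAsTable(...)` calls are taken from the package's own docstrings):

```python
import duckrun

# Placeholder workspace/lakehouse names for illustration only
con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")

# saveAsTable accepts "table" (uses the connection's schema) or "schema.table"
con.sql("SELECT 43 AS value").write.mode("append").saveAsTable("test")
con.sql("SELECT * FROM test").write.mode("overwrite").saveAsTable("bronze.test_copy")
```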
@@ -112,7 +103,7 @@ class Duckrun:
     Usage:
         # For pipelines:
         dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
-        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema
+        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema, lists all tables
         dr.run(pipeline)
 
         # For data exploration with Spark-style API:
@@ -122,12 +113,14 @@ class Duckrun:
     """
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
-                 sql_folder: Optional[str] = None, compaction_threshold: int = 10):
+                 sql_folder: Optional[str] = None, compaction_threshold: int = 10,
+                 scan_all_schemas: bool = False):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
         self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
+        self.scan_all_schemas = scan_all_schemas
         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
@@ -144,29 +137,21 @@ class Duckrun:
        1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
        2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
 
-        Schema defaults to "dbo" if not specified.
-
-        Examples:
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # uses dbo
-            dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
-            dr = Duckrun.connect("myworkspace", "mylakehouse")  # uses dbo
-            dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
+        Schema defaults to "dbo" if not specified. When no schema is provided,
+        all tables across all schemas will be listed, but operations will use "dbo".
        """
        print("Connecting to Lakehouse...")
 
-
+        scan_all_schemas = False
+
        if workspace and "/" in workspace and lakehouse_name is None:
            parts = workspace.split("/")
            if len(parts) == 2:
-                # Format: "ws/lh.lakehouse" (schema will use default)
                workspace, lakehouse_name = parts
-
-                print(f"ℹ️ No schema specified. Using default schema 'dbo'.")
-                print(f"
-                print(f" Note: Scanning all schemas will be added in a future update.\n")
+                scan_all_schemas = True
+                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+                print(f" Scanning all schemas for table discovery...\n")
            elif len(parts) == 3:
-                # Format: "ws/lh.lakehouse/schema"
                workspace, lakehouse_name, schema = parts
            else:
                raise ValueError(
@@ -174,21 +159,24 @@ class Duckrun:
                    "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
                )
 
-        # Remove .lakehouse suffix if present
        if lakehouse_name.endswith(".lakehouse"):
            lakehouse_name = lakehouse_name[:-10]
+        elif lakehouse_name is not None:
+            if schema == "dbo":
+                scan_all_schemas = True
+                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+                print(f" Scanning all schemas for table discovery...\n")
 
-        # Validate all required parameters are present
        if not workspace or not lakehouse_name:
            raise ValueError(
                "Missing required parameters. Use either:\n"
                " connect('workspace/lakehouse.lakehouse/schema')\n"
-                " connect('workspace/lakehouse.lakehouse')  # defaults to dbo\n"
+                " connect('workspace/lakehouse.lakehouse')  # defaults to dbo, lists all\n"
                " connect('workspace', 'lakehouse', 'schema')\n"
-                " connect('workspace', 'lakehouse')  # defaults to dbo"
+                " connect('workspace', 'lakehouse')  # defaults to dbo, lists all"
            )
 
-        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
+        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
 
    def _get_storage_token(self):
        return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
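For reference, both calling conventions that the parser above accepts, with placeholder workspace, lakehouse, and schema names:

```python
import duckrun

# Compact form: "workspace/lakehouse.lakehouse[/schema]"
dr = duckrun.connect("my_workspace/my_lakehouse.lakehouse/bronze")

# Traditional form: separate positional arguments
dr = duckrun.connect("my_workspace", "my_lakehouse", "bronze")

# No schema: operations default to 'dbo' and all schemas are scanned for tables
dr = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
```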
@@ -205,52 +193,107 @@ class Duckrun:
         os.environ["AZURE_STORAGE_TOKEN"] = token.token
         self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
 
+    def _discover_tables_fast(self) -> List[Tuple[str, str]]:
+        """
+        Fast Delta table discovery using obstore with list_with_delimiter.
+        Only lists directories, not files - super fast!
+
+        Returns:
+            List of tuples: [(schema, table_name), ...]
+        """
+        token = self._get_storage_token()
+        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+            print("Getting Azure token for table discovery...")
+            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+            token_obj = credential.get_token("https://storage.azure.com/.default")
+            token = token_obj.token
+            os.environ["AZURE_STORAGE_TOKEN"] = token
+
+        url = f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/"
+        store = AzureStore.from_url(url, bearer_token=token)
+
+        base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
+        tables_found = []
+
+        if self.scan_all_schemas:
+            # Discover all schemas first
+            print("🔍 Discovering schemas...")
+            schemas_result = obs.list_with_delimiter(store, prefix=base_path)
+            schemas = [
+                prefix.rstrip('/').split('/')[-1]
+                for prefix in schemas_result['common_prefixes']
+            ]
+            print(f" Found {len(schemas)} schemas: {', '.join(schemas)}\n")
+
+            # Discover tables in each schema
+            print("🔍 Discovering tables...")
+            for schema_name in schemas:
+                schema_path = f"{base_path}{schema_name}/"
+                result = obs.list_with_delimiter(store, prefix=schema_path)
+
+                for table_prefix in result['common_prefixes']:
+                    table_name = table_prefix.rstrip('/').split('/')[-1]
+                    # Skip non-table directories
+                    if table_name not in ('metadata', 'iceberg'):
+                        tables_found.append((schema_name, table_name))
+        else:
+            # Scan specific schema only
+            print(f"🔍 Discovering tables in schema '{self.schema}'...")
+            schema_path = f"{base_path}{self.schema}/"
+            result = obs.list_with_delimiter(store, prefix=schema_path)
+
+            for table_prefix in result['common_prefixes']:
+                table_name = table_prefix.rstrip('/').split('/')[-1]
+                if table_name not in ('metadata', 'iceberg'):
+                    tables_found.append((self.schema, table_name))
+
+        return tables_found
+
     def _attach_lakehouse(self):
+        """Attach lakehouse tables as DuckDB views using fast discovery"""
         self._create_onelake_secret()
+
         try:
-
-            # This avoids parsing JSON content that causes Iceberg metadata issues
-            print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
-
-            list_tables_query = f"""
-                SELECT DISTINCT
-                    regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
-                FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
-                WHERE file LIKE '%/_delta_log/%'
-                AND file NOT LIKE '%/metadata/%'
-                AND file NOT LIKE '%/iceberg/%'
-                AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
-            """
-
-            list_tables_df = self.con.sql(list_tables_query).df()
+            tables = self._discover_tables_fast()
 
-            if
-
+            if not tables:
+                if self.scan_all_schemas:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
+                else:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                 return
 
-
-
-
-
-            for table in table_names:
-                # Skip Iceberg-related folders and empty names
-                if not table or table in ('metadata', 'iceberg'):
-                    continue
-
+            print(f"\n📊 Found {len(tables)} Delta tables. Attaching as views...\n")
+
+            attached_count = 0
+            for schema_name, table_name in tables:
                 try:
+                    view_name = f"{schema_name}_{table_name}" if self.scan_all_schemas else table_name
+
                     self.con.sql(f"""
-                        CREATE OR REPLACE VIEW {
-                        AS SELECT * FROM delta_scan('{self.table_base_url}{
+                        CREATE OR REPLACE VIEW {view_name}
+                        AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
-                    print(f" ✓ Attached: {
+                    print(f" ✓ Attached: {schema_name}.{table_name} → {view_name}")
+                    attached_count += 1
                 except Exception as e:
-                    print(f" ⚠ Skipped {
+                    print(f" ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
                     continue
 
-            print("\
-
+            print(f"\n{'='*60}")
+            print(f"✅ Successfully attached {attached_count}/{len(tables)} tables")
+            print(f"{'='*60}\n")
+
+            print("Available views in DuckDB:")
+            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
+
+            if self.scan_all_schemas:
+                print(f"\n💡 Note: Tables are prefixed with schema (e.g., dbo_tablename)")
+                print(f" Default schema for operations: {self.schema}\n")
+
         except Exception as e:
-            print(f"Error attaching lakehouse: {e}")
+            print(f"❌ Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
 
     def _normalize_table_name(self, name: str) -> str:
@@ -284,7 +327,6 @@ class Duckrun:
             print(f"SQL file is empty: {table_name}.sql")
             return None
 
-        # Auto-inject common params, merge with user params
         full_params = {
             'ws': self.workspace,
             'lh': self.lakehouse_name,
@@ -407,18 +449,9 @@ class Duckrun:
 
         Returns:
             True if all tasks succeeded
-
-        Example:
-            pipeline = [
-                ('download', (urls, paths, depth)),
-                ('staging', 'overwrite', {'run_date': '2024-06-01'}),
-                ('transform', 'append'),  # {} optional!
-                ('calendar', 'ignore')  # {} optional!
-            ]
-            dr.run(pipeline)
         """
         if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.
+            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
 
         for i, task in enumerate(pipeline, 1):
             print(f"\n{'='*60}")
@@ -427,18 +460,14 @@ class Duckrun:
 
             try:
                 if len(task) == 2:
-                    # Could be Python: ('name', (args,)) or SQL: ('table', 'mode')
                     name, second = task
                     if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
-                        # SQL task without params: ('table', 'mode')
                        self._run_sql(name, second, {})
                     else:
-                        # Python task: ('name', (args,))
                        args = second if isinstance(second, (tuple, list)) else (second,)
                        self._run_python(name, tuple(args))
 
                 elif len(task) == 3:
-                    # SQL task with params: ('table', 'mode', {params})
                     table, mode, params = task
                     if not isinstance(params, dict):
                         raise ValueError(f"Expected dict for params, got {type(params)}")
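The dispatch above distinguishes tasks by tuple length and type. A hedged illustration of the pipeline shapes it accepts (task names, URL, and parameters are made up; each named task is assumed to have a matching SQL or Python definition under the connection's `sql_folder`):

```python
# Illustrative pipeline only; con is a duckrun connection created with sql_folder=...
pipeline = [
    ("download", ("https://example.com/data.csv", "/tmp/raw")),  # Python task: (name, (args, ...))
    ("staging", "overwrite", {"run_date": "2024-06-01"}),        # SQL task with params
    ("transform", "append"),                                     # SQL task, params default to {}
]
con.run(pipeline)
```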
@@ -461,13 +490,9 @@ class Duckrun:
         Execute raw SQL query with Spark-style write API.
 
         Example:
-            # Traditional DuckDB style
             dr.sql("SELECT * FROM table").show()
             df = dr.sql("SELECT * FROM table").df()
-
-            # New Spark-style write API (format is optional, defaults to delta)
             dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
-            dr.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
         """
         relation = self.con.sql(query)
         return QueryResult(relation, self)
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/METADATA
CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.5
+Version: 0.1.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
-License
+License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
 Project-URL: Repository, https://github.com/djouallah/duckrun
 Project-URL: Issues, https://github.com/djouallah/duckrun/issues
@@ -13,9 +13,12 @@ License-File: LICENSE
 Requires-Dist: duckdb>=1.2.0
 Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
+Requires-Dist: obstore>=0.2.0
+Provides-Extra: local
+Requires-Dist: azure-identity>=1.12.0; extra == "local"
 Dynamic: license-file
 
-<img src="duckrun.png" width="400" alt="Duckrun">
+<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
 
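Side note on the new optional dependency group: given the `Provides-Extra: local` / `azure-identity` pair above, the interactive credential fallback used by table discovery would presumably be installed with `pip install duckrun[local]` (inferred from the metadata, not stated in the diff).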
@@ -38,10 +41,11 @@ pip install duckrun
 ```python
 import duckrun
 
-# Connect to your Fabric lakehouse
+# Connect to your Fabric lakehouse with a specific schema
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
 
-# Schema defaults to 'dbo' if not specified
+# Schema defaults to 'dbo' if not specified (scans all schemas)
+# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
 
 # Explore data
@@ -56,17 +60,37 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-# With schema
+# With schema (recommended for better performance)
 con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
 
-# Without schema (
+# Without schema (defaults to 'dbo', scans all schemas)
+# ⚠️ This can be slow for large lakehouses!
 con = duckrun.connect("workspace/lakehouse.lakehouse")
 
 # With options
 con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
-
+### Multi-Schema Support
+
+When you don't specify a schema, Duckrun will:
+- **Default to `dbo`** for write operations
+- **Scan all schemas** to discover and attach all Delta tables
+- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
+
+**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
+
+```python
+# Fast: scans only 'dbo' schema
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Slower: scans all schemas
+con = duckrun.connect("workspace/lakehouse.lakehouse")
+
+# Query tables from different schemas (when scanning all)
+con.sql("SELECT * FROM dbo_customers").show()
+con.sql("SELECT * FROM bronze_raw_data").show()
+```
 
 ## Two Ways to Use Duckrun
@@ -262,7 +286,7 @@ con = duckrun.connect(
 ```python
 import duckrun
 
-# Connect
+# Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
 # Pipeline with mixed tasks
@@ -297,7 +321,7 @@ con.sql("""
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
-2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views
 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
duckrun-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=H7Q-mvE5ET3mdEi7VTubWdaCrgVaJW9G0LfAu0Gpw-g,21872
+duckrun-0.1.6.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.6.dist-info/METADATA,sha256=20vTn4-9fn8iqwXGjYT3IQd9Xk47sQAD-Tv3wk2Pp9I,9356
+duckrun-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.6.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.6.dist-info/RECORD,,
duckrun-0.1.5.5.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=0Jo7zkVuTvdPPt-ubUhy5996oAm4VffZrH6K1AUw7wE,20804
-duckrun-0.1.5.5.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.5.5.dist-info/METADATA,sha256=0fp-MgKtZuxYBxvXtGOpUsK4aJbaobLckzFfq-LMu4o,8201
-duckrun-0.1.5.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.5.5.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.5.5.dist-info/RECORD,,
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/WHEEL: file without changes
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/licenses/LICENSE: file without changes
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/top_level.txt: file without changes