duckrun 0.1.6.3__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -8,6 +8,9 @@ from string import Template
 import obstore as obs
 from obstore.store import AzureStore
 
+# Row Group configuration for optimal Delta Lake performance
+RG = 8_000_000
+
 
 class DeltaWriter:
     """Spark-style write API for Delta Lake"""
@@ -48,7 +51,7 @@ class DeltaWriter:
         df = self.relation.record_batch()
 
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
-        write_deltalake(path, df, mode=self._mode)
+        write_deltalake(path, df, mode=self._mode, max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
 
         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
         self.duckrun.con.sql(f"""
@@ -127,77 +130,57 @@ class Duckrun:
         self._attach_lakehouse()
 
     @classmethod
-    def connect(cls, workspace: Union[str, None] = None, lakehouse_name: Optional[str] = None,
-                schema: str = "dbo", sql_folder: Optional[str] = None,
+    def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
                 compaction_threshold: int = 100):
         """
         Create and connect to lakehouse.
 
-        Supports two formats:
-        1. Compact: connect("ws/lh.lakehouse/schema", sql_folder=...) or connect("ws/lh.lakehouse")
-        2. Traditional: connect("ws", "lh", "schema", sql_folder) or connect("ws", "lh")
+        Uses compact format: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
 
         Args:
-            workspace: Workspace name or full path "ws/lh.lakehouse/schema"
-            lakehouse_name: Lakehouse name (optional if using compact format)
-            schema: Schema name (defaults to "dbo")
+            connection_string: OneLake path "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
            sql_folder: Optional path or URL to SQL files folder
            compaction_threshold: File count threshold for compaction
 
         Examples:
-            # Compact format (second param treated as sql_folder if it's a URL/path string)
-            dr = Duckrun.connect("temp/power.lakehouse/wa", "https://github.com/.../sql/")
-            dr = Duckrun.connect("ws/lh.lakehouse/schema", "./sql")
+            dr = Duckrun.connect("ws/lh.lakehouse/schema", sql_folder="./sql")
             dr = Duckrun.connect("ws/lh.lakehouse/schema")  # no SQL folder
-
-            # Traditional format
-            dr = Duckrun.connect("ws", "lh", "schema", "./sql")
-            dr = Duckrun.connect("ws", "lh", "schema")
+            dr = Duckrun.connect("ws/lh.lakehouse")  # defaults to dbo schema
         """
         print("Connecting to Lakehouse...")
 
         scan_all_schemas = False
 
-        # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
-        # If second param looks like a path/URL and not a lakehouse name, treat it as sql_folder
-        if workspace and "/" in workspace and (lakehouse_name is None or
-            (isinstance(lakehouse_name, str) and ('/' in lakehouse_name or lakehouse_name.startswith('http') or lakehouse_name.startswith('.')))):
-
-            # If lakehouse_name looks like a sql_folder, shift it
-            if lakehouse_name and ('/' in lakehouse_name or lakehouse_name.startswith('http') or lakehouse_name.startswith('.')):
-                sql_folder = lakehouse_name
-                lakehouse_name = None
-
-            parts = workspace.split("/")
-            if len(parts) == 2:
-                workspace, lakehouse_name = parts
-                scan_all_schemas = True
-                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...\n")
-            elif len(parts) == 3:
-                workspace, lakehouse_name, schema = parts
-            else:
-                raise ValueError(
-                    f"Invalid connection string format: '{workspace}'. "
-                    "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
-                )
-
-            if lakehouse_name.endswith(".lakehouse"):
-                lakehouse_name = lakehouse_name[:-10]
-        elif lakehouse_name is not None:
-            # Traditional format - check if schema was explicitly provided
-            if schema == "dbo":
-                scan_all_schemas = True
-                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...\n")
+        # Only support compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
+        if not connection_string or "/" not in connection_string:
+            raise ValueError(
+                "Invalid connection string format. "
+                "Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
+            )
+
+        parts = connection_string.split("/")
+        if len(parts) == 2:
+            workspace, lakehouse_name = parts
+            scan_all_schemas = True
+            schema = "dbo"
+            print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+            print(f"   Scanning all schemas for table discovery...\n")
+        elif len(parts) == 3:
+            workspace, lakehouse_name, schema = parts
+        else:
+            raise ValueError(
+                f"Invalid connection string format: '{connection_string}'. "
+                "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
+            )
+
+        if lakehouse_name.endswith(".lakehouse"):
+            lakehouse_name = lakehouse_name[:-10]
 
         if not workspace or not lakehouse_name:
             raise ValueError(
-                "Missing required parameters. Use either:\n"
+                "Missing required parameters. Use compact format:\n"
                 "  connect('workspace/lakehouse.lakehouse/schema', 'sql_folder')\n"
-                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo\n"
-                "  connect('workspace', 'lakehouse', 'schema', 'sql_folder')\n"
-                "  connect('workspace', 'lakehouse')  # defaults to dbo"
+                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo"
             )
 
         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
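
The rewritten `connect` accepts only the compact connection string. A standalone sketch of the parsing rules it enforces, using a hypothetical `parse_connection` helper for illustration:

```python
def parse_connection(connection_string: str):
    """Split 'ws/lh.lakehouse[/schema]' into (workspace, lakehouse, schema)."""
    if not connection_string or "/" not in connection_string:
        raise ValueError("Expected 'ws/lh.lakehouse' or 'ws/lh.lakehouse/schema'")
    parts = connection_string.split("/")
    if len(parts) == 2:
        workspace, lakehouse = parts
        schema = "dbo"  # no schema given: default to dbo, scan all schemas
    elif len(parts) == 3:
        workspace, lakehouse, schema = parts
    else:
        raise ValueError(f"Invalid connection string: '{connection_string}'")
    if lakehouse.endswith(".lakehouse"):
        lakehouse = lakehouse[: -len(".lakehouse")]
    return workspace, lakehouse, schema

assert parse_connection("ws/lh.lakehouse") == ("ws", "lh", "dbo")
assert parse_connection("ws/lh.lakehouse/sales") == ("ws", "lh", "sales")
```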
@@ -210,7 +193,7 @@ class Duckrun:
         if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
             self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
         else:
-            print("Please login to Azure CLI")
+            print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token = credential.get_token("https://storage.azure.com/.default")
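
The reworded message reflects what `ChainedTokenCredential` actually does: each credential in the chain is tried in order, so an existing Azure CLI login is used when available and a browser prompt appears only as a fallback. A minimal sketch of that chain in isolation:

```python
from azure.identity import (
    AzureCliCredential,
    ChainedTokenCredential,
    InteractiveBrowserCredential,
)

# CLI first; the browser credential is only consulted if the CLI fails
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
token = credential.get_token("https://storage.azure.com/.default")
print(token.expires_on)  # epoch seconds; token.token carries the bearer string
```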
@@ -227,7 +210,7 @@ class Duckrun:
         """
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Getting Azure token for table discovery...")
+            print("Authenticating with Azure for table discovery (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
@@ -426,7 +409,7 @@ class Duckrun:
         if mode == 'overwrite':
             self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='overwrite')
+            write_deltalake(path, df, mode='overwrite', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
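
The overwrite branch follows a rewrite-then-refresh pattern: write the Delta table, recreate the DuckDB view over it with `delta_scan`, then vacuum aggressively. A hedged sketch of that flow against a local path (duckrun itself targets OneLake URLs):

```python
import duckdb
from deltalake import DeltaTable, write_deltalake

con = duckdb.connect()
path = "/tmp/demo_delta_table"  # illustrative; not a real OneLake path

# Materialize a query result and overwrite the Delta table with it
df = con.sql("SELECT 42 AS answer").record_batch()
write_deltalake(path, df, mode="overwrite")

# Refresh the view; delta_scan comes from DuckDB's delta extension
con.sql(f"CREATE OR REPLACE VIEW demo AS SELECT * FROM delta_scan('{path}')")

# Aggressive vacuum: drop all unreferenced files immediately
DeltaTable(path).vacuum(retention_hours=0, dry_run=False,
                        enforce_retention_duration=False)
```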
@@ -434,7 +417,7 @@ class Duckrun:
 
         elif mode == 'append':
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='append')
+            write_deltalake(path, df, mode='append', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             if len(dt.file_uris()) > self.compaction_threshold:
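
On append, the table is compacted once its live file count exceeds `compaction_threshold`. A sketch of that check as a standalone helper, assuming deltalake's `DeltaTable.optimize.compact()` API and an illustrative threshold:

```python
from deltalake import DeltaTable

def compact_if_needed(path: str, threshold: int = 100) -> None:
    """Compact a Delta table once it accumulates too many small files."""
    dt = DeltaTable(path)
    if len(dt.file_uris()) > threshold:  # one URI per live data file
        dt.optimize.compact()            # rewrite small files into larger ones
        dt.vacuum(dry_run=False)         # then drop superseded files
```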
@@ -451,7 +434,7 @@ class Duckrun:
             print(f"Table {normalized_table} doesn't exist. Creating...")
             self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='overwrite')
+            write_deltalake(path, df, mode='overwrite', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             dt.vacuum(dry_run=False)
@@ -542,7 +525,7 @@ class Duckrun:
         # Get Azure token
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Getting Azure token for file upload...")
+            print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
@@ -649,7 +632,7 @@ class Duckrun:
         # Get Azure token
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Getting Azure token for file download...")
+            print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
duckrun-0.1.6.3.dist-info/METADATA → duckrun-0.1.8.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.6.3
+Version: 0.1.8
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
@@ -11,7 +11,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: duckdb>=1.2.0
-Requires-Dist: deltalake>=0.18.2
+Requires-Dist: deltalake<=0.18.2
 Requires-Dist: requests>=2.28.0
 Requires-Dist: obstore>=0.2.0
 Provides-Extra: local
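
Note the pin direction flips from `deltalake>=0.18.2` to `deltalake<=0.18.2`, consistent with the row-group parameters now used in `core.py`: the pyarrow-based writer in deltalake 0.18.x accepts them, while later releases move toward a Rust-based writer with a different signature. A small sketch for checking the installed version against the new pin (assumes the `packaging` library is available):

```python
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("deltalake"))
assert installed <= Version("0.18.2"), f"deltalake {installed} violates the pin"
```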
@@ -20,7 +20,7 @@ Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
-Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
+A helper package for stuff that made my life easier when working with Fabric Python notebooks. Just the things that actually made sense to me - nothing fancy
 
 ## Important Notes
 
@@ -30,6 +30,10 @@ Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and
 
 **Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
 
+## What It Does
+
+It does orchestration, arbitrary SQL statements, and file manipulation. That's it - just stuff I encounter in my daily workflow when working with Fabric notebooks.
+
 ## Installation
 
 ```bash
@@ -101,7 +105,7 @@ con.sql("SELECT * FROM dbo_customers").show()
 con.sql("SELECT * FROM bronze_raw_data").show()
 ```
 
-## Two Ways to Use Duckrun
+## Three Ways to Use Duckrun
 
 ### 1. Data Exploration (Spark-Style API)
 
duckrun-0.1.8.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=NrGriuJO7Mh1e9NKplNKkNleUWBpIKG5CwJGj3qNxxw,33334
+duckrun-0.1.8.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.8.dist-info/METADATA,sha256=CsvDljoHqgKfoDDdxHmNoKiR1PJNkqf6ye3hbxWm118,13847
+duckrun-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.8.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.8.dist-info/RECORD,,
duckrun-0.1.6.3.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=CT2NH5hCLsv4uB5zH3VxTuCVQy0nWkPBG-cICLPhG_8,34245
-duckrun-0.1.6.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.6.3.dist-info/METADATA,sha256=ny5DcRSU1B4SdHdJqHCYk0-hNo9-zqFABqMY9ulAVNk,13595
-duckrun-0.1.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.6.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.6.3.dist-info/RECORD,,