duckrun-0.1.5.1-py3-none-any.whl → duckrun-0.1.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -111,16 +111,17 @@ class Duckrun:
111
111
 
112
112
  Usage:
113
113
  # For pipelines:
114
- dr = Duckrun.connect(workspace, lakehouse, schema, sql_folder)
114
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
115
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse") # defaults to dbo schema
115
116
  dr.run(pipeline)
116
117
 
117
118
  # For data exploration with Spark-style API:
118
- dr = Duckrun.connect(workspace, lakehouse, schema)
119
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse")
119
120
  dr.sql("SELECT * FROM table").show()
120
121
  dr.sql("SELECT 43").write.mode("append").saveAsTable("test")
121
122
  """
122
123
 
123
- def __init__(self, workspace: str, lakehouse_name: str, schema: str,
124
+ def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
124
125
  sql_folder: Optional[str] = None, compaction_threshold: int = 10):
125
126
  self.workspace = workspace
126
127
  self.lakehouse_name = lakehouse_name
@@ -133,10 +134,57 @@ class Duckrun:
133
134
  self._attach_lakehouse()
134
135
 
135
136
  @classmethod
136
- def connect(cls, workspace: str, lakehouse_name: str, schema: str,
137
- sql_folder: Optional[str] = None, compaction_threshold: int = 100):
138
- """Create and connect to lakehouse"""
137
+ def connect(cls, workspace: Union[str, None] = None, lakehouse_name: Optional[str] = None,
138
+ schema: str = "dbo", sql_folder: Optional[str] = None,
139
+ compaction_threshold: int = 100):
140
+ """
141
+ Create and connect to lakehouse.
142
+
143
+ Supports two formats:
144
+ 1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
145
+ 2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
146
+
147
+ Schema defaults to "dbo" if not specified.
148
+
149
+ Examples:
150
+ dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
151
+ dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse") # uses dbo
152
+ dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
153
+ dr = Duckrun.connect("myworkspace", "mylakehouse") # uses dbo
154
+ dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
155
+ """
139
156
  print("Connecting to Lakehouse...")
157
+
158
+ # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
159
+ if workspace and "/" in workspace and lakehouse_name is None:
160
+ parts = workspace.split("/")
161
+ if len(parts) == 2:
162
+ # Format: "ws/lh.lakehouse" (schema will use default)
163
+ workspace, lakehouse_name = parts
164
+ # schema already has default value "dbo"
165
+ elif len(parts) == 3:
166
+ # Format: "ws/lh.lakehouse/schema"
167
+ workspace, lakehouse_name, schema = parts
168
+ else:
169
+ raise ValueError(
170
+ f"Invalid connection string format: '{workspace}'. "
171
+ "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
172
+ )
173
+
174
+ # Remove .lakehouse suffix if present
175
+ if lakehouse_name.endswith(".lakehouse"):
176
+ lakehouse_name = lakehouse_name[:-10]
177
+
178
+ # Validate all required parameters are present
179
+ if not workspace or not lakehouse_name:
180
+ raise ValueError(
181
+ "Missing required parameters. Use either:\n"
182
+ " connect('workspace/lakehouse.lakehouse/schema')\n"
183
+ " connect('workspace/lakehouse.lakehouse') # defaults to dbo\n"
184
+ " connect('workspace', 'lakehouse', 'schema')\n"
185
+ " connect('workspace', 'lakehouse') # defaults to dbo"
186
+ )
187
+
140
188
  return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
141
189
 
142
190
  def _get_storage_token(self):
duckrun-0.1.5.3.dist-info/METADATA ADDED
@@ -0,0 +1,303 @@
1
+ Metadata-Version: 2.4
2
+ Name: duckrun
3
+ Version: 0.1.5.3
4
+ Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
+ Author: mim
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/djouallah/duckrun
8
+ Project-URL: Repository, https://github.com/djouallah/duckrun
9
+ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: duckdb>=1.2.0
14
+ Requires-Dist: deltalake>=0.18.2
15
+ Requires-Dist: requests>=2.28.0
16
+ Dynamic: license-file
17
+
18
+ <img src="duckrun.png" width="400" alt="Duckrun">
19
+
20
+ Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
21
+
22
+ ## Important Notes
23
+
24
+ **Requirements:**
25
+ - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
26
+ - Workspace and lakehouse names cannot contain spaces
27
+
28
+ **Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ pip install duckrun
34
+ ```
35
+
36
+ ## Quick Start
37
+
38
+ ```python
39
+ import duckrun
40
+
41
+ # Connect to your Fabric lakehouse
42
+ con = duckrun.connect(
43
+ workspace="my_workspace",
44
+ lakehouse_name="my_lakehouse",
45
+ schema="dbo"
46
+ )
47
+
48
+ # Explore data
49
+ con.sql("SELECT * FROM my_table LIMIT 10").show()
50
+
51
+ # Write to Delta tables (Spark-style API)
52
+ con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
53
+ ```
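+
+ `connect` also accepts a compact connection string (added in this version; see the `connect` changes in `core.py` above). Presumably the same shorthand works through `duckrun.connect`:
+
+ ```python
+ # schema defaults to "dbo" if the third segment is omitted
+ con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
+ ```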
54
+
55
+ That's it! No `sql_folder` needed for data exploration.
56
+
57
+ ## Two Ways to Use Duckrun
58
+
59
+ ### 1. Data Exploration (Spark-Style API)
60
+
61
+ Perfect for ad-hoc analysis and interactive notebooks:
62
+
63
+ ```python
64
+ con = duckrun.connect("workspace", "lakehouse", "dbo")
65
+
66
+ # Query existing tables
67
+ con.sql("SELECT * FROM sales WHERE year = 2024").show()
68
+
69
+ # Get DataFrame
70
+ df = con.sql("SELECT COUNT(*) FROM orders").df()
71
+
72
+ # Write results to Delta tables
73
+ con.sql("""
74
+ SELECT
75
+ customer_id,
76
+ SUM(amount) as total
77
+ FROM orders
78
+ GROUP BY customer_id
79
+ """).write.mode("overwrite").saveAsTable("customer_totals")
80
+
81
+ # Append mode
82
+ con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
83
+ ```
84
+
85
+ **Note:** `.format("delta")` is optional - Delta is the default format!
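+
+ If you prefer to be explicit, the same write can presumably be spelled with the format included:
+
+ ```python
+ con.sql("SELECT * FROM source").write.format("delta").mode("overwrite").saveAsTable("target")
+ ```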
86
+
87
+ ### 2. Pipeline Orchestration
88
+
89
+ For production workflows with reusable SQL and Python tasks:
90
+
91
+ ```python
92
+ con = duckrun.connect(
93
+ workspace="my_workspace",
94
+ lakehouse_name="my_lakehouse",
95
+ schema="dbo",
96
+ sql_folder="./sql" # folder with .sql and .py files
97
+ )
98
+
99
+ # Define pipeline
100
+ pipeline = [
101
+ ('download_data', (url, path)), # Python task
102
+ ('clean_data', 'overwrite'), # SQL task
103
+ ('aggregate', 'append') # SQL task
104
+ ]
105
+
106
+ # Run it
107
+ con.run(pipeline)
108
+ ```
109
+
110
+ ## Pipeline Tasks
111
+
112
+ ### Python Tasks
113
+
114
+ **Format:** `('function_name', (arg1, arg2, ...))`
115
+
116
+ Create `sql_folder/function_name.py`:
117
+
118
+ ```python
119
+ # sql_folder/download_data.py
120
+ def download_data(url, path):
121
+ # your code here
122
+ return 1 # 1 = success, 0 = failure
123
+ ```
124
+
125
+ ### SQL Tasks
126
+
127
+ **Format:** `('table_name', 'mode')` or `('table_name', 'mode', {params})`
128
+
129
+ Create `sql_folder/table_name.sql`:
130
+
131
+ ```sql
132
+ -- sql_folder/clean_data.sql
133
+ SELECT
134
+ id,
135
+ TRIM(name) as name,
136
+ date
137
+ FROM raw_data
138
+ WHERE date >= '2024-01-01'
139
+ ```
140
+
141
+ **Write Modes:**
142
+ - `overwrite` - Replace table completely
143
+ - `append` - Add to existing table
144
+ - `ignore` - Create the table only if it doesn't exist (see the example below)
145
+
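+ For example, `ignore` is handy for one-off reference tables (the task name here is hypothetical):
+
+ ```python
+ pipeline = [
+     ('dim_date', 'ignore'),   # built on the first run; skipped if the table already exists
+     ('sales', 'append')
+ ]
+ ```
+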
146
+ ### Parameterized SQL
147
+
148
+ Built-in parameters (always available; see the example below):
149
+ - `$ws` - workspace name
150
+ - `$lh` - lakehouse name
151
+ - `$schema` - schema name
152
+
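+ These are substituted into the SQL text before it runs, so a task file can reference them directly (a hypothetical example):
+
+ ```sql
+ -- sql_folder/row_counts.sql (hypothetical)
+ SELECT '$ws' AS workspace, '$lh' AS lakehouse, '$schema' AS schema_name, COUNT(*) AS row_count
+ FROM raw_data
+ ```
+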
153
+ Custom parameters:
154
+
155
+ ```python
156
+ pipeline = [
157
+ ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
158
+ ]
159
+ ```
160
+
161
+ ```sql
162
+ -- sql_folder/sales.sql
163
+ SELECT * FROM transactions
164
+ WHERE date BETWEEN '$start_date' AND '$end_date'
165
+ ```
166
+
167
+ ## Advanced Features
168
+
169
+ ### Table Name Variants
170
+
171
+ Use `__` to create multiple versions of the same table:
172
+
173
+ ```python
174
+ pipeline = [
175
+ ('sales__initial', 'overwrite'), # writes to 'sales'
176
+ ('sales__incremental', 'append'), # appends to 'sales'
177
+ ]
178
+ ```
179
+
180
+ Both tasks write to the `sales` table but use different SQL files (`sales__initial.sql` and `sales__incremental.sql`).
181
+
182
+ ### Remote SQL Files
183
+
184
+ Load tasks from GitHub or any URL:
185
+
186
+ ```python
187
+ con = duckrun.connect(
188
+ workspace="Analytics",
189
+ lakehouse_name="Sales",
190
+ schema="dbo",
191
+ sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
192
+ )
193
+ ```
194
+
195
+ ### Early Exit on Failure
196
+
197
+ **Pipelines automatically stop when any task fails** - subsequent tasks won't run.
198
+
199
+ For **SQL tasks**, failure is automatic:
200
+ - If the query has a syntax error or runtime error, the task fails
201
+ - The pipeline stops immediately
202
+
203
+ For **Python tasks**, you control success/failure by returning:
204
+ - `1` = Success → pipeline continues to next task
205
+ - `0` = Failure → pipeline stops, remaining tasks are skipped
206
+
207
+ Example:
208
+
209
+ ```python
210
+ # sql_folder/download_data.py
211
+ def download_data(url, path):
212
+ try:
213
+ response = requests.get(url)
214
+ response.raise_for_status()
215
+ # save data...
216
+ return 1 # Success - pipeline continues
217
+ except Exception as e:
218
+ print(f"Download failed: {e}")
219
+ return 0 # Failure - pipeline stops here
220
+ ```
221
+
222
+ ```python
223
+ pipeline = [
224
+ ('download_data', (url, path)), # If returns 0, stops here
225
+ ('clean_data', 'overwrite'), # Won't run if download failed
226
+ ('aggregate', 'append') # Won't run if download failed
227
+ ]
228
+
229
+ success = con.run(pipeline) # Returns True only if ALL tasks succeed
230
+ ```
231
+
232
+ This prevents downstream tasks from processing incomplete or corrupted data.
233
+
234
+ ### Delta Lake Optimization
235
+
236
+ Duckrun automatically:
237
+ - Compacts small files when the file count exceeds the threshold (default: 100)
238
+ - Vacuums old versions on overwrite
239
+ - Cleans up metadata
240
+
241
+ Customize compaction threshold:
242
+
243
+ ```python
244
+ con = duckrun.connect(
245
+ workspace="workspace",
246
+ lakehouse_name="lakehouse",
247
+ schema="dbo",
248
+ compaction_threshold=50 # compact after 50 files
249
+ )
250
+ ```
251
+
252
+ ## Complete Example
253
+
254
+ ```python
255
+ import duckrun
256
+
257
+ # Connect
258
+ con = duckrun.connect("Analytics", "Sales", "dbo", "./sql")
259
+
260
+ # Pipeline with mixed tasks
261
+ pipeline = [
262
+ # Download raw data (Python)
263
+ ('fetch_api_data', ('https://api.example.com/sales', 'raw')),
264
+
265
+ # Clean and transform (SQL)
266
+ ('clean_sales', 'overwrite'),
267
+
268
+ # Aggregate by region (SQL with params)
269
+ ('regional_summary', 'overwrite', {'min_amount': 1000}),
270
+
271
+ # Append to history (SQL)
272
+ ('sales_history', 'append')
273
+ ]
274
+
275
+ # Run
276
+ success = con.run(pipeline)
277
+
278
+ # Explore results
279
+ con.sql("SELECT * FROM regional_summary").show()
280
+
281
+ # Export to new table
282
+ con.sql("""
283
+ SELECT region, SUM(total) as grand_total
284
+ FROM regional_summary
285
+ GROUP BY region
286
+ """).write.mode("overwrite").saveAsTable("region_totals")
287
+ ```
288
+
289
+ ## How It Works
290
+
291
+ 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
292
+ 2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
293
+ 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
294
+ 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
295
+ 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
296
+
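+ A rough sketch of what steps 2-4 amount to, using plain DuckDB and the `deltalake` package (illustrative only, not duckrun's actual code; the OneLake path, table names, and auth setup are assumptions):
+
+ ```python
+ # Illustrative sketch, not duckrun internals. Assumes the DuckDB delta/azure
+ # extensions are available and OneLake credentials are already configured.
+ import duckdb
+ from deltalake import write_deltalake
+
+ table_path = (
+     "abfss://my_workspace@onelake.dfs.fabric.microsoft.com/"
+     "my_lakehouse.Lakehouse/Tables/dbo/sales"
+ )
+
+ con = duckdb.connect()
+
+ # 2-3. expose an existing Delta table as a DuckDB view, then query it
+ con.sql(f"CREATE OR REPLACE VIEW sales AS SELECT * FROM delta_scan('{table_path}')")
+ result = con.sql("SELECT region, SUM(amount) AS total FROM sales GROUP BY region")
+
+ # 4. write the result back to OneLake as a Delta table
+ write_deltalake(table_path.replace("/sales", "/sales_by_region"), result.arrow(), mode="overwrite")
+ ```
+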
297
+ ## Real-World Example
298
+
299
+ For a complete production example, see [fabric_demo](https://github.com/djouallah/fabric_demo).
300
+
301
+ ## License
302
+
303
+ MIT
duckrun-0.1.5.3.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
1
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
2
+ duckrun/core.py,sha256=n4FqyWlPFRnC-BBMphnOCzxrad4FwTTgl7lTfWL7AEk,20525
3
+ duckrun-0.1.5.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
4
+ duckrun-0.1.5.3.dist-info/METADATA,sha256=1ibXy62hbaRBlw7br1UIUUCkVw0AsZxao1cl-0hWitg,7792
5
+ duckrun-0.1.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ duckrun-0.1.5.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
7
+ duckrun-0.1.5.3.dist-info/RECORD,,
duckrun-0.1.5.1.dist-info/METADATA DELETED
@@ -1,173 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: duckrun
3
- Version: 0.1.5.1
4
- Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
- Author: mim
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/djouallah/duckrun
8
- Project-URL: Repository, https://github.com/djouallah/duckrun
9
- Project-URL: Issues, https://github.com/djouallah/duckrun/issues
10
- Requires-Python: >=3.9
11
- Description-Content-Type: text/markdown
12
- License-File: LICENSE
13
- Requires-Dist: duckdb>=1.2.0
14
- Requires-Dist: deltalake>=0.18.2
15
- Requires-Dist: requests>=2.28.0
16
- Dynamic: license-file
17
-
18
-
19
- <img src="duckrun.png" width="400" alt="Duckrun">
20
-
21
- Simple task runner for Microsoft Fabric Python notebook, powered by DuckDB and Delta_rs.
22
-
23
-
24
- ## Known Limitation
25
-
26
- Support only Lakehouse with schema, Workspace and lakehouse names should not contains space
27
-
28
- ## Installation
29
-
30
- ```bash
31
- pip install duckrun
32
- ```
33
-
34
-
35
-
36
- ## Quick Start
37
-
38
- ```python
39
- import duckrun
40
-
41
- # Connect to your Fabric lakehouse (using `con` pattern)
42
- con = duckrun.connect(
43
- workspace="my_workspace",
44
- lakehouse_name="my_lakehouse",
45
- schema="dbo",
46
- sql_folder="./sql" # optional: folder containing your .sql and .py files (only needed for pipeline tasks)
47
- )
48
-
49
- # Define your pipeline
50
- pipeline = [
51
- ('load_data', (url, path)), # Python task
52
- ('clean_data', 'overwrite'), # SQL task
53
- ('aggregate', 'append') # SQL task
54
- ]
55
-
56
- # Run it
57
- con.run(pipeline)
58
- ```
59
-
60
- Note: the `sql/` folder is optional — if all you want to do is explore data with SQL (for example by calling `con.sql(...)`), you don't need to provide a `sql_folder`.
61
-
62
- ## Early Exit
63
-
64
- In a pipeline run, if a task fails, the pipeline will stop without running the subsequent tasks.
65
-
66
- ## How It Works
67
-
68
- Duckrun runs two types of tasks:
69
-
70
- ### 1. Python Tasks
71
- Format: `('function_name', (arg1, arg2, ...))`
72
-
73
- Create a file `sql_folder/function_name.py` with a function matching the name:
74
-
75
- ```python
76
- # sql_folder/load_data.py
77
- def load_data(url, path):
78
- # your code here
79
- # IMPORTANT: Must return 1 for success, 0 for failure
80
- return 1
81
- ```
82
-
83
- ### 2. SQL Tasks
84
- Format: `('table_name', 'mode')` or `('table_name', 'mode', {params})`
85
-
86
- Create a file `sql_folder/table_name.sql`:
87
-
88
- ```sql
89
- -- sql_folder/clean_data.sql
90
- SELECT
91
- id,
92
- TRIM(name) as name,
93
- date
94
- FROM raw_data
95
- WHERE date >= '2024-01-01'
96
- ```
97
-
98
- **Modes:**
99
- - `overwrite` - Replace table completely
100
- - `append` - Add to existing table
101
- - `ignore` - Create only if doesn't exist
102
-
103
- ## Task Files
104
-
105
- The `sql_folder` can contain a mixture of both `.sql` and `.py` files. This allows you to combine SQL transformations and Python logic in your pipelines.
106
-
107
- ### SQL Files
108
- Your SQL files automatically have access to:
109
- - `$ws` - workspace name
110
- - `$lh` - lakehouse name
111
- - `$schema` - schema name
112
-
113
- Pass custom parameters:
114
-
115
- ```python
116
- pipeline = [
117
- ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
118
- ]
119
- ```
120
-
121
- ```sql
122
- -- sql_folder/sales.sql
123
- SELECT * FROM transactions
124
- WHERE date BETWEEN '$start_date' AND '$end_date'
125
- ```
126
-
127
- ## Table Name Convention
128
-
129
- Use `__` to create variants of the same table:
130
-
131
- ```python
132
- pipeline = [
133
- ('sales__initial', 'overwrite'), # writes to 'sales' table
134
- ('sales__incremental', 'append'), # appends to 'sales' table
135
- ]
136
- ```
137
-
138
- Both write to the same `sales` table, but use different SQL files.
139
-
140
- ## Query Data
141
-
142
- ```python
143
- # Run queries
144
- con.sql("SELECT * FROM my_table LIMIT 10").show()
145
-
146
- # Get as DataFrame
147
- df = con.sql("SELECT COUNT(*) FROM sales").df()
148
- ```
149
-
150
- Explanation: DuckDB is connected to the lakehouse through `con`, so it is aware of the tables in that lakehouse (including tables created by your pipelines). That means you can query those tables directly with `con.sql(...)` just like any other DuckDB query. If you don't provide a `sql_folder`, you can still use `con.sql(...)` to explore existing tables.
151
-
152
-
153
-
154
- ## Remote SQL Files
155
-
156
- You can load SQL/Python files from a URL:
157
-
158
- ```python
159
- con = duckrun.connect(
160
- workspace="Analytics",
161
- lakehouse_name="Sales",
162
- schema="dbo",
163
- sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
164
- )
165
- ```
166
-
167
- ## Real-Life Usage
168
-
169
- For a complete, production-style example, see [fabric_demo](https://github.com/djouallah/fabric_demo).
170
-
171
- ## License
172
-
173
- MIT
duckrun-0.1.5.1.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
1
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
2
- duckrun/core.py,sha256=EoXlQsx7i3BS2a26zB90n4xDBy_WQu1sNicPNYU3DgY,18110
3
- duckrun-0.1.5.1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
4
- duckrun-0.1.5.1.dist-info/METADATA,sha256=piXLbt2nRJoAngkOFojRNVX1-nfEGta6p7WKyAKcxEU,4392
5
- duckrun-0.1.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- duckrun-0.1.5.1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
7
- duckrun-0.1.5.1.dist-info/RECORD,,