duckrun-0.1.5.1-py3-none-any.whl → duckrun-0.1.5.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -111,16 +111,17 @@ class Duckrun:
111
111
 
112
112
  Usage:
113
113
  # For pipelines:
114
- dr = Duckrun.connect(workspace, lakehouse, schema, sql_folder)
114
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
115
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse") # defaults to dbo schema
115
116
  dr.run(pipeline)
116
117
 
117
118
  # For data exploration with Spark-style API:
118
- dr = Duckrun.connect(workspace, lakehouse, schema)
119
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse")
119
120
  dr.sql("SELECT * FROM table").show()
120
121
  dr.sql("SELECT 43").write.mode("append").saveAsTable("test")
121
122
  """
122
123
 
123
- def __init__(self, workspace: str, lakehouse_name: str, schema: str,
124
+ def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
124
125
  sql_folder: Optional[str] = None, compaction_threshold: int = 10):
125
126
  self.workspace = workspace
126
127
  self.lakehouse_name = lakehouse_name
@@ -133,10 +134,57 @@ class Duckrun:
133
134
  self._attach_lakehouse()
134
135
 
135
136
  @classmethod
136
- def connect(cls, workspace: str, lakehouse_name: str, schema: str,
137
- sql_folder: Optional[str] = None, compaction_threshold: int = 100):
138
- """Create and connect to lakehouse"""
137
+ def connect(cls, workspace: Union[str, None] = None, lakehouse_name: Optional[str] = None,
138
+ schema: str = "dbo", sql_folder: Optional[str] = None,
139
+ compaction_threshold: int = 100):
140
+ """
141
+ Create and connect to lakehouse.
142
+
143
+ Supports two formats:
144
+ 1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
145
+ 2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
146
+
147
+ Schema defaults to "dbo" if not specified.
148
+
149
+ Examples:
150
+ dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
151
+ dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse") # uses dbo
152
+ dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
153
+ dr = Duckrun.connect("myworkspace", "mylakehouse") # uses dbo
154
+ dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
155
+ """
139
156
  print("Connecting to Lakehouse...")
157
+
158
+ # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
159
+ if workspace and "/" in workspace and lakehouse_name is None:
160
+ parts = workspace.split("/")
161
+ if len(parts) == 2:
162
+ # Format: "ws/lh.lakehouse" (schema will use default)
163
+ workspace, lakehouse_name = parts
164
+ # schema already has default value "dbo"
165
+ elif len(parts) == 3:
166
+ # Format: "ws/lh.lakehouse/schema"
167
+ workspace, lakehouse_name, schema = parts
168
+ else:
169
+ raise ValueError(
170
+ f"Invalid connection string format: '{workspace}'. "
171
+ "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
172
+ )
173
+
174
+ # Remove .lakehouse suffix if present
175
+ if lakehouse_name.endswith(".lakehouse"):
176
+ lakehouse_name = lakehouse_name[:-10]
177
+
178
+ # Validate all required parameters are present
179
+ if not workspace or not lakehouse_name:
180
+ raise ValueError(
181
+ "Missing required parameters. Use either:\n"
182
+ " connect('workspace/lakehouse.lakehouse/schema')\n"
183
+ " connect('workspace/lakehouse.lakehouse') # defaults to dbo\n"
184
+ " connect('workspace', 'lakehouse', 'schema')\n"
185
+ " connect('workspace', 'lakehouse') # defaults to dbo"
186
+ )
187
+
140
188
  return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
141
189
 
142
190
  def _get_storage_token(self):
duckrun-0.1.5.3.dist-info/METADATA ADDED
@@ -0,0 +1,303 @@
1
+ Metadata-Version: 2.4
2
+ Name: duckrun
3
+ Version: 0.1.5.3
4
+ Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
+ Author: mim
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/djouallah/duckrun
8
+ Project-URL: Repository, https://github.com/djouallah/duckrun
9
+ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: duckdb>=1.2.0
14
+ Requires-Dist: deltalake>=0.18.2
15
+ Requires-Dist: requests>=2.28.0
16
+ Dynamic: license-file
17
+
18
+ <img src="duckrun.png" width="400" alt="Duckrun">
19
+
20
+ Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
21
+
22
+ ## Important Notes
23
+
24
+ **Requirements:**
25
+ - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
26
+ - Workspace and lakehouse names cannot contain spaces
27
+
28
+ **Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ pip install duckrun
34
+ ```
35
+
36
+ ## Quick Start
37
+
38
+ ```python
39
+ import duckrun
40
+
41
+ # Connect to your Fabric lakehouse
42
+ con = duckrun.connect(
43
+ workspace="my_workspace",
44
+ lakehouse_name="my_lakehouse",
45
+ schema="dbo"
46
+ )
47
+
48
+ # Explore data
49
+ con.sql("SELECT * FROM my_table LIMIT 10").show()
50
+
51
+ # Write to Delta tables (Spark-style API)
52
+ con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
53
+ ```
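+
+ `connect` also accepts a compact connection string (added in this version; see the `connect` changes in `core.py` above). Presumably the same shorthand works through `duckrun.connect`:
+
+ ```python
+ # schema defaults to "dbo" if the third segment is omitted
+ con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
+ ```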
54
+
55
+ That's it! No `sql_folder` needed for data exploration.
56
+
57
+ ## Two Ways to Use Duckrun
58
+
59
+ ### 1. Data Exploration (Spark-Style API)
60
+
61
+ Perfect for ad-hoc analysis and interactive notebooks:
62
+
63
+ ```python
64
+ con = duckrun.connect("workspace", "lakehouse", "dbo")
65
+
66
+ # Query existing tables
67
+ con.sql("SELECT * FROM sales WHERE year = 2024").show()
68
+
69
+ # Get DataFrame
70
+ df = con.sql("SELECT COUNT(*) FROM orders").df()
71
+
72
+ # Write results to Delta tables
73
+ con.sql("""
74
+ SELECT
75
+ customer_id,
76
+ SUM(amount) as total
77
+ FROM orders
78
+ GROUP BY customer_id
79
+ """).write.mode("overwrite").saveAsTable("customer_totals")
80
+
81
+ # Append mode
82
+ con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
83
+ ```
84
+
85
+ **Note:** `.format("delta")` is optional - Delta is the default format!
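+
+ If you prefer to be explicit, the same write can presumably be spelled with the format included:
+
+ ```python
+ con.sql("SELECT * FROM source").write.format("delta").mode("overwrite").saveAsTable("target")
+ ```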
86
+
87
+ ### 2. Pipeline Orchestration
88
+
89
+ For production workflows with reusable SQL and Python tasks:
90
+
91
+ ```python
92
+ con = duckrun.connect(
93
+ workspace="my_workspace",
94
+ lakehouse_name="my_lakehouse",
95
+ schema="dbo",
96
+ sql_folder="./sql" # folder with .sql and .py files
97
+ )
98
+
99
+ # Define pipeline
100
+ pipeline = [
101
+ ('download_data', (url, path)), # Python task
102
+ ('clean_data', 'overwrite'), # SQL task
103
+ ('aggregate', 'append') # SQL task
104
+ ]
105
+
106
+ # Run it
107
+ con.run(pipeline)
108
+ ```
109
+
110
+ ## Pipeline Tasks
111
+
112
+ ### Python Tasks
113
+
114
+ **Format:** `('function_name', (arg1, arg2, ...))`
115
+
116
+ Create `sql_folder/function_name.py`:
117
+
118
+ ```python
119
+ # sql_folder/download_data.py
120
+ def download_data(url, path):
121
+ # your code here
122
+ return 1 # 1 = success, 0 = failure
123
+ ```
124
+
125
+ ### SQL Tasks
126
+
127
+ **Format:** `('table_name', 'mode')` or `('table_name', 'mode', {params})`
128
+
129
+ Create `sql_folder/table_name.sql`:
130
+
131
+ ```sql
132
+ -- sql_folder/clean_data.sql
133
+ SELECT
134
+ id,
135
+ TRIM(name) as name,
136
+ date
137
+ FROM raw_data
138
+ WHERE date >= '2024-01-01'
139
+ ```
140
+
141
+ **Write Modes:**
142
+ - `overwrite` - Replace table completely
143
+ - `append` - Add to existing table
144
+ - `ignore` - Create the table only if it doesn't exist (see the example below)
145
+
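+ For example, `ignore` is handy for one-off reference tables (the task name here is hypothetical):
+
+ ```python
+ pipeline = [
+     ('dim_date', 'ignore'),   # built on the first run; skipped if the table already exists
+     ('sales', 'append')
+ ]
+ ```
+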
146
+ ### Parameterized SQL
147
+
148
+ Built-in parameters (always available; see the example below):
149
+ - `$ws` - workspace name
150
+ - `$lh` - lakehouse name
151
+ - `$schema` - schema name
152
+
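+ These are substituted into the SQL text before it runs, so a task file can reference them directly (a hypothetical example):
+
+ ```sql
+ -- sql_folder/row_counts.sql (hypothetical)
+ SELECT '$ws' AS workspace, '$lh' AS lakehouse, '$schema' AS schema_name, COUNT(*) AS row_count
+ FROM raw_data
+ ```
+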
153
+ Custom parameters:
154
+
155
+ ```python
156
+ pipeline = [
157
+ ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
158
+ ]
159
+ ```
160
+
161
+ ```sql
162
+ -- sql_folder/sales.sql
163
+ SELECT * FROM transactions
164
+ WHERE date BETWEEN '$start_date' AND '$end_date'
165
+ ```
166
+
167
+ ## Advanced Features
168
+
169
+ ### Table Name Variants
170
+
171
+ Use `__` to create multiple versions of the same table:
172
+
173
+ ```python
174
+ pipeline = [
175
+ ('sales__initial', 'overwrite'), # writes to 'sales'
176
+ ('sales__incremental', 'append'), # appends to 'sales'
177
+ ]
178
+ ```
179
+
180
+ Both tasks write to the `sales` table but use different SQL files (`sales__initial.sql` and `sales__incremental.sql`).
181
+
182
+ ### Remote SQL Files
183
+
184
+ Load tasks from GitHub or any URL:
185
+
186
+ ```python
187
+ con = duckrun.connect(
188
+ workspace="Analytics",
189
+ lakehouse_name="Sales",
190
+ schema="dbo",
191
+ sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
192
+ )
193
+ ```
194
+
195
+ ### Early Exit on Failure
196
+
197
+ **Pipelines automatically stop when any task fails** - subsequent tasks won't run.
198
+
199
+ For **SQL tasks**, failure is automatic:
200
+ - If the query has a syntax error or runtime error, the task fails
201
+ - The pipeline stops immediately
202
+
203
+ For **Python tasks**, you control success/failure by returning:
204
+ - `1` = Success → pipeline continues to next task
205
+ - `0` = Failure → pipeline stops, remaining tasks are skipped
206
+
207
+ Example:
208
+
209
+ ```python
210
+ # sql_folder/download_data.py
211
+ def download_data(url, path):
212
+ try:
213
+ response = requests.get(url)
214
+ response.raise_for_status()
215
+ # save data...
216
+ return 1 # Success - pipeline continues
217
+ except Exception as e:
218
+ print(f"Download failed: {e}")
219
+ return 0 # Failure - pipeline stops here
220
+ ```
221
+
222
+ ```python
223
+ pipeline = [
224
+ ('download_data', (url, path)), # If returns 0, stops here
225
+ ('clean_data', 'overwrite'), # Won't run if download failed
226
+ ('aggregate', 'append') # Won't run if download failed
227
+ ]
228
+
229
+ success = con.run(pipeline) # Returns True only if ALL tasks succeed
230
+ ```
231
+
232
+ This prevents downstream tasks from processing incomplete or corrupted data.
233
+
234
+ ### Delta Lake Optimization
235
+
236
+ Duckrun automatically:
237
+ - Compacts small files when the file count exceeds the threshold (default: 100)
238
+ - Vacuums old versions on overwrite
239
+ - Cleans up metadata
240
+
241
+ Customize compaction threshold:
242
+
243
+ ```python
244
+ con = duckrun.connect(
245
+ workspace="workspace",
246
+ lakehouse_name="lakehouse",
247
+ schema="dbo",
248
+ compaction_threshold=50 # compact after 50 files
249
+ )
250
+ ```
251
+
252
+ ## Complete Example
253
+
254
+ ```python
255
+ import duckrun
256
+
257
+ # Connect
258
+ con = duckrun.connect("Analytics", "Sales", "dbo", "./sql")
259
+
260
+ # Pipeline with mixed tasks
261
+ pipeline = [
262
+ # Download raw data (Python)
263
+ ('fetch_api_data', ('https://api.example.com/sales', 'raw')),
264
+
265
+ # Clean and transform (SQL)
266
+ ('clean_sales', 'overwrite'),
267
+
268
+ # Aggregate by region (SQL with params)
269
+ ('regional_summary', 'overwrite', {'min_amount': 1000}),
270
+
271
+ # Append to history (SQL)
272
+ ('sales_history', 'append')
273
+ ]
274
+
275
+ # Run
276
+ success = con.run(pipeline)
277
+
278
+ # Explore results
279
+ con.sql("SELECT * FROM regional_summary").show()
280
+
281
+ # Export to new table
282
+ con.sql("""
283
+ SELECT region, SUM(total) as grand_total
284
+ FROM regional_summary
285
+ GROUP BY region
286
+ """).write.mode("overwrite").saveAsTable("region_totals")
287
+ ```
288
+
289
+ ## How It Works
290
+
291
+ 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
292
+ 2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
293
+ 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
294
+ 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
295
+ 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
296
+
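+ A rough sketch of what steps 2-4 amount to, using plain DuckDB and the `deltalake` package (illustrative only, not duckrun's actual code; the OneLake path, table names, and auth setup are assumptions):
+
+ ```python
+ # Illustrative sketch, not duckrun internals. Assumes the DuckDB delta/azure
+ # extensions are available and OneLake credentials are already configured.
+ import duckdb
+ from deltalake import write_deltalake
+
+ table_path = (
+     "abfss://my_workspace@onelake.dfs.fabric.microsoft.com/"
+     "my_lakehouse.Lakehouse/Tables/dbo/sales"
+ )
+
+ con = duckdb.connect()
+
+ # 2-3. expose an existing Delta table as a DuckDB view, then query it
+ con.sql(f"CREATE OR REPLACE VIEW sales AS SELECT * FROM delta_scan('{table_path}')")
+ result = con.sql("SELECT region, SUM(amount) AS total FROM sales GROUP BY region")
+
+ # 4. write the result back to OneLake as a Delta table
+ write_deltalake(table_path.replace("/sales", "/sales_by_region"), result.arrow(), mode="overwrite")
+ ```
+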
297
+ ## Real-World Example
298
+
299
+ For a complete production example, see [fabric_demo](https://github.com/djouallah/fabric_demo).
300
+
301
+ ## License
302
+
303
+ MIT
duckrun-0.1.5.3.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
1
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
2
+ duckrun/core.py,sha256=n4FqyWlPFRnC-BBMphnOCzxrad4FwTTgl7lTfWL7AEk,20525
3
+ duckrun-0.1.5.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
4
+ duckrun-0.1.5.3.dist-info/METADATA,sha256=1ibXy62hbaRBlw7br1UIUUCkVw0AsZxao1cl-0hWitg,7792
5
+ duckrun-0.1.5.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ duckrun-0.1.5.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
7
+ duckrun-0.1.5.3.dist-info/RECORD,,
duckrun-0.1.5.1.dist-info/METADATA DELETED
@@ -1,173 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: duckrun
3
- Version: 0.1.5.1
4
- Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
- Author: mim
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/djouallah/duckrun
8
- Project-URL: Repository, https://github.com/djouallah/duckrun
9
- Project-URL: Issues, https://github.com/djouallah/duckrun/issues
10
- Requires-Python: >=3.9
11
- Description-Content-Type: text/markdown
12
- License-File: LICENSE
13
- Requires-Dist: duckdb>=1.2.0
14
- Requires-Dist: deltalake>=0.18.2
15
- Requires-Dist: requests>=2.28.0
16
- Dynamic: license-file
17
-
18
-
19
- <img src="duckrun.png" width="400" alt="Duckrun">
20
-
21
- Simple task runner for Microsoft Fabric Python notebook, powered by DuckDB and Delta_rs.
22
-
23
-
24
- ## Known Limitation
25
-
26
- Support only Lakehouse with schema, Workspace and lakehouse names should not contains space
27
-
28
- ## Installation
29
-
30
- ```bash
31
- pip install duckrun
32
- ```
33
-
34
-
35
-
36
- ## Quick Start
37
-
38
- ```python
39
- import duckrun
40
-
41
- # Connect to your Fabric lakehouse (using `con` pattern)
42
- con = duckrun.connect(
43
- workspace="my_workspace",
44
- lakehouse_name="my_lakehouse",
45
- schema="dbo",
46
- sql_folder="./sql" # optional: folder containing your .sql and .py files (only needed for pipeline tasks)
47
- )
48
-
49
- # Define your pipeline
50
- pipeline = [
51
- ('load_data', (url, path)), # Python task
52
- ('clean_data', 'overwrite'), # SQL task
53
- ('aggregate', 'append') # SQL task
54
- ]
55
-
56
- # Run it
57
- con.run(pipeline)
58
- ```
59
-
60
- Note: the `sql/` folder is optional — if all you want to do is explore data with SQL (for example by calling `con.sql(...)`), you don't need to provide a `sql_folder`.
61
-
62
- ## Early Exit
63
-
64
- In a pipeline run, if a task fails, the pipeline will stop without running the subsequent tasks.
65
-
66
- ## How It Works
67
-
68
- Duckrun runs two types of tasks:
69
-
70
- ### 1. Python Tasks
71
- Format: `('function_name', (arg1, arg2, ...))`
72
-
73
- Create a file `sql_folder/function_name.py` with a function matching the name:
74
-
75
- ```python
76
- # sql_folder/load_data.py
77
- def load_data(url, path):
78
- # your code here
79
- # IMPORTANT: Must return 1 for success, 0 for failure
80
- return 1
81
- ```
82
-
83
- ### 2. SQL Tasks
84
- Format: `('table_name', 'mode')` or `('table_name', 'mode', {params})`
85
-
86
- Create a file `sql_folder/table_name.sql`:
87
-
88
- ```sql
89
- -- sql_folder/clean_data.sql
90
- SELECT
91
- id,
92
- TRIM(name) as name,
93
- date
94
- FROM raw_data
95
- WHERE date >= '2024-01-01'
96
- ```
97
-
98
- **Modes:**
99
- - `overwrite` - Replace table completely
100
- - `append` - Add to existing table
101
- - `ignore` - Create only if doesn't exist
102
-
103
- ## Task Files
104
-
105
- The `sql_folder` can contain a mixture of both `.sql` and `.py` files. This allows you to combine SQL transformations and Python logic in your pipelines.
106
-
107
- ### SQL Files
108
- Your SQL files automatically have access to:
109
- - `$ws` - workspace name
110
- - `$lh` - lakehouse name
111
- - `$schema` - schema name
112
-
113
- Pass custom parameters:
114
-
115
- ```python
116
- pipeline = [
117
- ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
118
- ]
119
- ```
120
-
121
- ```sql
122
- -- sql_folder/sales.sql
123
- SELECT * FROM transactions
124
- WHERE date BETWEEN '$start_date' AND '$end_date'
125
- ```
126
-
127
- ## Table Name Convention
128
-
129
- Use `__` to create variants of the same table:
130
-
131
- ```python
132
- pipeline = [
133
- ('sales__initial', 'overwrite'), # writes to 'sales' table
134
- ('sales__incremental', 'append'), # appends to 'sales' table
135
- ]
136
- ```
137
-
138
- Both write to the same `sales` table, but use different SQL files.
139
-
140
- ## Query Data
141
-
142
- ```python
143
- # Run queries
144
- con.sql("SELECT * FROM my_table LIMIT 10").show()
145
-
146
- # Get as DataFrame
147
- df = con.sql("SELECT COUNT(*) FROM sales").df()
148
- ```
149
-
150
- Explanation: DuckDB is connected to the lakehouse through `con`, so it is aware of the tables in that lakehouse (including tables created by your pipelines). That means you can query those tables directly with `con.sql(...)` just like any other DuckDB query. If you don't provide a `sql_folder`, you can still use `con.sql(...)` to explore existing tables.
151
-
152
-
153
-
154
- ## Remote SQL Files
155
-
156
- You can load SQL/Python files from a URL:
157
-
158
- ```python
159
- con = duckrun.connect(
160
- workspace="Analytics",
161
- lakehouse_name="Sales",
162
- schema="dbo",
163
- sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
164
- )
165
- ```
166
-
167
- ## Real-Life Usage
168
-
169
- For a complete, production-style example, see [fabric_demo](https://github.com/djouallah/fabric_demo).
170
-
171
- ## License
172
-
173
- MIT
duckrun-0.1.5.1.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
1
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
2
- duckrun/core.py,sha256=EoXlQsx7i3BS2a26zB90n4xDBy_WQu1sNicPNYU3DgY,18110
3
- duckrun-0.1.5.1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
4
- duckrun-0.1.5.1.dist-info/METADATA,sha256=piXLbt2nRJoAngkOFojRNVX1-nfEGta6p7WKyAKcxEU,4392
5
- duckrun-0.1.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
- duckrun-0.1.5.1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
7
- duckrun-0.1.5.1.dist-info/RECORD,,