duckrun-0.1.5-py3-none-any.whl → duckrun-0.1.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -13,7 +13,7 @@ class DeltaWriter:
     def __init__(self, relation, duckrun_instance):
         self.relation = relation
         self.duckrun = duckrun_instance
-        self._format = None
+        self._format = "delta"  # Default to delta format
         self._mode = "overwrite"

     def format(self, format_type: str):
@@ -32,8 +32,9 @@ class DeltaWriter:

     def saveAsTable(self, table_name: str):
         """Save query result as Delta table"""
+        # Format defaults to "delta", so no need to check
         if self._format != "delta":
-            raise RuntimeError("Must call .format('delta') before saveAsTable()")
+            raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")

         # Parse schema.table or use default schema
         if "." in table_name:
@@ -116,7 +117,7 @@ class Duckrun:
     # For data exploration with Spark-style API:
     dr = Duckrun.connect(workspace, lakehouse, schema)
     dr.sql("SELECT * FROM table").show()
-    dr.sql("SELECT 43").write.format("delta").mode("append").saveAsTable("aemo.test")
+    dr.sql("SELECT 43").write.mode("append").saveAsTable("test")
     """

     def __init__(self, workspace: str, lakehouse_name: str, schema: str,
@@ -413,9 +414,9 @@ class Duckrun:
        dr.sql("SELECT * FROM table").show()
        df = dr.sql("SELECT * FROM table").df()

-       # New Spark-style write API
-       dr.sql("SELECT 43 as value").write.format("delta").mode("append").saveAsTable("aemo.test")
-       dr.sql("SELECT * FROM source").write.format("delta").mode("overwrite").saveAsTable("target")
+       # New Spark-style write API (format is optional, defaults to delta)
+       dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
+       dr.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
        """
        relation = self.con.sql(query)
        return QueryResult(relation, self)
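The practical effect of this change is that `.format("delta")` no longer has to be chained before `saveAsTable()`. A minimal usage sketch against 0.1.5.2, built from the calls shown in the docstrings above; the workspace, lakehouse, and table names are illustrative:

```python
import duckrun

# Exploration-only connection; no sql_folder is required (per the package README)
con = duckrun.connect(
    workspace="my_workspace",        # illustrative name
    lakehouse_name="my_lakehouse",   # illustrative name
    schema="dbo"
)

# 0.1.5 style: the explicit format call was mandatory
con.sql("SELECT 43 as value").write.format("delta").mode("append").saveAsTable("test")

# 0.1.5.2 style: _format now defaults to "delta", so the format call can be dropped
con.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
```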
duckrun-0.1.5.2.dist-info/METADATA ADDED
@@ -0,0 +1,303 @@
+ Metadata-Version: 2.4
+ Name: duckrun
+ Version: 0.1.5.2
+ Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
+ Author: mim
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/djouallah/duckrun
+ Project-URL: Repository, https://github.com/djouallah/duckrun
+ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: duckdb>=1.2.0
+ Requires-Dist: deltalake>=0.18.2
+ Requires-Dist: requests>=2.28.0
+ Dynamic: license-file
+
+ <img src="duckrun.png" width="400" alt="Duckrun">
+
+ Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
+
+ ## Important Notes
+
+ **Requirements:**
+ - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
+ - Workspace and lakehouse names cannot contain spaces
+
+ **Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
+
+ ## Installation
+
+ ```bash
+ pip install duckrun
+ ```
+
+ ## Quick Start
+
+ ```python
+ import duckrun
+
+ # Connect to your Fabric lakehouse
+ con = duckrun.connect(
+     workspace="my_workspace",
+     lakehouse_name="my_lakehouse",
+     schema="dbo"
+ )
+
+ # Explore data
+ con.sql("SELECT * FROM my_table LIMIT 10").show()
+
+ # Write to Delta tables (Spark-style API)
+ con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
+ ```
+
+ That's it! No `sql_folder` needed for data exploration.
+
+ ## Two Ways to Use Duckrun
+
+ ### 1. Data Exploration (Spark-Style API)
+
+ Perfect for ad-hoc analysis and interactive notebooks:
+
+ ```python
+ con = duckrun.connect("workspace", "lakehouse", "dbo")
+
+ # Query existing tables
+ con.sql("SELECT * FROM sales WHERE year = 2024").show()
+
+ # Get DataFrame
+ df = con.sql("SELECT COUNT(*) FROM orders").df()
+
+ # Write results to Delta tables
+ con.sql("""
+     SELECT
+         customer_id,
+         SUM(amount) as total
+     FROM orders
+     GROUP BY customer_id
+ """).write.mode("overwrite").saveAsTable("customer_totals")
+
+ # Append mode
+ con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")
+ ```
+
+ **Note:** `.format("delta")` is optional - Delta is the default format!
+
+ ### 2. Pipeline Orchestration
+
+ For production workflows with reusable SQL and Python tasks:
+
+ ```python
+ con = duckrun.connect(
+     workspace="my_workspace",
+     lakehouse_name="my_lakehouse",
+     schema="dbo",
+     sql_folder="./sql"  # folder with .sql and .py files
+ )
+
+ # Define pipeline
+ pipeline = [
+     ('download_data', (url, path)),   # Python task
+     ('clean_data', 'overwrite'),      # SQL task
+     ('aggregate', 'append')           # SQL task
+ ]
+
+ # Run it
+ con.run(pipeline)
+ ```
+
+ ## Pipeline Tasks
+
+ ### Python Tasks
+
+ **Format:** `('function_name', (arg1, arg2, ...))`
+
+ Create `sql_folder/function_name.py`:
+
+ ```python
+ # sql_folder/download_data.py
+ def download_data(url, path):
+     # your code here
+     return 1  # 1 = success, 0 = failure
+ ```
+
+ ### SQL Tasks
+
+ **Format:** `('table_name', 'mode')` or `('table_name', 'mode', {params})`
+
+ Create `sql_folder/table_name.sql`:
+
+ ```sql
+ -- sql_folder/clean_data.sql
+ SELECT
+     id,
+     TRIM(name) as name,
+     date
+ FROM raw_data
+ WHERE date >= '2024-01-01'
+ ```
+
+ **Write Modes:**
+ - `overwrite` - Replace table completely
+ - `append` - Add to existing table
+ - `ignore` - Create only if doesn't exist
+
+ ### Parameterized SQL
+
+ Built-in parameters (always available):
+ - `$ws` - workspace name
+ - `$lh` - lakehouse name
+ - `$schema` - schema name
+
+ Custom parameters:
+
+ ```python
+ pipeline = [
+     ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
+ ]
+ ```
+
+ ```sql
+ -- sql_folder/sales.sql
+ SELECT * FROM transactions
+ WHERE date BETWEEN '$start_date' AND '$end_date'
+ ```
+
+ ## Advanced Features
+
+ ### Table Name Variants
+
+ Use `__` to create multiple versions of the same table:
+
+ ```python
+ pipeline = [
+     ('sales__initial', 'overwrite'),    # writes to 'sales'
+     ('sales__incremental', 'append'),   # appends to 'sales'
+ ]
+ ```
+
+ Both tasks write to the `sales` table but use different SQL files (`sales__initial.sql` and `sales__incremental.sql`).
+
+ ### Remote SQL Files
+
+ Load tasks from GitHub or any URL:
+
+ ```python
+ con = duckrun.connect(
+     workspace="Analytics",
+     lakehouse_name="Sales",
+     schema="dbo",
+     sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
+ )
+ ```
+
+ ### Early Exit on Failure
+
+ **Pipelines automatically stop when any task fails** - subsequent tasks won't run.
+
+ For **SQL tasks**, failure is automatic:
+ - If the query has a syntax error or runtime error, the task fails
+ - The pipeline stops immediately
+
+ For **Python tasks**, you control success/failure by returning:
+ - `1` = Success → pipeline continues to next task
+ - `0` = Failure → pipeline stops, remaining tasks are skipped
+
+ Example:
+
+ ```python
+ # sql_folder/download_data.py
+ def download_data(url, path):
+     try:
+         response = requests.get(url)
+         response.raise_for_status()
+         # save data...
+         return 1  # Success - pipeline continues
+     except Exception as e:
+         print(f"Download failed: {e}")
+         return 0  # Failure - pipeline stops here
+ ```
+
+ ```python
+ pipeline = [
+     ('download_data', (url, path)),   # If returns 0, stops here
+     ('clean_data', 'overwrite'),      # Won't run if download failed
+     ('aggregate', 'append')           # Won't run if download failed
+ ]
+
+ success = con.run(pipeline)  # Returns True only if ALL tasks succeed
+ ```
+
+ This prevents downstream tasks from processing incomplete or corrupted data.
+
+ ### Delta Lake Optimization
+
+ Duckrun automatically:
+ - Compacts small files when file count exceeds threshold (default: 100)
+ - Vacuums old versions on overwrite
+ - Cleans up metadata
+
+ Customize compaction threshold:
+
+ ```python
+ con = duckrun.connect(
+     workspace="workspace",
+     lakehouse_name="lakehouse",
+     schema="dbo",
+     compaction_threshold=50  # compact after 50 files
+ )
+ ```
+
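For context on what the automatic compaction and vacuum steps amount to, here is an illustrative sketch using the `deltalake` package that duckrun depends on; it mirrors the behavior described above but is not duckrun's actual code, and the table path is a placeholder:

```python
from deltalake import DeltaTable

# Placeholder path; in duckrun the table lives in OneLake
dt = DeltaTable("path/to/lakehouse/dbo/sales")

# Compaction: rewrite many small files into fewer, larger ones
if len(dt.files()) > 100:  # 100 mirrors the default threshold mentioned above
    dt.optimize.compact()

# Vacuum: physically delete files that only old table versions reference
dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
```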
+ ## Complete Example
+
+ ```python
+ import duckrun
+
+ # Connect
+ con = duckrun.connect("Analytics", "Sales", "dbo", "./sql")
+
+ # Pipeline with mixed tasks
+ pipeline = [
+     # Download raw data (Python)
+     ('fetch_api_data', ('https://api.example.com/sales', 'raw')),
+
+     # Clean and transform (SQL)
+     ('clean_sales', 'overwrite'),
+
+     # Aggregate by region (SQL with params)
+     ('regional_summary', 'overwrite', {'min_amount': 1000}),
+
+     # Append to history (SQL)
+     ('sales_history', 'append')
+ ]
+
+ # Run
+ success = con.run(pipeline)
+
+ # Explore results
+ con.sql("SELECT * FROM regional_summary").show()
+
+ # Export to new table
+ con.sql("""
+     SELECT region, SUM(total) as grand_total
+     FROM regional_summary
+     GROUP BY region
+ """).write.mode("overwrite").saveAsTable("region_totals")
+ ```
+
+ ## How It Works
+
+ 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
+ 2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+ 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
+ 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
+ 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
+
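Step 2 can be pictured as registering one DuckDB view per discovered Delta table. A minimal sketch of that general pattern using DuckDB's delta extension and a placeholder local path; duckrun's actual internals (OneLake paths, Azure authentication) are more involved:

```python
import duckdb

con = duckdb.connect()
con.execute("INSTALL delta")
con.execute("LOAD delta")

# One view per discovered Delta table (placeholder path and table name)
con.execute("""
    CREATE OR REPLACE VIEW sales AS
    SELECT * FROM delta_scan('path/to/lakehouse/dbo/sales')
""")

# Step 3: query the view like any other table
print(con.sql("SELECT COUNT(*) FROM sales").fetchall())
```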
+ ## Real-World Example
+
+ For a complete production example, see [fabric_demo](https://github.com/djouallah/fabric_demo).
+
+ ## License
+
+ MIT
duckrun-0.1.5.2.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+ duckrun/core.py,sha256=EoXlQsx7i3BS2a26zB90n4xDBy_WQu1sNicPNYU3DgY,18110
+ duckrun-0.1.5.2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+ duckrun-0.1.5.2.dist-info/METADATA,sha256=UWtYkoj5E5CsqHKUiJOOVJTQywdTsyzoJpnbiAd28cI,7792
+ duckrun-0.1.5.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.1.5.2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.1.5.2.dist-info/RECORD,,
duckrun-0.1.5.dist-info/METADATA DELETED
@@ -1,172 +0,0 @@
- Metadata-Version: 2.4
- Name: duckrun
- Version: 0.1.5
- Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
- License-Expression: MIT
- Project-URL: Homepage, https://github.com/djouallah/duckrun
- Project-URL: Repository, https://github.com/djouallah/duckrun
- Project-URL: Issues, https://github.com/djouallah/duckrun/issues
- Requires-Python: >=3.9
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: duckdb>=1.2.0
- Requires-Dist: deltalake>=0.18.2
- Requires-Dist: requests>=2.28.0
- Dynamic: license-file
-
-
- <img src="duckrun.png" width="400" alt="Duckrun">
-
- Simple task runner for Microsoft Fabric Python notebook, powered by DuckDB and Delta_rs.
-
-
- ## Known Limitation
-
- Support only Lakehouse with schema, Workspace and lakehouse names should not contains space
-
- ## Installation
-
- ```bash
- pip install duckrun
- ```
-
-
-
- ## Quick Start
-
- ```python
- import duckrun
-
- # Connect to your Fabric lakehouse (using `con` pattern)
- con = duckrun.connect(
-     workspace="my_workspace",
-     lakehouse_name="my_lakehouse",
-     schema="dbo",
-     sql_folder="./sql"  # optional: folder containing your .sql and .py files (only needed for pipeline tasks)
- )
-
- # Define your pipeline
- pipeline = [
-     ('load_data', (url, path)),    # Python task
-     ('clean_data', 'overwrite'),   # SQL task
-     ('aggregate', 'append')        # SQL task
- ]
-
- # Run it
- con.run(pipeline)
- ```
-
- Note: the `sql/` folder is optional — if all you want to do is explore data with SQL (for example by calling `con.sql(...)`), you don't need to provide a `sql_folder`.
-
- ## Early Exit
-
- In a pipeline run, if a task fails, the pipeline will stop without running the subsequent tasks.
-
- ## How It Works
-
- Duckrun runs two types of tasks:
-
- ### 1. Python Tasks
- Format: `('function_name', (arg1, arg2, ...))`
-
- Create a file `sql_folder/function_name.py` with a function matching the name:
-
- ```python
- # sql_folder/load_data.py
- def load_data(url, path):
-     # your code here
-     # IMPORTANT: Must return 1 for success, 0 for failure
-     return 1
- ```
-
- ### 2. SQL Tasks
- Format: `('table_name', 'mode')` or `('table_name', 'mode', {params})`
-
- Create a file `sql_folder/table_name.sql`:
-
- ```sql
- -- sql_folder/clean_data.sql
- SELECT
-     id,
-     TRIM(name) as name,
-     date
- FROM raw_data
- WHERE date >= '2024-01-01'
- ```
-
- **Modes:**
- - `overwrite` - Replace table completely
- - `append` - Add to existing table
- - `ignore` - Create only if doesn't exist
-
- ## Task Files
-
- The `sql_folder` can contain a mixture of both `.sql` and `.py` files. This allows you to combine SQL transformations and Python logic in your pipelines.
-
- ### SQL Files
- Your SQL files automatically have access to:
- - `$ws` - workspace name
- - `$lh` - lakehouse name
- - `$schema` - schema name
-
- Pass custom parameters:
-
- ```python
- pipeline = [
-     ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
- ]
- ```
-
- ```sql
- -- sql_folder/sales.sql
- SELECT * FROM transactions
- WHERE date BETWEEN '$start_date' AND '$end_date'
- ```
-
- ## Table Name Convention
-
- Use `__` to create variants of the same table:
-
- ```python
- pipeline = [
-     ('sales__initial', 'overwrite'),    # writes to 'sales' table
-     ('sales__incremental', 'append'),   # appends to 'sales' table
- ]
- ```
-
- Both write to the same `sales` table, but use different SQL files.
-
- ## Query Data
-
- ```python
- # Run queries
- con.sql("SELECT * FROM my_table LIMIT 10").show()
-
- # Get as DataFrame
- df = con.sql("SELECT COUNT(*) FROM sales").df()
- ```
-
- Explanation: DuckDB is connected to the lakehouse through `con`, so it is aware of the tables in that lakehouse (including tables created by your pipelines). That means you can query those tables directly with `con.sql(...)` just like any other DuckDB query. If you don't provide a `sql_folder`, you can still use `con.sql(...)` to explore existing tables.
-
-
-
- ## Remote SQL Files
-
- You can load SQL/Python files from a URL:
-
- ```python
- con = duckrun.connect(
-     workspace="Analytics",
-     lakehouse_name="Sales",
-     schema="dbo",
-     sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
- )
- ```
-
- ## Real-Life Usage
-
- For a complete, production-style example, see [fabric_demo](https://github.com/djouallah/fabric_demo).
-
- ## License
-
- MIT
duckrun-0.1.5.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
- duckrun/core.py,sha256=6kjFoxjucRgb3qV9R6ZDy1oHcLE2JXPH5E0WCDKgpZM,18031
- duckrun-0.1.5.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
- duckrun-0.1.5.dist-info/METADATA,sha256=U2NQWJKm02rG4YvgAkDCJ9434OC_sJE-cuSUHu3eSvM,4377
- duckrun-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- duckrun-0.1.5.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
- duckrun-0.1.5.dist-info/RECORD,,