duckrun 0.2.10.dev1__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of duckrun might be problematic.
- duckrun/core.py +0 -26
- duckrun-0.2.11.dist-info/METADATA +1367 -0
- {duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/RECORD +6 -6
- duckrun-0.2.10.dev1.dist-info/METADATA +0 -653
- {duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/WHEEL +0 -0
- {duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/top_level.txt +0 -0

{duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/RECORD

@@ -1,14 +1,14 @@
 duckrun/__init__.py,sha256=cTj6KQ6hKmgu1z7k9nhDcO5lct049luxjx1V0QnymCo,235
 duckrun/auth.py,sha256=qPaLQ7InlV9leA9r6E6VEeYavFFoBi0zSN8m_l1aoQs,9545
-duckrun/core.py,sha256=
+duckrun/core.py,sha256=3usRl9SetUytVFzCzbpiFXppTjHzwTqFlSEKnUSbcK8,52460
 duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
 duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
 duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
 duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
 duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
 duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
-duckrun-0.2.
-duckrun-0.2.
-duckrun-0.2.
-duckrun-0.2.
-duckrun-0.2.
+duckrun-0.2.11.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.11.dist-info/METADATA,sha256=gmMgCIUivM7CCtENbLv9RBPpkU-I6bpoAaZ7EkX07PM,39613
+duckrun-0.2.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.11.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.11.dist-info/RECORD,,

duckrun-0.2.10.dev1.dist-info/METADATA

@@ -1,653 +0,0 @@
Metadata-Version: 2.4
Name: duckrun
Version: 0.2.10.dev1
Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
Author: mim
License: MIT
Project-URL: Homepage, https://github.com/djouallah/duckrun
Project-URL: Repository, https://github.com/djouallah/duckrun
Project-URL: Issues, https://github.com/djouallah/duckrun/issues
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: duckdb>=1.2.0
Requires-Dist: deltalake<=0.18.2
Requires-Dist: requests>=2.28.0
Requires-Dist: obstore>=0.2.0
Provides-Extra: local
Requires-Dist: azure-identity>=1.12.0; extra == "local"
Dynamic: license-file

<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">

A helper package for stuff that made my life easier when working with Fabric Python notebooks. Just the things that actually made sense to me - nothing fancy.

## Important Notes

**Requirements:**
- Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
- **Workspace names with spaces are fully supported!** ✅

**Delta Lake Version:** This package pins an older version of deltalake to keep row group size control, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
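
As an illustration of the row-group control in question, here is a minimal, hypothetical sketch that calls the pinned deltalake writer directly; the parameter values are illustrative, the local path is a placeholder, and duckrun's internal settings may differ:

```python
import pyarrow as pa
from deltalake import write_deltalake  # deltalake<=0.18.2 (pyarrow-based writer)

data = pa.table({"id": list(range(1_000_000))})

write_deltalake(
    "./demo_delta_table",          # in Fabric this would be a OneLake table path
    data,
    mode="overwrite",
    max_rows_per_file=8_000_000,   # fewer, larger files
    min_rows_per_group=8_000_000,  # large row groups, which DirectLake prefers
    max_rows_per_group=8_000_000,
)
```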

## What It Does

It does orchestration, arbitrary SQL statements, and file manipulation. That's it - just stuff I encounter in my daily workflow when working with Fabric notebooks.

## Installation

```bash
pip install duckrun
```

For local usage (note: when running locally, your internet speed will be the main bottleneck):

```bash
pip install duckrun[local]
```

## Quick Start

### Simple Example for New Users

```python
import duckrun

# Connect to a workspace and manage lakehouses
con = duckrun.connect('My Workspace')
con.list_lakehouses()                        # See what lakehouses exist
con.create_lakehouse_if_not_exists('data')   # Create if needed

# Connect to a specific lakehouse and query data
con = duckrun.connect("My Workspace/data.lakehouse/dbo")
con.sql("SELECT * FROM my_table LIMIT 10").show()
```

### Full Feature Overview

```python
import duckrun

# 1. Workspace Management (list and create lakehouses)
ws = duckrun.connect("My Workspace")
lakehouses = ws.list_lakehouses()  # Returns list of lakehouse names
ws.create_lakehouse_if_not_exists("New_Lakehouse")

# 2. Connect to a lakehouse with a specific schema
con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")

# Workspace names with spaces are supported!
con = duckrun.connect("Data Analytics/SalesData.lakehouse/analytics")

# Schema defaults to 'dbo' if not specified (scans all schemas)
# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
con = duckrun.connect("My Workspace/My_Lakehouse.lakehouse")

# 3. Explore data
con.sql("SELECT * FROM my_table LIMIT 10").show()

# 4. Write to Delta tables (Spark-style API)
con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")

# 5. Upload/download files to/from OneLake Files
con.copy("./local_folder", "target_folder")    # Upload files
con.download("target_folder", "./downloaded")  # Download files
```

That's it! No `sql_folder` needed for data exploration.

## Connection Format

```python
# Workspace management (list and create lakehouses)
ws = duckrun.connect("My Workspace")
ws.list_lakehouses()  # Returns: ['lakehouse1', 'lakehouse2', ...]
ws.create_lakehouse_if_not_exists("New Lakehouse")

# Lakehouse connection with schema (recommended for best performance)
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

# Supports workspace names with spaces!
con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")

# Without schema (defaults to 'dbo', scans all schemas)
# ⚠️ This can be slow for large lakehouses!
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")

# With SQL folder for pipeline orchestration
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
```

### Multi-Schema Support

When you don't specify a schema, Duckrun will:
- **Default to `dbo`** for write operations
- **Scan all schemas** to discover and attach all Delta tables
- **Prefix table names** with the schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)

**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.

```python
# Fast: scans only the 'dbo' schema
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")

# Slower: scans all schemas
con = duckrun.connect("workspace/lakehouse.lakehouse")

# Query tables from different schemas (when scanning all)
con.sql("SELECT * FROM dbo_customers").show()
con.sql("SELECT * FROM bronze_raw_data").show()
```

## Three Ways to Use Duckrun

### 1. Data Exploration (Spark-Style API)

Perfect for ad-hoc analysis and interactive notebooks:

```python
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")

# Query existing tables
con.sql("SELECT * FROM sales WHERE year = 2024").show()

# Get a DataFrame
df = con.sql("SELECT COUNT(*) FROM orders").df()

# Write results to Delta tables
con.sql("""
    SELECT
        customer_id,
        SUM(amount) as total
    FROM orders
    GROUP BY customer_id
""").write.mode("overwrite").saveAsTable("customer_totals")

# Append mode
con.sql("SELECT * FROM new_orders").write.mode("append").saveAsTable("orders")

# Schema evolution and partitioning (exact Spark API compatibility)
con.sql("""
    SELECT
        customer_id,
        order_date,
        region,
        product_category,
        sales_amount,
        new_column_added_later  -- This column might not exist in the target table
    FROM source_table
""").write \
    .mode("append") \
    .option("mergeSchema", "true") \
    .partitionBy("region", "product_category") \
    .saveAsTable("sales_partitioned")
```

**Note:** `.format("delta")` is optional - Delta is the default format!
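
For instance, the two writes below behave the same (a small illustrative sketch reusing the `con` object from the examples above):

```python
# Explicit format - same behavior as the default
con.sql("SELECT 42 AS answer").write.format("delta").mode("overwrite").saveAsTable("demo")

# Delta is assumed when .format() is omitted
con.sql("SELECT 42 AS answer").write.mode("overwrite").saveAsTable("demo")
```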

### 2. File Management (OneLake Files)

Upload and download files to/from the OneLake Files section (not Delta tables):

```python
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")

# Upload files to OneLake Files (remote_folder is required)
con.copy("./local_data", "uploaded_data")

# Upload only specific file types
con.copy("./reports", "daily_reports", ['.csv', '.parquet'])

# Upload with overwrite enabled (default is False for safety)
con.copy("./backup", "backups", overwrite=True)

# Download files from OneLake Files
con.download("uploaded_data", "./downloaded")

# Download only CSV files from a specific folder
con.download("daily_reports", "./reports", ['.csv'])
```

**Key Features:**
- ✅ **Files go to the OneLake Files section** (not Delta Tables)
- ✅ **`remote_folder` parameter is required** for uploads (prevents accidental uploads)
- ✅ **`overwrite=False` by default** (safer - prevents accidental overwrites)
- ✅ **File extension filtering** (e.g., only `.csv` or `.parquet` files)
- ✅ **Preserves folder structure** during upload/download
- ✅ **Progress reporting** with file sizes and upload status

### 3. Pipeline Orchestration

For production workflows with reusable SQL and Python tasks:

```python
con = duckrun.connect(
    "my_workspace/my_lakehouse.lakehouse/dbo",
    sql_folder="./sql"  # folder with .sql and .py files
)

# Define the pipeline
pipeline = [
    ('download_data', (url, path)),  # Python task
    ('clean_data', 'overwrite'),     # SQL task
    ('aggregate', 'append')          # SQL task
]

# Run it
con.run(pipeline)
```

## Pipeline Tasks

### Python Tasks

**Format:** `('function_name', (arg1, arg2, ...))`

Create `sql_folder/function_name.py`:

```python
# sql_folder/download_data.py
def download_data(url, path):
    # your code here
    return 1  # 1 = success, 0 = failure
```

### SQL Tasks

**Formats:**
- `('table_name', 'mode')` - Simple SQL with no parameters
- `('table_name', 'mode', {params})` - SQL with template parameters
- `('table_name', 'mode', {params}, {delta_options})` - SQL with Delta Lake options

Create `sql_folder/table_name.sql`:

```sql
-- sql_folder/clean_data.sql
SELECT
    id,
    TRIM(name) as name,
    date
FROM raw_data
WHERE date >= '2024-01-01'
```

**Write Modes:**
- `overwrite` - Replace the table completely
- `append` - Add rows to the existing table
- `ignore` - Create the table only if it doesn't exist (see the sketch below)
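
A minimal pipeline sketch showing the three modes together (`dim_date` is a hypothetical one-off reference table):

```python
pipeline = [
    ('dim_date', 'ignore'),        # created once; left alone on later runs
    ('clean_data', 'overwrite'),   # rebuilt completely on every run
    ('aggregate', 'append'),       # new rows added to the existing table
]
con.run(pipeline)
```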

### Parameterized SQL

Built-in parameters (always available):
- `$ws` - workspace name
- `$lh` - lakehouse name
- `$schema` - schema name

Custom parameters:

```python
pipeline = [
    ('sales', 'append', {'start_date': '2024-01-01', 'end_date': '2024-12-31'})
]
```

```sql
-- sql_folder/sales.sql
SELECT * FROM transactions
WHERE date BETWEEN '$start_date' AND '$end_date'
```

### Delta Lake Options (Schema Evolution & Partitioning)

Use the 4-tuple format for advanced Delta Lake features:

```python
pipeline = [
    # SQL with empty params but Delta options
    ('evolving_table', 'append', {}, {'mergeSchema': 'true'}),

    # SQL with both params AND Delta options
    ('sales_data', 'append',
     {'region': 'North America'},
     {'mergeSchema': 'true', 'partitionBy': ['region', 'year']}),

    # Partitioning without schema merging
    ('time_series', 'overwrite',
     {'start_date': '2024-01-01'},
     {'partitionBy': ['year', 'month']})
]
```

**Available Delta Options:**
- `mergeSchema: 'true'` - Automatically handle schema evolution (new columns)
- `partitionBy: ['col1', 'col2']` - Partition data by the specified columns

## Advanced Features

### Schema Evolution & Partitioning

Handle evolving schemas and optimize query performance with partitioning:

```python
# Using the Spark-style API
con.sql("""
    SELECT
        customer_id,
        region,
        product_category,
        sales_amount,
        -- New column that might not exist in the target table
        discount_percentage
    FROM raw_sales
""").write \
    .mode("append") \
    .option("mergeSchema", "true") \
    .partitionBy("region", "product_category") \
    .saveAsTable("sales_partitioned")

# Using the pipeline format
pipeline = [
    ('sales_summary', 'append',
     {'batch_date': '2024-10-07'},
     {'mergeSchema': 'true', 'partitionBy': ['region', 'year']})
]
```

**Benefits:**
- 🔄 **Schema Evolution**: Automatically handles new columns without breaking existing queries
- ⚡ **Query Performance**: Partitioning improves performance for filtered queries

### Table Name Variants

Use `__` to create multiple tasks that write to the same table:

```python
pipeline = [
    ('sales__initial', 'overwrite'),    # writes to 'sales'
    ('sales__incremental', 'append'),   # appends to 'sales'
]
```

Both tasks write to the `sales` table but use different SQL files (`sales__initial.sql` and `sales__incremental.sql`).

### Remote SQL Files

Load tasks from GitHub or any URL:

```python
con = duckrun.connect(
    "Analytics/Sales.lakehouse/dbo",
    sql_folder="https://raw.githubusercontent.com/user/repo/main/sql"
)
```

### Early Exit on Failure

**Pipelines automatically stop when any task fails** - subsequent tasks won't run.

For **SQL tasks**, failure is automatic:
- If the query has a syntax error or runtime error, the task fails
- The pipeline stops immediately

For **Python tasks**, you control success/failure by returning:
- `1` = Success → pipeline continues to the next task
- `0` = Failure → pipeline stops, remaining tasks are skipped

Example:

```python
# sql_folder/download_data.py
import requests

def download_data(url, path):
    try:
        response = requests.get(url)
        response.raise_for_status()
        # save data...
        return 1  # Success - pipeline continues
    except Exception as e:
        print(f"Download failed: {e}")
        return 0  # Failure - pipeline stops here
```

```python
pipeline = [
    ('download_data', (url, path)),  # If it returns 0, the pipeline stops here
    ('clean_data', 'overwrite'),     # Won't run if the download failed
    ('aggregate', 'append')          # Won't run if the download failed
]

success = con.run(pipeline)  # Returns True only if ALL tasks succeed
```

This prevents downstream tasks from processing incomplete or corrupted data.
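
Since `run()` returns a boolean, the result can gate anything that depends on the pipeline (a minimal sketch; the error message is just a placeholder):

```python
success = con.run(pipeline)
if not success:
    raise RuntimeError("duckrun pipeline failed - skipping downstream jobs")
```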

### Semantic Model Deployment

Deploy Power BI semantic models directly from BIM files using DirectLake mode:

```python
# Connect to the lakehouse
con = duckrun.connect("Analytics/Sales.lakehouse/dbo")

# Deploy with an auto-generated name (lakehouse_schema)
con.deploy("https://raw.githubusercontent.com/user/repo/main/model.bim")

# Deploy with a custom name
con.deploy(
    "https://raw.githubusercontent.com/user/repo/main/sales_model.bim",
    dataset_name="Sales Analytics Model",
    wait_seconds=10  # Wait for permission propagation
)
```

**Features:**
- 🚀 **DirectLake Mode**: Deploys semantic models with a DirectLake connection
- 🔄 **Automatic Configuration**: Auto-configures workspace, lakehouse, and schema connections
- 📦 **BIM from URL**: Load model definitions from GitHub or any accessible URL
- ⏱️ **Permission Handling**: Configurable wait time for permission propagation

**Use Cases:**
- Deploy semantic models as part of CI/CD pipelines
- Version control your semantic models in Git
- Automate model deployment across environments
- Streamline DirectLake model creation

### Delta Lake Optimization

Duckrun automatically:
- Compacts small files when the file count exceeds a threshold (default: 100)
- Vacuums old versions on overwrite
- Cleans up metadata

Customize the compaction threshold:

```python
con = duckrun.connect(
    "workspace/lakehouse.lakehouse/dbo",
    compaction_threshold=50  # compact after 50 files
)
```

## File Management API Reference

### `copy(local_folder, remote_folder, file_extensions=None, overwrite=False)`

Upload files from a local folder to the OneLake Files section.

**Parameters:**
- `local_folder` (str): Path to the local folder containing files to upload
- `remote_folder` (str): **Required** target folder path in OneLake Files
- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.parquet']`)
- `overwrite` (bool, optional): Whether to overwrite existing files (default: False)

**Returns:** `True` if all files uploaded successfully, `False` otherwise

**Examples:**
```python
# Upload all files to a target folder
con.copy("./data", "processed_data")

# Upload only CSV and Parquet files
con.copy("./reports", "monthly_reports", ['.csv', '.parquet'])

# Upload with overwrite enabled
con.copy("./backup", "daily_backup", overwrite=True)
```

### `download(remote_folder="", local_folder="./downloaded_files", file_extensions=None, overwrite=False)`

Download files from the OneLake Files section to a local folder.

**Parameters:**
- `remote_folder` (str, optional): Source folder path in OneLake Files (default: root)
- `local_folder` (str, optional): Local destination folder (default: "./downloaded_files")
- `file_extensions` (list, optional): Filter by file extensions (e.g., `['.csv', '.json']`)
- `overwrite` (bool, optional): Whether to overwrite existing local files (default: False)

**Returns:** `True` if all files downloaded successfully, `False` otherwise

**Examples:**
```python
# Download all files from the OneLake Files root
con.download()

# Download from a specific folder
con.download("processed_data", "./local_data")

# Download only JSON files
con.download("config", "./configs", ['.json'])
```

**Important Notes:**
- Files are uploaded/downloaded to/from the **OneLake Files section**, not Delta Tables
- The `remote_folder` parameter is **required** for uploads to prevent accidental uploads
- Both methods default to `overwrite=False` for safety
- Folder structure is preserved during upload/download operations
- Progress is reported with file names, sizes, and upload/download status

## Complete Example

```python
import duckrun

# Connect (specify a schema for best performance)
con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")

# 1. Upload raw data files to OneLake Files
con.copy("./raw_data", "raw_uploads", ['.csv', '.json'])

# 2. Pipeline with mixed tasks
pipeline = [
    # Download raw data (Python)
    ('fetch_api_data', ('https://api.example.com/sales', 'raw')),

    # Clean and transform (SQL)
    ('clean_sales', 'overwrite'),

    # Aggregate by region (SQL with params)
    ('regional_summary', 'overwrite', {'min_amount': 1000}),

    # Append to history with schema evolution (SQL with Delta options)
    ('sales_history', 'append', {}, {'mergeSchema': 'true', 'partitionBy': ['year', 'region']})
]

# Run the pipeline
success = con.run(pipeline)

# 3. Explore results using DuckDB
con.sql("SELECT * FROM regional_summary").show()

# 4. Export to a new Delta table
con.sql("""
    SELECT region, SUM(total) as grand_total
    FROM regional_summary
    GROUP BY region
""").write.mode("overwrite").saveAsTable("region_totals")

# 5. Download processed files for external systems
con.download("processed_reports", "./exports", ['.csv'])

# 6. Deploy a semantic model for Power BI
con.deploy(
    "https://raw.githubusercontent.com/user/repo/main/sales_model.bim",
    dataset_name="Sales Analytics"
)
```

**This example demonstrates:**
- 📁 **File uploads** to the OneLake Files section
- 🔄 **Pipeline orchestration** with SQL and Python tasks
- ⚡ **Fast data exploration** with DuckDB
- 💾 **Delta table creation** with the Spark-style API
- 🔀 **Schema evolution** and partitioning
- 📤 **File downloads** from OneLake Files
- 📊 **Semantic model deployment** with DirectLake

## Schema Evolution & Partitioning Guide

### When to Use Schema Evolution

Use `mergeSchema: 'true'` when:
- Adding new columns to existing tables
- Source data schemas change over time
- Working with evolving data pipelines
- Maintaining backward compatibility

### When to Use Partitioning

Use `partitionBy` when:
- Queries frequently filter by specific columns (dates, regions, categories)
- Tables are large and need performance optimization
- You want to organize data logically for maintenance

### Best Practices

```python
# ✅ Good: Partition by commonly filtered columns
.partitionBy("year", "region")  # Often filtered: WHERE year = 2024 AND region = 'US'

# ❌ Avoid: High-cardinality partitions
.partitionBy("customer_id")  # Creates too many small partitions

# ✅ Good: Schema evolution for append operations
.mode("append").option("mergeSchema", "true")

# ✅ Good: Combined approach for data lakes
pipeline = [
    ('daily_sales', 'append',
     {'batch_date': '2024-10-07'},
     {'mergeSchema': 'true', 'partitionBy': ['year', 'month', 'region']})
]
```

### Task Format Reference

```python
# 2-tuple: Simple SQL/Python
('task_name', 'mode')        # SQL: no params, no Delta options
('function_name', (args))    # Python: function with arguments

# 3-tuple: SQL with parameters
('task_name', 'mode', {'param': 'value'})

# 4-tuple: SQL with parameters AND Delta options
('task_name', 'mode', {'param': 'value'}, {'mergeSchema': 'true', 'partitionBy': ['col']})

# 4-tuple: Empty parameters but Delta options
('task_name', 'mode', {}, {'mergeSchema': 'true'})
```

## How It Works

1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views (sketched below)
3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
4. **Write Operations**: Results are written back as Delta tables with automatic optimization
5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
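
The table-discovery step (2) can be pictured with a small, hypothetical sketch; this is not duckrun's actual code, and the table path, storage authentication, and helper function are assumptions, but it shows the general idea of exposing each Delta table to DuckDB as a view:

```python
import duckdb
from deltalake import DeltaTable  # deltalake<=0.18.2, as pinned by duckrun

def attach_delta_tables(con, tables):
    """Register each Delta table as a DuckDB view named '<schema>_<table>'."""
    for view_name, table_uri in tables.items():
        # Lazy Arrow dataset over the Delta log (authentication options omitted here)
        dataset = DeltaTable(table_uri).to_pyarrow_dataset()
        con.register(f"{view_name}_dataset", dataset)
        con.execute(f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {view_name}_dataset")

con = duckdb.connect()
attach_delta_tables(con, {"dbo_customers": "/path/to/lakehouse/Tables/dbo/customers"})
con.sql("SELECT COUNT(*) FROM dbo_customers").show()
```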

## Real-World Example

For a complete production example, see [fabric_demo](https://github.com/djouallah/fabric_demo).

## License

MIT

{duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/WHEEL: file without changes
{duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/licenses/LICENSE: file without changes
{duckrun-0.2.10.dev1.dist-info → duckrun-0.2.11.dist-info}/top_level.txt: file without changes