duckrun 0.2.20.dev5__tar.gz → 0.2.21.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/PKG-INFO +92 -65
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/README.md +91 -64
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/writer.py +18 -6
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/PKG-INFO +92 -65
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/pyproject.toml +1 -1
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/LICENSE +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/__init__.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/auth.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/core.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/ducklake_metadata.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/files.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/notebook.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/rle.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/runner.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/stats.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/setup.cfg +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_checkpoint_format.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_ducklake_export.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_register.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_rle.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_writer_dictionary.py +0 -0
- {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_writer_integration.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.2.20.dev5
|
|
3
|
+
Version: 0.2.21.dev1
|
|
4
4
|
Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -20,13 +20,15 @@ Dynamic: license-file
|
|
|
20
20
|
|
|
21
21
|
<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
[![PyPI version](https://badge.fury.io/py/duckrun.svg)](https://badge.fury.io/py/duckrun)
|
|
24
|
+
[![Downloads](https://pepy.tech/badge/duckrun)](https://pepy.tech/project/duckrun)
|
|
25
|
+
|
|
26
|
+
A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB/Delta_rs.
|
|
24
27
|
|
|
25
28
|
## Important Notes
|
|
26
29
|
|
|
27
30
|
**Requirements:**
|
|
28
|
-
- Lakehouse
|
|
29
|
-
- **Workspace names with spaces are fully supported!** ✅
|
|
31
|
+
- Lakehouses without a schema are not supported
|
|
30
32
|
|
|
31
33
|
**Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
|
|
32
34
|
|
|
@@ -48,68 +50,110 @@ pip install duckrun[local]
|
|
|
48
50
|
|
|
49
51
|
Note: When running locally, your internet speed will be the main bottleneck.
|
|
50
52
|
|
|
51
|
-
##
|
|
52
|
-
|
|
53
|
-
### Simple Example for New Users
|
|
53
|
+
## Getting Started
|
|
54
54
|
|
|
55
55
|
```python
|
|
56
56
|
import duckrun
|
|
57
57
|
|
|
58
|
-
# Connect to a
|
|
59
|
-
con = duckrun.connect(
|
|
60
|
-
con.list_lakehouses() # See what lakehouses exist
|
|
61
|
-
con.create_lakehouse_if_not_exists('data') # Create if needed
|
|
62
|
-
|
|
63
|
-
# Connect to a specific lakehouse and query data
|
|
64
|
-
con = duckrun.connect("My Workspace/data.lakehouse/dbo")
|
|
58
|
+
# Connect to a lakehouse and start querying
|
|
59
|
+
con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
|
|
65
60
|
con.sql("SELECT * FROM my_table LIMIT 10").show()
|
|
61
|
+
|
|
62
|
+
# Write results to a new table
|
|
63
|
+
con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
|
|
66
64
|
```
|
|
67
65
|
|
|
68
|
-
|
|
66
|
+
That's it! Connect to your lakehouse and run SQL queries with DuckDB's speed.
|
|
69
67
|
|
|
68
|
+
## Core Functionalities
|
|
69
|
+
|
|
70
|
+
### 1. **Data Exploration & Querying**
|
|
71
|
+
Query Delta tables using SQL with DuckDB performance:
|
|
70
72
|
```python
|
|
71
|
-
|
|
73
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
|
|
74
|
+
con.sql("SELECT * FROM sales WHERE year = 2024").show()
|
|
75
|
+
df = con.sql("SELECT COUNT(*) FROM orders").df()
|
|
76
|
+
```
|
|
72
77
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
78
|
+
### 2. **Write to Delta Tables**
|
|
79
|
+
Use Spark-style API to write query results:
|
|
80
|
+
```python
|
|
81
|
+
con.sql("SELECT * FROM source") \
|
|
82
|
+
.write \
|
|
83
|
+
.mode("overwrite") \
|
|
84
|
+
.saveAsTable("target")
|
|
85
|
+
```
|
|
77
86
|
|
|
78
|
-
|
|
79
|
-
|
|
87
|
+
### 3. **Workspace Management**
|
|
88
|
+
List and create lakehouses:
|
|
89
|
+
```python
|
|
90
|
+
ws = duckrun.connect("My Workspace")
|
|
91
|
+
ws.list_lakehouses()
|
|
92
|
+
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
|
93
|
+
```
|
|
80
94
|
|
|
81
|
-
|
|
82
|
-
|
|
95
|
+
### 4. **File Management**
|
|
96
|
+
Upload/download files to OneLake Files:
|
|
97
|
+
```python
|
|
98
|
+
con.copy("./local_folder", "remote_folder")
|
|
99
|
+
con.download("remote_folder", "./local_folder")
|
|
100
|
+
```
|
|
83
101
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
102
|
+
### 5. **Pipeline Orchestration**
|
|
103
|
+
Run SQL and Python tasks in sequence:
|
|
104
|
+
```python
|
|
105
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
|
|
106
|
+
pipeline = [
|
|
107
|
+
('clean_data', 'overwrite'),
|
|
108
|
+
('aggregate', 'append')
|
|
109
|
+
]
|
|
110
|
+
con.run(pipeline)
|
|
111
|
+
```
|
|
87
112
|
|
|
88
|
-
|
|
89
|
-
|
|
113
|
+
### 6. **Semantic Model Deployment**
|
|
114
|
+
Deploy Power BI models with DirectLake:
|
|
115
|
+
```python
|
|
116
|
+
con.deploy("https://github.com/user/repo/model.bim")
|
|
117
|
+
con.deploy("./local_model.bim", dataset_name="Sales Model")
|
|
118
|
+
```
|
|
90
119
|
|
|
91
|
-
|
|
92
|
-
|
|
120
|
+
### 7. **Download Semantic Models**
|
|
121
|
+
Download BIM files from deployed models:
|
|
122
|
+
```python
|
|
123
|
+
bim_content = con.download_bim("Sales Model")
|
|
124
|
+
con.download_bim("Sales Model", "sales_model.bim")
|
|
125
|
+
```
|
|
93
126
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
127
|
+
### 8. **Schema Evolution & Partitioning**
|
|
128
|
+
Handle evolving schemas and optimize with partitioning:
|
|
129
|
+
```python
|
|
130
|
+
con.sql("SELECT * FROM source") \
|
|
131
|
+
.write \
|
|
132
|
+
.mode("append") \
|
|
133
|
+
.option("mergeSchema", "true") \
|
|
134
|
+
.partitionBy("region", "year") \
|
|
135
|
+
.saveAsTable("target")
|
|
97
136
|
```
|
|
98
137
|
|
|
99
|
-
|
|
138
|
+
### 9. **SQL Lookup Functions**
|
|
139
|
+
Resolve workspace/lakehouse names from GUIDs in SQL:
|
|
140
|
+
```python
|
|
141
|
+
con.sql("""
|
|
142
|
+
SELECT
|
|
143
|
+
workspace_id,
|
|
144
|
+
get_workspace_name(workspace_id) as workspace_name,
|
|
145
|
+
get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
|
|
146
|
+
FROM storage_logs
|
|
147
|
+
""").show()
|
|
148
|
+
```
|
|
100
149
|
|
|
101
150
|
## Connection Format
|
|
102
151
|
|
|
103
152
|
```python
|
|
104
|
-
#
|
|
105
|
-
ws = duckrun.connect("My Workspace")
|
|
106
|
-
ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
|
|
107
|
-
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
|
108
|
-
|
|
109
|
-
# Lakehouse connection with schema (recommended for best performance)
|
|
153
|
+
# Lakehouse connection with schema (recommended)
|
|
110
154
|
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
|
|
111
155
|
|
|
112
|
-
#
|
|
156
|
+
# Workspace names with spaces are supported!
|
|
113
157
|
con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
|
|
114
158
|
|
|
115
159
|
# Without schema (defaults to 'dbo', scans all schemas)
|
|
@@ -120,30 +164,13 @@ con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
|
|
|
120
164
|
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
|
|
121
165
|
```
|
|
122
166
|
|
|
123
|
-
|
|
167
|
+
## Detailed Usage
|
|
124
168
|
|
|
125
|
-
|
|
126
|
-
- **Default to `dbo`** for write operations
|
|
127
|
-
- **Scan all schemas** to discover and attach all Delta tables
|
|
128
|
-
- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
|
|
129
|
-
|
|
130
|
-
**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
|
|
131
|
-
|
|
132
|
-
```python
|
|
133
|
-
# Fast: scans only 'dbo' schema
|
|
134
|
-
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
|
|
135
|
-
|
|
136
|
-
# Slower: scans all schemas
|
|
137
|
-
con = duckrun.connect("workspace/lakehouse.lakehouse")
|
|
138
|
-
|
|
139
|
-
# Query tables from different schemas (when scanning all)
|
|
140
|
-
con.sql("SELECT * FROM dbo_customers").show()
|
|
141
|
-
con.sql("SELECT * FROM bronze_raw_data").show()
|
|
142
|
-
```
|
|
169
|
+
### Data Exploration
|
|
143
170
|
|
|
144
|
-
##
|
|
171
|
+
## Detailed Documentation
|
|
145
172
|
|
|
146
|
-
###
|
|
173
|
+
### Data Exploration
|
|
147
174
|
|
|
148
175
|
Perfect for ad-hoc analysis and interactive notebooks:
|
|
149
176
|
|
|
@@ -184,7 +211,7 @@ con.sql("""
|
|
|
184
211
|
|
|
185
212
|
**Note:** `.format("delta")` is optional - Delta is the default format!
|
|
186
213
|
|
|
187
|
-
###
|
|
214
|
+
### File Management (OneLake Files)
|
|
188
215
|
|
|
189
216
|
Upload and download files to/from OneLake Files section (not Delta tables):
|
|
190
217
|
|
|
@@ -215,7 +242,7 @@ con.download("daily_reports", "./reports", ['.csv'])
|
|
|
215
242
|
- ✅ **Preserves folder structure** during upload/download
|
|
216
243
|
- ✅ **Progress reporting** with file sizes and upload status
|
|
217
244
|
|
|
218
|
-
###
|
|
245
|
+
### Pipeline Orchestration
|
|
219
246
|
|
|
220
247
|
For production workflows with reusable SQL and Python tasks:
|
|
221
248
|
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[![PyPI version](https://badge.fury.io/py/duckrun.svg)](https://badge.fury.io/py/duckrun)
|
|
4
|
+
[![Downloads](https://pepy.tech/badge/duckrun)](https://pepy.tech/project/duckrun)
|
|
5
|
+
|
|
6
|
+
A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB/Delta_rs.
|
|
4
7
|
|
|
5
8
|
## Important Notes
|
|
6
9
|
|
|
7
10
|
**Requirements:**
|
|
8
|
-
- Lakehouse
|
|
9
|
-
- **Workspace names with spaces are fully supported!** ✅
|
|
11
|
+
- Lakehouses without a schema are not supported
|
|
10
12
|
|
|
11
13
|
**Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
|
|
12
14
|
|
|
@@ -28,68 +30,110 @@ pip install duckrun[local]
|
|
|
28
30
|
|
|
29
31
|
Note: When running locally, your internet speed will be the main bottleneck.
|
|
30
32
|
|
|
31
|
-
##
|
|
32
|
-
|
|
33
|
-
### Simple Example for New Users
|
|
33
|
+
## Getting Started
|
|
34
34
|
|
|
35
35
|
```python
|
|
36
36
|
import duckrun
|
|
37
37
|
|
|
38
|
-
# Connect to a
|
|
39
|
-
con = duckrun.connect(
|
|
40
|
-
con.list_lakehouses() # See what lakehouses exist
|
|
41
|
-
con.create_lakehouse_if_not_exists('data') # Create if needed
|
|
42
|
-
|
|
43
|
-
# Connect to a specific lakehouse and query data
|
|
44
|
-
con = duckrun.connect("My Workspace/data.lakehouse/dbo")
|
|
38
|
+
# Connect to a lakehouse and start querying
|
|
39
|
+
con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
|
|
45
40
|
con.sql("SELECT * FROM my_table LIMIT 10").show()
|
|
41
|
+
|
|
42
|
+
# Write results to a new table
|
|
43
|
+
con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
|
|
46
44
|
```
|
|
47
45
|
|
|
48
|
-
|
|
46
|
+
That's it! Connect to your lakehouse and run SQL queries with DuckDB's speed.
|
|
49
47
|
|
|
48
|
+
## Core Functionalities
|
|
49
|
+
|
|
50
|
+
### 1. **Data Exploration & Querying**
|
|
51
|
+
Query Delta tables using SQL with DuckDB performance:
|
|
50
52
|
```python
|
|
51
|
-
|
|
53
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
|
|
54
|
+
con.sql("SELECT * FROM sales WHERE year = 2024").show()
|
|
55
|
+
df = con.sql("SELECT COUNT(*) FROM orders").df()
|
|
56
|
+
```
|
|
52
57
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
58
|
+
### 2. **Write to Delta Tables**
|
|
59
|
+
Use Spark-style API to write query results:
|
|
60
|
+
```python
|
|
61
|
+
con.sql("SELECT * FROM source") \
|
|
62
|
+
.write \
|
|
63
|
+
.mode("overwrite") \
|
|
64
|
+
.saveAsTable("target")
|
|
65
|
+
```
|
|
57
66
|
|
|
58
|
-
|
|
59
|
-
|
|
67
|
+
### 3. **Workspace Management**
|
|
68
|
+
List and create lakehouses:
|
|
69
|
+
```python
|
|
70
|
+
ws = duckrun.connect("My Workspace")
|
|
71
|
+
ws.list_lakehouses()
|
|
72
|
+
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
|
73
|
+
```
|
|
60
74
|
|
|
61
|
-
|
|
62
|
-
|
|
75
|
+
### 4. **File Management**
|
|
76
|
+
Upload/download files to OneLake Files:
|
|
77
|
+
```python
|
|
78
|
+
con.copy("./local_folder", "remote_folder")
|
|
79
|
+
con.download("remote_folder", "./local_folder")
|
|
80
|
+
```
|
|
63
81
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
82
|
+
### 5. **Pipeline Orchestration**
|
|
83
|
+
Run SQL and Python tasks in sequence:
|
|
84
|
+
```python
|
|
85
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
|
|
86
|
+
pipeline = [
|
|
87
|
+
('clean_data', 'overwrite'),
|
|
88
|
+
('aggregate', 'append')
|
|
89
|
+
]
|
|
90
|
+
con.run(pipeline)
|
|
91
|
+
```
|
|
67
92
|
|
|
68
|
-
|
|
69
|
-
|
|
93
|
+
### 6. **Semantic Model Deployment**
|
|
94
|
+
Deploy Power BI models with DirectLake:
|
|
95
|
+
```python
|
|
96
|
+
con.deploy("https://github.com/user/repo/model.bim")
|
|
97
|
+
con.deploy("./local_model.bim", dataset_name="Sales Model")
|
|
98
|
+
```
|
|
70
99
|
|
|
71
|
-
|
|
72
|
-
|
|
100
|
+
### 7. **Download Semantic Models**
|
|
101
|
+
Download BIM files from deployed models:
|
|
102
|
+
```python
|
|
103
|
+
bim_content = con.download_bim("Sales Model")
|
|
104
|
+
con.download_bim("Sales Model", "sales_model.bim")
|
|
105
|
+
```
|
|
73
106
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
107
|
+
### 8. **Schema Evolution & Partitioning**
|
|
108
|
+
Handle evolving schemas and optimize with partitioning:
|
|
109
|
+
```python
|
|
110
|
+
con.sql("SELECT * FROM source") \
|
|
111
|
+
.write \
|
|
112
|
+
.mode("append") \
|
|
113
|
+
.option("mergeSchema", "true") \
|
|
114
|
+
.partitionBy("region", "year") \
|
|
115
|
+
.saveAsTable("target")
|
|
77
116
|
```
|
|
78
117
|
|
|
79
|
-
|
|
118
|
+
### 9. **SQL Lookup Functions**
|
|
119
|
+
Resolve workspace/lakehouse names from GUIDs in SQL:
|
|
120
|
+
```python
|
|
121
|
+
con.sql("""
|
|
122
|
+
SELECT
|
|
123
|
+
workspace_id,
|
|
124
|
+
get_workspace_name(workspace_id) as workspace_name,
|
|
125
|
+
get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
|
|
126
|
+
FROM storage_logs
|
|
127
|
+
""").show()
|
|
128
|
+
```
|
|
80
129
|
|
|
81
130
|
## Connection Format
|
|
82
131
|
|
|
83
132
|
```python
|
|
84
|
-
#
|
|
85
|
-
ws = duckrun.connect("My Workspace")
|
|
86
|
-
ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
|
|
87
|
-
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
|
88
|
-
|
|
89
|
-
# Lakehouse connection with schema (recommended for best performance)
|
|
133
|
+
# Lakehouse connection with schema (recommended)
|
|
90
134
|
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
|
|
91
135
|
|
|
92
|
-
#
|
|
136
|
+
# Workspace names with spaces are supported!
|
|
93
137
|
con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
|
|
94
138
|
|
|
95
139
|
# Without schema (defaults to 'dbo', scans all schemas)
|
|
@@ -100,30 +144,13 @@ con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
|
|
|
100
144
|
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
|
|
101
145
|
```
|
|
102
146
|
|
|
103
|
-
|
|
147
|
+
## Detailed Usage
|
|
104
148
|
|
|
105
|
-
|
|
106
|
-
- **Default to `dbo`** for write operations
|
|
107
|
-
- **Scan all schemas** to discover and attach all Delta tables
|
|
108
|
-
- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
|
|
109
|
-
|
|
110
|
-
**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
|
|
111
|
-
|
|
112
|
-
```python
|
|
113
|
-
# Fast: scans only 'dbo' schema
|
|
114
|
-
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
|
|
115
|
-
|
|
116
|
-
# Slower: scans all schemas
|
|
117
|
-
con = duckrun.connect("workspace/lakehouse.lakehouse")
|
|
118
|
-
|
|
119
|
-
# Query tables from different schemas (when scanning all)
|
|
120
|
-
con.sql("SELECT * FROM dbo_customers").show()
|
|
121
|
-
con.sql("SELECT * FROM bronze_raw_data").show()
|
|
122
|
-
```
|
|
149
|
+
### Data Exploration
|
|
123
150
|
|
|
124
|
-
##
|
|
151
|
+
## Detailed Documentation
|
|
125
152
|
|
|
126
|
-
###
|
|
153
|
+
### Data Exploration
|
|
127
154
|
|
|
128
155
|
Perfect for ad-hoc analysis and interactive notebooks:
|
|
129
156
|
|
|
@@ -164,7 +191,7 @@ con.sql("""
|
|
|
164
191
|
|
|
165
192
|
**Note:** `.format("delta")` is optional - Delta is the default format!
|
|
166
193
|
|
|
167
|
-
###
|
|
194
|
+
### File Management (OneLake Files)
|
|
168
195
|
|
|
169
196
|
Upload and download files to/from OneLake Files section (not Delta tables):
|
|
170
197
|
|
|
@@ -195,7 +222,7 @@ con.download("daily_reports", "./reports", ['.csv'])
|
|
|
195
222
|
- ✅ **Preserves folder structure** during upload/download
|
|
196
223
|
- ✅ **Progress reporting** with file sizes and upload status
|
|
197
224
|
|
|
198
|
-
###
|
|
225
|
+
### Pipeline Orchestration
|
|
199
226
|
|
|
200
227
|
For production workflows with reusable SQL and Python tasks:
|
|
201
228
|
|
|
@@ -81,11 +81,10 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
|
|
|
81
81
|
args['max_rows_per_file'] = RG
|
|
82
82
|
args['max_rows_per_group'] = RG
|
|
83
83
|
args['min_rows_per_group'] = RG
|
|
84
|
-
# Set ZSTD compression
|
|
84
|
+
# Set ZSTD compression for PyArrow engine
|
|
85
85
|
if _HAS_PYARROW_DATASET:
|
|
86
86
|
args['file_options'] = ds.ParquetFileFormat().make_write_options(
|
|
87
|
-
compression='ZSTD',
|
|
88
|
-
use_dictionary=True
|
|
87
|
+
compression='ZSTD'
|
|
89
88
|
)
|
|
90
89
|
else:
|
|
91
90
|
# Version 0.20+: no optimization available (rust by default, no row group params supported)
|
|
@@ -115,9 +114,9 @@ class DeltaWriter:
|
|
|
115
114
|
return self
|
|
116
115
|
|
|
117
116
|
def mode(self, write_mode: str):
|
|
118
|
-
"""Set write mode: 'overwrite' or '
|
|
119
|
-
if write_mode not in {"overwrite", "append"}:
|
|
120
|
-
raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
|
|
117
|
+
"""Set write mode: 'overwrite', 'append', or 'ignore'"""
|
|
118
|
+
if write_mode not in {"overwrite", "append", "ignore"}:
|
|
119
|
+
raise ValueError(f"Mode must be 'overwrite', 'append', or 'ignore', got '{write_mode}'")
|
|
121
120
|
self._mode = write_mode
|
|
122
121
|
return self
|
|
123
122
|
|
|
@@ -155,6 +154,19 @@ class DeltaWriter:
|
|
|
155
154
|
|
|
156
155
|
self.duckrun._create_onelake_secret()
|
|
157
156
|
path = f"{self.duckrun.table_base_url}{schema}/{table}"
|
|
157
|
+
|
|
158
|
+
# Handle 'ignore' mode - skip if table already exists
|
|
159
|
+
if self._mode == 'ignore':
|
|
160
|
+
try:
|
|
161
|
+
DeltaTable(path)
|
|
162
|
+
print(f"Table {schema}.{table} exists. Skipping (mode='ignore')")
|
|
163
|
+
return table
|
|
164
|
+
except Exception:
|
|
165
|
+
# Table doesn't exist, proceed with creation
|
|
166
|
+
print(f"Creating table {schema}.{table} (mode='ignore', table doesn't exist)")
|
|
167
|
+
# Change mode to 'overwrite' for actual write
|
|
168
|
+
self._mode = 'overwrite'
|
|
169
|
+
|
|
158
170
|
df = self.relation.record_batch()
|
|
159
171
|
|
|
160
172
|
# Build write arguments based on schema_mode and partition_by
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.2.20.dev5
|
|
3
|
+
Version: 0.2.21.dev1
|
|
4
4
|
Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -20,13 +20,15 @@ Dynamic: license-file
|
|
|
20
20
|
|
|
21
21
|
<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
[![PyPI version](https://badge.fury.io/py/duckrun.svg)](https://badge.fury.io/py/duckrun)
|
|
24
|
+
[![Downloads](https://pepy.tech/badge/duckrun)](https://pepy.tech/project/duckrun)
|
|
25
|
+
|
|
26
|
+
A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB/Delta_rs.
|
|
24
27
|
|
|
25
28
|
## Important Notes
|
|
26
29
|
|
|
27
30
|
**Requirements:**
|
|
28
|
-
- Lakehouse
|
|
29
|
-
- **Workspace names with spaces are fully supported!** ✅
|
|
31
|
+
- Lakehouses without a schema are not supported
|
|
30
32
|
|
|
31
33
|
**Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
|
|
32
34
|
|
|
@@ -48,68 +50,110 @@ pip install duckrun[local]
|
|
|
48
50
|
|
|
49
51
|
Note: When running locally, your internet speed will be the main bottleneck.
|
|
50
52
|
|
|
51
|
-
##
|
|
52
|
-
|
|
53
|
-
### Simple Example for New Users
|
|
53
|
+
## Getting Started
|
|
54
54
|
|
|
55
55
|
```python
|
|
56
56
|
import duckrun
|
|
57
57
|
|
|
58
|
-
# Connect to a
|
|
59
|
-
con = duckrun.connect(
|
|
60
|
-
con.list_lakehouses() # See what lakehouses exist
|
|
61
|
-
con.create_lakehouse_if_not_exists('data') # Create if needed
|
|
62
|
-
|
|
63
|
-
# Connect to a specific lakehouse and query data
|
|
64
|
-
con = duckrun.connect("My Workspace/data.lakehouse/dbo")
|
|
58
|
+
# Connect to a lakehouse and start querying
|
|
59
|
+
con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
|
|
65
60
|
con.sql("SELECT * FROM my_table LIMIT 10").show()
|
|
61
|
+
|
|
62
|
+
# Write results to a new table
|
|
63
|
+
con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
|
|
66
64
|
```
|
|
67
65
|
|
|
68
|
-
|
|
66
|
+
That's it! Connect to your lakehouse and run SQL queries with DuckDB's speed.
|
|
69
67
|
|
|
68
|
+
## Core Functionalities
|
|
69
|
+
|
|
70
|
+
### 1. **Data Exploration & Querying**
|
|
71
|
+
Query Delta tables using SQL with DuckDB performance:
|
|
70
72
|
```python
|
|
71
|
-
|
|
73
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
|
|
74
|
+
con.sql("SELECT * FROM sales WHERE year = 2024").show()
|
|
75
|
+
df = con.sql("SELECT COUNT(*) FROM orders").df()
|
|
76
|
+
```
|
|
72
77
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
78
|
+
### 2. **Write to Delta Tables**
|
|
79
|
+
Use Spark-style API to write query results:
|
|
80
|
+
```python
|
|
81
|
+
con.sql("SELECT * FROM source") \
|
|
82
|
+
.write \
|
|
83
|
+
.mode("overwrite") \
|
|
84
|
+
.saveAsTable("target")
|
|
85
|
+
```
|
|
77
86
|
|
|
78
|
-
|
|
79
|
-
|
|
87
|
+
### 3. **Workspace Management**
|
|
88
|
+
List and create lakehouses:
|
|
89
|
+
```python
|
|
90
|
+
ws = duckrun.connect("My Workspace")
|
|
91
|
+
ws.list_lakehouses()
|
|
92
|
+
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
|
93
|
+
```
|
|
80
94
|
|
|
81
|
-
|
|
82
|
-
|
|
95
|
+
### 4. **File Management**
|
|
96
|
+
Upload/download files to OneLake Files:
|
|
97
|
+
```python
|
|
98
|
+
con.copy("./local_folder", "remote_folder")
|
|
99
|
+
con.download("remote_folder", "./local_folder")
|
|
100
|
+
```
|
|
83
101
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
102
|
+
### 5. **Pipeline Orchestration**
|
|
103
|
+
Run SQL and Python tasks in sequence:
|
|
104
|
+
```python
|
|
105
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
|
|
106
|
+
pipeline = [
|
|
107
|
+
('clean_data', 'overwrite'),
|
|
108
|
+
('aggregate', 'append')
|
|
109
|
+
]
|
|
110
|
+
con.run(pipeline)
|
|
111
|
+
```
|
|
87
112
|
|
|
88
|
-
|
|
89
|
-
|
|
113
|
+
### 6. **Semantic Model Deployment**
|
|
114
|
+
Deploy Power BI models with DirectLake:
|
|
115
|
+
```python
|
|
116
|
+
con.deploy("https://github.com/user/repo/model.bim")
|
|
117
|
+
con.deploy("./local_model.bim", dataset_name="Sales Model")
|
|
118
|
+
```
|
|
90
119
|
|
|
91
|
-
|
|
92
|
-
|
|
120
|
+
### 7. **Download Semantic Models**
|
|
121
|
+
Download BIM files from deployed models:
|
|
122
|
+
```python
|
|
123
|
+
bim_content = con.download_bim("Sales Model")
|
|
124
|
+
con.download_bim("Sales Model", "sales_model.bim")
|
|
125
|
+
```
|
|
93
126
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
127
|
+
### 8. **Schema Evolution & Partitioning**
|
|
128
|
+
Handle evolving schemas and optimize with partitioning:
|
|
129
|
+
```python
|
|
130
|
+
con.sql("SELECT * FROM source") \
|
|
131
|
+
.write \
|
|
132
|
+
.mode("append") \
|
|
133
|
+
.option("mergeSchema", "true") \
|
|
134
|
+
.partitionBy("region", "year") \
|
|
135
|
+
.saveAsTable("target")
|
|
97
136
|
```
|
|
98
137
|
|
|
99
|
-
|
|
138
|
+
### 9. **SQL Lookup Functions**
|
|
139
|
+
Resolve workspace/lakehouse names from GUIDs in SQL:
|
|
140
|
+
```python
|
|
141
|
+
con.sql("""
|
|
142
|
+
SELECT
|
|
143
|
+
workspace_id,
|
|
144
|
+
get_workspace_name(workspace_id) as workspace_name,
|
|
145
|
+
get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
|
|
146
|
+
FROM storage_logs
|
|
147
|
+
""").show()
|
|
148
|
+
```
|
|
100
149
|
|
|
101
150
|
## Connection Format
|
|
102
151
|
|
|
103
152
|
```python
|
|
104
|
-
#
|
|
105
|
-
ws = duckrun.connect("My Workspace")
|
|
106
|
-
ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
|
|
107
|
-
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
|
108
|
-
|
|
109
|
-
# Lakehouse connection with schema (recommended for best performance)
|
|
153
|
+
# Lakehouse connection with schema (recommended)
|
|
110
154
|
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
|
|
111
155
|
|
|
112
|
-
#
|
|
156
|
+
# Workspace names with spaces are supported!
|
|
113
157
|
con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
|
|
114
158
|
|
|
115
159
|
# Without schema (defaults to 'dbo', scans all schemas)
|
|
@@ -120,30 +164,13 @@ con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
|
|
|
120
164
|
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
|
|
121
165
|
```
|
|
122
166
|
|
|
123
|
-
|
|
167
|
+
## Detailed Usage
|
|
124
168
|
|
|
125
|
-
|
|
126
|
-
- **Default to `dbo`** for write operations
|
|
127
|
-
- **Scan all schemas** to discover and attach all Delta tables
|
|
128
|
-
- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
|
|
129
|
-
|
|
130
|
-
**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
|
|
131
|
-
|
|
132
|
-
```python
|
|
133
|
-
# Fast: scans only 'dbo' schema
|
|
134
|
-
con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
|
|
135
|
-
|
|
136
|
-
# Slower: scans all schemas
|
|
137
|
-
con = duckrun.connect("workspace/lakehouse.lakehouse")
|
|
138
|
-
|
|
139
|
-
# Query tables from different schemas (when scanning all)
|
|
140
|
-
con.sql("SELECT * FROM dbo_customers").show()
|
|
141
|
-
con.sql("SELECT * FROM bronze_raw_data").show()
|
|
142
|
-
```
|
|
169
|
+
### Data Exploration
|
|
143
170
|
|
|
144
|
-
##
|
|
171
|
+
## Detailed Documentation
|
|
145
172
|
|
|
146
|
-
###
|
|
173
|
+
### Data Exploration
|
|
147
174
|
|
|
148
175
|
Perfect for ad-hoc analysis and interactive notebooks:
|
|
149
176
|
|
|
@@ -184,7 +211,7 @@ con.sql("""
|
|
|
184
211
|
|
|
185
212
|
**Note:** `.format("delta")` is optional - Delta is the default format!
|
|
186
213
|
|
|
187
|
-
###
|
|
214
|
+
### File Management (OneLake Files)
|
|
188
215
|
|
|
189
216
|
Upload and download files to/from OneLake Files section (not Delta tables):
|
|
190
217
|
|
|
@@ -215,7 +242,7 @@ con.download("daily_reports", "./reports", ['.csv'])
|
|
|
215
242
|
- ✅ **Preserves folder structure** during upload/download
|
|
216
243
|
- ✅ **Progress reporting** with file sizes and upload status
|
|
217
244
|
|
|
218
|
-
###
|
|
245
|
+
### Pipeline Orchestration
|
|
219
246
|
|
|
220
247
|
For production workflows with reusable SQL and Python tasks:
|
|
221
248
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.20.dev5"
|
|
7
|
+
version = "0.2.21.dev1"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|