duckrun 0.1.5.5.tar.gz → 0.1.5.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/PKG-INFO +30 -9
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/README.md +29 -8
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun/core.py +81 -36
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/PKG-INFO +30 -9
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/pyproject.toml +1 -1
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/LICENSE +0 -0
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun/__init__.py +0 -0
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/setup.cfg +0 -0
{duckrun-0.1.5.5 → duckrun-0.1.5.6}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.5
+Version: 0.1.5.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License-Expression: MIT
@@ -15,7 +15,7 @@ Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
 Dynamic: license-file
 
-<img src="duckrun.png" width="400" alt="Duckrun">
+<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
 
@@ -38,10 +38,11 @@ pip install duckrun
 ```python
 import duckrun
 
-# Connect to your Fabric lakehouse
+# Connect to your Fabric lakehouse with a specific schema
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
 
-# Schema defaults to 'dbo' if not specified
+# Schema defaults to 'dbo' if not specified (scans all schemas)
+# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
 
 # Explore data
@@ -56,17 +57,37 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-# With schema
+# With schema (recommended for better performance)
 con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
 
-# Without schema (
+# Without schema (defaults to 'dbo', scans all schemas)
+# ⚠️ This can be slow for large lakehouses!
 con = duckrun.connect("workspace/lakehouse.lakehouse")
 
 # With options
 con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
-
+### Multi-Schema Support
+
+When you don't specify a schema, Duckrun will:
+- **Default to `dbo`** for write operations
+- **Scan all schemas** to discover and attach all Delta tables
+- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
+
+**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
+
+```python
+# Fast: scans only 'dbo' schema
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Slower: scans all schemas
+con = duckrun.connect("workspace/lakehouse.lakehouse")
+
+# Query tables from different schemas (when scanning all)
+con.sql("SELECT * FROM dbo_customers").show()
+con.sql("SELECT * FROM bronze_raw_data").show()
+```
 
 ## Two Ways to Use Duckrun
 
@@ -262,7 +283,7 @@ con = duckrun.connect(
 ```python
 import duckrun
 
-# Connect
+# Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
 # Pipeline with mixed tasks
@@ -297,7 +318,7 @@ con.sql("""
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
-2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views
 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
````
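The multi-schema behavior described in the new README section can be tried locally without a Fabric lakehouse. Below is a minimal sketch, assuming DuckDB's delta extension is available (recent DuckDB versions autoload it) and using the declared `deltalake` dependency; the paths and table names are hypothetical stand-ins for OneLake's `Tables/<schema>/<table>` layout:

```python
import duckdb
import pandas as pd
from deltalake import write_deltalake

# Hypothetical local stand-ins for two lakehouse tables in different
# schemas, mirroring OneLake's Tables/<schema>/<table> layout.
write_deltalake("/tmp/Tables/dbo/customers", pd.DataFrame({"id": [1, 2]}))
write_deltalake("/tmp/Tables/bronze/raw_data", pd.DataFrame({"value": [10]}))

con = duckdb.connect()
for schema, table in [("dbo", "customers"), ("bronze", "raw_data")]:
    # Schema-prefixed view names avoid collisions, as duckrun now does
    # when scanning all schemas.
    con.sql(f"""
        CREATE OR REPLACE VIEW {schema}_{table}
        AS SELECT * FROM delta_scan('/tmp/Tables/{schema}/{table}')
    """)

con.sql("SELECT * FROM dbo_customers").show()
con.sql("SELECT * FROM bronze_raw_data").show()
```

The schema prefix in the view name is what lets `dbo.customers` and a hypothetical `bronze.customers` coexist in a single flat DuckDB catalog.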
{duckrun-0.1.5.5 → duckrun-0.1.5.6}/README.md

````diff
@@ -1,4 +1,4 @@
-<img src="duckrun.png" width="400" alt="Duckrun">
+<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
 
@@ -21,10 +21,11 @@ pip install duckrun
 ```python
 import duckrun
 
-# Connect to your Fabric lakehouse
+# Connect to your Fabric lakehouse with a specific schema
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
 
-# Schema defaults to 'dbo' if not specified
+# Schema defaults to 'dbo' if not specified (scans all schemas)
+# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
 
 # Explore data
@@ -39,17 +40,37 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-# With schema
+# With schema (recommended for better performance)
 con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
 
-# Without schema (
+# Without schema (defaults to 'dbo', scans all schemas)
+# ⚠️ This can be slow for large lakehouses!
 con = duckrun.connect("workspace/lakehouse.lakehouse")
 
 # With options
 con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
-
+### Multi-Schema Support
+
+When you don't specify a schema, Duckrun will:
+- **Default to `dbo`** for write operations
+- **Scan all schemas** to discover and attach all Delta tables
+- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
+
+**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
+
+```python
+# Fast: scans only 'dbo' schema
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Slower: scans all schemas
+con = duckrun.connect("workspace/lakehouse.lakehouse")
+
+# Query tables from different schemas (when scanning all)
+con.sql("SELECT * FROM dbo_customers").show()
+con.sql("SELECT * FROM bronze_raw_data").show()
+```
 
 ## Two Ways to Use Duckrun
 
@@ -245,7 +266,7 @@ con = duckrun.connect(
 ```python
 import duckrun
 
-# Connect
+# Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
 # Pipeline with mixed tasks
@@ -280,7 +301,7 @@ con.sql("""
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
-2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views
 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
````
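The `duckrun/core.py` diff below implements the connection-string handling behind these README examples. As a rough standalone sketch of just the parsing rules (a hypothetical helper; the real `connect` classmethod also accepts the traditional multi-argument form, a `sql_folder` option, and extra validation):

```python
def parse_connection(conn_str: str):
    """Minimal sketch of duckrun's compact connection-string rules."""
    parts = conn_str.split("/")
    if len(parts) == 2:
        workspace, lakehouse = parts
        schema, scan_all_schemas = "dbo", True   # no schema: scan everything
    elif len(parts) == 3:
        workspace, lakehouse, schema = parts
        scan_all_schemas = False                 # explicit schema: targeted scan
    else:
        raise ValueError("expected 'ws/lh.lakehouse' or 'ws/lh.lakehouse/schema'")
    if lakehouse.endswith(".lakehouse"):
        lakehouse = lakehouse[:-10]              # strip the '.lakehouse' suffix
    return workspace, lakehouse, schema, scan_all_schemas

print(parse_connection("my_workspace/my_lakehouse.lakehouse"))
# ('my_workspace', 'my_lakehouse', 'dbo', True)
print(parse_connection("my_workspace/my_lakehouse.lakehouse/bronze"))
# ('my_workspace', 'my_lakehouse', 'bronze', False)
```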
{duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun/core.py

````diff
@@ -112,7 +112,7 @@ class Duckrun:
     Usage:
         # For pipelines:
         dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
-        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema
+        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema, lists all tables
         dr.run(pipeline)
 
         # For data exploration with Spark-style API:
@@ -122,12 +122,14 @@ class Duckrun:
     """
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
-                 sql_folder: Optional[str] = None, compaction_threshold: int = 10
+                 sql_folder: Optional[str] = None, compaction_threshold: int = 10,
+                 scan_all_schemas: bool = False):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
         self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
+        self.scan_all_schemas = scan_all_schemas
         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
@@ -144,27 +146,31 @@ class Duckrun:
        1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
        2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
 
-       Schema defaults to "dbo" if not specified.
+       Schema defaults to "dbo" if not specified. When no schema is provided,
+       all tables across all schemas will be listed, but operations will use "dbo".
 
        Examples:
            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
-           dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # uses dbo
+           dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # lists all, uses dbo
            dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
-           dr = Duckrun.connect("myworkspace", "mylakehouse")  # uses dbo
+           dr = Duckrun.connect("myworkspace", "mylakehouse")  # lists all, uses dbo
            dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
        """
        print("Connecting to Lakehouse...")
 
+       scan_all_schemas = False
+
        # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
        if workspace and "/" in workspace and lakehouse_name is None:
            parts = workspace.split("/")
            if len(parts) == 2:
                # Format: "ws/lh.lakehouse" (schema will use default)
                workspace, lakehouse_name = parts
-
-               print(f"ℹ️ No schema specified. Using default schema 'dbo'.")
-               print(f"
-               print(f"
+               scan_all_schemas = True
+               print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+               print(f"   Scanning all schemas for table discovery...")
+               print(f"   ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!")
+               print(f"   💡 For better performance, specify a schema: {workspace}/{lakehouse_name}.lakehouse/schema\n")
            elif len(parts) == 3:
                # Format: "ws/lh.lakehouse/schema"
                workspace, lakehouse_name, schema = parts
@@ -177,18 +183,27 @@ class Duckrun:
            # Remove .lakehouse suffix if present
            if lakehouse_name.endswith(".lakehouse"):
                lakehouse_name = lakehouse_name[:-10]
+       elif lakehouse_name is not None:
+           # Traditional format used, check if schema was explicitly provided
+           # If schema is still "dbo" (default), scan all schemas
+           if schema == "dbo":
+               scan_all_schemas = True
+               print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+               print(f"   Scanning all schemas for table discovery...")
+               print(f"   ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!")
+               print(f"   💡 For better performance, specify a schema explicitly.\n")
 
        # Validate all required parameters are present
        if not workspace or not lakehouse_name:
            raise ValueError(
                "Missing required parameters. Use either:\n"
                "  connect('workspace/lakehouse.lakehouse/schema')\n"
-               "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo\n"
+               "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo, lists all\n"
                "  connect('workspace', 'lakehouse', 'schema')\n"
-               "  connect('workspace', 'lakehouse')  # defaults to dbo"
+               "  connect('workspace', 'lakehouse')  # defaults to dbo, lists all"
            )
 
-       return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
+       return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
 
    def _get_storage_token(self):
        return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
@@ -208,47 +223,77 @@ class Duckrun:
    def _attach_lakehouse(self):
        self._create_onelake_secret()
        try:
-           (13 removed lines; content not captured in this diff view)
+           if self.scan_all_schemas:
+               # Scan all schemas
+               print(f"⚠️ Scanning for Delta tables across all schemas...")
+               print(f"   This may take a while for large lakehouses with many schemas/tables.")
+
+               list_tables_query = f"""
+                   SELECT DISTINCT
+                       regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) as schema_name,
+                       regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) as table_name
+                   FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/**")
+                   WHERE file LIKE '%/_delta_log/%'
+                       AND file NOT LIKE '%/metadata/%'
+                       AND file NOT LIKE '%/iceberg/%'
+                       AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 1) IS NOT NULL
+                       AND regexp_extract(file, 'Tables/([^/]+)/([^/]+)/_delta_log', 2) IS NOT NULL
+                   ORDER BY schema_name, table_name
+               """
+           else:
+               # Scan specific schema only
+               print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
+
+               list_tables_query = f"""
+                   SELECT DISTINCT
+                       '{self.schema}' as schema_name,
+                       regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
+                   FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
+                   WHERE file LIKE '%/_delta_log/%'
+                       AND file NOT LIKE '%/metadata/%'
+                       AND file NOT LIKE '%/iceberg/%'
+                       AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
+               """
 
            list_tables_df = self.con.sql(list_tables_query).df()
 
            if list_tables_df.empty:
-               (1 removed line; content not captured in this diff view)
+               if self.scan_all_schemas:
+                   print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
+               else:
+                   print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                return
 
-           (1 removed line; content not captured in this diff view)
+           print(f"Found {len(list_tables_df)} Delta tables. Attaching as views...\n")
 
-           (3 removed lines; content not captured in this diff view)
+           for _, row in list_tables_df.iterrows():
+               schema_name = row['schema_name']
+               table_name = row['table_name']
+
                # Skip Iceberg-related folders and empty names
-               if not
+               if not table_name or table_name in ('metadata', 'iceberg'):
                    continue
 
                try:
+                   # Create view with schema prefix to avoid conflicts
+                   view_name = f"{schema_name}_{table_name}" if self.scan_all_schemas else table_name
+
                    self.con.sql(f"""
-                       CREATE OR REPLACE VIEW {
-                       AS SELECT * FROM delta_scan('{self.table_base_url}{
+                       CREATE OR REPLACE VIEW {view_name}
+                       AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                    """)
-                   print(f"  ✓ Attached: {
+                   print(f"  ✓ Attached: {schema_name}.{table_name} → {view_name}")
                except Exception as e:
-                   print(f"  ⚠ Skipped {
+                   print(f"  ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
                    continue
 
            print("\nAttached tables (views) in DuckDB:")
-           self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
+           self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
+
+           if self.scan_all_schemas:
+               print(f"\nNote: Tables are prefixed with schema (e.g., dbo_tablename)")
+               print(f"   Default schema for operations: {self.schema}")
+
        except Exception as e:
            print(f"Error attaching lakehouse: {e}")
            print("Continuing without pre-attached tables.")
````
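Both discovery queries above hinge on a single regular expression that splits a `_delta_log` file path into schema and table. Here is a small sketch of the same pattern in plain Python (the sample paths are hypothetical), showing how the schema-prefixed view names come out:

```python
import re

# Same pattern as the regexp_extract calls in _attach_lakehouse above.
PATTERN = re.compile(r"Tables/([^/]+)/([^/]+)/_delta_log")

# Hypothetical OneLake file paths of the kind returned by the glob() scan.
paths = [
    "abfss://ws@onelake.dfs.fabric.microsoft.com/lh.Lakehouse/Tables/dbo/customers/_delta_log/00000000000000000000.json",
    "abfss://ws@onelake.dfs.fabric.microsoft.com/lh.Lakehouse/Tables/bronze/raw_data/_delta_log/00000000000000000001.json",
]

for path in paths:
    match = PATTERN.search(path)
    if match:
        schema_name, table_name = match.groups()
        # When scanning all schemas, duckrun prefixes the view name.
        view_name = f"{schema_name}_{table_name}"
        print(f"{schema_name}.{table_name} -> {view_name}")
# dbo.customers -> dbo_customers
# bronze.raw_data -> bronze_raw_data
```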
{duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.5
+Version: 0.1.5.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License-Expression: MIT
@@ -15,7 +15,7 @@ Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
 Dynamic: license-file
 
-<img src="duckrun.png" width="400" alt="Duckrun">
+<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
 
@@ -38,10 +38,11 @@ pip install duckrun
 ```python
 import duckrun
 
-# Connect to your Fabric lakehouse
+# Connect to your Fabric lakehouse with a specific schema
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
 
-# Schema defaults to 'dbo' if not specified
+# Schema defaults to 'dbo' if not specified (scans all schemas)
+# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
 
 # Explore data
@@ -56,17 +57,37 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-# With schema
+# With schema (recommended for better performance)
 con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
 
-# Without schema (
+# Without schema (defaults to 'dbo', scans all schemas)
+# ⚠️ This can be slow for large lakehouses!
 con = duckrun.connect("workspace/lakehouse.lakehouse")
 
 # With options
 con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
-
+### Multi-Schema Support
+
+When you don't specify a schema, Duckrun will:
+- **Default to `dbo`** for write operations
+- **Scan all schemas** to discover and attach all Delta tables
+- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
+
+**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
+
+```python
+# Fast: scans only 'dbo' schema
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Slower: scans all schemas
+con = duckrun.connect("workspace/lakehouse.lakehouse")
+
+# Query tables from different schemas (when scanning all)
+con.sql("SELECT * FROM dbo_customers").show()
+con.sql("SELECT * FROM bronze_raw_data").show()
+```
 
 ## Two Ways to Use Duckrun
 
@@ -262,7 +283,7 @@ con = duckrun.connect(
 ```python
 import duckrun
 
-# Connect
+# Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
 # Pipeline with mixed tasks
@@ -297,7 +318,7 @@ con.sql("""
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
-2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views
 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
````
Files without changes:

- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/LICENSE
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun/__init__.py
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/SOURCES.txt
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/dependency_links.txt
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/requires.txt
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/duckrun.egg-info/top_level.txt
- {duckrun-0.1.5.5 → duckrun-0.1.5.6}/setup.cfg