duckrun-0.1.5.5-py3-none-any.whl → duckrun-0.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/core.py +112 -87
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/METADATA +34 -10
- duckrun-0.1.6.dist-info/RECORD +7 -0
- duckrun-0.1.5.5.dist-info/RECORD +0 -7
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/WHEEL +0 -0
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/top_level.txt +0 -0
duckrun/core.py
CHANGED
@@ -5,6 +5,8 @@ import importlib.util
 from deltalake import DeltaTable, write_deltalake
 from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
+import obstore as obs
+from obstore.store import AzureStore
 
 
 class DeltaWriter:
@@ -13,7 +15,7 @@ class DeltaWriter:
     def __init__(self, relation, duckrun_instance):
         self.relation = relation
         self.duckrun = duckrun_instance
-        self._format = "delta"
+        self._format = "delta"
         self._mode = "overwrite"
 
     def format(self, format_type: str):
@@ -32,46 +34,35 @@ class DeltaWriter:
 
     def saveAsTable(self, table_name: str):
         """Save query result as Delta table"""
-        # Format defaults to "delta", so no need to check
         if self._format != "delta":
             raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
 
-        # Parse schema.table or use default schema
         if "." in table_name:
             schema, table = table_name.split(".", 1)
         else:
             schema = self.duckrun.schema
             table = table_name
 
-        # Ensure OneLake secret is created
         self.duckrun._create_onelake_secret()
-
-        # Build path
         path = f"{self.duckrun.table_base_url}{schema}/{table}"
-
-        # Execute query and get result
         df = self.relation.record_batch()
 
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
-
-        # Write to Delta
         write_deltalake(path, df, mode=self._mode)
 
-        # Create or replace view in DuckDB
         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
         self.duckrun.con.sql(f"""
             CREATE OR REPLACE VIEW {table}
             AS SELECT * FROM delta_scan('{path}')
         """)
 
-        # Optimize if needed
         dt = DeltaTable(path)
 
         if self._mode == "overwrite":
             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
             dt.cleanup_metadata()
             print(f"✅ Table {schema}.{table} created/overwritten")
-        else:
+        else:
             file_count = len(dt.file_uris())
             if file_count > self.duckrun.compaction_threshold:
                 print(f"Compacting {schema}.{table} ({file_count} files)")
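For context, the writer this hunk streamlines is the Spark-style API reached through `dr.sql(...).write`. A minimal usage sketch, with placeholder workspace, lakehouse, and table names (only the `write.mode(...).saveAsTable(...)` calls are taken from the package's own docstrings):

```python
import duckrun

# Placeholder workspace/lakehouse names for illustration only
con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")

# saveAsTable accepts "table" (uses the connection's schema) or "schema.table"
con.sql("SELECT 43 AS value").write.mode("append").saveAsTable("test")
con.sql("SELECT * FROM test").write.mode("overwrite").saveAsTable("bronze.test_copy")
```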
@@ -112,7 +103,7 @@ class Duckrun:
     Usage:
         # For pipelines:
         dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
-        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema
+        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema, lists all tables
         dr.run(pipeline)
 
         # For data exploration with Spark-style API:
@@ -122,12 +113,14 @@ class Duckrun:
     """
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
-                 sql_folder: Optional[str] = None, compaction_threshold: int = 10):
+                 sql_folder: Optional[str] = None, compaction_threshold: int = 10,
+                 scan_all_schemas: bool = False):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
         self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
+        self.scan_all_schemas = scan_all_schemas
         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
@@ -144,29 +137,21 @@ class Duckrun:
        1. Compact: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
        2. Traditional: connect("ws", "lh", "schema") or connect("ws", "lh")
 
-        Schema defaults to "dbo" if not specified.
-
-        Examples:
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse/bronze")
-            dr = Duckrun.connect("myworkspace/mylakehouse.lakehouse")  # uses dbo
-            dr = Duckrun.connect("myworkspace", "mylakehouse", "bronze")
-            dr = Duckrun.connect("myworkspace", "mylakehouse")  # uses dbo
-            dr = Duckrun.connect("ws/lh.lakehouse", sql_folder="./sql")
+        Schema defaults to "dbo" if not specified. When no schema is provided,
+        all tables across all schemas will be listed, but operations will use "dbo".
        """
        print("Connecting to Lakehouse...")
 
-
+        scan_all_schemas = False
+
        if workspace and "/" in workspace and lakehouse_name is None:
            parts = workspace.split("/")
            if len(parts) == 2:
-                # Format: "ws/lh.lakehouse" (schema will use default)
                workspace, lakehouse_name = parts
-
-                print(f"ℹ️ No schema specified. Using default schema 'dbo'.")
-                print(f"
-                print(f" Note: Scanning all schemas will be added in a future update.\n")
+                scan_all_schemas = True
+                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+                print(f" Scanning all schemas for table discovery...\n")
            elif len(parts) == 3:
-                # Format: "ws/lh.lakehouse/schema"
                workspace, lakehouse_name, schema = parts
            else:
                raise ValueError(
@@ -174,21 +159,24 @@ class Duckrun:
                    "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
                )
 
-        # Remove .lakehouse suffix if present
        if lakehouse_name.endswith(".lakehouse"):
            lakehouse_name = lakehouse_name[:-10]
+        elif lakehouse_name is not None:
+            if schema == "dbo":
+                scan_all_schemas = True
+                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+                print(f" Scanning all schemas for table discovery...\n")
 
-        # Validate all required parameters are present
        if not workspace or not lakehouse_name:
            raise ValueError(
                "Missing required parameters. Use either:\n"
                " connect('workspace/lakehouse.lakehouse/schema')\n"
-                " connect('workspace/lakehouse.lakehouse')  # defaults to dbo\n"
+                " connect('workspace/lakehouse.lakehouse')  # defaults to dbo, lists all\n"
                " connect('workspace', 'lakehouse', 'schema')\n"
-                " connect('workspace', 'lakehouse')  # defaults to dbo"
+                " connect('workspace', 'lakehouse')  # defaults to dbo, lists all"
            )
 
-        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
+        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
 
    def _get_storage_token(self):
        return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
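For reference, both calling conventions that the parser above accepts, with placeholder workspace, lakehouse, and schema names:

```python
import duckrun

# Compact form: "workspace/lakehouse.lakehouse[/schema]"
dr = duckrun.connect("my_workspace/my_lakehouse.lakehouse/bronze")

# Traditional form: separate positional arguments
dr = duckrun.connect("my_workspace", "my_lakehouse", "bronze")

# No schema: operations default to 'dbo' and all schemas are scanned for tables
dr = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
```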
@@ -205,52 +193,107 @@ class Duckrun:
         os.environ["AZURE_STORAGE_TOKEN"] = token.token
         self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
 
+    def _discover_tables_fast(self) -> List[Tuple[str, str]]:
+        """
+        Fast Delta table discovery using obstore with list_with_delimiter.
+        Only lists directories, not files - super fast!
+
+        Returns:
+            List of tuples: [(schema, table_name), ...]
+        """
+        token = self._get_storage_token()
+        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+            print("Getting Azure token for table discovery...")
+            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+            token_obj = credential.get_token("https://storage.azure.com/.default")
+            token = token_obj.token
+            os.environ["AZURE_STORAGE_TOKEN"] = token
+
+        url = f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/"
+        store = AzureStore.from_url(url, bearer_token=token)
+
+        base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
+        tables_found = []
+
+        if self.scan_all_schemas:
+            # Discover all schemas first
+            print("🔍 Discovering schemas...")
+            schemas_result = obs.list_with_delimiter(store, prefix=base_path)
+            schemas = [
+                prefix.rstrip('/').split('/')[-1]
+                for prefix in schemas_result['common_prefixes']
+            ]
+            print(f" Found {len(schemas)} schemas: {', '.join(schemas)}\n")
+
+            # Discover tables in each schema
+            print("🔍 Discovering tables...")
+            for schema_name in schemas:
+                schema_path = f"{base_path}{schema_name}/"
+                result = obs.list_with_delimiter(store, prefix=schema_path)
+
+                for table_prefix in result['common_prefixes']:
+                    table_name = table_prefix.rstrip('/').split('/')[-1]
+                    # Skip non-table directories
+                    if table_name not in ('metadata', 'iceberg'):
+                        tables_found.append((schema_name, table_name))
+        else:
+            # Scan specific schema only
+            print(f"🔍 Discovering tables in schema '{self.schema}'...")
+            schema_path = f"{base_path}{self.schema}/"
+            result = obs.list_with_delimiter(store, prefix=schema_path)
+
+            for table_prefix in result['common_prefixes']:
+                table_name = table_prefix.rstrip('/').split('/')[-1]
+                if table_name not in ('metadata', 'iceberg'):
+                    tables_found.append((self.schema, table_name))
+
+        return tables_found
+
     def _attach_lakehouse(self):
+        """Attach lakehouse tables as DuckDB views using fast discovery"""
         self._create_onelake_secret()
+
         try:
-
-            # This avoids parsing JSON content that causes Iceberg metadata issues
-            print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
-
-            list_tables_query = f"""
-                SELECT DISTINCT
-                    regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
-                FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
-                WHERE file LIKE '%/_delta_log/%'
-                AND file NOT LIKE '%/metadata/%'
-                AND file NOT LIKE '%/iceberg/%'
-                AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
-            """
-
-            list_tables_df = self.con.sql(list_tables_query).df()
+            tables = self._discover_tables_fast()
 
-            if
-
+            if not tables:
+                if self.scan_all_schemas:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
+                else:
+                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                 return
 
-
-
-
-
-            for table in table_names:
-                # Skip Iceberg-related folders and empty names
-                if not table or table in ('metadata', 'iceberg'):
-                    continue
-
+            print(f"\n📊 Found {len(tables)} Delta tables. Attaching as views...\n")
+
+            attached_count = 0
+            for schema_name, table_name in tables:
                 try:
+                    view_name = f"{schema_name}_{table_name}" if self.scan_all_schemas else table_name
+
                     self.con.sql(f"""
-                        CREATE OR REPLACE VIEW {
-                        AS SELECT * FROM delta_scan('{self.table_base_url}{
+                        CREATE OR REPLACE VIEW {view_name}
+                        AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
-                    print(f" ✓ Attached: {
+                    print(f" ✓ Attached: {schema_name}.{table_name} → {view_name}")
+                    attached_count += 1
                 except Exception as e:
-                    print(f" ⚠ Skipped {
+                    print(f" ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
                     continue
 
-            print("\
-
+            print(f"\n{'='*60}")
+            print(f"✅ Successfully attached {attached_count}/{len(tables)} tables")
+            print(f"{'='*60}\n")
+
+            print("Available views in DuckDB:")
+            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory' ORDER BY name").show()
+
+            if self.scan_all_schemas:
+                print(f"\n💡 Note: Tables are prefixed with schema (e.g., dbo_tablename)")
+                print(f" Default schema for operations: {self.schema}\n")
+
         except Exception as e:
-            print(f"Error attaching lakehouse: {e}")
+            print(f"❌ Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
 
     def _normalize_table_name(self, name: str) -> str:
@@ -284,7 +327,6 @@ class Duckrun:
             print(f"SQL file is empty: {table_name}.sql")
             return None
 
-        # Auto-inject common params, merge with user params
         full_params = {
             'ws': self.workspace,
             'lh': self.lakehouse_name,
@@ -407,18 +449,9 @@ class Duckrun:
 
         Returns:
             True if all tasks succeeded
-
-        Example:
-            pipeline = [
-                ('download', (urls, paths, depth)),
-                ('staging', 'overwrite', {'run_date': '2024-06-01'}),
-                ('transform', 'append'),  # {} optional!
-                ('calendar', 'ignore')  # {} optional!
-            ]
-            dr.run(pipeline)
         """
         if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.
+            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
 
         for i, task in enumerate(pipeline, 1):
             print(f"\n{'='*60}")
@@ -427,18 +460,14 @@ class Duckrun:
 
             try:
                 if len(task) == 2:
-                    # Could be Python: ('name', (args,)) or SQL: ('table', 'mode')
                     name, second = task
                     if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
-                        # SQL task without params: ('table', 'mode')
                        self._run_sql(name, second, {})
                     else:
-                        # Python task: ('name', (args,))
                        args = second if isinstance(second, (tuple, list)) else (second,)
                        self._run_python(name, tuple(args))
 
                 elif len(task) == 3:
-                    # SQL task with params: ('table', 'mode', {params})
                     table, mode, params = task
                     if not isinstance(params, dict):
                         raise ValueError(f"Expected dict for params, got {type(params)}")
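The dispatch above distinguishes tasks by tuple length and type. A hedged illustration of the pipeline shapes it accepts (task names, URL, and parameters are made up; each named task is assumed to have a matching SQL or Python definition under the connection's `sql_folder`):

```python
# Illustrative pipeline only; con is a duckrun connection created with sql_folder=...
pipeline = [
    ("download", ("https://example.com/data.csv", "/tmp/raw")),  # Python task: (name, (args, ...))
    ("staging", "overwrite", {"run_date": "2024-06-01"}),        # SQL task with params
    ("transform", "append"),                                     # SQL task, params default to {}
]
con.run(pipeline)
```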
@@ -461,13 +490,9 @@ class Duckrun:
         Execute raw SQL query with Spark-style write API.
 
         Example:
-            # Traditional DuckDB style
             dr.sql("SELECT * FROM table").show()
             df = dr.sql("SELECT * FROM table").df()
-
-            # New Spark-style write API (format is optional, defaults to delta)
             dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
-            dr.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
         """
         relation = self.con.sql(query)
         return QueryResult(relation, self)
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/METADATA
CHANGED
@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.5.5
+Version: 0.1.6
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
-License
+License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
 Project-URL: Repository, https://github.com/djouallah/duckrun
 Project-URL: Issues, https://github.com/djouallah/duckrun/issues
@@ -13,9 +13,12 @@ License-File: LICENSE
 Requires-Dist: duckdb>=1.2.0
 Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
+Requires-Dist: obstore>=0.2.0
+Provides-Extra: local
+Requires-Dist: azure-identity>=1.12.0; extra == "local"
 Dynamic: license-file
 
-<img src="duckrun.png" width="400" alt="Duckrun">
+<img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
 
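Side note on the new optional dependency group: given the `Provides-Extra: local` / `azure-identity` pair above, the interactive credential fallback used by table discovery would presumably be installed with `pip install duckrun[local]` (inferred from the metadata, not stated in the diff).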
@@ -38,10 +41,11 @@ pip install duckrun
 ```python
 import duckrun
 
-# Connect to your Fabric lakehouse
+# Connect to your Fabric lakehouse with a specific schema
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")
 
-# Schema defaults to 'dbo' if not specified
+# Schema defaults to 'dbo' if not specified (scans all schemas)
+# ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
 con = duckrun.connect("my_workspace/my_lakehouse.lakehouse")
 
 # Explore data
@@ -56,17 +60,37 @@ That's it! No `sql_folder` needed for data exploration.
 ## Connection Format
 
 ```python
-# With schema
+# With schema (recommended for better performance)
 con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
 
-# Without schema (
+# Without schema (defaults to 'dbo', scans all schemas)
+# ⚠️ This can be slow for large lakehouses!
 con = duckrun.connect("workspace/lakehouse.lakehouse")
 
 # With options
 con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
 ```
 
-
+### Multi-Schema Support
+
+When you don't specify a schema, Duckrun will:
+- **Default to `dbo`** for write operations
+- **Scan all schemas** to discover and attach all Delta tables
+- **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
+
+**Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
+
+```python
+# Fast: scans only 'dbo' schema
+con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
+
+# Slower: scans all schemas
+con = duckrun.connect("workspace/lakehouse.lakehouse")
+
+# Query tables from different schemas (when scanning all)
+con.sql("SELECT * FROM dbo_customers").show()
+con.sql("SELECT * FROM bronze_raw_data").show()
+```
 
 ## Two Ways to Use Duckrun
@@ -262,7 +286,7 @@ con = duckrun.connect(
 ```python
 import duckrun
 
-# Connect
+# Connect (specify schema for best performance)
 con = duckrun.connect("Analytics/Sales.lakehouse/dbo", sql_folder="./sql")
 
 # Pipeline with mixed tasks
@@ -297,7 +321,7 @@ con.sql("""
 ## How It Works
 
 1. **Connection**: Duckrun connects to your Fabric lakehouse using OneLake and Azure authentication
-2. **Table Discovery**: Automatically scans for Delta tables in your schema and creates DuckDB views
+2. **Table Discovery**: Automatically scans for Delta tables in your schema (or all schemas) and creates DuckDB views
 3. **Query Execution**: Run SQL queries directly against Delta tables using DuckDB's speed
 4. **Write Operations**: Results are written back as Delta tables with automatic optimization
 5. **Pipelines**: Orchestrate complex workflows with reusable SQL and Python tasks
duckrun-0.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=H7Q-mvE5ET3mdEi7VTubWdaCrgVaJW9G0LfAu0Gpw-g,21872
+duckrun-0.1.6.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.6.dist-info/METADATA,sha256=20vTn4-9fn8iqwXGjYT3IQd9Xk47sQAD-Tv3wk2Pp9I,9356
+duckrun-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.6.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.6.dist-info/RECORD,,
duckrun-0.1.5.5.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=0Jo7zkVuTvdPPt-ubUhy5996oAm4VffZrH6K1AUw7wE,20804
-duckrun-0.1.5.5.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.5.5.dist-info/METADATA,sha256=0fp-MgKtZuxYBxvXtGOpUsK4aJbaobLckzFfq-LMu4o,8201
-duckrun-0.1.5.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.5.5.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.5.5.dist-info/RECORD,,
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/WHEEL: file without changes
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/licenses/LICENSE: file without changes
{duckrun-0.1.5.5.dist-info → duckrun-0.1.6.dist-info}/top_level.txt: file without changes