duckrun 0.1.6.3__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- duckrun/core.py +42 -59
- {duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/METADATA +8 -4
- duckrun-0.1.8.dist-info/RECORD +7 -0
- duckrun-0.1.6.3.dist-info/RECORD +0 -7
- {duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/WHEEL +0 -0
- {duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/top_level.txt +0 -0
duckrun/core.py
CHANGED
@@ -8,6 +8,9 @@ from string import Template
 import obstore as obs
 from obstore.store import AzureStore
 
+# Row Group configuration for optimal Delta Lake performance
+RG = 8_000_000
+
 
 class DeltaWriter:
     """Spark-style write API for Delta Lake"""
@@ -48,7 +51,7 @@ class DeltaWriter:
         df = self.relation.record_batch()
 
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
-        write_deltalake(path, df, mode=self._mode)
+        write_deltalake(path, df, mode=self._mode, max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
 
         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
         self.duckrun.con.sql(f"""
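The new `RG` constant caps Parquet file and row-group sizes on every write path in this release. A minimal standalone sketch of the effect, assuming deltalake<=0.18.2 (the version this release pins below) and a hypothetical local table path:

```python
# Minimal sketch of the row-group settings introduced above, assuming
# deltalake<=0.18.2. The table path and sample data are hypothetical.
import pyarrow as pa
from deltalake import write_deltalake

RG = 8_000_000  # same constant the new core.py defines

df = pa.table({"id": list(range(10)), "value": [x * 2 for x in range(10)]})

write_deltalake(
    "/tmp/demo_table",
    df,
    mode="overwrite",
    max_rows_per_file=RG,    # cap rows written into any single Parquet file
    max_rows_per_group=RG,   # cap rows per row group within a file
    min_rows_per_group=RG,   # discourage tiny row groups from small batches
)
```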
@@ -127,77 +130,57 @@ class Duckrun:
         self._attach_lakehouse()
 
     @classmethod
-    def connect(cls,
-                schema: str = "dbo", sql_folder: Optional[str] = None,
+    def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
                 compaction_threshold: int = 100):
         """
         Create and connect to lakehouse.
 
-
-        1. Compact: connect("ws/lh.lakehouse/schema", sql_folder=...) or connect("ws/lh.lakehouse")
-        2. Traditional: connect("ws", "lh", "schema", sql_folder) or connect("ws", "lh")
+        Uses compact format: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
 
         Args:
-
-            lakehouse_name: Lakehouse name (optional if using compact format)
-            schema: Schema name (defaults to "dbo")
+            connection_string: OneLake path "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
             sql_folder: Optional path or URL to SQL files folder
             compaction_threshold: File count threshold for compaction
 
         Examples:
-
-            dr = Duckrun.connect("temp/power.lakehouse/wa", "https://github.com/.../sql/")
-            dr = Duckrun.connect("ws/lh.lakehouse/schema", "./sql")
+            dr = Duckrun.connect("ws/lh.lakehouse/schema", sql_folder="./sql")
             dr = Duckrun.connect("ws/lh.lakehouse/schema")  # no SQL folder
-
-            # Traditional format
-            dr = Duckrun.connect("ws", "lh", "schema", "./sql")
-            dr = Duckrun.connect("ws", "lh", "schema")
+            dr = Duckrun.connect("ws/lh.lakehouse")  # defaults to dbo schema
         """
         print("Connecting to Lakehouse...")
 
        scan_all_schemas = False
 
-        #
-        [23 removed lines (162-184) whose content was truncated in the source diff]
-        if lakehouse_name.endswith(".lakehouse"):
-            lakehouse_name = lakehouse_name[:-10]
-        elif lakehouse_name is not None:
-            # Traditional format - check if schema was explicitly provided
-            if schema == "dbo":
-                scan_all_schemas = True
-                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...\n")
+        # Only support compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
+        if not connection_string or "/" not in connection_string:
+            raise ValueError(
+                "Invalid connection string format. "
+                "Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
+            )
+
+        parts = connection_string.split("/")
+        if len(parts) == 2:
+            workspace, lakehouse_name = parts
+            scan_all_schemas = True
+            schema = "dbo"
+            print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+            print(f"   Scanning all schemas for table discovery...\n")
+        elif len(parts) == 3:
+            workspace, lakehouse_name, schema = parts
+        else:
+            raise ValueError(
+                f"Invalid connection string format: '{connection_string}'. "
+                "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
+            )
+
+        if lakehouse_name.endswith(".lakehouse"):
+            lakehouse_name = lakehouse_name[:-10]
 
         if not workspace or not lakehouse_name:
             raise ValueError(
-                "Missing required parameters. Use
+                "Missing required parameters. Use compact format:\n"
                 "  connect('workspace/lakehouse.lakehouse/schema', 'sql_folder')\n"
-                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo
-                "  connect('workspace', 'lakehouse', 'schema', 'sql_folder')\n"
-                "  connect('workspace', 'lakehouse')  # defaults to dbo"
+                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo"
             )
 
         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
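The rewritten `connect()` drops the old multi-argument form and parses a single OneLake-style string. A standalone sketch of the parsing rules as they appear in the added lines; the helper name is mine, not duckrun's:

```python
# Standalone sketch of the added parsing logic; the helper name is
# hypothetical, but the rules mirror the + lines above.
def parse_connection_string(connection_string: str):
    if not connection_string or "/" not in connection_string:
        raise ValueError("Expected 'workspace/lakehouse.lakehouse[/schema]'")

    parts = connection_string.split("/")
    if len(parts) == 2:
        workspace, lakehouse = parts
        schema, scan_all_schemas = "dbo", True   # no schema given: scan them all
    elif len(parts) == 3:
        workspace, lakehouse, schema = parts
        scan_all_schemas = False
    else:
        raise ValueError(f"Invalid connection string: '{connection_string}'")

    if lakehouse.endswith(".lakehouse"):
        lakehouse = lakehouse[: -len(".lakehouse")]  # strip the suffix

    return workspace, lakehouse, schema, scan_all_schemas


print(parse_connection_string("ws/lh.lakehouse"))        # ('ws', 'lh', 'dbo', True)
print(parse_connection_string("ws/lh.lakehouse/sales"))  # ('ws', 'lh', 'sales', False)
```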
@@ -210,7 +193,7 @@ class Duckrun:
         if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
             self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
         else:
-            print("
+            print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token = credential.get_token("https://storage.azure.com/.default")
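This hunk and the table-discovery, upload, and download hunks below all replace a truncated `print("` with a clearer message around the same fallback: try an Azure CLI token first, then open an interactive browser login. The chain, isolated (requires the azure-identity package):

```python
# The credential chain used by every authentication fallback in this release,
# isolated. The scope is the storage default that OneLake accepts.
from azure.identity import (
    AzureCliCredential,
    ChainedTokenCredential,
    InteractiveBrowserCredential,
)

credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
token = credential.get_token("https://storage.azure.com/.default")
print(f"Token acquired; expires at {token.expires_on}")
```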
@@ -227,7 +210,7 @@ class Duckrun:
         """
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("
+            print("Authenticating with Azure for table discovery (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
@@ -426,7 +409,7 @@ class Duckrun:
         if mode == 'overwrite':
             self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='overwrite')
+            write_deltalake(path, df, mode='overwrite', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
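Note the unchanged context line at the end: the overwrite path still vacuums with zero retention immediately after writing, which deletes superseded files at once and forfeits time travel on the old version. The equivalent standalone call, with a hypothetical table path:

```python
# What the overwrite path does right after writing: an immediate vacuum with
# zero retention, deleting superseded files at once. Safe only when no other
# reader needs time travel on this table. The path is hypothetical.
from deltalake import DeltaTable

dt = DeltaTable("/tmp/demo_table")
dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
```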
@@ -434,7 +417,7 @@ class Duckrun:
 
         elif mode == 'append':
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='append')
+            write_deltalake(path, df, mode='append', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             if len(dt.file_uris()) > self.compaction_threshold:
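The append path keeps the existing compaction rule: once the table holds more files than `compaction_threshold` (default 100), small files get merged. The hunk cuts off before the body of that branch, so the call below is an assumption; `optimize.compact()` is deltalake's usual small-file merge, and the helper name is hypothetical:

```python
# Standalone sketch of the append-path compaction rule. The threshold check
# mirrors core.py; the compact() call inside the branch is an assumption,
# since the hunk is truncated before that line.
from deltalake import DeltaTable

def maybe_compact(path: str, threshold: int = 100) -> None:
    dt = DeltaTable(path)
    if len(dt.file_uris()) > threshold:   # same check as core.py
        dt.optimize.compact()             # rewrite small files into larger ones
```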
@@ -451,7 +434,7 @@ class Duckrun:
             print(f"Table {normalized_table} doesn't exist. Creating...")
             self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='overwrite')
+            write_deltalake(path, df, mode='overwrite', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             dt.vacuum(dry_run=False)
@@ -542,7 +525,7 @@ class Duckrun:
         # Get Azure token
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("
+            print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
@@ -649,7 +632,7 @@ class Duckrun:
         # Get Azure token
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("
+            print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
{duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.6.3
+Version: 0.1.8
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
@@ -11,7 +11,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: duckdb>=1.2.0
-Requires-Dist: deltalake
+Requires-Dist: deltalake<=0.18.2
 Requires-Dist: requests>=2.28.0
 Requires-Dist: obstore>=0.2.0
 Provides-Extra: local
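The new upper bound on deltalake is presumably tied to the row-group keywords added in core.py, which belong to the writer API that later deltalake releases reworked; that reading is an inference, not stated anywhere in this diff. A trivial runtime check of the installed version:

```python
# Report the installed deltalake version against the new pin. The rationale
# for the pin (writer API changes in later releases) is an inference.
from importlib.metadata import version

installed = version("deltalake")
print(f"deltalake {installed} installed; duckrun 0.1.8 requires <=0.18.2")
```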
@@ -20,7 +20,7 @@ Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
-
+A helper package for stuff that made my life easier when working with Fabric Python notebooks. Just the things that actually made sense to me - nothing fancy
 
 ## Important Notes
 
@@ -30,6 +30,10 @@ Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and
 
 **Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
 
+## What It Does
+
+It does orchestration, arbitrary SQL statements, and file manipulation. That's it - just stuff I encounter in my daily workflow when working with Fabric notebooks.
+
 ## Installation
 
 ```bash
@@ -101,7 +105,7 @@ con.sql("SELECT * FROM dbo_customers").show()
 con.sql("SELECT * FROM bronze_raw_data").show()
 ```
 
-##
+## Three Ways to Use Duckrun
 
 ### 1. Data Exploration (Spark-Style API)
 
duckrun-0.1.8.dist-info/RECORD
ADDED

@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=NrGriuJO7Mh1e9NKplNKkNleUWBpIKG5CwJGj3qNxxw,33334
+duckrun-0.1.8.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.8.dist-info/METADATA,sha256=CsvDljoHqgKfoDDdxHmNoKiR1PJNkqf6ye3hbxWm118,13847
+duckrun-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.8.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.8.dist-info/RECORD,,
duckrun-0.1.6.3.dist-info/RECORD
DELETED

@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=CT2NH5hCLsv4uB5zH3VxTuCVQy0nWkPBG-cICLPhG_8,34245
-duckrun-0.1.6.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.6.3.dist-info/METADATA,sha256=ny5DcRSU1B4SdHdJqHCYk0-hNo9-zqFABqMY9ulAVNk,13595
-duckrun-0.1.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.6.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.6.3.dist-info/RECORD,,
{duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/WHEEL
File without changes

{duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/licenses/LICENSE
File without changes

{duckrun-0.1.6.3.dist-info → duckrun-0.1.8.dist-info}/top_level.txt
File without changes