duckrun 0.1.3-py3-none-any.whl → 0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/core.py +119 -12
- {duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/METADATA +13 -9
- duckrun-0.1.5.dist-info/RECORD +7 -0
- duckrun-0.1.3.dist-info/RECORD +0 -7
- {duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/WHEEL +0 -0
- {duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/top_level.txt +0 -0
duckrun/core.py
CHANGED
@@ -6,6 +6,99 @@ from deltalake import DeltaTable, write_deltalake
 from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
 
+
+class DeltaWriter:
+    """Spark-style write API for Delta Lake"""
+
+    def __init__(self, relation, duckrun_instance):
+        self.relation = relation
+        self.duckrun = duckrun_instance
+        self._format = None
+        self._mode = "overwrite"
+
+    def format(self, format_type: str):
+        """Set output format (only 'delta' supported)"""
+        if format_type.lower() != "delta":
+            raise ValueError(f"Only 'delta' format is supported, got '{format_type}'")
+        self._format = "delta"
+        return self
+
+    def mode(self, write_mode: str):
+        """Set write mode: 'overwrite' or 'append'"""
+        if write_mode not in {"overwrite", "append"}:
+            raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
+        self._mode = write_mode
+        return self
+
+    def saveAsTable(self, table_name: str):
+        """Save query result as Delta table"""
+        if self._format != "delta":
+            raise RuntimeError("Must call .format('delta') before saveAsTable()")
+
+        # Parse schema.table or use default schema
+        if "." in table_name:
+            schema, table = table_name.split(".", 1)
+        else:
+            schema = self.duckrun.schema
+            table = table_name
+
+        # Ensure OneLake secret is created
+        self.duckrun._create_onelake_secret()
+
+        # Build path
+        path = f"{self.duckrun.table_base_url}{schema}/{table}"
+
+        # Execute query and get result
+        df = self.relation.record_batch()
+
+        print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
+
+        # Write to Delta
+        write_deltalake(path, df, mode=self._mode)
+
+        # Create or replace view in DuckDB
+        self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
+        self.duckrun.con.sql(f"""
+            CREATE OR REPLACE VIEW {table}
+            AS SELECT * FROM delta_scan('{path}')
+        """)
+
+        # Optimize if needed
+        dt = DeltaTable(path)
+
+        if self._mode == "overwrite":
+            dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
+            dt.cleanup_metadata()
+            print(f"✅ Table {schema}.{table} created/overwritten")
+        else:  # append
+            file_count = len(dt.file_uris())
+            if file_count > self.duckrun.compaction_threshold:
+                print(f"Compacting {schema}.{table} ({file_count} files)")
+                dt.optimize.compact()
+                dt.vacuum(dry_run=False)
+                dt.cleanup_metadata()
+            print(f"✅ Data appended to {schema}.{table}")
+
+        return table
+
+
+class QueryResult:
+    """Wrapper for DuckDB relation with write API"""
+
+    def __init__(self, relation, duckrun_instance):
+        self.relation = relation
+        self.duckrun = duckrun_instance
+
+    @property
+    def write(self):
+        """Access write API"""
+        return DeltaWriter(self.relation, self.duckrun)
+
+    def __getattr__(self, name):
+        """Delegate all other methods to underlying DuckDB relation"""
+        return getattr(self.relation, name)
+
+
 class Duckrun:
     """
     Lakehouse task runner with clean tuple-based API.
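Taken together, these two new classes give every `sql()` result a Spark-style writer. A minimal usage sketch based on the docstrings above (workspace, lakehouse, and table names are illustrative, and an actual write also needs valid OneLake credentials):

```python
import duckrun

# Exploration-style connection; no sql_folder is required for writes.
con = duckrun.connect("my_workspace", "my_lakehouse", "dbo")

# sql() returns a QueryResult; .write exposes the DeltaWriter chain.
(con.sql("SELECT 43 AS value")
    .write
    .format("delta")            # only 'delta' is accepted
    .mode("append")             # default mode is 'overwrite'
    .saveAsTable("dbo.test"))   # 'schema.table', or a bare name for the default schema
```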
@@ -16,23 +109,22 @@ class Duckrun:
     SQL: ('table_name', 'mode', {params})
 
     Usage:
+        # For pipelines:
         dr = Duckrun.connect(workspace, lakehouse, schema, sql_folder)
-
-        pipeline = [
-            ('download', (urls, paths, depth)),
-            ('staging', 'overwrite', {'run_date': '2024-06-01'}),
-            ('transform', 'append')
-        ]
-
         dr.run(pipeline)
+
+        # For data exploration with Spark-style API:
+        dr = Duckrun.connect(workspace, lakehouse, schema)
+        dr.sql("SELECT * FROM table").show()
+        dr.sql("SELECT 43").write.format("delta").mode("append").saveAsTable("aemo.test")
     """
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str,
-                 sql_folder: str, compaction_threshold: int = 10):
+                 sql_folder: Optional[str] = None, compaction_threshold: int = 10):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
-        self.sql_folder = sql_folder.strip()
+        self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
@@ -41,7 +133,7 @@ class Duckrun:
 
     @classmethod
     def connect(cls, workspace: str, lakehouse_name: str, schema: str,
-                sql_folder: str, compaction_threshold: int =
+                sql_folder: Optional[str] = None, compaction_threshold: int = 100):
         """Create and connect to lakehouse"""
         print("Connecting to Lakehouse...")
         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
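With `sql_folder` optional on both the constructor and `connect`, one entry point now covers pipeline runs and ad-hoc exploration; note the classmethod's new default `compaction_threshold` of 100 versus 10 on `__init__`. A short sketch with illustrative names:

```python
# Pipeline mode: sql_folder points at the folder of .sql/.py task files.
dr_pipeline = Duckrun.connect("Analytics", "Sales", "dbo", sql_folder="./sql")

# Exploration mode: no sql_folder; sql() queries and the write API still work.
dr_explore = Duckrun.connect("Analytics", "Sales", "dbo")
```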
@@ -114,6 +206,9 @@ class Duckrun:
         return name.split('__', 1)[0] if '__' in name else name
 
     def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
+        if self.sql_folder is None:
+            raise RuntimeError("sql_folder is not configured. Cannot read SQL files.")
+
         is_url = self.sql_folder.startswith("http")
         if is_url:
             url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
@@ -159,6 +254,9 @@ class Duckrun:
         return content
 
     def _load_py_function(self, name: str) -> Optional[Callable]:
+        if self.sql_folder is None:
+            raise RuntimeError("sql_folder is not configured. Cannot load Python functions.")
+
         is_url = self.sql_folder.startswith("http")
         try:
             if is_url:
@@ -267,6 +365,9 @@ class Duckrun:
         ]
         dr.run(pipeline)
         """
+        if self.sql_folder is None:
+            raise RuntimeError("sql_folder is not configured. Cannot run pipelines. Set sql_folder when creating connection.")
+
         for i, task in enumerate(pipeline, 1):
             print(f"\n{'='*60}")
             print(f"Task {i}/{len(pipeline)}: {task[0]}")
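These three guards turn a missing `sql_folder` into an explicit error rather than an `AttributeError` on `None`. A sketch of what a caller would see when running a pipeline on an exploration-only connection (connection details illustrative):

```python
con = duckrun.connect("Analytics", "Sales", "dbo")  # no sql_folder configured

try:
    con.run([("staging", "overwrite")])
except RuntimeError as err:
    print(err)  # "sql_folder is not configured. Cannot run pipelines. ..."
```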
@@ -305,13 +406,19 @@ class Duckrun:
 
     def sql(self, query: str):
         """
-        Execute raw SQL query.
+        Execute raw SQL query with Spark-style write API.
 
         Example:
+            # Traditional DuckDB style
             dr.sql("SELECT * FROM table").show()
             df = dr.sql("SELECT * FROM table").df()
+
+            # New Spark-style write API
+            dr.sql("SELECT 43 as value").write.format("delta").mode("append").saveAsTable("aemo.test")
+            dr.sql("SELECT * FROM source").write.format("delta").mode("overwrite").saveAsTable("target")
         """
-
+        relation = self.con.sql(query)
+        return QueryResult(relation, self)
 
     def get_connection(self):
         """Get underlying DuckDB connection"""
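Because `QueryResult.__getattr__` forwards unknown attributes to the wrapped DuckDB relation, existing `.show()` and `.df()` call sites keep working and only `.write` is new. A small sketch, assuming a `sales` table exists in the attached lakehouse:

```python
res = con.sql("SELECT * FROM sales LIMIT 10")

res.show()        # delegated to the underlying DuckDB relation
df = res.df()     # likewise delegated; returns a pandas DataFrame

# New path: persist the result as a Delta table and register it as a view.
res.write.format("delta").mode("overwrite").saveAsTable("sales_sample")
```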
{duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.3
+Version: 0.1.5
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 License-Expression: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
@@ -35,14 +35,14 @@ pip install duckrun
 ## Quick Start
 
 ```python
-import duckrun
+import duckrun
 
-# Connect to your Fabric lakehouse
-
+# Connect to your Fabric lakehouse (using `con` pattern)
+con = duckrun.connect(
     workspace="my_workspace",
     lakehouse_name="my_lakehouse",
     schema="dbo",
-    sql_folder="./sql"  # folder containing your .sql and .py files
+    sql_folder="./sql"  # optional: folder containing your .sql and .py files (only needed for pipeline tasks)
 )
 
 # Define your pipeline
@@ -53,9 +53,11 @@ pipeline = [
 ]
 
 # Run it
-
+con.run(pipeline)
 ```
 
+Note: the `sql/` folder is optional — if all you want to do is explore data with SQL (for example by calling `con.sql(...)`), you don't need to provide a `sql_folder`.
+
 ## Early Exit
 
 In a pipeline run, if a task fails, the pipeline will stop without running the subsequent tasks.
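The task tuples that `con.run` accepts are the shapes from the class docstring earlier in this diff; a sketch of a mixed pipeline under that assumption (file names and arguments are illustrative):

```python
urls = ["https://example.com/data.csv"]   # arguments handed to the Python task
paths = ["/tmp/data.csv"]
depth = 1

pipeline = [
    ("download", (urls, paths, depth)),                    # Python task -> sql_folder/download.py
    ("staging", "overwrite", {"run_date": "2024-06-01"}),  # SQL task   -> sql_folder/staging.sql
    ("transform", "append"),                               # SQL task with no params
]

con.run(pipeline)
```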
@@ -138,12 +140,14 @@ Both write to the same `sales` table, but use different SQL files.
 
 ```python
 # Run queries
-
+con.sql("SELECT * FROM my_table LIMIT 10").show()
 
 # Get as DataFrame
-df =
+df = con.sql("SELECT COUNT(*) FROM sales").df()
 ```
 
+Explanation: DuckDB is connected to the lakehouse through `con`, so it is aware of the tables in that lakehouse (including tables created by your pipelines). That means you can query those tables directly with `con.sql(...)` just like any other DuckDB query. If you don't provide a `sql_folder`, you can still use `con.sql(...)` to explore existing tables.
+
 
 
 ## Remote SQL Files
@@ -151,7 +155,7 @@ df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
 You can load SQL/Python files from a URL:
 
 ```python
-
+con = duckrun.connect(
     workspace="Analytics",
     lakehouse_name="Sales",
     schema="dbo",
duckrun-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=6kjFoxjucRgb3qV9R6ZDy1oHcLE2JXPH5E0WCDKgpZM,18031
+duckrun-0.1.5.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.5.dist-info/METADATA,sha256=U2NQWJKm02rG4YvgAkDCJ9434OC_sJE-cuSUHu3eSvM,4377
+duckrun-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.5.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.5.dist-info/RECORD,,
duckrun-0.1.3.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=Ok2IS15NcV6zFuFKFi2GOe1NKREoBQzjwAay-fCNf38,13774
-duckrun-0.1.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.3.dist-info/METADATA,sha256=BYek_gAWR_6QdCAJQAV7QnhoSQsaG0aprlMtAce9Z0k,3805
-duckrun-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.3.dist-info/RECORD,,
{duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/WHEEL
File without changes
{duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/licenses/LICENSE
File without changes
{duckrun-0.1.3.dist-info → duckrun-0.1.5.dist-info}/top_level.txt
File without changes