duckrun 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/core.py +17 -10
- {duckrun-0.1.3.dist-info → duckrun-0.1.4.dist-info}/METADATA +13 -9
- duckrun-0.1.4.dist-info/RECORD +7 -0
- duckrun-0.1.3.dist-info/RECORD +0 -7
- {duckrun-0.1.3.dist-info → duckrun-0.1.4.dist-info}/WHEEL +0 -0
- {duckrun-0.1.3.dist-info → duckrun-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.1.3.dist-info → duckrun-0.1.4.dist-info}/top_level.txt +0 -0
duckrun/core.py
CHANGED
@@ -16,23 +16,21 @@ class Duckrun:
|
|
16
16
|
SQL: ('table_name', 'mode', {params})
|
17
17
|
|
18
18
|
Usage:
|
19
|
+
# For pipelines:
|
19
20
|
dr = Duckrun.connect(workspace, lakehouse, schema, sql_folder)
|
20
|
-
|
21
|
-
pipeline = [
|
22
|
-
('download', (urls, paths, depth)),
|
23
|
-
('staging', 'overwrite', {'run_date': '2024-06-01'}),
|
24
|
-
('transform', 'append')
|
25
|
-
]
|
26
|
-
|
27
21
|
dr.run(pipeline)
|
22
|
+
|
23
|
+
# For data exploration only:
|
24
|
+
dr = Duckrun.connect(workspace, lakehouse, schema)
|
25
|
+
dr.sql("SELECT * FROM table").show()
|
28
26
|
"""
|
29
27
|
|
30
28
|
def __init__(self, workspace: str, lakehouse_name: str, schema: str,
|
31
|
-
sql_folder: str, compaction_threshold: int = 10):
|
29
|
+
sql_folder: Optional[str] = None, compaction_threshold: int = 10):
|
32
30
|
self.workspace = workspace
|
33
31
|
self.lakehouse_name = lakehouse_name
|
34
32
|
self.schema = schema
|
35
|
-
self.sql_folder = sql_folder.strip()
|
33
|
+
self.sql_folder = sql_folder.strip() if sql_folder else None
|
36
34
|
self.compaction_threshold = compaction_threshold
|
37
35
|
self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
|
38
36
|
self.con = duckdb.connect()
|
@@ -41,7 +39,7 @@ class Duckrun:
|
|
41
39
|
|
42
40
|
@classmethod
|
43
41
|
def connect(cls, workspace: str, lakehouse_name: str, schema: str,
|
44
|
-
sql_folder: str, compaction_threshold: int =
|
42
|
+
sql_folder: Optional[str] = None, compaction_threshold: int = 100):
|
45
43
|
"""Create and connect to lakehouse"""
|
46
44
|
print("Connecting to Lakehouse...")
|
47
45
|
return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
|
@@ -114,6 +112,9 @@ class Duckrun:
|
|
114
112
|
return name.split('__', 1)[0] if '__' in name else name
|
115
113
|
|
116
114
|
def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
|
115
|
+
if self.sql_folder is None:
|
116
|
+
raise RuntimeError("sql_folder is not configured. Cannot read SQL files.")
|
117
|
+
|
117
118
|
is_url = self.sql_folder.startswith("http")
|
118
119
|
if is_url:
|
119
120
|
url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
|
@@ -159,6 +160,9 @@ class Duckrun:
|
|
159
160
|
return content
|
160
161
|
|
161
162
|
def _load_py_function(self, name: str) -> Optional[Callable]:
|
163
|
+
if self.sql_folder is None:
|
164
|
+
raise RuntimeError("sql_folder is not configured. Cannot load Python functions.")
|
165
|
+
|
162
166
|
is_url = self.sql_folder.startswith("http")
|
163
167
|
try:
|
164
168
|
if is_url:
|
@@ -267,6 +271,9 @@ class Duckrun:
|
|
267
271
|
]
|
268
272
|
dr.run(pipeline)
|
269
273
|
"""
|
274
|
+
if self.sql_folder is None:
|
275
|
+
raise RuntimeError("sql_folder is not configured. Cannot run pipelines. Set sql_folder when creating connection.")
|
276
|
+
|
270
277
|
for i, task in enumerate(pipeline, 1):
|
271
278
|
print(f"\n{'='*60}")
|
272
279
|
print(f"Task {i}/{len(pipeline)}: {task[0]}")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: duckrun
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
|
5
5
|
License-Expression: MIT
|
6
6
|
Project-URL: Homepage, https://github.com/djouallah/duckrun
|
@@ -35,14 +35,14 @@ pip install duckrun
|
|
35
35
|
## Quick Start
|
36
36
|
|
37
37
|
```python
|
38
|
-
import duckrun
|
38
|
+
import duckrun
|
39
39
|
|
40
|
-
# Connect to your Fabric lakehouse
|
41
|
-
|
40
|
+
# Connect to your Fabric lakehouse (using `con` pattern)
|
41
|
+
con = duckrun.connect(
|
42
42
|
workspace="my_workspace",
|
43
43
|
lakehouse_name="my_lakehouse",
|
44
44
|
schema="dbo",
|
45
|
-
sql_folder="./sql" # folder containing your .sql and .py files
|
45
|
+
sql_folder="./sql" # optional: folder containing your .sql and .py files (only needed for pipeline tasks)
|
46
46
|
)
|
47
47
|
|
48
48
|
# Define your pipeline
|
@@ -53,9 +53,11 @@ pipeline = [
|
|
53
53
|
]
|
54
54
|
|
55
55
|
# Run it
|
56
|
-
|
56
|
+
con.run(pipeline)
|
57
57
|
```
|
58
58
|
|
59
|
+
Note: the `sql/` folder is optional — if all you want to do is explore data with SQL (for example by calling `con.sql(...)`), you don't need to provide a `sql_folder`.
|
60
|
+
|
59
61
|
## Early Exit
|
60
62
|
|
61
63
|
In a pipeline run, if a task fails, the pipeline will stop without running the subsequent tasks.
|
@@ -138,12 +140,14 @@ Both write to the same `sales` table, but use different SQL files.
|
|
138
140
|
|
139
141
|
```python
|
140
142
|
# Run queries
|
141
|
-
|
143
|
+
con.sql("SELECT * FROM my_table LIMIT 10").show()
|
142
144
|
|
143
145
|
# Get as DataFrame
|
144
|
-
df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
|
146
|
+
df = con.sql("SELECT COUNT(*) FROM sales").df()
|
145
147
|
```
|
146
148
|
|
149
|
+
Explanation: DuckDB is connected to the lakehouse through `con`, so it is aware of the tables in that lakehouse (including tables created by your pipelines). That means you can query those tables directly with `con.sql(...)` just like any other DuckDB query. If you don't provide a `sql_folder`, you can still use `con.sql(...)` to explore existing tables.
|
150
|
+
|
147
151
|
|
148
152
|
|
149
153
|
## Remote SQL Files
|
@@ -151,7 +155,7 @@ df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
|
|
151
155
|
You can load SQL/Python files from a URL:
|
152
156
|
|
153
157
|
```python
|
154
|
-
|
158
|
+
con = duckrun.connect(
|
155
159
|
workspace="Analytics",
|
156
160
|
lakehouse_name="Sales",
|
157
161
|
schema="dbo",
|
@@ -0,0 +1,7 @@
|
|
1
|
+
duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
|
2
|
+
duckrun/core.py,sha256=u56bWZDKevbplARgnFdI0wm9BfrIVyAiu3eOIwE5FJc,14259
|
3
|
+
duckrun-0.1.4.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
|
4
|
+
duckrun-0.1.4.dist-info/METADATA,sha256=eoPhYn2zC0s_YyEGdiCe1Gs7iWfKY9vakYm3rZdMrrs,4377
|
5
|
+
duckrun-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
6
|
+
duckrun-0.1.4.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
|
7
|
+
duckrun-0.1.4.dist-info/RECORD,,
|
duckrun-0.1.3.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
|
2
|
-
duckrun/core.py,sha256=Ok2IS15NcV6zFuFKFi2GOe1NKREoBQzjwAay-fCNf38,13774
|
3
|
-
duckrun-0.1.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
|
4
|
-
duckrun-0.1.3.dist-info/METADATA,sha256=BYek_gAWR_6QdCAJQAV7QnhoSQsaG0aprlMtAce9Z0k,3805
|
5
|
-
duckrun-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
6
|
-
duckrun-0.1.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
|
7
|
-
duckrun-0.1.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|