duckrun 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.1.2 → duckrun-0.1.4}/PKG-INFO +13 -9
- {duckrun-0.1.2 → duckrun-0.1.4}/README.md +12 -8
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun/core.py +48 -41
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/PKG-INFO +13 -9
- {duckrun-0.1.2 → duckrun-0.1.4}/pyproject.toml +1 -1
- {duckrun-0.1.2 → duckrun-0.1.4}/LICENSE +0 -0
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun/__init__.py +0 -0
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.1.2 → duckrun-0.1.4}/setup.cfg +0 -0
{duckrun-0.1.2 → duckrun-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.2
+Version: 0.1.4
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 License-Expression: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
@@ -35,14 +35,14 @@ pip install duckrun
 ## Quick Start
 
 ```python
-import duckrun
+import duckrun
 
-# Connect to your Fabric lakehouse
-lakehouse = duckrun.connect(
+# Connect to your Fabric lakehouse (using `con` pattern)
+con = duckrun.connect(
     workspace="my_workspace",
     lakehouse_name="my_lakehouse",
     schema="dbo",
-    sql_folder="./sql"  # folder containing your .sql and .py files
+    sql_folder="./sql"  # optional: folder containing your .sql and .py files (only needed for pipeline tasks)
 )
 
 # Define your pipeline
@@ -53,9 +53,11 @@ pipeline = [
 ]
 
 # Run it
-lakehouse.run(pipeline)
+con.run(pipeline)
 ```
 
+Note: the `sql/` folder is optional — if all you want to do is explore data with SQL (for example by calling `con.sql(...)`), you don't need to provide a `sql_folder`.
+
 ## Early Exit
 
 In a pipeline run, if a task fails, the pipeline will stop without running the subsequent tasks.
@@ -138,12 +140,14 @@ Both write to the same `sales` table, but use different SQL files.
 
 ```python
 # Run queries
-lakehouse.sql("SELECT * FROM my_table LIMIT 10").show()
+con.sql("SELECT * FROM my_table LIMIT 10").show()
 
 # Get as DataFrame
-df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
+df = con.sql("SELECT COUNT(*) FROM sales").df()
 ```
 
+Explanation: DuckDB is connected to the lakehouse through `con`, so it is aware of the tables in that lakehouse (including tables created by your pipelines). That means you can query those tables directly with `con.sql(...)` just like any other DuckDB query. If you don't provide a `sql_folder`, you can still use `con.sql(...)` to explore existing tables.
+
 
 
 ## Remote SQL Files
@@ -151,7 +155,7 @@ df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
 You can load SQL/Python files from a URL:
 
 ```python
-lakehouse = duckrun.connect(
+con = duckrun.connect(
     workspace="Analytics",
     lakehouse_name="Sales",
     schema="dbo",
{duckrun-0.1.2 → duckrun-0.1.4}/README.md

@@ -19,14 +19,14 @@ pip install duckrun
 ## Quick Start
 
 ```python
-import duckrun
+import duckrun
 
-# Connect to your Fabric lakehouse
-lakehouse = duckrun.connect(
+# Connect to your Fabric lakehouse (using `con` pattern)
+con = duckrun.connect(
     workspace="my_workspace",
     lakehouse_name="my_lakehouse",
     schema="dbo",
-    sql_folder="./sql"  # folder containing your .sql and .py files
+    sql_folder="./sql"  # optional: folder containing your .sql and .py files (only needed for pipeline tasks)
 )
 
 # Define your pipeline
@@ -37,9 +37,11 @@ pipeline = [
 ]
 
 # Run it
-lakehouse.run(pipeline)
+con.run(pipeline)
 ```
 
+Note: the `sql/` folder is optional — if all you want to do is explore data with SQL (for example by calling `con.sql(...)`), you don't need to provide a `sql_folder`.
+
 ## Early Exit
 
 In a pipeline run, if a task fails, the pipeline will stop without running the subsequent tasks.
@@ -122,12 +124,14 @@ Both write to the same `sales` table, but use different SQL files.
 
 ```python
 # Run queries
-lakehouse.sql("SELECT * FROM my_table LIMIT 10").show()
+con.sql("SELECT * FROM my_table LIMIT 10").show()
 
 # Get as DataFrame
-df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
+df = con.sql("SELECT COUNT(*) FROM sales").df()
 ```
 
+Explanation: DuckDB is connected to the lakehouse through `con`, so it is aware of the tables in that lakehouse (including tables created by your pipelines). That means you can query those tables directly with `con.sql(...)` just like any other DuckDB query. If you don't provide a `sql_folder`, you can still use `con.sql(...)` to explore existing tables.
+
 
 
 ## Remote SQL Files
@@ -135,7 +139,7 @@ df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
 You can load SQL/Python files from a URL:
 
 ```python
-lakehouse = duckrun.connect(
+con = duckrun.connect(
    workspace="Analytics",
    lakehouse_name="Sales",
    schema="dbo",
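A minimal sketch of the exploration-only workflow the updated README describes, assuming the 0.1.4 API shown above; the workspace, lakehouse, and table names below are placeholders, not values from the package:

```python
import duckrun

# Connect without sql_folder: enough for ad-hoc SQL exploration.
# Pipeline runs (con.run) would additionally require sql_folder.
con = duckrun.connect(
    workspace="my_workspace",        # placeholder
    lakehouse_name="my_lakehouse",   # placeholder
    schema="dbo",
)

# Attached Delta tables are exposed as DuckDB views, so plain SQL works:
con.sql("SELECT COUNT(*) AS n FROM sales").show()

# Or pull results into a pandas DataFrame:
df = con.sql("SELECT * FROM sales LIMIT 10").df()
print(df.head())
```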
{duckrun-0.1.2 → duckrun-0.1.4}/duckrun/core.py

@@ -16,23 +16,21 @@ class Duckrun:
         SQL: ('table_name', 'mode', {params})
 
     Usage:
+        # For pipelines:
         dr = Duckrun.connect(workspace, lakehouse, schema, sql_folder)
-
-        pipeline = [
-            ('download', (urls, paths, depth)),
-            ('staging', 'overwrite', {'run_date': '2024-06-01'}),
-            ('transform', 'append')
-        ]
-
         dr.run(pipeline)
+
+        # For data exploration only:
+        dr = Duckrun.connect(workspace, lakehouse, schema)
+        dr.sql("SELECT * FROM table").show()
     """
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str,
-                 sql_folder: str, compaction_threshold: int = 10):
+                 sql_folder: Optional[str] = None, compaction_threshold: int = 10):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
-        self.sql_folder = sql_folder.strip()
+        self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
@@ -41,7 +39,7 @@ class Duckrun:
 
     @classmethod
     def connect(cls, workspace: str, lakehouse_name: str, schema: str,
-                sql_folder: str, compaction_threshold: int =
+                sql_folder: Optional[str] = None, compaction_threshold: int = 100):
         """Create and connect to lakehouse"""
         print("Connecting to Lakehouse...")
         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
@@ -64,44 +62,44 @@ class Duckrun:
     def _attach_lakehouse(self):
         self._create_onelake_secret()
         try:
-            #
+            # Use expensive list operation but filter for _delta_log folders only
+            # This avoids parsing JSON content that causes Iceberg metadata issues
+            print(f"Scanning for Delta tables in {self.schema}... (this may take a moment)")
+
             list_tables_query = f"""
-                SELECT DISTINCT
-
-
+                SELECT DISTINCT
+                    regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) as table_name
+                FROM glob("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/{self.schema}/**")
+                WHERE file LIKE '%/_delta_log/%'
+                  AND file NOT LIKE '%/metadata/%'
                   AND file NOT LIKE '%/iceberg/%'
-                  AND
-                  AND split_part(file, '_delta_log', 1) NOT LIKE '%/iceberg'
+                  AND regexp_extract(file, 'Tables/{self.schema}/([^/]+)/_delta_log', 1) IS NOT NULL
             """
+
             list_tables_df = self.con.sql(list_tables_query).df()
-
-
-
-                print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables.")
+
+            if list_tables_df.empty:
+                print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}.")
                 return
+
+            table_names = list_tables_df['table_name'].tolist()
 
-            print(f"Found {len(
+            print(f"Found {len(table_names)} Delta tables. Attaching as views...")
 
-            for
-            if
-            ...
-                        AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
-                    """)
-                    print(f"  ✓ Attached: {table}")
-                except Exception as e:
-                    print(f"  ⚠ Skipped {table}: {str(e)[:100]}")
-                    continue
+            for table in table_names:
+                # Skip Iceberg-related folders and empty names
+                if not table or table in ('metadata', 'iceberg'):
+                    continue
+
+                try:
+                    self.con.sql(f"""
+                        CREATE OR REPLACE VIEW {table}
+                        AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
+                    """)
+                    print(f"  ✓ Attached: {table}")
+                except Exception as e:
+                    print(f"  ⚠ Skipped {table}: {str(e)[:100]}")
+                    continue
 
             print("\nAttached tables (views) in DuckDB:")
             self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
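The rewritten `_attach_lakehouse` discovers tables by globbing the storage path and extracting the parent folder of each `_delta_log` directory. A rough, self-contained sketch of the same idea against a hypothetical local folder (the path, schema, and table layout here are made up for illustration; the real code targets OneLake over `abfss://` and uses DuckDB's delta extension):

```python
import duckdb

con = duckdb.connect()

# Hypothetical local layout mirroring Tables/<schema>/<table>/_delta_log/...
base = "/data/lakehouse/Tables/dbo"  # made-up path, for illustration only

# Same idea as the new _attach_lakehouse: glob every file under the schema,
# keep only paths that contain a _delta_log folder, and pull the table name
# out of the path with regexp_extract.
tables = con.sql(f"""
    SELECT DISTINCT
        regexp_extract(file, 'Tables/dbo/([^/]+)/_delta_log', 1) AS table_name
    FROM glob('{base}/**')
    WHERE file LIKE '%/_delta_log/%'
      AND file NOT LIKE '%/metadata/%'
      AND file NOT LIKE '%/iceberg/%'
""").df()

for table in tables["table_name"]:
    if not table or table in ("metadata", "iceberg"):
        continue
    # Requires the delta extension: con.sql("INSTALL delta; LOAD delta;")
    con.sql(f"CREATE OR REPLACE VIEW {table} AS SELECT * FROM delta_scan('{base}/{table}')")
```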
@@ -114,6 +112,9 @@ class Duckrun:
         return name.split('__', 1)[0] if '__' in name else name
 
     def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
+        if self.sql_folder is None:
+            raise RuntimeError("sql_folder is not configured. Cannot read SQL files.")
+
         is_url = self.sql_folder.startswith("http")
         if is_url:
             url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
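The hunk above shows only the new guard and the start of the URL branch. As a hedged illustration of what `_read_sql_file` presumably does with the two cases (the full body is not visible in this diff), a standalone sketch:

```python
import os
from urllib.request import urlopen

def read_sql_file(sql_folder: str, table_name: str) -> str:
    """Sketch only: mirrors the local-vs-URL branch visible in the hunk.
    The actual duckrun implementation is not fully shown in this diff."""
    if sql_folder.startswith("http"):
        url = f"{sql_folder.rstrip('/')}/{table_name}.sql"
        with urlopen(url) as resp:  # assumption: a plain HTTP(S) GET of the .sql file
            return resp.read().decode("utf-8")
    path = os.path.join(sql_folder, f"{table_name}.sql")
    with open(path, "r", encoding="utf-8") as f:
        return f.read()
```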
@@ -159,6 +160,9 @@ class Duckrun:
             return content
 
     def _load_py_function(self, name: str) -> Optional[Callable]:
+        if self.sql_folder is None:
+            raise RuntimeError("sql_folder is not configured. Cannot load Python functions.")
+
         is_url = self.sql_folder.startswith("http")
         try:
             if is_url:
@@ -267,6 +271,9 @@ class Duckrun:
         ]
         dr.run(pipeline)
         """
+        if self.sql_folder is None:
+            raise RuntimeError("sql_folder is not configured. Cannot run pipelines. Set sql_folder when creating connection.")
+
         for i, task in enumerate(pipeline, 1):
             print(f"\n{'='*60}")
             print(f"Task {i}/{len(pipeline)}: {task[0]}")
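Taken together, the three guards mean a connection created without `sql_folder` still supports ad-hoc `sql(...)` queries but fails fast on pipeline features. A small usage sketch (names are placeholders):

```python
import duckrun

# No sql_folder: exploration-only connection (placeholder names)
con = duckrun.connect("my_workspace", "my_lakehouse", "dbo")

con.sql("SELECT 1 AS ok").show()  # plain queries still work

try:
    con.run([("staging", "overwrite")])  # pipelines need sql_folder
except RuntimeError as err:
    print(err)  # "sql_folder is not configured. Cannot run pipelines. ..."
```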
{duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/PKG-INFO

Same changes as PKG-INFO above (the egg-info metadata embeds the same README content).
Files without changes:

- {duckrun-0.1.2 → duckrun-0.1.4}/LICENSE
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun/__init__.py
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/SOURCES.txt
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/dependency_links.txt
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/requires.txt
- {duckrun-0.1.2 → duckrun-0.1.4}/duckrun.egg-info/top_level.txt
- {duckrun-0.1.2 → duckrun-0.1.4}/setup.cfg