duckrun-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/__init__.py +10 -0
- duckrun/core.py +310 -0
- duckrun-0.0.0.dist-info/METADATA +5 -0
- duckrun-0.0.0.dist-info/RECORD +7 -0
- duckrun-0.0.0.dist-info/WHEEL +5 -0
- duckrun-0.0.0.dist-info/licenses/LICENSE +1 -0
- duckrun-0.0.0.dist-info/top_level.txt +1 -0
duckrun/__init__.py
ADDED
duckrun/core.py
ADDED
@@ -0,0 +1,310 @@

```python
import duckdb
import requests
import os
import importlib.util
from deltalake import DeltaTable, write_deltalake
from typing import List, Tuple, Union, Optional, Callable, Dict, Any
from string import Template


class Duckrun:
    """
    Lakehouse task runner with clean tuple-based API.
    Powered by DuckDB for fast data processing.

    Task formats:
        Python: ('function_name', (arg1, arg2, ...))
        SQL: ('table_name', 'mode', {params})

    Usage:
        dr = Duckrun.connect(workspace, lakehouse, schema, sql_folder)

        pipeline = [
            ('download', (urls, paths, depth)),
            ('staging', 'overwrite', {'run_date': '2024-06-01'}),
            ('transform', 'append')
        ]

        dr.run(pipeline)
    """

    def __init__(self, workspace: str, lakehouse_name: str, schema: str,
                 sql_folder: str, compaction_threshold: int = 10):
        self.workspace = workspace
        self.lakehouse_name = lakehouse_name
        self.schema = schema
        self.sql_folder = sql_folder.strip()
        self.compaction_threshold = compaction_threshold
        self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
        self.con = duckdb.connect()
        self.con.sql("SET preserve_insertion_order = false")
        self._attach_lakehouse()

    @classmethod
    def connect(cls, workspace: str, lakehouse_name: str, schema: str,
                sql_folder: str, compaction_threshold: int = 10):
        """Create and connect to lakehouse"""
        print("Connecting to Lakehouse...")
        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)

    def _get_storage_token(self):
        return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")

    def _create_onelake_secret(self):
        token = self._get_storage_token()
        if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
            self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
        else:
            print("Please login to Azure CLI")
            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
            token = credential.get_token("https://storage.azure.com/.default")
            os.environ["AZURE_STORAGE_TOKEN"] = token.token
            self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
```
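Note that `_get_storage_token` only reads the `AZURE_STORAGE_TOKEN` environment variable; when it is unset, `_create_onelake_secret` falls back to an interactive Azure CLI / browser login. A caller that already holds a storage token can pre-seed the variable and skip the interactive path. A minimal sketch, assuming `azure-identity` is installed (the scope URL and variable name come from the code above):

```python
# Sketch: pre-seed AZURE_STORAGE_TOKEN so _create_onelake_secret takes the
# non-interactive branch instead of prompting for an Azure CLI login.
import os
from azure.identity import AzureCliCredential

token = AzureCliCredential().get_token("https://storage.azure.com/.default")
os.environ["AZURE_STORAGE_TOKEN"] = token.token  # read by _get_storage_token()
```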
```python
    def _attach_lakehouse(self):
        self._create_onelake_secret()
        try:
            list_tables_query = f"""
                SELECT DISTINCT(split_part(file, '_delta_log', 1)) as tables
                FROM glob ("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/*/*/_delta_log/*.json")
            """
            list_tables_df = self.con.sql(list_tables_query).df()
            list_tables = list_tables_df['tables'].tolist() if not list_tables_df.empty else []

            if not list_tables:
                print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables.")
                return

            print(f"Found {len(list_tables)} Delta tables. Attaching as views...")

            for table_path in list_tables:
                parts = table_path.strip("/").split("/")
                if len(parts) >= 2:
                    potential_schema = parts[-2]
                    table = parts[-1]
                    if potential_schema == self.schema:
                        try:
                            self.con.sql(f"""
                                CREATE OR REPLACE VIEW {table}
                                AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
                            """)
                        except Exception as e:
                            print(f"Error creating view for table {table}: {e}")
            print("\nAttached tables (views) in DuckDB:")
            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
        except Exception as e:
            print(f"Error attaching lakehouse: {e}")

    def _normalize_table_name(self, name: str) -> str:
        """Extract base table name before first '__'"""
        return name.split('__', 1)[0] if '__' in name else name
```
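The `__` naming convention lets several task files share one target table: everything from the first `__` onward is treated as a variant suffix and stripped before the Delta path is built. Illustrative values (the method is private; this just shows the mapping):

```python
# Illustrative: staging__initial.sql and staging__daily.sql both write to
# the Delta table "staging"; a name without "__" is unchanged.
dr._normalize_table_name("staging__initial")  # -> "staging"
dr._normalize_table_name("staging__daily")    # -> "staging"
dr._normalize_table_name("calendar")          # -> "calendar"
```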
```python
    def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
        is_url = self.sql_folder.startswith("http")
        if is_url:
            url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
            try:
                resp = requests.get(url)
                resp.raise_for_status()
                content = resp.text
            except Exception as e:
                print(f"Failed to fetch SQL from {url}: {e}")
                return None
        else:
            path = os.path.join(self.sql_folder, f"{table_name}.sql")
            try:
                with open(path, 'r') as f:
                    content = f.read()
            except Exception as e:
                print(f"Failed to read SQL file {path}: {e}")
                return None

        if not content.strip():
            print(f"SQL file is empty: {table_name}.sql")
            return None

        # Auto-inject common params, merge with user params
        full_params = {
            'ws': self.workspace,
            'lh': self.lakehouse_name,
            'schema': self.schema
        }
        if params:
            full_params.update(params)

        try:
            template = Template(content)
            content = template.substitute(full_params)
        except KeyError as e:
            print(f"Missing parameter in SQL file: ${e}")
            return None
        except Exception as e:
            print(f"Error during SQL template substitution: {e}")
            return None

        return content
```
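Since substitution goes through `string.Template`, placeholders in a task's `.sql` file use `$name` syntax; `$ws`, `$lh`, and `$schema` are always injected, and anything else must come from the task's params dict (a missing key aborts the task). A sketch of what a hypothetical `staging.sql` and its substitution would look like — the table name `raw_events` is invented for illustration:

```python
# Hypothetical staging.sql content; $ws/$lh/$schema are auto-injected by
# _read_sql_file, $run_date must come from ('staging', 'overwrite', {...}).
from string import Template

sql_text = """
SELECT *
FROM delta_scan('abfss://$ws@onelake.dfs.fabric.microsoft.com/$lh.Lakehouse/Tables/$schema/raw_events')
WHERE event_date = '$run_date'
"""
print(Template(sql_text).substitute(
    ws="my_workspace", lh="my_lakehouse", schema="dbo", run_date="2024-06-01"
))
```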
```python
    def _load_py_function(self, name: str) -> Optional[Callable]:
        is_url = self.sql_folder.startswith("http")
        try:
            if is_url:
                url = f"{self.sql_folder.rstrip('/')}/{name}.py".strip()
                resp = requests.get(url)
                resp.raise_for_status()
                code = resp.text
                namespace = {}
                exec(code, namespace)
                func = namespace.get(name)
                return func if callable(func) else None
            else:
                path = os.path.join(self.sql_folder, f"{name}.py")
                if not os.path.isfile(path):
                    print(f"Python file not found: {path}")
                    return None
                spec = importlib.util.spec_from_file_location(name, path)
                mod = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(mod)
                func = getattr(mod, name, None)
                return func if callable(func) else None
        except Exception as e:
            print(f"Error loading Python function '{name}': {e}")
            return None
```
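A Python task file lives in the same folder (or URL) as the SQL files and must define a callable whose name matches the file stem; the loader imports the module and looks that name up. A hypothetical `download.py` matching the pipeline example in the class docstring:

```python
# Hypothetical tasks/download.py: _load_py_function("download") imports this
# file and returns the callable named "download".
import os
import requests

def download(urls, paths, depth):
    """Fetch each URL to the matching local path (depth is illustrative)."""
    for url, path in zip(urls, paths):
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        with open(path, "wb") as f:
            f.write(requests.get(url).content)
    return len(urls)
```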
```python
    def _run_python(self, name: str, args: tuple) -> Any:
        """Execute Python task, return result"""
        self._create_onelake_secret()
        func = self._load_py_function(name)
        if not func:
            raise RuntimeError(f"Python function '{name}' not found")

        print(f"Running Python: {name}{args}")
        result = func(*args)
        print(f"✅ Python '{name}' completed")
        return result

    def _run_sql(self, table: str, mode: str, params: Dict) -> str:
        """Execute SQL task, write to Delta, return normalized table name"""
        self._create_onelake_secret()

        if mode not in {'overwrite', 'append', 'ignore'}:
            raise ValueError(f"Invalid mode '{mode}'. Use: overwrite, append, or ignore")

        sql = self._read_sql_file(table, params)
        if sql is None:
            raise RuntimeError(f"Failed to read SQL file for '{table}'")

        normalized_table = self._normalize_table_name(table)
        path = f"{self.table_base_url}{self.schema}/{normalized_table}"

        if mode == 'overwrite':
            self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
            df = self.con.sql(sql).record_batch()
            write_deltalake(path, df, mode='overwrite')
            self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
            dt = DeltaTable(path)
            dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
            dt.cleanup_metadata()

        elif mode == 'append':
            df = self.con.sql(sql).record_batch()
            write_deltalake(path, df, mode='append')
            self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
            dt = DeltaTable(path)
            if len(dt.file_uris()) > self.compaction_threshold:
                print(f"Compacting {normalized_table} ({len(dt.file_uris())} files)")
                dt.optimize.compact()
                dt.vacuum(dry_run=False)
                dt.cleanup_metadata()

        elif mode == 'ignore':
            try:
                DeltaTable(path)
                print(f"Table {normalized_table} exists. Skipping (mode='ignore')")
            except Exception:
                print(f"Table {normalized_table} doesn't exist. Creating...")
                self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
                df = self.con.sql(sql).record_batch()
                write_deltalake(path, df, mode='overwrite')
                self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
                dt = DeltaTable(path)
                dt.vacuum(dry_run=False)
                dt.cleanup_metadata()

        print(f"✅ SQL '{table}' → '{normalized_table}' ({mode})")
        return normalized_table
```
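For quick reference, the three SQL write modes implemented above behave as follows:

- `overwrite`: drop the DuckDB view, rewrite the Delta table, recreate the view, then vacuum aggressively (`retention_hours=0`) and clean up metadata.
- `append`: append the query result, recreate the view, and compact plus vacuum once the table holds more files than `compaction_threshold`.
- `ignore`: probe the table with `DeltaTable(path)` and only create it (as an overwrite) when the probe fails, i.e. when the table does not exist yet.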
```python
    def run(self, pipeline: List[Tuple]) -> bool:
        """
        Execute pipeline of tasks.

        Task formats:
            - Python: ('function_name', (arg1, arg2, ...))
            - SQL: ('table_name', 'mode') or ('table_name', 'mode', {params})

        Returns:
            True if all tasks succeeded

        Example:
            pipeline = [
                ('download', (urls, paths, depth)),
                ('staging', 'overwrite', {'run_date': '2024-06-01'}),
                ('transform', 'append'),  # {} optional!
                ('calendar', 'ignore')    # {} optional!
            ]
            dr.run(pipeline)
        """
        for i, task in enumerate(pipeline, 1):
            print(f"\n{'='*60}")
            print(f"Task {i}/{len(pipeline)}: {task[0]}")
            print('='*60)

            try:
                if len(task) == 2:
                    # Could be Python: ('name', (args,)) or SQL: ('table', 'mode')
                    name, second = task
                    if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
                        # SQL task without params: ('table', 'mode')
                        self._run_sql(name, second, {})
                    else:
                        # Python task: ('name', (args,))
                        args = second if isinstance(second, (tuple, list)) else (second,)
                        self._run_python(name, tuple(args))

                elif len(task) == 3:
                    # SQL task with params: ('table', 'mode', {params})
                    table, mode, params = task
                    if not isinstance(params, dict):
                        raise ValueError(f"Expected dict for params, got {type(params)}")
                    self._run_sql(table, mode, params)

                else:
                    raise ValueError(f"Invalid task format: {task}")

            except Exception as e:
                print(f"\n❌ Task {i} failed: {e}")
                return False

        print(f"\n{'='*60}")
        print("✅ All tasks completed successfully")
        print('='*60)
        return True

    def sql(self, query: str):
        """
        Execute raw SQL query.

        Example:
            dr.sql("SELECT * FROM table").show()
            df = dr.sql("SELECT * FROM table").df()
        """
        return self.con.sql(query)

    def get_connection(self):
        """Get underlying DuckDB connection"""
        return self.con

    def close(self):
        """Close DuckDB connection"""
        if self.con:
            self.con.close()
            print("Connection closed")
```
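Putting the pieces together, a minimal end-to-end session might look like the sketch below. All names are placeholders, and the import path assumes the module layout shown in this diff (`duckrun/core.py`):

```python
from duckrun.core import Duckrun

# Placeholders: substitute a real Fabric workspace, lakehouse, schema,
# and a folder (or URL) holding the .sql/.py task files.
dr = Duckrun.connect("my_workspace", "my_lakehouse", "dbo", "./tasks")

dr.run([
    ("download", (["https://example.com/a.csv"], ["/tmp/a.csv"], 1)),  # tasks/download.py
    ("staging", "overwrite", {"run_date": "2024-06-01"}),              # tasks/staging.sql
    ("transform", "append"),                                           # tasks/transform.sql
])

dr.sql("SELECT COUNT(*) FROM transform").show()
dr.close()
```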
duckrun-0.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@

```
duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
duckrun/core.py,sha256=-Vf2nYwhdsVpTZS9mGBtm8j_HNAcHR7Cj075pida3Yw,13133
duckrun-0.0.0.dist-info/licenses/LICENSE,sha256=b0pMNsWFx7PvXXtQo-XLqFnPRirAtdWBWwQp39phnWI,20
duckrun-0.0.0.dist-info/METADATA,sha256=lIxf9HnDm_FlrLTRid0-ByYLw40wiozlSeOpvPgTomE,100
duckrun-0.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
duckrun-0.0.0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
duckrun-0.0.0.dist-info/RECORD,,
```
duckrun-0.0.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1 @@

```
### **5. `LICENSE`**
```
duckrun-0.0.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@

```
duckrun
```