duckrun-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """Duckrun - Lakehouse task runner powered by DuckDB"""
+
+ from duckrun.core import Duckrun
+
+ __version__ = "0.1.0"
+
+ # Expose connect at module level for: import duckrun as dr
+ connect = Duckrun.connect
+
+ __all__ = ["Duckrun", "connect"]
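
Note: because `connect` is re-exported at module level, the class can be driven entirely through the package namespace. A minimal sketch (the workspace, lakehouse, schema, and folder names below are hypothetical placeholders):

    import duckrun as dr

    # Equivalent to Duckrun.connect(...); arguments are illustrative only
    con = dr.connect("my_workspace", "my_lakehouse", "dbo", "./sql")
    con.sql("SELECT 1 AS ok").show()
    con.close()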
duckrun/core.py ADDED
@@ -0,0 +1,310 @@
+ import duckdb
+ import requests
+ import os
+ import importlib.util
+ from deltalake import DeltaTable, write_deltalake
+ from typing import List, Tuple, Union, Optional, Callable, Dict, Any
+ from string import Template
+
+ class Duckrun:
+     """
+     Lakehouse task runner with clean tuple-based API.
+     Powered by DuckDB for fast data processing.
+
+     Task formats:
+         Python: ('function_name', (arg1, arg2, ...))
+         SQL: ('table_name', 'mode', {params})
+
+     Usage:
+         dr = Duckrun.connect(workspace, lakehouse, schema, sql_folder)
+
+         pipeline = [
+             ('download', (urls, paths, depth)),
+             ('staging', 'overwrite', {'run_date': '2024-06-01'}),
+             ('transform', 'append')
+         ]
+
+         dr.run(pipeline)
+     """
+
+     def __init__(self, workspace: str, lakehouse_name: str, schema: str,
+                  sql_folder: str, compaction_threshold: int = 10):
+         self.workspace = workspace
+         self.lakehouse_name = lakehouse_name
+         self.schema = schema
+         self.sql_folder = sql_folder.strip()
+         self.compaction_threshold = compaction_threshold
+         self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
+         self.con = duckdb.connect()
+         self.con.sql("SET preserve_insertion_order = false")
+         self._attach_lakehouse()
+
+     @classmethod
+     def connect(cls, workspace: str, lakehouse_name: str, schema: str,
+                 sql_folder: str, compaction_threshold: int = 10):
+         """Create and connect to lakehouse"""
+         print("Connecting to Lakehouse...")
+         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold)
+
+     def _get_storage_token(self):
+         return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
+
+     def _create_onelake_secret(self):
+         token = self._get_storage_token()
+         if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+             self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
+         else:
+             print("Please login to Azure CLI")
+             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+             token = credential.get_token("https://storage.azure.com/.default")
+             os.environ["AZURE_STORAGE_TOKEN"] = token.token
+             self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
+
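Note: if `AZURE_STORAGE_TOKEN` is already set in the environment, the DuckDB secret is built directly from it; otherwise the code falls back to an Azure CLI / interactive-browser credential chain. A hedged sketch of pre-seeding the token yourself with azure-identity (assuming the package is installed and `az login` has been run):

    from azure.identity import AzureCliCredential
    import os

    # Fetch a storage-scoped token and place it where _get_storage_token() looks
    token = AzureCliCredential().get_token("https://storage.azure.com/.default")
    os.environ["AZURE_STORAGE_TOKEN"] = token.token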
+     def _attach_lakehouse(self):
+         self._create_onelake_secret()
+         try:
+             list_tables_query = f"""
+                 SELECT DISTINCT(split_part(file, '_delta_log', 1)) as tables
+                 FROM glob ("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/*/*/_delta_log/*.json")
+             """
+             list_tables_df = self.con.sql(list_tables_query).df()
+             list_tables = list_tables_df['tables'].tolist() if not list_tables_df.empty else []
+
+             if not list_tables:
+                 print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables.")
+                 return
+
+             print(f"Found {len(list_tables)} Delta tables. Attaching as views...")
+
+             for table_path in list_tables:
+                 parts = table_path.strip("/").split("/")
+                 if len(parts) >= 2:
+                     potential_schema = parts[-2]
+                     table = parts[-1]
+                     if potential_schema == self.schema:
+                         try:
+                             self.con.sql(f"""
+                                 CREATE OR REPLACE VIEW {table}
+                                 AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
+                             """)
+                         except Exception as e:
+                             print(f"Error creating view for table {table}: {e}")
+             print("\nAttached tables (views) in DuckDB:")
+             self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
+         except Exception as e:
+             print(f"Error attaching lakehouse: {e}")
+
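Note: `_attach_lakehouse` globs every `_delta_log` under the lakehouse's Tables folder, keeps only tables in the configured schema, and exposes each as a DuckDB view over `delta_scan`. For illustration (names hypothetical), a table stored at

    abfss://my_workspace@onelake.dfs.fabric.microsoft.com/my_lakehouse.Lakehouse/Tables/dbo/sales

would then be queryable as a plain view:

    con.sql("SELECT COUNT(*) FROM sales").show()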
+     def _normalize_table_name(self, name: str) -> str:
+         """Extract base table name before first '__'"""
+         return name.split('__', 1)[0] if '__' in name else name
+
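Note: the `__` suffix convention lets several task files write to the same Delta table; for example, hypothetical `sales__initial.sql` and `sales__daily.sql` files would both resolve to the table `sales`:

    # Both normalize to 'sales'; file names are illustrative only
    assert "sales__daily".split("__", 1)[0] == "sales"
    assert "sales__initial".split("__", 1)[0] == "sales"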
+     def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
+         is_url = self.sql_folder.startswith("http")
+         if is_url:
+             url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
+             try:
+                 resp = requests.get(url)
+                 resp.raise_for_status()
+                 content = resp.text
+             except Exception as e:
+                 print(f"Failed to fetch SQL from {url}: {e}")
+                 return None
+         else:
+             path = os.path.join(self.sql_folder, f"{table_name}.sql")
+             try:
+                 with open(path, 'r') as f:
+                     content = f.read()
+             except Exception as e:
+                 print(f"Failed to read SQL file {path}: {e}")
+                 return None
+
+         if not content.strip():
+             print(f"SQL file is empty: {table_name}.sql")
+             return None
+
+         # Auto-inject common params, merge with user params
+         full_params = {
+             'ws': self.workspace,
+             'lh': self.lakehouse_name,
+             'schema': self.schema
+         }
+         if params:
+             full_params.update(params)
+
+         try:
+             template = Template(content)
+             content = template.substitute(full_params)
+         except KeyError as e:
+             print(f"Missing parameter in SQL file: ${e}")
+             return None
+         except Exception as e:
+             print(f"Error during SQL template substitution: {e}")
+             return None
+
+         return content
+
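Note: SQL task files are `string.Template` documents: `$ws`, `$lh`, and `$schema` are always injected, and any extra placeholders must be supplied through the task's params dict. A hypothetical `staging.sql` could look like this, with `$run_date` coming from a task such as `('staging', 'overwrite', {'run_date': '2024-06-01'})` (the table name `raw_events` is illustrative only):

    -- staging.sql (illustrative only)
    SELECT *
    FROM delta_scan('abfss://$ws@onelake.dfs.fabric.microsoft.com/$lh.Lakehouse/Tables/$schema/raw_events')
    WHERE event_date = '$run_date'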
+     def _load_py_function(self, name: str) -> Optional[Callable]:
+         is_url = self.sql_folder.startswith("http")
+         try:
+             if is_url:
+                 url = f"{self.sql_folder.rstrip('/')}/{name}.py".strip()
+                 resp = requests.get(url)
+                 resp.raise_for_status()
+                 code = resp.text
+                 namespace = {}
+                 exec(code, namespace)
+                 func = namespace.get(name)
+                 return func if callable(func) else None
+             else:
+                 path = os.path.join(self.sql_folder, f"{name}.py")
+                 if not os.path.isfile(path):
+                     print(f"Python file not found: {path}")
+                     return None
+                 spec = importlib.util.spec_from_file_location(name, path)
+                 mod = importlib.util.module_from_spec(spec)
+                 spec.loader.exec_module(mod)
+                 func = getattr(mod, name, None)
+                 return func if callable(func) else None
+         except Exception as e:
+             print(f"Error loading Python function '{name}': {e}")
+             return None
+
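Note: Python tasks are resolved by loading `<name>.py` from the task folder (or URL) and looking up a callable with the same name. A hypothetical `download.py` task file, invoked from a pipeline as `('download', (urls, paths, depth))`:

    # download.py -- the function must share the file's name
    def download(urls, paths, depth):
        # Illustrative body only
        for url, path in zip(urls, paths):
            print(f"would fetch {url} -> {path} (depth={depth})")
        return len(urls)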
+     def _run_python(self, name: str, args: tuple) -> Any:
+         """Execute Python task, return result"""
+         self._create_onelake_secret()
+         func = self._load_py_function(name)
+         if not func:
+             raise RuntimeError(f"Python function '{name}' not found")
+
+         print(f"Running Python: {name}{args}")
+         result = func(*args)
+         print(f"✅ Python '{name}' completed")
+         return result
+
+     def _run_sql(self, table: str, mode: str, params: Dict) -> str:
+         """Execute SQL task, write to Delta, return normalized table name"""
+         self._create_onelake_secret()
+
+         if mode not in {'overwrite', 'append', 'ignore'}:
+             raise ValueError(f"Invalid mode '{mode}'. Use: overwrite, append, or ignore")
+
+         sql = self._read_sql_file(table, params)
+         if sql is None:
+             raise RuntimeError(f"Failed to read SQL file for '{table}'")
+
+         normalized_table = self._normalize_table_name(table)
+         path = f"{self.table_base_url}{self.schema}/{normalized_table}"
+
+         if mode == 'overwrite':
+             self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
+             df = self.con.sql(sql).record_batch()
+             write_deltalake(path, df, mode='overwrite')
+             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
+             dt = DeltaTable(path)
+             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
+             dt.cleanup_metadata()
+
+         elif mode == 'append':
+             df = self.con.sql(sql).record_batch()
+             write_deltalake(path, df, mode='append')
+             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
+             dt = DeltaTable(path)
+             if len(dt.file_uris()) > self.compaction_threshold:
+                 print(f"Compacting {normalized_table} ({len(dt.file_uris())} files)")
+                 dt.optimize.compact()
+                 dt.vacuum(dry_run=False)
+                 dt.cleanup_metadata()
+
+         elif mode == 'ignore':
+             try:
+                 DeltaTable(path)
+                 print(f"Table {normalized_table} exists. Skipping (mode='ignore')")
+             except Exception:
+                 print(f"Table {normalized_table} doesn't exist. Creating...")
+                 self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
+                 df = self.con.sql(sql).record_batch()
+                 write_deltalake(path, df, mode='overwrite')
+                 self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
+                 dt = DeltaTable(path)
+                 dt.vacuum(dry_run=False)
+                 dt.cleanup_metadata()
+
+         print(f"✅ SQL '{table}' → '{normalized_table}' ({mode})")
+         return normalized_table
+
+     def run(self, pipeline: List[Tuple]) -> bool:
+         """
+         Execute pipeline of tasks.
+
+         Task formats:
+             - Python: ('function_name', (arg1, arg2, ...))
+             - SQL: ('table_name', 'mode') or ('table_name', 'mode', {params})
+
+         Returns:
+             True if all tasks succeeded
+
+         Example:
+             pipeline = [
+                 ('download', (urls, paths, depth)),
+                 ('staging', 'overwrite', {'run_date': '2024-06-01'}),
+                 ('transform', 'append'),   # {} optional!
+                 ('calendar', 'ignore')     # {} optional!
+             ]
+             dr.run(pipeline)
+         """
+         for i, task in enumerate(pipeline, 1):
+             print(f"\n{'='*60}")
+             print(f"Task {i}/{len(pipeline)}: {task[0]}")
+             print('='*60)
+
+             try:
+                 if len(task) == 2:
+                     # Could be Python: ('name', (args,)) or SQL: ('table', 'mode')
+                     name, second = task
+                     if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
+                         # SQL task without params: ('table', 'mode')
+                         self._run_sql(name, second, {})
+                     else:
+                         # Python task: ('name', (args,))
+                         args = second if isinstance(second, (tuple, list)) else (second,)
+                         self._run_python(name, tuple(args))
+
+                 elif len(task) == 3:
+                     # SQL task with params: ('table', 'mode', {params})
+                     table, mode, params = task
+                     if not isinstance(params, dict):
+                         raise ValueError(f"Expected dict for params, got {type(params)}")
+                     self._run_sql(table, mode, params)
+
+                 else:
+                     raise ValueError(f"Invalid task format: {task}")
+
+             except Exception as e:
+                 print(f"\n❌ Task {i} failed: {e}")
+                 return False
+
+         print(f"\n{'='*60}")
+         print("✅ All tasks completed successfully")
+         print('='*60)
+         return True
+
+     def sql(self, query: str):
+         """
+         Execute raw SQL query.
+
+         Example:
+             dr.sql("SELECT * FROM table").show()
+             df = dr.sql("SELECT * FROM table").df()
+         """
+         return self.con.sql(query)
+
+     def get_connection(self):
+         """Get underlying DuckDB connection"""
+         return self.con
+
+     def close(self):
+         """Close DuckDB connection"""
+         if self.con:
+             self.con.close()
+             print("Connection closed")
duckrun-0.0.0.dist-info/METADATA ADDED
@@ -0,0 +1,5 @@
+ Metadata-Version: 2.4
+ Name: duckrun
+ Version: 0.0.0
+ License-File: LICENSE
+ Dynamic: license-file
duckrun-0.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+ duckrun/core.py,sha256=-Vf2nYwhdsVpTZS9mGBtm8j_HNAcHR7Cj075pida3Yw,13133
+ duckrun-0.0.0.dist-info/licenses/LICENSE,sha256=b0pMNsWFx7PvXXtQo-XLqFnPRirAtdWBWwQp39phnWI,20
+ duckrun-0.0.0.dist-info/METADATA,sha256=lIxf9HnDm_FlrLTRid0-ByYLw40wiozlSeOpvPgTomE,100
+ duckrun-0.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.0.0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.0.0.dist-info/RECORD,,
duckrun-0.0.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
duckrun-0.0.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1 @@
+ ### **5. `LICENSE`**
duckrun-0.0.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ duckrun