duckrun-0.2.1-py3-none-any.whl → duckrun-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/files.py ADDED
@@ -0,0 +1,251 @@
+ """
+ File operations functionality for duckrun - OneLake Files copy and download
+ """
+ import os
+ from typing import Optional, List
+ import obstore as obs
+ from obstore.store import AzureStore
+
+
+ def copy(duckrun_instance, local_folder: str, remote_folder: str,
+          file_extensions: Optional[List[str]] = None,
+          overwrite: bool = False) -> bool:
+     """
+     Copy files from a local folder to OneLake Files section.
+
+     Args:
+         duckrun_instance: The Duckrun connection instance
+         local_folder: Path to local folder containing files to upload
+         remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
+         file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+         overwrite: Whether to overwrite existing files (default: False)
+
+     Returns:
+         True if all files uploaded successfully, False otherwise
+
+     Examples:
+         # Upload all files from local folder to a target folder
+         dr.copy("./local_data", "uploaded_data")
+
+         # Upload only CSV files to a specific subfolder
+         dr.copy("./reports", "daily_reports", ['.csv'])
+
+         # Upload with overwrite enabled
+         dr.copy("./backup", "backups", overwrite=True)
+     """
+     if not os.path.exists(local_folder):
+         print(f"❌ Local folder not found: {local_folder}")
+         return False
+
+     if not os.path.isdir(local_folder):
+         print(f"❌ Path is not a directory: {local_folder}")
+         return False
+
+     # Get Azure token
+     token = duckrun_instance._get_storage_token()
+     if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+         print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
+         from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+         credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+         token_obj = credential.get_token("https://storage.azure.com/.default")
+         token = token_obj.token
+         os.environ["AZURE_STORAGE_TOKEN"] = token
+
+     # Setup OneLake Files URL (not Tables)
+     files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
+     store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+     # Collect files to upload
+     files_to_upload = []
+     for root, dirs, files in os.walk(local_folder):
+         for file in files:
+             local_file_path = os.path.join(root, file)
+
+             # Filter by extensions if specified
+             if file_extensions:
+                 _, ext = os.path.splitext(file)
+                 if ext.lower() not in [e.lower() for e in file_extensions]:
+                     continue
+
+             # Calculate relative path from local_folder
+             rel_path = os.path.relpath(local_file_path, local_folder)
+
+             # Build remote path in OneLake Files (remote_folder is now mandatory)
+             remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
+
+             files_to_upload.append((local_file_path, remote_path))
+
+     if not files_to_upload:
+         print(f"No files found to upload in {local_folder}")
+         if file_extensions:
+             print(f" (filtered by extensions: {file_extensions})")
+         return True
+
+     print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
+     print(f" Target folder: {remote_folder}")
+
+     uploaded_count = 0
+     failed_count = 0
+
+     for local_path, remote_path in files_to_upload:
+         try:
+             # Check if file exists (if not overwriting)
+             if not overwrite:
+                 try:
+                     obs.head(store, remote_path)
+                     print(f" ⏭ Skipped (exists): {remote_path}")
+                     continue
+                 except Exception:
+                     # File doesn't exist, proceed with upload
+                     pass
+
+             # Read local file
+             with open(local_path, 'rb') as f:
+                 file_data = f.read()
+
+             # Upload to OneLake Files
+             obs.put(store, remote_path, file_data)
+
+             file_size = len(file_data)
+             size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+             size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+             print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
+             uploaded_count += 1
+
+         except Exception as e:
+             print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
+             failed_count += 1
+
+     print(f"\n{'='*60}")
+     if failed_count == 0:
+         print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
+     else:
+         print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
+     print(f"{'='*60}")
+
+     return failed_count == 0
+
+
+ def download(duckrun_instance, remote_folder: str = "", local_folder: str = "./downloaded_files",
+              file_extensions: Optional[List[str]] = None,
+              overwrite: bool = False) -> bool:
+     """
+     Download files from OneLake Files section to a local folder.
+
+     Args:
+         duckrun_instance: The Duckrun connection instance
+         remote_folder: Optional subfolder path in OneLake Files to download from
+         local_folder: Local folder path to download files to (default: "./downloaded_files")
+         file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+         overwrite: Whether to overwrite existing local files (default: False)
+
+     Returns:
+         True if all files downloaded successfully, False otherwise
+
+     Examples:
+         # Download all files from OneLake Files root
+         dr.download()
+
+         # Download only CSV files from a specific subfolder
+         dr.download("daily_reports", "./reports", ['.csv'])
+     """
+     # Get Azure token
+     token = duckrun_instance._get_storage_token()
+     if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+         print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
+         from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+         credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+         token_obj = credential.get_token("https://storage.azure.com/.default")
+         token = token_obj.token
+         os.environ["AZURE_STORAGE_TOKEN"] = token
+
+     # Setup OneLake Files URL (not Tables)
+     files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
+     store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+     # Create local directory
+     os.makedirs(local_folder, exist_ok=True)
+
+     # List files in OneLake Files
+     print(f"📁 Discovering files in OneLake Files...")
+     if remote_folder:
+         print(f" Source folder: {remote_folder}")
+         prefix = f"{remote_folder.strip('/')}/"
+     else:
+         prefix = ""
+
+     try:
+         list_stream = obs.list(store, prefix=prefix)
+         files_to_download = []
+
+         for batch in list_stream:
+             for obj in batch:
+                 remote_path = obj["path"]
+
+                 # Filter by extensions if specified
+                 if file_extensions:
+                     _, ext = os.path.splitext(remote_path)
+                     if ext.lower() not in [e.lower() for e in file_extensions]:
+                         continue
+
+                 # Calculate local path
+                 if remote_folder:
+                     rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
+                 else:
+                     rel_path = remote_path
+
+                 local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
+                 files_to_download.append((remote_path, local_path))
+
+         if not files_to_download:
+             print(f"No files found to download")
+             if file_extensions:
+                 print(f" (filtered by extensions: {file_extensions})")
+             return True
+
+         print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
+
+         downloaded_count = 0
+         failed_count = 0
+
+         for remote_path, local_path in files_to_download:
+             try:
+                 # Check if local file exists (if not overwriting)
+                 if not overwrite and os.path.exists(local_path):
+                     print(f" ⏭ Skipped (exists): {local_path}")
+                     continue
+
+                 # Ensure local directory exists
+                 os.makedirs(os.path.dirname(local_path), exist_ok=True)
+
+                 # Download file
+                 data = obs.get(store, remote_path).bytes()
+
+                 # Write to local file
+                 with open(local_path, 'wb') as f:
+                     f.write(data)
+
+                 file_size = len(data)
+                 size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                 size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                 print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
+                 downloaded_count += 1
+
+             except Exception as e:
+                 print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
+                 failed_count += 1
+
+         print(f"\n{'='*60}")
+         if failed_count == 0:
+             print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
+         else:
+             print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
+         print(f"{'='*60}")
+
+         return failed_count == 0
+
+     except Exception as e:
+         print(f"❌ Error listing files from OneLake: {e}")
+         return False
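
Reviewer note: the following is a minimal usage sketch of the new module, based only on the docstrings above. It assumes a Duckrun connection object `dr` that exposes copy and download as bound methods; the duckrun.connect call and its argument format are assumptions, as the connection setup is not part of this diff.

# Hypothetical connection setup - connect() is not shown in this diff
import duckrun
dr = duckrun.connect("my_workspace/my_lakehouse.lakehouse/dbo")  # assumed form

# Upload only CSV and Parquet files, overwriting anything already in OneLake Files
dr.copy("./exports", "exports/2024", ['.csv', '.parquet'], overwrite=True)

# Pull the same folder back down into a local directory
dr.download("exports/2024", "./restored", ['.csv', '.parquet'])
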
duckrun/runner.py ADDED
@@ -0,0 +1,287 @@
+ """
+ Pipeline execution functionality for duckrun
+ """
+ import os
+ import requests
+ import importlib.util
+ from typing import List, Tuple, Dict, Optional, Callable, Any
+ from string import Template
+ from deltalake import DeltaTable, write_deltalake
+ # Row Group configuration for optimal Delta Lake performance
+ RG = 8_000_000
+
+
+ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
+     """
+     Build arguments for write_deltalake based on requirements:
+     - If schema_mode='merge': use rust engine (no row group params)
+     - Otherwise: use pyarrow engine with row group optimization
+     """
+     args = {
+         'table_or_uri': path,
+         'data': df,
+         'mode': mode
+     }
+
+     # Add partition_by if specified
+     if partition_by:
+         args['partition_by'] = partition_by
+
+     # Engine selection based on schema_mode
+     if schema_mode == 'merge':
+         # Use rust engine for schema merging (no row group params supported)
+         args['schema_mode'] = 'merge'
+         args['engine'] = 'rust'
+     else:
+         # Use pyarrow engine with row group optimization (default)
+         args['max_rows_per_file'] = RG
+         args['max_rows_per_group'] = RG
+         args['min_rows_per_group'] = RG
+
+     return args
+
+
+ def run(duckrun_instance, pipeline: List[Tuple]) -> bool:
+     """
+     Execute pipeline of tasks.
+
+     Task formats:
+         - Python: ('function_name', (arg1, arg2, ...))
+         - SQL: ('table_name', 'mode') or ('table_name', 'mode', {sql_params})
+         - SQL with Delta options: ('table_name', 'mode', {sql_params}, {delta_options})
+
+     Returns:
+         True if all tasks succeeded
+         False if any task failed (exception) or Python task returned 0 (early exit)
+     """
+     if duckrun_instance.sql_folder is None:
+         raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
+
+     for i, task in enumerate(pipeline, 1):
+         print(f"\n{'='*60}")
+         print(f"Task {i}/{len(pipeline)}: {task[0]}")
+         print('='*60)
+
+         try:
+             result = None
+
+             if len(task) == 2:
+                 name, second = task
+                 if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
+                     result = _run_sql(duckrun_instance, name, second, {}, {})
+                 else:
+                     args = second if isinstance(second, (tuple, list)) else (second,)
+                     result = _run_python(duckrun_instance, name, tuple(args))
+
+             elif len(task) == 3:
+                 table, mode, params = task
+                 if not isinstance(params, dict):
+                     raise ValueError(f"Expected dict for params, got {type(params)}")
+                 result = _run_sql(duckrun_instance, table, mode, params, {})
+
+             elif len(task) == 4:
+                 table, mode, params, delta_options = task
+                 if not isinstance(params, dict):
+                     raise ValueError(f"Expected dict for SQL params, got {type(params)}")
+                 if not isinstance(delta_options, dict):
+                     raise ValueError(f"Expected dict for Delta options, got {type(delta_options)}")
+                 result = _run_sql(duckrun_instance, table, mode, params, delta_options)
+
+             else:
+                 raise ValueError(f"Invalid task format: {task}")
+
+             # Check if Python task returned 0 (early exit condition)
+             # Only check for Python tasks as SQL tasks return table names (strings) and only stop on exceptions
+             if (len(task) == 2 and
+                 not isinstance(task[1], str) and
+                 result == 0):
+                 print(f"\n⏹️ Python task {i} returned 0 - stopping pipeline execution")
+                 print(f" Remaining tasks ({len(pipeline) - i}) will not be executed")
+                 return False
+
+         except Exception as e:
+             print(f"\n❌ Task {i} failed: {e}")
+             return False
+
+     print(f"\n{'='*60}")
+     print("✅ All tasks completed successfully")
+     print('='*60)
+     return True
+
+
+ def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
+     """Execute Python task, return result"""
+     duckrun_instance._create_onelake_secret()
+     func = _load_py_function(duckrun_instance, name)
+     if not func:
+         raise RuntimeError(f"Python function '{name}' not found")
+
+     print(f"Running Python: {name}{args}")
+     result = func(*args)
+     print(f"✅ Python '{name}' completed")
+     return result
+
+
+ def _run_sql(duckrun_instance, table: str, mode: str, params: Dict, delta_options: Dict = None) -> str:
+     """Execute SQL task, write to Delta, return normalized table name"""
+     duckrun_instance._create_onelake_secret()
+
+     if mode not in {'overwrite', 'append', 'ignore'}:
+         raise ValueError(f"Invalid mode '{mode}'. Use: overwrite, append, or ignore")
+
+     sql = _read_sql_file(duckrun_instance, table, params)
+     if sql is None:
+         raise RuntimeError(f"Failed to read SQL file for '{table}'")
+
+     normalized_table = _normalize_table_name(table)
+     path = f"{duckrun_instance.table_base_url}{duckrun_instance.schema}/{normalized_table}"
+
+     # Extract Delta Lake specific options from delta_options
+     delta_options = delta_options or {}
+     merge_schema = delta_options.get('mergeSchema')
+     schema_mode = 'merge' if str(merge_schema).lower() in ('true', '1') else None
+     partition_by = delta_options.get('partitionBy') or delta_options.get('partition_by')
+
+     if mode == 'overwrite':
+         duckrun_instance.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
+         df = duckrun_instance.con.sql(sql).record_batch()
+
+         write_args = _build_write_deltalake_args(
+             path, df, 'overwrite',
+             schema_mode=schema_mode,
+             partition_by=partition_by
+         )
+         write_deltalake(**write_args)
+
+         duckrun_instance.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
+         dt = DeltaTable(path)
+         dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
+         dt.cleanup_metadata()
+
+     elif mode == 'append':
+         df = duckrun_instance.con.sql(sql).record_batch()
+
+         write_args = _build_write_deltalake_args(
+             path, df, 'append',
+             schema_mode=schema_mode,
+             partition_by=partition_by
+         )
+         write_deltalake(**write_args)
+
+         duckrun_instance.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
+         dt = DeltaTable(path)
+         if len(dt.file_uris()) > duckrun_instance.compaction_threshold:
+             print(f"Compacting {normalized_table} ({len(dt.file_uris())} files)")
+             dt.optimize.compact()
+             dt.vacuum(dry_run=False)
+             dt.cleanup_metadata()
+
+     elif mode == 'ignore':
+         try:
+             DeltaTable(path)
+             print(f"Table {normalized_table} exists. Skipping (mode='ignore')")
+         except Exception:
+             print(f"Table {normalized_table} doesn't exist. Creating...")
+             duckrun_instance.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
+             df = duckrun_instance.con.sql(sql).record_batch()
+
+             write_args = _build_write_deltalake_args(
+                 path, df, 'overwrite',
+                 schema_mode=schema_mode,
+                 partition_by=partition_by
+             )
+             write_deltalake(**write_args)
+
+             duckrun_instance.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
+             dt = DeltaTable(path)
+             dt.vacuum(dry_run=False)
+             dt.cleanup_metadata()
+
+     engine_info = f" (engine=rust, schema_mode=merge)" if schema_mode == 'merge' else " (engine=pyarrow)"
+     partition_info = f" partitioned by {partition_by}" if partition_by else ""
+     print(f"✅ SQL '{table}' → '{normalized_table}' ({mode}){engine_info}{partition_info}")
+     return normalized_table
+
+
+ def _normalize_table_name(name: str) -> str:
+     """Extract base table name before first '__'"""
+     return name.split('__', 1)[0] if '__' in name else name
+
+
+ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
+     if duckrun_instance.sql_folder is None:
+         raise RuntimeError("sql_folder is not configured. Cannot read SQL files.")
+
+     is_url = duckrun_instance.sql_folder.startswith("http")
+     if is_url:
+         url = f"{duckrun_instance.sql_folder.rstrip('/')}/{table_name}.sql".strip()
+         try:
+             resp = requests.get(url)
+             resp.raise_for_status()
+             content = resp.text
+         except Exception as e:
+             print(f"Failed to fetch SQL from {url}: {e}")
+             return None
+     else:
+         path = os.path.join(duckrun_instance.sql_folder, f"{table_name}.sql")
+         try:
+             with open(path, 'r') as f:
+                 content = f.read()
+         except Exception as e:
+             print(f"Failed to read SQL file {path}: {e}")
+             return None
+
+     if not content.strip():
+         print(f"SQL file is empty: {table_name}.sql")
+         return None
+
+     full_params = {
+         'ws': duckrun_instance.workspace,
+         'lh': duckrun_instance.lakehouse_name,
+         'schema': duckrun_instance.schema,
+         'storage_account': duckrun_instance.storage_account
+     }
+     if params:
+         full_params.update(params)
+
+     try:
+         template = Template(content)
+         content = template.substitute(full_params)
+     except KeyError as e:
+         print(f"Missing parameter in SQL file: ${e}")
+         return None
+     except Exception as e:
+         print(f"Error during SQL template substitution: {e}")
+         return None
+
+     return content
+
+
+ def _load_py_function(duckrun_instance, name: str) -> Optional[Callable]:
+     if duckrun_instance.sql_folder is None:
+         raise RuntimeError("sql_folder is not configured. Cannot load Python functions.")
+
+     is_url = duckrun_instance.sql_folder.startswith("http")
+     try:
+         if is_url:
+             url = f"{duckrun_instance.sql_folder.rstrip('/')}/{name}.py".strip()
+             resp = requests.get(url)
+             resp.raise_for_status()
+             code = resp.text
+             namespace = {}
+             exec(code, namespace)
+             func = namespace.get(name)
+             return func if callable(func) else None
+         else:
+             path = os.path.join(duckrun_instance.sql_folder, f"{name}.py")
+             if not os.path.isfile(path):
+                 print(f"Python file not found: {path}")
+                 return None
+             spec = importlib.util.spec_from_file_location(name, path)
+             mod = importlib.util.module_from_spec(spec)
+             spec.loader.exec_module(mod)
+             func = getattr(mod, name, None)
+             return func if callable(func) else None
+     except Exception as e:
+         print(f"Error loading Python function '{name}': {e}")
+         return None
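
Reviewer note: run() consumes a list of task tuples in the three formats documented in its docstring. Below is a minimal, hypothetical pipeline for orientation only; the task names (load_staging, sales, sales_summary) and the dr.run binding are assumptions, and the example presumes matching .py/.sql files exist in the configured sql_folder.

pipeline = [
    # Python task: calls load_staging(2024) defined in load_staging.py from sql_folder
    ('load_staging', (2024,)),

    # SQL task: runs sales.sql and overwrites the 'sales' Delta table;
    # $year is filled via string.Template ($ws, $lh, $schema, $storage_account are always provided)
    ('sales', 'overwrite', {'year': 2024}),

    # SQL task with Delta options: mergeSchema='true' switches to the rust engine with schema_mode='merge',
    # partitionBy is passed through to write_deltalake as partition_by
    ('sales_summary', 'append', {}, {'mergeSchema': 'true', 'partitionBy': ['region']}),
]

ok = dr.run(pipeline)  # False if any task raises or a Python task returns 0
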