duckrun 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.2
+ Version: 0.2.4
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
@@ -0,0 +1,365 @@
+ import duckdb
+ import requests
+ import os
+ import importlib.util
+ from deltalake import DeltaTable, write_deltalake
+ from typing import List, Tuple, Union, Optional, Callable, Dict, Any
+ from string import Template
+ import obstore as obs
+ from obstore.store import AzureStore
+ from datetime import datetime
+ from .stats import get_stats as _get_stats
+ from .runner import run as _run
+ from .files import copy as _copy, download as _download
+ from .writer import QueryResult
+
+ class Duckrun:
+     """
+     Lakehouse task runner with clean tuple-based API.
+     Powered by DuckDB for fast data processing.
+
+     Task formats:
+         Python: ('function_name', (arg1, arg2, ...))
+         SQL: ('table_name', 'mode', {params})
+
+     Usage:
+         # For pipelines:
+         dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
+         dr = Duckrun.connect("workspace/lakehouse.lakehouse") # defaults to dbo schema, lists all tables
+         dr.run(pipeline)
+
+         # For data exploration with Spark-style API:
+         dr = Duckrun.connect("workspace/lakehouse.lakehouse")
+         dr.sql("SELECT * FROM table").show()
+         dr.sql("SELECT 43").write.mode("append").saveAsTable("test")
+
+         # Schema evolution and partitioning (exact Spark API):
+         dr.sql("SELECT * FROM source").write.mode("append").option("mergeSchema", "true").partitionBy("region").saveAsTable("sales")
+
+         # Pipeline formats:
+         pipeline = [
+             # SQL with parameters only
+             ('table_name', 'mode', {'param1': 'value1'}),
+
+             # SQL with Delta options (4-tuple format)
+             ('table_name', 'mode', {'param1': 'value1'}, {'mergeSchema': 'true', 'partitionBy': ['region']}),
+
+             # Python task
+             ('process_data', ('table_name',))
+         ]
+     """
+
+     def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
+                  sql_folder: Optional[str] = None, compaction_threshold: int = 10,
+                  scan_all_schemas: bool = False, storage_account: str = "onelake"):
+         self.workspace = workspace
+         self.lakehouse_name = lakehouse_name
+         self.schema = schema
+         self.sql_folder = sql_folder.strip() if sql_folder else None
+         self.compaction_threshold = compaction_threshold
+         self.scan_all_schemas = scan_all_schemas
+         self.storage_account = storage_account
+         self.table_base_url = f'abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
+         self.con = duckdb.connect()
+         self.con.sql("SET preserve_insertion_order = false")
+         self._attach_lakehouse()
+
+     @classmethod
+     def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
+                 compaction_threshold: int = 100, storage_account: str = "onelake"):
+         """
+         Create and connect to lakehouse.
+
+         Uses compact format: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
+
+         Args:
+             connection_string: OneLake path "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
+             sql_folder: Optional path or URL to SQL files folder
+             compaction_threshold: File count threshold for compaction
+             storage_account: Storage account name (default: "onelake")
+
+         Examples:
+             dr = Duckrun.connect("ws/lh.lakehouse/schema", sql_folder="./sql")
+             dr = Duckrun.connect("ws/lh.lakehouse/schema") # no SQL folder
+             dr = Duckrun.connect("ws/lh.lakehouse") # defaults to dbo schema
+             dr = Duckrun.connect("ws/lh.lakehouse", storage_account="xxx-onelake") # custom storage
+         """
+         print("Connecting to Lakehouse...")
+
+         scan_all_schemas = False
+
+         # Only support compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
+         if not connection_string or "/" not in connection_string:
+             raise ValueError(
+                 "Invalid connection string format. "
+                 "Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
+             )
+
+         parts = connection_string.split("/")
+         if len(parts) == 2:
+             workspace, lakehouse_name = parts
+             scan_all_schemas = True
+             schema = "dbo"
+         elif len(parts) == 3:
+             workspace, lakehouse_name, schema = parts
+         else:
+             raise ValueError(
+                 f"Invalid connection string format: '{connection_string}'. "
+                 "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
+             )
+
+         if lakehouse_name.endswith(".lakehouse"):
+             lakehouse_name = lakehouse_name[:-10]
+
+         if not workspace or not lakehouse_name:
+             raise ValueError(
+                 "Missing required parameters. Use compact format:\n"
+                 " connect('workspace/lakehouse.lakehouse/schema', 'sql_folder')\n"
+                 " connect('workspace/lakehouse.lakehouse') # defaults to dbo"
+             )
+
+         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)
+
+     def _get_storage_token(self):
+         return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
+
+     def _create_onelake_secret(self):
+         token = self._get_storage_token()
+         if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+             self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
+         else:
+             print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")
+             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+             token = credential.get_token("https://storage.azure.com/.default")
+             os.environ["AZURE_STORAGE_TOKEN"] = token.token
+             self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")
+
+     def _discover_tables_fast(self) -> List[Tuple[str, str]]:
+         """
+         Fast Delta table discovery using obstore with list_with_delimiter.
+         Only lists directories, not files - super fast!
+
+         Returns:
+             List of tuples: [(schema, table_name), ...]
+         """
+         token = self._get_storage_token()
+         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+             print("Authenticating with Azure for table discovery (trying CLI, will fallback to browser if needed)...")
+             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+             token_obj = credential.get_token("https://storage.azure.com/.default")
+             token = token_obj.token
+             os.environ["AZURE_STORAGE_TOKEN"] = token
+
+         url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
+         store = AzureStore.from_url(url, bearer_token=token)
+
+         base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
+         tables_found = []
+
+         if self.scan_all_schemas:
+             # Discover all schemas first
+             schemas_result = obs.list_with_delimiter(store, prefix=base_path)
+             schemas = [
+                 prefix.rstrip('/').split('/')[-1]
+                 for prefix in schemas_result['common_prefixes']
+             ]
+
+             # Discover tables in each schema
+             for schema_name in schemas:
+                 schema_path = f"{base_path}{schema_name}/"
+                 result = obs.list_with_delimiter(store, prefix=schema_path)
+
+                 for table_prefix in result['common_prefixes']:
+                     table_name = table_prefix.rstrip('/').split('/')[-1]
+                     # Skip non-table directories
+                     if table_name not in ('metadata', 'iceberg'):
+                         tables_found.append((schema_name, table_name))
+         else:
+             # Scan specific schema only
+             print(f"🔍 Discovering tables in schema '{self.schema}'...")
+             schema_path = f"{base_path}{self.schema}/"
+             result = obs.list_with_delimiter(store, prefix=schema_path)
+
+             for table_prefix in result['common_prefixes']:
+                 table_name = table_prefix.rstrip('/').split('/')[-1]
+                 if table_name not in ('metadata', 'iceberg'):
+                     tables_found.append((self.schema, table_name))
+
+         return tables_found
+
+     def _attach_lakehouse(self):
+         """Attach lakehouse tables as DuckDB views using fast discovery"""
+         self._create_onelake_secret()
+
+         try:
+             tables = self._discover_tables_fast()
+
+             if not tables:
+                 if self.scan_all_schemas:
+                     print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
+                 else:
+                     print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
+                 return
+
+             # Group tables by schema for display
+             schema_tables = {}
+             for schema_name, table_name in tables:
+                 if schema_name not in schema_tables:
+                     schema_tables[schema_name] = []
+                 schema_tables[schema_name].append(table_name)
+
+             # Display tables by schema
+             print(f"\n📊 Found {len(tables)} tables:")
+             for schema_name in sorted(schema_tables.keys()):
+                 table_list = sorted(schema_tables[schema_name])
+                 print(f" {schema_name}: {', '.join(table_list)}")
+
+             attached_count = 0
+             skipped_tables = []
+
+             for schema_name, table_name in tables:
+                 try:
+                     if self.scan_all_schemas:
+                         # Create proper schema.table structure in DuckDB
+                         self.con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
+                         view_name = f"{schema_name}.{table_name}"
+                     else:
+                         # Single schema mode - use just table name
+                         view_name = table_name
+
+                     self.con.sql(f"""
+                         CREATE OR REPLACE VIEW {view_name}
+                         AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
+                     """)
+                     attached_count += 1
+                 except Exception as e:
+                     skipped_tables.append(f"{schema_name}.{table_name}")
+                     continue
+
+             print(f"\n{'='*60}")
+             print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
+             if skipped_tables:
+                 print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
+             print(f"{'='*60}\n")
+
+         except Exception as e:
+             print(f"❌ Error attaching lakehouse: {e}")
+             print("Continuing without pre-attached tables.")
+
+     def run(self, pipeline: List[Tuple]) -> bool:
+         """
+         Execute pipeline of tasks.
+
+         Task formats:
+             - Python: ('function_name', (arg1, arg2, ...))
+             - SQL: ('table_name', 'mode') or ('table_name', 'mode', {sql_params})
+             - SQL with Delta options: ('table_name', 'mode', {sql_params}, {delta_options})
+
+         Returns:
+             True if all tasks succeeded
+             False if any task failed (exception) or Python task returned 0 (early exit)
+         """
+         return _run(self, pipeline)
+
+     def copy(self, local_folder: str, remote_folder: str,
+              file_extensions: Optional[List[str]] = None,
+              overwrite: bool = False) -> bool:
+         """
+         Copy files from a local folder to OneLake Files section.
+
+         Args:
+             local_folder: Path to local folder containing files to upload
+             remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
+             file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+             overwrite: Whether to overwrite existing files (default: False)
+
+         Returns:
+             True if all files uploaded successfully, False otherwise
+
+         Examples:
+             # Upload all files from local folder to a target folder
+             dr.copy("./local_data", "uploaded_data")
+
+             # Upload only CSV files to a specific subfolder
+             dr.copy("./reports", "daily_reports", ['.csv'])
+
+             # Upload with overwrite enabled
+             dr.copy("./backup", "backups", overwrite=True)
+         """
+         return _copy(self, local_folder, remote_folder, file_extensions, overwrite)
+
+     def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
+                  file_extensions: Optional[List[str]] = None,
+                  overwrite: bool = False) -> bool:
+         """
+         Download files from OneLake Files section to a local folder.
+
+         Args:
+             remote_folder: Optional subfolder path in OneLake Files to download from
+             local_folder: Local folder path to download files to (default: "./downloaded_files")
+             file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+             overwrite: Whether to overwrite existing local files (default: False)
+
+         Returns:
+             True if all files downloaded successfully, False otherwise
+
+         Examples:
+             # Download all files from OneLake Files root
+             dr.download()
+
+             # Download only CSV files from a specific subfolder
+             dr.download("daily_reports", "./reports", ['.csv'])
+         """
+         return _download(self, remote_folder, local_folder, file_extensions, overwrite)
+
+     def sql(self, query: str):
+         """
+         Execute raw SQL query with Spark-style write API.
+
+         Example:
+             dr.sql("SELECT * FROM table").show()
+             df = dr.sql("SELECT * FROM table").df()
+             dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
+         """
+         relation = self.con.sql(query)
+         return QueryResult(relation, self)
+
+     def get_connection(self):
+         """Get underlying DuckDB connection"""
+         return self.con
+
+     def get_stats(self, source: str):
+         """
+         Get comprehensive statistics for Delta Lake tables.
+
+         Args:
+             source: Can be one of:
+                 - Table name: 'table_name' (uses current schema)
+                 - Schema.table: 'schema.table_name' (specific table in schema)
+                 - Schema only: 'schema' (all tables in schema)
+
+         Returns:
+             Arrow table with statistics including total rows, file count, row groups,
+             average row group size, file sizes, VORDER status, and timestamp
+
+         Examples:
+             con = duckrun.connect("tmp/data.lakehouse/aemo")
+
+             # Single table in current schema
+             stats = con.get_stats('price')
+
+             # Specific table in different schema
+             stats = con.get_stats('aemo.price')
+
+             # All tables in a schema
+             stats = con.get_stats('aemo')
+         """
+         return _get_stats(self, source)
+
+     def close(self):
+         """Close DuckDB connection"""
+         if self.con:
+             self.con.close()
+             print("Connection closed")
@@ -0,0 +1,251 @@
+ """
+ File operations functionality for duckrun - OneLake Files copy and download
+ """
+ import os
+ from typing import Optional, List
+ import obstore as obs
+ from obstore.store import AzureStore
+
+
+ def copy(duckrun_instance, local_folder: str, remote_folder: str,
+          file_extensions: Optional[List[str]] = None,
+          overwrite: bool = False) -> bool:
+     """
+     Copy files from a local folder to OneLake Files section.
+
+     Args:
+         duckrun_instance: The Duckrun connection instance
+         local_folder: Path to local folder containing files to upload
+         remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
+         file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+         overwrite: Whether to overwrite existing files (default: False)
+
+     Returns:
+         True if all files uploaded successfully, False otherwise
+
+     Examples:
+         # Upload all files from local folder to a target folder
+         dr.copy("./local_data", "uploaded_data")
+
+         # Upload only CSV files to a specific subfolder
+         dr.copy("./reports", "daily_reports", ['.csv'])
+
+         # Upload with overwrite enabled
+         dr.copy("./backup", "backups", overwrite=True)
+     """
+     if not os.path.exists(local_folder):
+         print(f"❌ Local folder not found: {local_folder}")
+         return False
+
+     if not os.path.isdir(local_folder):
+         print(f"❌ Path is not a directory: {local_folder}")
+         return False
+
+     # Get Azure token
+     token = duckrun_instance._get_storage_token()
+     if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+         print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
+         from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+         credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+         token_obj = credential.get_token("https://storage.azure.com/.default")
+         token = token_obj.token
+         os.environ["AZURE_STORAGE_TOKEN"] = token
+
+     # Setup OneLake Files URL (not Tables)
+     files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
+     store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+     # Collect files to upload
+     files_to_upload = []
+     for root, dirs, files in os.walk(local_folder):
+         for file in files:
+             local_file_path = os.path.join(root, file)
+
+             # Filter by extensions if specified
+             if file_extensions:
+                 _, ext = os.path.splitext(file)
+                 if ext.lower() not in [e.lower() for e in file_extensions]:
+                     continue
+
+             # Calculate relative path from local_folder
+             rel_path = os.path.relpath(local_file_path, local_folder)
+
+             # Build remote path in OneLake Files (remote_folder is now mandatory)
+             remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
+
+             files_to_upload.append((local_file_path, remote_path))
+
+     if not files_to_upload:
+         print(f"No files found to upload in {local_folder}")
+         if file_extensions:
+             print(f" (filtered by extensions: {file_extensions})")
+         return True
+
+     print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
+     print(f" Target folder: {remote_folder}")
+
+     uploaded_count = 0
+     failed_count = 0
+
+     for local_path, remote_path in files_to_upload:
+         try:
+             # Check if file exists (if not overwriting)
+             if not overwrite:
+                 try:
+                     obs.head(store, remote_path)
+                     print(f" ⏭ Skipped (exists): {remote_path}")
+                     continue
+                 except Exception:
+                     # File doesn't exist, proceed with upload
+                     pass
+
+             # Read local file
+             with open(local_path, 'rb') as f:
+                 file_data = f.read()
+
+             # Upload to OneLake Files
+             obs.put(store, remote_path, file_data)
+
+             file_size = len(file_data)
+             size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+             size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+             print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
+             uploaded_count += 1
+
+         except Exception as e:
+             print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
+             failed_count += 1
+
+     print(f"\n{'='*60}")
+     if failed_count == 0:
+         print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
+     else:
+         print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
+     print(f"{'='*60}")
+
+     return failed_count == 0
+
+
+ def download(duckrun_instance, remote_folder: str = "", local_folder: str = "./downloaded_files",
+              file_extensions: Optional[List[str]] = None,
+              overwrite: bool = False) -> bool:
+     """
+     Download files from OneLake Files section to a local folder.
+
+     Args:
+         duckrun_instance: The Duckrun connection instance
+         remote_folder: Optional subfolder path in OneLake Files to download from
+         local_folder: Local folder path to download files to (default: "./downloaded_files")
+         file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
+         overwrite: Whether to overwrite existing local files (default: False)
+
+     Returns:
+         True if all files downloaded successfully, False otherwise
+
+     Examples:
+         # Download all files from OneLake Files root
+         dr.download()
+
+         # Download only CSV files from a specific subfolder
+         dr.download("daily_reports", "./reports", ['.csv'])
+     """
+     # Get Azure token
+     token = duckrun_instance._get_storage_token()
+     if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+         print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
+         from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+         credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+         token_obj = credential.get_token("https://storage.azure.com/.default")
+         token = token_obj.token
+         os.environ["AZURE_STORAGE_TOKEN"] = token
+
+     # Setup OneLake Files URL (not Tables)
+     files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
+     store = AzureStore.from_url(files_base_url, bearer_token=token)
+
+     # Create local directory
+     os.makedirs(local_folder, exist_ok=True)
+
+     # List files in OneLake Files
+     print(f"📁 Discovering files in OneLake Files...")
+     if remote_folder:
+         print(f" Source folder: {remote_folder}")
+         prefix = f"{remote_folder.strip('/')}/"
+     else:
+         prefix = ""
+
+     try:
+         list_stream = obs.list(store, prefix=prefix)
+         files_to_download = []
+
+         for batch in list_stream:
+             for obj in batch:
+                 remote_path = obj["path"]
+
+                 # Filter by extensions if specified
+                 if file_extensions:
+                     _, ext = os.path.splitext(remote_path)
+                     if ext.lower() not in [e.lower() for e in file_extensions]:
+                         continue
+
+                 # Calculate local path
+                 if remote_folder:
+                     rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
+                 else:
+                     rel_path = remote_path
+
+                 local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
+                 files_to_download.append((remote_path, local_path))
+
+         if not files_to_download:
+             print(f"No files found to download")
+             if file_extensions:
+                 print(f" (filtered by extensions: {file_extensions})")
+             return True
+
+         print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
+
+         downloaded_count = 0
+         failed_count = 0
+
+         for remote_path, local_path in files_to_download:
+             try:
+                 # Check if local file exists (if not overwriting)
+                 if not overwrite and os.path.exists(local_path):
+                     print(f" ⏭ Skipped (exists): {local_path}")
+                     continue
+
+                 # Ensure local directory exists
+                 os.makedirs(os.path.dirname(local_path), exist_ok=True)
+
+                 # Download file
+                 data = obs.get(store, remote_path).bytes()
+
+                 # Write to local file
+                 with open(local_path, 'wb') as f:
+                     f.write(data)
+
+                 file_size = len(data)
+                 size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
+                 size_unit = "MB" if file_size > 1024*1024 else "KB"
+
+                 print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
+                 downloaded_count += 1
+
+             except Exception as e:
+                 print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
+                 failed_count += 1
+
+         print(f"\n{'='*60}")
+         if failed_count == 0:
+             print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
+         else:
+             print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
+         print(f"{'='*60}")
+
+         return failed_count == 0
+
+     except Exception as e:
+         print(f"❌ Error listing files from OneLake: {e}")
+         return False
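
A similarly hedged sketch of the new file helpers, assuming the same top-level connect() export; the folder names and extension filter are placeholders, and behavior follows the copy() and download() docstrings above.

    import duckrun

    dr = duckrun.connect("my_workspace/my_lakehouse.lakehouse")

    # Upload local CSV reports into Files/daily_reports, skipping files that already exist
    dr.copy("./reports", "daily_reports", ['.csv'])

    # Later, pull them back down, overwriting any stale local copies
    dr.download("daily_reports", "./reports_backup", ['.csv'], overwrite=True)
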