duckrun 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.2 → duckrun-0.2.4}/PKG-INFO +1 -1
- duckrun-0.2.4/duckrun/core.py +365 -0
- duckrun-0.2.4/duckrun/files.py +251 -0
- duckrun-0.2.4/duckrun/runner.py +287 -0
- duckrun-0.2.4/duckrun/stats.py +231 -0
- duckrun-0.2.4/duckrun/writer.py +165 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.2 → duckrun-0.2.4}/duckrun.egg-info/SOURCES.txt +4 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/pyproject.toml +1 -1
- duckrun-0.2.2/duckrun/core.py +0 -890
- {duckrun-0.2.2 → duckrun-0.2.4}/LICENSE +0 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/README.md +0 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/duckrun/__init__.py +0 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.2 → duckrun-0.2.4}/setup.cfg +0 -0
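The headline change in 0.2.4 is structural: the monolithic core.py from 0.2.2 (890 lines, deleted below) is split into a slimmer core.py plus four helper modules (runner.py, files.py, stats.py, writer.py), with the Duckrun class keeping thin wrapper methods that delegate to them. A rough orientation sketch of that delegation, inferred from the imports at the top of the new core.py shown below (not an authoritative package map):

# How the 0.2.4 Duckrun class forwards work to the new modules
# (inferred from core.py's imports; assumes the duckrun package is installed)
from duckrun.core import Duckrun

# Duckrun.run(...)       -> duckrun.runner.run          (pipeline execution)
# Duckrun.copy(...)      -> duckrun.files.copy          (upload to OneLake Files)
# Duckrun.download(...)  -> duckrun.files.download      (download from OneLake Files)
# Duckrun.get_stats(...) -> duckrun.stats.get_stats     (Delta table statistics)
# Duckrun.sql(...)       -> duckrun.writer.QueryResult  (Spark-style .write API)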
duckrun-0.2.4/duckrun/core.py
@@ -0,0 +1,365 @@
import duckdb
import requests
import os
import importlib.util
from deltalake import DeltaTable, write_deltalake
from typing import List, Tuple, Union, Optional, Callable, Dict, Any
from string import Template
import obstore as obs
from obstore.store import AzureStore
from datetime import datetime
from .stats import get_stats as _get_stats
from .runner import run as _run
from .files import copy as _copy, download as _download
from .writer import QueryResult

class Duckrun:
    """
    Lakehouse task runner with clean tuple-based API.
    Powered by DuckDB for fast data processing.

    Task formats:
        Python: ('function_name', (arg1, arg2, ...))
        SQL: ('table_name', 'mode', {params})

    Usage:
        # For pipelines:
        dr = Duckrun.connect("workspace/lakehouse.lakehouse/schema", sql_folder="./sql")
        dr = Duckrun.connect("workspace/lakehouse.lakehouse")  # defaults to dbo schema, lists all tables
        dr.run(pipeline)

        # For data exploration with Spark-style API:
        dr = Duckrun.connect("workspace/lakehouse.lakehouse")
        dr.sql("SELECT * FROM table").show()
        dr.sql("SELECT 43").write.mode("append").saveAsTable("test")

        # Schema evolution and partitioning (exact Spark API):
        dr.sql("SELECT * FROM source").write.mode("append").option("mergeSchema", "true").partitionBy("region").saveAsTable("sales")

        # Pipeline formats:
        pipeline = [
            # SQL with parameters only
            ('table_name', 'mode', {'param1': 'value1'}),

            # SQL with Delta options (4-tuple format)
            ('table_name', 'mode', {'param1': 'value1'}, {'mergeSchema': 'true', 'partitionBy': ['region']}),

            # Python task
            ('process_data', ('table_name',))
        ]
    """

    def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
                 sql_folder: Optional[str] = None, compaction_threshold: int = 10,
                 scan_all_schemas: bool = False, storage_account: str = "onelake"):
        self.workspace = workspace
        self.lakehouse_name = lakehouse_name
        self.schema = schema
        self.sql_folder = sql_folder.strip() if sql_folder else None
        self.compaction_threshold = compaction_threshold
        self.scan_all_schemas = scan_all_schemas
        self.storage_account = storage_account
        self.table_base_url = f'abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
        self.con = duckdb.connect()
        self.con.sql("SET preserve_insertion_order = false")
        self._attach_lakehouse()

    @classmethod
    def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
                compaction_threshold: int = 100, storage_account: str = "onelake"):
        """
        Create and connect to lakehouse.

        Uses compact format: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")

        Args:
            connection_string: OneLake path "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
            sql_folder: Optional path or URL to SQL files folder
            compaction_threshold: File count threshold for compaction
            storage_account: Storage account name (default: "onelake")

        Examples:
            dr = Duckrun.connect("ws/lh.lakehouse/schema", sql_folder="./sql")
            dr = Duckrun.connect("ws/lh.lakehouse/schema")  # no SQL folder
            dr = Duckrun.connect("ws/lh.lakehouse")  # defaults to dbo schema
            dr = Duckrun.connect("ws/lh.lakehouse", storage_account="xxx-onelake")  # custom storage
        """
        print("Connecting to Lakehouse...")

        scan_all_schemas = False

        # Only support compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
        if not connection_string or "/" not in connection_string:
            raise ValueError(
                "Invalid connection string format. "
                "Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
            )

        parts = connection_string.split("/")
        if len(parts) == 2:
            workspace, lakehouse_name = parts
            scan_all_schemas = True
            schema = "dbo"
        elif len(parts) == 3:
            workspace, lakehouse_name, schema = parts
        else:
            raise ValueError(
                f"Invalid connection string format: '{connection_string}'. "
                "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
            )

        if lakehouse_name.endswith(".lakehouse"):
            lakehouse_name = lakehouse_name[:-10]

        if not workspace or not lakehouse_name:
            raise ValueError(
                "Missing required parameters. Use compact format:\n"
                " connect('workspace/lakehouse.lakehouse/schema', 'sql_folder')\n"
                " connect('workspace/lakehouse.lakehouse')  # defaults to dbo"
            )

        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)

    def _get_storage_token(self):
        return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")

    def _create_onelake_secret(self):
        token = self._get_storage_token()
        if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
            self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
        else:
            print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")
            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
            token = credential.get_token("https://storage.azure.com/.default")
            os.environ["AZURE_STORAGE_TOKEN"] = token.token
            self.con.sql("CREATE OR REPLACE PERSISTENT SECRET onelake (TYPE azure, PROVIDER credential_chain, CHAIN 'cli', ACCOUNT_NAME 'onelake')")

    def _discover_tables_fast(self) -> List[Tuple[str, str]]:
        """
        Fast Delta table discovery using obstore with list_with_delimiter.
        Only lists directories, not files - super fast!

        Returns:
            List of tuples: [(schema, table_name), ...]
        """
        token = self._get_storage_token()
        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
            print("Authenticating with Azure for table discovery (trying CLI, will fallback to browser if needed)...")
            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
            token_obj = credential.get_token("https://storage.azure.com/.default")
            token = token_obj.token
            os.environ["AZURE_STORAGE_TOKEN"] = token

        url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
        store = AzureStore.from_url(url, bearer_token=token)

        base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
        tables_found = []

        if self.scan_all_schemas:
            # Discover all schemas first
            schemas_result = obs.list_with_delimiter(store, prefix=base_path)
            schemas = [
                prefix.rstrip('/').split('/')[-1]
                for prefix in schemas_result['common_prefixes']
            ]

            # Discover tables in each schema
            for schema_name in schemas:
                schema_path = f"{base_path}{schema_name}/"
                result = obs.list_with_delimiter(store, prefix=schema_path)

                for table_prefix in result['common_prefixes']:
                    table_name = table_prefix.rstrip('/').split('/')[-1]
                    # Skip non-table directories
                    if table_name not in ('metadata', 'iceberg'):
                        tables_found.append((schema_name, table_name))
        else:
            # Scan specific schema only
            print(f"🔍 Discovering tables in schema '{self.schema}'...")
            schema_path = f"{base_path}{self.schema}/"
            result = obs.list_with_delimiter(store, prefix=schema_path)

            for table_prefix in result['common_prefixes']:
                table_name = table_prefix.rstrip('/').split('/')[-1]
                if table_name not in ('metadata', 'iceberg'):
                    tables_found.append((self.schema, table_name))

        return tables_found

    def _attach_lakehouse(self):
        """Attach lakehouse tables as DuckDB views using fast discovery"""
        self._create_onelake_secret()

        try:
            tables = self._discover_tables_fast()

            if not tables:
                if self.scan_all_schemas:
                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/")
                else:
                    print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                return

            # Group tables by schema for display
            schema_tables = {}
            for schema_name, table_name in tables:
                if schema_name not in schema_tables:
                    schema_tables[schema_name] = []
                schema_tables[schema_name].append(table_name)

            # Display tables by schema
            print(f"\n📊 Found {len(tables)} tables:")
            for schema_name in sorted(schema_tables.keys()):
                table_list = sorted(schema_tables[schema_name])
                print(f" {schema_name}: {', '.join(table_list)}")

            attached_count = 0
            skipped_tables = []

            for schema_name, table_name in tables:
                try:
                    if self.scan_all_schemas:
                        # Create proper schema.table structure in DuckDB
                        self.con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
                        view_name = f"{schema_name}.{table_name}"
                    else:
                        # Single schema mode - use just table name
                        view_name = table_name

                    self.con.sql(f"""
                        CREATE OR REPLACE VIEW {view_name}
                        AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                    """)
                    attached_count += 1
                except Exception as e:
                    skipped_tables.append(f"{schema_name}.{table_name}")
                    continue

            print(f"\n{'='*60}")
            print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
            if skipped_tables:
                print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
            print(f"{'='*60}\n")

        except Exception as e:
            print(f"❌ Error attaching lakehouse: {e}")
            print("Continuing without pre-attached tables.")

    def run(self, pipeline: List[Tuple]) -> bool:
        """
        Execute pipeline of tasks.

        Task formats:
            - Python: ('function_name', (arg1, arg2, ...))
            - SQL: ('table_name', 'mode') or ('table_name', 'mode', {sql_params})
            - SQL with Delta options: ('table_name', 'mode', {sql_params}, {delta_options})

        Returns:
            True if all tasks succeeded
            False if any task failed (exception) or Python task returned 0 (early exit)
        """
        return _run(self, pipeline)

    def copy(self, local_folder: str, remote_folder: str,
             file_extensions: Optional[List[str]] = None,
             overwrite: bool = False) -> bool:
        """
        Copy files from a local folder to OneLake Files section.

        Args:
            local_folder: Path to local folder containing files to upload
            remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
            file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
            overwrite: Whether to overwrite existing files (default: False)

        Returns:
            True if all files uploaded successfully, False otherwise

        Examples:
            # Upload all files from local folder to a target folder
            dr.copy("./local_data", "uploaded_data")

            # Upload only CSV files to a specific subfolder
            dr.copy("./reports", "daily_reports", ['.csv'])

            # Upload with overwrite enabled
            dr.copy("./backup", "backups", overwrite=True)
        """
        return _copy(self, local_folder, remote_folder, file_extensions, overwrite)

    def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
                 file_extensions: Optional[List[str]] = None,
                 overwrite: bool = False) -> bool:
        """
        Download files from OneLake Files section to a local folder.

        Args:
            remote_folder: Optional subfolder path in OneLake Files to download from
            local_folder: Local folder path to download files to (default: "./downloaded_files")
            file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
            overwrite: Whether to overwrite existing local files (default: False)

        Returns:
            True if all files downloaded successfully, False otherwise

        Examples:
            # Download all files from OneLake Files root
            dr.download()

            # Download only CSV files from a specific subfolder
            dr.download("daily_reports", "./reports", ['.csv'])
        """
        return _download(self, remote_folder, local_folder, file_extensions, overwrite)

    def sql(self, query: str):
        """
        Execute raw SQL query with Spark-style write API.

        Example:
            dr.sql("SELECT * FROM table").show()
            df = dr.sql("SELECT * FROM table").df()
            dr.sql("SELECT 43 as value").write.mode("append").saveAsTable("test")
        """
        relation = self.con.sql(query)
        return QueryResult(relation, self)

    def get_connection(self):
        """Get underlying DuckDB connection"""
        return self.con

    def get_stats(self, source: str):
        """
        Get comprehensive statistics for Delta Lake tables.

        Args:
            source: Can be one of:
                - Table name: 'table_name' (uses current schema)
                - Schema.table: 'schema.table_name' (specific table in schema)
                - Schema only: 'schema' (all tables in schema)

        Returns:
            Arrow table with statistics including total rows, file count, row groups,
            average row group size, file sizes, VORDER status, and timestamp

        Examples:
            con = duckrun.connect("tmp/data.lakehouse/aemo")

            # Single table in current schema
            stats = con.get_stats('price')

            # Specific table in different schema
            stats = con.get_stats('aemo.price')

            # All tables in a schema
            stats = con.get_stats('aemo')
        """
        return _get_stats(self, source)

    def close(self):
        """Close DuckDB connection"""
        if self.con:
            self.con.close()
            print("Connection closed")
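Read together, the docstrings in the new core.py describe a complete exploration workflow. A minimal end-to-end sketch, assuming a reachable Fabric workspace ws with a lakehouse lh, working Azure credentials, and an existing table some_table (all names illustrative, not taken from this diff):

# Hypothetical session against the Duckrun API above
import duckrun

con = duckrun.connect("ws/lh.lakehouse/dbo", sql_folder="./sql")  # attaches Delta tables as DuckDB views

con.sql("SELECT COUNT(*) AS n FROM some_table").show()            # Spark-style display
df = con.sql("SELECT * FROM some_table LIMIT 10").df()            # materialize as a DataFrame

# Write a result back as a Delta table with schema evolution enabled
con.sql("SELECT 43 AS value").write.mode("append").option("mergeSchema", "true").saveAsTable("test")

stats = con.get_stats("dbo")  # Arrow table of row counts, file counts, row groups, VORDER status
con.close()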
duckrun-0.2.4/duckrun/files.py
@@ -0,0 +1,251 @@
"""
File operations functionality for duckrun - OneLake Files copy and download
"""
import os
from typing import Optional, List
import obstore as obs
from obstore.store import AzureStore


def copy(duckrun_instance, local_folder: str, remote_folder: str,
         file_extensions: Optional[List[str]] = None,
         overwrite: bool = False) -> bool:
    """
    Copy files from a local folder to OneLake Files section.

    Args:
        duckrun_instance: The Duckrun connection instance
        local_folder: Path to local folder containing files to upload
        remote_folder: Target subfolder path in OneLake Files (e.g., "reports/daily") - REQUIRED
        file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
        overwrite: Whether to overwrite existing files (default: False)

    Returns:
        True if all files uploaded successfully, False otherwise

    Examples:
        # Upload all files from local folder to a target folder
        dr.copy("./local_data", "uploaded_data")

        # Upload only CSV files to a specific subfolder
        dr.copy("./reports", "daily_reports", ['.csv'])

        # Upload with overwrite enabled
        dr.copy("./backup", "backups", overwrite=True)
    """
    if not os.path.exists(local_folder):
        print(f"❌ Local folder not found: {local_folder}")
        return False

    if not os.path.isdir(local_folder):
        print(f"❌ Path is not a directory: {local_folder}")
        return False

    # Get Azure token
    token = duckrun_instance._get_storage_token()
    if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
        print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
        from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
        credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
        token_obj = credential.get_token("https://storage.azure.com/.default")
        token = token_obj.token
        os.environ["AZURE_STORAGE_TOKEN"] = token

    # Setup OneLake Files URL (not Tables)
    files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
    store = AzureStore.from_url(files_base_url, bearer_token=token)

    # Collect files to upload
    files_to_upload = []
    for root, dirs, files in os.walk(local_folder):
        for file in files:
            local_file_path = os.path.join(root, file)

            # Filter by extensions if specified
            if file_extensions:
                _, ext = os.path.splitext(file)
                if ext.lower() not in [e.lower() for e in file_extensions]:
                    continue

            # Calculate relative path from local_folder
            rel_path = os.path.relpath(local_file_path, local_folder)

            # Build remote path in OneLake Files (remote_folder is now mandatory)
            remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")

            files_to_upload.append((local_file_path, remote_path))

    if not files_to_upload:
        print(f"No files found to upload in {local_folder}")
        if file_extensions:
            print(f" (filtered by extensions: {file_extensions})")
        return True

    print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
    print(f" Target folder: {remote_folder}")

    uploaded_count = 0
    failed_count = 0

    for local_path, remote_path in files_to_upload:
        try:
            # Check if file exists (if not overwriting)
            if not overwrite:
                try:
                    obs.head(store, remote_path)
                    print(f" ⏭ Skipped (exists): {remote_path}")
                    continue
                except Exception:
                    # File doesn't exist, proceed with upload
                    pass

            # Read local file
            with open(local_path, 'rb') as f:
                file_data = f.read()

            # Upload to OneLake Files
            obs.put(store, remote_path, file_data)

            file_size = len(file_data)
            size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
            size_unit = "MB" if file_size > 1024*1024 else "KB"

            print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
            uploaded_count += 1

        except Exception as e:
            print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
            failed_count += 1

    print(f"\n{'='*60}")
    if failed_count == 0:
        print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
    else:
        print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
    print(f"{'='*60}")

    return failed_count == 0


def download(duckrun_instance, remote_folder: str = "", local_folder: str = "./downloaded_files",
             file_extensions: Optional[List[str]] = None,
             overwrite: bool = False) -> bool:
    """
    Download files from OneLake Files section to a local folder.

    Args:
        duckrun_instance: The Duckrun connection instance
        remote_folder: Optional subfolder path in OneLake Files to download from
        local_folder: Local folder path to download files to (default: "./downloaded_files")
        file_extensions: Optional list of file extensions to filter (e.g., ['.csv', '.parquet'])
        overwrite: Whether to overwrite existing local files (default: False)

    Returns:
        True if all files downloaded successfully, False otherwise

    Examples:
        # Download all files from OneLake Files root
        dr.download()

        # Download only CSV files from a specific subfolder
        dr.download("daily_reports", "./reports", ['.csv'])
    """
    # Get Azure token
    token = duckrun_instance._get_storage_token()
    if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
        print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
        from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
        credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
        token_obj = credential.get_token("https://storage.azure.com/.default")
        token = token_obj.token
        os.environ["AZURE_STORAGE_TOKEN"] = token

    # Setup OneLake Files URL (not Tables)
    files_base_url = f'abfss://{duckrun_instance.workspace}@{duckrun_instance.storage_account}.dfs.fabric.microsoft.com/{duckrun_instance.lakehouse_name}.Lakehouse/Files/'
    store = AzureStore.from_url(files_base_url, bearer_token=token)

    # Create local directory
    os.makedirs(local_folder, exist_ok=True)

    # List files in OneLake Files
    print(f"📁 Discovering files in OneLake Files...")
    if remote_folder:
        print(f" Source folder: {remote_folder}")
        prefix = f"{remote_folder.strip('/')}/"
    else:
        prefix = ""

    try:
        list_stream = obs.list(store, prefix=prefix)
        files_to_download = []

        for batch in list_stream:
            for obj in batch:
                remote_path = obj["path"]

                # Filter by extensions if specified
                if file_extensions:
                    _, ext = os.path.splitext(remote_path)
                    if ext.lower() not in [e.lower() for e in file_extensions]:
                        continue

                # Calculate local path
                if remote_folder:
                    rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
                else:
                    rel_path = remote_path

                local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
                files_to_download.append((remote_path, local_path))

        if not files_to_download:
            print(f"No files found to download")
            if file_extensions:
                print(f" (filtered by extensions: {file_extensions})")
            return True

        print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")

        downloaded_count = 0
        failed_count = 0

        for remote_path, local_path in files_to_download:
            try:
                # Check if local file exists (if not overwriting)
                if not overwrite and os.path.exists(local_path):
                    print(f" ⏭ Skipped (exists): {local_path}")
                    continue

                # Ensure local directory exists
                os.makedirs(os.path.dirname(local_path), exist_ok=True)

                # Download file
                data = obs.get(store, remote_path).bytes()

                # Write to local file
                with open(local_path, 'wb') as f:
                    f.write(data)

                file_size = len(data)
                size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
                size_unit = "MB" if file_size > 1024*1024 else "KB"

                print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
                downloaded_count += 1

            except Exception as e:
                print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
                failed_count += 1

        print(f"\n{'='*60}")
        if failed_count == 0:
            print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
        else:
            print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
        print(f"{'='*60}")

        return failed_count == 0

    except Exception as e:
        print(f"❌ Error listing files from OneLake: {e}")
        return False
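In practice these helpers are reached through the Duckrun.copy and Duckrun.download wrappers rather than called with an explicit duckrun_instance. A short sketch under the same assumptions as above (folder names illustrative):

# File round trip via the wrapper methods, which forward to files.copy / files.download
import duckrun

con = duckrun.connect("ws/lh.lakehouse")

# Upload only CSV files from ./reports into Files/daily_reports/
ok = con.copy("./reports", "daily_reports", file_extensions=[".csv"])

# Download them back, skipping anything that already exists locally
ok = ok and con.download("daily_reports", "./reports_backup", [".csv"], overwrite=False)

print("all transfers succeeded" if ok else "some transfers failed")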