duckrun-0.2.2-py3-none-any.whl → duckrun-0.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/core.py +56 -581
- duckrun/files.py +251 -0
- duckrun/runner.py +287 -0
- duckrun/stats.py +231 -0
- duckrun/writer.py +165 -0
- {duckrun-0.2.2.dist-info → duckrun-0.2.4.dist-info}/METADATA +1 -1
- duckrun-0.2.4.dist-info/RECORD +11 -0
- duckrun-0.2.2.dist-info/RECORD +0 -7
- {duckrun-0.2.2.dist-info → duckrun-0.2.4.dist-info}/WHEEL +0 -0
- {duckrun-0.2.2.dist-info → duckrun-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.2.dist-info → duckrun-0.2.4.dist-info}/top_level.txt +0 -0
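The bulk of this release is a refactor: the Spark-style Delta writer, the pipeline runner, the table statistics helper, and the OneLake file transfer code move out of `duckrun/core.py` into the new `writer.py`, `runner.py`, `stats.py`, and `files.py` modules, with `Duckrun` keeping thin delegating methods. A minimal usage sketch of the API surface visible in this diff (table, folder, and pipeline names are illustrative, and `sql()` is assumed to still return the `QueryResult` wrapper):

```python
import duckrun

# Path format taken from the get_stats docstring below: workspace/lakehouse.lakehouse/schema
con = duckrun.connect("tmp/data.lakehouse/aemo")

# Spark-style write API (now in duckrun/writer.py)
con.sql("SELECT * FROM price").write.mode("append").saveAsTable("aemo.price_staged")

# Delegated helpers (now in runner.py, files.py and stats.py)
con.run([("price", "overwrite")])
con.copy("./exports", "daily_reports", file_extensions=['.csv'])
con.download("daily_reports", "./reports", ['.csv'])
stats = con.get_stats('aemo')
```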
duckrun/core.py
CHANGED
@@ -7,155 +7,11 @@ from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
 import obstore as obs
 from obstore.store import AzureStore
-
-
-
-
-
-def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
-    """
-    Build arguments for write_deltalake based on requirements:
-    - If schema_mode='merge': use rust engine (no row group params)
-    - Otherwise: use pyarrow engine with row group optimization
-    """
-    args = {
-        'table_or_uri': path,
-        'data': df,
-        'mode': mode
-    }
-
-    # Add partition_by if specified
-    if partition_by:
-        args['partition_by'] = partition_by
-
-    # Engine selection based on schema_mode
-    if schema_mode == 'merge':
-        # Use rust engine for schema merging (no row group params supported)
-        args['schema_mode'] = 'merge'
-        args['engine'] = 'rust'
-    else:
-        # Use pyarrow engine with row group optimization (default)
-        args['max_rows_per_file'] = RG
-        args['max_rows_per_group'] = RG
-        args['min_rows_per_group'] = RG
-
-    return args
-
-
-class DeltaWriter:
-    """Spark-style write API for Delta Lake"""
-
-    def __init__(self, relation, duckrun_instance):
-        self.relation = relation
-        self.duckrun = duckrun_instance
-        self._format = "delta"
-        self._mode = "overwrite"
-        self._schema_mode = None
-        self._partition_by = None
-
-    def format(self, format_type: str):
-        """Set output format (only 'delta' supported)"""
-        if format_type.lower() != "delta":
-            raise ValueError(f"Only 'delta' format is supported, got '{format_type}'")
-        self._format = "delta"
-        return self
-
-    def mode(self, write_mode: str):
-        """Set write mode: 'overwrite' or 'append'"""
-        if write_mode not in {"overwrite", "append"}:
-            raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
-        self._mode = write_mode
-        return self
-
-    def option(self, key: str, value):
-        """Set write option (Spark-compatible)"""
-        if key == "mergeSchema":
-            if str(value).lower() in ("true", "1"):
-                self._schema_mode = "merge"
-            else:
-                self._schema_mode = None
-        else:
-            raise ValueError(f"Unsupported option: {key}")
-        return self
-
-    def partitionBy(self, *columns):
-        """Set partition columns (Spark-compatible)"""
-        if len(columns) == 1 and isinstance(columns[0], (list, tuple)):
-            # Handle partitionBy(["col1", "col2"]) case
-            self._partition_by = list(columns[0])
-        else:
-            # Handle partitionBy("col1", "col2") case
-            self._partition_by = list(columns)
-        return self
-
-    def saveAsTable(self, table_name: str):
-        """Save query result as Delta table"""
-        if self._format != "delta":
-            raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
-
-        if "." in table_name:
-            schema, table = table_name.split(".", 1)
-        else:
-            schema = self.duckrun.schema
-            table = table_name
-
-        self.duckrun._create_onelake_secret()
-        path = f"{self.duckrun.table_base_url}{schema}/{table}"
-        df = self.relation.record_batch()
-
-        # Build write arguments based on schema_mode and partition_by
-        write_args = _build_write_deltalake_args(
-            path, df, self._mode,
-            schema_mode=self._schema_mode,
-            partition_by=self._partition_by
-        )
-
-        engine_info = f" (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
-        partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
-        print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
-
-        write_deltalake(**write_args)
-
-        self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
-        self.duckrun.con.sql(f"""
-            CREATE OR REPLACE VIEW {table}
-            AS SELECT * FROM delta_scan('{path}')
-        """)
-
-        dt = DeltaTable(path)
-
-        if self._mode == "overwrite":
-            dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
-            dt.cleanup_metadata()
-            print(f"✅ Table {schema}.{table} created/overwritten")
-        else:
-            file_count = len(dt.file_uris())
-            if file_count > self.duckrun.compaction_threshold:
-                print(f"Compacting {schema}.{table} ({file_count} files)")
-                dt.optimize.compact()
-                dt.vacuum(dry_run=False)
-                dt.cleanup_metadata()
-            print(f"✅ Data appended to {schema}.{table}")
-
-        return table
-
-
-class QueryResult:
-    """Wrapper for DuckDB relation with write API"""
-
-    def __init__(self, relation, duckrun_instance):
-        self.relation = relation
-        self.duckrun = duckrun_instance
-
-    @property
-    def write(self):
-        """Access write API"""
-        return DeltaWriter(self.relation, self.duckrun)
-
-    def __getattr__(self, name):
-        """Delegate all other methods to underlying DuckDB relation"""
-        return getattr(self.relation, name)
-
+from datetime import datetime
+from .stats import get_stats as _get_stats
+from .runner import run as _run
+from .files import copy as _copy, download as _download
+from .writer import QueryResult

 class Duckrun:
     """
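The `_build_write_deltalake_args` helper removed above (presumably now living alongside the writer in `duckrun/writer.py`) picks between two `write_deltalake` call shapes. A sketch of those two shapes, assuming a deltalake release that still accepts the `engine` and row-group keyword arguments used in the removed code; `RG`, the sample data, and the local path are illustrative stand-ins:

```python
import pyarrow as pa
from deltalake import write_deltalake

RG = 8_000_000                     # stand-in for duckrun's row-group size constant
df = pa.table({"id": [1, 2, 3]})   # stand-in for the query result record batch
path = "/tmp/example_delta_table"  # stand-in for the abfss:// OneLake table path

# mergeSchema=true -> rust engine with schema evolution, no row-group tuning
write_deltalake(path, df, mode="append", schema_mode="merge", engine="rust")

# default -> pyarrow engine with row-group sizing
write_deltalake(path, df, mode="overwrite",
                max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
```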
@@ -244,8 +100,6 @@ class Duckrun:
             workspace, lakehouse_name = parts
             scan_all_schemas = True
             schema = "dbo"
-            print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-            print(f" Scanning all schemas for table discovery...\n")
         elif len(parts) == 3:
             workspace, lakehouse_name, schema = parts
         else:
@@ -306,16 +160,13 @@ class Duckrun:

             if self.scan_all_schemas:
                 # Discover all schemas first
-                print("🔍 Discovering schemas...")
                 schemas_result = obs.list_with_delimiter(store, prefix=base_path)
                 schemas = [
                     prefix.rstrip('/').split('/')[-1]
                     for prefix in schemas_result['common_prefixes']
                 ]
-                print(f" Found {len(schemas)} schemas: {', '.join(schemas)}\n")

                 # Discover tables in each schema
-                print("🔍 Discovering tables...")
                 for schema_name in schemas:
                     schema_path = f"{base_path}{schema_name}/"
                     result = obs.list_with_delimiter(store, prefix=schema_path)
@@ -352,9 +203,22 @@ class Duckrun:
                 print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
                 return

-
+            # Group tables by schema for display
+            schema_tables = {}
+            for schema_name, table_name in tables:
+                if schema_name not in schema_tables:
+                    schema_tables[schema_name] = []
+                schema_tables[schema_name].append(table_name)
+
+            # Display tables by schema
+            print(f"\n📊 Found {len(tables)} tables:")
+            for schema_name in sorted(schema_tables.keys()):
+                table_list = sorted(schema_tables[schema_name])
+                print(f" {schema_name}: {', '.join(table_list)}")

             attached_count = 0
+            skipped_tables = []
+
             for schema_name, table_name in tables:
                 try:
                     if self.scan_all_schemas:
@@ -369,197 +233,21 @@ class Duckrun:
                         CREATE OR REPLACE VIEW {view_name}
                         AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
-                    print(f" ✓ Attached: {schema_name}.{table_name} → {view_name}")
                     attached_count += 1
                 except Exception as e:
-
+                    skipped_tables.append(f"{schema_name}.{table_name}")
                     continue

             print(f"\n{'='*60}")
-            print(f"✅
+            print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
+            if skipped_tables:
+                print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
             print(f"{'='*60}\n")
-
-            if self.scan_all_schemas:
-                print(f"\n💡 Note: Tables use schema.table format (e.g., aemo.calendar, dbo.results)")
-                print(f" Default schema for operations: {self.schema}\n")

         except Exception as e:
             print(f"❌ Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")

-    def _normalize_table_name(self, name: str) -> str:
-        """Extract base table name before first '__'"""
-        return name.split('__', 1)[0] if '__' in name else name
-
-    def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
-        if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot read SQL files.")
-
-        is_url = self.sql_folder.startswith("http")
-        if is_url:
-            url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
-            try:
-                resp = requests.get(url)
-                resp.raise_for_status()
-                content = resp.text
-            except Exception as e:
-                print(f"Failed to fetch SQL from {url}: {e}")
-                return None
-        else:
-            path = os.path.join(self.sql_folder, f"{table_name}.sql")
-            try:
-                with open(path, 'r') as f:
-                    content = f.read()
-            except Exception as e:
-                print(f"Failed to read SQL file {path}: {e}")
-                return None
-
-        if not content.strip():
-            print(f"SQL file is empty: {table_name}.sql")
-            return None
-
-        full_params = {
-            'ws': self.workspace,
-            'lh': self.lakehouse_name,
-            'schema': self.schema,
-            'storage_account': self.storage_account
-        }
-        if params:
-            full_params.update(params)
-
-        try:
-            template = Template(content)
-            content = template.substitute(full_params)
-        except KeyError as e:
-            print(f"Missing parameter in SQL file: ${e}")
-            return None
-        except Exception as e:
-            print(f"Error during SQL template substitution: {e}")
-            return None
-
-        return content
-
-    def _load_py_function(self, name: str) -> Optional[Callable]:
-        if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot load Python functions.")
-
-        is_url = self.sql_folder.startswith("http")
-        try:
-            if is_url:
-                url = f"{self.sql_folder.rstrip('/')}/{name}.py".strip()
-                resp = requests.get(url)
-                resp.raise_for_status()
-                code = resp.text
-                namespace = {}
-                exec(code, namespace)
-                func = namespace.get(name)
-                return func if callable(func) else None
-            else:
-                path = os.path.join(self.sql_folder, f"{name}.py")
-                if not os.path.isfile(path):
-                    print(f"Python file not found: {path}")
-                    return None
-                spec = importlib.util.spec_from_file_location(name, path)
-                mod = importlib.util.module_from_spec(spec)
-                spec.loader.exec_module(mod)
-                func = getattr(mod, name, None)
-                return func if callable(func) else None
-        except Exception as e:
-            print(f"Error loading Python function '{name}': {e}")
-            return None
-
-    def _run_python(self, name: str, args: tuple) -> Any:
-        """Execute Python task, return result"""
-        self._create_onelake_secret()
-        func = self._load_py_function(name)
-        if not func:
-            raise RuntimeError(f"Python function '{name}' not found")
-
-        print(f"Running Python: {name}{args}")
-        result = func(*args)
-        print(f"✅ Python '{name}' completed")
-        return result
-
-    def _run_sql(self, table: str, mode: str, params: Dict, delta_options: Dict = None) -> str:
-        """Execute SQL task, write to Delta, return normalized table name"""
-        self._create_onelake_secret()
-
-        if mode not in {'overwrite', 'append', 'ignore'}:
-            raise ValueError(f"Invalid mode '{mode}'. Use: overwrite, append, or ignore")
-
-        sql = self._read_sql_file(table, params)
-        if sql is None:
-            raise RuntimeError(f"Failed to read SQL file for '{table}'")
-
-        normalized_table = self._normalize_table_name(table)
-        path = f"{self.table_base_url}{self.schema}/{normalized_table}"
-
-        # Extract Delta Lake specific options from delta_options
-        delta_options = delta_options or {}
-        merge_schema = delta_options.get('mergeSchema')
-        schema_mode = 'merge' if str(merge_schema).lower() in ('true', '1') else None
-        partition_by = delta_options.get('partitionBy') or delta_options.get('partition_by')
-
-        if mode == 'overwrite':
-            self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
-            df = self.con.sql(sql).record_batch()
-
-            write_args = _build_write_deltalake_args(
-                path, df, 'overwrite',
-                schema_mode=schema_mode,
-                partition_by=partition_by
-            )
-            write_deltalake(**write_args)
-
-            self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
-            dt = DeltaTable(path)
-            dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
-            dt.cleanup_metadata()
-
-        elif mode == 'append':
-            df = self.con.sql(sql).record_batch()
-
-            write_args = _build_write_deltalake_args(
-                path, df, 'append',
-                schema_mode=schema_mode,
-                partition_by=partition_by
-            )
-            write_deltalake(**write_args)
-
-            self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
-            dt = DeltaTable(path)
-            if len(dt.file_uris()) > self.compaction_threshold:
-                print(f"Compacting {normalized_table} ({len(dt.file_uris())} files)")
-                dt.optimize.compact()
-                dt.vacuum(dry_run=False)
-                dt.cleanup_metadata()
-
-        elif mode == 'ignore':
-            try:
-                DeltaTable(path)
-                print(f"Table {normalized_table} exists. Skipping (mode='ignore')")
-            except Exception:
-                print(f"Table {normalized_table} doesn't exist. Creating...")
-                self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
-                df = self.con.sql(sql).record_batch()
-
-                write_args = _build_write_deltalake_args(
-                    path, df, 'overwrite',
-                    schema_mode=schema_mode,
-                    partition_by=partition_by
-                )
-                write_deltalake(**write_args)
-
-                self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
-                dt = DeltaTable(path)
-                dt.vacuum(dry_run=False)
-                dt.cleanup_metadata()
-
-        engine_info = f" (engine=rust, schema_mode=merge)" if schema_mode == 'merge' else " (engine=pyarrow)"
-        partition_info = f" partitioned by {partition_by}" if partition_by else ""
-        print(f"✅ SQL '{table}' → '{normalized_table}' ({mode}){engine_info}{partition_info}")
-        return normalized_table
-
     def run(self, pipeline: List[Tuple]) -> bool:
         """
         Execute pipeline of tasks.
@@ -573,59 +261,7 @@ class Duckrun:
             True if all tasks succeeded
             False if any task failed (exception) or Python task returned 0 (early exit)
         """
-
-            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
-
-        for i, task in enumerate(pipeline, 1):
-            print(f"\n{'='*60}")
-            print(f"Task {i}/{len(pipeline)}: {task[0]}")
-            print('='*60)
-
-            try:
-                result = None
-
-                if len(task) == 2:
-                    name, second = task
-                    if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
-                        result = self._run_sql(name, second, {}, {})
-                    else:
-                        args = second if isinstance(second, (tuple, list)) else (second,)
-                        result = self._run_python(name, tuple(args))
-
-                elif len(task) == 3:
-                    table, mode, params = task
-                    if not isinstance(params, dict):
-                        raise ValueError(f"Expected dict for params, got {type(params)}")
-                    result = self._run_sql(table, mode, params, {})
-
-                elif len(task) == 4:
-                    table, mode, params, delta_options = task
-                    if not isinstance(params, dict):
-                        raise ValueError(f"Expected dict for SQL params, got {type(params)}")
-                    if not isinstance(delta_options, dict):
-                        raise ValueError(f"Expected dict for Delta options, got {type(delta_options)}")
-                    result = self._run_sql(table, mode, params, delta_options)
-
-                else:
-                    raise ValueError(f"Invalid task format: {task}")
-
-                # Check if Python task returned 0 (early exit condition)
-                # Only check for Python tasks as SQL tasks return table names (strings) and only stop on exceptions
-                if (len(task) == 2 and
-                    not isinstance(task[1], str) and
-                    result == 0):
-                    print(f"\n⏹️ Python task {i} returned 0 - stopping pipeline execution")
-                    print(f" Remaining tasks ({len(pipeline) - i}) will not be executed")
-                    return False
-
-            except Exception as e:
-                print(f"\n❌ Task {i} failed: {e}")
-                return False
-
-        print(f"\n{'='*60}")
-        print("✅ All tasks completed successfully")
-        print('='*60)
-        return True
+        return _run(self, pipeline)

     def copy(self, local_folder: str, remote_folder: str,
              file_extensions: Optional[List[str]] = None,
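`run()` now just forwards to the new runner module, but the task-tuple shapes handled by the removed inline loop (2-tuples for Python tasks or SQL tasks with a bare mode, 3-tuples adding template params, 4-tuples adding Delta options) are presumably still what it accepts. A sketch of a pipeline in those shapes; task names and parameter values are illustrative:

```python
pipeline = [
    ("load_calendar", ("2024-01-01",)),   # Python task: load_calendar.py called with these args
    ("price", "overwrite"),               # SQL task: price.sql, mode 'overwrite'
    ("summary", "append", {"days": 7}),   # SQL task with a $days template parameter
    ("facts", "overwrite", {},            # SQL task with Delta Lake options
     {"mergeSchema": "true", "partitionBy": ["date"]}),
]

ok = con.run(pipeline)  # True if all tasks succeed; False on error or a Python task returning 0
```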
@@ -652,98 +288,7 @@ class Duckrun:
             # Upload with overwrite enabled
             dr.copy("./backup", "backups", overwrite=True)
         """
-
-            print(f"❌ Local folder not found: {local_folder}")
-            return False
-
-        if not os.path.isdir(local_folder):
-            print(f"❌ Path is not a directory: {local_folder}")
-            return False
-
-        # Get Azure token
-        token = self._get_storage_token()
-        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
-            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-            token_obj = credential.get_token("https://storage.azure.com/.default")
-            token = token_obj.token
-            os.environ["AZURE_STORAGE_TOKEN"] = token
-
-        # Setup OneLake Files URL (not Tables)
-        files_base_url = f'abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
-        store = AzureStore.from_url(files_base_url, bearer_token=token)
-
-        # Collect files to upload
-        files_to_upload = []
-        for root, dirs, files in os.walk(local_folder):
-            for file in files:
-                local_file_path = os.path.join(root, file)
-
-                # Filter by extensions if specified
-                if file_extensions:
-                    _, ext = os.path.splitext(file)
-                    if ext.lower() not in [e.lower() for e in file_extensions]:
-                        continue
-
-                # Calculate relative path from local_folder
-                rel_path = os.path.relpath(local_file_path, local_folder)
-
-                # Build remote path in OneLake Files (remote_folder is now mandatory)
-                remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
-
-                files_to_upload.append((local_file_path, remote_path))
-
-        if not files_to_upload:
-            print(f"No files found to upload in {local_folder}")
-            if file_extensions:
-                print(f" (filtered by extensions: {file_extensions})")
-            return True
-
-        print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
-        print(f" Target folder: {remote_folder}")
-
-        uploaded_count = 0
-        failed_count = 0
-
-        for local_path, remote_path in files_to_upload:
-            try:
-                # Check if file exists (if not overwriting)
-                if not overwrite:
-                    try:
-                        obs.head(store, remote_path)
-                        print(f" ⏭ Skipped (exists): {remote_path}")
-                        continue
-                    except Exception:
-                        # File doesn't exist, proceed with upload
-                        pass
-
-                # Read local file
-                with open(local_path, 'rb') as f:
-                    file_data = f.read()
-
-                # Upload to OneLake Files
-                obs.put(store, remote_path, file_data)
-
-                file_size = len(file_data)
-                size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
-                size_unit = "MB" if file_size > 1024*1024 else "KB"
-
-                print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
-                uploaded_count += 1
-
-            except Exception as e:
-                print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
-                failed_count += 1
-
-        print(f"\n{'='*60}")
-        if failed_count == 0:
-            print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
-        else:
-            print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
-        print(f"{'='*60}")
-
-        return failed_count == 0
+        return _copy(self, local_folder, remote_folder, file_extensions, overwrite)

     def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
                  file_extensions: Optional[List[str]] = None,
@@ -762,110 +307,12 @@ class Duckrun:

         Examples:
             # Download all files from OneLake Files root
-            dr.
+            dr.download()

             # Download only CSV files from a specific subfolder
-            dr.
+            dr.download("daily_reports", "./reports", ['.csv'])
         """
-
-        token = self._get_storage_token()
-        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
-            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-            token_obj = credential.get_token("https://storage.azure.com/.default")
-            token = token_obj.token
-            os.environ["AZURE_STORAGE_TOKEN"] = token
-
-        # Setup OneLake Files URL (not Tables)
-        files_base_url = f'abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
-        store = AzureStore.from_url(files_base_url, bearer_token=token)
-
-        # Create local directory
-        os.makedirs(local_folder, exist_ok=True)
-
-        # List files in OneLake Files
-        print(f"📁 Discovering files in OneLake Files...")
-        if remote_folder:
-            print(f" Source folder: {remote_folder}")
-            prefix = f"{remote_folder.strip('/')}/"
-        else:
-            prefix = ""
-
-        try:
-            list_stream = obs.list(store, prefix=prefix)
-            files_to_download = []
-
-            for batch in list_stream:
-                for obj in batch:
-                    remote_path = obj["path"]
-
-                    # Filter by extensions if specified
-                    if file_extensions:
-                        _, ext = os.path.splitext(remote_path)
-                        if ext.lower() not in [e.lower() for e in file_extensions]:
-                            continue
-
-                    # Calculate local path
-                    if remote_folder:
-                        rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
-                    else:
-                        rel_path = remote_path
-
-                    local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
-                    files_to_download.append((remote_path, local_path))
-
-            if not files_to_download:
-                print(f"No files found to download")
-                if file_extensions:
-                    print(f" (filtered by extensions: {file_extensions})")
-                return True
-
-            print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
-
-            downloaded_count = 0
-            failed_count = 0
-
-            for remote_path, local_path in files_to_download:
-                try:
-                    # Check if local file exists (if not overwriting)
-                    if not overwrite and os.path.exists(local_path):
-                        print(f" ⏭ Skipped (exists): {local_path}")
-                        continue
-
-                    # Ensure local directory exists
-                    os.makedirs(os.path.dirname(local_path), exist_ok=True)
-
-                    # Download file
-                    data = obs.get(store, remote_path).bytes()
-
-                    # Write to local file
-                    with open(local_path, 'wb') as f:
-                        f.write(data)
-
-                    file_size = len(data)
-                    size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
-                    size_unit = "MB" if file_size > 1024*1024 else "KB"
-
-                    print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
-                    downloaded_count += 1
-
-                except Exception as e:
-                    print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
-                    failed_count += 1
-
-            print(f"\n{'='*60}")
-            if failed_count == 0:
-                print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
-            else:
-                print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
-            print(f"{'='*60}")
-
-            return failed_count == 0
-
-        except Exception as e:
-            print(f"❌ Error listing files from OneLake: {e}")
-            return False
+        return _download(self, remote_folder, local_folder, file_extensions, overwrite)

     def sql(self, query: str):
         """
@@ -883,6 +330,34 @@ class Duckrun:
         """Get underlying DuckDB connection"""
         return self.con

+    def get_stats(self, source: str):
+        """
+        Get comprehensive statistics for Delta Lake tables.
+
+        Args:
+            source: Can be one of:
+                - Table name: 'table_name' (uses current schema)
+                - Schema.table: 'schema.table_name' (specific table in schema)
+                - Schema only: 'schema' (all tables in schema)
+
+        Returns:
+            Arrow table with statistics including total rows, file count, row groups,
+            average row group size, file sizes, VORDER status, and timestamp
+
+        Examples:
+            con = duckrun.connect("tmp/data.lakehouse/aemo")
+
+            # Single table in current schema
+            stats = con.get_stats('price')
+
+            # Specific table in different schema
+            stats = con.get_stats('aemo.price')
+
+            # All tables in a schema
+            stats = con.get_stats('aemo')
+        """
+        return _get_stats(self, source)
+
     def close(self):
         """Close DuckDB connection"""
         if self.con: