duckrun-0.2.2-py3-none-any.whl → duckrun-0.2.4-py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -7,155 +7,11 @@ from typing import List, Tuple, Union, Optional, Callable, Dict, Any
7
7
  from string import Template
8
8
  import obstore as obs
9
9
  from obstore.store import AzureStore
10
-
11
- # Row Group configuration for optimal Delta Lake performance
12
- RG = 8_000_000
13
-
14
-
15
- def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
16
- """
17
- Build arguments for write_deltalake based on requirements:
18
- - If schema_mode='merge': use rust engine (no row group params)
19
- - Otherwise: use pyarrow engine with row group optimization
20
- """
21
- args = {
22
- 'table_or_uri': path,
23
- 'data': df,
24
- 'mode': mode
25
- }
26
-
27
- # Add partition_by if specified
28
- if partition_by:
29
- args['partition_by'] = partition_by
30
-
31
- # Engine selection based on schema_mode
32
- if schema_mode == 'merge':
33
- # Use rust engine for schema merging (no row group params supported)
34
- args['schema_mode'] = 'merge'
35
- args['engine'] = 'rust'
36
- else:
37
- # Use pyarrow engine with row group optimization (default)
38
- args['max_rows_per_file'] = RG
39
- args['max_rows_per_group'] = RG
40
- args['min_rows_per_group'] = RG
41
-
42
- return args
43
-
44
-
45
- class DeltaWriter:
46
- """Spark-style write API for Delta Lake"""
47
-
48
- def __init__(self, relation, duckrun_instance):
49
- self.relation = relation
50
- self.duckrun = duckrun_instance
51
- self._format = "delta"
52
- self._mode = "overwrite"
53
- self._schema_mode = None
54
- self._partition_by = None
55
-
56
- def format(self, format_type: str):
57
- """Set output format (only 'delta' supported)"""
58
- if format_type.lower() != "delta":
59
- raise ValueError(f"Only 'delta' format is supported, got '{format_type}'")
60
- self._format = "delta"
61
- return self
62
-
63
- def mode(self, write_mode: str):
64
- """Set write mode: 'overwrite' or 'append'"""
65
- if write_mode not in {"overwrite", "append"}:
66
- raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
67
- self._mode = write_mode
68
- return self
69
-
70
- def option(self, key: str, value):
71
- """Set write option (Spark-compatible)"""
72
- if key == "mergeSchema":
73
- if str(value).lower() in ("true", "1"):
74
- self._schema_mode = "merge"
75
- else:
76
- self._schema_mode = None
77
- else:
78
- raise ValueError(f"Unsupported option: {key}")
79
- return self
80
-
81
- def partitionBy(self, *columns):
82
- """Set partition columns (Spark-compatible)"""
83
- if len(columns) == 1 and isinstance(columns[0], (list, tuple)):
84
- # Handle partitionBy(["col1", "col2"]) case
85
- self._partition_by = list(columns[0])
86
- else:
87
- # Handle partitionBy("col1", "col2") case
88
- self._partition_by = list(columns)
89
- return self
90
-
91
- def saveAsTable(self, table_name: str):
92
- """Save query result as Delta table"""
93
- if self._format != "delta":
94
- raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
95
-
96
- if "." in table_name:
97
- schema, table = table_name.split(".", 1)
98
- else:
99
- schema = self.duckrun.schema
100
- table = table_name
101
-
102
- self.duckrun._create_onelake_secret()
103
- path = f"{self.duckrun.table_base_url}{schema}/{table}"
104
- df = self.relation.record_batch()
105
-
106
- # Build write arguments based on schema_mode and partition_by
107
- write_args = _build_write_deltalake_args(
108
- path, df, self._mode,
109
- schema_mode=self._schema_mode,
110
- partition_by=self._partition_by
111
- )
112
-
113
- engine_info = f" (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
114
- partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
115
- print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
116
-
117
- write_deltalake(**write_args)
118
-
119
- self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
120
- self.duckrun.con.sql(f"""
121
- CREATE OR REPLACE VIEW {table}
122
- AS SELECT * FROM delta_scan('{path}')
123
- """)
124
-
125
- dt = DeltaTable(path)
126
-
127
- if self._mode == "overwrite":
128
- dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
129
- dt.cleanup_metadata()
130
- print(f"✅ Table {schema}.{table} created/overwritten")
131
- else:
132
- file_count = len(dt.file_uris())
133
- if file_count > self.duckrun.compaction_threshold:
134
- print(f"Compacting {schema}.{table} ({file_count} files)")
135
- dt.optimize.compact()
136
- dt.vacuum(dry_run=False)
137
- dt.cleanup_metadata()
138
- print(f"✅ Data appended to {schema}.{table}")
139
-
140
- return table
141
-
142
-
143
- class QueryResult:
144
- """Wrapper for DuckDB relation with write API"""
145
-
146
- def __init__(self, relation, duckrun_instance):
147
- self.relation = relation
148
- self.duckrun = duckrun_instance
149
-
150
- @property
151
- def write(self):
152
- """Access write API"""
153
- return DeltaWriter(self.relation, self.duckrun)
154
-
155
- def __getattr__(self, name):
156
- """Delegate all other methods to underlying DuckDB relation"""
157
- return getattr(self.relation, name)
158
-
10
+ from datetime import datetime
11
+ from .stats import get_stats as _get_stats
12
+ from .runner import run as _run
13
+ from .files import copy as _copy, download as _download
14
+ from .writer import QueryResult
159
15
 
160
16
  class Duckrun:
161
17
  """
@@ -244,8 +100,6 @@ class Duckrun:
244
100
  workspace, lakehouse_name = parts
245
101
  scan_all_schemas = True
246
102
  schema = "dbo"
247
- print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
248
- print(f" Scanning all schemas for table discovery...\n")
249
103
  elif len(parts) == 3:
250
104
  workspace, lakehouse_name, schema = parts
251
105
  else:
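For reference, the two connection-path forms handled by the parsing above; the three-part form is taken from the get_stats docstring later in this diff, and the two-part form follows from the len(parts) == 2 branch, which in 0.2.4 no longer prints the default-schema notice.

```python
# Connection paths accepted by the parsing above; the exact values are illustrative.
import duckrun

# workspace/lakehouse: schema defaults to 'dbo' and all schemas are scanned
con_all = duckrun.connect("tmp/data.lakehouse")

# workspace/lakehouse/schema: operations target the named schema
con_aemo = duckrun.connect("tmp/data.lakehouse/aemo")
```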
@@ -306,16 +160,13 @@ class Duckrun:
306
160
 
307
161
  if self.scan_all_schemas:
308
162
  # Discover all schemas first
309
- print("🔍 Discovering schemas...")
310
163
  schemas_result = obs.list_with_delimiter(store, prefix=base_path)
311
164
  schemas = [
312
165
  prefix.rstrip('/').split('/')[-1]
313
166
  for prefix in schemas_result['common_prefixes']
314
167
  ]
315
- print(f" Found {len(schemas)} schemas: {', '.join(schemas)}\n")
316
168
 
317
169
  # Discover tables in each schema
318
- print("🔍 Discovering tables...")
319
170
  for schema_name in schemas:
320
171
  schema_path = f"{base_path}{schema_name}/"
321
172
  result = obs.list_with_delimiter(store, prefix=schema_path)
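The discovery code above leans on obstore's delimiter listing: common_prefixes holds the immediate child folders, which duckrun treats as schemas (and, one level down, as tables). A minimal standalone sketch follows; the store URL, token and prefix values are illustrative, and rooting the store directly at Tables/ is an assumption made for brevity.

```python
# Minimal sketch of the delimiter-based discovery used above; URL, token and
# prefix are placeholders (assumed values, not taken from the package).
import obstore as obs
from obstore.store import AzureStore

tables_url = "abfss://tmp@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Tables/"
store = AzureStore.from_url(tables_url, bearer_token="<azure-storage-token>")

result = obs.list_with_delimiter(store, prefix="")
schemas = [p.rstrip("/").split("/")[-1] for p in result["common_prefixes"]]
print(schemas)  # e.g. ['aemo', 'dbo']
```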
@@ -352,9 +203,22 @@ class Duckrun:
352
203
  print(f"No Delta tables found in {self.lakehouse_name}.Lakehouse/Tables/{self.schema}/")
353
204
  return
354
205
 
355
- print(f"\n📊 Found {len(tables)} Delta tables. Attaching as views...\n")
206
+ # Group tables by schema for display
207
+ schema_tables = {}
208
+ for schema_name, table_name in tables:
209
+ if schema_name not in schema_tables:
210
+ schema_tables[schema_name] = []
211
+ schema_tables[schema_name].append(table_name)
212
+
213
+ # Display tables by schema
214
+ print(f"\n📊 Found {len(tables)} tables:")
215
+ for schema_name in sorted(schema_tables.keys()):
216
+ table_list = sorted(schema_tables[schema_name])
217
+ print(f" {schema_name}: {', '.join(table_list)}")
356
218
 
357
219
  attached_count = 0
220
+ skipped_tables = []
221
+
358
222
  for schema_name, table_name in tables:
359
223
  try:
360
224
  if self.scan_all_schemas:
@@ -369,197 +233,21 @@ class Duckrun:
369
233
  CREATE OR REPLACE VIEW {view_name}
370
234
  AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
371
235
  """)
372
- print(f" ✓ Attached: {schema_name}.{table_name} → {view_name}")
373
236
  attached_count += 1
374
237
  except Exception as e:
375
- print(f" ⚠ Skipped {schema_name}.{table_name}: {str(e)[:100]}")
238
+ skipped_tables.append(f"{schema_name}.{table_name}")
376
239
  continue
377
240
 
378
241
  print(f"\n{'='*60}")
379
- print(f"✅ Successfully attached {attached_count}/{len(tables)} tables")
242
+ print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
243
+ if skipped_tables:
244
+ print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
380
245
  print(f"{'='*60}\n")
381
-
382
- if self.scan_all_schemas:
383
- print(f"\n💡 Note: Tables use schema.table format (e.g., aemo.calendar, dbo.results)")
384
- print(f" Default schema for operations: {self.schema}\n")
385
246
 
386
247
  except Exception as e:
387
248
  print(f"❌ Error attaching lakehouse: {e}")
388
249
  print("Continuing without pre-attached tables.")
389
250
 
390
- def _normalize_table_name(self, name: str) -> str:
391
- """Extract base table name before first '__'"""
392
- return name.split('__', 1)[0] if '__' in name else name
393
-
394
- def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
395
- if self.sql_folder is None:
396
- raise RuntimeError("sql_folder is not configured. Cannot read SQL files.")
397
-
398
- is_url = self.sql_folder.startswith("http")
399
- if is_url:
400
- url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
401
- try:
402
- resp = requests.get(url)
403
- resp.raise_for_status()
404
- content = resp.text
405
- except Exception as e:
406
- print(f"Failed to fetch SQL from {url}: {e}")
407
- return None
408
- else:
409
- path = os.path.join(self.sql_folder, f"{table_name}.sql")
410
- try:
411
- with open(path, 'r') as f:
412
- content = f.read()
413
- except Exception as e:
414
- print(f"Failed to read SQL file {path}: {e}")
415
- return None
416
-
417
- if not content.strip():
418
- print(f"SQL file is empty: {table_name}.sql")
419
- return None
420
-
421
- full_params = {
422
- 'ws': self.workspace,
423
- 'lh': self.lakehouse_name,
424
- 'schema': self.schema,
425
- 'storage_account': self.storage_account
426
- }
427
- if params:
428
- full_params.update(params)
429
-
430
- try:
431
- template = Template(content)
432
- content = template.substitute(full_params)
433
- except KeyError as e:
434
- print(f"Missing parameter in SQL file: ${e}")
435
- return None
436
- except Exception as e:
437
- print(f"Error during SQL template substitution: {e}")
438
- return None
439
-
440
- return content
441
-
442
- def _load_py_function(self, name: str) -> Optional[Callable]:
443
- if self.sql_folder is None:
444
- raise RuntimeError("sql_folder is not configured. Cannot load Python functions.")
445
-
446
- is_url = self.sql_folder.startswith("http")
447
- try:
448
- if is_url:
449
- url = f"{self.sql_folder.rstrip('/')}/{name}.py".strip()
450
- resp = requests.get(url)
451
- resp.raise_for_status()
452
- code = resp.text
453
- namespace = {}
454
- exec(code, namespace)
455
- func = namespace.get(name)
456
- return func if callable(func) else None
457
- else:
458
- path = os.path.join(self.sql_folder, f"{name}.py")
459
- if not os.path.isfile(path):
460
- print(f"Python file not found: {path}")
461
- return None
462
- spec = importlib.util.spec_from_file_location(name, path)
463
- mod = importlib.util.module_from_spec(spec)
464
- spec.loader.exec_module(mod)
465
- func = getattr(mod, name, None)
466
- return func if callable(func) else None
467
- except Exception as e:
468
- print(f"Error loading Python function '{name}': {e}")
469
- return None
470
-
471
- def _run_python(self, name: str, args: tuple) -> Any:
472
- """Execute Python task, return result"""
473
- self._create_onelake_secret()
474
- func = self._load_py_function(name)
475
- if not func:
476
- raise RuntimeError(f"Python function '{name}' not found")
477
-
478
- print(f"Running Python: {name}{args}")
479
- result = func(*args)
480
- print(f"✅ Python '{name}' completed")
481
- return result
482
-
483
- def _run_sql(self, table: str, mode: str, params: Dict, delta_options: Dict = None) -> str:
484
- """Execute SQL task, write to Delta, return normalized table name"""
485
- self._create_onelake_secret()
486
-
487
- if mode not in {'overwrite', 'append', 'ignore'}:
488
- raise ValueError(f"Invalid mode '{mode}'. Use: overwrite, append, or ignore")
489
-
490
- sql = self._read_sql_file(table, params)
491
- if sql is None:
492
- raise RuntimeError(f"Failed to read SQL file for '{table}'")
493
-
494
- normalized_table = self._normalize_table_name(table)
495
- path = f"{self.table_base_url}{self.schema}/{normalized_table}"
496
-
497
- # Extract Delta Lake specific options from delta_options
498
- delta_options = delta_options or {}
499
- merge_schema = delta_options.get('mergeSchema')
500
- schema_mode = 'merge' if str(merge_schema).lower() in ('true', '1') else None
501
- partition_by = delta_options.get('partitionBy') or delta_options.get('partition_by')
502
-
503
- if mode == 'overwrite':
504
- self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
505
- df = self.con.sql(sql).record_batch()
506
-
507
- write_args = _build_write_deltalake_args(
508
- path, df, 'overwrite',
509
- schema_mode=schema_mode,
510
- partition_by=partition_by
511
- )
512
- write_deltalake(**write_args)
513
-
514
- self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
515
- dt = DeltaTable(path)
516
- dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
517
- dt.cleanup_metadata()
518
-
519
- elif mode == 'append':
520
- df = self.con.sql(sql).record_batch()
521
-
522
- write_args = _build_write_deltalake_args(
523
- path, df, 'append',
524
- schema_mode=schema_mode,
525
- partition_by=partition_by
526
- )
527
- write_deltalake(**write_args)
528
-
529
- self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
530
- dt = DeltaTable(path)
531
- if len(dt.file_uris()) > self.compaction_threshold:
532
- print(f"Compacting {normalized_table} ({len(dt.file_uris())} files)")
533
- dt.optimize.compact()
534
- dt.vacuum(dry_run=False)
535
- dt.cleanup_metadata()
536
-
537
- elif mode == 'ignore':
538
- try:
539
- DeltaTable(path)
540
- print(f"Table {normalized_table} exists. Skipping (mode='ignore')")
541
- except Exception:
542
- print(f"Table {normalized_table} doesn't exist. Creating...")
543
- self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
544
- df = self.con.sql(sql).record_batch()
545
-
546
- write_args = _build_write_deltalake_args(
547
- path, df, 'overwrite',
548
- schema_mode=schema_mode,
549
- partition_by=partition_by
550
- )
551
- write_deltalake(**write_args)
552
-
553
- self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
554
- dt = DeltaTable(path)
555
- dt.vacuum(dry_run=False)
556
- dt.cleanup_metadata()
557
-
558
- engine_info = f" (engine=rust, schema_mode=merge)" if schema_mode == 'merge' else " (engine=pyarrow)"
559
- partition_info = f" partitioned by {partition_by}" if partition_by else ""
560
- print(f"✅ SQL '{table}' → '{normalized_table}' ({mode}){engine_info}{partition_info}")
561
- return normalized_table
562
-
563
251
  def run(self, pipeline: List[Tuple]) -> bool:
564
252
  """
565
253
  Execute pipeline of tasks.
@@ -573,59 +261,7 @@ class Duckrun:
573
261
  True if all tasks succeeded
574
262
  False if any task failed (exception) or Python task returned 0 (early exit)
575
263
  """
576
- if self.sql_folder is None:
577
- raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
578
-
579
- for i, task in enumerate(pipeline, 1):
580
- print(f"\n{'='*60}")
581
- print(f"Task {i}/{len(pipeline)}: {task[0]}")
582
- print('='*60)
583
-
584
- try:
585
- result = None
586
-
587
- if len(task) == 2:
588
- name, second = task
589
- if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
590
- result = self._run_sql(name, second, {}, {})
591
- else:
592
- args = second if isinstance(second, (tuple, list)) else (second,)
593
- result = self._run_python(name, tuple(args))
594
-
595
- elif len(task) == 3:
596
- table, mode, params = task
597
- if not isinstance(params, dict):
598
- raise ValueError(f"Expected dict for params, got {type(params)}")
599
- result = self._run_sql(table, mode, params, {})
600
-
601
- elif len(task) == 4:
602
- table, mode, params, delta_options = task
603
- if not isinstance(params, dict):
604
- raise ValueError(f"Expected dict for SQL params, got {type(params)}")
605
- if not isinstance(delta_options, dict):
606
- raise ValueError(f"Expected dict for Delta options, got {type(delta_options)}")
607
- result = self._run_sql(table, mode, params, delta_options)
608
-
609
- else:
610
- raise ValueError(f"Invalid task format: {task}")
611
-
612
- # Check if Python task returned 0 (early exit condition)
613
- # Only check for Python tasks as SQL tasks return table names (strings) and only stop on exceptions
614
- if (len(task) == 2 and
615
- not isinstance(task[1], str) and
616
- result == 0):
617
- print(f"\n⏹️ Python task {i} returned 0 - stopping pipeline execution")
618
- print(f" Remaining tasks ({len(pipeline) - i}) will not be executed")
619
- return False
620
-
621
- except Exception as e:
622
- print(f"\n❌ Task {i} failed: {e}")
623
- return False
624
-
625
- print(f"\n{'='*60}")
626
- print("✅ All tasks completed successfully")
627
- print('='*60)
628
- return True
264
+ return _run(self, pipeline)
629
265
 
630
266
  def copy(self, local_folder: str, remote_folder: str,
631
267
  file_extensions: Optional[List[str]] = None,
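The dispatch logic removed here (run() now delegates to duckrun.runner.run, which this diff does not include) accepted four task shapes. Below is a hedged example pipeline reconstructed from that logic; every task, file and parameter name is hypothetical, and the connection is assumed to have a sql_folder pointing at the .sql/.py task files.

```python
# Pipeline task shapes accepted by the dispatch logic removed above.
# All names here are hypothetical.
import duckrun

con = duckrun.connect("tmp/data.lakehouse/aemo")

pipeline = [
    # (sql_name, mode): runs <sql_folder>/staging.sql; mode is overwrite/append/ignore
    ("staging", "overwrite"),

    # (py_name, args): calls download(2024, 1) from <sql_folder>/download.py;
    # a Python task returning 0 stops the pipeline early
    ("download", (2024, 1)),

    # (sql_name, mode, params): params are $-substituted into the SQL template,
    # alongside the built-in $ws, $lh, $schema and $storage_account;
    # the part after '__' is dropped, so this still targets table 'price'
    ("price__january", "append", {"start_date": "2024-01-01"}),

    # (sql_name, mode, params, delta_options): Delta write options
    ("price_wide", "overwrite", {}, {"mergeSchema": "true",
                                     "partitionBy": ["settlementdate"]}),
]

con.run(pipeline)  # returns True only if every task succeeded
```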
@@ -652,98 +288,7 @@ class Duckrun:
652
288
  # Upload with overwrite enabled
653
289
  dr.copy("./backup", "backups", overwrite=True)
654
290
  """
655
- if not os.path.exists(local_folder):
656
- print(f"❌ Local folder not found: {local_folder}")
657
- return False
658
-
659
- if not os.path.isdir(local_folder):
660
- print(f"❌ Path is not a directory: {local_folder}")
661
- return False
662
-
663
- # Get Azure token
664
- token = self._get_storage_token()
665
- if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
666
- print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
667
- from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
668
- credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
669
- token_obj = credential.get_token("https://storage.azure.com/.default")
670
- token = token_obj.token
671
- os.environ["AZURE_STORAGE_TOKEN"] = token
672
-
673
- # Setup OneLake Files URL (not Tables)
674
- files_base_url = f'abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
675
- store = AzureStore.from_url(files_base_url, bearer_token=token)
676
-
677
- # Collect files to upload
678
- files_to_upload = []
679
- for root, dirs, files in os.walk(local_folder):
680
- for file in files:
681
- local_file_path = os.path.join(root, file)
682
-
683
- # Filter by extensions if specified
684
- if file_extensions:
685
- _, ext = os.path.splitext(file)
686
- if ext.lower() not in [e.lower() for e in file_extensions]:
687
- continue
688
-
689
- # Calculate relative path from local_folder
690
- rel_path = os.path.relpath(local_file_path, local_folder)
691
-
692
- # Build remote path in OneLake Files (remote_folder is now mandatory)
693
- remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
694
-
695
- files_to_upload.append((local_file_path, remote_path))
696
-
697
- if not files_to_upload:
698
- print(f"No files found to upload in {local_folder}")
699
- if file_extensions:
700
- print(f" (filtered by extensions: {file_extensions})")
701
- return True
702
-
703
- print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
704
- print(f" Target folder: {remote_folder}")
705
-
706
- uploaded_count = 0
707
- failed_count = 0
708
-
709
- for local_path, remote_path in files_to_upload:
710
- try:
711
- # Check if file exists (if not overwriting)
712
- if not overwrite:
713
- try:
714
- obs.head(store, remote_path)
715
- print(f" ⏭ Skipped (exists): {remote_path}")
716
- continue
717
- except Exception:
718
- # File doesn't exist, proceed with upload
719
- pass
720
-
721
- # Read local file
722
- with open(local_path, 'rb') as f:
723
- file_data = f.read()
724
-
725
- # Upload to OneLake Files
726
- obs.put(store, remote_path, file_data)
727
-
728
- file_size = len(file_data)
729
- size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
730
- size_unit = "MB" if file_size > 1024*1024 else "KB"
731
-
732
- print(f" ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
733
- uploaded_count += 1
734
-
735
- except Exception as e:
736
- print(f" ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
737
- failed_count += 1
738
-
739
- print(f"\n{'='*60}")
740
- if failed_count == 0:
741
- print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
742
- else:
743
- print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
744
- print(f"{'='*60}")
745
-
746
- return failed_count == 0
291
+ return _copy(self, local_folder, remote_folder, file_extensions, overwrite)
747
292
 
748
293
  def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
749
294
  file_extensions: Optional[List[str]] = None,
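copy() keeps its signature but now forwards to duckrun.files.copy. The first call below is taken from the docstring retained above; the second illustrates the extension filter with hypothetical folder names.

```python
# Upload local folders into the lakehouse Files area (copy() now delegates to
# duckrun.files.copy). Folder names in the second call are hypothetical.
import duckrun

dr = duckrun.connect("tmp/data.lakehouse/aemo")

# Upload with overwrite enabled (from the docstring above)
dr.copy("./backup", "backups", overwrite=True)

# Upload only CSV files; pass overwrite=True to replace files that already exist
dr.copy("./exports", "daily_reports", file_extensions=[".csv"])
```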
@@ -762,110 +307,12 @@ class Duckrun:
762
307
 
763
308
  Examples:
764
309
  # Download all files from OneLake Files root
765
- dr.download_from_files()
310
+ dr.download()
766
311
 
767
312
  # Download only CSV files from a specific subfolder
768
- dr.download_from_files("daily_reports", "./reports", ['.csv'])
313
+ dr.download("daily_reports", "./reports", ['.csv'])
769
314
  """
770
- # Get Azure token
771
- token = self._get_storage_token()
772
- if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
773
- print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
774
- from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
775
- credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
776
- token_obj = credential.get_token("https://storage.azure.com/.default")
777
- token = token_obj.token
778
- os.environ["AZURE_STORAGE_TOKEN"] = token
779
-
780
- # Setup OneLake Files URL (not Tables)
781
- files_base_url = f'abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
782
- store = AzureStore.from_url(files_base_url, bearer_token=token)
783
-
784
- # Create local directory
785
- os.makedirs(local_folder, exist_ok=True)
786
-
787
- # List files in OneLake Files
788
- print(f"📁 Discovering files in OneLake Files...")
789
- if remote_folder:
790
- print(f" Source folder: {remote_folder}")
791
- prefix = f"{remote_folder.strip('/')}/"
792
- else:
793
- prefix = ""
794
-
795
- try:
796
- list_stream = obs.list(store, prefix=prefix)
797
- files_to_download = []
798
-
799
- for batch in list_stream:
800
- for obj in batch:
801
- remote_path = obj["path"]
802
-
803
- # Filter by extensions if specified
804
- if file_extensions:
805
- _, ext = os.path.splitext(remote_path)
806
- if ext.lower() not in [e.lower() for e in file_extensions]:
807
- continue
808
-
809
- # Calculate local path
810
- if remote_folder:
811
- rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
812
- else:
813
- rel_path = remote_path
814
-
815
- local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
816
- files_to_download.append((remote_path, local_path))
817
-
818
- if not files_to_download:
819
- print(f"No files found to download")
820
- if file_extensions:
821
- print(f" (filtered by extensions: {file_extensions})")
822
- return True
823
-
824
- print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
825
-
826
- downloaded_count = 0
827
- failed_count = 0
828
-
829
- for remote_path, local_path in files_to_download:
830
- try:
831
- # Check if local file exists (if not overwriting)
832
- if not overwrite and os.path.exists(local_path):
833
- print(f" ⏭ Skipped (exists): {local_path}")
834
- continue
835
-
836
- # Ensure local directory exists
837
- os.makedirs(os.path.dirname(local_path), exist_ok=True)
838
-
839
- # Download file
840
- data = obs.get(store, remote_path).bytes()
841
-
842
- # Write to local file
843
- with open(local_path, 'wb') as f:
844
- f.write(data)
845
-
846
- file_size = len(data)
847
- size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
848
- size_unit = "MB" if file_size > 1024*1024 else "KB"
849
-
850
- print(f" ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
851
- downloaded_count += 1
852
-
853
- except Exception as e:
854
- print(f" ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
855
- failed_count += 1
856
-
857
- print(f"\n{'='*60}")
858
- if failed_count == 0:
859
- print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
860
- else:
861
- print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
862
- print(f"{'='*60}")
863
-
864
- return failed_count == 0
865
-
866
- except Exception as e:
867
- print(f"❌ Error listing files from OneLake: {e}")
868
- return False
315
+ return _download(self, remote_folder, local_folder, file_extensions, overwrite)
869
316
 
870
317
  def sql(self, query: str):
871
318
  """
@@ -883,6 +330,34 @@ class Duckrun:
883
330
  """Get underlying DuckDB connection"""
884
331
  return self.con
885
332
 
333
+ def get_stats(self, source: str):
334
+ """
335
+ Get comprehensive statistics for Delta Lake tables.
336
+
337
+ Args:
338
+ source: Can be one of:
339
+ - Table name: 'table_name' (uses current schema)
340
+ - Schema.table: 'schema.table_name' (specific table in schema)
341
+ - Schema only: 'schema' (all tables in schema)
342
+
343
+ Returns:
344
+ Arrow table with statistics including total rows, file count, row groups,
345
+ average row group size, file sizes, VORDER status, and timestamp
346
+
347
+ Examples:
348
+ con = duckrun.connect("tmp/data.lakehouse/aemo")
349
+
350
+ # Single table in current schema
351
+ stats = con.get_stats('price')
352
+
353
+ # Specific table in different schema
354
+ stats = con.get_stats('aemo.price')
355
+
356
+ # All tables in a schema
357
+ stats = con.get_stats('aemo')
358
+ """
359
+ return _get_stats(self, source)
360
+
886
361
  def close(self):
887
362
  """Close DuckDB connection"""
888
363
  if self.con:
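The new get_stats method is a thin wrapper over duckrun.stats.get_stats (not shown in this diff). The calls below mirror the docstring added above; per that docstring, the result is an Arrow table with row counts, file counts, row-group sizes, VORDER status and a timestamp.

```python
# Usage of the new get_stats entry point, following the docstring added above.
import duckrun

con = duckrun.connect("tmp/data.lakehouse/aemo")

stats_one   = con.get_stats("price")       # single table in the current schema
stats_other = con.get_stats("aemo.price")  # schema-qualified table
stats_all   = con.get_stats("aemo")        # every table in the schema
```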