duckrun 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -7,155 +7,11 @@ from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
 import obstore as obs
 from obstore.store import AzureStore
-
-# Row Group configuration for optimal Delta Lake performance
-RG = 8_000_000
-
-
-def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
-    """
-    Build arguments for write_deltalake based on requirements:
-    - If schema_mode='merge': use rust engine (no row group params)
-    - Otherwise: use pyarrow engine with row group optimization
-    """
-    args = {
-        'table_or_uri': path,
-        'data': df,
-        'mode': mode
-    }
-
-    # Add partition_by if specified
-    if partition_by:
-        args['partition_by'] = partition_by
-
-    # Engine selection based on schema_mode
-    if schema_mode == 'merge':
-        # Use rust engine for schema merging (no row group params supported)
-        args['schema_mode'] = 'merge'
-        args['engine'] = 'rust'
-    else:
-        # Use pyarrow engine with row group optimization (default)
-        args['max_rows_per_file'] = RG
-        args['max_rows_per_group'] = RG
-        args['min_rows_per_group'] = RG
-
-    return args
-
-
-class DeltaWriter:
-    """Spark-style write API for Delta Lake"""
-
-    def __init__(self, relation, duckrun_instance):
-        self.relation = relation
-        self.duckrun = duckrun_instance
-        self._format = "delta"
-        self._mode = "overwrite"
-        self._schema_mode = None
-        self._partition_by = None
-
-    def format(self, format_type: str):
-        """Set output format (only 'delta' supported)"""
-        if format_type.lower() != "delta":
-            raise ValueError(f"Only 'delta' format is supported, got '{format_type}'")
-        self._format = "delta"
-        return self
-
-    def mode(self, write_mode: str):
-        """Set write mode: 'overwrite' or 'append'"""
-        if write_mode not in {"overwrite", "append"}:
-            raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
-        self._mode = write_mode
-        return self
-
-    def option(self, key: str, value):
-        """Set write option (Spark-compatible)"""
-        if key == "mergeSchema":
-            if str(value).lower() in ("true", "1"):
-                self._schema_mode = "merge"
-            else:
-                self._schema_mode = None
-        else:
-            raise ValueError(f"Unsupported option: {key}")
-        return self
-
-    def partitionBy(self, *columns):
-        """Set partition columns (Spark-compatible)"""
-        if len(columns) == 1 and isinstance(columns[0], (list, tuple)):
-            # Handle partitionBy(["col1", "col2"]) case
-            self._partition_by = list(columns[0])
-        else:
-            # Handle partitionBy("col1", "col2") case
-            self._partition_by = list(columns)
-        return self
-
-    def saveAsTable(self, table_name: str):
-        """Save query result as Delta table"""
-        if self._format != "delta":
-            raise RuntimeError(f"Only 'delta' format is supported, got '{self._format}'")
-
-        if "." in table_name:
-            schema, table = table_name.split(".", 1)
-        else:
-            schema = self.duckrun.schema
-            table = table_name
-
-        self.duckrun._create_onelake_secret()
-        path = f"{self.duckrun.table_base_url}{schema}/{table}"
-        df = self.relation.record_batch()
-
-        # Build write arguments based on schema_mode and partition_by
-        write_args = _build_write_deltalake_args(
-            path, df, self._mode,
-            schema_mode=self._schema_mode,
-            partition_by=self._partition_by
-        )
-
-        engine_info = f" (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
-        partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
-        print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
-
-        write_deltalake(**write_args)
-
-        self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
-        self.duckrun.con.sql(f"""
-            CREATE OR REPLACE VIEW {table}
-            AS SELECT * FROM delta_scan('{path}')
-        """)
-
-        dt = DeltaTable(path)
-
-        if self._mode == "overwrite":
-            dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
-            dt.cleanup_metadata()
-            print(f"✅ Table {schema}.{table} created/overwritten")
-        else:
-            file_count = len(dt.file_uris())
-            if file_count > self.duckrun.compaction_threshold:
-                print(f"Compacting {schema}.{table} ({file_count} files)")
-                dt.optimize.compact()
-                dt.vacuum(dry_run=False)
-                dt.cleanup_metadata()
-            print(f"✅ Data appended to {schema}.{table}")
-
-        return table
-
-
-class QueryResult:
-    """Wrapper for DuckDB relation with write API"""
-
-    def __init__(self, relation, duckrun_instance):
-        self.relation = relation
-        self.duckrun = duckrun_instance
-
-    @property
-    def write(self):
-        """Access write API"""
-        return DeltaWriter(self.relation, self.duckrun)
-
-    def __getattr__(self, name):
-        """Delegate all other methods to underlying DuckDB relation"""
-        return getattr(self.relation, name)
-
+from datetime import datetime
+from .stats import get_stats as _get_stats
+from .runner import run as _run
+from .files import copy as _copy, download as _download
+from .writer import QueryResult
 
 class Duckrun:
     """
@@ -195,21 +51,22 @@ class Duckrun:
 
     def __init__(self, workspace: str, lakehouse_name: str, schema: str = "dbo",
                  sql_folder: Optional[str] = None, compaction_threshold: int = 10,
-                 scan_all_schemas: bool = False):
+                 scan_all_schemas: bool = False, storage_account: str = "onelake"):
         self.workspace = workspace
         self.lakehouse_name = lakehouse_name
         self.schema = schema
         self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
         self.scan_all_schemas = scan_all_schemas
-        self.table_base_url = f'abfss://{workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
+        self.storage_account = storage_account
+        self.table_base_url = f'abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/Tables/'
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
         self._attach_lakehouse()
 
     @classmethod
     def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
-                compaction_threshold: int = 100):
+                compaction_threshold: int = 100, storage_account: str = "onelake"):
         """
         Create and connect to lakehouse.
 
@@ -219,11 +76,13 @@ class Duckrun:
             connection_string: OneLake path "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
             sql_folder: Optional path or URL to SQL files folder
             compaction_threshold: File count threshold for compaction
+            storage_account: Storage account name (default: "onelake")
 
         Examples:
             dr = Duckrun.connect("ws/lh.lakehouse/schema", sql_folder="./sql")
             dr = Duckrun.connect("ws/lh.lakehouse/schema")  # no SQL folder
             dr = Duckrun.connect("ws/lh.lakehouse")  # defaults to dbo schema
+            dr = Duckrun.connect("ws/lh.lakehouse", storage_account="xxx-onelake")  # custom storage
         """
         print("Connecting to Lakehouse...")
 
@@ -261,7 +120,7 @@ class Duckrun:
                 "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo"
             )
 
-        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
+        return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)
 
     def _get_storage_token(self):
         return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
@@ -295,7 +154,7 @@ class Duckrun:
             token = token_obj.token
             os.environ["AZURE_STORAGE_TOKEN"] = token
 
-        url = f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/"
+        url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
         store = AzureStore.from_url(url, bearer_token=token)
 
         base_path = f"{self.lakehouse_name}.Lakehouse/Tables/"
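
The new `storage_account` parameter threads from `connect()` through `__init__` into every abfss URL the class builds; previously the `onelake` host was hardcoded in these f-strings. A small sketch of the effect, with placeholder workspace and lakehouse names:

```python
# Sketch of how storage_account shapes the OneLake Tables URL built in __init__.
import duckrun

dr = duckrun.connect("ws/lh.lakehouse/dbo", storage_account="xxx-onelake")

# Expected form of the Tables base URL:
#   abfss://ws@xxx-onelake.dfs.fabric.microsoft.com/lh.Lakehouse/Tables/
print(dr.table_base_url)
```
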
@@ -384,178 +243,6 @@ class Duckrun:
             print(f"❌ Error attaching lakehouse: {e}")
             print("Continuing without pre-attached tables.")
 
-    def _normalize_table_name(self, name: str) -> str:
-        """Extract base table name before first '__'"""
-        return name.split('__', 1)[0] if '__' in name else name
-
-    def _read_sql_file(self, table_name: str, params: Optional[Dict] = None) -> Optional[str]:
-        if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot read SQL files.")
-
-        is_url = self.sql_folder.startswith("http")
-        if is_url:
-            url = f"{self.sql_folder.rstrip('/')}/{table_name}.sql".strip()
-            try:
-                resp = requests.get(url)
-                resp.raise_for_status()
-                content = resp.text
-            except Exception as e:
-                print(f"Failed to fetch SQL from {url}: {e}")
-                return None
-        else:
-            path = os.path.join(self.sql_folder, f"{table_name}.sql")
-            try:
-                with open(path, 'r') as f:
-                    content = f.read()
-            except Exception as e:
-                print(f"Failed to read SQL file {path}: {e}")
-                return None
-
-        if not content.strip():
-            print(f"SQL file is empty: {table_name}.sql")
-            return None
-
-        full_params = {
-            'ws': self.workspace,
-            'lh': self.lakehouse_name,
-            'schema': self.schema
-        }
-        if params:
-            full_params.update(params)
-
-        try:
-            template = Template(content)
-            content = template.substitute(full_params)
-        except KeyError as e:
-            print(f"Missing parameter in SQL file: ${e}")
-            return None
-        except Exception as e:
-            print(f"Error during SQL template substitution: {e}")
-            return None
-
-        return content
-
-    def _load_py_function(self, name: str) -> Optional[Callable]:
-        if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot load Python functions.")
-
-        is_url = self.sql_folder.startswith("http")
-        try:
-            if is_url:
-                url = f"{self.sql_folder.rstrip('/')}/{name}.py".strip()
-                resp = requests.get(url)
-                resp.raise_for_status()
-                code = resp.text
-                namespace = {}
-                exec(code, namespace)
-                func = namespace.get(name)
-                return func if callable(func) else None
-            else:
-                path = os.path.join(self.sql_folder, f"{name}.py")
-                if not os.path.isfile(path):
-                    print(f"Python file not found: {path}")
-                    return None
-                spec = importlib.util.spec_from_file_location(name, path)
-                mod = importlib.util.module_from_spec(spec)
-                spec.loader.exec_module(mod)
-                func = getattr(mod, name, None)
-                return func if callable(func) else None
-        except Exception as e:
-            print(f"Error loading Python function '{name}': {e}")
-            return None
-
-    def _run_python(self, name: str, args: tuple) -> Any:
-        """Execute Python task, return result"""
-        self._create_onelake_secret()
-        func = self._load_py_function(name)
-        if not func:
-            raise RuntimeError(f"Python function '{name}' not found")
-
-        print(f"Running Python: {name}{args}")
-        result = func(*args)
-        print(f"✅ Python '{name}' completed")
-        return result
-
-    def _run_sql(self, table: str, mode: str, params: Dict, delta_options: Dict = None) -> str:
-        """Execute SQL task, write to Delta, return normalized table name"""
-        self._create_onelake_secret()
-
-        if mode not in {'overwrite', 'append', 'ignore'}:
-            raise ValueError(f"Invalid mode '{mode}'. Use: overwrite, append, or ignore")
-
-        sql = self._read_sql_file(table, params)
-        if sql is None:
-            raise RuntimeError(f"Failed to read SQL file for '{table}'")
-
-        normalized_table = self._normalize_table_name(table)
-        path = f"{self.table_base_url}{self.schema}/{normalized_table}"
-
-        # Extract Delta Lake specific options from delta_options
-        delta_options = delta_options or {}
-        merge_schema = delta_options.get('mergeSchema')
-        schema_mode = 'merge' if str(merge_schema).lower() in ('true', '1') else None
-        partition_by = delta_options.get('partitionBy') or delta_options.get('partition_by')
-
-        if mode == 'overwrite':
-            self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
-            df = self.con.sql(sql).record_batch()
-
-            write_args = _build_write_deltalake_args(
-                path, df, 'overwrite',
-                schema_mode=schema_mode,
-                partition_by=partition_by
-            )
-            write_deltalake(**write_args)
-
-            self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
-            dt = DeltaTable(path)
-            dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
-            dt.cleanup_metadata()
-
-        elif mode == 'append':
-            df = self.con.sql(sql).record_batch()
-
-            write_args = _build_write_deltalake_args(
-                path, df, 'append',
-                schema_mode=schema_mode,
-                partition_by=partition_by
-            )
-            write_deltalake(**write_args)
-
-            self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
-            dt = DeltaTable(path)
-            if len(dt.file_uris()) > self.compaction_threshold:
-                print(f"Compacting {normalized_table} ({len(dt.file_uris())} files)")
-                dt.optimize.compact()
-                dt.vacuum(dry_run=False)
-                dt.cleanup_metadata()
-
-        elif mode == 'ignore':
-            try:
-                DeltaTable(path)
-                print(f"Table {normalized_table} exists. Skipping (mode='ignore')")
-            except Exception:
-                print(f"Table {normalized_table} doesn't exist. Creating...")
-                self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
-                df = self.con.sql(sql).record_batch()
-
-                write_args = _build_write_deltalake_args(
-                    path, df, 'overwrite',
-                    schema_mode=schema_mode,
-                    partition_by=partition_by
-                )
-                write_deltalake(**write_args)
-
-                self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
-                dt = DeltaTable(path)
-                dt.vacuum(dry_run=False)
-                dt.cleanup_metadata()
-
-        engine_info = f" (engine=rust, schema_mode=merge)" if schema_mode == 'merge' else " (engine=pyarrow)"
-        partition_info = f" partitioned by {partition_by}" if partition_by else ""
-        print(f"✅ SQL '{table}' → '{normalized_table}' ({mode}){engine_info}{partition_info}")
-        return normalized_table
-
     def run(self, pipeline: List[Tuple]) -> bool:
         """
         Execute pipeline of tasks.
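
The deleted `_read_sql_file` resolved `.sql` files from `sql_folder` and substituted `$ws`, `$lh`, `$schema` plus any task-level params via `string.Template` before writing; that logic now moves out to `duckrun/runner.py`. A standalone sketch of the same substitution, with made-up SQL text and a made-up `$run_date` parameter:

```python
# Sketch of the $-parameter substitution the removed _read_sql_file performed.
from string import Template

sql_text = (
    "SELECT * FROM source_price "
    "WHERE workspace = '$ws' AND lakehouse = '$lh' "
    "AND schema_name = '$schema' AND date = '$run_date'"
)

full_params = {"ws": "tmp", "lh": "data", "schema": "aemo"}  # always injected
full_params.update({"run_date": "2024-01-01"})               # task-level params

print(Template(sql_text).substitute(full_params))
# substitute() raises KeyError for any $placeholder without a value,
# which the removed code reported as a missing parameter.
```
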
@@ -569,59 +256,7 @@ class Duckrun:
             True if all tasks succeeded
             False if any task failed (exception) or Python task returned 0 (early exit)
         """
-        if self.sql_folder is None:
-            raise RuntimeError("sql_folder is not configured. Cannot run pipelines.")
-
-        for i, task in enumerate(pipeline, 1):
-            print(f"\n{'='*60}")
-            print(f"Task {i}/{len(pipeline)}: {task[0]}")
-            print('='*60)
-
-            try:
-                result = None
-
-                if len(task) == 2:
-                    name, second = task
-                    if isinstance(second, str) and second in {'overwrite', 'append', 'ignore'}:
-                        result = self._run_sql(name, second, {}, {})
-                    else:
-                        args = second if isinstance(second, (tuple, list)) else (second,)
-                        result = self._run_python(name, tuple(args))
-
-                elif len(task) == 3:
-                    table, mode, params = task
-                    if not isinstance(params, dict):
-                        raise ValueError(f"Expected dict for params, got {type(params)}")
-                    result = self._run_sql(table, mode, params, {})
-
-                elif len(task) == 4:
-                    table, mode, params, delta_options = task
-                    if not isinstance(params, dict):
-                        raise ValueError(f"Expected dict for SQL params, got {type(params)}")
-                    if not isinstance(delta_options, dict):
-                        raise ValueError(f"Expected dict for Delta options, got {type(delta_options)}")
-                    result = self._run_sql(table, mode, params, delta_options)
-
-                else:
-                    raise ValueError(f"Invalid task format: {task}")
-
-                # Check if Python task returned 0 (early exit condition)
-                # Only check for Python tasks as SQL tasks return table names (strings) and only stop on exceptions
-                if (len(task) == 2 and
-                    not isinstance(task[1], str) and
-                    result == 0):
-                    print(f"\n⏹️ Python task {i} returned 0 - stopping pipeline execution")
-                    print(f"   Remaining tasks ({len(pipeline) - i}) will not be executed")
-                    return False
-
-            except Exception as e:
-                print(f"\n❌ Task {i} failed: {e}")
-                return False
-
-        print(f"\n{'='*60}")
-        print("✅ All tasks completed successfully")
-        print('='*60)
-        return True
+        return _run(self, pipeline)
 
     def copy(self, local_folder: str, remote_folder: str,
              file_extensions: Optional[List[str]] = None,
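
`run()` now simply delegates to `duckrun.runner.run`, but the task shapes documented by the removed body still apply: 2-tuples for SQL `(name, mode)` or Python `(name, args)` tasks, 3-tuples adding SQL params, and 4-tuples adding Delta options. An illustrative pipeline, with placeholder table, function, and parameter names:

```python
# Sketch of the task tuples the old run() body accepted.
import duckrun

dr = duckrun.connect("ws/lh.lakehouse/dbo", sql_folder="./sql")

pipeline = [
    ("staging_orders", "overwrite"),                    # SQL task: (name, mode)
    ("orders", "append", {"run_date": "2024-01-01"}),   # SQL task with $-params
    ("orders__daily", "append", {},                     # SQL task with Delta options;
     {"mergeSchema": "true",                            # table name normalizes to 'orders'
      "partitionBy": ["order_date"]}),
    ("notify", ("orders",)),                            # Python task: (name, args)
]

ok = dr.run(pipeline)  # True on success; False on failure or a Python task returning 0
```
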
@@ -648,98 +283,7 @@ class Duckrun:
             # Upload with overwrite enabled
             dr.copy("./backup", "backups", overwrite=True)
         """
-        if not os.path.exists(local_folder):
-            print(f"❌ Local folder not found: {local_folder}")
-            return False
-
-        if not os.path.isdir(local_folder):
-            print(f"❌ Path is not a directory: {local_folder}")
-            return False
-
-        # Get Azure token
-        token = self._get_storage_token()
-        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
-            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-            token_obj = credential.get_token("https://storage.azure.com/.default")
-            token = token_obj.token
-            os.environ["AZURE_STORAGE_TOKEN"] = token
-
-        # Setup OneLake Files URL (not Tables)
-        files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
-        store = AzureStore.from_url(files_base_url, bearer_token=token)
-
-        # Collect files to upload
-        files_to_upload = []
-        for root, dirs, files in os.walk(local_folder):
-            for file in files:
-                local_file_path = os.path.join(root, file)
-
-                # Filter by extensions if specified
-                if file_extensions:
-                    _, ext = os.path.splitext(file)
-                    if ext.lower() not in [e.lower() for e in file_extensions]:
-                        continue
-
-                # Calculate relative path from local_folder
-                rel_path = os.path.relpath(local_file_path, local_folder)
-
-                # Build remote path in OneLake Files (remote_folder is now mandatory)
-                remote_path = f"{remote_folder.strip('/')}/{rel_path}".replace("\\", "/")
-
-                files_to_upload.append((local_file_path, remote_path))
-
-        if not files_to_upload:
-            print(f"No files found to upload in {local_folder}")
-            if file_extensions:
-                print(f"   (filtered by extensions: {file_extensions})")
-            return True
-
-        print(f"📁 Uploading {len(files_to_upload)} files from '{local_folder}' to OneLake Files...")
-        print(f"   Target folder: {remote_folder}")
-
-        uploaded_count = 0
-        failed_count = 0
-
-        for local_path, remote_path in files_to_upload:
-            try:
-                # Check if file exists (if not overwriting)
-                if not overwrite:
-                    try:
-                        obs.head(store, remote_path)
-                        print(f"   ⏭ Skipped (exists): {remote_path}")
-                        continue
-                    except Exception:
-                        # File doesn't exist, proceed with upload
-                        pass
-
-                # Read local file
-                with open(local_path, 'rb') as f:
-                    file_data = f.read()
-
-                # Upload to OneLake Files
-                obs.put(store, remote_path, file_data)
-
-                file_size = len(file_data)
-                size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
-                size_unit = "MB" if file_size > 1024*1024 else "KB"
-
-                print(f"   ✓ Uploaded: {local_path} → {remote_path} ({size_mb:.1f} {size_unit})")
-                uploaded_count += 1
-
-            except Exception as e:
-                print(f"   ❌ Failed: {local_path} → {remote_path} | Error: {str(e)[:100]}")
-                failed_count += 1
-
-        print(f"\n{'='*60}")
-        if failed_count == 0:
-            print(f"✅ Successfully uploaded all {uploaded_count} files to OneLake Files")
-        else:
-            print(f"⚠ Uploaded {uploaded_count} files, {failed_count} failed")
-        print(f"{'='*60}")
-
-        return failed_count == 0
+        return _copy(self, local_folder, remote_folder, file_extensions, overwrite)
 
     def download(self, remote_folder: str = "", local_folder: str = "./downloaded_files",
                  file_extensions: Optional[List[str]] = None,
@@ -758,110 +302,12 @@ class Duckrun:
 
         Examples:
             # Download all files from OneLake Files root
-            dr.download_from_files()
+            dr.download()
 
             # Download only CSV files from a specific subfolder
-            dr.download_from_files("daily_reports", "./reports", ['.csv'])
+            dr.download("daily_reports", "./reports", ['.csv'])
         """
-        # Get Azure token
-        token = self._get_storage_token()
-        if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
-            from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
-            credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
-            token_obj = credential.get_token("https://storage.azure.com/.default")
-            token = token_obj.token
-            os.environ["AZURE_STORAGE_TOKEN"] = token
-
-        # Setup OneLake Files URL (not Tables)
-        files_base_url = f'abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Files/'
-        store = AzureStore.from_url(files_base_url, bearer_token=token)
-
-        # Create local directory
-        os.makedirs(local_folder, exist_ok=True)
-
-        # List files in OneLake Files
-        print(f"📁 Discovering files in OneLake Files...")
-        if remote_folder:
-            print(f"   Source folder: {remote_folder}")
-            prefix = f"{remote_folder.strip('/')}/"
-        else:
-            prefix = ""
-
-        try:
-            list_stream = obs.list(store, prefix=prefix)
-            files_to_download = []
-
-            for batch in list_stream:
-                for obj in batch:
-                    remote_path = obj["path"]
-
-                    # Filter by extensions if specified
-                    if file_extensions:
-                        _, ext = os.path.splitext(remote_path)
-                        if ext.lower() not in [e.lower() for e in file_extensions]:
-                            continue
-
-                    # Calculate local path
-                    if remote_folder:
-                        rel_path = os.path.relpath(remote_path, remote_folder.strip('/'))
-                    else:
-                        rel_path = remote_path
-
-                    local_path = os.path.join(local_folder, rel_path).replace('/', os.sep)
-                    files_to_download.append((remote_path, local_path))
-
-            if not files_to_download:
-                print(f"No files found to download")
-                if file_extensions:
-                    print(f"   (filtered by extensions: {file_extensions})")
-                return True
-
-            print(f"📥 Downloading {len(files_to_download)} files to '{local_folder}'...")
-
-            downloaded_count = 0
-            failed_count = 0
-
-            for remote_path, local_path in files_to_download:
-                try:
-                    # Check if local file exists (if not overwriting)
-                    if not overwrite and os.path.exists(local_path):
-                        print(f"   ⏭ Skipped (exists): {local_path}")
-                        continue
-
-                    # Ensure local directory exists
-                    os.makedirs(os.path.dirname(local_path), exist_ok=True)
-
-                    # Download file
-                    data = obs.get(store, remote_path).bytes()
-
-                    # Write to local file
-                    with open(local_path, 'wb') as f:
-                        f.write(data)
-
-                    file_size = len(data)
-                    size_mb = file_size / (1024 * 1024) if file_size > 1024*1024 else file_size / 1024
-                    size_unit = "MB" if file_size > 1024*1024 else "KB"
-
-                    print(f"   ✓ Downloaded: {remote_path} → {local_path} ({size_mb:.1f} {size_unit})")
-                    downloaded_count += 1
-
-                except Exception as e:
-                    print(f"   ❌ Failed: {remote_path} → {local_path} | Error: {str(e)[:100]}")
-                    failed_count += 1
-
-            print(f"\n{'='*60}")
-            if failed_count == 0:
-                print(f"✅ Successfully downloaded all {downloaded_count} files from OneLake Files")
-            else:
-                print(f"⚠ Downloaded {downloaded_count} files, {failed_count} failed")
-            print(f"{'='*60}")
-
-            return failed_count == 0
-
-        except Exception as e:
-            print(f"❌ Error listing files from OneLake: {e}")
-            return False
+        return _download(self, remote_folder, local_folder, file_extensions, overwrite)
 
     def sql(self, query: str):
         """
@@ -879,6 +325,34 @@ class Duckrun:
         """Get underlying DuckDB connection"""
         return self.con
 
+    def get_stats(self, source: str):
+        """
+        Get comprehensive statistics for Delta Lake tables.
+
+        Args:
+            source: Can be one of:
+                - Table name: 'table_name' (uses current schema)
+                - Schema.table: 'schema.table_name' (specific table in schema)
+                - Schema only: 'schema' (all tables in schema)
+
+        Returns:
+            Arrow table with statistics including total rows, file count, row groups,
+            average row group size, file sizes, VORDER status, and timestamp
+
+        Examples:
+            con = duckrun.connect("tmp/data.lakehouse/aemo")
+
+            # Single table in current schema
+            stats = con.get_stats('price')
+
+            # Specific table in different schema
+            stats = con.get_stats('aemo.price')
+
+            # All tables in a schema
+            stats = con.get_stats('aemo')
+        """
+        return _get_stats(self, source)
+
 
     def close(self):
         """Close DuckDB connection"""