duckrun-0.2.13-py3-none-any.whl → duckrun-0.2.19.dev1-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.

This version of duckrun has been flagged as potentially problematic.

duckrun/core.py CHANGED
@@ -4,20 +4,82 @@ import os
  import importlib.util
  import json
  import time
- from deltalake import DeltaTable, write_deltalake
  from typing import List, Tuple, Union, Optional, Callable, Dict, Any
  from string import Template
- import obstore as obs
- from obstore.store import AzureStore
  from datetime import datetime
  from .stats import get_stats as _get_stats
  from .runner import run as _run
  from .files import copy as _copy, download as _download
  from .writer import QueryResult

- class Duckrun:
+
+ class WorkspaceOperationsMixin:
+ """
+ Mixin class for workspace-level operations that work for both
+ full Duckrun connections and workspace-only connections.
  """
- Lakehouse task runner with clean tuple-based API.
+
+ def import_notebook_from_web(self, url: str,
+ notebook_name: Optional[str] = None,
+ overwrite: bool = False) -> dict:
+ """
+ Import a Jupyter notebook from a web URL into the workspace.
+
+ Args:
+ url: URL to the notebook file (e.g., GitHub raw URL). Required.
+ notebook_name: Name for the imported notebook. Optional - derived from URL if not provided.
+ overwrite: Whether to overwrite if notebook already exists (default: False)
+
+ Returns:
+ Dictionary with import result
+
+ Examples:
+ con = duckrun.connect("workspace/lakehouse.lakehouse")
+ result = con.import_notebook_from_web(
+ url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+ )
+
+ ws = duckrun.connect("workspace")
+ result = ws.import_notebook_from_web(
+ url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+ )
+ """
+ from .notebook import import_notebook_from_web as _import_notebook_from_web
+
+ # Get workspace name from either self.workspace or self.workspace_name
+ workspace_name = getattr(self, 'workspace', None) or getattr(self, 'workspace_name', None)
+
+ return _import_notebook_from_web(
+ url=url,
+ notebook_name=notebook_name,
+ overwrite=overwrite,
+ workspace_name=workspace_name
+ )
+
+ def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+ """Helper method to get workspace ID from name"""
+ try:
+ url = "https://api.fabric.microsoft.com/v1/workspaces"
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+ response = requests.get(url, headers=headers)
+ response.raise_for_status()
+
+ workspaces = response.json().get("value", [])
+ for workspace in workspaces:
+ if workspace.get("displayName") == workspace_name:
+ return workspace.get("id")
+
+ return None
+
+ except Exception:
+ return None
+
+
+ class Duckrun(WorkspaceOperationsMixin):
+ """
+ OneLake task runner with clean tuple-based API.
+ Supports lakehouses, warehouses, databases, and other OneLake items.
  Powered by DuckDB for fast data processing.

  Task formats:
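The new mixin gives full connections and workspace-only connections the same notebook-import entry point. A minimal usage sketch mirroring the docstring examples above (the raw GitHub URL is a placeholder):

    import duckrun

    # Full item connection and workspace-only connection both inherit the mixin
    con = duckrun.connect("workspace/lakehouse.lakehouse")
    con.import_notebook_from_web(
        url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
        notebook_name="imported_notebook",  # optional; derived from the URL if omitted
        overwrite=True,
    )

    ws = duckrun.connect("workspace")
    ws.import_notebook_from_web(url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb")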
@@ -30,6 +92,10 @@ class Duckrun:
  dr = Duckrun.connect("workspace/lakehouse.lakehouse") # defaults to dbo schema, lists all tables
  dr.run(pipeline)

+ # For other OneLake items:
+ dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
+ dr = Duckrun.connect("workspace/warehouse.Warehouse")
+
  # For data exploration with Spark-style API:
  dr = Duckrun.connect("workspace/lakehouse.lakehouse")
  dr.sql("SELECT * FROM table").show()
@@ -53,7 +119,8 @@ class Duckrun:

  def __init__(self, workspace_id: str, lakehouse_id: str, schema: str = "dbo",
  sql_folder: Optional[str] = None, compaction_threshold: int = 10,
- scan_all_schemas: bool = False, storage_account: str = "onelake"):
+ scan_all_schemas: bool = False, storage_account: str = "onelake",
+ token_only: bool = False):
  # Store GUIDs for internal use
  self.workspace_id = workspace_id
  self.lakehouse_id = lakehouse_id
@@ -62,25 +129,55 @@ class Duckrun:
  self.compaction_threshold = compaction_threshold
  self.scan_all_schemas = scan_all_schemas
  self.storage_account = storage_account
+ self.token_only = token_only

- # Construct proper ABFSS URLs
+ # Store both full name (with .ItemType) and display name (without .ItemType) for backward compatibility
+ # lakehouse_id: Full name with suffix for API calls (e.g., "data.Lakehouse")
+ # lakehouse_display_name: Name only without suffix for user code/templates (e.g., "data")
+ self.lakehouse_id = lakehouse_id
+
+ # Extract display name (remove .ItemType suffix if present)
  import re
+ # Check if lakehouse_id has .ItemType suffix
+ if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', lakehouse_id, re.IGNORECASE):
+ # Friendly name - extract base name without suffix
+ for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
+ if lakehouse_id.endswith(suffix):
+ self.lakehouse_display_name = lakehouse_id[:-len(suffix)]
+ break
+ else:
+ self.lakehouse_display_name = lakehouse_id
+ else:
+ # GUID - use as is
+ self.lakehouse_display_name = lakehouse_id
+
+ # Construct proper ABFSS URLs
+ # Format: abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{item}/Tables/
+ # where {workspace} and {item} can be:
+ # - Names with .lakehouse suffix (lakehouse optimization when no spaces in workspace)
+ # - GUIDs (when resolved via API for non-lakehouse items or items with spaces)
  guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
- # If lakehouse_id is a GUID, use as-is
+
+ # Determine the item URL part for ABFSS
  if guid_pattern.match(lakehouse_id):
- lakehouse_url_part = lakehouse_id
+ # Already a GUID - use as-is (from API resolution)
+ item_url_part = lakehouse_id
  else:
- # If workspace name has no spaces, always append .lakehouse unless already present
- if " " not in workspace_id and not lakehouse_id.endswith('.lakehouse'):
- lakehouse_url_part = f'{lakehouse_id}.lakehouse'
- else:
- lakehouse_url_part = lakehouse_id
- self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Tables/'
- self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Files/'
+ # Friendly name - use as-is (already includes .ItemType suffix from connect())
+ item_url_part = lakehouse_id
+
+ self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Tables/'
+ self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Files/'

  # Keep legacy properties for backward compatibility
  self.workspace = workspace_id
- self.lakehouse_name = lakehouse_id
+ self.lakehouse_name = self.lakehouse_display_name # Use display name (without suffix) for backward compatibility
+
+ # Store display name without suffix for backward compatibility with user Python functions
+ # Extract base name by removing .ItemType suffix if present
+ import re
+ suffix_pattern = re.compile(r'\.(Lakehouse|Warehouse|Database|SnowflakeDatabase)$', re.IGNORECASE)
+ self.lakehouse_display_name = suffix_pattern.sub('', lakehouse_id)

  self.con = duckdb.connect()
  self.con.sql("SET preserve_insertion_order = false")
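The constructor now tracks two spellings of the item identifier: the full name with suffix for API calls, and a display name for user code. A sketch of the extraction logic under the same suffix list (display_name is a hypothetical helper, not part of the package):

    import re

    _GUID = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)

    def display_name(item_id: str) -> str:
        """Strip a trailing .ItemType suffix from friendly names; GUIDs pass through."""
        if _GUID.match(item_id):
            return item_id
        for suffix in ('.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase'):
            if item_id.endswith(suffix):
                return item_id[:-len(suffix)]
        return item_id

    assert display_name("data.Lakehouse") == "data"
    assert display_name("ONELAKEUSEAST.SnowflakeDatabase") == "ONELAKEUSEAST"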
@@ -93,25 +190,36 @@ class Duckrun:
  except ImportError:
  pass # Not in Colab, use default transport

- self._attach_lakehouse()
- self._register_lookup_functions()
+ # Only attach lakehouse and register functions if not token_only mode
+ if not token_only:
+ self._attach_lakehouse()
+ self._register_lookup_functions()
+ else:
+ # In token_only mode, just create the secret for authentication
+ self._create_onelake_secret()
+ print("✓ Token authenticated (fast mode - tables not listed)")

  @classmethod
  def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
- compaction_threshold: int = 100, storage_account: str = "onelake"):
+ compaction_threshold: int = 100, storage_account: str = "onelake",
+ token_only: bool = False):
  """
- Create and connect to lakehouse or workspace.
+ Create and connect to OneLake items (lakehouse, warehouse, database, etc.) or workspace.

  Smart detection based on connection string format:
  - "workspace" → workspace management only
- - "ws/lh.lakehouse/schema" → full lakehouse connection
- - "ws/lh.lakehouse" → lakehouse connection (defaults to dbo schema)
+ - "ws/item.lakehouse/schema" → lakehouse connection with specific schema
+ - "ws/item.lakehouse" → lakehouse connection (defaults to dbo schema)
+ - "ws/item.warehouse" → warehouse connection
+ - "ws/item.database" → database connection
+ - "ws/item.snowflakedatabase" → Snowflake database connection

  Args:
  connection_string: OneLake path or workspace name
  sql_folder: Optional path or URL to SQL files folder
  compaction_threshold: File count threshold for compaction
  storage_account: Storage account name (default: "onelake")
+ token_only: If True, only authenticate without listing tables (faster connection)

  Examples:
  # Workspace management only (supports spaces in names)
@@ -119,16 +227,26 @@ class Duckrun:
  ws.list_lakehouses()
  ws.create_lakehouse_if_not_exists("New Lakehouse")

- # Full lakehouse connections (supports spaces in names)
+ # Lakehouse connections (supports spaces in names)
  dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/schema", sql_folder="./sql")
  dr = Duckrun.connect("Data Workspace/Sales Data.lakehouse/analytics") # spaces supported
  dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse") # defaults to dbo schema
  dr = Duckrun.connect("workspace/lakehouse.lakehouse", storage_account="xxx-onelake") # custom storage

+ # Warehouse and database connections (always uses API to resolve GUIDs)
+ dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
+ dr = Duckrun.connect("My Workspace/My Warehouse.Warehouse")
+ dr = Duckrun.connect("workspace/database.Database")
+
+ # Fast connection without table listing (token only)
+ dr = Duckrun.connect("workspace/lakehouse.lakehouse", token_only=True)
+
  Note:
- Internally resolves friendly names (with spaces) to GUIDs and constructs proper ABFSS URLs:
- "My Workspace/My Lakehouse.lakehouse/schema" becomes
- "abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/lakehouse_guid/Tables/schema"
+ - Lakehouse items without spaces in workspace name use optimization (no API calls)
+ - Non-lakehouse items always resolve to GUIDs via Fabric API
+ - Internally constructs proper ABFSS URLs:
+ "My Workspace/My Item.lakehouse/schema" →
+ "abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/item_guid/Tables/schema"
  """

  # Check if it's a workspace-only connection (no "/" means workspace name only)
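Condensing the connect() docstring above into a usage sketch (workspace and item names are illustrative):

    import duckrun

    ws = duckrun.connect("My Workspace")                            # workspace management only
    dr = duckrun.connect("My Workspace/Sales.lakehouse/analytics")  # lakehouse, explicit schema
    wh = duckrun.connect("My Workspace/Reporting.Warehouse")        # warehouse, defaults to dbo
    fast = duckrun.connect("ws/lh.lakehouse", token_only=True)      # authenticate only, skip table listing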
@@ -137,70 +255,94 @@ class Duckrun:

  scan_all_schemas = False

- # Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
- # Support workspace and lakehouse names with spaces
+ # Parse connection string: "ws/item_name.item_type/schema" or "ws/item_name.item_type"
+ # Support workspace and item names with spaces
+ # Item types: .lakehouse, .Lakehouse, .warehouse, .Warehouse, .database, .Database, .snowflakedatabase, .SnowflakeDatabase
  parts = connection_string.split("/")
  if len(parts) == 2:
- workspace_name, lakehouse_name = parts
+ workspace_name, item_name_with_type = parts
  scan_all_schemas = True
  schema = "dbo"
  elif len(parts) == 3:
- workspace_name, lakehouse_name, schema = parts
+ workspace_name, item_name_with_type, schema = parts
  else:
  raise ValueError(
  f"Invalid connection string format: '{connection_string}'. "
  "Expected formats:\n"
  " 'workspace name' (workspace management only)\n"
- " 'workspace name/lakehouse name.lakehouse' (lakehouse with dbo schema)\n"
- " 'workspace name/lakehouse name.lakehouse/schema' (lakehouse with specific schema)"
+ " 'workspace name/item name.item_type' (item with dbo schema)\n"
+ " 'workspace name/item name.item_type/schema' (item with specific schema)\n"
+ "Supported item types: .lakehouse, .warehouse, .database, .snowflakedatabase (case-insensitive)"
  )

- if lakehouse_name.endswith(".lakehouse"):
- lakehouse_name = lakehouse_name[:-10]
+ # Extract item type and name
+ item_type = None
+ item_name = item_name_with_type
+
+ # Check for known item types (case-insensitive)
+ item_type_map = {
+ '.lakehouse': 'Lakehouse',
+ '.warehouse': 'Warehouse',
+ '.database': 'Database',
+ '.snowflakedatabase': 'SnowflakeDatabase'
+ }

- if not workspace_name or not lakehouse_name:
+ # Parse item type and normalize the suffix to proper case
+ item_name_normalized = item_name_with_type
+ for suffix, mapped_type in item_type_map.items():
+ if item_name_with_type.lower().endswith(suffix):
+ item_type = mapped_type
+ item_name = item_name_with_type[:-len(suffix)]
+ # Normalize to proper case: ItemName.ItemType (e.g., data.Lakehouse)
+ item_name_normalized = f"{item_name}.{mapped_type}"
+ break
+
+ if not workspace_name or not item_name:
  raise ValueError(
  "Missing required parameters. Use one of these formats:\n"
  " connect('workspace name') # workspace management\n"
- " connect('workspace name/lakehouse name.lakehouse/schema') # full lakehouse\n"
- " connect('workspace name/lakehouse name.lakehouse') # defaults to dbo"
+ " connect('workspace name/item name.item_type/schema') # full item connection\n"
+ " connect('workspace name/item name.item_type') # defaults to dbo"
  )

- # Resolve friendly names to GUIDs and construct proper ABFSS path
- workspace_id, lakehouse_id = cls._resolve_names_to_guids(workspace_name, lakehouse_name)
+ # Per OneLake API docs: Can use friendly names if no spaces/special characters
+ # Otherwise must resolve to GUIDs
+ # Check for spaces or special characters that would require GUID resolution
+ has_special_chars = " " in workspace_name or " " in item_name
+
+ if has_special_chars:
+ # Names have spaces/special chars: resolve to GUIDs via API
+ workspace_id, item_id = cls._resolve_names_to_guids(workspace_name, item_name, item_type)
+ else:
+ # No spaces/special chars: use friendly names directly (works for all item types)
+ # Use normalized name with proper case for API compatibility
+ workspace_id = workspace_name
+ item_id = item_name_normalized # Use normalized with proper case

- return cls(workspace_id, lakehouse_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)
+ return cls(workspace_id, item_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account, token_only)

  @classmethod
- def _resolve_names_to_guids(cls, workspace_name: str, lakehouse_name: str) -> tuple[str, str]:
+ def _resolve_names_to_guids(cls, workspace_name: str, item_name: str, item_type: Optional[str] = 'Lakehouse') -> tuple[str, str]:
  """
- Resolve friendly workspace and lakehouse names to their GUIDs.
-
- Optimization: If names don't contain spaces, use them directly (no API calls needed).
- Only resolve to GUIDs when names contain spaces or are already GUIDs.
+ Resolve friendly workspace and item names to their GUIDs.

  Args:
  workspace_name: Display name of the workspace (can contain spaces)
- lakehouse_name: Display name of the lakehouse (can contain spaces)
+ item_name: Display name of the item (can contain spaces)
+ item_type: Type of item - 'Lakehouse', 'Warehouse', 'Database', 'SnowflakeDatabase', etc.

  Returns:
- Tuple of (workspace_id, lakehouse_id) - either resolved GUIDs or original names
+ Tuple of (workspace_id, item_id) - resolved GUIDs
  """

  # Check if names are already GUIDs first
  import re
  guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)

- if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
- return workspace_name, lakehouse_name
-
- # Optimization: If workspace name has no spaces, use both names directly (old behavior)
- # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
- if " " not in workspace_name:
- return workspace_name, lakehouse_name
-
- # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
+ if guid_pattern.match(workspace_name) and guid_pattern.match(item_name):
+ return workspace_name, item_name

+ # Need to resolve to GUIDs via API
  try:
  # Get authentication token using enhanced auth system
  from .auth import get_fabric_api_token
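The suffix handling above normalizes case before any API call; reduced to a standalone sketch (split_item is a hypothetical helper mirroring the item_type_map logic):

    ITEM_TYPE_MAP = {
        '.lakehouse': 'Lakehouse',
        '.warehouse': 'Warehouse',
        '.database': 'Database',
        '.snowflakedatabase': 'SnowflakeDatabase',
    }

    def split_item(item_with_type: str):
        """Return (name, item_type, normalized) for the item segment of a connection string."""
        for suffix, proper in ITEM_TYPE_MAP.items():
            if item_with_type.lower().endswith(suffix):
                name = item_with_type[:-len(suffix)]
                return name, proper, f"{name}.{proper}"
        return item_with_type, None, item_with_type

    print(split_item("ONELAKEUSEAST.snowflakedatabase"))
    # -> ('ONELAKEUSEAST', 'SnowflakeDatabase', 'ONELAKEUSEAST.SnowflakeDatabase')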
@@ -218,8 +360,7 @@ class Duckrun:

  # Resolve workspace name to ID
  if current_workspace_id:
- # In notebook environment, we could use current workspace ID
- # but we should validate it matches the requested workspace name
+ # In notebook environment, validate it matches the requested workspace name
  workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
  if not workspace_id:
  # Fallback to current workspace if name resolution fails
@@ -231,21 +372,26 @@ class Duckrun:
  if not workspace_id:
  raise ValueError(f"Workspace '{workspace_name}' not found")

- # Resolve lakehouse name to ID (required for ABFSS URLs with spaces)
- lakehouse_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, lakehouse_name)
- if not lakehouse_id:
- raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
+ # Resolve item name to ID based on item type
+ if item_type == 'Lakehouse':
+ item_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, item_name)
+ else:
+ # Use generic item resolver for non-lakehouse items
+ item_id = cls._resolve_item_id_by_name(token, workspace_id, item_name, item_type)
+
+ if not item_id:
+ raise ValueError(f"{item_type} '{item_name}' not found in workspace '{workspace_name}'")

- return workspace_id, lakehouse_id
+ return workspace_id, item_id

  except Exception as e:
  print(f"❌ Failed to resolve names to GUIDs: {e}")
- print(f"❌ Cannot use friendly names with spaces '{workspace_name}'/'{lakehouse_name}' in ABFSS URLs without GUID resolution")
- print("❌ Microsoft Fabric requires actual workspace and lakehouse GUIDs for ABFSS access when names contain spaces")
+ print(f"❌ Cannot resolve '{workspace_name}'/'{item_name}' ({item_type}) to GUIDs")
+ print("❌ Microsoft Fabric requires actual workspace and item GUIDs for ABFSS access")
  raise ValueError(
- f"Unable to resolve workspace '{workspace_name}' and lakehouse '{lakehouse_name}' to GUIDs. "
- f"ABFSS URLs require actual GUIDs when names contain spaces. "
- f"Please ensure you have proper authentication and the workspace/lakehouse names are correct."
+ f"Unable to resolve workspace '{workspace_name}' and {item_type.lower()} '{item_name}' to GUIDs. "
+ f"ABFSS URLs require actual GUIDs. "
+ f"Please ensure you have proper authentication and the workspace/item names are correct."
  )

  @classmethod
@@ -287,6 +433,58 @@ class Duckrun:
  return None
  except Exception:
  return None
+
+ @classmethod
+ def _resolve_item_id_by_name(cls, token: str, workspace_id: str, item_name: str, item_type: str) -> Optional[str]:
+ """
+ Get item ID from display name within a workspace using generic items API.
+ Works for any item type: Warehouse, Database, SnowflakeDatabase, etc.
+
+ Args:
+ token: Fabric API authentication token
+ workspace_id: Workspace GUID
+ item_name: Display name of the item
+ item_type: Type of item (e.g., 'Warehouse', 'Database', 'SnowflakeDatabase')
+
+ Returns:
+ Item GUID if found, None otherwise
+ """
+ try:
+ import requests
+ # Use generic items API with type filter
+ url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items"
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+ # Add type filter as query parameter
+ params = {"type": item_type}
+
+ print(f" Searching for {item_type} '{item_name}' in workspace {workspace_id}")
+ print(f" API URL: {url}?type={item_type}")
+
+ response = requests.get(url, headers=headers, params=params)
+ response.raise_for_status()
+
+ result = response.json()
+ items = result.get("value", [])
+
+ print(f" Found {len(items)} items of type {item_type}")
+ if items:
+ print(f" Available items: {[item.get('displayName') for item in items]}")
+
+ for item in items:
+ if item.get("displayName") == item_name:
+ item_id = item.get("id")
+ print(f" Found matching item: {item_name} -> {item_id}")
+ return item_id
+
+ print(f" Item '{item_name}' not found in the list")
+ return None
+ except Exception as e:
+ print(f" Error resolving {item_type} item: {e}")
+ if hasattr(e, 'response') and e.response is not None:
+ print(f" Response status: {e.response.status_code}")
+ print(f" Response body: {e.response.text}")
+ return None

  @classmethod
  def connect_workspace(cls, workspace_name: str):
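Stripped of its diagnostic prints, the resolver above is a single filtered GET against the Fabric items endpoint; a sketch assuming a valid bearer token:

    import requests
    from typing import Optional

    def find_item_id(token: str, workspace_id: str, item_name: str, item_type: str) -> Optional[str]:
        url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items"
        resp = requests.get(url,
                            headers={"Authorization": f"Bearer {token}"},
                            params={"type": item_type})  # e.g. "Warehouse", "SnowflakeDatabase"
        resp.raise_for_status()
        for item in resp.json().get("value", []):
            if item.get("displayName") == item_name:
                return item.get("id")
        return None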
@@ -328,77 +526,138 @@ class Duckrun:

  def _discover_tables_fast(self) -> List[Tuple[str, str]]:
  """
- Fast Delta table discovery using obstore with list_with_delimiter.
- Only lists directories, not files - super fast!
+ Fast table discovery using OneLake Delta Table API (Unity Catalog compatible).
+ Uses: https://learn.microsoft.com/en-us/fabric/onelake/table-apis/delta-table-apis-overview

  Returns:
  List of tuples: [(schema, table_name), ...]
  """
- token = self._get_storage_token()
- if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
- print("Authenticating with Azure for table discovery (detecting environment automatically)...")
- from .auth import get_token
- token = get_token()
- if not token:
- print("❌ Failed to authenticate for table discovery")
- return []
-
- url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
- store = AzureStore.from_url(url, bearer_token=token)
-
- # Use the same lakehouse URL part logic as in __init__ to ensure .lakehouse suffix is added when needed
- import re
- guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
- if guid_pattern.match(self.lakehouse_id):
- lakehouse_url_part = self.lakehouse_id
- else:
- # If workspace name has no spaces, always append .lakehouse unless already present
- if " " not in self.workspace_id and not self.lakehouse_id.endswith('.lakehouse'):
- lakehouse_url_part = f'{self.lakehouse_id}.lakehouse'
+ try:
+ # Get storage token for OneLake
+ token = self._get_storage_token()
+ if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+ print("Authenticating with Azure for table discovery...")
+ from .auth import get_token
+ token = get_token()
+ if not token:
+ print("❌ Failed to authenticate for table discovery")
+ return []
+
+ # OneLake Delta Table API endpoint (Unity Catalog compatible)
+ base_url = "https://onelake.table.fabric.microsoft.com/delta"
+
+ # Determine workspace/item identifier for API
+ # Per docs: Can use friendly names (WorkspaceName/ItemName.ItemType) if no special characters
+ # Otherwise must use GUIDs (WorkspaceID/ItemID)
+ import re
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+
+ # Check if we're using GUIDs or friendly names
+ if guid_pattern.match(self.workspace_id) and guid_pattern.match(self.lakehouse_id):
+ # Using GUIDs - use them directly in API
+ workspace_identifier = self.workspace_id
+ item_identifier = self.lakehouse_id
+ catalog_name = self.lakehouse_id
  else:
- lakehouse_url_part = self.lakehouse_id
-
- base_path = f"{lakehouse_url_part}/Tables/"
- tables_found = []
-
- if self.scan_all_schemas:
- # Discover all schemas first
- schemas_result = obs.list_with_delimiter(store, prefix=base_path)
- schemas = [
- prefix.rstrip('/').split('/')[-1]
- for prefix in schemas_result['common_prefixes']
- ]
-
- # Discover tables in each schema
- for schema_name in schemas:
- schema_path = f"{base_path}{schema_name}/"
- result = obs.list_with_delimiter(store, prefix=schema_path)
+ # Using friendly names - lakehouse_id already includes .ItemType suffix
+ workspace_identifier = self.workspace_id
+ item_identifier = self.lakehouse_id
+ catalog_name = self.lakehouse_id
+
+ print(f"🔍 Discovering tables via OneLake Delta Table API...")
+ print(f" Using identifier: {workspace_identifier}/{item_identifier}")
+
+ tables_found = []
+
+ if self.scan_all_schemas:
+ # First, list all schemas
+ schemas_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/schemas"
+ params = {"catalog_name": catalog_name}
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

- for table_prefix in result['common_prefixes']:
- table_name = table_prefix.rstrip('/').split('/')[-1]
- # Skip non-table directories
- if table_name not in ('metadata', 'iceberg'):
- tables_found.append((schema_name, table_name))
- else:
- # Scan specific schema only
- schema_path = f"{base_path}{self.schema}/"
- result = obs.list_with_delimiter(store, prefix=schema_path)
+ schemas_response = requests.get(schemas_url, headers=headers, params=params)
+
+ if schemas_response.status_code == 200:
+ schemas_result = schemas_response.json()
+ schemas = schemas_result.get("schemas", [])
+ schema_names = [s.get("name") for s in schemas if s.get("name")]
+
+ print(f" Found {len(schema_names)} schemas: {schema_names}")
+
+ # Get tables from each schema
+ for schema_name in schema_names:
+ tables_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/tables"
+ tables_params = {
+ "catalog_name": catalog_name,
+ "schema_name": schema_name
+ }
+
+ tables_response = requests.get(tables_url, headers=headers, params=tables_params)
+
+ if tables_response.status_code == 200:
+ tables_result = tables_response.json()
+ tables = tables_result.get("tables", [])
+
+ for table in tables:
+ table_name = table.get("name", "")
+ if table_name:
+ tables_found.append((schema_name, table_name))
+
+ if tables:
+ print(f" Schema '{schema_name}': {len(tables)} tables")
+ else:
+ print(f" Failed to list schemas: {schemas_response.status_code}")
+ if schemas_response.status_code != 404:
+ print(f" Response: {schemas_response.text[:300]}")
+ else:
+ # Single schema mode - list tables in specific schema
+ tables_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/tables"
+ params = {
+ "catalog_name": catalog_name,
+ "schema_name": self.schema
+ }
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+ print(f" Listing tables in schema: {self.schema}")
+ tables_response = requests.get(tables_url, headers=headers, params=params)
+
+ if tables_response.status_code == 200:
+ tables_result = tables_response.json()
+ tables = tables_result.get("tables", [])
+
+ for table in tables:
+ table_name = table.get("name", "")
+ if table_name:
+ tables_found.append((self.schema, table_name))
+
+ print(f" Found {len(tables)} tables")
+ elif tables_response.status_code == 404:
+ print(f" Schema '{self.schema}' not found or has no tables")
+ else:
+ print(f" Failed to list tables: {tables_response.status_code}")
+ print(f" Response: {tables_response.text[:300]}")

- for table_prefix in result['common_prefixes']:
- table_name = table_prefix.rstrip('/').split('/')[-1]
- if table_name not in ('metadata', 'iceberg'):
- tables_found.append((self.schema, table_name))
-
- return tables_found
+ return tables_found
+
+ except Exception as e:
+ print(f"❌ Error during table discovery: {e}")
+ import traceback
+ traceback.print_exc()
+ return []

  def _attach_lakehouse(self):
  """Attach lakehouse tables as DuckDB views using fast discovery"""
+ print(f"🔌 Attaching tables from schema: {self.schema if not self.scan_all_schemas else 'all schemas'}")
  self._create_onelake_secret()

  try:
  tables = self._discover_tables_fast()

  if not tables:
+ if self.scan_all_schemas:
+ print(f"⚠️ No tables found in any schema")
+ else:
+ print(f"⚠️ No tables found in {self.schema} schema")
  return

  # Collect table names for display
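For the single-schema path, the new discovery boils down to one Unity Catalog-style request; a sketch of the call shape, assuming a valid OneLake storage token:

    import requests

    BASE = "https://onelake.table.fabric.microsoft.com/delta"

    def list_tables(token: str, workspace: str, item: str, schema: str) -> list:
        # workspace/item may be friendly names ("ws", "lh.Lakehouse") or GUIDs, as above
        url = f"{BASE}/{workspace}/{item}/api/2.1/unity-catalog/tables"
        resp = requests.get(url,
                            headers={"Authorization": f"Bearer {token}"},
                            params={"catalog_name": item, "schema_name": schema})
        resp.raise_for_status()
        return [t["name"] for t in resp.json().get("tables", []) if t.get("name")]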
@@ -421,6 +680,7 @@ class Duckrun:
  AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
  """)
  except Exception as e:
+ print(f"⚠️ Failed to attach table {schema_name}.{table_name}: {e}")
  continue

  # Print discovered tables as comma-separated list
@@ -429,6 +689,8 @@ class Duckrun:

  except Exception as e:
  print(f"❌ Error attaching lakehouse: {e}")
+ import traceback
+ traceback.print_exc()

  def _register_lookup_functions(self):
  """
@@ -567,32 +829,129 @@ class Duckrun:

  # Register functions in DuckDB
  try:
- self.con.create_function("get_workspace_name", get_workspace_name)
- self.con.create_function("get_lakehouse_name", get_lakehouse_name)
- self.con.create_function("get_workspace_id_from_name", get_workspace_id_from_name)
- self.con.create_function("get_lakehouse_id_from_name", get_lakehouse_id_from_name)
+ self.con.create_function("get_workspace_name", get_workspace_name, null_handling='SPECIAL')
+ self.con.create_function("get_lakehouse_name", get_lakehouse_name, null_handling='SPECIAL')
+ self.con.create_function("get_workspace_id_from_name", get_workspace_id_from_name, null_handling='SPECIAL')
+ self.con.create_function("get_lakehouse_id_from_name", get_lakehouse_id_from_name, null_handling='SPECIAL')
  except Exception as e:
  print(f"⚠️ Warning: Could not register lookup functions: {e}")

- def get_workspace_id(self) -> str:
+ def get_workspace_id(self, force: bool = False) -> str:
  """
  Get the workspace ID (GUID or name without spaces).
  Use this when passing workspace parameter to Python functions.

+ Args:
+ force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
+
  Returns:
  Workspace ID - either a GUID or workspace name without spaces
  """
+ if not force:
+ return self.workspace_id
+
+ # Force resolution to GUID
+ import re
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+
+ # If already a GUID, return it
+ if guid_pattern.match(self.workspace_id):
+ return self.workspace_id
+
+ # Try to get from notebook context first (fastest)
+ try:
+ import notebookutils # type: ignore
+ workspace_guid = notebookutils.runtime.context.get("workspaceId")
+ if workspace_guid:
+ return workspace_guid
+ except ImportError:
+ pass
+
+ # Resolve via API
+ try:
+ from .auth import get_fabric_api_token
+ token = get_fabric_api_token()
+ if token:
+ resolved_id = self._resolve_workspace_id_by_name(token, self.workspace_id)
+ if resolved_id:
+ return resolved_id
+ except Exception:
+ pass
+
+ # Fallback to original value
  return self.workspace_id

- def get_lakehouse_id(self) -> str:
+ def get_item_id(self, force: bool = False) -> str:
  """
- Get the lakehouse ID (GUID or name).
- Use this when passing lakehouse parameter to Python functions.
+ Get the item ID (GUID or name) - works for lakehouses, warehouses, databases, etc.
+ Use this when passing lakehouse/item parameter to Python functions.
+
+ Args:
+ force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)

  Returns:
- Lakehouse ID - either a GUID or lakehouse name
+ Item ID - either a GUID or item name (supports all OneLake item types)
  """
+ if not force:
+ return self.lakehouse_id
+
+ # Force resolution to GUID
+ import re
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+
+ # If already a GUID, return it
+ if guid_pattern.match(self.lakehouse_id):
+ return self.lakehouse_id
+
+ # Detect item type from lakehouse_id (e.g., "data.Lakehouse" -> Lakehouse)
+ item_type = None
+ item_name = self.lakehouse_id
+ for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
+ if self.lakehouse_id.endswith(suffix):
+ item_type = suffix[1:] # Remove the leading dot
+ item_name = self.lakehouse_id[:-len(suffix)]
+ break
+
+ # Try to get from notebook context first (only works for lakehouses)
+ if item_type == 'Lakehouse' or item_type is None:
+ try:
+ import notebookutils # type: ignore
+ lakehouse_guid = notebookutils.lakehouse.get("id")
+ if lakehouse_guid:
+ return lakehouse_guid
+ except (ImportError, Exception):
+ pass
+
+ # Resolve via API
+ try:
+ from .auth import get_fabric_api_token
+ token = get_fabric_api_token()
+ if token:
+ # First get workspace GUID
+ workspace_guid = self.get_workspace_id(force=True)
+
+ # Use appropriate resolver based on item type
+ if item_type == 'Lakehouse' or item_type is None:
+ # Use lakehouse-specific API
+ resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, item_name if item_name else self.lakehouse_id)
+ else:
+ # Use generic items API for warehouses, databases, etc.
+ resolved_id = self._resolve_item_id_by_name(token, workspace_guid, item_name, item_type)
+
+ if resolved_id:
+ return resolved_id
+ except Exception:
+ pass
+
+ # Fallback to original value
  return self.lakehouse_id
+
+ def get_lakehouse_id(self, force: bool = False) -> str:
+ """
+ Deprecated: Use get_item_id() instead.
+ Backward compatibility alias for get_item_id().
+ """
+ return self.get_item_id(force)
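Usage of the new force parameter and the deprecation, as a short sketch:

    dr = duckrun.connect("My Workspace/Sales.lakehouse")

    dr.get_workspace_id()            # stored value (name or GUID), no API call
    dr.get_workspace_id(force=True)  # always resolves to the workspace GUID
    dr.get_item_id(force=True)       # works for any item type (lakehouse, warehouse, ...)
    dr.get_lakehouse_id()            # deprecated alias, forwards to get_item_id()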

  def run(self, pipeline: List[Tuple]) -> bool:
  """
@@ -676,33 +1035,44 @@ class Duckrun:
  """Get underlying DuckDB connection"""
  return self.con

- def get_stats(self, source: str):
+ def get_stats(self, source: str = None, detailed = False):
  """
  Get comprehensive statistics for Delta Lake tables.

  Args:
- source: Can be one of:
+ source: Optional. Can be one of:
+ - None: Use all tables in the connection's schema (default)
  - Table name: 'table_name' (uses current schema)
  - Schema.table: 'schema.table_name' (specific table in schema)
  - Schema only: 'schema' (all tables in schema)
+ detailed: Optional. Controls the level of detail in statistics:
+ - False (default): Aggregated table-level stats
+ - True: Row group level statistics with compression details

  Returns:
- Arrow table with statistics including total rows, file count, row groups,
- average row group size, file sizes, VORDER status, and timestamp
+ DataFrame with statistics based on detailed parameter:
+ - If detailed=False: Aggregated table-level summary
+ - If detailed=True: Granular file and row group level stats

  Examples:
  con = duckrun.connect("tmp/data.lakehouse/aemo")

- # Single table in current schema
+ # All tables in current schema (aemo) - aggregated
+ stats = con.get_stats()
+
+ # Single table in current schema - aggregated
  stats = con.get_stats('price')

+ # Single table with detailed row group statistics
+ stats_detailed = con.get_stats('price', detailed=True)
+
  # Specific table in different schema
  stats = con.get_stats('aemo.price')

  # All tables in a schema
  stats = con.get_stats('aemo')
  """
- return _get_stats(self, source)
+ return _get_stats(self, source, detailed)

  def list_lakehouses(self) -> List[str]:
  """
@@ -816,7 +1186,7 @@ class Duckrun:
  return False

  def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
- wait_seconds: int = 5) -> int:
+ wait_seconds: int = 5, refresh: str = "full") -> int:
  """
  Deploy a semantic model from a BIM file using DirectLake mode.

@@ -825,8 +1195,11 @@ class Duckrun:
  - URL: "https://raw.githubusercontent.com/.../model.bim"
  - Local file: "model.bim"
  - Workspace/Model: "workspace_name/model_name"
- dataset_name: Name for the semantic model (default: source model name if workspace/model format, else lakehouse_schema)
+ dataset_name: Name for the semantic model (default: schema name)
  wait_seconds: Seconds to wait for permission propagation (default: 5)
+ refresh: Refresh strategy:
+ - "full": Clear values and process full refresh (default)
+ - "ignore": Skip refresh entirely

  Returns:
  1 for success, 0 for failure
@@ -834,14 +1207,17 @@ class Duckrun:
  Examples:
  dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

+ # Deploy with schema name as dataset name (dbo)
+ dr.deploy("https://github.com/.../model.bim")
+
  # Deploy from workspace/model (uses same name by default)
  dr.deploy("Source Workspace/Source Model") # Creates "Source Model"

  # Deploy with custom name
- dr.deploy("Source Workspace/Source Model", dataset_name="Sales Model Copy")
+ dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")

- # Deploy from URL or local file
- dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
+ # Deploy without refresh
+ dr.deploy("https://github.com/.../model.bim", refresh="ignore")
  """
  from .semantic_model import deploy_semantic_model

@@ -853,9 +1229,9 @@ class Duckrun:
  if len(parts) == 2:
  dataset_name = parts[1] # Use the model name
  else:
- dataset_name = f"{self.lakehouse_name}_{self.schema}"
+ dataset_name = self.schema # Use schema name
  else:
- dataset_name = f"{self.lakehouse_name}_{self.schema}"
+ dataset_name = self.schema # Use schema name

  # Call the deployment function (DirectLake only)
  return deploy_semantic_model(
@@ -864,28 +1240,10 @@ class Duckrun:
  schema_name=self.schema,
  dataset_name=dataset_name,
  bim_url_or_path=bim_url,
- wait_seconds=wait_seconds
+ wait_seconds=wait_seconds,
+ refresh=refresh
  )

- def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
- """Helper method to get workspace ID from name"""
- try:
- url = "https://api.fabric.microsoft.com/v1/workspaces"
- headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
- response = requests.get(url, headers=headers)
- response.raise_for_status()
-
- workspaces = response.json().get("value", [])
- for workspace in workspaces:
- if workspace.get("displayName") == workspace_name:
- return workspace.get("id")
-
- return None
-
- except Exception:
- return None
-
  def close(self):
  """Close DuckDB connection"""
  if self.con:
@@ -893,7 +1251,7 @@ class Duckrun:
  print("Connection closed")


- class WorkspaceConnection:
+ class WorkspaceConnection(WorkspaceOperationsMixin):
  """
  Simple workspace connection for lakehouse management operations.
  """
@@ -1133,23 +1491,4 @@ class WorkspaceConnection:
  print(f"❌ Error downloading semantic model: {e}")
  import traceback
  traceback.print_exc()
- return None
-
- def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
- """Helper method to get workspace ID from name"""
- try:
- url = "https://api.fabric.microsoft.com/v1/workspaces"
- headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
- response = requests.get(url, headers=headers)
- response.raise_for_status()
-
- workspaces = response.json().get("value", [])
- for workspace in workspaces:
- if workspace.get("displayName") == workspace_name:
- return workspace.get("id")
-
- return None
-
- except Exception:
  return None