duckrun-0.2.14.dev1-py3-none-any.whl → duckrun-0.2.14.dev3-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckrun might be problematic. Click here for more details.

duckrun/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  from duckrun.core import Duckrun
4
4
 
5
- __version__ = "0.2.14.dev1"
5
+ __version__ = "0.2.14.dev2"
6
6
 
7
7
  # Expose unified connect method at module level
8
8
  connect = Duckrun.connect
duckrun/auth.py CHANGED
@@ -2,9 +2,21 @@
2
2
  Enhanced authentication module for duckrun - supports multiple notebook environments
3
3
  """
4
4
  import os
5
+ import sys
5
6
  from typing import Optional, Tuple
6
7
 
7
8
 
9
+ def safe_print(message: str):
10
+ """Print message with safe encoding handling for Windows"""
11
+ try:
12
+ print(message)
13
+ except UnicodeEncodeError:
14
+ # Fallback: remove emojis and special chars
15
+ import re
16
+ clean_message = re.sub(r'[^\x00-\x7F]+', '', message)
17
+ print(clean_message)
18
+
19
+
8
20
  def get_token() -> Optional[str]:
9
21
  """
10
22
  Smart authentication that works across multiple environments:
duckrun/core.py CHANGED
@@ -4,11 +4,8 @@ import os
4
4
  import importlib.util
5
5
  import json
6
6
  import time
7
- from deltalake import DeltaTable, write_deltalake
8
7
  from typing import List, Tuple, Union, Optional, Callable, Dict, Any
9
8
  from string import Template
10
- import obstore as obs
11
- from obstore.store import AzureStore
12
9
  from datetime import datetime
13
10
  from .stats import get_stats as _get_stats
14
11
  from .runner import run as _run
@@ -17,7 +14,8 @@ from .writer import QueryResult
17
14
 
18
15
  class Duckrun:
19
16
  """
20
- Lakehouse task runner with clean tuple-based API.
17
+ OneLake task runner with clean tuple-based API.
18
+ Supports lakehouses, warehouses, databases, and other OneLake items.
21
19
  Powered by DuckDB for fast data processing.
22
20
 
23
21
  Task formats:
@@ -30,6 +28,10 @@ class Duckrun:
30
28
  dr = Duckrun.connect("workspace/lakehouse.lakehouse") # defaults to dbo schema, lists all tables
31
29
  dr.run(pipeline)
32
30
 
31
+ # For other OneLake items:
32
+ dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
33
+ dr = Duckrun.connect("workspace/warehouse.Warehouse")
34
+
33
35
  # For data exploration with Spark-style API:
34
36
  dr = Duckrun.connect("workspace/lakehouse.lakehouse")
35
37
  dr.sql("SELECT * FROM table").show()
@@ -65,24 +67,53 @@ class Duckrun:
65
67
  self.storage_account = storage_account
66
68
  self.token_only = token_only
67
69
 
68
- # Construct proper ABFSS URLs
70
+ # Store both full name (with .ItemType) and display name (without .ItemType) for backward compatibility
71
+ # lakehouse_id: Full name with suffix for API calls (e.g., "data.Lakehouse")
72
+ # lakehouse_display_name: Name only without suffix for user code/templates (e.g., "data")
73
+ self.lakehouse_id = lakehouse_id
74
+
75
+ # Extract display name (remove .ItemType suffix if present)
69
76
  import re
77
+ # Check if lakehouse_id has .ItemType suffix
78
+ if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', lakehouse_id, re.IGNORECASE):
79
+ # Friendly name - extract base name without suffix
80
+ for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
81
+ if lakehouse_id.endswith(suffix):
82
+ self.lakehouse_display_name = lakehouse_id[:-len(suffix)]
83
+ break
84
+ else:
85
+ self.lakehouse_display_name = lakehouse_id
86
+ else:
87
+ # GUID - use as is
88
+ self.lakehouse_display_name = lakehouse_id
89
+
90
+ # Construct proper ABFSS URLs
91
+ # Format: abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{item}/Tables/
92
+ # where {workspace} and {item} can be:
93
+ # - Names with .lakehouse suffix (lakehouse optimization when no spaces in workspace)
94
+ # - GUIDs (when resolved via API for non-lakehouse items or items with spaces)
70
95
  guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
71
- # If lakehouse_id is a GUID, use as-is
96
+
97
+ # Determine the item URL part for ABFSS
72
98
  if guid_pattern.match(lakehouse_id):
73
- lakehouse_url_part = lakehouse_id
99
+ # Already a GUID - use as-is (from API resolution)
100
+ item_url_part = lakehouse_id
74
101
  else:
75
- # If workspace name has no spaces, always append .lakehouse unless already present
76
- if " " not in workspace_id and not lakehouse_id.endswith('.lakehouse'):
77
- lakehouse_url_part = f'{lakehouse_id}.lakehouse'
78
- else:
79
- lakehouse_url_part = lakehouse_id
80
- self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Tables/'
81
- self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Files/'
102
+ # Friendly name - use as-is (already includes .ItemType suffix from connect())
103
+ item_url_part = lakehouse_id
104
+
105
+ self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Tables/'
106
+ self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Files/'
82
107
 
83
108
  # Keep legacy properties for backward compatibility
84
109
  self.workspace = workspace_id
85
- self.lakehouse_name = lakehouse_id
110
+ self.lakehouse_name = self.lakehouse_display_name # Use display name (without suffix) for backward compatibility
111
+
112
+ # Store display name without suffix for backward compatibility with user Python functions
113
+ # Extract base name by removing .ItemType suffix if present
114
+ import re
115
+ suffix_pattern = re.compile(r'\.(Lakehouse|Warehouse|Database|SnowflakeDatabase)$', re.IGNORECASE)
116
+ self.lakehouse_display_name = suffix_pattern.sub('', lakehouse_id)
86
117
 
87
118
  self.con = duckdb.connect()
88
119
  self.con.sql("SET preserve_insertion_order = false")
@@ -109,12 +140,15 @@ class Duckrun:
109
140
  compaction_threshold: int = 100, storage_account: str = "onelake",
110
141
  token_only: bool = False):
111
142
  """
112
- Create and connect to lakehouse or workspace.
143
+ Create and connect to OneLake items (lakehouse, warehouse, database, etc.) or workspace.
113
144
 
114
145
  Smart detection based on connection string format:
115
146
  - "workspace" → workspace management only
116
- - "ws/lh.lakehouse/schema" → full lakehouse connection
117
- - "ws/lh.lakehouse" → lakehouse connection (defaults to dbo schema)
147
+ - "ws/item.lakehouse/schema" → lakehouse connection with specific schema
148
+ - "ws/item.lakehouse" → lakehouse connection (defaults to dbo schema)
149
+ - "ws/item.warehouse" → warehouse connection
150
+ - "ws/item.database" → database connection
151
+ - "ws/item.snowflakedatabase" → Snowflake database connection
118
152
 
119
153
  Args:
120
154
  connection_string: OneLake path or workspace name
@@ -129,19 +163,26 @@ class Duckrun:
129
163
  ws.list_lakehouses()
130
164
  ws.create_lakehouse_if_not_exists("New Lakehouse")
131
165
 
132
- # Full lakehouse connections (supports spaces in names)
166
+ # Lakehouse connections (supports spaces in names)
133
167
  dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/schema", sql_folder="./sql")
134
168
  dr = Duckrun.connect("Data Workspace/Sales Data.lakehouse/analytics") # spaces supported
135
169
  dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse") # defaults to dbo schema
136
170
  dr = Duckrun.connect("workspace/lakehouse.lakehouse", storage_account="xxx-onelake") # custom storage
137
171
 
172
+ # Warehouse and database connections (always uses API to resolve GUIDs)
173
+ dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
174
+ dr = Duckrun.connect("My Workspace/My Warehouse.Warehouse")
175
+ dr = Duckrun.connect("workspace/database.Database")
176
+
138
177
  # Fast connection without table listing (token only)
139
178
  dr = Duckrun.connect("workspace/lakehouse.lakehouse", token_only=True)
140
179
 
141
180
  Note:
142
- Internally resolves friendly names (with spaces) to GUIDs and constructs proper ABFSS URLs:
143
- "My Workspace/My Lakehouse.lakehouse/schema" becomes
144
- "abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/lakehouse_guid/Tables/schema"
181
+ - Lakehouse items without spaces in workspace name use optimization (no API calls)
182
+ - Non-lakehouse items always resolve to GUIDs via Fabric API
183
+ - Internally constructs proper ABFSS URLs:
184
+ "My Workspace/My Item.lakehouse/schema" →
185
+ "abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/item_guid/Tables/schema"
145
186
  """
146
187
 
147
188
  # Check if it's a workspace-only connection (no "/" means workspace name only)
@@ -150,70 +191,94 @@ class Duckrun:
150
191
 
151
192
  scan_all_schemas = False
152
193
 
153
- # Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
154
- # Support workspace and lakehouse names with spaces
194
+ # Parse connection string: "ws/item_name.item_type/schema" or "ws/item_name.item_type"
195
+ # Support workspace and item names with spaces
196
+ # Item types: .lakehouse, .Lakehouse, .warehouse, .Warehouse, .database, .Database, .snowflakedatabase, .SnowflakeDatabase
155
197
  parts = connection_string.split("/")
156
198
  if len(parts) == 2:
157
- workspace_name, lakehouse_name = parts
199
+ workspace_name, item_name_with_type = parts
158
200
  scan_all_schemas = True
159
201
  schema = "dbo"
160
202
  elif len(parts) == 3:
161
- workspace_name, lakehouse_name, schema = parts
203
+ workspace_name, item_name_with_type, schema = parts
162
204
  else:
163
205
  raise ValueError(
164
206
  f"Invalid connection string format: '{connection_string}'. "
165
207
  "Expected formats:\n"
166
208
  " 'workspace name' (workspace management only)\n"
167
- " 'workspace name/lakehouse name.lakehouse' (lakehouse with dbo schema)\n"
168
- " 'workspace name/lakehouse name.lakehouse/schema' (lakehouse with specific schema)"
209
+ " 'workspace name/item name.item_type' (item with dbo schema)\n"
210
+ " 'workspace name/item name.item_type/schema' (item with specific schema)\n"
211
+ "Supported item types: .lakehouse, .warehouse, .database, .snowflakedatabase (case-insensitive)"
169
212
  )
170
213
 
171
- if lakehouse_name.endswith(".lakehouse"):
172
- lakehouse_name = lakehouse_name[:-10]
214
+ # Extract item type and name
215
+ item_type = None
216
+ item_name = item_name_with_type
217
+
218
+ # Check for known item types (case-insensitive)
219
+ item_type_map = {
220
+ '.lakehouse': 'Lakehouse',
221
+ '.warehouse': 'Warehouse',
222
+ '.database': 'Database',
223
+ '.snowflakedatabase': 'SnowflakeDatabase'
224
+ }
225
+
226
+ # Parse item type and normalize the suffix to proper case
227
+ item_name_normalized = item_name_with_type
228
+ for suffix, mapped_type in item_type_map.items():
229
+ if item_name_with_type.lower().endswith(suffix):
230
+ item_type = mapped_type
231
+ item_name = item_name_with_type[:-len(suffix)]
232
+ # Normalize to proper case: ItemName.ItemType (e.g., data.Lakehouse)
233
+ item_name_normalized = f"{item_name}.{mapped_type}"
234
+ break
173
235
 
174
- if not workspace_name or not lakehouse_name:
236
+ if not workspace_name or not item_name:
175
237
  raise ValueError(
176
238
  "Missing required parameters. Use one of these formats:\n"
177
239
  " connect('workspace name') # workspace management\n"
178
- " connect('workspace name/lakehouse name.lakehouse/schema') # full lakehouse\n"
179
- " connect('workspace name/lakehouse name.lakehouse') # defaults to dbo"
240
+ " connect('workspace name/item name.item_type/schema') # full item connection\n"
241
+ " connect('workspace name/item name.item_type') # defaults to dbo"
180
242
  )
181
243
 
182
- # Resolve friendly names to GUIDs and construct proper ABFSS path
183
- workspace_id, lakehouse_id = cls._resolve_names_to_guids(workspace_name, lakehouse_name)
244
+ # Per OneLake API docs: Can use friendly names if no spaces/special characters
245
+ # Otherwise must resolve to GUIDs
246
+ # Check for spaces or special characters that would require GUID resolution
247
+ has_special_chars = " " in workspace_name or " " in item_name
248
+
249
+ if has_special_chars:
250
+ # Names have spaces/special chars: resolve to GUIDs via API
251
+ workspace_id, item_id = cls._resolve_names_to_guids(workspace_name, item_name, item_type)
252
+ else:
253
+ # No spaces/special chars: use friendly names directly (works for all item types)
254
+ # Use normalized name with proper case for API compatibility
255
+ workspace_id = workspace_name
256
+ item_id = item_name_normalized # Use normalized with proper case
184
257
 
185
- return cls(workspace_id, lakehouse_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account, token_only)
258
+ return cls(workspace_id, item_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account, token_only)
186
259
 
187
260
  @classmethod
188
- def _resolve_names_to_guids(cls, workspace_name: str, lakehouse_name: str) -> tuple[str, str]:
261
+ def _resolve_names_to_guids(cls, workspace_name: str, item_name: str, item_type: Optional[str] = 'Lakehouse') -> tuple[str, str]:
189
262
  """
190
- Resolve friendly workspace and lakehouse names to their GUIDs.
191
-
192
- Optimization: If names don't contain spaces, use them directly (no API calls needed).
193
- Only resolve to GUIDs when names contain spaces or are already GUIDs.
263
+ Resolve friendly workspace and item names to their GUIDs.
194
264
 
195
265
  Args:
196
266
  workspace_name: Display name of the workspace (can contain spaces)
197
- lakehouse_name: Display name of the lakehouse (can contain spaces)
267
+ item_name: Display name of the item (can contain spaces)
268
+ item_type: Type of item - 'Lakehouse', 'Warehouse', 'Database', 'SnowflakeDatabase', etc.
198
269
 
199
270
  Returns:
200
- Tuple of (workspace_id, lakehouse_id) - either resolved GUIDs or original names
271
+ Tuple of (workspace_id, item_id) - resolved GUIDs
201
272
  """
202
273
 
203
274
  # Check if names are already GUIDs first
204
275
  import re
205
276
  guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
206
277
 
207
- if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
208
- return workspace_name, lakehouse_name
209
-
210
- # Optimization: If workspace name has no spaces, use both names directly (old behavior)
211
- # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
212
- if " " not in workspace_name:
213
- return workspace_name, lakehouse_name
214
-
215
- # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
278
+ if guid_pattern.match(workspace_name) and guid_pattern.match(item_name):
279
+ return workspace_name, item_name
216
280
 
281
+ # Need to resolve to GUIDs via API
217
282
  try:
218
283
  # Get authentication token using enhanced auth system
219
284
  from .auth import get_fabric_api_token
@@ -231,8 +296,7 @@ class Duckrun:
231
296
 
232
297
  # Resolve workspace name to ID
233
298
  if current_workspace_id:
234
- # In notebook environment, we could use current workspace ID
235
- # but we should validate it matches the requested workspace name
299
+ # In notebook environment, validate it matches the requested workspace name
236
300
  workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
237
301
  if not workspace_id:
238
302
  # Fallback to current workspace if name resolution fails
@@ -244,21 +308,26 @@ class Duckrun:
244
308
  if not workspace_id:
245
309
  raise ValueError(f"Workspace '{workspace_name}' not found")
246
310
 
247
- # Resolve lakehouse name to ID (required for ABFSS URLs with spaces)
248
- lakehouse_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, lakehouse_name)
249
- if not lakehouse_id:
250
- raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
311
+ # Resolve item name to ID based on item type
312
+ if item_type == 'Lakehouse':
313
+ item_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, item_name)
314
+ else:
315
+ # Use generic item resolver for non-lakehouse items
316
+ item_id = cls._resolve_item_id_by_name(token, workspace_id, item_name, item_type)
251
317
 
252
- return workspace_id, lakehouse_id
318
+ if not item_id:
319
+ raise ValueError(f"{item_type} '{item_name}' not found in workspace '{workspace_name}'")
320
+
321
+ return workspace_id, item_id
253
322
 
254
323
  except Exception as e:
255
324
  print(f"❌ Failed to resolve names to GUIDs: {e}")
256
- print(f"❌ Cannot use friendly names with spaces '{workspace_name}'/'{lakehouse_name}' in ABFSS URLs without GUID resolution")
257
- print("❌ Microsoft Fabric requires actual workspace and lakehouse GUIDs for ABFSS access when names contain spaces")
325
+ print(f"❌ Cannot resolve '{workspace_name}'/'{item_name}' ({item_type}) to GUIDs")
326
+ print("❌ Microsoft Fabric requires actual workspace and item GUIDs for ABFSS access")
258
327
  raise ValueError(
259
- f"Unable to resolve workspace '{workspace_name}' and lakehouse '{lakehouse_name}' to GUIDs. "
260
- f"ABFSS URLs require actual GUIDs when names contain spaces. "
261
- f"Please ensure you have proper authentication and the workspace/lakehouse names are correct."
328
+ f"Unable to resolve workspace '{workspace_name}' and {item_type.lower()} '{item_name}' to GUIDs. "
329
+ f"ABFSS URLs require actual GUIDs. "
330
+ f"Please ensure you have proper authentication and the workspace/item names are correct."
262
331
  )
263
332
 
264
333
  @classmethod
@@ -300,6 +369,58 @@ class Duckrun:
300
369
  return None
301
370
  except Exception:
302
371
  return None
372
+
373
+ @classmethod
374
+ def _resolve_item_id_by_name(cls, token: str, workspace_id: str, item_name: str, item_type: str) -> Optional[str]:
375
+ """
376
+ Get item ID from display name within a workspace using generic items API.
377
+ Works for any item type: Warehouse, Database, SnowflakeDatabase, etc.
378
+
379
+ Args:
380
+ token: Fabric API authentication token
381
+ workspace_id: Workspace GUID
382
+ item_name: Display name of the item
383
+ item_type: Type of item (e.g., 'Warehouse', 'Database', 'SnowflakeDatabase')
384
+
385
+ Returns:
386
+ Item GUID if found, None otherwise
387
+ """
388
+ try:
389
+ import requests
390
+ # Use generic items API with type filter
391
+ url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items"
392
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
393
+
394
+ # Add type filter as query parameter
395
+ params = {"type": item_type}
396
+
397
+ print(f" Searching for {item_type} '{item_name}' in workspace {workspace_id}")
398
+ print(f" API URL: {url}?type={item_type}")
399
+
400
+ response = requests.get(url, headers=headers, params=params)
401
+ response.raise_for_status()
402
+
403
+ result = response.json()
404
+ items = result.get("value", [])
405
+
406
+ print(f" Found {len(items)} items of type {item_type}")
407
+ if items:
408
+ print(f" Available items: {[item.get('displayName') for item in items]}")
409
+
410
+ for item in items:
411
+ if item.get("displayName") == item_name:
412
+ item_id = item.get("id")
413
+ print(f" Found matching item: {item_name} -> {item_id}")
414
+ return item_id
415
+
416
+ print(f" Item '{item_name}' not found in the list")
417
+ return None
418
+ except Exception as e:
419
+ print(f" Error resolving {item_type} item: {e}")
420
+ if hasattr(e, 'response') and e.response is not None:
421
+ print(f" Response status: {e.response.status_code}")
422
+ print(f" Response body: {e.response.text}")
423
+ return None
303
424
 
304
425
  @classmethod
305
426
  def connect_workspace(cls, workspace_name: str):
@@ -341,77 +462,138 @@ class Duckrun:
341
462
 
342
463
  def _discover_tables_fast(self) -> List[Tuple[str, str]]:
343
464
  """
344
- Fast Delta table discovery using obstore with list_with_delimiter.
345
- Only lists directories, not files - super fast!
465
+ Fast table discovery using OneLake Delta Table API (Unity Catalog compatible).
466
+ Uses: https://learn.microsoft.com/en-us/fabric/onelake/table-apis/delta-table-apis-overview
346
467
 
347
468
  Returns:
348
469
  List of tuples: [(schema, table_name), ...]
349
470
  """
350
- token = self._get_storage_token()
351
- if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
352
- print("Authenticating with Azure for table discovery (detecting environment automatically)...")
353
- from .auth import get_token
354
- token = get_token()
355
- if not token:
356
- print("❌ Failed to authenticate for table discovery")
357
- return []
358
-
359
- url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
360
- store = AzureStore.from_url(url, bearer_token=token)
361
-
362
- # Use the same lakehouse URL part logic as in __init__ to ensure .lakehouse suffix is added when needed
363
- import re
364
- guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
365
- if guid_pattern.match(self.lakehouse_id):
366
- lakehouse_url_part = self.lakehouse_id
367
- else:
368
- # If workspace name has no spaces, always append .lakehouse unless already present
369
- if " " not in self.workspace_id and not self.lakehouse_id.endswith('.lakehouse'):
370
- lakehouse_url_part = f'{self.lakehouse_id}.lakehouse'
471
+ try:
472
+ # Get storage token for OneLake
473
+ token = self._get_storage_token()
474
+ if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
475
+ print("Authenticating with Azure for table discovery...")
476
+ from .auth import get_token
477
+ token = get_token()
478
+ if not token:
479
+ print("❌ Failed to authenticate for table discovery")
480
+ return []
481
+
482
+ # OneLake Delta Table API endpoint (Unity Catalog compatible)
483
+ base_url = "https://onelake.table.fabric.microsoft.com/delta"
484
+
485
+ # Determine workspace/item identifier for API
486
+ # Per docs: Can use friendly names (WorkspaceName/ItemName.ItemType) if no special characters
487
+ # Otherwise must use GUIDs (WorkspaceID/ItemID)
488
+ import re
489
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
490
+
491
+ # Check if we're using GUIDs or friendly names
492
+ if guid_pattern.match(self.workspace_id) and guid_pattern.match(self.lakehouse_id):
493
+ # Using GUIDs - use them directly in API
494
+ workspace_identifier = self.workspace_id
495
+ item_identifier = self.lakehouse_id
496
+ catalog_name = self.lakehouse_id
371
497
  else:
372
- lakehouse_url_part = self.lakehouse_id
373
-
374
- base_path = f"{lakehouse_url_part}/Tables/"
375
- tables_found = []
376
-
377
- if self.scan_all_schemas:
378
- # Discover all schemas first
379
- schemas_result = obs.list_with_delimiter(store, prefix=base_path)
380
- schemas = [
381
- prefix.rstrip('/').split('/')[-1]
382
- for prefix in schemas_result['common_prefixes']
383
- ]
384
-
385
- # Discover tables in each schema
386
- for schema_name in schemas:
387
- schema_path = f"{base_path}{schema_name}/"
388
- result = obs.list_with_delimiter(store, prefix=schema_path)
498
+ # Using friendly names - lakehouse_id already includes .ItemType suffix
499
+ workspace_identifier = self.workspace_id
500
+ item_identifier = self.lakehouse_id
501
+ catalog_name = self.lakehouse_id
502
+
503
+ print(f"🔍 Discovering tables via OneLake Delta Table API...")
504
+ print(f" Using identifier: {workspace_identifier}/{item_identifier}")
505
+
506
+ tables_found = []
507
+
508
+ if self.scan_all_schemas:
509
+ # First, list all schemas
510
+ schemas_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/schemas"
511
+ params = {"catalog_name": catalog_name}
512
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
389
513
 
390
- for table_prefix in result['common_prefixes']:
391
- table_name = table_prefix.rstrip('/').split('/')[-1]
392
- # Skip non-table directories
393
- if table_name not in ('metadata', 'iceberg'):
394
- tables_found.append((schema_name, table_name))
395
- else:
396
- # Scan specific schema only
397
- schema_path = f"{base_path}{self.schema}/"
398
- result = obs.list_with_delimiter(store, prefix=schema_path)
514
+ schemas_response = requests.get(schemas_url, headers=headers, params=params)
515
+
516
+ if schemas_response.status_code == 200:
517
+ schemas_result = schemas_response.json()
518
+ schemas = schemas_result.get("schemas", [])
519
+ schema_names = [s.get("name") for s in schemas if s.get("name")]
520
+
521
+ print(f" Found {len(schema_names)} schemas: {schema_names}")
522
+
523
+ # Get tables from each schema
524
+ for schema_name in schema_names:
525
+ tables_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/tables"
526
+ tables_params = {
527
+ "catalog_name": catalog_name,
528
+ "schema_name": schema_name
529
+ }
530
+
531
+ tables_response = requests.get(tables_url, headers=headers, params=tables_params)
532
+
533
+ if tables_response.status_code == 200:
534
+ tables_result = tables_response.json()
535
+ tables = tables_result.get("tables", [])
536
+
537
+ for table in tables:
538
+ table_name = table.get("name", "")
539
+ if table_name:
540
+ tables_found.append((schema_name, table_name))
541
+
542
+ if tables:
543
+ print(f" Schema '{schema_name}': {len(tables)} tables")
544
+ else:
545
+ print(f" Failed to list schemas: {schemas_response.status_code}")
546
+ if schemas_response.status_code != 404:
547
+ print(f" Response: {schemas_response.text[:300]}")
548
+ else:
549
+ # Single schema mode - list tables in specific schema
550
+ tables_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/tables"
551
+ params = {
552
+ "catalog_name": catalog_name,
553
+ "schema_name": self.schema
554
+ }
555
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
556
+
557
+ print(f" Listing tables in schema: {self.schema}")
558
+ tables_response = requests.get(tables_url, headers=headers, params=params)
559
+
560
+ if tables_response.status_code == 200:
561
+ tables_result = tables_response.json()
562
+ tables = tables_result.get("tables", [])
563
+
564
+ for table in tables:
565
+ table_name = table.get("name", "")
566
+ if table_name:
567
+ tables_found.append((self.schema, table_name))
568
+
569
+ print(f" Found {len(tables)} tables")
570
+ elif tables_response.status_code == 404:
571
+ print(f" Schema '{self.schema}' not found or has no tables")
572
+ else:
573
+ print(f" Failed to list tables: {tables_response.status_code}")
574
+ print(f" Response: {tables_response.text[:300]}")
399
575
 
400
- for table_prefix in result['common_prefixes']:
401
- table_name = table_prefix.rstrip('/').split('/')[-1]
402
- if table_name not in ('metadata', 'iceberg'):
403
- tables_found.append((self.schema, table_name))
404
-
405
- return tables_found
576
+ return tables_found
577
+
578
+ except Exception as e:
579
+ print(f"❌ Error during table discovery: {e}")
580
+ import traceback
581
+ traceback.print_exc()
582
+ return []
406
583
 
407
584
  def _attach_lakehouse(self):
408
585
  """Attach lakehouse tables as DuckDB views using fast discovery"""
586
+ print(f"🔌 Attaching tables from schema: {self.schema if not self.scan_all_schemas else 'all schemas'}")
409
587
  self._create_onelake_secret()
410
588
 
411
589
  try:
412
590
  tables = self._discover_tables_fast()
413
591
 
414
592
  if not tables:
593
+ if self.scan_all_schemas:
594
+ print(f"⚠️ No tables found in any schema")
595
+ else:
596
+ print(f"⚠️ No tables found in {self.schema} schema")
415
597
  return
416
598
 
417
599
  # Collect table names for display
@@ -434,6 +616,7 @@ class Duckrun:
434
616
  AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
435
617
  """)
436
618
  except Exception as e:
619
+ print(f"⚠️ Failed to attach table {schema_name}.{table_name}: {e}")
437
620
  continue
438
621
 
439
622
  # Print discovered tables as comma-separated list
@@ -442,6 +625,8 @@ class Duckrun:
442
625
 
443
626
  except Exception as e:
444
627
  print(f"❌ Error attaching lakehouse: {e}")
628
+ import traceback
629
+ traceback.print_exc()
445
630
 
446
631
  def _register_lookup_functions(self):
447
632
  """
@@ -587,24 +772,97 @@ class Duckrun:
587
772
  except Exception as e:
588
773
  print(f"⚠️ Warning: Could not register lookup functions: {e}")
589
774
 
590
- def get_workspace_id(self) -> str:
775
+ def get_workspace_id(self, force: bool = False) -> str:
591
776
  """
592
777
  Get the workspace ID (GUID or name without spaces).
593
778
  Use this when passing workspace parameter to Python functions.
594
779
 
780
+ Args:
781
+ force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
782
+
595
783
  Returns:
596
784
  Workspace ID - either a GUID or workspace name without spaces
597
785
  """
786
+ if not force:
787
+ return self.workspace_id
788
+
789
+ # Force resolution to GUID
790
+ import re
791
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
792
+
793
+ # If already a GUID, return it
794
+ if guid_pattern.match(self.workspace_id):
795
+ return self.workspace_id
796
+
797
+ # Try to get from notebook context first (fastest)
798
+ try:
799
+ import notebookutils # type: ignore
800
+ workspace_guid = notebookutils.runtime.context.get("workspaceId")
801
+ if workspace_guid:
802
+ return workspace_guid
803
+ except ImportError:
804
+ pass
805
+
806
+ # Resolve via API
807
+ try:
808
+ from .auth import get_fabric_api_token
809
+ token = get_fabric_api_token()
810
+ if token:
811
+ resolved_id = self._resolve_workspace_id_by_name(token, self.workspace_id)
812
+ if resolved_id:
813
+ return resolved_id
814
+ except Exception:
815
+ pass
816
+
817
+ # Fallback to original value
598
818
  return self.workspace_id
599
819
 
600
- def get_lakehouse_id(self) -> str:
820
+ def get_lakehouse_id(self, force: bool = False) -> str:
601
821
  """
602
822
  Get the lakehouse ID (GUID or name).
603
823
  Use this when passing lakehouse parameter to Python functions.
604
824
 
825
+ Args:
826
+ force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
827
+
605
828
  Returns:
606
829
  Lakehouse ID - either a GUID or lakehouse name
607
830
  """
831
+ if not force:
832
+ return self.lakehouse_id
833
+
834
+ # Force resolution to GUID
835
+ import re
836
+ guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
837
+
838
+ # If already a GUID, return it
839
+ if guid_pattern.match(self.lakehouse_id):
840
+ return self.lakehouse_id
841
+
842
+ # Try to get from notebook context first (fastest)
843
+ try:
844
+ import notebookutils # type: ignore
845
+ lakehouse_guid = notebookutils.lakehouse.get("id")
846
+ if lakehouse_guid:
847
+ return lakehouse_guid
848
+ except (ImportError, Exception):
849
+ pass
850
+
851
+ # Resolve via API
852
+ try:
853
+ from .auth import get_fabric_api_token
854
+ token = get_fabric_api_token()
855
+ if token:
856
+ # First get workspace GUID
857
+ workspace_guid = self.get_workspace_id(force=True)
858
+ # Then resolve lakehouse name to ID
859
+ resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, self.lakehouse_id)
860
+ if resolved_id:
861
+ return resolved_id
862
+ except Exception:
863
+ pass
864
+
865
+ # Fallback to original value
608
866
  return self.lakehouse_id
609
867
 
610
868
  def run(self, pipeline: List[Tuple]) -> bool:
duckrun/runner.py CHANGED
@@ -130,9 +130,12 @@ def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
130
130
 
131
131
  # Get original and resolved names
132
132
  original_workspace = duckrun_instance.workspace
133
- original_lakehouse = duckrun_instance.lakehouse_name
133
+ original_lakehouse = duckrun_instance.lakehouse_display_name # Base name without suffix (e.g., "data")
134
134
  resolved_workspace = duckrun_instance.workspace_id
135
- resolved_lakehouse = duckrun_instance.lakehouse_id
135
+
136
+ # Always pass base lakehouse name (without .Lakehouse suffix) to user functions
137
+ # User functions expect just the name like "data", not "data.Lakehouse"
138
+ resolved_lakehouse = duckrun_instance.lakehouse_display_name
136
139
 
137
140
  # Substitute workspace/lakehouse names in args if they differ
138
141
  # This prevents URL encoding issues when names contain spaces
@@ -149,7 +152,7 @@ def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
149
152
  else:
150
153
  substituted_args.append(arg)
151
154
  args = tuple(substituted_args)
152
- print(f"📝 Auto-substituted workspace/lakehouse names in args for URL compatibility")
155
+ print(f"📝 Auto-substituted workspace/lakehouse names in args")
153
156
 
154
157
  print(f"Running Python: {name}{args}")
155
158
  result = func(*args)
@@ -282,12 +285,17 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = N
282
285
  # If GUID, use just the GUID
283
286
  content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
284
287
  else:
285
- # If not GUID, use legacy format
286
- content = content.replace('${lh}.Lakehouse', f'{duckrun_instance.lakehouse_name}.Lakehouse')
288
+ # If not GUID, check if lakehouse_name already has .ItemType suffix
289
+ if duckrun_instance.lakehouse_name.endswith(('.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase')):
290
+ # Already has suffix - use as is
291
+ content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
292
+ else:
293
+ # No suffix - add .Lakehouse for legacy format
294
+ content = content.replace('${lh}.Lakehouse', f'{duckrun_instance.lakehouse_name}.Lakehouse')
287
295
 
288
296
  full_params = {
289
297
  'ws': duckrun_instance.workspace,
290
- 'lh': duckrun_instance.lakehouse_name,
298
+ 'lh': duckrun_instance.lakehouse_display_name, # Use display name (without suffix) for backward compat
291
299
  'schema': duckrun_instance.schema,
292
300
  'storage_account': duckrun_instance.storage_account,
293
301
  'tables_url': duckrun_instance.table_base_url,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.14.dev1
3
+ Version: 0.2.14.dev3
4
4
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
5
  Author: mim
6
6
  License: MIT
@@ -10,7 +10,7 @@ Project-URL: Issues, https://github.com/djouallah/duckrun/issues
10
10
  Requires-Python: >=3.9
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
- Requires-Dist: duckdb>=1.2.2
13
+ Requires-Dist: duckdb>=1.2.0
14
14
  Requires-Dist: deltalake<=0.18.2
15
15
  Requires-Dist: requests>=2.28.0
16
16
  Requires-Dist: obstore>=0.2.0
@@ -0,0 +1,14 @@
1
+ duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
2
+ duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
3
+ duckrun/core.py,sha256=_D0CnaRNQm_wW4bSP__EAPHEt_VNgf9N-VXWYSZScL8,65829
4
+ duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
5
+ duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
6
+ duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
7
+ duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
8
+ duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
9
+ duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
10
+ duckrun-0.2.14.dev3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
11
+ duckrun-0.2.14.dev3.dist-info/METADATA,sha256=tOLtAIHcEJyXk93hvvgZNC3Cx7U2Dy7iatRutBnrU3Y,20771
12
+ duckrun-0.2.14.dev3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ duckrun-0.2.14.dev3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
14
+ duckrun-0.2.14.dev3.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- duckrun/__init__.py,sha256=OxPnNrxYqv_2XfiQPO27JiZDIxze4jnUE3VaqDdioAg,236
2
- duckrun/auth.py,sha256=dMqIzozgEQ5v7Uc3Mb_OoFZGmsAq0m-VOoYCVL7rehc,9281
3
- duckrun/core.py,sha256=LvxplwziTLb_18n064waoN3oWMuhpVJe_-y6GYfoBOc,53127
4
- duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
5
- duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
6
- duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
7
- duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
8
- duckrun/stats.py,sha256=oKIjZ7u5cFVT63FuOl5UqoDsOG3098woSCn-uI6i_sQ,11084
9
- duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
10
- duckrun-0.2.14.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
11
- duckrun-0.2.14.dev1.dist-info/METADATA,sha256=MhpAtTMLpzOwOINN7Dgs6ih_JhjhbzxX73W_E6N30pA,20771
12
- duckrun-0.2.14.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- duckrun-0.2.14.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
14
- duckrun-0.2.14.dev1.dist-info/RECORD,,