duckrun 0.2.13__py3-none-any.whl → 0.2.19.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/__init__.py +4 -2
- duckrun/auth.py +12 -0
- duckrun/core.py +703 -179
- duckrun/notebook.py +324 -0
- duckrun/rle.py +860 -0
- duckrun/runner.py +15 -45
- duckrun/semantic_model.py +143 -17
- duckrun/stats.py +267 -62
- duckrun/writer.py +35 -6
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/METADATA +3 -3
- duckrun-0.2.19.dev5.dist-info/RECORD +16 -0
- duckrun-0.2.13.dist-info/RECORD +0 -14
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/WHEEL +0 -0
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/top_level.txt +0 -0
duckrun/core.py
CHANGED
@@ -4,20 +4,82 @@ import os
 import importlib.util
 import json
 import time
-from deltalake import DeltaTable, write_deltalake
 from typing import List, Tuple, Union, Optional, Callable, Dict, Any
 from string import Template
-import obstore as obs
-from obstore.store import AzureStore
 from datetime import datetime
 from .stats import get_stats as _get_stats
 from .runner import run as _run
 from .files import copy as _copy, download as _download
 from .writer import QueryResult
 
-
+
+class WorkspaceOperationsMixin:
+    """
+    Mixin class for workspace-level operations that work for both
+    full Duckrun connections and workspace-only connections.
+    """
+
+    def import_notebook_from_web(self, url: str,
+                                 notebook_name: Optional[str] = None,
+                                 overwrite: bool = False) -> dict:
+        """
+        Import a Jupyter notebook from a web URL into the workspace.
+
+        Args:
+            url: URL to the notebook file (e.g., GitHub raw URL). Required.
+            notebook_name: Name for the imported notebook. Optional - derived from URL if not provided.
+            overwrite: Whether to overwrite if notebook already exists (default: False)
+
+        Returns:
+            Dictionary with import result
+
+        Examples:
+            con = duckrun.connect("workspace/lakehouse.lakehouse")
+            result = con.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+
+            ws = duckrun.connect("workspace")
+            result = ws.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+        """
+        from .notebook import import_notebook_from_web as _import_notebook_from_web
+
+        # Get workspace name from either self.workspace or self.workspace_name
+        workspace_name = getattr(self, 'workspace', None) or getattr(self, 'workspace_name', None)
+
+        return _import_notebook_from_web(
+            url=url,
+            notebook_name=notebook_name,
+            overwrite=overwrite,
+            workspace_name=workspace_name
+        )
+
+    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+        """Helper method to get workspace ID from name"""
+        try:
+            url = "https://api.fabric.microsoft.com/v1/workspaces"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            return None
+
+        except Exception:
+            return None
+
+
+class Duckrun(WorkspaceOperationsMixin):
     """
-
+    OneLake task runner with clean tuple-based API.
+    Supports lakehouses, warehouses, databases, and other OneLake items.
     Powered by DuckDB for fast data processing.
 
     Task formats:
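The new WorkspaceOperationsMixin makes the notebook-import call available on both connection flavors. A minimal usage sketch, assuming a workspace named "My Workspace" and a placeholder GitHub URL:

import duckrun

# Full lakehouse connection: the method comes from the mixin.
con = duckrun.connect("My Workspace/data.lakehouse")
con.import_notebook_from_web(
    url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",  # placeholder URL
    notebook_name="imported_notebook",   # optional; derived from the URL when omitted
    overwrite=True,
)

# Workspace-only connection: same call, since WorkspaceConnection now inherits the mixin too.
ws = duckrun.connect("My Workspace")
ws.import_notebook_from_web(url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb")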
@@ -30,6 +92,10 @@ class Duckrun:
     dr = Duckrun.connect("workspace/lakehouse.lakehouse") # defaults to dbo schema, lists all tables
     dr.run(pipeline)
 
+    # For other OneLake items:
+    dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
+    dr = Duckrun.connect("workspace/warehouse.Warehouse")
+
     # For data exploration with Spark-style API:
     dr = Duckrun.connect("workspace/lakehouse.lakehouse")
     dr.sql("SELECT * FROM table").show()
@@ -53,7 +119,8 @@ class Duckrun:
 
     def __init__(self, workspace_id: str, lakehouse_id: str, schema: str = "dbo",
                  sql_folder: Optional[str] = None, compaction_threshold: int = 10,
-                 scan_all_schemas: bool = False, storage_account: str = "onelake"
+                 scan_all_schemas: bool = False, storage_account: str = "onelake",
+                 token_only: bool = False):
         # Store GUIDs for internal use
         self.workspace_id = workspace_id
         self.lakehouse_id = lakehouse_id
@@ -62,25 +129,55 @@ class Duckrun:
         self.compaction_threshold = compaction_threshold
         self.scan_all_schemas = scan_all_schemas
         self.storage_account = storage_account
+        self.token_only = token_only
 
-        #
+        # Store both full name (with .ItemType) and display name (without .ItemType) for backward compatibility
+        # lakehouse_id: Full name with suffix for API calls (e.g., "data.Lakehouse")
+        # lakehouse_display_name: Name only without suffix for user code/templates (e.g., "data")
+        self.lakehouse_id = lakehouse_id
+
+        # Extract display name (remove .ItemType suffix if present)
         import re
+        # Check if lakehouse_id has .ItemType suffix
+        if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', lakehouse_id, re.IGNORECASE):
+            # Friendly name - extract base name without suffix
+            for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
+                if lakehouse_id.endswith(suffix):
+                    self.lakehouse_display_name = lakehouse_id[:-len(suffix)]
+                    break
+            else:
+                self.lakehouse_display_name = lakehouse_id
+        else:
+            # GUID - use as is
+            self.lakehouse_display_name = lakehouse_id
+
+        # Construct proper ABFSS URLs
+        # Format: abfss://{workspace}@{storage_account}.dfs.fabric.microsoft.com/{item}/Tables/
+        # where {workspace} and {item} can be:
+        # - Names with .lakehouse suffix (lakehouse optimization when no spaces in workspace)
+        # - GUIDs (when resolved via API for non-lakehouse items or items with spaces)
         guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
-
+
+        # Determine the item URL part for ABFSS
         if guid_pattern.match(lakehouse_id):
-
+            # Already a GUID - use as-is (from API resolution)
+            item_url_part = lakehouse_id
         else:
-            #
-
-
-
-
-            self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Tables/'
-            self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_url_part}/Files/'
+            # Friendly name - use as-is (already includes .ItemType suffix from connect())
+            item_url_part = lakehouse_id
+
+        self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Tables/'
+        self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{item_url_part}/Files/'
 
         # Keep legacy properties for backward compatibility
         self.workspace = workspace_id
-        self.lakehouse_name =
+        self.lakehouse_name = self.lakehouse_display_name # Use display name (without suffix) for backward compatibility
+
+        # Store display name without suffix for backward compatibility with user Python functions
+        # Extract base name by removing .ItemType suffix if present
+        import re
+        suffix_pattern = re.compile(r'\.(Lakehouse|Warehouse|Database|SnowflakeDatabase)$', re.IGNORECASE)
+        self.lakehouse_display_name = suffix_pattern.sub('', lakehouse_id)
 
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
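For a friendly-name connection the constructor's naming rules above boil down to the following. This is a standalone sketch with hypothetical names ("tmp" workspace, "data.Lakehouse" item), not duckrun code:

import re

workspace_id = "tmp"              # no spaces, so the friendly name is used directly
lakehouse_id = "data.Lakehouse"   # normalized name, .ItemType suffix included

# Display name without the suffix (what user code sees as lakehouse_name)
suffix_pattern = re.compile(r'\.(Lakehouse|Warehouse|Database|SnowflakeDatabase)$', re.IGNORECASE)
lakehouse_display_name = suffix_pattern.sub('', lakehouse_id)   # -> "data"

# ABFSS URLs are built from the full item name (or from GUIDs when the API resolved them)
table_base_url = f"abfss://{workspace_id}@onelake.dfs.fabric.microsoft.com/{lakehouse_id}/Tables/"
files_base_url = f"abfss://{workspace_id}@onelake.dfs.fabric.microsoft.com/{lakehouse_id}/Files/"
print(table_base_url)  # abfss://tmp@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Tables/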
@@ -93,25 +190,36 @@ class Duckrun:
         except ImportError:
             pass # Not in Colab, use default transport
 
-
-
+        # Only attach lakehouse and register functions if not token_only mode
+        if not token_only:
+            self._attach_lakehouse()
+            self._register_lookup_functions()
+        else:
+            # In token_only mode, just create the secret for authentication
+            self._create_onelake_secret()
+            print("✓ Token authenticated (fast mode - tables not listed)")
 
     @classmethod
     def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
-                compaction_threshold: int = 100, storage_account: str = "onelake"
+                compaction_threshold: int = 100, storage_account: str = "onelake",
+                token_only: bool = False):
         """
-        Create and connect to lakehouse or workspace.
+        Create and connect to OneLake items (lakehouse, warehouse, database, etc.) or workspace.
 
         Smart detection based on connection string format:
         - "workspace" → workspace management only
-        - "ws/
-        - "ws/
+        - "ws/item.lakehouse/schema" → lakehouse connection with specific schema
+        - "ws/item.lakehouse" → lakehouse connection (defaults to dbo schema)
+        - "ws/item.warehouse" → warehouse connection
+        - "ws/item.database" → database connection
+        - "ws/item.snowflakedatabase" → Snowflake database connection
 
         Args:
            connection_string: OneLake path or workspace name
            sql_folder: Optional path or URL to SQL files folder
            compaction_threshold: File count threshold for compaction
            storage_account: Storage account name (default: "onelake")
+           token_only: If True, only authenticate without listing tables (faster connection)
 
         Examples:
            # Workspace management only (supports spaces in names)
@@ -119,16 +227,26 @@ class Duckrun:
            ws.list_lakehouses()
            ws.create_lakehouse_if_not_exists("New Lakehouse")
 
-           #
+           # Lakehouse connections (supports spaces in names)
            dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/schema", sql_folder="./sql")
            dr = Duckrun.connect("Data Workspace/Sales Data.lakehouse/analytics") # spaces supported
            dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse") # defaults to dbo schema
            dr = Duckrun.connect("workspace/lakehouse.lakehouse", storage_account="xxx-onelake") # custom storage
 
+           # Warehouse and database connections (always uses API to resolve GUIDs)
+           dr = Duckrun.connect("SNOWFLAKE/ONELAKEUSEAST.SnowflakeDatabase")
+           dr = Duckrun.connect("My Workspace/My Warehouse.Warehouse")
+           dr = Duckrun.connect("workspace/database.Database")
+
+           # Fast connection without table listing (token only)
+           dr = Duckrun.connect("workspace/lakehouse.lakehouse", token_only=True)
+
        Note:
-
-
-
+           - Lakehouse items without spaces in workspace name use optimization (no API calls)
+           - Non-lakehouse items always resolve to GUIDs via Fabric API
+           - Internally constructs proper ABFSS URLs:
+             "My Workspace/My Item.lakehouse/schema" →
+             "abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/item_guid/Tables/schema"
        """
 
        # Check if it's a workspace-only connection (no "/" means workspace name only)
@@ -137,70 +255,94 @@ class Duckrun:
 
            scan_all_schemas = False
 
-       # Parse
-       # Support workspace and
+       # Parse connection string: "ws/item_name.item_type/schema" or "ws/item_name.item_type"
+       # Support workspace and item names with spaces
+       # Item types: .lakehouse, .Lakehouse, .warehouse, .Warehouse, .database, .Database, .snowflakedatabase, .SnowflakeDatabase
        parts = connection_string.split("/")
        if len(parts) == 2:
-           workspace_name,
+           workspace_name, item_name_with_type = parts
            scan_all_schemas = True
           schema = "dbo"
        elif len(parts) == 3:
-           workspace_name,
+           workspace_name, item_name_with_type, schema = parts
        else:
           raise ValueError(
               f"Invalid connection string format: '{connection_string}'. "
               "Expected formats:\n"
               " 'workspace name' (workspace management only)\n"
-               " 'workspace name/
-               " 'workspace name/
+               " 'workspace name/item name.item_type' (item with dbo schema)\n"
+               " 'workspace name/item name.item_type/schema' (item with specific schema)\n"
+               "Supported item types: .lakehouse, .warehouse, .database, .snowflakedatabase (case-insensitive)"
           )
 
-
-
+       # Extract item type and name
+       item_type = None
+       item_name = item_name_with_type
+
+       # Check for known item types (case-insensitive)
+       item_type_map = {
+           '.lakehouse': 'Lakehouse',
+           '.warehouse': 'Warehouse',
+           '.database': 'Database',
+           '.snowflakedatabase': 'SnowflakeDatabase'
+       }
 
-
+       # Parse item type and normalize the suffix to proper case
+       item_name_normalized = item_name_with_type
+       for suffix, mapped_type in item_type_map.items():
+           if item_name_with_type.lower().endswith(suffix):
+               item_type = mapped_type
+               item_name = item_name_with_type[:-len(suffix)]
+               # Normalize to proper case: ItemName.ItemType (e.g., data.Lakehouse)
+               item_name_normalized = f"{item_name}.{mapped_type}"
+               break
+
+       if not workspace_name or not item_name:
           raise ValueError(
               "Missing required parameters. Use one of these formats:\n"
               " connect('workspace name') # workspace management\n"
-               " connect('workspace name/
-               " connect('workspace name/
+               " connect('workspace name/item name.item_type/schema') # full item connection\n"
+               " connect('workspace name/item name.item_type') # defaults to dbo"
           )
 
-       #
-
+       # Per OneLake API docs: Can use friendly names if no spaces/special characters
+       # Otherwise must resolve to GUIDs
+       # Check for spaces or special characters that would require GUID resolution
+       has_special_chars = " " in workspace_name or " " in item_name
+
+       if has_special_chars:
+           # Names have spaces/special chars: resolve to GUIDs via API
+           workspace_id, item_id = cls._resolve_names_to_guids(workspace_name, item_name, item_type)
+       else:
+           # No spaces/special chars: use friendly names directly (works for all item types)
+           # Use normalized name with proper case for API compatibility
+           workspace_id = workspace_name
+           item_id = item_name_normalized # Use normalized with proper case
 
-       return cls(workspace_id,
+       return cls(workspace_id, item_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account, token_only)
 
    @classmethod
-   def _resolve_names_to_guids(cls, workspace_name: str,
+   def _resolve_names_to_guids(cls, workspace_name: str, item_name: str, item_type: Optional[str] = 'Lakehouse') -> tuple[str, str]:
        """
-       Resolve friendly workspace and
-
-       Optimization: If names don't contain spaces, use them directly (no API calls needed).
-       Only resolve to GUIDs when names contain spaces or are already GUIDs.
+       Resolve friendly workspace and item names to their GUIDs.
 
        Args:
           workspace_name: Display name of the workspace (can contain spaces)
-
+           item_name: Display name of the item (can contain spaces)
+           item_type: Type of item - 'Lakehouse', 'Warehouse', 'Database', 'SnowflakeDatabase', etc.
 
        Returns:
-           Tuple of (workspace_id,
+           Tuple of (workspace_id, item_id) - resolved GUIDs
        """
 
       # Check if names are already GUIDs first
       import re
       guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
 
-       if guid_pattern.match(workspace_name) and guid_pattern.match(
-           return workspace_name,
-
-       # Optimization: If workspace name has no spaces, use both names directly (old behavior)
-       # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
-       if " " not in workspace_name:
-           return workspace_name, lakehouse_name
-
-       # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
+       if guid_pattern.match(workspace_name) and guid_pattern.match(item_name):
+           return workspace_name, item_name
 
+       # Need to resolve to GUIDs via API
       try:
           # Get authentication token using enhanced auth system
           from .auth import get_fabric_api_token
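The suffix handling in connect() reduces to a small lookup plus normalization. A standalone sketch of that logic, mirroring the item_type_map above (the inputs are made up):

ITEM_TYPE_MAP = {
    '.lakehouse': 'Lakehouse',
    '.warehouse': 'Warehouse',
    '.database': 'Database',
    '.snowflakedatabase': 'SnowflakeDatabase',
}

def split_item(item_name_with_type: str):
    """Return (item_name, item_type, normalized_name) for a 'name.suffix' string."""
    for suffix, mapped_type in ITEM_TYPE_MAP.items():
        if item_name_with_type.lower().endswith(suffix):
            base = item_name_with_type[:-len(suffix)]
            return base, mapped_type, f"{base}.{mapped_type}"
    return item_name_with_type, None, item_name_with_type

print(split_item("sales.warehouse"))                  # ('sales', 'Warehouse', 'sales.Warehouse')
print(split_item("ONELAKEUSEAST.SnowflakeDatabase"))  # ('ONELAKEUSEAST', 'SnowflakeDatabase', ...)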
@@ -218,8 +360,7 @@ class Duckrun:
 
            # Resolve workspace name to ID
            if current_workspace_id:
-               # In notebook environment,
-               # but we should validate it matches the requested workspace name
+               # In notebook environment, validate it matches the requested workspace name
                workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
                if not workspace_id:
                    # Fallback to current workspace if name resolution fails
@@ -231,21 +372,26 @@ class Duckrun:
            if not workspace_id:
                raise ValueError(f"Workspace '{workspace_name}' not found")
 
-           # Resolve
-
-
-
+           # Resolve item name to ID based on item type
+           if item_type == 'Lakehouse':
+               item_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, item_name)
+           else:
+               # Use generic item resolver for non-lakehouse items
+               item_id = cls._resolve_item_id_by_name(token, workspace_id, item_name, item_type)
+
+           if not item_id:
+               raise ValueError(f"{item_type} '{item_name}' not found in workspace '{workspace_name}'")
 
-           return workspace_id,
+           return workspace_id, item_id
 
        except Exception as e:
            print(f"❌ Failed to resolve names to GUIDs: {e}")
-           print(f"❌ Cannot
-           print("❌ Microsoft Fabric requires actual workspace and
+           print(f"❌ Cannot resolve '{workspace_name}'/'{item_name}' ({item_type}) to GUIDs")
+           print("❌ Microsoft Fabric requires actual workspace and item GUIDs for ABFSS access")
            raise ValueError(
-               f"Unable to resolve workspace '{workspace_name}' and
-               f"ABFSS URLs require actual GUIDs
-               f"Please ensure you have proper authentication and the workspace/
+               f"Unable to resolve workspace '{workspace_name}' and {item_type.lower()} '{item_name}' to GUIDs. "
+               f"ABFSS URLs require actual GUIDs. "
+               f"Please ensure you have proper authentication and the workspace/item names are correct."
            )
 
    @classmethod
@@ -287,6 +433,58 @@ class Duckrun:
            return None
        except Exception:
            return None
+
+   @classmethod
+   def _resolve_item_id_by_name(cls, token: str, workspace_id: str, item_name: str, item_type: str) -> Optional[str]:
+       """
+       Get item ID from display name within a workspace using generic items API.
+       Works for any item type: Warehouse, Database, SnowflakeDatabase, etc.
+
+       Args:
+           token: Fabric API authentication token
+           workspace_id: Workspace GUID
+           item_name: Display name of the item
+           item_type: Type of item (e.g., 'Warehouse', 'Database', 'SnowflakeDatabase')
+
+       Returns:
+           Item GUID if found, None otherwise
+       """
+       try:
+           import requests
+           # Use generic items API with type filter
+           url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items"
+           headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+           # Add type filter as query parameter
+           params = {"type": item_type}
+
+           print(f" Searching for {item_type} '{item_name}' in workspace {workspace_id}")
+           print(f" API URL: {url}?type={item_type}")
+
+           response = requests.get(url, headers=headers, params=params)
+           response.raise_for_status()
+
+           result = response.json()
+           items = result.get("value", [])
+
+           print(f" Found {len(items)} items of type {item_type}")
+           if items:
+               print(f" Available items: {[item.get('displayName') for item in items]}")
+
+           for item in items:
+               if item.get("displayName") == item_name:
+                   item_id = item.get("id")
+                   print(f" Found matching item: {item_name} -> {item_id}")
+                   return item_id
+
+           print(f" Item '{item_name}' not found in the list")
+           return None
+       except Exception as e:
+           print(f" Error resolving {item_type} item: {e}")
+           if hasattr(e, 'response') and e.response is not None:
+               print(f" Response status: {e.response.status_code}")
+               print(f" Response body: {e.response.text}")
+           return None
 
    @classmethod
    def connect_workspace(cls, workspace_name: str):
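Stripped of the diagnostic prints, the new resolver is a single REST call against the generic items endpoint shown above. A condensed sketch of that call:

import requests
from typing import Optional

def resolve_item_id(token: str, workspace_id: str, item_name: str, item_type: str) -> Optional[str]:
    # List items of the requested type in the workspace, then match on displayName.
    url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items"
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    response = requests.get(url, headers=headers, params={"type": item_type})
    response.raise_for_status()
    for item in response.json().get("value", []):
        if item.get("displayName") == item_name:
            return item.get("id")
    return None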
@@ -328,77 +526,138 @@ class Duckrun:
 
    def _discover_tables_fast(self) -> List[Tuple[str, str]]:
        """
-       Fast
-
+       Fast table discovery using OneLake Delta Table API (Unity Catalog compatible).
+       Uses: https://learn.microsoft.com/en-us/fabric/onelake/table-apis/delta-table-apis-overview
 
        Returns:
            List of tuples: [(schema, table_name), ...]
        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+       try:
+           # Get storage token for OneLake
+           token = self._get_storage_token()
+           if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
+               print("Authenticating with Azure for table discovery...")
+               from .auth import get_token
+               token = get_token()
+               if not token:
+                   print("❌ Failed to authenticate for table discovery")
+                   return []
+
+           # OneLake Delta Table API endpoint (Unity Catalog compatible)
+           base_url = "https://onelake.table.fabric.microsoft.com/delta"
+
+           # Determine workspace/item identifier for API
+           # Per docs: Can use friendly names (WorkspaceName/ItemName.ItemType) if no special characters
+           # Otherwise must use GUIDs (WorkspaceID/ItemID)
+           import re
+           guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+
+           # Check if we're using GUIDs or friendly names
+           if guid_pattern.match(self.workspace_id) and guid_pattern.match(self.lakehouse_id):
+               # Using GUIDs - use them directly in API
+               workspace_identifier = self.workspace_id
+               item_identifier = self.lakehouse_id
+               catalog_name = self.lakehouse_id
            else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-               schema_path = f"{base_path}{schema_name}/"
-               result = obs.list_with_delimiter(store, prefix=schema_path)
+               # Using friendly names - lakehouse_id already includes .ItemType suffix
+               workspace_identifier = self.workspace_id
+               item_identifier = self.lakehouse_id
+               catalog_name = self.lakehouse_id
+
+           print(f"🔍 Discovering tables via OneLake Delta Table API...")
+           print(f" Using identifier: {workspace_identifier}/{item_identifier}")
+
+           tables_found = []
+
+           if self.scan_all_schemas:
+               # First, list all schemas
+               schemas_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/schemas"
+               params = {"catalog_name": catalog_name}
+               headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
 
-
-
-
-
-
-
-
-
-
+               schemas_response = requests.get(schemas_url, headers=headers, params=params)
+
+               if schemas_response.status_code == 200:
+                   schemas_result = schemas_response.json()
+                   schemas = schemas_result.get("schemas", [])
+                   schema_names = [s.get("name") for s in schemas if s.get("name")]
+
+                   print(f" Found {len(schema_names)} schemas: {schema_names}")
+
+                   # Get tables from each schema
+                   for schema_name in schema_names:
+                       tables_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/tables"
+                       tables_params = {
+                           "catalog_name": catalog_name,
+                           "schema_name": schema_name
+                       }
+
+                       tables_response = requests.get(tables_url, headers=headers, params=tables_params)
+
+                       if tables_response.status_code == 200:
+                           tables_result = tables_response.json()
+                           tables = tables_result.get("tables", [])
+
+                           for table in tables:
+                               table_name = table.get("name", "")
+                               if table_name:
+                                   tables_found.append((schema_name, table_name))
+
+                           if tables:
+                               print(f" Schema '{schema_name}': {len(tables)} tables")
+               else:
+                   print(f" Failed to list schemas: {schemas_response.status_code}")
+                   if schemas_response.status_code != 404:
+                       print(f" Response: {schemas_response.text[:300]}")
+           else:
+               # Single schema mode - list tables in specific schema
+               tables_url = f"{base_url}/{workspace_identifier}/{item_identifier}/api/2.1/unity-catalog/tables"
+               params = {
+                   "catalog_name": catalog_name,
+                   "schema_name": self.schema
+               }
+               headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+               print(f" Listing tables in schema: {self.schema}")
+               tables_response = requests.get(tables_url, headers=headers, params=params)
+
+               if tables_response.status_code == 200:
+                   tables_result = tables_response.json()
+                   tables = tables_result.get("tables", [])
+
+                   for table in tables:
+                       table_name = table.get("name", "")
+                       if table_name:
+                           tables_found.append((self.schema, table_name))
+
+                   print(f" Found {len(tables)} tables")
+               elif tables_response.status_code == 404:
+                   print(f" Schema '{self.schema}' not found or has no tables")
+               else:
+                   print(f" Failed to list tables: {tables_response.status_code}")
+                   print(f" Response: {tables_response.text[:300]}")
 
-
-
-
-
-
-
+           return tables_found
+
+       except Exception as e:
+           print(f"❌ Error during table discovery: {e}")
+           import traceback
+           traceback.print_exc()
+           return []
 
    def _attach_lakehouse(self):
        """Attach lakehouse tables as DuckDB views using fast discovery"""
+       print(f"🔌 Attaching tables from schema: {self.schema if not self.scan_all_schemas else 'all schemas'}")
        self._create_onelake_secret()
 
        try:
            tables = self._discover_tables_fast()
 
            if not tables:
+               if self.scan_all_schemas:
+                   print(f"⚠️ No tables found in any schema")
+               else:
+                   print(f"⚠️ No tables found in {self.schema} schema")
                return
 
            # Collect table names for display
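The single-schema branch of the new discovery path is essentially one request to the Unity Catalog-compatible tables endpoint used above. A condensed sketch of that request (token, workspace, and item are whatever the connection already resolved):

import requests
from typing import List, Tuple

def list_schema_tables(token: str, workspace: str, item: str, schema: str = "dbo") -> List[Tuple[str, str]]:
    base_url = "https://onelake.table.fabric.microsoft.com/delta"
    url = f"{base_url}/{workspace}/{item}/api/2.1/unity-catalog/tables"
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    params = {"catalog_name": item, "schema_name": schema}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        return []  # 404 means the schema is missing or has no tables
    return [(schema, t["name"]) for t in response.json().get("tables", []) if t.get("name")]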
@@ -421,6 +680,7 @@ class Duckrun:
                        AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                    """)
                except Exception as e:
+                   print(f"⚠️ Failed to attach table {schema_name}.{table_name}: {e}")
                    continue
 
            # Print discovered tables as comma-separated list
@@ -429,6 +689,8 @@ class Duckrun:
 
        except Exception as e:
            print(f"❌ Error attaching lakehouse: {e}")
+           import traceback
+           traceback.print_exc()
 
    def _register_lookup_functions(self):
        """
@@ -567,32 +829,129 @@ class Duckrun:
 
        # Register functions in DuckDB
        try:
-           self.con.create_function("get_workspace_name", get_workspace_name)
-           self.con.create_function("get_lakehouse_name", get_lakehouse_name)
-           self.con.create_function("get_workspace_id_from_name", get_workspace_id_from_name)
-           self.con.create_function("get_lakehouse_id_from_name", get_lakehouse_id_from_name)
+           self.con.create_function("get_workspace_name", get_workspace_name, null_handling='SPECIAL')
+           self.con.create_function("get_lakehouse_name", get_lakehouse_name, null_handling='SPECIAL')
+           self.con.create_function("get_workspace_id_from_name", get_workspace_id_from_name, null_handling='SPECIAL')
+           self.con.create_function("get_lakehouse_id_from_name", get_lakehouse_id_from_name, null_handling='SPECIAL')
        except Exception as e:
            print(f"⚠️ Warning: Could not register lookup functions: {e}")
 
-   def get_workspace_id(self) -> str:
+   def get_workspace_id(self, force: bool = False) -> str:
        """
        Get the workspace ID (GUID or name without spaces).
        Use this when passing workspace parameter to Python functions.
 
+       Args:
+           force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
+
        Returns:
            Workspace ID - either a GUID or workspace name without spaces
        """
+       if not force:
+           return self.workspace_id
+
+       # Force resolution to GUID
+       import re
+       guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+
+       # If already a GUID, return it
+       if guid_pattern.match(self.workspace_id):
+           return self.workspace_id
+
+       # Try to get from notebook context first (fastest)
+       try:
+           import notebookutils # type: ignore
+           workspace_guid = notebookutils.runtime.context.get("workspaceId")
+           if workspace_guid:
+               return workspace_guid
+       except ImportError:
+           pass
+
+       # Resolve via API
+       try:
+           from .auth import get_fabric_api_token
+           token = get_fabric_api_token()
+           if token:
+               resolved_id = self._resolve_workspace_id_by_name(token, self.workspace_id)
+               if resolved_id:
+                   return resolved_id
+       except Exception:
+           pass
+
+       # Fallback to original value
        return self.workspace_id
 
-   def
+   def get_item_id(self, force: bool = False) -> str:
        """
-       Get the
-       Use this when passing lakehouse parameter to Python functions.
+       Get the item ID (GUID or name) - works for lakehouses, warehouses, databases, etc.
+       Use this when passing lakehouse/item parameter to Python functions.
+
+       Args:
+           force: If True, always resolve to actual GUID via API. If False, returns stored value (default: False)
 
        Returns:
-
+           Item ID - either a GUID or item name (supports all OneLake item types)
        """
+       if not force:
+           return self.lakehouse_id
+
+       # Force resolution to GUID
+       import re
+       guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+
+       # If already a GUID, return it
+       if guid_pattern.match(self.lakehouse_id):
+           return self.lakehouse_id
+
+       # Detect item type from lakehouse_id (e.g., "data.Lakehouse" -> Lakehouse)
+       item_type = None
+       item_name = self.lakehouse_id
+       for suffix in ['.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase']:
+           if self.lakehouse_id.endswith(suffix):
+               item_type = suffix[1:] # Remove the leading dot
+               item_name = self.lakehouse_id[:-len(suffix)]
+               break
+
+       # Try to get from notebook context first (only works for lakehouses)
+       if item_type == 'Lakehouse' or item_type is None:
+           try:
+               import notebookutils # type: ignore
+               lakehouse_guid = notebookutils.lakehouse.get("id")
+               if lakehouse_guid:
+                   return lakehouse_guid
+           except (ImportError, Exception):
+               pass
+
+       # Resolve via API
+       try:
+           from .auth import get_fabric_api_token
+           token = get_fabric_api_token()
+           if token:
+               # First get workspace GUID
+               workspace_guid = self.get_workspace_id(force=True)
+
+               # Use appropriate resolver based on item type
+               if item_type == 'Lakehouse' or item_type is None:
+                   # Use lakehouse-specific API
+                   resolved_id = self._resolve_lakehouse_id_by_name(token, workspace_guid, item_name if item_name else self.lakehouse_id)
+               else:
+                   # Use generic items API for warehouses, databases, etc.
+                   resolved_id = self._resolve_item_id_by_name(token, workspace_guid, item_name, item_type)
+
+               if resolved_id:
+                   return resolved_id
+       except Exception:
+           pass
+
+       # Fallback to original value
        return self.lakehouse_id
+
+   def get_lakehouse_id(self, force: bool = False) -> str:
+       """
+       Deprecated: Use get_item_id() instead.
+       Backward compatibility alias for get_item_id().
+       """
+       return self.get_item_id(force)
 
    def run(self, pipeline: List[Tuple]) -> bool:
        """
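In practice the force flag separates the cheap accessor from real GUID resolution. A usage sketch (the connection string here is hypothetical):

import duckrun

con = duckrun.connect("tmp/data.lakehouse")

# Default: returns whatever the connection stores (friendly name or GUID), no API call.
ws_ref = con.get_workspace_id()
item_ref = con.get_item_id()

# force=True resolves to real GUIDs via the notebook context or the Fabric API.
ws_guid = con.get_workspace_id(force=True)
item_guid = con.get_item_id(force=True)

# The old name remains as a thin alias.
legacy_guid = con.get_lakehouse_id(force=True)   # same as get_item_id(force=True)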
@@ -676,33 +1035,44 @@ class Duckrun:
        """Get underlying DuckDB connection"""
        return self.con
 
-   def get_stats(self, source: str):
+   def get_stats(self, source: str = None, detailed = False):
        """
        Get comprehensive statistics for Delta Lake tables.
 
        Args:
-           source: Can be one of:
+           source: Optional. Can be one of:
+               - None: Use all tables in the connection's schema (default)
               - Table name: 'table_name' (uses current schema)
               - Schema.table: 'schema.table_name' (specific table in schema)
               - Schema only: 'schema' (all tables in schema)
+           detailed: Optional. Controls the level of detail in statistics:
+               - False (default): Aggregated table-level stats
+               - True: Row group level statistics with compression details
 
        Returns:
-
-
+           DataFrame with statistics based on detailed parameter:
+           - If detailed=False: Aggregated table-level summary
+           - If detailed=True: Granular file and row group level stats
 
        Examples:
           con = duckrun.connect("tmp/data.lakehouse/aemo")
 
-           #
+           # All tables in current schema (aemo) - aggregated
+           stats = con.get_stats()
+
+           # Single table in current schema - aggregated
           stats = con.get_stats('price')
 
+           # Single table with detailed row group statistics
+           stats_detailed = con.get_stats('price', detailed=True)
+
           # Specific table in different schema
           stats = con.get_stats('aemo.price')
 
           # All tables in a schema
           stats = con.get_stats('aemo')
        """
-       return _get_stats(self, source)
+       return _get_stats(self, source, detailed)
 
    def list_lakehouses(self) -> List[str]:
        """
@@ -816,7 +1186,7 @@ class Duckrun:
            return False
 
    def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-              wait_seconds: int = 5) -> int:
+              wait_seconds: int = 5, refresh: str = "full") -> int:
        """
        Deploy a semantic model from a BIM file using DirectLake mode.
 
@@ -825,8 +1195,11 @@ class Duckrun:
               - URL: "https://raw.githubusercontent.com/.../model.bim"
               - Local file: "model.bim"
               - Workspace/Model: "workspace_name/model_name"
-           dataset_name: Name for the semantic model (default:
+           dataset_name: Name for the semantic model (default: schema name)
           wait_seconds: Seconds to wait for permission propagation (default: 5)
+           refresh: Refresh strategy:
+               - "full": Clear values and process full refresh (default)
+               - "ignore": Skip refresh entirely
 
        Returns:
            1 for success, 0 for failure
@@ -834,14 +1207,17 @@ class Duckrun:
        Examples:
           dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
 
+           # Deploy with schema name as dataset name (dbo)
+           dr.deploy("https://github.com/.../model.bim")
+
           # Deploy from workspace/model (uses same name by default)
           dr.deploy("Source Workspace/Source Model") # Creates "Source Model"
 
           # Deploy with custom name
-           dr.deploy("
+           dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
 
-           # Deploy
-           dr.deploy("https://
+           # Deploy without refresh
+           dr.deploy("https://github.com/.../model.bim", refresh="ignore")
        """
        from .semantic_model import deploy_semantic_model
 
@@ -853,9 +1229,9 @@ class Duckrun:
            if len(parts) == 2:
                dataset_name = parts[1] # Use the model name
            else:
-               dataset_name =
+               dataset_name = self.schema # Use schema name
        else:
-           dataset_name =
+           dataset_name = self.schema # Use schema name
 
        # Call the deployment function (DirectLake only)
        return deploy_semantic_model(
@@ -864,36 +1240,203 @@ class Duckrun:
            schema_name=self.schema,
            dataset_name=dataset_name,
            bim_url_or_path=bim_url,
-           wait_seconds=wait_seconds
+           wait_seconds=wait_seconds,
+           refresh=refresh
        )
 
-   def
-
-
-
-
+   def rle(self, table_name: str = None, mode = "natural",
+           min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
+           max_ordering_depth: int = 3, limit: int = None):
+       """
+       Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
+
+       Args:
+           table_name: Name of the table to analyze. Can be:
+               - 'table_name' (uses current schema)
+               - 'schema.table_name' (specific schema)
+           mode: Analysis mode or column ordering:
+               - "natural": Calculate RLE for natural order only (fastest)
+               - "auto": Natural order + cardinality-based ordering (recommended)
+               - "advanced": Natural + cardinality + greedy incremental search (most thorough)
+               - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
+           min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
+           max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
+           max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
+           limit: Optional row limit for testing/development (default: None, analyzes all rows)
+
+       Returns:
+           DataFrame with RLE analysis results
+
+       Examples:
+           # Natural order only (baseline)
+           con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
+           con.rle("mytable") # same as con.rle("mytable", "natural")
 
-
-
+           # Auto optimization (natural + cardinality-based)
+           con.rle("mytable", "auto")
 
-
-
-
-
+           # Advanced optimization (greedy incremental search)
+           con.rle("mytable", "advanced")
+
+           # Test specific column ordering
+           con.rle("mytable", ["date", "duid"])
+           con.rle("mytable", ["cutoff", "time", "DUID", "date"])
+
+           # Advanced with custom depth
+           con.rle("mytable", "advanced", max_ordering_depth=4)
+
+           # Analyze table from different schema
+           con.rle("otherschema.mytable", "auto")
+
+           # Custom thresholds for small tables
+           con.rle("mytable", "auto", max_cardinality_pct=0.05)
 
+           # Limit rows for testing
+           con.rle("mytable", "auto", limit=10000)
+       """
+       from .rle import (
+           calculate_cardinality_ratio,
+           test_column_orderings_smart,
+           calculate_rle_for_columns
+       )
+       from deltalake import DeltaTable
+
+       # Parse table name and construct path
+       if table_name is None:
+           if mode != "summary":
+               print("⚠️ Table name is required for 'smart' and 'full' modes")
+               return None
+           # TODO: Implement all-tables summary
+           print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
            return None
+
+       # Parse schema.table or just table
+       if '.' in table_name:
+           schema_name, tbl = table_name.split('.', 1)
+       else:
+           schema_name = self.schema
+           tbl = table_name
+
+       # Construct the full table path using the same logic as get_stats
+       table_path = f"{self.table_base_url}{schema_name}/{tbl}"
+
+       # Verify table exists and is not empty
+       print(f"📊 Analyzing table: {schema_name}.{tbl}")
+
+       try:
+           dt = DeltaTable(table_path)
+           delta_files = dt.files()
 
-
+           if not delta_files:
+               print("⚠️ Table is empty (no files)")
+               return None
+
+       except Exception as e:
+           print(f"❌ Error accessing Delta table: {e}")
            return None
+
+       # Check if mode is a list of columns (custom ordering)
+       if isinstance(mode, list):
+           # User wants to test a specific column ordering
+           print(f"Testing custom column ordering: {', '.join(mode)}")
+
+           # Calculate cardinality for NDV values
+           card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
+
+           # Calculate RLE for the specified ordering
+           rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
+
+           total_rle_all = sum(rle_counts.values())
+
+           print(f"\nResults:")
+           print(f" Custom ordering: [{', '.join(mode)}]")
+           print(f" Total RLE (all columns): {total_rle_all:,} runs")
+
+           # Return as DataFrame for consistency
+           import pandas as pd
+           results = [{
+               'schema': schema_name,
+               'table': tbl,
+               'sort_order': 'custom',
+               'columns_used': ', '.join(mode),
+               'total_rle_all': total_rle_all,
+               **rle_counts
+           }]
+
+           df = pd.DataFrame(results)
+
+           # Transform to long format
+           long_format_results = []
+
+           for _, row in df.iterrows():
+               schema_val = row['schema']
+               table_val = row['table']
+               sort_order = row['sort_order']
+               columns_used = row['columns_used']
+               total_rle_all_val = row['total_rle_all']
+
+               # Get all column names except metadata columns
+               metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
+               data_columns = [col for col in df.columns if col not in metadata_cols]
+
+               # Get total rows from card_stats if available
+               total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
+
+               # Parse the columns_used to get ordering
+               sort_columns_list = [c.strip() for c in columns_used.split(',')]
+
+               # Create one row per data column
+               for col in data_columns:
+                   rle_value = row[col]
+
+                   # Get NDV from card_stats
+                   ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
+
+                   # Determine if column was included in the sort and its position
+                   is_in_sort = col in sort_columns_list
+                   order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
+                   comment = '' if is_in_sort else 'not included in the sort'
+
+                   long_format_results.append({
+                       'schema': schema_val,
+                       'table': table_val,
+                       'sort_type': sort_order,
+                       'column': col,
+                       'order': order_position,
+                       'RLE': rle_value,
+                       'NDV': ndv_value,
+                       'total_rows': total_rows,
+                       'total_RLE': total_rle_all_val,
+                       'comments': comment
+                   })
+
+           long_df = pd.DataFrame(long_format_results)
+
+           return long_df
+
+       # All modes now use test_column_orderings_smart with the mode parameter
+       return test_column_orderings_smart(
+           self.con,
+           table_path,
+           table_name=table_name, # Pass table name for cardinality calculation on full dataset
+           mode=mode,
+           limit=limit,
+           min_distinct_threshold=min_distinct_threshold,
+           max_cardinality_pct=max_cardinality_pct,
+           max_ordering_depth=max_ordering_depth,
+           schema_name=schema_name,
+           table_display_name=tbl
+       )
 
    def close(self):
        """Close DuckDB connection"""
+
        if self.con:
            self.con.close()
            print("Connection closed")
 
 
-class WorkspaceConnection:
+class WorkspaceConnection(WorkspaceOperationsMixin):
    """
    Simple workspace connection for lakehouse management operations.
    """
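The helpers in duckrun/rle.py are not shown in this diff, so as background only: the RLE metric counts the number of value runs per column once rows are sorted by a chosen ordering; fewer runs mean better run-length compression. A small DuckDB illustration of the metric itself, not the library's implementation:

import duckdb

con = duckdb.connect()
con.sql("CREATE TABLE t AS SELECT * FROM (VALUES ('a', 1), ('a', 2), ('b', 2), ('b', 2)) v(duid, price)")

# A run starts at the first row and whenever the value differs from the previous row
# under the chosen sort order.
runs = con.sql("""
    SELECT COUNT(*) AS rle_runs
    FROM (SELECT duid, LAG(duid) OVER (ORDER BY duid, price) AS prev FROM t)
    WHERE prev IS NULL OR duid IS DISTINCT FROM prev
""").fetchone()[0]
print(runs)  # 2 runs for 'duid' when sorted by (duid, price)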
@@ -1133,23 +1676,4 @@ class WorkspaceConnection:
            print(f"❌ Error downloading semantic model: {e}")
            import traceback
            traceback.print_exc()
-           return None
-
-   def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-       """Helper method to get workspace ID from name"""
-       try:
-           url = "https://api.fabric.microsoft.com/v1/workspaces"
-           headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
-           response = requests.get(url, headers=headers)
-           response.raise_for_status()
-
-           workspaces = response.json().get("value", [])
-           for workspace in workspaces:
-               if workspace.get("displayName") == workspace_name:
-                   return workspace.get("id")
-
-           return None
-
-       except Exception:
-           return None
            return None