duckrun 0.2.4__py3-none-any.whl → 0.2.5.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/__init__.py +1 -1
- duckrun/core.py +462 -31
- duckrun/files.py +4 -4
- duckrun/lakehouse.py +402 -0
- duckrun/runner.py +22 -1
- duckrun/stats.py +2 -2
- {duckrun-0.2.4.dist-info → duckrun-0.2.5.dev1.dist-info}/METADATA +1 -1
- duckrun-0.2.5.dev1.dist-info/RECORD +12 -0
- duckrun-0.2.4.dist-info/RECORD +0 -11
- {duckrun-0.2.4.dist-info → duckrun-0.2.5.dev1.dist-info}/WHEEL +0 -0
- {duckrun-0.2.4.dist-info → duckrun-0.2.5.dev1.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.4.dist-info → duckrun-0.2.5.dev1.dist-info}/top_level.txt +0 -0
duckrun/__init__.py
CHANGED
duckrun/core.py
CHANGED
@@ -49,17 +49,27 @@ class Duckrun:
|
|
49
49
|
]
|
50
50
|
"""
|
51
51
|
|
52
|
-
def __init__(self,
|
52
|
+
def __init__(self, workspace_id: str, lakehouse_id: str, schema: str = "dbo",
|
53
53
|
sql_folder: Optional[str] = None, compaction_threshold: int = 10,
|
54
54
|
scan_all_schemas: bool = False, storage_account: str = "onelake"):
|
55
|
-
|
56
|
-
self.
|
55
|
+
# Store GUIDs for internal use
|
56
|
+
self.workspace_id = workspace_id
|
57
|
+
self.lakehouse_id = lakehouse_id
|
57
58
|
self.schema = schema
|
58
59
|
self.sql_folder = sql_folder.strip() if sql_folder else None
|
59
60
|
self.compaction_threshold = compaction_threshold
|
60
61
|
self.scan_all_schemas = scan_all_schemas
|
61
62
|
self.storage_account = storage_account
|
62
|
-
|
63
|
+
|
64
|
+
# Construct proper ABFSS URLs using GUIDs
|
65
|
+
# Both Tables and Files use lakehouse GUID directly (no .Lakehouse suffix)
|
66
|
+
self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Tables/'
|
67
|
+
self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Files/'
|
68
|
+
|
69
|
+
# Keep legacy properties for backward compatibility
|
70
|
+
self.workspace = workspace_id
|
71
|
+
self.lakehouse_name = lakehouse_id
|
72
|
+
|
63
73
|
self.con = duckdb.connect()
|
64
74
|
self.con.sql("SET preserve_insertion_order = false")
|
65
75
|
self._attach_lakehouse()
|
@@ -68,57 +78,218 @@ class Duckrun:
|
|
68
78
|
def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
|
69
79
|
compaction_threshold: int = 100, storage_account: str = "onelake"):
|
70
80
|
"""
|
71
|
-
Create and connect to lakehouse.
|
81
|
+
Create and connect to lakehouse or workspace.
|
72
82
|
|
73
|
-
|
83
|
+
Smart detection based on connection string format:
|
84
|
+
- "workspace" → workspace management only
|
85
|
+
- "ws/lh.lakehouse/schema" → full lakehouse connection
|
86
|
+
- "ws/lh.lakehouse" → lakehouse connection (defaults to dbo schema)
|
74
87
|
|
75
88
|
Args:
|
76
|
-
connection_string: OneLake path
|
77
|
-
sql_folder: Optional path or URL to SQL files folder
|
89
|
+
connection_string: OneLake path or workspace name
|
90
|
+
sql_folder: Optional path or URL to SQL files folder
|
78
91
|
compaction_threshold: File count threshold for compaction
|
79
92
|
storage_account: Storage account name (default: "onelake")
|
80
93
|
|
81
94
|
Examples:
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
95
|
+
# Workspace management only (supports spaces in names)
|
96
|
+
ws = Duckrun.connect("My Workspace Name")
|
97
|
+
ws.list_lakehouses()
|
98
|
+
ws.create_lakehouse_if_not_exists("New Lakehouse")
|
99
|
+
|
100
|
+
# Full lakehouse connections (supports spaces in names)
|
101
|
+
dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/schema", sql_folder="./sql")
|
102
|
+
dr = Duckrun.connect("Data Workspace/Sales Data.lakehouse/analytics") # spaces supported
|
103
|
+
dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse") # defaults to dbo schema
|
104
|
+
dr = Duckrun.connect("workspace/lakehouse.lakehouse", storage_account="xxx-onelake") # custom storage
|
105
|
+
|
106
|
+
Note:
|
107
|
+
Internally resolves friendly names (with spaces) to GUIDs and constructs proper ABFSS URLs:
|
108
|
+
"My Workspace/My Lakehouse.lakehouse/schema" becomes
|
109
|
+
"abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/lakehouse_guid/Tables/schema"
|
86
110
|
"""
|
111
|
+
|
112
|
+
# Check if it's a workspace-only connection (no "/" means workspace name only)
|
113
|
+
if "/" not in connection_string:
|
114
|
+
print(f"Connecting to workspace '{connection_string}' for management operations...")
|
115
|
+
return WorkspaceConnection(connection_string)
|
116
|
+
|
87
117
|
print("Connecting to Lakehouse...")
|
88
118
|
|
89
119
|
scan_all_schemas = False
|
90
120
|
|
91
|
-
#
|
92
|
-
|
93
|
-
raise ValueError(
|
94
|
-
"Invalid connection string format. "
|
95
|
-
"Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
|
96
|
-
)
|
97
|
-
|
121
|
+
# Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
|
122
|
+
# Support workspace and lakehouse names with spaces
|
98
123
|
parts = connection_string.split("/")
|
99
124
|
if len(parts) == 2:
|
100
|
-
|
125
|
+
workspace_name, lakehouse_name = parts
|
101
126
|
scan_all_schemas = True
|
102
127
|
schema = "dbo"
|
103
128
|
elif len(parts) == 3:
|
104
|
-
|
129
|
+
workspace_name, lakehouse_name, schema = parts
|
105
130
|
else:
|
106
131
|
raise ValueError(
|
107
132
|
f"Invalid connection string format: '{connection_string}'. "
|
108
|
-
"Expected
|
133
|
+
"Expected formats:\n"
|
134
|
+
" 'workspace name' (workspace management only)\n"
|
135
|
+
" 'workspace name/lakehouse name.lakehouse' (lakehouse with dbo schema)\n"
|
136
|
+
" 'workspace name/lakehouse name.lakehouse/schema' (lakehouse with specific schema)"
|
109
137
|
)
|
110
138
|
|
111
139
|
if lakehouse_name.endswith(".lakehouse"):
|
112
140
|
lakehouse_name = lakehouse_name[:-10]
|
113
141
|
|
114
|
-
if not
|
142
|
+
if not workspace_name or not lakehouse_name:
|
115
143
|
raise ValueError(
|
116
|
-
"Missing required parameters. Use
|
117
|
-
" connect('workspace
|
118
|
-
" connect('workspace/lakehouse.lakehouse') #
|
144
|
+
"Missing required parameters. Use one of these formats:\n"
|
145
|
+
" connect('workspace name') # workspace management\n"
|
146
|
+
" connect('workspace name/lakehouse name.lakehouse/schema') # full lakehouse\n"
|
147
|
+
" connect('workspace name/lakehouse name.lakehouse') # defaults to dbo"
|
119
148
|
)
|
120
149
|
|
121
|
-
|
150
|
+
# Resolve friendly names to GUIDs and construct proper ABFSS path
|
151
|
+
workspace_id, lakehouse_id = cls._resolve_names_to_guids(workspace_name, lakehouse_name)
|
152
|
+
|
153
|
+
return cls(workspace_id, lakehouse_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)
|
154
|
+
|
155
|
+
@classmethod
|
156
|
+
def _resolve_names_to_guids(cls, workspace_name: str, lakehouse_name: str) -> tuple[str, str]:
|
157
|
+
"""
|
158
|
+
Resolve friendly workspace and lakehouse names to their GUIDs.
|
159
|
+
|
160
|
+
Optimization: If names don't contain spaces, use them directly (no API calls needed).
|
161
|
+
Only resolve to GUIDs when names contain spaces or are already GUIDs.
|
162
|
+
|
163
|
+
Args:
|
164
|
+
workspace_name: Display name of the workspace (can contain spaces)
|
165
|
+
lakehouse_name: Display name of the lakehouse (can contain spaces)
|
166
|
+
|
167
|
+
Returns:
|
168
|
+
Tuple of (workspace_id, lakehouse_id) - either resolved GUIDs or original names
|
169
|
+
"""
|
170
|
+
|
171
|
+
# Check if names are already GUIDs first
|
172
|
+
import re
|
173
|
+
guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
|
174
|
+
|
175
|
+
if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
|
176
|
+
print(f"✅ Names are already GUIDs: workspace={workspace_name}, lakehouse={lakehouse_name}")
|
177
|
+
return workspace_name, lakehouse_name
|
178
|
+
|
179
|
+
# Optimization: If workspace name has no spaces, use both names directly (old behavior)
|
180
|
+
# Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
|
181
|
+
if " " not in workspace_name:
|
182
|
+
print(f"✅ Using names directly (workspace has no spaces): workspace={workspace_name}, lakehouse={lakehouse_name}")
|
183
|
+
return workspace_name, lakehouse_name
|
184
|
+
|
185
|
+
# Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
|
186
|
+
print(f"🔍 Resolving '{workspace_name}' workspace and '{lakehouse_name}' lakehouse to GUIDs (workspace has spaces)...")
|
187
|
+
|
188
|
+
try:
|
189
|
+
# Get authentication token (try notebook environment first, then azure-identity)
|
190
|
+
try:
|
191
|
+
import notebookutils # type: ignore
|
192
|
+
token = notebookutils.credentials.getToken("pbi")
|
193
|
+
current_workspace_id = notebookutils.runtime.context.get("workspaceId")
|
194
|
+
except ImportError:
|
195
|
+
current_workspace_id = None
|
196
|
+
# Fallback to azure-identity for external environments
|
197
|
+
from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
|
198
|
+
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
|
199
|
+
token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
|
200
|
+
token = token_obj.token
|
201
|
+
|
202
|
+
# Resolve workspace name to ID
|
203
|
+
if current_workspace_id:
|
204
|
+
# In notebook environment, we could use current workspace ID
|
205
|
+
# but we should validate it matches the requested workspace name
|
206
|
+
workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
|
207
|
+
if not workspace_id:
|
208
|
+
# Fallback to current workspace if name resolution fails
|
209
|
+
print(f"⚠️ Could not validate workspace name '{workspace_name}', using current workspace")
|
210
|
+
workspace_id = current_workspace_id
|
211
|
+
else:
|
212
|
+
# External environment - must resolve by name
|
213
|
+
workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
|
214
|
+
if not workspace_id:
|
215
|
+
raise ValueError(f"Workspace '{workspace_name}' not found")
|
216
|
+
|
217
|
+
# Resolve lakehouse name to ID (required for ABFSS URLs with spaces)
|
218
|
+
lakehouse_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, lakehouse_name)
|
219
|
+
if not lakehouse_id:
|
220
|
+
raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
|
221
|
+
|
222
|
+
print(f"✅ Resolved: {workspace_name} → {workspace_id}, {lakehouse_name} → {lakehouse_id}")
|
223
|
+
return workspace_id, lakehouse_id
|
224
|
+
|
225
|
+
except Exception as e:
|
226
|
+
print(f"❌ Failed to resolve names to GUIDs: {e}")
|
227
|
+
print(f"❌ Cannot use friendly names with spaces '{workspace_name}'/'{lakehouse_name}' in ABFSS URLs without GUID resolution")
|
228
|
+
print("❌ Microsoft Fabric requires actual workspace and lakehouse GUIDs for ABFSS access when names contain spaces")
|
229
|
+
raise ValueError(
|
230
|
+
f"Unable to resolve workspace '{workspace_name}' and lakehouse '{lakehouse_name}' to GUIDs. "
|
231
|
+
f"ABFSS URLs require actual GUIDs when names contain spaces. "
|
232
|
+
f"Please ensure you have proper authentication and the workspace/lakehouse names are correct."
|
233
|
+
)
|
234
|
+
|
235
|
+
@classmethod
|
236
|
+
def _resolve_workspace_id_by_name(cls, token: str, workspace_name: str) -> Optional[str]:
|
237
|
+
"""Get workspace ID from display name"""
|
238
|
+
try:
|
239
|
+
import requests
|
240
|
+
url = "https://api.fabric.microsoft.com/v1/workspaces"
|
241
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
242
|
+
|
243
|
+
response = requests.get(url, headers=headers)
|
244
|
+
response.raise_for_status()
|
245
|
+
|
246
|
+
workspaces = response.json().get("value", [])
|
247
|
+
for workspace in workspaces:
|
248
|
+
if workspace.get("displayName") == workspace_name:
|
249
|
+
return workspace.get("id")
|
250
|
+
|
251
|
+
return None
|
252
|
+
except Exception:
|
253
|
+
return None
|
254
|
+
|
255
|
+
@classmethod
|
256
|
+
def _resolve_lakehouse_id_by_name(cls, token: str, workspace_id: str, lakehouse_name: str) -> Optional[str]:
|
257
|
+
"""Get lakehouse ID from display name within a workspace"""
|
258
|
+
try:
|
259
|
+
import requests
|
260
|
+
url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
|
261
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
262
|
+
|
263
|
+
response = requests.get(url, headers=headers)
|
264
|
+
response.raise_for_status()
|
265
|
+
|
266
|
+
lakehouses = response.json().get("value", [])
|
267
|
+
for lakehouse in lakehouses:
|
268
|
+
if lakehouse.get("displayName") == lakehouse_name:
|
269
|
+
return lakehouse.get("id")
|
270
|
+
|
271
|
+
return None
|
272
|
+
except Exception:
|
273
|
+
return None
|
274
|
+
|
275
|
+
@classmethod
|
276
|
+
def connect_workspace(cls, workspace_name: str):
|
277
|
+
"""
|
278
|
+
Connect to a workspace without a specific lakehouse.
|
279
|
+
Used for lakehouse management operations.
|
280
|
+
|
281
|
+
Args:
|
282
|
+
workspace_name: Name of the workspace
|
283
|
+
|
284
|
+
Returns:
|
285
|
+
WorkspaceConnection object with lakehouse management methods
|
286
|
+
|
287
|
+
Example:
|
288
|
+
con = duckrun.connect_workspace("MyWorkspace")
|
289
|
+
con.list_lakehouses()
|
290
|
+
con.create_lakehouse_if_not_exists("newlakehouse")
|
291
|
+
"""
|
292
|
+
return WorkspaceConnection(workspace_name)
|
122
293
|
|
123
294
|
def _get_storage_token(self):
|
124
295
|
return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
|
@@ -155,7 +326,7 @@ class Duckrun:
|
|
155
326
|
url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
|
156
327
|
store = AzureStore.from_url(url, bearer_token=token)
|
157
328
|
|
158
|
-
base_path = f"{self.lakehouse_name}
|
329
|
+
base_path = f"{self.lakehouse_name}/Tables/"
|
159
330
|
tables_found = []
|
160
331
|
|
161
332
|
if self.scan_all_schemas:
|
@@ -198,9 +369,9 @@ class Duckrun:
|
|
198
369
|
|
199
370
|
if not tables:
|
200
371
|
if self.scan_all_schemas:
|
201
|
-
print(f"No Delta tables found in {self.lakehouse_name}
|
372
|
+
print(f"No Delta tables found in {self.lakehouse_name}/Tables/")
|
202
373
|
else:
|
203
|
-
print(f"No Delta tables found in {self.lakehouse_name}
|
374
|
+
print(f"No Delta tables found in {self.lakehouse_name}/Tables/{self.schema}/")
|
204
375
|
return
|
205
376
|
|
206
377
|
# Group tables by schema for display
|
@@ -358,8 +529,268 @@ class Duckrun:
|
|
358
529
|
"""
|
359
530
|
return _get_stats(self, source)
|
360
531
|
|
532
|
+
def list_lakehouses(self) -> List[str]:
|
533
|
+
"""
|
534
|
+
List all lakehouses in the current workspace.
|
535
|
+
|
536
|
+
Returns:
|
537
|
+
List of lakehouse names
|
538
|
+
"""
|
539
|
+
try:
|
540
|
+
# Try to get token from notebook environment first
|
541
|
+
try:
|
542
|
+
import notebookutils # type: ignore
|
543
|
+
token = notebookutils.credentials.getToken("pbi")
|
544
|
+
workspace_id = notebookutils.runtime.context.get("workspaceId")
|
545
|
+
except ImportError:
|
546
|
+
# Fallback to azure-identity
|
547
|
+
print("Getting authentication token...")
|
548
|
+
from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
|
549
|
+
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
|
550
|
+
token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
|
551
|
+
token = token_obj.token
|
552
|
+
|
553
|
+
# Get workspace ID by name
|
554
|
+
workspace_id = self._get_workspace_id_by_name(token, self.workspace)
|
555
|
+
if not workspace_id:
|
556
|
+
print(f"Workspace '{self.workspace}' not found")
|
557
|
+
return []
|
558
|
+
|
559
|
+
# List lakehouses
|
560
|
+
url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
|
561
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
562
|
+
|
563
|
+
response = requests.get(url, headers=headers)
|
564
|
+
response.raise_for_status()
|
565
|
+
|
566
|
+
lakehouses = response.json().get("value", [])
|
567
|
+
lakehouse_names = [lh.get("displayName", "") for lh in lakehouses]
|
568
|
+
|
569
|
+
print(f"Found {len(lakehouse_names)} lakehouses: {lakehouse_names}")
|
570
|
+
return lakehouse_names
|
571
|
+
|
572
|
+
except Exception as e:
|
573
|
+
print(f"Error listing lakehouses: {e}")
|
574
|
+
return []
|
575
|
+
|
576
|
+
def create_lakehouse_if_not_exists(self, lakehouse_name: str) -> bool:
|
577
|
+
"""
|
578
|
+
Create a lakehouse if it doesn't already exist.
|
579
|
+
|
580
|
+
Args:
|
581
|
+
lakehouse_name: Name of the lakehouse to create
|
582
|
+
|
583
|
+
Returns:
|
584
|
+
True if lakehouse exists or was created successfully, False otherwise
|
585
|
+
"""
|
586
|
+
try:
|
587
|
+
# Try to get token from notebook environment first
|
588
|
+
try:
|
589
|
+
import notebookutils # type: ignore
|
590
|
+
token = notebookutils.credentials.getToken("pbi")
|
591
|
+
workspace_id = notebookutils.runtime.context.get("workspaceId")
|
592
|
+
except ImportError:
|
593
|
+
# Fallback to azure-identity
|
594
|
+
print("Getting authentication token...")
|
595
|
+
from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
|
596
|
+
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
|
597
|
+
token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
|
598
|
+
token = token_obj.token
|
599
|
+
|
600
|
+
# Get workspace ID by name
|
601
|
+
workspace_id = self._get_workspace_id_by_name(token, self.workspace)
|
602
|
+
if not workspace_id:
|
603
|
+
print(f"Workspace '{self.workspace}' not found")
|
604
|
+
return False
|
605
|
+
|
606
|
+
# Check if lakehouse already exists
|
607
|
+
url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
|
608
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
609
|
+
|
610
|
+
response = requests.get(url, headers=headers)
|
611
|
+
response.raise_for_status()
|
612
|
+
|
613
|
+
lakehouses = response.json().get("value", [])
|
614
|
+
existing_names = [lh.get("displayName", "") for lh in lakehouses]
|
615
|
+
|
616
|
+
if lakehouse_name in existing_names:
|
617
|
+
print(f"Lakehouse '{lakehouse_name}' already exists")
|
618
|
+
return True
|
619
|
+
|
620
|
+
# Create lakehouse
|
621
|
+
print(f"Creating lakehouse '{lakehouse_name}'...")
|
622
|
+
payload = {
|
623
|
+
"displayName": lakehouse_name,
|
624
|
+
"description": f"Lakehouse {lakehouse_name} created via duckrun"
|
625
|
+
}
|
626
|
+
|
627
|
+
response = requests.post(url, headers=headers, json=payload)
|
628
|
+
response.raise_for_status()
|
629
|
+
|
630
|
+
print(f"✅ Lakehouse '{lakehouse_name}' created successfully")
|
631
|
+
return True
|
632
|
+
|
633
|
+
except Exception as e:
|
634
|
+
print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
|
635
|
+
return False
|
636
|
+
|
637
|
+
def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
|
638
|
+
"""Helper method to get workspace ID from name"""
|
639
|
+
try:
|
640
|
+
url = "https://api.fabric.microsoft.com/v1/workspaces"
|
641
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
642
|
+
|
643
|
+
response = requests.get(url, headers=headers)
|
644
|
+
response.raise_for_status()
|
645
|
+
|
646
|
+
workspaces = response.json().get("value", [])
|
647
|
+
for workspace in workspaces:
|
648
|
+
if workspace.get("displayName") == workspace_name:
|
649
|
+
return workspace.get("id")
|
650
|
+
|
651
|
+
return None
|
652
|
+
|
653
|
+
except Exception:
|
654
|
+
return None
|
655
|
+
|
361
656
|
def close(self):
|
362
657
|
"""Close DuckDB connection"""
|
363
658
|
if self.con:
|
364
659
|
self.con.close()
|
365
|
-
print("Connection closed")
|
660
|
+
print("Connection closed")
|
661
|
+
|
662
|
+
|
663
|
+
class WorkspaceConnection:
|
664
|
+
"""
|
665
|
+
Simple workspace connection for lakehouse management operations.
|
666
|
+
"""
|
667
|
+
|
668
|
+
def __init__(self, workspace_name: str):
|
669
|
+
self.workspace_name = workspace_name
|
670
|
+
|
671
|
+
def list_lakehouses(self) -> List[str]:
|
672
|
+
"""
|
673
|
+
List all lakehouses in the workspace.
|
674
|
+
|
675
|
+
Returns:
|
676
|
+
List of lakehouse names
|
677
|
+
"""
|
678
|
+
try:
|
679
|
+
# Try to get token from notebook environment first
|
680
|
+
try:
|
681
|
+
import notebookutils # type: ignore
|
682
|
+
token = notebookutils.credentials.getToken("pbi")
|
683
|
+
workspace_id = notebookutils.runtime.context.get("workspaceId")
|
684
|
+
except ImportError:
|
685
|
+
# Fallback to azure-identity
|
686
|
+
print("Getting authentication token...")
|
687
|
+
from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
|
688
|
+
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
|
689
|
+
token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
|
690
|
+
token = token_obj.token
|
691
|
+
|
692
|
+
# Get workspace ID by name
|
693
|
+
workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
|
694
|
+
if not workspace_id:
|
695
|
+
print(f"Workspace '{self.workspace_name}' not found")
|
696
|
+
return []
|
697
|
+
|
698
|
+
# List lakehouses
|
699
|
+
url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
|
700
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
701
|
+
|
702
|
+
response = requests.get(url, headers=headers)
|
703
|
+
response.raise_for_status()
|
704
|
+
|
705
|
+
lakehouses = response.json().get("value", [])
|
706
|
+
lakehouse_names = [lh.get("displayName", "") for lh in lakehouses]
|
707
|
+
|
708
|
+
print(f"Found {len(lakehouse_names)} lakehouses: {lakehouse_names}")
|
709
|
+
return lakehouse_names
|
710
|
+
|
711
|
+
except Exception as e:
|
712
|
+
print(f"Error listing lakehouses: {e}")
|
713
|
+
return []
|
714
|
+
|
715
|
+
def create_lakehouse_if_not_exists(self, lakehouse_name: str) -> bool:
|
716
|
+
"""
|
717
|
+
Create a lakehouse if it doesn't already exist.
|
718
|
+
|
719
|
+
Args:
|
720
|
+
lakehouse_name: Name of the lakehouse to create
|
721
|
+
|
722
|
+
Returns:
|
723
|
+
True if lakehouse exists or was created successfully, False otherwise
|
724
|
+
"""
|
725
|
+
try:
|
726
|
+
# Try to get token from notebook environment first
|
727
|
+
try:
|
728
|
+
import notebookutils # type: ignore
|
729
|
+
token = notebookutils.credentials.getToken("pbi")
|
730
|
+
workspace_id = notebookutils.runtime.context.get("workspaceId")
|
731
|
+
except ImportError:
|
732
|
+
# Fallback to azure-identity
|
733
|
+
print("Getting authentication token...")
|
734
|
+
from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
|
735
|
+
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
|
736
|
+
token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
|
737
|
+
token = token_obj.token
|
738
|
+
|
739
|
+
# Get workspace ID by name
|
740
|
+
workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
|
741
|
+
if not workspace_id:
|
742
|
+
print(f"Workspace '{self.workspace_name}' not found")
|
743
|
+
return False
|
744
|
+
|
745
|
+
# Check if lakehouse already exists
|
746
|
+
url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
|
747
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
748
|
+
|
749
|
+
response = requests.get(url, headers=headers)
|
750
|
+
response.raise_for_status()
|
751
|
+
|
752
|
+
lakehouses = response.json().get("value", [])
|
753
|
+
existing_names = [lh.get("displayName", "") for lh in lakehouses]
|
754
|
+
|
755
|
+
if lakehouse_name in existing_names:
|
756
|
+
print(f"Lakehouse '{lakehouse_name}' already exists")
|
757
|
+
return True
|
758
|
+
|
759
|
+
# Create lakehouse
|
760
|
+
print(f"Creating lakehouse '{lakehouse_name}'...")
|
761
|
+
payload = {
|
762
|
+
"displayName": lakehouse_name,
|
763
|
+
"description": f"Lakehouse {lakehouse_name} created via duckrun",
|
764
|
+
"creationPayload": {
|
765
|
+
"enableSchemas": True
|
766
|
+
}
|
767
|
+
}
|
768
|
+
|
769
|
+
response = requests.post(url, headers=headers, json=payload)
|
770
|
+
response.raise_for_status()
|
771
|
+
|
772
|
+
print(f"✅ Lakehouse '{lakehouse_name}' created successfully")
|
773
|
+
return True
|
774
|
+
|
775
|
+
except Exception as e:
|
776
|
+
print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
|
777
|
+
return False
|
778
|
+
|
779
|
+
def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
|
780
|
+
"""Helper method to get workspace ID from name"""
|
781
|
+
try:
|
782
|
+
url = "https://api.fabric.microsoft.com/v1/workspaces"
|
783
|
+
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
|
784
|
+
|
785
|
+
response = requests.get(url, headers=headers)
|
786
|
+
response.raise_for_status()
|
787
|
+
|
788
|
+
workspaces = response.json().get("value", [])
|
789
|
+
for workspace in workspaces:
|
790
|
+
if workspace.get("displayName") == workspace_name:
|
791
|
+
return workspace.get("id")
|
792
|
+
|
793
|
+
return None
|
794
|
+
|
795
|
+
except Exception:
|
796
|
+
return None
|
duckrun/files.py
CHANGED
@@ -51,8 +51,8 @@ def copy(duckrun_instance, local_folder: str, remote_folder: str,
|
|
51
51
|
token = token_obj.token
|
52
52
|
os.environ["AZURE_STORAGE_TOKEN"] = token
|
53
53
|
|
54
|
-
# Setup OneLake Files URL (
|
55
|
-
files_base_url =
|
54
|
+
# Setup OneLake Files URL (use correct format without .Lakehouse suffix)
|
55
|
+
files_base_url = duckrun_instance.files_base_url
|
56
56
|
store = AzureStore.from_url(files_base_url, bearer_token=token)
|
57
57
|
|
58
58
|
# Collect files to upload
|
@@ -160,8 +160,8 @@ def download(duckrun_instance, remote_folder: str = "", local_folder: str = "./d
|
|
160
160
|
token = token_obj.token
|
161
161
|
os.environ["AZURE_STORAGE_TOKEN"] = token
|
162
162
|
|
163
|
-
# Setup OneLake Files URL (
|
164
|
-
files_base_url =
|
163
|
+
# Setup OneLake Files URL (use correct format without .Lakehouse suffix)
|
164
|
+
files_base_url = duckrun_instance.files_base_url
|
165
165
|
store = AzureStore.from_url(files_base_url, bearer_token=token)
|
166
166
|
|
167
167
|
# Create local directory
|
duckrun/lakehouse.py
ADDED
@@ -0,0 +1,402 @@
|
|
1
|
+
import requests
|
2
|
+
import time
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
class FabricLakehouseManager:
|
6
|
+
"""
|
7
|
+
Manage Microsoft Fabric Lakehouses using REST API only.
|
8
|
+
Works on any machine with Python and internet access.
|
9
|
+
"""
|
10
|
+
|
11
|
+
def __init__(self, access_token: str):
|
12
|
+
"""
|
13
|
+
Initialize with Azure AD access token.
|
14
|
+
|
15
|
+
Args:
|
16
|
+
access_token: Bearer token for Fabric API authentication
|
17
|
+
"""
|
18
|
+
self.base_url = "https://api.fabric.microsoft.com/v1"
|
19
|
+
self.headers = {
|
20
|
+
"Authorization": f"Bearer {access_token}",
|
21
|
+
"Content-Type": "application/json"
|
22
|
+
}
|
23
|
+
|
24
|
+
def get_workspace_id(self, workspace_name: str) -> Optional[str]:
|
25
|
+
"""
|
26
|
+
Get workspace ID from workspace name.
|
27
|
+
|
28
|
+
Args:
|
29
|
+
workspace_name: Name of the workspace
|
30
|
+
|
31
|
+
Returns:
|
32
|
+
Workspace ID if found, None otherwise
|
33
|
+
"""
|
34
|
+
if not workspace_name:
|
35
|
+
return None
|
36
|
+
|
37
|
+
try:
|
38
|
+
url = f"{self.base_url}/workspaces"
|
39
|
+
response = requests.get(url, headers=self.headers)
|
40
|
+
response.raise_for_status()
|
41
|
+
|
42
|
+
workspaces = response.json().get("value", [])
|
43
|
+
for workspace in workspaces:
|
44
|
+
if workspace.get("displayName") == workspace_name:
|
45
|
+
return workspace.get("id")
|
46
|
+
|
47
|
+
print(f"Workspace '{workspace_name}' not found")
|
48
|
+
return None
|
49
|
+
|
50
|
+
except Exception as e:
|
51
|
+
print(f"Error getting workspace ID: {e}")
|
52
|
+
return None
|
53
|
+
|
54
|
+
def get_lakehouse(self, lakehouse_name: str, workspace_id: str) -> Optional[dict]:
|
55
|
+
"""
|
56
|
+
Get lakehouse details if it exists.
|
57
|
+
|
58
|
+
Args:
|
59
|
+
lakehouse_name: Name of the lakehouse
|
60
|
+
workspace_id: ID of the workspace
|
61
|
+
|
62
|
+
Returns:
|
63
|
+
Lakehouse details if found, None otherwise
|
64
|
+
"""
|
65
|
+
try:
|
66
|
+
url = f"{self.base_url}/workspaces/{workspace_id}/lakehouses"
|
67
|
+
response = requests.get(url, headers=self.headers)
|
68
|
+
response.raise_for_status()
|
69
|
+
|
70
|
+
lakehouses = response.json().get("value", [])
|
71
|
+
for lakehouse in lakehouses:
|
72
|
+
if lakehouse.get("displayName") == lakehouse_name:
|
73
|
+
return lakehouse
|
74
|
+
|
75
|
+
return None
|
76
|
+
|
77
|
+
except Exception as e:
|
78
|
+
print(f"Error getting lakehouse: {e}")
|
79
|
+
return None
|
80
|
+
|
81
|
+
def create_lakehouse(self, lakehouse_name: str, workspace_id: str,
|
82
|
+
enable_schemas: bool = True) -> Optional[dict]:
|
83
|
+
"""
|
84
|
+
Create a new lakehouse.
|
85
|
+
|
86
|
+
Args:
|
87
|
+
lakehouse_name: Name of the lakehouse
|
88
|
+
workspace_id: ID of the workspace
|
89
|
+
enable_schemas: Whether to enable schemas
|
90
|
+
|
91
|
+
Returns:
|
92
|
+
Created lakehouse details if successful, None otherwise
|
93
|
+
"""
|
94
|
+
try:
|
95
|
+
url = f"{self.base_url}/workspaces/{workspace_id}/lakehouses"
|
96
|
+
payload = {
|
97
|
+
"displayName": lakehouse_name,
|
98
|
+
"description": f"Lakehouse {lakehouse_name}"
|
99
|
+
}
|
100
|
+
|
101
|
+
if enable_schemas:
|
102
|
+
payload["creationPayload"] = {
|
103
|
+
"enableSchemas": True
|
104
|
+
}
|
105
|
+
|
106
|
+
response = requests.post(url, headers=self.headers, json=payload)
|
107
|
+
response.raise_for_status()
|
108
|
+
|
109
|
+
# Wait a bit for the lakehouse to be fully provisioned
|
110
|
+
time.sleep(2)
|
111
|
+
|
112
|
+
return response.json()
|
113
|
+
|
114
|
+
except Exception as e:
|
115
|
+
print(f"Error creating lakehouse: {e}")
|
116
|
+
if hasattr(e, 'response') and e.response is not None:
|
117
|
+
print(f"Response: {e.response.text}")
|
118
|
+
return None
|
119
|
+
|
120
|
+
def create_lakehouse_if_not_exists(self, lakehouse_name: str,
|
121
|
+
workspace_name: Optional[str] = None,
|
122
|
+
workspace_id: Optional[str] = None) -> int:
|
123
|
+
"""
|
124
|
+
Create a lakehouse if it doesn't exist.
|
125
|
+
|
126
|
+
Args:
|
127
|
+
lakehouse_name: Name of the lakehouse
|
128
|
+
workspace_name: Optional workspace name
|
129
|
+
workspace_id: Optional workspace ID (takes precedence over workspace_name)
|
130
|
+
|
131
|
+
Returns:
|
132
|
+
1 if successful (lakehouse exists or was created)
|
133
|
+
0 if failed
|
134
|
+
"""
|
135
|
+
# Resolve workspace ID
|
136
|
+
if workspace_id is None and workspace_name:
|
137
|
+
workspace_id = self.get_workspace_id(workspace_name)
|
138
|
+
if workspace_id is None:
|
139
|
+
print(f"Workspace '{workspace_name}' not found - returning 0")
|
140
|
+
return 0
|
141
|
+
elif workspace_id is None:
|
142
|
+
print("No workspace specified - returning 0")
|
143
|
+
return 0
|
144
|
+
|
145
|
+
print(f"Attempting to get lakehouse '{lakehouse_name}' in workspace '{workspace_id}'")
|
146
|
+
|
147
|
+
# Check if lakehouse exists
|
148
|
+
lakehouse = self.get_lakehouse(lakehouse_name, workspace_id)
|
149
|
+
|
150
|
+
if lakehouse:
|
151
|
+
print(f"Lakehouse '{lakehouse_name}' found - returning 1")
|
152
|
+
return 1
|
153
|
+
|
154
|
+
# Create lakehouse if it doesn't exist
|
155
|
+
print(f"Lakehouse not found, attempting to create...")
|
156
|
+
created = self.create_lakehouse(lakehouse_name, workspace_id)
|
157
|
+
|
158
|
+
if created:
|
159
|
+
# Verify creation
|
160
|
+
lakehouse = self.get_lakehouse(lakehouse_name, workspace_id)
|
161
|
+
if lakehouse:
|
162
|
+
print(f"Lakehouse '{lakehouse_name}' created successfully - returning 1")
|
163
|
+
return 1
|
164
|
+
|
165
|
+
print(f"Failed to create lakehouse '{lakehouse_name}' - returning 0")
|
166
|
+
return 0
|
167
|
+
|
168
|
+
|
169
|
+
# Example usage with Azure Identity:
|
170
|
+
def main():
    """
    Example of how to use the FabricLakehouseManager with azure-identity.
    """
    from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential

    print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")

    # Credential chain: silent CLI auth first, browser sign-in as fallback.
    credential = ChainedTokenCredential(
        AzureCliCredential(),
        InteractiveBrowserCredential()
    )

    # Request a token scoped to the Fabric REST API (not OneLake storage).
    token = credential.get_token("https://api.fabric.microsoft.com/.default")

    print("✓ Authentication successful!")

    # Drive the lakehouse manager with the freshly acquired bearer token.
    manager = FabricLakehouseManager(token.token)
    result = manager.create_lakehouse_if_not_exists(
        lakehouse_name="MyLakehouse",
        workspace_name="MyWorkspace"
    )

    # 1 == exists-or-created, 0 == failure (see create_lakehouse_if_not_exists).
    print("✓ Lakehouse operation successful!" if result == 1 else "✗ Lakehouse operation failed!")

    return result
|
205
|
+
|
206
|
+
|
207
|
+
def get_fabric_token():
    """
    Helper function to get Fabric API token.
    Returns the token string.
    """
    from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential

    # Prefer non-interactive CLI credentials; fall back to browser sign-in.
    chain = ChainedTokenCredential(
        AzureCliCredential(),
        InteractiveBrowserCredential()
    )

    # Token scoped to the Fabric REST API.
    return chain.get_token("https://api.fabric.microsoft.com/.default").token
|
222
|
+
|
223
|
+
|
224
|
+
def create_lakehouse_in_notebook(lakehouse_name: str, workspace_name: Optional[str] = None) -> int:
    """
    Create a lakehouse in a Fabric notebook environment.
    This function uses the notebook's built-in authentication.

    Args:
        lakehouse_name: Name of the lakehouse to create
        workspace_name: Optional workspace name (uses current workspace if None)

    Returns:
        1 if successful (lakehouse exists or was created)
        0 if failed
    """
    try:
        # Only available inside Fabric notebooks; an ImportError means we are
        # running elsewhere and cannot use notebook authentication.
        import notebookutils  # type: ignore

        # Get authentication token from the notebook environment.
        token = notebookutils.credentials.getToken("https://api.fabric.microsoft.com/.default")

        # Initialize manager with the notebook token.
        manager = FabricLakehouseManager(token)

        # Resolve the workspace: explicit name wins, otherwise fall back to
        # the notebook runtime context.
        workspace_id = None
        if workspace_name:
            workspace_id = manager.get_workspace_id(workspace_name)
        else:
            try:
                workspace_id = notebookutils.runtime.context.get("workspaceId")
            except Exception:
                # Fix: was a bare `except:`, which would also swallow
                # SystemExit/KeyboardInterrupt. Narrowed to Exception.
                print("Could not get current workspace ID from notebook context")
                return 0

        if not workspace_id:
            print("Could not resolve workspace ID")
            return 0

        # Create lakehouse if it does not already exist.
        return manager.create_lakehouse_if_not_exists(
            lakehouse_name=lakehouse_name,
            workspace_id=workspace_id
        )

    except ImportError:
        print("notebookutils not available - not running in Fabric notebook environment")
        print("Use FabricLakehouseManager class directly with proper authentication")
        return 0
    except Exception as e:
        print(f"Error creating lakehouse in notebook: {e}")
        return 0
|
276
|
+
|
277
|
+
|
278
|
+
def create_lakehouse_simple(lakehouse_name: str, access_token: str, workspace_id: str) -> dict:
    """
    Simple function to create a lakehouse with minimal dependencies.
    Perfect for Fabric notebook environments.

    Args:
        lakehouse_name: Name of the lakehouse to create
        access_token: Bearer token for authentication
        workspace_id: ID of the target workspace

    Returns:
        Dictionary with creation result
    """
    import requests
    import time

    # List and create share the same collection endpoint.
    endpoint = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
    auth_headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    try:
        # Look for an existing lakehouse with the same display name first.
        listing = requests.get(endpoint, headers=auth_headers)
        listing.raise_for_status()

        existing = next(
            (lh for lh in listing.json().get("value", [])
             if lh.get("displayName") == lakehouse_name),
            None
        )
        if existing is not None:
            return {
                "success": True,
                "message": f"Lakehouse '{lakehouse_name}' already exists",
                "lakehouse": existing,
                "created": False
            }

        # Not found: issue the create request.
        creation = requests.post(
            endpoint,
            headers=auth_headers,
            json={
                "displayName": lakehouse_name,
                "description": f"Lakehouse {lakehouse_name} created via API"
            }
        )
        creation.raise_for_status()

        # Brief pause to let provisioning settle before returning.
        time.sleep(3)

        return {
            "success": True,
            "message": f"Lakehouse '{lakehouse_name}' created successfully",
            "lakehouse": creation.json(),
            "created": True
        }

    except requests.exceptions.RequestException as e:
        # HTTP-layer failure: include the server response body when present.
        error_msg = f"HTTP error creating lakehouse: {e}"
        if hasattr(e, 'response') and e.response is not None:
            error_msg += f" Response: {e.response.text}"
        return {
            "success": False,
            "message": error_msg,
            "lakehouse": None,
            "created": False
        }
    except Exception as e:
        return {
            "success": False,
            "message": f"Unexpected error: {e}",
            "lakehouse": None,
            "created": False
        }
|
355
|
+
|
356
|
+
|
357
|
+
if __name__ == "__main__":
    # Demo entry point is intentionally disabled; uncomment to run the
    # interactive azure-identity example.
    # main()
    pass
|
361
|
+
|
362
|
+
|
363
|
+
# Usage Examples:
|
364
|
+
"""
|
365
|
+
# Example 1: In a Fabric Notebook (simplest approach)
|
366
|
+
from duckrun.lakehouse import create_lakehouse_in_notebook
|
367
|
+
|
368
|
+
result = create_lakehouse_in_notebook("MyNewLakehouse")
|
369
|
+
if result == 1:
|
370
|
+
print("Lakehouse created or already exists!")
|
371
|
+
|
372
|
+
# Example 2: In a Fabric Notebook with explicit token
|
373
|
+
import notebookutils
|
374
|
+
from duckrun.lakehouse import create_lakehouse_simple
|
375
|
+
|
376
|
+
token = notebookutils.credentials.getToken("https://api.fabric.microsoft.com/.default")
|
377
|
+
workspace_id = notebookutils.runtime.context.get("workspaceId")
|
378
|
+
|
379
|
+
result = create_lakehouse_simple("MyLakehouse", token, workspace_id)
|
380
|
+
print(f"Result: {result['message']}")
|
381
|
+
|
382
|
+
# Example 3: Outside Fabric (requires azure-identity package)
|
383
|
+
from duckrun.lakehouse import FabricLakehouseManager, get_fabric_token
|
384
|
+
|
385
|
+
token = get_fabric_token()
|
386
|
+
manager = FabricLakehouseManager(token)
|
387
|
+
result = manager.create_lakehouse_if_not_exists("MyLakehouse", workspace_name="MyWorkspace")
|
388
|
+
|
389
|
+
# Example 4: With explicit workspace and lakehouse details
|
390
|
+
from duckrun.lakehouse import FabricLakehouseManager
|
391
|
+
|
392
|
+
# Get your token however you prefer
|
393
|
+
token = "your_bearer_token_here"
|
394
|
+
manager = FabricLakehouseManager(token)
|
395
|
+
|
396
|
+
# Create lakehouse in specific workspace
|
397
|
+
workspace_id = manager.get_workspace_id("Production Workspace")
|
398
|
+
lakehouse = manager.create_lakehouse("DataLake2024", workspace_id, enable_schemas=True)
|
399
|
+
|
400
|
+
if lakehouse:
|
401
|
+
print(f"Created lakehouse with ID: {lakehouse['id']}")
|
402
|
+
"""
|
duckrun/runner.py
CHANGED
@@ -235,11 +235,28 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = N
|
|
235
235
|
print(f"SQL file is empty: {table_name}.sql")
|
236
236
|
return None
|
237
237
|
|
238
|
+
import re
|
239
|
+
# Determine if lakehouse_name is a GUID
|
240
|
+
guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
|
241
|
+
lakehouse_is_guid = bool(guid_pattern.match(duckrun_instance.lakehouse_name))
|
242
|
+
|
243
|
+
# Smart substitution for ${lh}.Lakehouse
|
244
|
+
# If template contains ${lh}.Lakehouse, replace with correct value
|
245
|
+
if '${lh}.Lakehouse' in content:
|
246
|
+
if lakehouse_is_guid:
|
247
|
+
# If GUID, use just the GUID
|
248
|
+
content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
|
249
|
+
else:
|
250
|
+
# If not GUID, use legacy format
|
251
|
+
content = content.replace('${lh}.Lakehouse', f'{duckrun_instance.lakehouse_name}.Lakehouse')
|
252
|
+
|
238
253
|
full_params = {
|
239
254
|
'ws': duckrun_instance.workspace,
|
240
255
|
'lh': duckrun_instance.lakehouse_name,
|
241
256
|
'schema': duckrun_instance.schema,
|
242
|
-
'storage_account': duckrun_instance.storage_account
|
257
|
+
'storage_account': duckrun_instance.storage_account,
|
258
|
+
'tables_url': duckrun_instance.table_base_url,
|
259
|
+
'files_url': duckrun_instance.files_base_url
|
243
260
|
}
|
244
261
|
if params:
|
245
262
|
full_params.update(params)
|
@@ -247,6 +264,10 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = N
|
|
247
264
|
try:
|
248
265
|
template = Template(content)
|
249
266
|
content = template.substitute(full_params)
|
267
|
+
# After substitution, remove .Lakehouse if it follows a GUID in any ABFSS URL
|
268
|
+
import re
|
269
|
+
# Pattern: GUID.Lakehouse or GUID.lakehouse (in URLs)
|
270
|
+
content = re.sub(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.(Lakehouse|lakehouse)', r'\1', content)
|
250
271
|
except KeyError as e:
|
251
272
|
print(f"Missing parameter in SQL file: ${e}")
|
252
273
|
return None
|
duckrun/stats.py
CHANGED
@@ -142,8 +142,8 @@ def get_stats(duckrun_instance, source: str):
|
|
142
142
|
print(f"Processing {len(list_tables)} tables: {list_tables}")
|
143
143
|
|
144
144
|
for idx, tbl in enumerate(list_tables):
|
145
|
-
# Construct lakehouse path using ABFSS URL
|
146
|
-
table_path = f"
|
145
|
+
# Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
|
146
|
+
table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
|
147
147
|
|
148
148
|
try:
|
149
149
|
dt = DeltaTable(table_path)
|
@@ -0,0 +1,12 @@
|
|
1
|
+
duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
|
2
|
+
duckrun/core.py,sha256=-UAsAOlOmVkVuQCkbCWTH7aiFSHP0OenAAWSl0i0inY,37107
|
3
|
+
duckrun/files.py,sha256=piWRU5w9jHrW-wuV4Gf-SKY_jhFv9eflxgWO8AZCQTI,10495
|
4
|
+
duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
|
5
|
+
duckrun/runner.py,sha256=lfwNoU1CZXh6bPTHvGWVaUWjzG5crvT7Pzq4onMEVjw,12576
|
6
|
+
duckrun/stats.py,sha256=2FTqoQNVjD84-H1HjStHxZkOpAGKXS79M55B00pOlok,9804
|
7
|
+
duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
|
8
|
+
duckrun-0.2.5.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
|
9
|
+
duckrun-0.2.5.dev1.dist-info/METADATA,sha256=ZQfd7I-J08MC5ytjKt789PqHdroHI0ld36aMNp-57yE,18344
|
10
|
+
duckrun-0.2.5.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
11
|
+
duckrun-0.2.5.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
|
12
|
+
duckrun-0.2.5.dev1.dist-info/RECORD,,
|
duckrun-0.2.4.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
|
2
|
-
duckrun/core.py,sha256=m_9DuSZNZ5DOETnkjNGn8HJBYheCgs_7NewcbM9VECI,16500
|
3
|
-
duckrun/files.py,sha256=xba0juMEQPgaznDudmXcwaGH0wv-6aCoHmV_cNF6Y7I,10665
|
4
|
-
duckrun/runner.py,sha256=X5g-57OCHQZ7USKpcBbhYGUcZwLQny2x147DLKrV32c,11417
|
5
|
-
duckrun/stats.py,sha256=B9UfGOndRNfcB2AhOVjuSqgfmF2x-uRmdmBn3usx_jQ,9881
|
6
|
-
duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
|
7
|
-
duckrun-0.2.4.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
|
8
|
-
duckrun-0.2.4.dist-info/METADATA,sha256=2t7-pNzcPCeseXTjp6Bc18_V41MpjDarG0z-2IzY-Lk,18339
|
9
|
-
duckrun-0.2.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
10
|
-
duckrun-0.2.4.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
|
11
|
-
duckrun-0.2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|