duckrun-0.2.3-py3-none-any.whl → duckrun-0.2.5.dev1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/__init__.py +1 -1
- duckrun/core.py +480 -44
- duckrun/files.py +4 -4
- duckrun/lakehouse.py +402 -0
- duckrun/runner.py +22 -1
- duckrun/stats.py +24 -18
- {duckrun-0.2.3.dist-info → duckrun-0.2.5.dev1.dist-info}/METADATA +1 -1
- duckrun-0.2.5.dev1.dist-info/RECORD +12 -0
- duckrun-0.2.3.dist-info/RECORD +0 -11
- {duckrun-0.2.3.dist-info → duckrun-0.2.5.dev1.dist-info}/WHEEL +0 -0
- {duckrun-0.2.3.dist-info → duckrun-0.2.5.dev1.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.3.dist-info → duckrun-0.2.5.dev1.dist-info}/top_level.txt +0 -0
duckrun/__init__.py
CHANGED
duckrun/core.py
CHANGED
@@ -49,17 +49,27 @@ class Duckrun:
         ]
     """
 
-    def __init__(self,
+    def __init__(self, workspace_id: str, lakehouse_id: str, schema: str = "dbo",
                  sql_folder: Optional[str] = None, compaction_threshold: int = 10,
                  scan_all_schemas: bool = False, storage_account: str = "onelake"):
-
-        self.
+        # Store GUIDs for internal use
+        self.workspace_id = workspace_id
+        self.lakehouse_id = lakehouse_id
         self.schema = schema
         self.sql_folder = sql_folder.strip() if sql_folder else None
         self.compaction_threshold = compaction_threshold
         self.scan_all_schemas = scan_all_schemas
         self.storage_account = storage_account
-
+
+        # Construct proper ABFSS URLs using GUIDs
+        # Both Tables and Files use lakehouse GUID directly (no .Lakehouse suffix)
+        self.table_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Tables/'
+        self.files_base_url = f'abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Files/'
+
+        # Keep legacy properties for backward compatibility
+        self.workspace = workspace_id
+        self.lakehouse_name = lakehouse_id
+
         self.con = duckdb.connect()
         self.con.sql("SET preserve_insertion_order = false")
         self._attach_lakehouse()
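The net effect of this constructor change is that both OneLake roots are now derived from GUIDs. A minimal standalone sketch of the resulting URL shapes (the GUIDs and values below are illustrative placeholders, not taken from the package):

workspace_id = "11111111-2222-3333-4444-555555555555"    # illustrative workspace GUID
lakehouse_id = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"    # illustrative lakehouse GUID
storage_account = "onelake"

# Both roots hang off the lakehouse GUID directly, with no ".Lakehouse" suffix:
table_base_url = f"abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Tables/"
files_base_url = f"abfss://{workspace_id}@{storage_account}.dfs.fabric.microsoft.com/{lakehouse_id}/Files/"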
@@ -68,59 +78,218 @@ class Duckrun:
     def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
                 compaction_threshold: int = 100, storage_account: str = "onelake"):
         """
-        Create and connect to lakehouse.
+        Create and connect to lakehouse or workspace.
 
-
+        Smart detection based on connection string format:
+        - "workspace" → workspace management only
+        - "ws/lh.lakehouse/schema" → full lakehouse connection
+        - "ws/lh.lakehouse" → lakehouse connection (defaults to dbo schema)
 
         Args:
-            connection_string: OneLake path
-            sql_folder: Optional path or URL to SQL files folder
+            connection_string: OneLake path or workspace name
+            sql_folder: Optional path or URL to SQL files folder
             compaction_threshold: File count threshold for compaction
             storage_account: Storage account name (default: "onelake")
 
         Examples:
-
-
-
-
+            # Workspace management only (supports spaces in names)
+            ws = Duckrun.connect("My Workspace Name")
+            ws.list_lakehouses()
+            ws.create_lakehouse_if_not_exists("New Lakehouse")
+
+            # Full lakehouse connections (supports spaces in names)
+            dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/schema", sql_folder="./sql")
+            dr = Duckrun.connect("Data Workspace/Sales Data.lakehouse/analytics")  # spaces supported
+            dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse")  # defaults to dbo schema
+            dr = Duckrun.connect("workspace/lakehouse.lakehouse", storage_account="xxx-onelake")  # custom storage
+
+        Note:
+            Internally resolves friendly names (with spaces) to GUIDs and constructs proper ABFSS URLs:
+            "My Workspace/My Lakehouse.lakehouse/schema" becomes
+            "abfss://workspace_guid@onelake.dfs.fabric.microsoft.com/lakehouse_guid/Tables/schema"
         """
+
+        # Check if it's a workspace-only connection (no "/" means workspace name only)
+        if "/" not in connection_string:
+            print(f"Connecting to workspace '{connection_string}' for management operations...")
+            return WorkspaceConnection(connection_string)
+
         print("Connecting to Lakehouse...")
 
         scan_all_schemas = False
 
-        #
-
-            raise ValueError(
-                "Invalid connection string format. "
-                "Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
-            )
-
+        # Parse lakehouse connection string: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
+        # Support workspace and lakehouse names with spaces
         parts = connection_string.split("/")
         if len(parts) == 2:
-
+            workspace_name, lakehouse_name = parts
             scan_all_schemas = True
             schema = "dbo"
-            print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-            print(f" Scanning all schemas for table discovery...\n")
         elif len(parts) == 3:
-
+            workspace_name, lakehouse_name, schema = parts
         else:
             raise ValueError(
                 f"Invalid connection string format: '{connection_string}'. "
-                "Expected
+                "Expected formats:\n"
+                " 'workspace name' (workspace management only)\n"
+                " 'workspace name/lakehouse name.lakehouse' (lakehouse with dbo schema)\n"
+                " 'workspace name/lakehouse name.lakehouse/schema' (lakehouse with specific schema)"
             )
 
         if lakehouse_name.endswith(".lakehouse"):
             lakehouse_name = lakehouse_name[:-10]
 
-        if not
+        if not workspace_name or not lakehouse_name:
             raise ValueError(
-                "Missing required parameters. Use
-                " connect('workspace
-                " connect('workspace/lakehouse.lakehouse') #
+                "Missing required parameters. Use one of these formats:\n"
+                " connect('workspace name') # workspace management\n"
+                " connect('workspace name/lakehouse name.lakehouse/schema') # full lakehouse\n"
+                " connect('workspace name/lakehouse name.lakehouse') # defaults to dbo"
             )
 
-
+        # Resolve friendly names to GUIDs and construct proper ABFSS path
+        workspace_id, lakehouse_id = cls._resolve_names_to_guids(workspace_name, lakehouse_name)
+
+        return cls(workspace_id, lakehouse_id, schema, sql_folder, compaction_threshold, scan_all_schemas, storage_account)
+
+    @classmethod
+    def _resolve_names_to_guids(cls, workspace_name: str, lakehouse_name: str) -> tuple[str, str]:
+        """
+        Resolve friendly workspace and lakehouse names to their GUIDs.
+
+        Optimization: If names don't contain spaces, use them directly (no API calls needed).
+        Only resolve to GUIDs when names contain spaces or are already GUIDs.
+
+        Args:
+            workspace_name: Display name of the workspace (can contain spaces)
+            lakehouse_name: Display name of the lakehouse (can contain spaces)
+
+        Returns:
+            Tuple of (workspace_id, lakehouse_id) - either resolved GUIDs or original names
+        """
+
+        # Check if names are already GUIDs first
+        import re
+        guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+
+        if guid_pattern.match(workspace_name) and guid_pattern.match(lakehouse_name):
+            print(f"✅ Names are already GUIDs: workspace={workspace_name}, lakehouse={lakehouse_name}")
+            return workspace_name, lakehouse_name
+
+        # Optimization: If workspace name has no spaces, use both names directly (old behavior)
+        # Note: Lakehouse names cannot contain spaces in Microsoft Fabric, only workspace names can
+        if " " not in workspace_name:
+            print(f"✅ Using names directly (workspace has no spaces): workspace={workspace_name}, lakehouse={lakehouse_name}")
+            return workspace_name, lakehouse_name
+
+        # Workspace name contains spaces - need to resolve both to GUIDs for proper ABFSS URLs
+        print(f"🔍 Resolving '{workspace_name}' workspace and '{lakehouse_name}' lakehouse to GUIDs (workspace has spaces)...")
+
+        try:
+            # Get authentication token (try notebook environment first, then azure-identity)
+            try:
+                import notebookutils  # type: ignore
+                token = notebookutils.credentials.getToken("pbi")
+                current_workspace_id = notebookutils.runtime.context.get("workspaceId")
+            except ImportError:
+                current_workspace_id = None
+                # Fallback to azure-identity for external environments
+                from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+                credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+                token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+                token = token_obj.token
+
+            # Resolve workspace name to ID
+            if current_workspace_id:
+                # In notebook environment, we could use current workspace ID
+                # but we should validate it matches the requested workspace name
+                workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
+                if not workspace_id:
+                    # Fallback to current workspace if name resolution fails
+                    print(f"⚠️ Could not validate workspace name '{workspace_name}', using current workspace")
+                    workspace_id = current_workspace_id
+            else:
+                # External environment - must resolve by name
+                workspace_id = cls._resolve_workspace_id_by_name(token, workspace_name)
+                if not workspace_id:
+                    raise ValueError(f"Workspace '{workspace_name}' not found")
+
+            # Resolve lakehouse name to ID (required for ABFSS URLs with spaces)
+            lakehouse_id = cls._resolve_lakehouse_id_by_name(token, workspace_id, lakehouse_name)
+            if not lakehouse_id:
+                raise ValueError(f"Lakehouse '{lakehouse_name}' not found in workspace '{workspace_name}'")
+
+            print(f"✅ Resolved: {workspace_name} → {workspace_id}, {lakehouse_name} → {lakehouse_id}")
+            return workspace_id, lakehouse_id
+
+        except Exception as e:
+            print(f"❌ Failed to resolve names to GUIDs: {e}")
+            print(f"❌ Cannot use friendly names with spaces '{workspace_name}'/'{lakehouse_name}' in ABFSS URLs without GUID resolution")
+            print("❌ Microsoft Fabric requires actual workspace and lakehouse GUIDs for ABFSS access when names contain spaces")
+            raise ValueError(
+                f"Unable to resolve workspace '{workspace_name}' and lakehouse '{lakehouse_name}' to GUIDs. "
+                f"ABFSS URLs require actual GUIDs when names contain spaces. "
+                f"Please ensure you have proper authentication and the workspace/lakehouse names are correct."
+            )
+
+    @classmethod
+    def _resolve_workspace_id_by_name(cls, token: str, workspace_name: str) -> Optional[str]:
+        """Get workspace ID from display name"""
+        try:
+            import requests
+            url = "https://api.fabric.microsoft.com/v1/workspaces"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            return None
+        except Exception:
+            return None
+
+    @classmethod
+    def _resolve_lakehouse_id_by_name(cls, token: str, workspace_id: str, lakehouse_name: str) -> Optional[str]:
+        """Get lakehouse ID from display name within a workspace"""
+        try:
+            import requests
+            url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            lakehouses = response.json().get("value", [])
+            for lakehouse in lakehouses:
+                if lakehouse.get("displayName") == lakehouse_name:
+                    return lakehouse.get("id")
+
+            return None
+        except Exception:
+            return None
+
+    @classmethod
+    def connect_workspace(cls, workspace_name: str):
+        """
+        Connect to a workspace without a specific lakehouse.
+        Used for lakehouse management operations.
+
+        Args:
+            workspace_name: Name of the workspace
+
+        Returns:
+            WorkspaceConnection object with lakehouse management methods
+
+        Example:
+            con = duckrun.connect_workspace("MyWorkspace")
+            con.list_lakehouses()
+            con.create_lakehouse_if_not_exists("newlakehouse")
+        """
+        return WorkspaceConnection(workspace_name)
 
     def _get_storage_token(self):
         return os.environ.get("AZURE_STORAGE_TOKEN", "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE")
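For orientation, the detection and parsing logic that connect() now applies can be condensed into the following sketch; parse_connection_string is our illustrative name, not a function in the package:

def parse_connection_string(connection_string: str):
    # No "/" at all → workspace-management connection
    if "/" not in connection_string:
        return ("workspace", connection_string, None, None)
    parts = connection_string.split("/")
    if len(parts) == 2:
        workspace_name, lakehouse_name = parts
        schema = "dbo"   # default schema; all schemas are scanned
    elif len(parts) == 3:
        workspace_name, lakehouse_name, schema = parts
    else:
        raise ValueError(f"Invalid connection string: {connection_string!r}")
    if lakehouse_name.endswith(".lakehouse"):
        lakehouse_name = lakehouse_name[:-10]   # strip the ".lakehouse" marker
    return ("lakehouse", workspace_name, lakehouse_name, schema)

parse_connection_string("My Workspace/Sales Data.lakehouse/analytics")
# → ("lakehouse", "My Workspace", "Sales Data", "analytics")

Name-to-GUID resolution then only makes Fabric REST calls when the workspace name contains spaces; plain names and literal GUIDs pass through unchanged.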
@@ -157,21 +326,18 @@ class Duckrun:
             url = f"abfss://{self.workspace}@{self.storage_account}.dfs.fabric.microsoft.com/"
             store = AzureStore.from_url(url, bearer_token=token)
 
-            base_path = f"{self.lakehouse_name}
+            base_path = f"{self.lakehouse_name}/Tables/"
             tables_found = []
 
             if self.scan_all_schemas:
                 # Discover all schemas first
-                print("🔍 Discovering schemas...")
                 schemas_result = obs.list_with_delimiter(store, prefix=base_path)
                 schemas = [
                     prefix.rstrip('/').split('/')[-1]
                     for prefix in schemas_result['common_prefixes']
                 ]
-                print(f" Found {len(schemas)} schemas: {', '.join(schemas)}\n")
 
                 # Discover tables in each schema
-                print("🔍 Discovering tables...")
                 for schema_name in schemas:
                     schema_path = f"{base_path}{schema_name}/"
                     result = obs.list_with_delimiter(store, prefix=schema_path)
@@ -203,14 +369,27 @@ class Duckrun:
 
             if not tables:
                 if self.scan_all_schemas:
-                    print(f"No Delta tables found in {self.lakehouse_name}
+                    print(f"No Delta tables found in {self.lakehouse_name}/Tables/")
                 else:
-                    print(f"No Delta tables found in {self.lakehouse_name}
+                    print(f"No Delta tables found in {self.lakehouse_name}/Tables/{self.schema}/")
                 return
 
-
+            # Group tables by schema for display
+            schema_tables = {}
+            for schema_name, table_name in tables:
+                if schema_name not in schema_tables:
+                    schema_tables[schema_name] = []
+                schema_tables[schema_name].append(table_name)
+
+            # Display tables by schema
+            print(f"\n📊 Found {len(tables)} tables:")
+            for schema_name in sorted(schema_tables.keys()):
+                table_list = sorted(schema_tables[schema_name])
+                print(f"  {schema_name}: {', '.join(table_list)}")
 
             attached_count = 0
+            skipped_tables = []
+
             for schema_name, table_name in tables:
                 try:
                     if self.scan_all_schemas:
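The grouping added above is a plain bucket-by-schema dictionary; shown standalone on example data (dict.setdefault is an equivalent shorthand for the if-not-in pattern in the diff):

tables = [("dbo", "results"), ("aemo", "calendar"), ("dbo", "sales")]
schema_tables = {}
for schema_name, table_name in tables:
    schema_tables.setdefault(schema_name, []).append(table_name)
print(f"\n📊 Found {len(tables)} tables:")
for schema_name in sorted(schema_tables):
    print(f"  {schema_name}: {', '.join(sorted(schema_tables[schema_name]))}")
# 📊 Found 3 tables:
#   aemo: calendar
#   dbo: results, sales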
@@ -225,19 +404,16 @@ class Duckrun:
                         CREATE OR REPLACE VIEW {view_name}
                         AS SELECT * FROM delta_scan('{self.table_base_url}{schema_name}/{table_name}');
                     """)
-                    print(f" ✓ Attached: {schema_name}.{table_name} → {view_name}")
                     attached_count += 1
                 except Exception as e:
-
+                    skipped_tables.append(f"{schema_name}.{table_name}")
                     continue
 
             print(f"\n{'='*60}")
-            print(f"✅
+            print(f"✅ Ready - {attached_count}/{len(tables)} tables available")
+            if skipped_tables:
+                print(f"⚠ Skipped {len(skipped_tables)} tables: {', '.join(skipped_tables[:3])}{'...' if len(skipped_tables) > 3 else ''}")
             print(f"{'='*60}\n")
-
-            if self.scan_all_schemas:
-                print(f"\n💡 Note: Tables use schema.table format (e.g., aemo.calendar, dbo.results)")
-                print(f"   Default schema for operations: {self.schema}\n")
 
         except Exception as e:
             print(f"❌ Error attaching lakehouse: {e}")
@@ -353,8 +529,268 @@ class Duckrun:
         """
         return _get_stats(self, source)
 
+    def list_lakehouses(self) -> List[str]:
+        """
+        List all lakehouses in the current workspace.
+
+        Returns:
+            List of lakehouse names
+        """
+        try:
+            # Try to get token from notebook environment first
+            try:
+                import notebookutils  # type: ignore
+                token = notebookutils.credentials.getToken("pbi")
+                workspace_id = notebookutils.runtime.context.get("workspaceId")
+            except ImportError:
+                # Fallback to azure-identity
+                print("Getting authentication token...")
+                from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+                credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+                token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+                token = token_obj.token
+
+            # Get workspace ID by name
+            workspace_id = self._get_workspace_id_by_name(token, self.workspace)
+            if not workspace_id:
+                print(f"Workspace '{self.workspace}' not found")
+                return []
+
+            # List lakehouses
+            url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            lakehouses = response.json().get("value", [])
+            lakehouse_names = [lh.get("displayName", "") for lh in lakehouses]
+
+            print(f"Found {len(lakehouse_names)} lakehouses: {lakehouse_names}")
+            return lakehouse_names
+
+        except Exception as e:
+            print(f"Error listing lakehouses: {e}")
+            return []
+
+    def create_lakehouse_if_not_exists(self, lakehouse_name: str) -> bool:
+        """
+        Create a lakehouse if it doesn't already exist.
+
+        Args:
+            lakehouse_name: Name of the lakehouse to create
+
+        Returns:
+            True if lakehouse exists or was created successfully, False otherwise
+        """
+        try:
+            # Try to get token from notebook environment first
+            try:
+                import notebookutils  # type: ignore
+                token = notebookutils.credentials.getToken("pbi")
+                workspace_id = notebookutils.runtime.context.get("workspaceId")
+            except ImportError:
+                # Fallback to azure-identity
+                print("Getting authentication token...")
+                from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+                credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+                token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+                token = token_obj.token
+
+            # Get workspace ID by name
+            workspace_id = self._get_workspace_id_by_name(token, self.workspace)
+            if not workspace_id:
+                print(f"Workspace '{self.workspace}' not found")
+                return False
+
+            # Check if lakehouse already exists
+            url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            lakehouses = response.json().get("value", [])
+            existing_names = [lh.get("displayName", "") for lh in lakehouses]
+
+            if lakehouse_name in existing_names:
+                print(f"Lakehouse '{lakehouse_name}' already exists")
+                return True
+
+            # Create lakehouse
+            print(f"Creating lakehouse '{lakehouse_name}'...")
+            payload = {
+                "displayName": lakehouse_name,
+                "description": f"Lakehouse {lakehouse_name} created via duckrun"
+            }
+
+            response = requests.post(url, headers=headers, json=payload)
+            response.raise_for_status()
+
+            print(f"✅ Lakehouse '{lakehouse_name}' created successfully")
+            return True
+
+        except Exception as e:
+            print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
+            return False
+
+    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+        """Helper method to get workspace ID from name"""
+        try:
+            url = "https://api.fabric.microsoft.com/v1/workspaces"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            return None
+
+        except Exception:
+            return None
+
     def close(self):
         """Close DuckDB connection"""
         if self.con:
             self.con.close()
-            print("Connection closed")
+            print("Connection closed")
+
+
+class WorkspaceConnection:
+    """
+    Simple workspace connection for lakehouse management operations.
+    """
+
+    def __init__(self, workspace_name: str):
+        self.workspace_name = workspace_name
+
+    def list_lakehouses(self) -> List[str]:
+        """
+        List all lakehouses in the workspace.
+
+        Returns:
+            List of lakehouse names
+        """
+        try:
+            # Try to get token from notebook environment first
+            try:
+                import notebookutils  # type: ignore
+                token = notebookutils.credentials.getToken("pbi")
+                workspace_id = notebookutils.runtime.context.get("workspaceId")
+            except ImportError:
+                # Fallback to azure-identity
+                print("Getting authentication token...")
+                from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+                credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+                token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+                token = token_obj.token
+
+            # Get workspace ID by name
+            workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
+            if not workspace_id:
+                print(f"Workspace '{self.workspace_name}' not found")
+                return []
+
+            # List lakehouses
+            url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            lakehouses = response.json().get("value", [])
+            lakehouse_names = [lh.get("displayName", "") for lh in lakehouses]
+
+            print(f"Found {len(lakehouse_names)} lakehouses: {lakehouse_names}")
+            return lakehouse_names
+
+        except Exception as e:
+            print(f"Error listing lakehouses: {e}")
+            return []
+
+    def create_lakehouse_if_not_exists(self, lakehouse_name: str) -> bool:
+        """
+        Create a lakehouse if it doesn't already exist.
+
+        Args:
+            lakehouse_name: Name of the lakehouse to create
+
+        Returns:
+            True if lakehouse exists or was created successfully, False otherwise
+        """
+        try:
+            # Try to get token from notebook environment first
+            try:
+                import notebookutils  # type: ignore
+                token = notebookutils.credentials.getToken("pbi")
+                workspace_id = notebookutils.runtime.context.get("workspaceId")
+            except ImportError:
+                # Fallback to azure-identity
+                print("Getting authentication token...")
+                from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+                credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
+                token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+                token = token_obj.token
+
+            # Get workspace ID by name
+            workspace_id = self._get_workspace_id_by_name(token, self.workspace_name)
+            if not workspace_id:
+                print(f"Workspace '{self.workspace_name}' not found")
+                return False
+
+            # Check if lakehouse already exists
+            url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            lakehouses = response.json().get("value", [])
+            existing_names = [lh.get("displayName", "") for lh in lakehouses]
+
+            if lakehouse_name in existing_names:
+                print(f"Lakehouse '{lakehouse_name}' already exists")
+                return True
+
+            # Create lakehouse
+            print(f"Creating lakehouse '{lakehouse_name}'...")
+            payload = {
+                "displayName": lakehouse_name,
+                "description": f"Lakehouse {lakehouse_name} created via duckrun",
+                "creationPayload": {
+                    "enableSchemas": True
+                }
+            }
+
+            response = requests.post(url, headers=headers, json=payload)
+            response.raise_for_status()
+
+            print(f"✅ Lakehouse '{lakehouse_name}' created successfully")
+            return True
+
+        except Exception as e:
+            print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
+            return False
+
+    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+        """Helper method to get workspace ID from name"""
+        try:
+            url = "https://api.fabric.microsoft.com/v1/workspaces"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            return None
+
+        except Exception:
+            return None
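Taken together, the new management surface can be exercised as below. The lakehouse name "staging" is illustrative, and since __init__.py also changed in this release but its diff is not shown, whether connect is re-exported at module level is not visible here, so the classmethod form from the docstring is used:

from duckrun.core import Duckrun

ws = Duckrun.connect("My Workspace")                 # no "/" → WorkspaceConnection
names = ws.list_lakehouses()                         # [] on auth or lookup failure
ok = ws.create_lakehouse_if_not_exists("staging")    # True if present or created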
duckrun/files.py
CHANGED
@@ -51,8 +51,8 @@ def copy(duckrun_instance, local_folder: str, remote_folder: str,
     token = token_obj.token
     os.environ["AZURE_STORAGE_TOKEN"] = token
 
-    # Setup OneLake Files URL (
-    files_base_url =
+    # Setup OneLake Files URL (use correct format without .Lakehouse suffix)
+    files_base_url = duckrun_instance.files_base_url
     store = AzureStore.from_url(files_base_url, bearer_token=token)
 
     # Collect files to upload
@@ -160,8 +160,8 @@ def download(duckrun_instance, remote_folder: str = "", local_folder: str = "./d
     token = token_obj.token
     os.environ["AZURE_STORAGE_TOKEN"] = token
 
-    # Setup OneLake Files URL (
-    files_base_url =
+    # Setup OneLake Files URL (use correct format without .Lakehouse suffix)
+    files_base_url = duckrun_instance.files_base_url
     store = AzureStore.from_url(files_base_url, bearer_token=token)
 
     # Create local directory
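Both copy() and download() now reuse the files_base_url computed once in the constructor, so the path format is decided in a single place. The deleted 0.2.3 lines are truncated in this diff; judging by the new comment, the old code rebuilt the URL locally with a ".Lakehouse" suffix, while the new shape is:

# abfss://<workspace_guid>@<storage_account>.dfs.fabric.microsoft.com/<lakehouse_guid>/Files/
files_base_url = duckrun_instance.files_base_url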
duckrun/lakehouse.py
ADDED
@@ -0,0 +1,402 @@
+import requests
+import time
+from typing import Optional
+
+class FabricLakehouseManager:
+    """
+    Manage Microsoft Fabric Lakehouses using REST API only.
+    Works on any machine with Python and internet access.
+    """
+
+    def __init__(self, access_token: str):
+        """
+        Initialize with Azure AD access token.
+
+        Args:
+            access_token: Bearer token for Fabric API authentication
+        """
+        self.base_url = "https://api.fabric.microsoft.com/v1"
+        self.headers = {
+            "Authorization": f"Bearer {access_token}",
+            "Content-Type": "application/json"
+        }
+
+    def get_workspace_id(self, workspace_name: str) -> Optional[str]:
+        """
+        Get workspace ID from workspace name.
+
+        Args:
+            workspace_name: Name of the workspace
+
+        Returns:
+            Workspace ID if found, None otherwise
+        """
+        if not workspace_name:
+            return None
+
+        try:
+            url = f"{self.base_url}/workspaces"
+            response = requests.get(url, headers=self.headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            print(f"Workspace '{workspace_name}' not found")
+            return None
+
+        except Exception as e:
+            print(f"Error getting workspace ID: {e}")
+            return None
+
+    def get_lakehouse(self, lakehouse_name: str, workspace_id: str) -> Optional[dict]:
+        """
+        Get lakehouse details if it exists.
+
+        Args:
+            lakehouse_name: Name of the lakehouse
+            workspace_id: ID of the workspace
+
+        Returns:
+            Lakehouse details if found, None otherwise
+        """
+        try:
+            url = f"{self.base_url}/workspaces/{workspace_id}/lakehouses"
+            response = requests.get(url, headers=self.headers)
+            response.raise_for_status()
+
+            lakehouses = response.json().get("value", [])
+            for lakehouse in lakehouses:
+                if lakehouse.get("displayName") == lakehouse_name:
+                    return lakehouse
+
+            return None
+
+        except Exception as e:
+            print(f"Error getting lakehouse: {e}")
+            return None
+
+    def create_lakehouse(self, lakehouse_name: str, workspace_id: str,
+                         enable_schemas: bool = True) -> Optional[dict]:
+        """
+        Create a new lakehouse.
+
+        Args:
+            lakehouse_name: Name of the lakehouse
+            workspace_id: ID of the workspace
+            enable_schemas: Whether to enable schemas
+
+        Returns:
+            Created lakehouse details if successful, None otherwise
+        """
+        try:
+            url = f"{self.base_url}/workspaces/{workspace_id}/lakehouses"
+            payload = {
+                "displayName": lakehouse_name,
+                "description": f"Lakehouse {lakehouse_name}"
+            }
+
+            if enable_schemas:
+                payload["creationPayload"] = {
+                    "enableSchemas": True
+                }
+
+            response = requests.post(url, headers=self.headers, json=payload)
+            response.raise_for_status()
+
+            # Wait a bit for the lakehouse to be fully provisioned
+            time.sleep(2)
+
+            return response.json()
+
+        except Exception as e:
+            print(f"Error creating lakehouse: {e}")
+            if hasattr(e, 'response') and e.response is not None:
+                print(f"Response: {e.response.text}")
+            return None
+
+    def create_lakehouse_if_not_exists(self, lakehouse_name: str,
+                                       workspace_name: Optional[str] = None,
+                                       workspace_id: Optional[str] = None) -> int:
+        """
+        Create a lakehouse if it doesn't exist.
+
+        Args:
+            lakehouse_name: Name of the lakehouse
+            workspace_name: Optional workspace name
+            workspace_id: Optional workspace ID (takes precedence over workspace_name)
+
+        Returns:
+            1 if successful (lakehouse exists or was created)
+            0 if failed
+        """
+        # Resolve workspace ID
+        if workspace_id is None and workspace_name:
+            workspace_id = self.get_workspace_id(workspace_name)
+            if workspace_id is None:
+                print(f"Workspace '{workspace_name}' not found - returning 0")
+                return 0
+        elif workspace_id is None:
+            print("No workspace specified - returning 0")
+            return 0
+
+        print(f"Attempting to get lakehouse '{lakehouse_name}' in workspace '{workspace_id}'")
+
+        # Check if lakehouse exists
+        lakehouse = self.get_lakehouse(lakehouse_name, workspace_id)
+
+        if lakehouse:
+            print(f"Lakehouse '{lakehouse_name}' found - returning 1")
+            return 1
+
+        # Create lakehouse if it doesn't exist
+        print(f"Lakehouse not found, attempting to create...")
+        created = self.create_lakehouse(lakehouse_name, workspace_id)
+
+        if created:
+            # Verify creation
+            lakehouse = self.get_lakehouse(lakehouse_name, workspace_id)
+            if lakehouse:
+                print(f"Lakehouse '{lakehouse_name}' created successfully - returning 1")
+                return 1
+
+        print(f"Failed to create lakehouse '{lakehouse_name}' - returning 0")
+        return 0
+
+
+# Example usage with Azure Identity:
+def main():
+    """
+    Example of how to use the FabricLakehouseManager with azure-identity.
+    """
+    from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+
+    print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")
+
+    # Create credential chain (CLI first, then interactive browser)
+    credential = ChainedTokenCredential(
+        AzureCliCredential(),
+        InteractiveBrowserCredential()
+    )
+
+    # Get token for Fabric API (not storage!)
+    # Note: Use Fabric API scope, not storage scope
+    token = credential.get_token("https://api.fabric.microsoft.com/.default")
+
+    print("✓ Authentication successful!")
+
+    # Initialize manager with Fabric token
+    manager = FabricLakehouseManager(token.token)
+
+    # Create lakehouse if not exists
+    result = manager.create_lakehouse_if_not_exists(
+        lakehouse_name="MyLakehouse",
+        workspace_name="MyWorkspace"
+    )
+
+    if result == 1:
+        print("✓ Lakehouse operation successful!")
+    else:
+        print("✗ Lakehouse operation failed!")
+
+    return result
+
+
+def get_fabric_token():
+    """
+    Helper function to get Fabric API token.
+    Returns the token string.
+    """
+    from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
+
+    credential = ChainedTokenCredential(
+        AzureCliCredential(),
+        InteractiveBrowserCredential()
+    )
+
+    # Get token for Fabric API
+    token = credential.get_token("https://api.fabric.microsoft.com/.default")
+    return token.token
+
+
+def create_lakehouse_in_notebook(lakehouse_name: str, workspace_name: Optional[str] = None) -> int:
+    """
+    Create a lakehouse in a Fabric notebook environment.
+    This function uses the notebook's built-in authentication.
+
+    Args:
+        lakehouse_name: Name of the lakehouse to create
+        workspace_name: Optional workspace name (uses current workspace if None)
+
+    Returns:
+        1 if successful (lakehouse exists or was created)
+        0 if failed
+    """
+    try:
+        # Try to import fabric notebook utilities (only available in Fabric notebooks)
+        import notebookutils  # type: ignore
+
+        # Get authentication token from notebook environment
+        token = notebookutils.credentials.getToken("https://api.fabric.microsoft.com/.default")
+
+        # Initialize manager with notebook token
+        manager = FabricLakehouseManager(token)
+
+        # Get current workspace ID if no workspace specified
+        workspace_id = None
+        if workspace_name:
+            workspace_id = manager.get_workspace_id(workspace_name)
+        else:
+            # In Fabric notebooks, we can get the current workspace from context
+            try:
+                workspace_id = notebookutils.runtime.context.get("workspaceId")
+            except:
+                print("Could not get current workspace ID from notebook context")
+                return 0
+
+        if not workspace_id:
+            print(f"Could not resolve workspace ID")
+            return 0
+
+        # Create lakehouse if not exists
+        return manager.create_lakehouse_if_not_exists(
+            lakehouse_name=lakehouse_name,
+            workspace_id=workspace_id
+        )
+
+    except ImportError:
+        print("notebookutils not available - not running in Fabric notebook environment")
+        print("Use FabricLakehouseManager class directly with proper authentication")
+        return 0
+    except Exception as e:
+        print(f"Error creating lakehouse in notebook: {e}")
+        return 0
+
+
+def create_lakehouse_simple(lakehouse_name: str, access_token: str, workspace_id: str) -> dict:
+    """
+    Simple function to create a lakehouse with minimal dependencies.
+    Perfect for Fabric notebook environments.
+
+    Args:
+        lakehouse_name: Name of the lakehouse to create
+        access_token: Bearer token for authentication
+        workspace_id: ID of the target workspace
+
+    Returns:
+        Dictionary with creation result
+    """
+    import requests
+    import time
+
+    base_url = "https://api.fabric.microsoft.com/v1"
+    headers = {
+        "Authorization": f"Bearer {access_token}",
+        "Content-Type": "application/json"
+    }
+
+    try:
+        # First check if lakehouse already exists
+        list_url = f"{base_url}/workspaces/{workspace_id}/lakehouses"
+        response = requests.get(list_url, headers=headers)
+        response.raise_for_status()
+
+        lakehouses = response.json().get("value", [])
+        for lakehouse in lakehouses:
+            if lakehouse.get("displayName") == lakehouse_name:
+                return {
+                    "success": True,
+                    "message": f"Lakehouse '{lakehouse_name}' already exists",
+                    "lakehouse": lakehouse,
+                    "created": False
+                }
+
+        # Create new lakehouse
+        create_url = f"{base_url}/workspaces/{workspace_id}/lakehouses"
+        payload = {
+            "displayName": lakehouse_name,
+            "description": f"Lakehouse {lakehouse_name} created via API"
+        }
+
+        response = requests.post(create_url, headers=headers, json=payload)
+        response.raise_for_status()
+
+        # Wait for provisioning
+        time.sleep(3)
+
+        created_lakehouse = response.json()
+        return {
+            "success": True,
+            "message": f"Lakehouse '{lakehouse_name}' created successfully",
+            "lakehouse": created_lakehouse,
+            "created": True
+        }
+
+    except requests.exceptions.RequestException as e:
+        error_msg = f"HTTP error creating lakehouse: {e}"
+        if hasattr(e, 'response') and e.response is not None:
+            error_msg += f" Response: {e.response.text}"
+
+        return {
+            "success": False,
+            "message": error_msg,
+            "lakehouse": None,
+            "created": False
+        }
+    except Exception as e:
+        return {
+            "success": False,
+            "message": f"Unexpected error: {e}",
+            "lakehouse": None,
+            "created": False
+        }
+
+
+if __name__ == "__main__":
+    # Uncomment to run the example
+    # main()
+    pass
+
+
+# Usage Examples:
+"""
+# Example 1: In a Fabric Notebook (simplest approach)
+from duckrun.lakehouse import create_lakehouse_in_notebook
+
+result = create_lakehouse_in_notebook("MyNewLakehouse")
+if result == 1:
+    print("Lakehouse created or already exists!")
+
+# Example 2: In a Fabric Notebook with explicit token
+import notebookutils
+from duckrun.lakehouse import create_lakehouse_simple
+
+token = notebookutils.credentials.getToken("https://api.fabric.microsoft.com/.default")
+workspace_id = notebookutils.runtime.context.get("workspaceId")
+
+result = create_lakehouse_simple("MyLakehouse", token, workspace_id)
+print(f"Result: {result['message']}")
+
+# Example 3: Outside Fabric (requires azure-identity package)
+from duckrun.lakehouse import FabricLakehouseManager, get_fabric_token
+
+token = get_fabric_token()
+manager = FabricLakehouseManager(token)
+result = manager.create_lakehouse_if_not_exists("MyLakehouse", workspace_name="MyWorkspace")
+
+# Example 4: With explicit workspace and lakehouse details
+from duckrun.lakehouse import FabricLakehouseManager
+
+# Get your token however you prefer
+token = "your_bearer_token_here"
+manager = FabricLakehouseManager(token)
+
+# Create lakehouse in specific workspace
+workspace_id = manager.get_workspace_id("Production Workspace")
+lakehouse = manager.create_lakehouse("DataLake2024", workspace_id, enable_schemas=True)
+
+if lakehouse:
+    print(f"Created lakehouse with ID: {lakehouse['id']}")
+"""
duckrun/runner.py
CHANGED
@@ -235,11 +235,28 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = None
         print(f"SQL file is empty: {table_name}.sql")
         return None
 
+    import re
+    # Determine if lakehouse_name is a GUID
+    guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+    lakehouse_is_guid = bool(guid_pattern.match(duckrun_instance.lakehouse_name))
+
+    # Smart substitution for ${lh}.Lakehouse
+    # If template contains ${lh}.Lakehouse, replace with correct value
+    if '${lh}.Lakehouse' in content:
+        if lakehouse_is_guid:
+            # If GUID, use just the GUID
+            content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
+        else:
+            # If not GUID, use legacy format
+            content = content.replace('${lh}.Lakehouse', f'{duckrun_instance.lakehouse_name}.Lakehouse')
+
     full_params = {
         'ws': duckrun_instance.workspace,
         'lh': duckrun_instance.lakehouse_name,
         'schema': duckrun_instance.schema,
-        'storage_account': duckrun_instance.storage_account
+        'storage_account': duckrun_instance.storage_account,
+        'tables_url': duckrun_instance.table_base_url,
+        'files_url': duckrun_instance.files_base_url
     }
     if params:
         full_params.update(params)
@@ -247,6 +264,10 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = None
     try:
         template = Template(content)
         content = template.substitute(full_params)
+        # After substitution, remove .Lakehouse if it follows a GUID in any ABFSS URL
+        import re
+        # Pattern: GUID.Lakehouse or GUID.lakehouse (in URLs)
+        content = re.sub(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.(Lakehouse|lakehouse)', r'\1', content)
     except KeyError as e:
         print(f"Missing parameter in SQL file: ${e}")
         return None
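These two runner changes work as a pair: ${lh}.Lakehouse in a template is rewritten before substitution based on whether the lakehouse identifier is a GUID, and a post-substitution pass strips a ".Lakehouse" suffix anywhere it directly follows a GUID. A standalone sketch of that second pass (the SQL string is illustrative):

import re

GUID = r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'
sql = "delta_scan('abfss://ws@onelake.dfs.fabric.microsoft.com/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee.Lakehouse/Tables/dbo/t')"
cleaned = re.sub(rf'({GUID})\.(Lakehouse|lakehouse)', r'\1', sql)
# The suffix after the GUID is removed, while a friendly name such as
# "mylakehouse.Lakehouse" is left untouched, keeping legacy SQL files working.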
duckrun/stats.py
CHANGED
@@ -21,33 +21,39 @@ def _table_exists(duckrun_instance, schema_name: str, table_name: str) -> bool:
 
 
 def _schema_exists(duckrun_instance, schema_name: str) -> bool:
-    """Check if a schema exists by
+    """Check if a schema exists by querying information_schema."""
     try:
-        # For main schema,
+        # For main schema, always exists
         if schema_name == "main":
-
+            return True
         else:
-
-
-
+            # Use information_schema which works in DuckDB 1.2.2
+            query = f"SELECT 1 FROM information_schema.schemata WHERE schema_name = '{schema_name}' LIMIT 1"
+            result = duckrun_instance.con.execute(query).fetchall()
+            return len(result) > 0
     except:
         return False
 
 
 def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
-    """Get all existing tables in a schema
+    """Get all existing tables in a schema using information_schema, excluding temporary tables."""
     try:
-        # For main schema,
+        # For main schema, use SHOW TABLES
         if schema_name == "main":
             query = "SHOW TABLES"
+            result = duckrun_instance.con.execute(query).fetchall()
+            if result:
+                tables = [row[0] for row in result]
+                filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
+                return filtered_tables
         else:
-
-
-
-
-
-
+            # Use information_schema which works in DuckDB 1.2.2
+            query = f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{schema_name}'"
+            result = duckrun_instance.con.execute(query).fetchall()
+            if result:
+                tables = [row[0] for row in result]
+                filtered_tables = [tbl for tbl in tables if not tbl.startswith('tbl_')]
+                return filtered_tables
         return []
     except:
        return []
@@ -136,8 +142,8 @@ def get_stats(duckrun_instance, source: str):
     print(f"Processing {len(list_tables)} tables: {list_tables}")
 
     for idx, tbl in enumerate(list_tables):
-        # Construct lakehouse path using ABFSS URL
-        table_path = f"
+        # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
+        table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
 
         try:
             dt = DeltaTable(table_path)
@@ -218,7 +224,7 @@ def get_stats(duckrun_instance, source: str):
         WHERE tbl IS NOT NULL
         GROUP BY tbl
         ORDER BY total_rows DESC
-    ''').
+    ''').df()
 
     return final_result
 
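The stats helpers now go through information_schema rather than catalog-specific commands, which the diff comments note works on DuckDB 1.2.2. A minimal sketch of the same two queries against a plain DuckDB connection (the schema and table names are illustrative):

import duckdb

con = duckdb.connect()
con.execute("CREATE SCHEMA aemo")
con.execute("CREATE TABLE aemo.calendar (d DATE)")

# Schema existence, as in _schema_exists:
exists = len(con.execute(
    "SELECT 1 FROM information_schema.schemata WHERE schema_name = 'aemo' LIMIT 1"
).fetchall()) > 0

# Table listing, as in _get_existing_tables_in_schema:
tables = [row[0] for row in con.execute(
    "SELECT table_name FROM information_schema.tables WHERE table_schema = 'aemo'"
).fetchall()]
# exists → True, tables → ['calendar']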
duckrun-0.2.5.dev1.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
+duckrun/core.py,sha256=-UAsAOlOmVkVuQCkbCWTH7aiFSHP0OenAAWSl0i0inY,37107
+duckrun/files.py,sha256=piWRU5w9jHrW-wuV4Gf-SKY_jhFv9eflxgWO8AZCQTI,10495
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/runner.py,sha256=lfwNoU1CZXh6bPTHvGWVaUWjzG5crvT7Pzq4onMEVjw,12576
+duckrun/stats.py,sha256=2FTqoQNVjD84-H1HjStHxZkOpAGKXS79M55B00pOlok,9804
+duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
+duckrun-0.2.5.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.5.dev1.dist-info/METADATA,sha256=ZQfd7I-J08MC5ytjKt789PqHdroHI0ld36aMNp-57yE,18344
+duckrun-0.2.5.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.5.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.5.dev1.dist-info/RECORD,,
duckrun-0.2.3.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=LN5rc5B3HLimgslZdC8tLKe3rjTl_KD8WxCh1qoJhdM,16443
-duckrun/files.py,sha256=xba0juMEQPgaznDudmXcwaGH0wv-6aCoHmV_cNF6Y7I,10665
-duckrun/runner.py,sha256=X5g-57OCHQZ7USKpcBbhYGUcZwLQny2x147DLKrV32c,11417
-duckrun/stats.py,sha256=jLEkxNo7MjibPMpjMsXyedrJqv9-BAnP1C0L2a7H8Z8,9417
-duckrun/writer.py,sha256=eWrGtDQTbXi8H3sSt2WucYTdEQUjK97KmQxzCbqAuMs,6221
-duckrun-0.2.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.3.dist-info/METADATA,sha256=CpJvtR9l8c9b1AV9-KnjN4fZODE_3oJxS3omz4p-qlc,18339
-duckrun-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.3.dist-info/RECORD,,
Files without changes:
- {duckrun-0.2.3.dist-info → duckrun-0.2.5.dev1.dist-info}/WHEEL
- {duckrun-0.2.3.dist-info → duckrun-0.2.5.dev1.dist-info}/licenses/LICENSE
- {duckrun-0.2.3.dist-info → duckrun-0.2.5.dev1.dist-info}/top_level.txt