duckrun 0.2.9.dev0__py3-none-any.whl → 0.2.9.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/auth.py +9 -0
- duckrun/core.py +39 -0
- duckrun/semantic_model.py +402 -0
- duckrun/stats.py +14 -1
- duckrun/writer.py +46 -16
- {duckrun-0.2.9.dev0.dist-info → duckrun-0.2.9.dev2.dist-info}/METADATA +1 -1
- duckrun-0.2.9.dev2.dist-info/RECORD +14 -0
- duckrun-0.2.9.dev0.dist-info/RECORD +0 -13
- {duckrun-0.2.9.dev0.dist-info → duckrun-0.2.9.dev2.dist-info}/WHEEL +0 -0
- {duckrun-0.2.9.dev0.dist-info → duckrun-0.2.9.dev2.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.9.dev0.dist-info → duckrun-0.2.9.dev2.dist-info}/top_level.txt +0 -0
duckrun/auth.py
CHANGED
@@ -123,6 +123,12 @@ def get_fabric_api_token() -> Optional[str]:
 
     Returns:
         Fabric API token string or None if authentication fails
     """
+    # Check if we already have a cached Fabric API token
+    fabric_token_env = os.environ.get("FABRIC_API_TOKEN")
+    if fabric_token_env:
+        print("✅ Using cached Fabric API token")
+        return fabric_token_env
+
     print("🔐 Getting Fabric API token...")
 
     # Try Fabric notebook environment first
@@ -130,6 +136,7 @@ def get_fabric_api_token() -> Optional[str]:
         import notebookutils  # type: ignore
         print("📓 Microsoft Fabric notebook detected - using notebookutils")
         token = notebookutils.credentials.getToken("pbi")
+        os.environ["FABRIC_API_TOKEN"] = token
         print("✅ Fabric API token obtained!")
         return token
     except ImportError:
@@ -158,6 +165,7 @@ def get_fabric_api_token() -> Optional[str]:
        print("🔐 Trying Azure CLI for Fabric API...")
        credential = AzureCliCredential()
        token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+       os.environ["FABRIC_API_TOKEN"] = token_obj.token
        print("✅ Fabric API token obtained via Azure CLI!")
        return token_obj.token
    except Exception as cli_error:
@@ -167,6 +175,7 @@ def get_fabric_api_token() -> Optional[str]:
        credential = InteractiveBrowserCredential()
 
        token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+       os.environ["FABRIC_API_TOKEN"] = token_obj.token
        print("✅ Fabric API token obtained!")
        return token_obj.token
 
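The net effect of these additions is that the first successful authentication stores the token in the FABRIC_API_TOKEN environment variable and every later call in the same process reuses it. A minimal sketch of that reuse, assuming authentication succeeds on the first call (the import path comes from this package; everything else is illustrative):

    import os
    from duckrun.auth import get_fabric_api_token

    # First call authenticates (notebookutils, Azure CLI, or interactive browser)
    # and caches the token in the process environment.
    token = get_fabric_api_token()

    # Later calls short-circuit on the cached value instead of re-authenticating.
    assert os.environ.get("FABRIC_API_TOKEN") == token
    token_again = get_fabric_api_token()  # prints "✅ Using cached Fabric API token"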
duckrun/core.py
CHANGED
@@ -696,6 +696,45 @@ class Duckrun:
             print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
             return False
 
+    def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
+               wait_seconds: int = 5) -> int:
+        """
+        Deploy a semantic model from a BIM file using DirectLake mode.
+
+        Args:
+            bim_url: URL to the BIM file (e.g., GitHub raw URL)
+            dataset_name: Name for the semantic model (default: lakehouse_schema)
+            wait_seconds: Seconds to wait for permission propagation (default: 5)
+
+        Returns:
+            1 for success, 0 for failure
+
+        Examples:
+            dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+            # Deploy with auto-generated name
+            dr.deploy("https://raw.githubusercontent.com/.../model.bim")
+
+            # Deploy with custom name
+            dr.deploy("https://raw.githubusercontent.com/.../model.bim",
+                      dataset_name="Sales Model")
+        """
+        from .semantic_model import deploy_semantic_model
+
+        # Auto-generate dataset name if not provided
+        if dataset_name is None:
+            dataset_name = f"{self.lakehouse_name}_{self.schema}"
+
+        # Call the deployment function (DirectLake only)
+        return deploy_semantic_model(
+            workspace_name=self.workspace,
+            lakehouse_name=self.lakehouse_name,
+            schema_name=self.schema,
+            dataset_name=dataset_name,
+            bim_url=bim_url,
+            wait_seconds=wait_seconds
+        )
+
     def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
         """Helper method to get workspace ID from name"""
         try:
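Taken together with the docstring above, the new method gives a one-call deployment path from an existing connection. A hedged usage sketch that mirrors the docstring examples (workspace, lakehouse, and BIM URL are placeholders):

    from duckrun.core import Duckrun

    dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    # Auto-generated dataset name: "<lakehouse>_<schema>"
    dr.deploy("https://raw.githubusercontent.com/.../model.bim")

    # Custom dataset name; deploy() returns 1 on success, 0 on failure
    status = dr.deploy("https://raw.githubusercontent.com/.../model.bim",
                       dataset_name="Sales Model")
    if status != 1:
        raise RuntimeError("Semantic model deployment failed")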
duckrun/semantic_model.py
ADDED
@@ -0,0 +1,402 @@
+"""
+Semantic Model Deployer - DirectLake mode for Fabric Lakehouses
+Uses duckrun's authentication. Works anywhere duckrun works.
+"""
+
+import requests
+import json
+import time
+import base64
+
+
+class FabricRestClient:
+    """Fabric REST API client using duckrun's authentication."""
+
+    def __init__(self):
+        self.base_url = "https://api.fabric.microsoft.com"
+        self.token = None
+        self._get_token()
+
+    def _get_token(self):
+        """Get Fabric API token using duckrun's auth module"""
+        from duckrun.auth import get_fabric_api_token
+        self.token = get_fabric_api_token()
+        if not self.token:
+            raise Exception("Failed to get Fabric API token")
+
+    def _get_headers(self):
+        return {
+            "Authorization": f"Bearer {self.token}",
+            "Content-Type": "application/json"
+        }
+
+    def get(self, endpoint: str):
+        url = f"{self.base_url}{endpoint}"
+        response = requests.get(url, headers=self._get_headers())
+        response.raise_for_status()
+        return response
+
+    def post(self, endpoint: str, json: dict = None):
+        url = f"{self.base_url}{endpoint}"
+        response = requests.post(url, headers=self._get_headers(), json=json)
+        response.raise_for_status()
+        return response
+
+
+def get_workspace_id(workspace_name_or_id, client):
+    """Get workspace ID by name or validate if already a GUID"""
+    import re
+
+    # Check if input is already a GUID
+    guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+    if guid_pattern.match(workspace_name_or_id):
+        # It's already a GUID, verify it exists
+        try:
+            response = client.get(f"/v1/workspaces/{workspace_name_or_id}")
+            workspace_name = response.json().get('displayName', workspace_name_or_id)
+            print(f"✓ Found workspace: {workspace_name}")
+            return workspace_name_or_id
+        except:
+            raise ValueError(f"Workspace with ID '{workspace_name_or_id}' not found")
+
+    # It's a name, search for it
+    response = client.get("/v1/workspaces")
+    workspaces = response.json().get('value', [])
+
+    workspace_match = next((ws for ws in workspaces if ws.get('displayName') == workspace_name_or_id), None)
+    if not workspace_match:
+        raise ValueError(f"Workspace '{workspace_name_or_id}' not found")
+
+    workspace_id = workspace_match['id']
+    print(f"✓ Found workspace: {workspace_name_or_id}")
+    return workspace_id
+
+
+def get_lakehouse_id(lakehouse_name_or_id, workspace_id, client):
+    """Get lakehouse ID by name or validate if already a GUID"""
+    import re
+
+    # Check if input is already a GUID
+    guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+    if guid_pattern.match(lakehouse_name_or_id):
+        # It's already a GUID, verify it exists
+        try:
+            response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
+            items = response.json().get('value', [])
+            lakehouse_match = next((item for item in items if item.get('id') == lakehouse_name_or_id), None)
+            if lakehouse_match:
+                lakehouse_name = lakehouse_match.get('displayName', lakehouse_name_or_id)
+                print(f"✓ Found lakehouse: {lakehouse_name}")
+                return lakehouse_name_or_id
+            else:
+                raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found")
+        except Exception as e:
+            raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found: {e}")
+
+    # It's a name, search for it
+    response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
+    items = response.json().get('value', [])
+
+    lakehouse_match = next((item for item in items if item.get('displayName') == lakehouse_name_or_id), None)
+    if not lakehouse_match:
+        raise ValueError(f"Lakehouse '{lakehouse_name_or_id}' not found")
+
+    lakehouse_id = lakehouse_match['id']
+    print(f"✓ Found lakehouse: {lakehouse_name_or_id}")
+    return lakehouse_id
+
+
+def get_dataset_id(dataset_name, workspace_id, client):
+    """Get dataset ID by name"""
+    response = client.get(f"/v1/workspaces/{workspace_id}/semanticModels")
+    items = response.json().get('value', [])
+
+    dataset_match = next((item for item in items if item.get('displayName') == dataset_name), None)
+    if not dataset_match:
+        raise ValueError(f"Dataset '{dataset_name}' not found")
+
+    return dataset_match['id']
+
+
+def check_dataset_exists(dataset_name, workspace_id, client):
+    """Check if dataset already exists"""
+    try:
+        get_dataset_id(dataset_name, workspace_id, client)
+        print(f"⚠️ Dataset '{dataset_name}' already exists")
+        return True
+    except:
+        print(f"✓ Dataset name '{dataset_name}' is available")
+        return False
+
+
+def refresh_dataset(dataset_name, workspace_id, client):
+    """Refresh a dataset and monitor progress"""
+    dataset_id = get_dataset_id(dataset_name, workspace_id, client)
+
+    payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
+
+    response = client.post(
+        f"/v1/workspaces/{workspace_id}/semanticModels/{dataset_id}/refreshes",
+        json=payload
+    )
+
+    if response.status_code in [200, 202]:
+        print(f"✓ Refresh initiated")
+
+        refresh_id = response.json().get('id')
+        if refresh_id:
+            print(" Monitoring refresh progress...")
+            max_attempts = 60
+            for attempt in range(max_attempts):
+                time.sleep(5)
+
+                status_response = client.get(
+                    f"/v1/workspaces/{workspace_id}/semanticModels/{dataset_id}/refreshes/{refresh_id}"
+                )
+                status = status_response.json().get('status')
+
+                if status == 'Completed':
+                    print(f"✓ Refresh completed successfully")
+                    return
+                elif status == 'Failed':
+                    error = status_response.json().get('error', {})
+                    raise Exception(f"Refresh failed: {error.get('message', 'Unknown error')}")
+                elif status == 'Cancelled':
+                    raise Exception("Refresh was cancelled")
+
+                if attempt % 6 == 0:
+                    print(f" Status: {status}...")
+
+            raise Exception(f"Refresh timed out")
+
+
+def download_bim_from_github(url):
+    """Download BIM file from URL"""
+    print(f"Downloading BIM file...")
+    response = requests.get(url)
+    response.raise_for_status()
+    bim_content = response.json()
+    print(f"✓ BIM file downloaded")
+    print(f" - Tables: {len(bim_content.get('model', {}).get('tables', []))}")
+    print(f" - Relationships: {len(bim_content.get('model', {}).get('relationships', []))}")
+    return bim_content
+
+
+def update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name):
+    """Update BIM file for DirectLake mode"""
+
+    new_url = f"https://onelake.dfs.fabric.microsoft.com/{workspace_id}/{lakehouse_id}"
+    expression_name = None
+
+    # Update or create DirectLake expression
+    if 'model' in bim_content and 'expressions' in bim_content['model']:
+        for expr in bim_content['model']['expressions']:
+            if 'DirectLake' in expr['name'] or expr.get('kind') == 'm':
+                expression_name = expr['name']
+                expr['expression'] = [
+                    "let",
+                    f" Source = AzureStorage.DataLake(\"{new_url}\", [HierarchicalNavigation=true])",
+                    "in",
+                    " Source"
+                ]
+                break
+
+    if not expression_name:
+        expression_name = f"DirectLake - {schema_name}"
+        if 'expressions' not in bim_content['model']:
+            bim_content['model']['expressions'] = []
+
+        bim_content['model']['expressions'].append({
+            "name": expression_name,
+            "kind": "m",
+            "expression": [
+                "let",
+                f" Source = AzureStorage.DataLake(\"{new_url}\", [HierarchicalNavigation=true])",
+                "in",
+                " Source"
+            ],
+            "lineageTag": f"directlake-{schema_name}-source"
+        })
+
+    # Update table partitions for DirectLake
+    if 'tables' in bim_content['model']:
+        for table in bim_content['model']['tables']:
+            if 'partitions' in table:
+                for partition in table['partitions']:
+                    if 'source' in partition:
+                        partition['mode'] = 'directLake'
+                        partition['source'] = {
+                            "type": "entity",
+                            "entityName": partition['source'].get('entityName', table['name']),
+                            "expressionSource": expression_name,
+                            "schemaName": schema_name
+                        }
+
+    print(f"✓ Updated BIM for DirectLake")
+    print(f" - OneLake URL: {new_url}")
+    print(f" - Schema: {schema_name}")
+
+    return bim_content
+
+
+def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
+    """Create semantic model from BIM using Fabric REST API"""
+    # Convert to base64
+    bim_json = json.dumps(bim_content, indent=2)
+    bim_base64 = base64.b64encode(bim_json.encode('utf-8')).decode('utf-8')
+
+    pbism_content = {"version": "1.0"}
+    pbism_json = json.dumps(pbism_content)
+    pbism_base64 = base64.b64encode(pbism_json.encode('utf-8')).decode('utf-8')
+
+    payload = {
+        "displayName": dataset_name,
+        "definition": {
+            "parts": [
+                {
+                    "path": "model.bim",
+                    "payload": bim_base64,
+                    "payloadType": "InlineBase64"
+                },
+                {
+                    "path": "definition.pbism",
+                    "payload": pbism_base64,
+                    "payloadType": "InlineBase64"
+                }
+            ]
+        }
+    }
+
+    response = client.post(
+        f"/v1/workspaces/{workspace_id}/semanticModels",
+        json=payload
+    )
+
+    print(f"✓ Semantic model created")
+
+    # Handle long-running operation
+    if response.status_code == 202:
+        operation_id = response.headers.get('x-ms-operation-id')
+        print(f" Waiting for operation to complete...")
+
+        max_attempts = 30
+        for attempt in range(max_attempts):
+            time.sleep(2)
+            status_response = client.get(f"/v1/operations/{operation_id}")
+            status = status_response.json().get('status')
+
+            if status == 'Succeeded':
+                print(f"✓ Operation completed")
+                break
+            elif status == 'Failed':
+                error = status_response.json().get('error', {})
+                raise Exception(f"Operation failed: {error.get('message')}")
+            elif attempt == max_attempts - 1:
+                raise Exception(f"Operation timed out")
+
+
+def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
+                          bim_url, wait_seconds=5):
+    """
+    Deploy a semantic model using DirectLake mode.
+
+    Args:
+        workspace_name_or_id: Name or GUID of the target workspace
+        lakehouse_name_or_id: Name or GUID of the lakehouse
+        schema_name: Schema name (e.g., 'dbo', 'staging')
+        dataset_name: Name for the semantic model
+        bim_url: URL to the BIM file
+        wait_seconds: Seconds to wait before refresh (default: 5)
+
+    Returns:
+        1 for success, 0 for failure
+
+    Examples:
+        dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+        dr.deploy("https://raw.githubusercontent.com/.../model.bim")
+    """
+    print("=" * 70)
+    print("Semantic Model Deployment (DirectLake)")
+    print("=" * 70)
+
+    client = FabricRestClient()
+
+    try:
+        # Step 1: Get workspace ID
+        print("\n[Step 1/6] Getting workspace information...")
+        workspace_id = get_workspace_id(workspace_name_or_id, client)
+
+        # Step 2: Check if dataset exists
+        print(f"\n[Step 2/6] Checking if dataset '{dataset_name}' exists...")
+        dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
+
+        if dataset_exists:
+            print(f"\n✓ Dataset exists - refreshing...")
+
+            if wait_seconds > 0:
+                print(f" Waiting {wait_seconds} seconds...")
+                time.sleep(wait_seconds)
+
+            print("\n[Step 6/6] Refreshing semantic model...")
+            refresh_dataset(dataset_name, workspace_id, client)
+
+            print("\n" + "=" * 70)
+            print("🎉 Refresh Completed!")
+            print("=" * 70)
+            print(f"Dataset: {dataset_name}")
+            print("=" * 70)
+            return 1
+
+        # Step 3: Get lakehouse ID
+        print(f"\n[Step 3/6] Finding lakehouse...")
+        lakehouse_id = get_lakehouse_id(lakehouse_name_or_id, workspace_id, client)
+
+        # Step 4: Download and update BIM
+        print("\n[Step 4/6] Downloading and configuring BIM file...")
+        bim_content = download_bim_from_github(bim_url)
+
+        modified_bim = update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name)
+        modified_bim['name'] = dataset_name
+        modified_bim['id'] = dataset_name
+
+        # Step 5: Deploy
+        print("\n[Step 5/6] Deploying semantic model...")
+        create_dataset_from_bim(dataset_name, modified_bim, workspace_id, client)
+
+        if wait_seconds > 0:
+            print(f" Waiting {wait_seconds} seconds for permissions...")
+            time.sleep(wait_seconds)
+
+        # Step 6: Refresh
+        print("\n[Step 6/6] Refreshing semantic model...")
+        refresh_dataset(dataset_name, workspace_id, client)
+
+        print("\n" + "=" * 70)
+        print("🎉 Deployment Completed!")
+        print("=" * 70)
+        print(f"Dataset: {dataset_name}")
+        print(f"Workspace: {workspace_name_or_id}")
+        print(f"Lakehouse: {lakehouse_name_or_id}")
+        print(f"Schema: {schema_name}")
+        print("=" * 70)
+
+        return 1
+
+    except Exception as e:
+        print("\n" + "=" * 70)
+        print("❌ Deployment Failed")
+        print("=" * 70)
+        print(f"Error: {str(e)}")
+        print("\n💡 Troubleshooting:")
+        print(f" - Verify workspace '{workspace_name_or_id}' exists")
+        print(f" - Verify lakehouse '{lakehouse_name_or_id}' exists")
+        print(f" - Ensure tables exist in '{schema_name}' schema")
+        print(f" - Check tables are in Delta format")
+        print("=" * 70)
+        return 0
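The module can also be driven without going through Duckrun.deploy(). A short sketch calling the deployment entry point directly, with placeholder names and keyword names taken from the function signature above:

    from duckrun.semantic_model import deploy_semantic_model

    result = deploy_semantic_model(
        workspace_name_or_id="My Workspace",        # name or GUID
        lakehouse_name_or_id="My Lakehouse",        # name or GUID
        schema_name="dbo",
        dataset_name="Sales Model",
        bim_url="https://raw.githubusercontent.com/.../model.bim",
        wait_seconds=5,
    )
    print("deployed" if result == 1 else "failed")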
duckrun/stats.py
CHANGED
@@ -147,7 +147,20 @@ def get_stats(duckrun_instance, source: str):
 
     try:
         dt = DeltaTable(table_path)
-
+        add_actions = dt.get_add_actions(flatten=True)
+
+        # Convert to dict - compatible with both old and new deltalake versions
+        # Try to_pydict() first (old versions), fall back to to_pylist() (new versions)
+        try:
+            xx = add_actions.to_pydict()
+        except AttributeError:
+            # New version with arro3: use to_pylist() and convert to dict of lists
+            records = add_actions.to_pylist()
+            if records:
+                # Convert list of dicts to dict of lists
+                xx = {key: [record[key] for record in records] for key in records[0].keys()}
+            else:
+                xx = {}
 
         # Check if VORDER exists
         vorder = 'tags.VORDER' in xx.keys()
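The same fallback can be read as a small standalone helper; a minimal sketch, where add_actions is the object returned by DeltaTable.get_add_actions(flatten=True) and the helper name is illustrative rather than part of duckrun:

    def add_actions_to_columns(add_actions) -> dict:
        try:
            # Older deltalake versions return a pyarrow table with to_pydict()
            return add_actions.to_pydict()
        except AttributeError:
            # Newer versions (arro3) expose to_pylist(); convert the list of dicts
            # to a dict of lists so downstream key lookups keep working
            records = add_actions.to_pylist()
            if not records:
                return {}
            return {key: [record[key] for record in records] for key in records[0]}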
duckrun/writer.py
CHANGED
@@ -1,18 +1,36 @@
 """
 Delta Lake writer functionality for duckrun - Spark-style write API
 """
-from deltalake import DeltaTable, write_deltalake
+from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version
 
 
 # Row Group configuration for optimal Delta Lake performance
 RG = 8_000_000
 
+# Check deltalake version once at module load
+# Version 0.18.x and 0.19.x support engine parameter and row group optimization
+# Version 0.20+ removed these features (rust only, no row groups)
+_DELTALAKE_VERSION = tuple(map(int, deltalake_version.split('.')[:2]))
+_IS_OLD_DELTALAKE = _DELTALAKE_VERSION < (0, 20)
+
 
 def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
     """
-    Build arguments for write_deltalake based on requirements:
-
-    -
+    Build arguments for write_deltalake based on requirements and version:
+
+    deltalake 0.18.2 - 0.19.x:
+    - Has 'engine' parameter (defaults to 'pyarrow')
+    - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
+    - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
+    - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
+
+    deltalake 0.20+:
+    - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
+    - Does NOT have max_rows_per_file (row group optimization removed)
+    - When mergeSchema=True: must set schema_mode='merge'
+    - When mergeSchema=False: just write normally (no special params)
+
+    Uses version detection for simpler logic.
     """
     args = {
         'table_or_uri': path,
@@ -24,23 +42,24 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
     if partition_by:
         args['partition_by'] = partition_by
 
-    # Engine selection based on schema_mode
     if schema_mode == 'merge':
-        #
+        # Schema merging mode - must explicitly set schema_mode='merge'
         args['schema_mode'] = 'merge'
-        args['engine'] = 'rust'
-    else:
-        # Try to use pyarrow engine with row group optimization
-        # Check if row group parameters are supported by inspecting function signature
-        import inspect
-        sig = inspect.signature(write_deltalake)
 
-        if
-        #
+        if _IS_OLD_DELTALAKE:
+            # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
+            # Do NOT use row group params (they conflict with rust engine)
+            args['engine'] = 'rust'
+        # For version 0.20+: just schema_mode='merge' is enough, rust is default
+    else:
+        # Normal write mode (no schema merging)
+        if _IS_OLD_DELTALAKE:
+            # deltalake 0.18.2-0.19.x: use row group optimization
+            # DON'T set engine parameter - pyarrow is the default and works with row groups
            args['max_rows_per_file'] = RG
            args['max_rows_per_group'] = RG
            args['min_rows_per_group'] = RG
-    # For
+        # For version 0.20+: no optimization available (rust by default, no row group params supported)
 
     return args
 
@@ -113,7 +132,18 @@ class DeltaWriter:
             partition_by=self._partition_by
         )
 
-
+        # Prepare info message based on version and settings
+        if self._schema_mode == 'merge':
+            if _IS_OLD_DELTALAKE:
+                engine_info = " (engine=rust, schema_mode=merge)"
+            else:
+                engine_info = " (schema_mode=merge, rust by default)"
+        else:
+            if _IS_OLD_DELTALAKE:
+                engine_info = " (engine=pyarrow, optimized row groups)"
+            else:
+                engine_info = " (engine=rust by default)"
+
         partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
 
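The version gate introduced at module load can be reproduced outside the package to check which write path a given environment will take. A simplified sketch of the same branching (write_kwargs is an illustrative helper, not part of duckrun; the parsing mirrors _DELTALAKE_VERSION above):

    from deltalake import __version__ as deltalake_version

    RG = 8_000_000
    is_old = tuple(map(int, deltalake_version.split('.')[:2])) < (0, 20)

    def write_kwargs(schema_mode=None) -> dict:
        kwargs = {}
        if schema_mode == 'merge':
            kwargs['schema_mode'] = 'merge'
            if is_old:
                kwargs['engine'] = 'rust'   # 0.18.x-0.19.x still need the explicit rust engine
        elif is_old:
            # pyarrow engine (the old default) accepts row group tuning
            kwargs.update(max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
        return kwargs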
duckrun-0.2.9.dev2.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
+duckrun/auth.py,sha256=qPaLQ7InlV9leA9r6E6VEeYavFFoBi0zSN8m_l1aoQs,9545
+duckrun/core.py,sha256=ulKRnxTH8MfGtcKAAORBs-_vd-_jIyWmwv9Bims0TsQ,39267
+duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
+duckrun/semantic_model.py,sha256=y_E1VlpqSx9DHOGi--4ZccaODErthzty5CVN4TI-mQ0,15509
+duckrun/stats.py,sha256=CXfb2DWF3PgOckelJooU0y-BAsNT9NFDfDYEmo0mUQQ,10473
+duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
+duckrun-0.2.9.dev2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.9.dev2.dist-info/METADATA,sha256=4vDjuMN2L2Uiv6CLIP7UyaKss6-jjNE7ZUxJC5SLnT8,19277
+duckrun-0.2.9.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.9.dev2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.9.dev2.dist-info/RECORD,,
duckrun-0.2.9.dev0.dist-info/RECORD
DELETED
@@ -1,13 +0,0 @@
-duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
-duckrun/auth.py,sha256=qColLkvmk8S_qRAXLMGh_TgVeSPkv0j15dv55wgrX1o,9139
-duckrun/core.py,sha256=bEt_Zi8R0ByKyyWllZpQKnV3D1fHZiR7lwwVrDrINQA,37720
-duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
-duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
-duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
-duckrun/stats.py,sha256=2FTqoQNVjD84-H1HjStHxZkOpAGKXS79M55B00pOlok,9804
-duckrun/writer.py,sha256=3UwuoH4yjcomBaTbRXOSjlA82jRhhjErkOWDCX7K7mw,6595
-duckrun-0.2.9.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.9.dev0.dist-info/METADATA,sha256=_7fH-927WrJ5u0O4jmZHo3vwjlh3PEILc3mrtma5yH0,19277
-duckrun-0.2.9.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.9.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.9.dev0.dist-info/RECORD,,
{duckrun-0.2.9.dev0.dist-info → duckrun-0.2.9.dev2.dist-info}/WHEEL
File without changes
{duckrun-0.2.9.dev0.dist-info → duckrun-0.2.9.dev2.dist-info}/licenses/LICENSE
File without changes
{duckrun-0.2.9.dev0.dist-info → duckrun-0.2.9.dev2.dist-info}/top_level.txt
File without changes