duckrun 0.2.13__py3-none-any.whl → 0.2.19.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckrun/__init__.py +4 -2
- duckrun/auth.py +12 -0
- duckrun/core.py +703 -179
- duckrun/notebook.py +324 -0
- duckrun/rle.py +860 -0
- duckrun/runner.py +15 -45
- duckrun/semantic_model.py +143 -17
- duckrun/stats.py +267 -62
- duckrun/writer.py +35 -6
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/METADATA +3 -3
- duckrun-0.2.19.dev5.dist-info/RECORD +16 -0
- duckrun-0.2.13.dist-info/RECORD +0 -14
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/WHEEL +0 -0
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.13.dist-info → duckrun-0.2.19.dev5.dist-info}/top_level.txt +0 -0
duckrun/runner.py
CHANGED
@@ -7,45 +7,7 @@ import importlib.util
 from typing import List, Tuple, Dict, Optional, Callable, Any
 from string import Template
 from deltalake import DeltaTable, write_deltalake
-
-RG = 8_000_000
-
-
-def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
-    """
-    Build arguments for write_deltalake based on requirements:
-    - If schema_mode='merge': use rust engine (no row group params)
-    - Otherwise: use pyarrow engine with row group optimization (if supported)
-    """
-    args = {
-        'table_or_uri': path,
-        'data': df,
-        'mode': mode
-    }
-
-    # Add partition_by if specified
-    if partition_by:
-        args['partition_by'] = partition_by
-
-    # Engine selection based on schema_mode
-    if schema_mode == 'merge':
-        # Use rust engine for schema merging (no row group params supported)
-        args['schema_mode'] = 'merge'
-        args['engine'] = 'rust'
-    else:
-        # Try to use pyarrow engine with row group optimization
-        # Check if row group parameters are supported by inspecting function signature
-        import inspect
-        sig = inspect.signature(write_deltalake)
-
-        if 'max_rows_per_file' in sig.parameters:
-            # Older deltalake version - use row group optimization
-            args['max_rows_per_file'] = RG
-            args['max_rows_per_group'] = RG
-            args['min_rows_per_group'] = RG
-        # For newer versions, just use default parameters
-
-    return args
+from .writer import _build_write_deltalake_args


 def run(duckrun_instance, pipeline: List[Tuple]) -> bool:
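The _build_write_deltalake_args helper (and presumably the RG row-group constant) now lives in duckrun/writer.py and is re-imported here. A minimal sketch of how a caller might use the relocated helper together with deltalake's write_deltalake, assuming it keeps the signature shown in the removed code; the DataFrame and table path are placeholders, not taken from the package:

    import pandas as pd
    from deltalake import write_deltalake
    from duckrun.writer import _build_write_deltalake_args  # relocated helper, signature as in the removed code

    df = pd.DataFrame({"id": [1, 2, 3], "region": ["a", "b", "a"]})  # placeholder data

    # Without schema_mode the helper only adds row-group sizing when the installed
    # deltalake version supports it; schema_mode="merge" would route to the rust engine
    # per the docstring above.
    args = _build_write_deltalake_args(
        "/tmp/demo_delta_table",   # hypothetical path; duckrun would pass a OneLake table URI
        df,
        mode="overwrite",
        partition_by=["region"],
    )
    write_deltalake(**args)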
@@ -130,9 +92,12 @@ def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
 
     # Get original and resolved names
     original_workspace = duckrun_instance.workspace
-    original_lakehouse = duckrun_instance.
+    original_lakehouse = duckrun_instance.lakehouse_display_name  # Base name without suffix (e.g., "data")
     resolved_workspace = duckrun_instance.workspace_id
-
+
+    # Always pass base lakehouse name (without .Lakehouse suffix) to user functions
+    # User functions expect just the name like "data", not "data.Lakehouse"
+    resolved_lakehouse = duckrun_instance.lakehouse_display_name
 
     # Substitute workspace/lakehouse names in args if they differ
     # This prevents URL encoding issues when names contain spaces
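Here lakehouse_display_name is the base item name with any .Lakehouse-style suffix stripped, in contrast to lakehouse_name, which may carry the suffix. An illustrative helper (not duckrun's actual implementation) showing the distinction, using the suffix list that appears later in this diff:

    def display_name(lakehouse_name: str) -> str:
        # Strip a trailing item-type suffix to recover the base display name.
        for suffix in ('.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase'):
            if lakehouse_name.endswith(suffix):
                return lakehouse_name[:-len(suffix)]
        return lakehouse_name

    print(display_name("data.Lakehouse"))  # -> data
    print(display_name("data"))            # -> data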
@@ -149,7 +114,7 @@ def _run_python(duckrun_instance, name: str, args: tuple) -> Any:
             else:
                 substituted_args.append(arg)
         args = tuple(substituted_args)
-        print(f"📝 Auto-substituted workspace/lakehouse names in args
+        print(f"📝 Auto-substituted workspace/lakehouse names in args")
 
     print(f"Running Python: {name}{args}")
     result = func(*args)
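For context, the substitution loop these hunks touch rewrites workspace/lakehouse names found in string arguments so user functions receive the resolved base names. A rough sketch of that pattern, assuming plain string replacement; the names and path below are invented:

    def substitute_names(args, original_ws, resolved_ws, original_lh, resolved_lh):
        # Rewrite names only inside string arguments; everything else passes through untouched.
        substituted_args = []
        for arg in args:
            if isinstance(arg, str):
                new_arg = arg.replace(original_ws, resolved_ws).replace(original_lh, resolved_lh)
                substituted_args.append(new_arg)
            else:
                substituted_args.append(arg)
        return tuple(substituted_args)

    # ("My Workspace/data/files/raw.csv", 10) -> ("<workspace-guid>/data/files/raw.csv", 10)
    print(substitute_names(("My Workspace/data/files/raw.csv", 10),
                           "My Workspace", "<workspace-guid>", "data", "data"))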
@@ -282,12 +247,17 @@ def _read_sql_file(duckrun_instance, table_name: str, params: Optional[Dict] = N
             # If GUID, use just the GUID
             content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
         else:
-            # If not GUID,
-
+            # If not GUID, check if lakehouse_name already has .ItemType suffix
+            if duckrun_instance.lakehouse_name.endswith(('.Lakehouse', '.Warehouse', '.Database', '.SnowflakeDatabase')):
+                # Already has suffix - use as is
+                content = content.replace('${lh}.Lakehouse', duckrun_instance.lakehouse_name)
+            else:
+                # No suffix - add .Lakehouse for legacy format
+                content = content.replace('${lh}.Lakehouse', f'{duckrun_instance.lakehouse_name}.Lakehouse')
 
     full_params = {
         'ws': duckrun_instance.workspace,
-        'lh': duckrun_instance.
+        'lh': duckrun_instance.lakehouse_display_name,  # Use display name (without suffix) for backward compat
         'schema': duckrun_instance.schema,
         'storage_account': duckrun_instance.storage_account,
         'tables_url': duckrun_instance.table_base_url,
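After the ${lh}.Lakehouse pre-processing above, the remaining placeholders are presumably filled from full_params with string.Template, which this module already imports. A small, self-contained illustration of that mechanism with invented values:

    from string import Template

    sql = "SELECT '${ws}' AS workspace, '${lh}' AS lakehouse, '${tables_url}/${schema}/sales' AS path"
    full_params = {
        'ws': 'My Workspace',   # invented values, for illustration only
        'lh': 'data',           # display name, without the .Lakehouse suffix
        'schema': 'dbo',
        'storage_account': 'onelake',
        'tables_url': 'abfss://My Workspace@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Tables',
    }
    print(Template(sql).substitute(full_params))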
duckrun/semantic_model.py
CHANGED
@@ -129,29 +129,136 @@ def check_dataset_exists(dataset_name, workspace_id, client):
     return False
 
 
-def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
-    """Refresh a dataset and monitor progress using Power BI API
+def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
+    """Refresh a dataset and monitor progress using Power BI API
+
+    For DirectLake models, performs refresh based on refresh parameter:
+    - refresh="full": Two-step refresh (clearValues + full reframe)
+    - refresh="ignore": Skip refresh entirely
+
+    If a refresh is already in progress, waits for it to complete before starting a new one.
+    """
+
+    # Skip refresh entirely if refresh is "ignore"
+    if refresh == "ignore":
+        print(" Ignoring refresh - skipping refresh")
+        return
 
     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
 
-
-
+    # Use Power BI API for refresh (not Fabric API)
+    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+    headers = client._get_headers()
+
+    # Check for in-progress refreshes
+    print(" Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f" ⚠️ Found in-progress refresh (ID: {refresh_id})")
+                    print(f" Waiting for current refresh to complete...")
+
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+
+                            if current_status == 'Completed':
+                                print(f" ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f" ⚠️ Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f" ⚠️ Previous refresh was cancelled, continuing with new refresh")
+                                break
+
+                            if attempt % 6 == 0:
+                                print(f" Still waiting... (status: {current_status})")
+                    else:
+                        print(f" ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f" ⚠️ Could not check refresh status: {e}")
+        print(f" Continuing with refresh attempt...")
+
+    # Step 1: clearValues - Purge data from memory
+    print(" Step 1: Clearing values from memory...")
+    clearvalues_payload = {
+        "type": "clearValues",
         "commitMode": "transactional",
         "maxParallelism": 10,
         "retryCount": 2,
         "objects": []
     }
 
-
-    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
-    headers = client._get_headers()
+    response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)
 
-    response
+    if response.status_code in [200, 202]:
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print(" ✓ Clear values initiated, monitoring progress...")
+
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+
+                    if status == 'Completed':
+                        print(f" ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f" Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print(" ✓ Clear values completed")
+    else:
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
+
+    # Step 2: full refresh - Reframe data from Delta tables
+    print(" Step 2: Full refresh to reframe data...")
+    full_payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
+
+    response = requests.post(powerbi_url, headers=headers, json=full_payload)
 
     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f" ✓ Refresh initiated")
 
         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
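A minimal usage sketch of the reworked refresh, assuming only that the client object exposes _get_headers() returning Power BI REST headers; the SimpleClient class, token, and GUID below are invented placeholders:

    from duckrun.semantic_model import refresh_dataset

    class SimpleClient:
        """Hypothetical stand-in for duckrun's authenticated client; only _get_headers() is used here."""
        def __init__(self, token: str):
            self.token = token

        def _get_headers(self):
            return {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}

    client = SimpleClient("<AAD access token for the Power BI API>")   # placeholder token
    workspace_id = "00000000-0000-0000-0000-000000000000"              # placeholder workspace GUID

    # Two-step DirectLake refresh: clearValues purge, then a full reframe from the Delta tables.
    refresh_dataset("Sales Model", workspace_id, client, refresh="full")

    # Or skip the refresh entirely:
    refresh_dataset("Sales Model", workspace_id, client, refresh="ignore")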
@@ -183,7 +290,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
 
                     raise Exception(f"Refresh timed out")
     else:
-
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
 
 
 def download_bim_from_github(url_or_path):
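The new failure path tries to surface the Power BI error payload before falling back to raise_for_status(). The same pattern in isolation, exercised against a hand-built requests.Response (the error body is hypothetical, not from the package):

    import json
    import requests

    def describe_refresh_failure(response: requests.Response) -> None:
        # Prefer the JSON error message when the body parses; otherwise defer to raise_for_status().
        try:
            error_details = response.json()
            error_message = error_details.get('error', {}).get('message', response.text)
            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
        except (json.JSONDecodeError, ValueError):
            response.raise_for_status()

    resp = requests.Response()
    resp.status_code = 400
    resp._content = b'{"error": {"message": "Dataset is in the middle of another refresh"}}'
    try:
        describe_refresh_failure(resp)
    except Exception as exc:
        print(exc)  # Refresh request failed with status 400: Dataset is in the middle of another refresh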
@@ -431,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
 
 
 def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
-                          bim_url_or_path, wait_seconds=5):
+                          bim_url_or_path, wait_seconds=5, refresh="full"):
     """
     Deploy a semantic model using DirectLake mode.
 
@@ -442,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         dataset_name: Name for the semantic model
         bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -454,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Using a local file
         dr.deploy("./my_model.bim")
         dr.deploy("C:/path/to/model.bim")
+
+        # Deploy without refresh
+        dr.deploy("./my_model.bim", refresh="ignore")
     """
     print("=" * 70)
     print("Semantic Model Deployment (DirectLake)")
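A hedged end-to-end sketch of the docstring examples above, assuming the usual duckrun connect pattern ("workspace/lakehouse.lakehouse/schema"); the workspace, lakehouse, and BIM path are placeholders:

    import duckrun

    # Placeholder connection string: "<workspace>/<lakehouse>.lakehouse/<schema>"
    dr = duckrun.connect("My Workspace/data.lakehouse/dbo")

    # Deploy and run the default two-step DirectLake refresh (clearValues + full reframe).
    dr.deploy("./my_model.bim")

    # Deploy but skip the refresh, e.g. when the Delta tables are not loaded yet.
    dr.deploy("./my_model.bim", refresh="ignore")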
@@ -471,14 +590,14 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
 
     if dataset_exists:
-        print(f"
+        print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")
 
         if wait_seconds > 0:
             print(f" Waiting {wait_seconds} seconds...")
             time.sleep(wait_seconds)
 
-        print("\n[Step
-        refresh_dataset(dataset_name, workspace_id, client)
+        print("\n[Step 3/3] Refreshing existing semantic model...")
+        refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)
 
         print("\n" + "=" * 70)
         print("🎉 Refresh Completed!")
@@ -510,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
 
     # Step 6: Refresh using the dataset ID returned from creation
     print("\n[Step 6/6] Refreshing semantic model...")
-    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)
 
     print("\n" + "=" * 70)
     print("🎉 Deployment Completed!")
@@ -537,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         return 0
 
 
-def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
     """
     Copy a semantic model from one workspace to another.
 
@@ -550,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         destination: Destination in format "workspace/lakehouse.lakehouse/schema"
         new_model_name: Name for the new semantic model (default: same as source)
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -562,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
                    new_model_name="Production Model - Copy")
 
+        # Copy without refresh
+        copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
+
         # Using the connect pattern
         import duckrun
         duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -688,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         schema_name=schema,
         dataset_name=new_model_name,
         bim_url_or_path=temp_bim_path,
-        wait_seconds=wait_seconds
+        wait_seconds=wait_seconds,
+        refresh=refresh
     )
 
     # Clean up temp file