duckrun 0.2.18.dev1__tar.gz → 0.2.18.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckrun might be problematic.
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/PKG-INFO +1 -1
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/__init__.py +1 -1
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/core.py +13 -9
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/semantic_model.py +118 -10
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/stats.py +10 -2
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/pyproject.toml +1 -1
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/LICENSE +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/README.md +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/auth.py +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/files.py +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/notebook.py +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/runner.py +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun/writer.py +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.18.dev1 → duckrun-0.2.18.dev3}/setup.cfg +0 -0
duckrun/core.py

@@ -1035,12 +1035,13 @@ class Duckrun(WorkspaceOperationsMixin):
         """Get underlying DuckDB connection"""
         return self.con

-    def get_stats(self, source: str):
+    def get_stats(self, source: str = None):
         """
         Get comprehensive statistics for Delta Lake tables.

         Args:
-            source: Can be one of:
+            source: Optional. Can be one of:
+                - None: Use all tables in the connection's schema (default)
                 - Table name: 'table_name' (uses current schema)
                 - Schema.table: 'schema.table_name' (specific table in schema)
                 - Schema only: 'schema' (all tables in schema)
@@ -1052,6 +1053,9 @@ class Duckrun(WorkspaceOperationsMixin):
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")

+            # All tables in current schema (aemo)
+            stats = con.get_stats()
+
             # Single table in current schema
             stats = con.get_stats('price')

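These two hunks make `source` optional on `Duckrun.get_stats`, so a call with no argument now covers every table in the connection's schema. A minimal usage sketch based on the docstring examples above — the lakehouse path and table names are placeholders, and an actual run needs a reachable Fabric lakehouse:

    import duckrun

    # Placeholder lakehouse path from the docstring; adjust to a real workspace/lakehouse/schema
    con = duckrun.connect("tmp/data.lakehouse/aemo")

    # New in dev3: no argument means every table in the connection's schema
    all_stats = con.get_stats()

    # Existing call forms keep working
    price_stats = con.get_stats("price")         # single table in the current schema
    table_stats = con.get_stats("aemo.price")    # explicit schema.table (illustrative)

The remaining duckrun/core.py hunks change deploy()'s default dataset name in the same spirit.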
duckrun/core.py (continued)

@@ -1184,7 +1188,7 @@ class Duckrun(WorkspaceOperationsMixin):
             - URL: "https://raw.githubusercontent.com/.../model.bim"
             - Local file: "model.bim"
             - Workspace/Model: "workspace_name/model_name"
-            dataset_name: Name for the semantic model (default:
+            dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)

         Returns:
@@ -1193,14 +1197,14 @@ class Duckrun(WorkspaceOperationsMixin):
         Examples:
             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

+            # Deploy with schema name as dataset name (dbo)
+            dr.deploy("https://github.com/.../model.bim")
+
             # Deploy from workspace/model (uses same name by default)
             dr.deploy("Source Workspace/Source Model")  # Creates "Source Model"

             # Deploy with custom name
-            dr.deploy("
-
-            # Deploy from URL or local file
-            dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
+            dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
         """
         from .semantic_model import deploy_semantic_model

@@ -1212,9 +1216,9 @@ class Duckrun(WorkspaceOperationsMixin):
             if len(parts) == 2:
                 dataset_name = parts[1]  # Use the model name
             else:
-                dataset_name =
+                dataset_name = self.schema  # Use schema name
         else:
-            dataset_name =
+            dataset_name = self.schema  # Use schema name

         # Call the deployment function (DirectLake only)
         return deploy_semantic_model(
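With these hunks, deploy() falls back to the connection's schema (`self.schema`) whenever no dataset name can be derived from the source. A short sketch of the resulting call patterns, with the .bim URL left as a placeholder exactly as in the docstring:

    dr = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    # No dataset_name: the semantic model is named after the schema ("dbo")
    dr.deploy("https://raw.githubusercontent.com/.../model.bim")

    # "workspace/model" sources still reuse the source model's name
    dr.deploy("Source Workspace/Source Model")   # creates "Source Model"

    # An explicit dataset_name always takes precedence
    dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="Sales Model")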
duckrun/semantic_model.py

@@ -130,13 +130,66 @@ def check_dataset_exists(dataset_name, workspace_id, client):


 def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
-    """Refresh a dataset and monitor progress using Power BI API
+    """Refresh a dataset and monitor progress using Power BI API
+
+    For DirectLake models, performs a two-step refresh:
+    1. clearValues - Purges data from memory
+    2. full - Reframes data from Delta tables
+
+    If a refresh is already in progress, waits for it to complete before starting a new one.
+    """

     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)

-
+    # Use Power BI API for refresh (not Fabric API)
+    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+    headers = client._get_headers()
+
+    # Check for in-progress refreshes
+    print("   Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f"   ⚠️ Found in-progress refresh (ID: {refresh_id})")
+                    print(f"   Waiting for current refresh to complete...")
+
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+
+                            if current_status == 'Completed':
+                                print(f"   ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f"   ⚠️ Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f"   ⚠️ Previous refresh was cancelled, continuing with new refresh")
+                                break
+
+                            if attempt % 6 == 0:
+                                print(f"   Still waiting... (status: {current_status})")
+                    else:
+                        print(f"   ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f"   ⚠️ Could not check refresh status: {e}")
+        print(f"   Continuing with refresh attempt...")
+
+    # Step 1: clearValues - Purge data from memory
+    print("   Step 1: Clearing values from memory...")
+    clearvalues_payload = {
         "type": "clearValues",
         "commitMode": "transactional",
         "maxParallelism": 10,
@@ -144,14 +197,63 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
         "objects": []
     }

-
-
-
+    response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)
+
+    if response.status_code in [200, 202]:
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print("   ✓ Clear values initiated, monitoring progress...")
+
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+
+                    if status == 'Completed':
+                        print(f"   ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f"   Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print("   ✓ Clear values completed")
+    else:
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
+
+    # Step 2: full refresh - Reframe data from Delta tables
+    print("   Step 2: Full refresh to reframe data...")
+    full_payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }

-    response = requests.post(powerbi_url, headers=headers, json=
+    response = requests.post(powerbi_url, headers=headers, json=full_payload)

     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f"   ✓ Refresh initiated")

         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
@@ -183,7 +285,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):

             raise Exception(f"Refresh timed out")
     else:
-
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()


 def download_bim_from_github(url_or_path):
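Most of the new lines implement a two-step DirectLake refresh through the Power BI REST API, polling for completion at each step. Below is a condensed, standalone sketch of that flow under stated assumptions: the bearer token and dataset id are placeholders you must supply, the polling limits are arbitrary, and duckrun itself builds its headers through its own auth client rather than a raw token.

    import time
    import requests

    # Placeholders: a valid AAD bearer token and the target dataset id are assumed
    token = "<bearer-token>"
    dataset_id = "<dataset-id>"

    url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    def wait_for(refresh_id, poll=5, attempts=60):
        """Poll one refresh until it reaches a terminal state or the attempts run out."""
        for _ in range(attempts):
            time.sleep(poll)
            status = requests.get(f"{url}/{refresh_id}", headers=headers).json().get("status")
            if status in ("Completed", "Failed", "Cancelled"):
                return status
        return "Timeout"

    # Step 1: clearValues purges the model's data from memory
    resp = requests.post(url, headers=headers,
                         json={"type": "clearValues", "commitMode": "transactional", "objects": []})
    resp.raise_for_status()
    if resp.status_code == 202:
        print("clearValues:", wait_for(resp.headers["Location"].split("/")[-1]))

    # Step 2: a full refresh reframes the model from the current Delta tables
    resp = requests.post(url, headers=headers,
                         json={"type": "full", "commitMode": "transactional", "objects": []})
    resp.raise_for_status()
    if resp.status_code == 202:
        print("full refresh:", wait_for(resp.headers["Location"].split("/")[-1]))

The final semantic_model.py hunk, continued below, takes this refresh path when the target dataset already exists.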
duckrun/semantic_model.py (continued)

@@ -471,13 +579,13 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)

     if dataset_exists:
-        print(f"
+        print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")

         if wait_seconds > 0:
             print(f"   Waiting {wait_seconds} seconds...")
             time.sleep(wait_seconds)

-        print("\n[Step
+        print("\n[Step 3/3] Refreshing existing semantic model...")
         refresh_dataset(dataset_name, workspace_id, client)

         print("\n" + "=" * 70)
duckrun/stats.py

@@ -60,13 +60,14 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []


-def get_stats(duckrun_instance, source: str):
+def get_stats(duckrun_instance, source: str = None):
     """
     Get comprehensive statistics for Delta Lake tables.

     Args:
         duckrun_instance: The Duckrun connection instance
-        source: Can be one of:
+        source: Optional. Can be one of:
+            - None: Use all tables in the connection's schema (default)
             - Table name: 'table_name' (uses main schema in DuckDB)
             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
             - Schema only: 'schema' (all tables in schema, if multi-schema)
@@ -78,6 +79,9 @@ def get_stats(duckrun_instance, source: str):
     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")

+        # All tables in the connection's schema
+        stats = con.get_stats()
+
         # Single table in main schema (DuckDB uses 'main', not 'test')
         stats = con.get_stats('price_today')

@@ -93,6 +97,10 @@ def get_stats(duckrun_instance, source: str):
     duckdb_schema = "main"
     url_schema = duckrun_instance.schema  # This is from the connection URL path

+    # If source is not provided, default to all tables in the connection's schema
+    if source is None:
+        source = url_schema
+
     # Parse the source and validate existence
     if '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
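The stats.py change mirrors the core.py signature: when `source` is omitted, the schema taken from the connection URL is substituted before parsing. A tiny sketch of that fallback — `_resolve_source` is a made-up helper for illustration; in the module the check sits inline in `get_stats`:

    def _resolve_source(source, url_schema):
        # source=None now means "every table in the connection's schema"
        return url_schema if source is None else source

    assert _resolve_source(None, "aemo") == "aemo"      # con.get_stats()
    assert _resolve_source("price", "aemo") == "price"  # con.get_stats('price')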
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "duckrun"
-version = "0.2.18.dev1"
+version = "0.2.18.dev3"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}