duckrun 0.2.18.dev1__tar.gz → 0.2.18.dev3__tar.gz

This diff shows the changes between two publicly released versions of the package, exactly as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of duckrun might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev1
+Version: 0.2.18.dev3
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -3,7 +3,7 @@
 from duckrun.core import Duckrun
 from duckrun.notebook import import_notebook_from_web, import_notebook
 
-__version__ = "0.2.18.dev1"
+__version__ = "0.2.18.dev2"
 
 # Expose unified connect method at module level
 connect = Duckrun.connect
@@ -1035,12 +1035,13 @@ class Duckrun(WorkspaceOperationsMixin):
         """Get underlying DuckDB connection"""
         return self.con
 
-    def get_stats(self, source: str):
+    def get_stats(self, source: str = None):
         """
         Get comprehensive statistics for Delta Lake tables.
 
         Args:
-            source: Can be one of:
+            source: Optional. Can be one of:
+                - None: Use all tables in the connection's schema (default)
                 - Table name: 'table_name' (uses current schema)
                 - Schema.table: 'schema.table_name' (specific table in schema)
                 - Schema only: 'schema' (all tables in schema)
@@ -1052,6 +1053,9 @@ class Duckrun(WorkspaceOperationsMixin):
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")
 
+            # All tables in current schema (aemo)
+            stats = con.get_stats()
+
             # Single table in current schema
             stats = con.get_stats('price')
 
@@ -1184,7 +1188,7 @@ class Duckrun(WorkspaceOperationsMixin):
                 - URL: "https://raw.githubusercontent.com/.../model.bim"
                 - Local file: "model.bim"
                 - Workspace/Model: "workspace_name/model_name"
-            dataset_name: Name for the semantic model (default: source model name if workspace/model format, else lakehouse_schema)
+            dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
 
         Returns:
@@ -1193,14 +1197,14 @@ class Duckrun(WorkspaceOperationsMixin):
         Examples:
             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
 
+            # Deploy with schema name as dataset name (dbo)
+            dr.deploy("https://github.com/.../model.bim")
+
             # Deploy from workspace/model (uses same name by default)
             dr.deploy("Source Workspace/Source Model") # Creates "Source Model"
 
             # Deploy with custom name
-            dr.deploy("Source Workspace/Source Model", dataset_name="Sales Model Copy")
-
-            # Deploy from URL or local file
-            dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
+            dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
         """
         from .semantic_model import deploy_semantic_model
 
@@ -1212,9 +1216,9 @@ class Duckrun(WorkspaceOperationsMixin):
                 if len(parts) == 2:
                     dataset_name = parts[1] # Use the model name
                 else:
-                    dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                    dataset_name = self.schema # Use schema name
             else:
-                dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                dataset_name = self.schema # Use schema name
 
         # Call the deployment function (DirectLake only)
         return deploy_semantic_model(
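Taken together with the docstring change above, this hunk replaces the old default dataset name (f"{lakehouse_name}_{schema}") with the bare schema name. Below is a minimal sketch of the resulting fallback order, written as a standalone helper purely for illustration; resolve_dataset_name is hypothetical, and the URL check is an assumption not visible in the hunk.

    def resolve_dataset_name(source, schema, dataset_name=None):
        """Illustrative only: mirrors the fallback order shown in the diff above."""
        if dataset_name:
            return dataset_name              # an explicit name always wins
        parts = source.split("/")
        if not source.startswith("http") and len(parts) == 2:
            return parts[1]                  # "workspace/model" source: reuse the model name
        return schema                        # otherwise: the connection's schema (was f"{lakehouse}_{schema}")

With a connection such as "My Workspace/My Lakehouse.lakehouse/dbo", deploying a .bim URL without dataset_name now produces a model named "dbo" rather than "My Lakehouse_dbo".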
@@ -130,13 +130,66 @@ def check_dataset_exists(dataset_name, workspace_id, client):
 
 
 def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
-    """Refresh a dataset and monitor progress using Power BI API"""
+    """Refresh a dataset and monitor progress using Power BI API
+
+    For DirectLake models, performs a two-step refresh:
+    1. clearValues - Purges data from memory
+    2. full - Reframes data from Delta tables
+
+    If a refresh is already in progress, waits for it to complete before starting a new one.
+    """
 
     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
 
-    payload = {
+    # Use Power BI API for refresh (not Fabric API)
+    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+    headers = client._get_headers()
+
+    # Check for in-progress refreshes
+    print(" Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f" ⚠️ Found in-progress refresh (ID: {refresh_id})")
+                    print(f" Waiting for current refresh to complete...")
+
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+
+                            if current_status == 'Completed':
+                                print(f" ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f" ⚠️ Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f" ⚠️ Previous refresh was cancelled, continuing with new refresh")
+                                break
+
+                            if attempt % 6 == 0:
+                                print(f" Still waiting... (status: {current_status})")
+                    else:
+                        print(f" ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f" ⚠️ Could not check refresh status: {e}")
+        print(f" Continuing with refresh attempt...")
+
+    # Step 1: clearValues - Purge data from memory
+    print(" Step 1: Clearing values from memory...")
+    clearvalues_payload = {
         "type": "clearValues",
         "commitMode": "transactional",
         "maxParallelism": 10,
@@ -144,14 +197,63 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
         "objects": []
     }
 
-    # Use Power BI API for refresh (not Fabric API)
-    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
-    headers = client._get_headers()
+    response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)
+
+    if response.status_code in [200, 202]:
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print(" ✓ Clear values initiated, monitoring progress...")
+
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+
+                    if status == 'Completed':
+                        print(f" ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f" Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print(" ✓ Clear values completed")
+    else:
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
+
+    # Step 2: full refresh - Reframe data from Delta tables
+    print(" Step 2: Full refresh to reframe data...")
+    full_payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
 
-    response = requests.post(powerbi_url, headers=headers, json=payload)
+    response = requests.post(powerbi_url, headers=headers, json=full_payload)
 
     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f" ✓ Refresh initiated")
 
         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
@@ -183,7 +285,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
 
             raise Exception(f"Refresh timed out")
     else:
-        response.raise_for_status()
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
 
 
 def download_bim_from_github(url_or_path):
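The three refresh_dataset hunks above add an in-progress check and turn the single refresh call into a two-step sequence against the Power BI REST API: a clearValues request that purges the model's data from memory, followed by a full refresh that reframes it from the Delta tables. Below is a condensed, illustrative sketch of that sequence with the detailed logging and error reporting trimmed; two_step_directlake_refresh is hypothetical, and the headers argument is assumed to carry a valid bearer token (duckrun builds it through its own client).

    import time
    import requests

    def two_step_directlake_refresh(dataset_id, headers):
        """Condensed illustration of the clearValues + full sequence shown in the diff above."""
        url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
        for refresh_type in ("clearValues", "full"):
            payload = {
                "type": refresh_type,
                "commitMode": "transactional",
                "maxParallelism": 10,
                "retryCount": 2,
                "objects": [],
            }
            resp = requests.post(url, headers=headers, json=payload)
            resp.raise_for_status()
            if resp.status_code == 202:
                # Asynchronous refresh: poll the operation until it reaches a terminal state
                refresh_id = resp.headers["Location"].rstrip("/").split("/")[-1]
                status = "Unknown"
                for _ in range(60):
                    status = requests.get(f"{url}/{refresh_id}", headers=headers).json().get("status")
                    if status in ("Completed", "Failed", "Cancelled"):
                        break
                    time.sleep(5)
                if status != "Completed":
                    raise RuntimeError(f"{refresh_type} refresh ended with status {status}")

Running clearValues before full matches the docstring added above: the first request purges the in-memory data, the second reframes the model against the current Delta table versions.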
@@ -471,13 +579,13 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
 
     if dataset_exists:
-        print(f"\n✓ Dataset exists - refreshing...")
+        print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")
 
         if wait_seconds > 0:
             print(f" Waiting {wait_seconds} seconds...")
             time.sleep(wait_seconds)
 
-        print("\n[Step 6/6] Refreshing semantic model...")
+        print("\n[Step 3/3] Refreshing existing semantic model...")
         refresh_dataset(dataset_name, workspace_id, client)
 
         print("\n" + "=" * 70)
@@ -60,13 +60,14 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []
 
 
-def get_stats(duckrun_instance, source: str):
+def get_stats(duckrun_instance, source: str = None):
     """
     Get comprehensive statistics for Delta Lake tables.
 
     Args:
         duckrun_instance: The Duckrun connection instance
-        source: Can be one of:
+        source: Optional. Can be one of:
+            - None: Use all tables in the connection's schema (default)
             - Table name: 'table_name' (uses main schema in DuckDB)
            - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
            - Schema only: 'schema' (all tables in schema, if multi-schema)
@@ -78,6 +79,9 @@ def get_stats(duckrun_instance, source: str):
     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")
 
+        # All tables in the connection's schema
+        stats = con.get_stats()
+
         # Single table in main schema (DuckDB uses 'main', not 'test')
         stats = con.get_stats('price_today')
 
@@ -93,6 +97,10 @@ def get_stats(duckrun_instance, source: str):
     duckdb_schema = "main"
     url_schema = duckrun_instance.schema # This is from the connection URL path
 
+    # If source is not provided, default to all tables in the connection's schema
+    if source is None:
+        source = url_schema
+
     # Parse the source and validate existence
     if '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
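Both the Duckrun.get_stats method and the module-level get_stats helper above make source optional and default it to the schema the connection was opened with. A short usage sketch of the new default, based on the examples in the updated docstrings (the lakehouse path and table name are illustrative):

    import duckrun

    con = duckrun.connect("tmp/data.lakehouse/aemo")   # illustrative lakehouse/schema path
    all_stats = con.get_stats()          # new: statistics for every table in the 'aemo' schema
    one_table = con.get_stats("price")   # unchanged: statistics for a single table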
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev1
+Version: 0.2.18.dev3
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.18.dev1"
+version = "0.2.18.dev3"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}