duckrun-0.2.16.dev0-py3-none-any.whl → duckrun-0.2.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


duckrun/__init__.py CHANGED
@@ -1,10 +1,11 @@
 """Duckrun - Lakehouse task runner powered by DuckDB"""

 from duckrun.core import Duckrun
+from duckrun.notebook import import_notebook_from_web, import_notebook

-__version__ = "0.2.14.dev2"
+__version__ = "0.2.18"

 # Expose unified connect method at module level
 connect = Duckrun.connect

-__all__ = ["Duckrun", "connect"]
+__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
duckrun/core.py CHANGED
@@ -12,7 +12,71 @@ from .runner import run as _run
 from .files import copy as _copy, download as _download
 from .writer import QueryResult

-class Duckrun:
+
+class WorkspaceOperationsMixin:
+    """
+    Mixin class for workspace-level operations that work for both
+    full Duckrun connections and workspace-only connections.
+    """
+
+    def import_notebook_from_web(self, url: str,
+                                 notebook_name: Optional[str] = None,
+                                 overwrite: bool = False) -> dict:
+        """
+        Import a Jupyter notebook from a web URL into the workspace.
+
+        Args:
+            url: URL to the notebook file (e.g., GitHub raw URL). Required.
+            notebook_name: Name for the imported notebook. Optional - derived from URL if not provided.
+            overwrite: Whether to overwrite if notebook already exists (default: False)
+
+        Returns:
+            Dictionary with import result
+
+        Examples:
+            con = duckrun.connect("workspace/lakehouse.lakehouse")
+            result = con.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+
+            ws = duckrun.connect("workspace")
+            result = ws.import_notebook_from_web(
+                url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
+            )
+        """
+        from .notebook import import_notebook_from_web as _import_notebook_from_web
+
+        # Get workspace name from either self.workspace or self.workspace_name
+        workspace_name = getattr(self, 'workspace', None) or getattr(self, 'workspace_name', None)
+
+        return _import_notebook_from_web(
+            url=url,
+            notebook_name=notebook_name,
+            overwrite=overwrite,
+            workspace_name=workspace_name
+        )
+
+    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
+        """Helper method to get workspace ID from name"""
+        try:
+            url = "https://api.fabric.microsoft.com/v1/workspaces"
+            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            for workspace in workspaces:
+                if workspace.get("displayName") == workspace_name:
+                    return workspace.get("id")
+
+            return None
+
+        except Exception:
+            return None
+
+
+class Duckrun(WorkspaceOperationsMixin):
     """
     OneLake task runner with clean tuple-based API.
     Supports lakehouses, warehouses, databases, and other OneLake items.
@@ -971,12 +1035,13 @@ class Duckrun:
         """Get underlying DuckDB connection"""
         return self.con

-    def get_stats(self, source: str):
+    def get_stats(self, source: str = None):
         """
         Get comprehensive statistics for Delta Lake tables.

         Args:
-            source: Can be one of:
+            source: Optional. Can be one of:
+                - None: Use all tables in the connection's schema (default)
                 - Table name: 'table_name' (uses current schema)
                 - Schema.table: 'schema.table_name' (specific table in schema)
                 - Schema only: 'schema' (all tables in schema)
@@ -988,6 +1053,9 @@ class Duckrun:
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")

+            # All tables in current schema (aemo)
+            stats = con.get_stats()
+
             # Single table in current schema
             stats = con.get_stats('price')

@@ -1111,7 +1179,7 @@ class Duckrun:
         return False

     def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-               wait_seconds: int = 5) -> int:
+               wait_seconds: int = 5, refresh: str = "full") -> int:
         """
         Deploy a semantic model from a BIM file using DirectLake mode.

@@ -1120,8 +1188,11 @@ class Duckrun:
                 - URL: "https://raw.githubusercontent.com/.../model.bim"
                 - Local file: "model.bim"
                 - Workspace/Model: "workspace_name/model_name"
-            dataset_name: Name for the semantic model (default: source model name if workspace/model format, else lakehouse_schema)
+            dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
+            refresh: Refresh strategy:
+                - "full": Clear values and process full refresh (default)
+                - "ignore": Skip refresh entirely

         Returns:
             1 for success, 0 for failure
@@ -1129,14 +1200,17 @@ class Duckrun:
         Examples:
             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

+            # Deploy with schema name as dataset name (dbo)
+            dr.deploy("https://github.com/.../model.bim")
+
             # Deploy from workspace/model (uses same name by default)
             dr.deploy("Source Workspace/Source Model")  # Creates "Source Model"

             # Deploy with custom name
-            dr.deploy("Source Workspace/Source Model", dataset_name="Sales Model Copy")
+            dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")

-            # Deploy from URL or local file
-            dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
+            # Deploy without refresh
+            dr.deploy("https://github.com/.../model.bim", refresh="ignore")
         """
         from .semantic_model import deploy_semantic_model

@@ -1148,9 +1222,9 @@ class Duckrun:
             if len(parts) == 2:
                 dataset_name = parts[1]  # Use the model name
             else:
-                dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                dataset_name = self.schema  # Use schema name
         else:
-            dataset_name = f"{self.lakehouse_name}_{self.schema}"
+            dataset_name = self.schema  # Use schema name

         # Call the deployment function (DirectLake only)
         return deploy_semantic_model(
@@ -1159,28 +1233,10 @@ class Duckrun:
             schema_name=self.schema,
             dataset_name=dataset_name,
             bim_url_or_path=bim_url,
-            wait_seconds=wait_seconds
+            wait_seconds=wait_seconds,
+            refresh=refresh
         )

-    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-        """Helper method to get workspace ID from name"""
-        try:
-            url = "https://api.fabric.microsoft.com/v1/workspaces"
-            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-
-            workspaces = response.json().get("value", [])
-            for workspace in workspaces:
-                if workspace.get("displayName") == workspace_name:
-                    return workspace.get("id")
-
-            return None
-
-        except Exception:
-            return None
-
     def close(self):
         """Close DuckDB connection"""
         if self.con:
@@ -1188,7 +1244,7 @@ class Duckrun:
             print("Connection closed")


-class WorkspaceConnection:
+class WorkspaceConnection(WorkspaceOperationsMixin):
     """
     Simple workspace connection for lakehouse management operations.
     """
@@ -1428,23 +1484,4 @@ class WorkspaceConnection:
             print(f"❌ Error downloading semantic model: {e}")
             import traceback
             traceback.print_exc()
-            return None
-
-    def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
-        """Helper method to get workspace ID from name"""
-        try:
-            url = "https://api.fabric.microsoft.com/v1/workspaces"
-            headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-
-            workspaces = response.json().get("value", [])
-            for workspace in workspaces:
-                if workspace.get("displayName") == workspace_name:
-                    return workspace.get("id")
-
-            return None
-
-        except Exception:
-            return None
-
             return None
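Taken together, the core changes mean both connection flavours share WorkspaceOperationsMixin, `get_stats()` can be called with no argument, and `deploy()` takes a refresh strategy. A hedged sketch of how the new surface fits together; the workspace, lakehouse and URLs are placeholders that assume those resources exist:

```python
import duckrun

# The same notebook-import method is available on a full lakehouse connection
# and on a workspace-only connection, via the shared mixin.
con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
ws = duckrun.connect("My Workspace")

for c in (con, ws):
    result = c.import_notebook_from_web(
        url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb"
    )
    print(result["message"])

# get_stats() with no argument now covers every table in the connection's
# schema, and deploy() accepts the new refresh parameter.
stats = con.get_stats()
con.deploy("https://raw.githubusercontent.com/user/repo/main/model.bim", refresh="ignore")
```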
duckrun/notebook.py ADDED
@@ -0,0 +1,324 @@
+"""
+Notebook operations functionality for duckrun - Import notebooks from web using Fabric REST API
+"""
+import requests
+import base64
+from typing import Optional
+
+
+def import_notebook_from_web(
+    url: str,
+    notebook_name: Optional[str] = None,
+    overwrite: bool = False,
+    workspace_name: Optional[str] = None
+) -> dict:
+    """
+    Import a Jupyter notebook from a web URL into Microsoft Fabric workspace using REST API only.
+    Uses duckrun.connect context by default or explicit workspace name.
+
+    Args:
+        url: URL to the notebook file (e.g., GitHub raw URL). Required.
+        notebook_name: Name for the imported notebook in Fabric. Optional - will use filename from URL if not provided.
+        overwrite: Whether to overwrite if notebook already exists (default: False)
+        workspace_name: Target workspace name. Optional - will use current workspace from duckrun context if available.
+
+    Returns:
+        Dictionary with import result:
+        {
+            "success": bool,
+            "message": str,
+            "notebook": dict (if successful),
+            "overwritten": bool
+        }
+
+    Examples:
+        # Basic usage with duckrun context
+        import duckrun
+        dr = duckrun.connect("MyWorkspace/MyLakehouse.lakehouse")
+        from duckrun.notebook import import_notebook_from_web
+
+        result = import_notebook_from_web(
+            url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
+            notebook_name="MyNotebook"
+        )
+
+        # With explicit workspace
+        result = import_notebook_from_web(
+            url="https://raw.githubusercontent.com/user/repo/main/notebook.ipynb",
+            notebook_name="MyNotebook",
+            workspace_name="Analytics Workspace",
+            overwrite=True
+        )
+
+        # Minimal usage - derives name from URL
+        result = import_notebook_from_web(
+            url="https://raw.githubusercontent.com/user/repo/main/RunPerfScenario.ipynb"
+        )
+    """
+    try:
+        # Get authentication token
+        from duckrun.auth import get_fabric_api_token
+        token = get_fabric_api_token()
+        if not token:
+            return {
+                "success": False,
+                "message": "Failed to get authentication token",
+                "notebook": None,
+                "overwritten": False
+            }
+
+        base_url = "https://api.fabric.microsoft.com/v1"
+        headers = {
+            "Authorization": f"Bearer {token}",
+            "Content-Type": "application/json"
+        }
+
+        # Determine workspace ID
+        workspace_id = None
+
+        # Try to get from duckrun context if not provided
+        if not workspace_name:
+            try:
+                # Try to get from notebook context first
+                import notebookutils  # type: ignore
+                workspace_id = notebookutils.runtime.context.get("workspaceId")
+                print("📓 Using current workspace from Fabric notebook context")
+            except (ImportError, Exception):
+                # Not in notebook, try to get from environment/last connection
+                pass
+
+        # If still no workspace_id, resolve from workspace_name
+        if not workspace_id:
+            if not workspace_name:
+                return {
+                    "success": False,
+                    "message": "workspace_name must be provided when not in Fabric notebook context",
+                    "notebook": None,
+                    "overwritten": False
+                }
+
+            # Get workspace ID by name
+            print(f"🔍 Resolving workspace: {workspace_name}")
+            ws_url = f"{base_url}/workspaces"
+            response = requests.get(ws_url, headers=headers)
+            response.raise_for_status()
+
+            workspaces = response.json().get("value", [])
+            workspace = next((ws for ws in workspaces if ws.get("displayName") == workspace_name), None)
+
+            if not workspace:
+                return {
+                    "success": False,
+                    "message": f"Workspace '{workspace_name}' not found",
+                    "notebook": None,
+                    "overwritten": False
+                }
+
+            workspace_id = workspace.get("id")
+            print(f"✓ Found workspace: {workspace_name}")
+
+        # Derive notebook name from URL if not provided
+        if not notebook_name:
+            # Extract filename from URL
+            notebook_name = url.split("/")[-1]
+            if notebook_name.endswith(".ipynb"):
+                notebook_name = notebook_name[:-6]  # Remove .ipynb extension
+            print(f"📝 Using notebook name from URL: {notebook_name}")
+
+        # Check if notebook already exists
+        notebooks_url = f"{base_url}/workspaces/{workspace_id}/notebooks"
+        response = requests.get(notebooks_url, headers=headers)
+        response.raise_for_status()
+
+        notebooks = response.json().get("value", [])
+        existing_notebook = next((nb for nb in notebooks if nb.get("displayName") == notebook_name), None)
+
+        if existing_notebook and not overwrite:
+            return {
+                "success": True,
+                "message": f"Notebook '{notebook_name}' already exists (use overwrite=True to replace)",
+                "notebook": existing_notebook,
+                "overwritten": False
+            }
+
+        # Download notebook content from URL
+        print(f"⬇️ Downloading notebook from: {url}")
+        response = requests.get(url)
+        response.raise_for_status()
+        notebook_content = response.text
+        print(f"✓ Notebook downloaded successfully")
+
+        # Convert notebook content to base64
+        notebook_base64 = base64.b64encode(notebook_content.encode('utf-8')).decode('utf-8')
+
+        # Prepare the payload for creating/updating the notebook
+        if existing_notebook and overwrite:
+            # Update existing notebook
+            notebook_id = existing_notebook.get("id")
+            print(f"🔄 Updating existing notebook: {notebook_name}")
+
+            update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
+            payload = {
+                "definition": {
+                    "format": "ipynb",
+                    "parts": [
+                        {
+                            "path": "notebook-content.py",
+                            "payload": notebook_base64,
+                            "payloadType": "InlineBase64"
+                        }
+                    ]
+                }
+            }
+
+            response = requests.post(update_url, headers=headers, json=payload)
+            response.raise_for_status()
+
+            # Handle long-running operation
+            if response.status_code == 202:
+                operation_id = response.headers.get('x-ms-operation-id')
+                if operation_id:
+                    _wait_for_operation(operation_id, headers)
+
+            return {
+                "success": True,
+                "message": f"Notebook '{notebook_name}' updated successfully",
+                "notebook": existing_notebook,
+                "overwritten": True
+            }
+        else:
+            # Create new notebook
+            print(f"➕ Creating new notebook: {notebook_name}")
+
+            payload = {
+                "displayName": notebook_name,
+                "definition": {
+                    "format": "ipynb",
+                    "parts": [
+                        {
+                            "path": "notebook-content.py",
+                            "payload": notebook_base64,
+                            "payloadType": "InlineBase64"
+                        }
+                    ]
+                }
+            }
+
+            response = requests.post(notebooks_url, headers=headers, json=payload)
+            response.raise_for_status()
+
+            # Handle long-running operation
+            if response.status_code == 202:
+                operation_id = response.headers.get('x-ms-operation-id')
+                if operation_id:
+                    _wait_for_operation(operation_id, headers)
+
+            created_notebook = response.json()
+
+            return {
+                "success": True,
+                "message": f"Notebook '{notebook_name}' created successfully",
+                "notebook": created_notebook,
+                "overwritten": False
+            }
+
+    except requests.exceptions.RequestException as e:
+        return {
+            "success": False,
+            "message": f"HTTP Error: {str(e)}",
+            "notebook": None,
+            "overwritten": False
+        }
+    except Exception as e:
+        return {
+            "success": False,
+            "message": f"Error: {str(e)}",
+            "notebook": None,
+            "overwritten": False
+        }
+
+
+def _wait_for_operation(operation_id: str, headers: dict, max_attempts: int = 30) -> bool:
+    """
+    Wait for a long-running Fabric API operation to complete.
+
+    Args:
+        operation_id: The operation ID to monitor
+        headers: Request headers with authentication
+        max_attempts: Maximum number of polling attempts (default: 30)
+
+    Returns:
+        True if operation succeeded, False otherwise
+    """
+    import time
+
+    status_url = f"https://api.fabric.microsoft.com/v1/operations/{operation_id}"
+
+    for attempt in range(max_attempts):
+        time.sleep(2)
+
+        try:
+            response = requests.get(status_url, headers=headers)
+            response.raise_for_status()
+
+            status_data = response.json()
+            status = status_data.get('status')
+
+            if status == 'Succeeded':
+                print(f"✓ Operation completed successfully")
+                return True
+            elif status == 'Failed':
+                error = status_data.get('error', {})
+                print(f"❌ Operation failed: {error.get('message', 'Unknown error')}")
+                return False
+            else:
+                print(f"⏳ Operation in progress... ({status})")
+
+        except Exception as e:
+            print(f"⚠️ Error checking operation status: {e}")
+            return False
+
+    print(f"⚠️ Operation timed out after {max_attempts} attempts")
+    return False
+
+
+# Convenience wrapper for the try-except pattern mentioned in the request
+def import_notebook(
+    url: str,
+    notebook_name: Optional[str] = None,
+    overwrite: bool = False,
+    workspace_name: Optional[str] = None
+) -> None:
+    """
+    Convenience wrapper that prints results and handles errors.
+
+    Args:
+        url: URL to the notebook file
+        notebook_name: Name for the imported notebook
+        overwrite: Whether to overwrite if exists
+        workspace_name: Target workspace name
+
+    Examples:
+        from duckrun.notebook import import_notebook
+
+        import_notebook(
+            url="https://raw.githubusercontent.com/djouallah/fabric_demo/refs/heads/main/Benchmark/RunPerfScenario.ipynb",
+            notebook_name="RunPerfScenario",
+            overwrite=False
+        )
+    """
+    try:
+        result = import_notebook_from_web(
+            url=url,
+            notebook_name=notebook_name,
+            overwrite=overwrite,
+            workspace_name=workspace_name
+        )
+
+        if result["success"]:
+            print(f"✅ {result['message']}")
+        else:
+            print(f"❌ {result['message']}")
+
+    except Exception as e:
+        print(f"Error: {e}")
duckrun/runner.py CHANGED
@@ -7,45 +7,7 @@ import importlib.util
 from typing import List, Tuple, Dict, Optional, Callable, Any
 from string import Template
 from deltalake import DeltaTable, write_deltalake
-# Row Group configuration for optimal Delta Lake performance
-RG = 8_000_000
-
-
-def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
-    """
-    Build arguments for write_deltalake based on requirements:
-    - If schema_mode='merge': use rust engine (no row group params)
-    - Otherwise: use pyarrow engine with row group optimization (if supported)
-    """
-    args = {
-        'table_or_uri': path,
-        'data': df,
-        'mode': mode
-    }
-
-    # Add partition_by if specified
-    if partition_by:
-        args['partition_by'] = partition_by
-
-    # Engine selection based on schema_mode
-    if schema_mode == 'merge':
-        # Use rust engine for schema merging (no row group params supported)
-        args['schema_mode'] = 'merge'
-        args['engine'] = 'rust'
-    else:
-        # Try to use pyarrow engine with row group optimization
-        # Check if row group parameters are supported by inspecting function signature
-        import inspect
-        sig = inspect.signature(write_deltalake)
-
-        if 'max_rows_per_file' in sig.parameters:
-            # Older deltalake version - use row group optimization
-            args['max_rows_per_file'] = RG
-            args['max_rows_per_group'] = RG
-            args['min_rows_per_group'] = RG
-        # For newer versions, just use default parameters
-
-    return args
+from .writer import _build_write_deltalake_args


 def run(duckrun_instance, pipeline: List[Tuple]) -> bool:
duckrun/semantic_model.py CHANGED
@@ -129,29 +129,136 @@ def check_dataset_exists(dataset_name, workspace_id, client):
     return False


-def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
-    """Refresh a dataset and monitor progress using Power BI API"""
+def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
+    """Refresh a dataset and monitor progress using Power BI API
+
+    For DirectLake models, performs refresh based on refresh parameter:
+    - refresh="full": Two-step refresh (clearValues + full reframe)
+    - refresh="ignore": Skip refresh entirely
+
+    If a refresh is already in progress, waits for it to complete before starting a new one.
+    """
+
+    # Skip refresh entirely if refresh is "ignore"
+    if refresh == "ignore":
+        print(" Ignoring refresh - skipping refresh")
+        return

     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)

-    payload = {
-        "type": "full",
+    # Use Power BI API for refresh (not Fabric API)
+    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+    headers = client._get_headers()
+
+    # Check for in-progress refreshes
+    print(" Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f" ⚠️ Found in-progress refresh (ID: {refresh_id})")
+                    print(f" Waiting for current refresh to complete...")
+
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+
+                            if current_status == 'Completed':
+                                print(f" ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f" ⚠️ Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f" ⚠️ Previous refresh was cancelled, continuing with new refresh")
+                                break
+
+                            if attempt % 6 == 0:
+                                print(f" Still waiting... (status: {current_status})")
+                    else:
+                        print(f" ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f" ⚠️ Could not check refresh status: {e}")
+        print(f" Continuing with refresh attempt...")
+
+    # Step 1: clearValues - Purge data from memory
+    print(" Step 1: Clearing values from memory...")
+    clearvalues_payload = {
+        "type": "clearValues",
         "commitMode": "transactional",
         "maxParallelism": 10,
         "retryCount": 2,
         "objects": []
     }

-    # Use Power BI API for refresh (not Fabric API)
-    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
-    headers = client._get_headers()
+    response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)

-    response = requests.post(powerbi_url, headers=headers, json=payload)
+    if response.status_code in [200, 202]:
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print(" ✓ Clear values initiated, monitoring progress...")
+
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+
+                    if status == 'Completed':
+                        print(f" ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f" Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print(" ✓ Clear values completed")
+    else:
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
+
+    # Step 2: full refresh - Reframe data from Delta tables
+    print(" Step 2: Full refresh to reframe data...")
+    full_payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
+
+    response = requests.post(powerbi_url, headers=headers, json=full_payload)

     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f" ✓ Refresh initiated")

         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
@@ -183,7 +290,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):

         raise Exception(f"Refresh timed out")
     else:
-        response.raise_for_status()
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()


 def download_bim_from_github(url_or_path):
@@ -431,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):


 def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
-                          bim_url_or_path, wait_seconds=5):
+                          bim_url_or_path, wait_seconds=5, refresh="full"):
     """
     Deploy a semantic model using DirectLake mode.

@@ -442,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         dataset_name: Name for the semantic model
         bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely

     Returns:
         1 for success, 0 for failure
@@ -454,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Using a local file
         dr.deploy("./my_model.bim")
         dr.deploy("C:/path/to/model.bim")
+
+        # Deploy without refresh
+        dr.deploy("./my_model.bim", refresh="ignore")
     """
     print("=" * 70)
     print("Semantic Model Deployment (DirectLake)")
@@ -471,14 +590,14 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)

     if dataset_exists:
-        print(f"\n✓ Dataset exists - refreshing...")
+        print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")

         if wait_seconds > 0:
             print(f" Waiting {wait_seconds} seconds...")
             time.sleep(wait_seconds)

-        print("\n[Step 6/6] Refreshing semantic model...")
-        refresh_dataset(dataset_name, workspace_id, client)
+        print("\n[Step 3/3] Refreshing existing semantic model...")
+        refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)

         print("\n" + "=" * 70)
         print("🎉 Refresh Completed!")
@@ -510,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam

     # Step 6: Refresh using the dataset ID returned from creation
     print("\n[Step 6/6] Refreshing semantic model...")
-    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)

     print("\n" + "=" * 70)
     print("🎉 Deployment Completed!")
@@ -537,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     return 0


-def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
     """
     Copy a semantic model from one workspace to another.

@@ -550,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         destination: Destination in format "workspace/lakehouse.lakehouse/schema"
         new_model_name: Name for the new semantic model (default: same as source)
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely

     Returns:
         1 for success, 0 for failure
@@ -562,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
                    new_model_name="Production Model - Copy")

+        # Copy without refresh
+        copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
+
         # Using the connect pattern
         import duckrun
         duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -688,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         schema_name=schema,
         dataset_name=new_model_name,
         bim_url_or_path=temp_bim_path,
-        wait_seconds=wait_seconds
+        wait_seconds=wait_seconds,
+        refresh=refresh
     )

     # Clean up temp file
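The refresh argument threads from `deploy_semantic_model` and `copy_model` down to `refresh_dataset`: "full" runs a clearValues pass followed by a full reframe, while "ignore" skips refreshing. A hedged sketch; the workspace and lakehouse names are placeholders:

```python
from duckrun.semantic_model import deploy_semantic_model, copy_model

# Deploy a local BIM file without triggering any refresh afterwards.
deploy_semantic_model(
    workspace_name_or_id="My Workspace",
    lakehouse_name_or_id="My Lakehouse",
    schema_name="dbo",
    dataset_name="dbo",
    bim_url_or_path="./my_model.bim",
    refresh="ignore",
)

# Copy a model and let the default two-step (clearValues + full) refresh run.
copy_model("Source WS", "Production Model",
           "Target WS/Data Lake.lakehouse/analytics", refresh="full")
```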
duckrun/stats.py CHANGED
@@ -60,16 +60,61 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []


-def get_stats(duckrun_instance, source: str):
+def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+    """Match tables across all schemas using a wildcard pattern.
+    Pattern can be:
+    - '*.summary' - matches 'summary' table in all schemas
+    - '*summary' - matches any table ending with 'summary'
+    - 'schema.*' - matches all tables in 'schema'
+    Returns a dict mapping schema names to lists of matching table names."""
+    import fnmatch
+
+    try:
+        # Query all schemas and tables in one go
+        query = """
+            SELECT table_schema, table_name
+            FROM information_schema.tables
+            WHERE table_schema NOT LIKE 'pg_%'
+              AND table_schema != 'information_schema'
+              AND table_name NOT LIKE 'tbl_%'
+        """
+        result = duckrun_instance.con.execute(query).fetchall()
+
+        matched = {}
+
+        # Check if pattern contains a dot (schema.table pattern)
+        if '.' in pattern:
+            schema_pattern, table_pattern = pattern.split('.', 1)
+            for schema, table in result:
+                if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        else:
+            # Pattern matches only table names
+            for schema, table in result:
+                if fnmatch.fnmatch(table, pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+
+        return matched
+    except:
+        return {}
+
+
+def get_stats(duckrun_instance, source: str = None):
     """
     Get comprehensive statistics for Delta Lake tables.

     Args:
         duckrun_instance: The Duckrun connection instance
-        source: Can be one of:
+        source: Optional. Can be one of:
+            - None: Use all tables in the connection's schema (default)
             - Table name: 'table_name' (uses main schema in DuckDB)
             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
             - Schema only: 'schema' (all tables in schema, if multi-schema)
+            - Wildcard pattern: '*.summary' (matches tables across all schemas)

     Returns:
         Arrow table with statistics including total rows, file count, row groups,
@@ -78,6 +123,9 @@ def get_stats(duckrun_instance, source: str):
     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")

+        # All tables in the connection's schema
+        stats = con.get_stats()
+
         # Single table in main schema (DuckDB uses 'main', not 'test')
         stats = con.get_stats('price_today')

@@ -86,6 +134,9 @@ def get_stats(duckrun_instance, source: str):

         # All tables in a schema (only if multi-schema enabled)
         stats = con.get_stats('aemo')
+
+        # Wildcard pattern across all schemas (only if multi-schema enabled)
+        stats = con.get_stats('*.summary')
     """
     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

@@ -93,8 +144,31 @@ def get_stats(duckrun_instance, source: str):
     duckdb_schema = "main"
     url_schema = duckrun_instance.schema  # This is from the connection URL path

+    # If source is not provided, default to all tables in the connection's schema
+    if source is None:
+        source = url_schema
+
+    # Check if source contains wildcard characters
+    if '*' in source or '?' in source:
+        # Wildcard pattern mode - only valid if multi-schema is enabled
+        if not duckrun_instance.scan_all_schemas:
+            raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+
+        matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+
+        if not matched_tables:
+            raise ValueError(f"No tables found matching pattern '{source}'")
+
+        # Flatten the matched tables into a list with schema info
+        tables_with_schemas = []
+        for schema, tables in matched_tables.items():
+            for table in tables:
+                tables_with_schemas.append((schema, table))
+
+        print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
+
     # Parse the source and validate existence
-    if '.' in source:
+    elif '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
         schema_name, table_name = source.split('.', 1)

@@ -105,46 +179,45 @@ def get_stats(duckrun_instance, source: str):
         if not _table_exists(duckrun_instance, schema_name, table_name):
             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")

-        list_tables = [table_name]
+        tables_with_schemas = [(schema_name, table_name)]
     else:
         # Could be just table name or schema name
         if duckrun_instance.scan_all_schemas:
             # Multi-schema mode: DuckDB has actual schemas
             # First check if it's a table in main schema
             if _table_exists(duckrun_instance, duckdb_schema, source):
-                list_tables = [source]
-                schema_name = duckdb_schema
+                tables_with_schemas = [(duckdb_schema, source)]
             # Otherwise, check if it's a schema name
             elif _schema_exists(duckrun_instance, source):
                 schema_name = source
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                 if not list_tables:
                     raise ValueError(f"Schema '{source}' exists but contains no tables")
+                tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
         else:
             # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
             if _table_exists(duckrun_instance, duckdb_schema, source):
                 # It's a table name
-                list_tables = [source]
-                schema_name = url_schema  # Use URL schema for file path construction
+                tables_with_schemas = [(url_schema, source)]
             elif source == url_schema:
                 # Special case: user asked for stats on the URL schema name - list all tables
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
-                schema_name = url_schema  # Use URL schema for file path construction
                 if not list_tables:
                     raise ValueError(f"No tables found in schema '{url_schema}'")
+                tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")

     # Use the existing connection
     con = duckrun_instance.con

-    print(f"Processing {len(list_tables)} tables: {list_tables}")
+    print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")

     successful_tables = []
-    for idx, tbl in enumerate(list_tables):
-        print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
+    for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+        print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
         # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
         table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"

@@ -171,8 +244,18 @@ def get_stats(duckrun_instance, source: str):
                 print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
                 xx = {}

-        # Check if VORDER exists
-        vorder = 'tags.VORDER' in xx.keys()
+        # Check if VORDER exists - handle both formats:
+        # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+        # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+        vorder = False
+        if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+            vorder = True
+        elif 'tags' in xx.keys() and xx['tags']:
+            # Check nested tags dictionary (tags is a list of dicts, one per file)
+            for tag_dict in xx['tags']:
+                if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+                    vorder = True
+                    break

         # Calculate total size
         total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
@@ -187,6 +270,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -202,6 +286,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -237,6 +322,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -264,6 +350,7 @@ def get_stats(duckrun_instance, source: str):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -291,7 +378,7 @@ def get_stats(duckrun_instance, source: str):
         # No tables were processed successfully - return empty dataframe
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
-        return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
+        return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
                                      'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])

     # Union all successfully processed temp tables
@@ -301,6 +388,7 @@ def get_stats(duckrun_instance, source: str):
     # Generate final summary
     final_result = con.execute(f'''
         SELECT
+            schema,
             tbl,
             SUM(num_rows) as total_rows,
             COUNT(*) as num_files,
@@ -312,7 +400,7 @@ def get_stats(duckrun_instance, source: str):
             ANY_VALUE(timestamp) as timestamp
         FROM ({union_query})
         WHERE tbl IS NOT NULL
-        GROUP BY tbl
+        GROUP BY schema, tbl
         ORDER BY total_rows DESC
     ''').df()

duckrun/writer.py CHANGED
@@ -3,6 +3,20 @@ Delta Lake writer functionality for duckrun - Spark-style write API
 """
 from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version

+# Try to import WriterProperties for Rust engine (available in 0.18.2+)
+try:
+    from deltalake.writer import WriterProperties
+    _HAS_WRITER_PROPERTIES = True
+except ImportError:
+    _HAS_WRITER_PROPERTIES = False
+
+# Try to import PyArrow dataset for old PyArrow engine
+try:
+    import pyarrow.dataset as ds
+    _HAS_PYARROW_DATASET = True
+except ImportError:
+    _HAS_PYARROW_DATASET = False
+

 # Row Group configuration for optimal Delta Lake performance
 RG = 8_000_000
@@ -23,12 +37,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
     - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
     - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
     - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
+    - COMPRESSION: Defaults to ZSTD via writer_properties (rust) or file_options (pyarrow)

     deltalake 0.20+:
     - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
     - Does NOT have max_rows_per_file (row group optimization removed)
     - When mergeSchema=True: must set schema_mode='merge'
     - When mergeSchema=False: just write normally (no special params)
+    - COMPRESSION: Defaults to ZSTD via writer_properties (rust only)

     Uses version detection for simpler logic.
     """
@@ -50,7 +66,13 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
             # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
             # Do NOT use row group params (they conflict with rust engine)
             args['engine'] = 'rust'
-            # For version 0.20+: just schema_mode='merge' is enough, rust is default
+            # Set ZSTD compression for Rust engine
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')
+        else:
+            # Version 0.20+: rust is default, just add compression
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')
     else:
         # Normal write mode (no schema merging)
         if _IS_OLD_DELTALAKE:
@@ -59,7 +81,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
             args['max_rows_per_file'] = RG
             args['max_rows_per_group'] = RG
             args['min_rows_per_group'] = RG
-            # For version 0.20+: no optimization available (rust by default, no row group params supported)
+            # Set ZSTD compression for PyArrow engine
+            if _HAS_PYARROW_DATASET:
+                args['file_options'] = ds.ParquetFileFormat().make_write_options(compression='ZSTD')
+        else:
+            # Version 0.20+: no optimization available (rust by default, no row group params supported)
+            # Set ZSTD compression for Rust engine
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')

     return args

@@ -135,14 +164,14 @@ class DeltaWriter:
         # Prepare info message based on version and settings
         if self._schema_mode == 'merge':
             if _IS_OLD_DELTALAKE:
-                engine_info = " (engine=rust, schema_mode=merge)"
+                engine_info = " (engine=rust, schema_mode=merge, compression=ZSTD)"
             else:
-                engine_info = " (schema_mode=merge, rust by default)"
+                engine_info = " (schema_mode=merge, rust by default, compression=ZSTD)"
         else:
             if _IS_OLD_DELTALAKE:
-                engine_info = " (engine=pyarrow, optimized row groups)"
+                engine_info = " (engine=pyarrow, optimized row groups, compression=ZSTD)"
             else:
-                engine_info = " (engine=rust by default, compression=ZSTD)" is now emitted instead of " (engine=rust by default)"
+                engine_info = " (engine=rust by default, compression=ZSTD)"

         partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
{duckrun-0.2.16.dev0.dist-info → duckrun-0.2.18.dist-info}/METADATA RENAMED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.16.dev0
-Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
+Version: 0.2.18
+Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
{duckrun-0.2.16.dev0.dist-info → duckrun-0.2.18.dist-info}/RECORD RENAMED
@@ -0,0 +1,15 @@
+duckrun/__init__.py,sha256=-DPOb_ETaBC0M7YqXj482FE1aZ-SxJeSeY6KB6hPgWU,350
+duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
+duckrun/core.py,sha256=uMc93xIr1Rjw6fV2_j5ArJI3G6VDqqPgEtj2SZMuWqc,68618
+duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/notebook.py,sha256=lzDRBoWZ_lePF-_5BbA1_42BImLZC5yrq6nzlmlKglM,12183
+duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
+duckrun/semantic_model.py,sha256=shRPBN1II60K_PH8JOqke-_3hAwLspcx4Add0VJRwwU,35913
+duckrun/stats.py,sha256=cNQyvUMvxMZ5k_JYWWO7GenxrfautIhE6rKB4CzrwbI,19336
+duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
+duckrun-0.2.18.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.18.dist-info/METADATA,sha256=F1aEfQGlGwrahk3NDQ1gnOV8puJdYm50DVTRNnJsfxE,20802
+duckrun-0.2.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.18.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.18.dist-info/RECORD,,
@@ -1,14 +0,0 @@
-duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
-duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
-duckrun/core.py,sha256=c98sASAWlq0DDIR9gYbj5ZaKOa6MoO8Z09qhRhG4JWI,67097
-duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
-duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
-duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
-duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
-duckrun/stats.py,sha256=EqrCN1xwGo5nZgwezBvb6RepXT6b8H7xgK0yJJGFLfE,15155
-duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
-duckrun-0.2.16.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.16.dev0.dist-info/METADATA,sha256=CembRLJLoYfx6NS_kmtRDVuORH-E32EYPrq7kQ2yHmY,20771
-duckrun-0.2.16.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.16.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.16.dev0.dist-info/RECORD,,