duckrun 0.2.9.dev0__py3-none-any.whl → 0.2.9.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/auth.py CHANGED
@@ -123,6 +123,12 @@ def get_fabric_api_token() -> Optional[str]:
     Returns:
         Fabric API token string or None if authentication fails
     """
+    # Check if we already have a cached Fabric API token
+    fabric_token_env = os.environ.get("FABRIC_API_TOKEN")
+    if fabric_token_env:
+        print("✅ Using cached Fabric API token")
+        return fabric_token_env
+
     print("🔐 Getting Fabric API token...")
 
     # Try Fabric notebook environment first
@@ -130,6 +136,7 @@ def get_fabric_api_token() -> Optional[str]:
         import notebookutils  # type: ignore
         print("📓 Microsoft Fabric notebook detected - using notebookutils")
         token = notebookutils.credentials.getToken("pbi")
+        os.environ["FABRIC_API_TOKEN"] = token
         print("✅ Fabric API token obtained!")
         return token
     except ImportError:
@@ -158,6 +165,7 @@ def get_fabric_api_token() -> Optional[str]:
         print("🔐 Trying Azure CLI for Fabric API...")
         credential = AzureCliCredential()
         token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+        os.environ["FABRIC_API_TOKEN"] = token_obj.token
         print("✅ Fabric API token obtained via Azure CLI!")
         return token_obj.token
     except Exception as cli_error:
@@ -167,6 +175,7 @@ def get_fabric_api_token() -> Optional[str]:
         credential = InteractiveBrowserCredential()
 
         token_obj = credential.get_token("https://api.fabric.microsoft.com/.default")
+        os.environ["FABRIC_API_TOKEN"] = token_obj.token
        print("✅ Fabric API token obtained!")
        return token_obj.token
 
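A minimal usage sketch of the new token caching (not part of the diff itself): after any successful credential flow, get_fabric_api_token() stores the token in the FABRIC_API_TOKEN environment variable, so later calls in the same process return the cached value without re-authenticating. Forcing a fresh token by clearing the variable is an assumption based on the check order above.

    import os
    from duckrun.auth import get_fabric_api_token

    # First call: runs one of the credential flows (notebookutils, Azure CLI,
    # or interactive browser) and caches the token in the environment.
    token = get_fabric_api_token()

    # The cached value is what later calls short-circuit on.
    assert os.environ.get("FABRIC_API_TOKEN") == token

    # Second call: prints "✅ Using cached Fabric API token" and returns immediately.
    token_again = get_fabric_api_token()

    # To force a fresh token (e.g., after expiry), drop the cached value first.
    os.environ.pop("FABRIC_API_TOKEN", None)
    token_fresh = get_fabric_api_token()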
duckrun/core.py CHANGED
@@ -696,6 +696,45 @@ class Duckrun:
             print(f"❌ Error creating lakehouse '{lakehouse_name}': {e}")
             return False
 
+    def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
+               wait_seconds: int = 5) -> int:
+        """
+        Deploy a semantic model from a BIM file using DirectLake mode.
+
+        Args:
+            bim_url: URL to the BIM file (e.g., GitHub raw URL)
+            dataset_name: Name for the semantic model (default: lakehouse_schema)
+            wait_seconds: Seconds to wait for permission propagation (default: 5)
+
+        Returns:
+            1 for success, 0 for failure
+
+        Examples:
+            dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+
+            # Deploy with auto-generated name
+            dr.deploy("https://raw.githubusercontent.com/.../model.bim")
+
+            # Deploy with custom name
+            dr.deploy("https://raw.githubusercontent.com/.../model.bim",
+                      dataset_name="Sales Model")
+        """
+        from .semantic_model import deploy_semantic_model
+
+        # Auto-generate dataset name if not provided
+        if dataset_name is None:
+            dataset_name = f"{self.lakehouse_name}_{self.schema}"
+
+        # Call the deployment function (DirectLake only)
+        return deploy_semantic_model(
+            workspace_name=self.workspace,
+            lakehouse_name=self.lakehouse_name,
+            schema_name=self.schema,
+            dataset_name=dataset_name,
+            bim_url=bim_url,
+            wait_seconds=wait_seconds
+        )
+
     def _get_workspace_id_by_name(self, token: str, workspace_name: str) -> Optional[str]:
         """Helper method to get workspace ID from name"""
         try:
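A short sketch of the new Duckrun.deploy() API, following the docstring above; the workspace, lakehouse, and dataset names are placeholders, and the top-level import assumes Duckrun is re-exported by the package's __init__.

    from duckrun import Duckrun  # assumed re-export; adjust to duckrun.core if needed

    # Connect to a lakehouse schema (placeholder names)
    dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    # Deploy with an auto-generated dataset name: "<lakehouse>_<schema>"
    ok = dr.deploy("https://raw.githubusercontent.com/.../model.bim")

    # Deploy under an explicit name with a longer permission-propagation wait
    ok = dr.deploy(
        "https://raw.githubusercontent.com/.../model.bim",
        dataset_name="Sales Model",
        wait_seconds=15,
    )

    # deploy() returns 1 on success and 0 on failure
    if not ok:
        raise RuntimeError("Semantic model deployment failed")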
duckrun/semantic_model.py ADDED
@@ -0,0 +1,402 @@
+"""
+Semantic Model Deployer - DirectLake mode for Fabric Lakehouses
+Uses duckrun's authentication. Works anywhere duckrun works.
+"""
+
+import requests
+import json
+import time
+import base64
+
+
+class FabricRestClient:
+    """Fabric REST API client using duckrun's authentication."""
+
+    def __init__(self):
+        self.base_url = "https://api.fabric.microsoft.com"
+        self.token = None
+        self._get_token()
+
+    def _get_token(self):
+        """Get Fabric API token using duckrun's auth module"""
+        from duckrun.auth import get_fabric_api_token
+        self.token = get_fabric_api_token()
+        if not self.token:
+            raise Exception("Failed to get Fabric API token")
+
+    def _get_headers(self):
+        return {
+            "Authorization": f"Bearer {self.token}",
+            "Content-Type": "application/json"
+        }
+
+    def get(self, endpoint: str):
+        url = f"{self.base_url}{endpoint}"
+        response = requests.get(url, headers=self._get_headers())
+        response.raise_for_status()
+        return response
+
+    def post(self, endpoint: str, json: dict = None):
+        url = f"{self.base_url}{endpoint}"
+        response = requests.post(url, headers=self._get_headers(), json=json)
+        response.raise_for_status()
+        return response
+
+
+def get_workspace_id(workspace_name_or_id, client):
+    """Get workspace ID by name or validate if already a GUID"""
+    import re
+
+    # Check if input is already a GUID
+    guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+    if guid_pattern.match(workspace_name_or_id):
+        # It's already a GUID, verify it exists
+        try:
+            response = client.get(f"/v1/workspaces/{workspace_name_or_id}")
+            workspace_name = response.json().get('displayName', workspace_name_or_id)
+            print(f"✓ Found workspace: {workspace_name}")
+            return workspace_name_or_id
+        except:
+            raise ValueError(f"Workspace with ID '{workspace_name_or_id}' not found")
+
+    # It's a name, search for it
+    response = client.get("/v1/workspaces")
+    workspaces = response.json().get('value', [])
+
+    workspace_match = next((ws for ws in workspaces if ws.get('displayName') == workspace_name_or_id), None)
+    if not workspace_match:
+        raise ValueError(f"Workspace '{workspace_name_or_id}' not found")
+
+    workspace_id = workspace_match['id']
+    print(f"✓ Found workspace: {workspace_name_or_id}")
+    return workspace_id
+
+
+def get_lakehouse_id(lakehouse_name_or_id, workspace_id, client):
+    """Get lakehouse ID by name or validate if already a GUID"""
+    import re
+
+    # Check if input is already a GUID
+    guid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.IGNORECASE)
+    if guid_pattern.match(lakehouse_name_or_id):
+        # It's already a GUID, verify it exists
+        try:
+            response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
+            items = response.json().get('value', [])
+            lakehouse_match = next((item for item in items if item.get('id') == lakehouse_name_or_id), None)
+            if lakehouse_match:
+                lakehouse_name = lakehouse_match.get('displayName', lakehouse_name_or_id)
+                print(f"✓ Found lakehouse: {lakehouse_name}")
+                return lakehouse_name_or_id
+            else:
+                raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found")
+        except Exception as e:
+            raise ValueError(f"Lakehouse with ID '{lakehouse_name_or_id}' not found: {e}")
+
+    # It's a name, search for it
+    response = client.get(f"/v1/workspaces/{workspace_id}/lakehouses")
+    items = response.json().get('value', [])
+
+    lakehouse_match = next((item for item in items if item.get('displayName') == lakehouse_name_or_id), None)
+    if not lakehouse_match:
+        raise ValueError(f"Lakehouse '{lakehouse_name_or_id}' not found")
+
+    lakehouse_id = lakehouse_match['id']
+    print(f"✓ Found lakehouse: {lakehouse_name_or_id}")
+    return lakehouse_id
+
+
+def get_dataset_id(dataset_name, workspace_id, client):
+    """Get dataset ID by name"""
+    response = client.get(f"/v1/workspaces/{workspace_id}/semanticModels")
+    items = response.json().get('value', [])
+
+    dataset_match = next((item for item in items if item.get('displayName') == dataset_name), None)
+    if not dataset_match:
+        raise ValueError(f"Dataset '{dataset_name}' not found")
+
+    return dataset_match['id']
+
+
+def check_dataset_exists(dataset_name, workspace_id, client):
+    """Check if dataset already exists"""
+    try:
+        get_dataset_id(dataset_name, workspace_id, client)
+        print(f"⚠️ Dataset '{dataset_name}' already exists")
+        return True
+    except:
+        print(f"✓ Dataset name '{dataset_name}' is available")
+        return False
+
+
+def refresh_dataset(dataset_name, workspace_id, client):
+    """Refresh a dataset and monitor progress"""
+    dataset_id = get_dataset_id(dataset_name, workspace_id, client)
+
+    payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
+
+    response = client.post(
+        f"/v1/workspaces/{workspace_id}/semanticModels/{dataset_id}/refreshes",
+        json=payload
+    )
+
+    if response.status_code in [200, 202]:
+        print(f"✓ Refresh initiated")
+
+        refresh_id = response.json().get('id')
+        if refresh_id:
+            print("  Monitoring refresh progress...")
+            max_attempts = 60
+            for attempt in range(max_attempts):
+                time.sleep(5)
+
+                status_response = client.get(
+                    f"/v1/workspaces/{workspace_id}/semanticModels/{dataset_id}/refreshes/{refresh_id}"
+                )
+                status = status_response.json().get('status')
+
+                if status == 'Completed':
+                    print(f"✓ Refresh completed successfully")
+                    return
+                elif status == 'Failed':
+                    error = status_response.json().get('error', {})
+                    raise Exception(f"Refresh failed: {error.get('message', 'Unknown error')}")
+                elif status == 'Cancelled':
+                    raise Exception("Refresh was cancelled")
+
+                if attempt % 6 == 0:
+                    print(f"  Status: {status}...")
+
+            raise Exception(f"Refresh timed out")
+
+
+def download_bim_from_github(url):
+    """Download BIM file from URL"""
+    print(f"Downloading BIM file...")
+    response = requests.get(url)
+    response.raise_for_status()
+    bim_content = response.json()
+    print(f"✓ BIM file downloaded")
+    print(f"  - Tables: {len(bim_content.get('model', {}).get('tables', []))}")
+    print(f"  - Relationships: {len(bim_content.get('model', {}).get('relationships', []))}")
+    return bim_content
+
+
+def update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name):
+    """Update BIM file for DirectLake mode"""
+
+    new_url = f"https://onelake.dfs.fabric.microsoft.com/{workspace_id}/{lakehouse_id}"
+    expression_name = None
+
+    # Update or create DirectLake expression
+    if 'model' in bim_content and 'expressions' in bim_content['model']:
+        for expr in bim_content['model']['expressions']:
+            if 'DirectLake' in expr['name'] or expr.get('kind') == 'm':
+                expression_name = expr['name']
+                expr['expression'] = [
+                    "let",
+                    f"    Source = AzureStorage.DataLake(\"{new_url}\", [HierarchicalNavigation=true])",
+                    "in",
+                    "    Source"
+                ]
+                break
+
+    if not expression_name:
+        expression_name = f"DirectLake - {schema_name}"
+        if 'expressions' not in bim_content['model']:
+            bim_content['model']['expressions'] = []
+
+        bim_content['model']['expressions'].append({
+            "name": expression_name,
+            "kind": "m",
+            "expression": [
+                "let",
+                f"    Source = AzureStorage.DataLake(\"{new_url}\", [HierarchicalNavigation=true])",
+                "in",
+                "    Source"
+            ],
+            "lineageTag": f"directlake-{schema_name}-source"
+        })
+
+    # Update table partitions for DirectLake
+    if 'tables' in bim_content['model']:
+        for table in bim_content['model']['tables']:
+            if 'partitions' in table:
+                for partition in table['partitions']:
+                    if 'source' in partition:
+                        partition['mode'] = 'directLake'
+                        partition['source'] = {
+                            "type": "entity",
+                            "entityName": partition['source'].get('entityName', table['name']),
+                            "expressionSource": expression_name,
+                            "schemaName": schema_name
+                        }
+
+    print(f"✓ Updated BIM for DirectLake")
+    print(f"  - OneLake URL: {new_url}")
+    print(f"  - Schema: {schema_name}")
+
+    return bim_content
+
+
+def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
+    """Create semantic model from BIM using Fabric REST API"""
+    # Convert to base64
+    bim_json = json.dumps(bim_content, indent=2)
+    bim_base64 = base64.b64encode(bim_json.encode('utf-8')).decode('utf-8')
+
+    pbism_content = {"version": "1.0"}
+    pbism_json = json.dumps(pbism_content)
+    pbism_base64 = base64.b64encode(pbism_json.encode('utf-8')).decode('utf-8')
+
+    payload = {
+        "displayName": dataset_name,
+        "definition": {
+            "parts": [
+                {
+                    "path": "model.bim",
+                    "payload": bim_base64,
+                    "payloadType": "InlineBase64"
+                },
+                {
+                    "path": "definition.pbism",
+                    "payload": pbism_base64,
+                    "payloadType": "InlineBase64"
+                }
+            ]
+        }
+    }
+
+    response = client.post(
+        f"/v1/workspaces/{workspace_id}/semanticModels",
+        json=payload
+    )
+
+    print(f"✓ Semantic model created")
+
+    # Handle long-running operation
+    if response.status_code == 202:
+        operation_id = response.headers.get('x-ms-operation-id')
+        print(f"  Waiting for operation to complete...")
+
+        max_attempts = 30
+        for attempt in range(max_attempts):
+            time.sleep(2)
+            status_response = client.get(f"/v1/operations/{operation_id}")
+            status = status_response.json().get('status')
+
+            if status == 'Succeeded':
+                print(f"✓ Operation completed")
+                break
+            elif status == 'Failed':
+                error = status_response.json().get('error', {})
+                raise Exception(f"Operation failed: {error.get('message')}")
+            elif attempt == max_attempts - 1:
+                raise Exception(f"Operation timed out")
+
+
+def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
+                          bim_url, wait_seconds=5):
+    """
+    Deploy a semantic model using DirectLake mode.
+
+    Args:
+        workspace_name_or_id: Name or GUID of the target workspace
+        lakehouse_name_or_id: Name or GUID of the lakehouse
+        schema_name: Schema name (e.g., 'dbo', 'staging')
+        dataset_name: Name for the semantic model
+        bim_url: URL to the BIM file
+        wait_seconds: Seconds to wait before refresh (default: 5)
+
+    Returns:
+        1 for success, 0 for failure
+
+    Examples:
+        dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
+        dr.deploy("https://raw.githubusercontent.com/.../model.bim")
+    """
+    print("=" * 70)
+    print("Semantic Model Deployment (DirectLake)")
+    print("=" * 70)
+
+    client = FabricRestClient()
+
+    try:
+        # Step 1: Get workspace ID
+        print("\n[Step 1/6] Getting workspace information...")
+        workspace_id = get_workspace_id(workspace_name_or_id, client)
+
+        # Step 2: Check if dataset exists
+        print(f"\n[Step 2/6] Checking if dataset '{dataset_name}' exists...")
+        dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
+
+        if dataset_exists:
+            print(f"\n✓ Dataset exists - refreshing...")
+
+            if wait_seconds > 0:
+                print(f"  Waiting {wait_seconds} seconds...")
+                time.sleep(wait_seconds)
+
+            print("\n[Step 6/6] Refreshing semantic model...")
+            refresh_dataset(dataset_name, workspace_id, client)
+
+            print("\n" + "=" * 70)
+            print("🎉 Refresh Completed!")
+            print("=" * 70)
+            print(f"Dataset: {dataset_name}")
+            print("=" * 70)
+            return 1
+
+        # Step 3: Get lakehouse ID
+        print(f"\n[Step 3/6] Finding lakehouse...")
+        lakehouse_id = get_lakehouse_id(lakehouse_name_or_id, workspace_id, client)
+
+        # Step 4: Download and update BIM
+        print("\n[Step 4/6] Downloading and configuring BIM file...")
+        bim_content = download_bim_from_github(bim_url)
+
+        modified_bim = update_bim_for_directlake(bim_content, workspace_id, lakehouse_id, schema_name)
+        modified_bim['name'] = dataset_name
+        modified_bim['id'] = dataset_name
+
+        # Step 5: Deploy
+        print("\n[Step 5/6] Deploying semantic model...")
+        create_dataset_from_bim(dataset_name, modified_bim, workspace_id, client)
+
+        if wait_seconds > 0:
+            print(f"  Waiting {wait_seconds} seconds for permissions...")
+            time.sleep(wait_seconds)
+
+        # Step 6: Refresh
+        print("\n[Step 6/6] Refreshing semantic model...")
+        refresh_dataset(dataset_name, workspace_id, client)
+
+        print("\n" + "=" * 70)
+        print("🎉 Deployment Completed!")
+        print("=" * 70)
+        print(f"Dataset: {dataset_name}")
+        print(f"Workspace: {workspace_name_or_id}")
+        print(f"Lakehouse: {lakehouse_name_or_id}")
+        print(f"Schema: {schema_name}")
+        print("=" * 70)
+
+        return 1
+
+    except Exception as e:
+        print("\n" + "=" * 70)
+        print("❌ Deployment Failed")
+        print("=" * 70)
+        print(f"Error: {str(e)}")
+        print("\n💡 Troubleshooting:")
+        print(f"  - Verify workspace '{workspace_name_or_id}' exists")
+        print(f"  - Verify lakehouse '{lakehouse_name_or_id}' exists")
+        print(f"  - Ensure tables exist in '{schema_name}' schema")
+        print(f"  - Check tables are in Delta format")
+        print("=" * 70)
+        return 0
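The module can also be called directly, bypassing the Duckrun wrapper. A minimal sketch, passing arguments positionally to match the signature above; the workspace, lakehouse, dataset names and the BIM URL are placeholders.

    from duckrun.semantic_model import deploy_semantic_model

    # Deploys the model, or just refreshes it if a dataset with this name
    # already exists in the workspace. Workspace and lakehouse may be given
    # by display name or GUID.
    result = deploy_semantic_model(
        "My Workspace",        # workspace_name_or_id
        "My Lakehouse",        # lakehouse_name_or_id
        "dbo",                 # schema_name
        "Sales Model",         # dataset_name
        "https://raw.githubusercontent.com/.../model.bim",  # bim_url
        wait_seconds=5,
    )

    if result == 1:
        print("Deployed (or refreshed) successfully")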
duckrun/stats.py CHANGED
@@ -147,7 +147,20 @@ def get_stats(duckrun_instance, source: str):
 
     try:
         dt = DeltaTable(table_path)
-        xx = dt.get_add_actions(flatten=True).to_pydict()
+        add_actions = dt.get_add_actions(flatten=True)
+
+        # Convert to dict - compatible with both old and new deltalake versions
+        # Try to_pydict() first (old versions), fall back to to_pylist() (new versions)
+        try:
+            xx = add_actions.to_pydict()
+        except AttributeError:
+            # New version with arro3: use to_pylist() and convert to dict of lists
+            records = add_actions.to_pylist()
+            if records:
+                # Convert list of dicts to dict of lists
+                xx = {key: [record[key] for record in records] for key in records[0].keys()}
+            else:
+                xx = {}
 
         # Check if VORDER exists
         vorder = 'tags.VORDER' in xx.keys()
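The fallback branch above turns the row-oriented output of to_pylist() back into the column-oriented dict that the rest of get_stats expects. A self-contained illustration of that conversion, with made-up sample records:

    # What to_pylist() returns: a list of row dicts (sample values only).
    records = [
        {"path": "part-0.parquet", "size_bytes": 1024, "num_records": 10},
        {"path": "part-1.parquet", "size_bytes": 2048, "num_records": 20},
    ]

    # What to_pydict() used to return: a dict of column lists.
    columns = (
        {key: [record[key] for record in records] for key in records[0].keys()}
        if records
        else {}
    )

    print(columns["size_bytes"])   # [1024, 2048]
    print(columns["num_records"])  # [10, 20]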
duckrun/writer.py CHANGED
@@ -1,18 +1,36 @@
 """
 Delta Lake writer functionality for duckrun - Spark-style write API
 """
-from deltalake import DeltaTable, write_deltalake
+from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version
 
 
 # Row Group configuration for optimal Delta Lake performance
 RG = 8_000_000
 
+# Check deltalake version once at module load
+# Version 0.18.x and 0.19.x support engine parameter and row group optimization
+# Version 0.20+ removed these features (rust only, no row groups)
+_DELTALAKE_VERSION = tuple(map(int, deltalake_version.split('.')[:2]))
+_IS_OLD_DELTALAKE = _DELTALAKE_VERSION < (0, 20)
+
 
 def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
     """
-    Build arguments for write_deltalake based on requirements:
-    - If schema_mode='merge': use rust engine (no row group params)
-    - Otherwise: use pyarrow engine with row group optimization (if supported)
+    Build arguments for write_deltalake based on requirements and version:
+
+    deltalake 0.18.2 - 0.19.x:
+      - Has 'engine' parameter (defaults to 'pyarrow')
+      - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
+      - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
+      - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
+
+    deltalake 0.20+:
+      - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
+      - Does NOT have max_rows_per_file (row group optimization removed)
+      - When mergeSchema=True: must set schema_mode='merge'
+      - When mergeSchema=False: just write normally (no special params)
+
+    Uses version detection for simpler logic.
     """
     args = {
         'table_or_uri': path,
@@ -24,23 +42,24 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
     if partition_by:
         args['partition_by'] = partition_by
 
-    # Engine selection based on schema_mode
     if schema_mode == 'merge':
-        # Use rust engine for schema merging (no row group params supported)
+        # Schema merging mode - must explicitly set schema_mode='merge'
         args['schema_mode'] = 'merge'
-        args['engine'] = 'rust'
-    else:
-        # Try to use pyarrow engine with row group optimization
-        # Check if row group parameters are supported by inspecting function signature
-        import inspect
-        sig = inspect.signature(write_deltalake)
 
-        if 'max_rows_per_file' in sig.parameters:
-            # Older deltalake version - use row group optimization
+        if _IS_OLD_DELTALAKE:
+            # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
+            # Do NOT use row group params (they conflict with rust engine)
+            args['engine'] = 'rust'
+        # For version 0.20+: just schema_mode='merge' is enough, rust is default
+    else:
+        # Normal write mode (no schema merging)
+        if _IS_OLD_DELTALAKE:
+            # deltalake 0.18.2-0.19.x: use row group optimization
+            # DON'T set engine parameter - pyarrow is the default and works with row groups
             args['max_rows_per_file'] = RG
             args['max_rows_per_group'] = RG
             args['min_rows_per_group'] = RG
-        # For newer versions, just use default parameters
+        # For version 0.20+: no optimization available (rust by default, no row group params supported)
 
     return args
 
@@ -113,7 +132,18 @@ class DeltaWriter:
             partition_by=self._partition_by
         )
 
-        engine_info = f" (engine=rust, schema_mode=merge)" if self._schema_mode == 'merge' else " (engine=pyarrow)"
+        # Prepare info message based on version and settings
+        if self._schema_mode == 'merge':
+            if _IS_OLD_DELTALAKE:
+                engine_info = " (engine=rust, schema_mode=merge)"
+            else:
+                engine_info = " (schema_mode=merge, rust by default)"
+        else:
+            if _IS_OLD_DELTALAKE:
+                engine_info = " (engine=pyarrow, optimized row groups)"
+            else:
+                engine_info = " (engine=rust by default)"
+
         partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
 
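A small standalone sketch (not part of the package) that mirrors the version gate above, handy for checking which write path a given environment will take; it relies only on deltalake exposing __version__, as the import in this diff already does.

    from deltalake import __version__ as deltalake_version

    # Same detection as writer.py: compare the (major, minor) tuple against (0, 20).
    version = tuple(map(int, deltalake_version.split('.')[:2]))
    is_old = version < (0, 20)

    if is_old:
        print(f"deltalake {deltalake_version}: pyarrow writes with row-group tuning; "
              f"schema merge adds engine='rust'")
    else:
        print(f"deltalake {deltalake_version}: rust-only writes; "
              f"schema merge only needs schema_mode='merge'")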
duckrun-0.2.9.dev0.dist-info/METADATA → duckrun-0.2.9.dev2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.9.dev0
+Version: 0.2.9.dev2
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
duckrun-0.2.9.dev2.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
+duckrun/auth.py,sha256=qPaLQ7InlV9leA9r6E6VEeYavFFoBi0zSN8m_l1aoQs,9545
+duckrun/core.py,sha256=ulKRnxTH8MfGtcKAAORBs-_vd-_jIyWmwv9Bims0TsQ,39267
+duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
+duckrun/semantic_model.py,sha256=y_E1VlpqSx9DHOGi--4ZccaODErthzty5CVN4TI-mQ0,15509
+duckrun/stats.py,sha256=CXfb2DWF3PgOckelJooU0y-BAsNT9NFDfDYEmo0mUQQ,10473
+duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
+duckrun-0.2.9.dev2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.9.dev2.dist-info/METADATA,sha256=4vDjuMN2L2Uiv6CLIP7UyaKss6-jjNE7ZUxJC5SLnT8,19277
+duckrun-0.2.9.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.9.dev2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.9.dev2.dist-info/RECORD,,
duckrun-0.2.9.dev0.dist-info/RECORD DELETED
@@ -1,13 +0,0 @@
-duckrun/__init__.py,sha256=XA85pL2vK1AkmBic8e7WxeqNvcd6SjFX4zsQpImDO6E,230
-duckrun/auth.py,sha256=qColLkvmk8S_qRAXLMGh_TgVeSPkv0j15dv55wgrX1o,9139
-duckrun/core.py,sha256=bEt_Zi8R0ByKyyWllZpQKnV3D1fHZiR7lwwVrDrINQA,37720
-duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
-duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
-duckrun/runner.py,sha256=yrDxfy1RVkb8iK9GKGmIFZHzCvcO_0GVQlbng7Vw_iM,14171
-duckrun/stats.py,sha256=2FTqoQNVjD84-H1HjStHxZkOpAGKXS79M55B00pOlok,9804
-duckrun/writer.py,sha256=3UwuoH4yjcomBaTbRXOSjlA82jRhhjErkOWDCX7K7mw,6595
-duckrun-0.2.9.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.9.dev0.dist-info/METADATA,sha256=_7fH-927WrJ5u0O4jmZHo3vwjlh3PEILc3mrtma5yH0,19277
-duckrun-0.2.9.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.9.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.9.dev0.dist-info/RECORD,,