duckrun 0.2.16.dev0__py3-none-any.whl → 0.2.16.dev2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
duckrun/runner.py CHANGED
@@ -7,45 +7,7 @@ import importlib.util
7
7
  from typing import List, Tuple, Dict, Optional, Callable, Any
8
8
  from string import Template
9
9
  from deltalake import DeltaTable, write_deltalake
10
- # Row Group configuration for optimal Delta Lake performance
11
- RG = 8_000_000
12
-
13
-
14
- def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=None):
15
- """
16
- Build arguments for write_deltalake based on requirements:
17
- - If schema_mode='merge': use rust engine (no row group params)
18
- - Otherwise: use pyarrow engine with row group optimization (if supported)
19
- """
20
- args = {
21
- 'table_or_uri': path,
22
- 'data': df,
23
- 'mode': mode
24
- }
25
-
26
- # Add partition_by if specified
27
- if partition_by:
28
- args['partition_by'] = partition_by
29
-
30
- # Engine selection based on schema_mode
31
- if schema_mode == 'merge':
32
- # Use rust engine for schema merging (no row group params supported)
33
- args['schema_mode'] = 'merge'
34
- args['engine'] = 'rust'
35
- else:
36
- # Try to use pyarrow engine with row group optimization
37
- # Check if row group parameters are supported by inspecting function signature
38
- import inspect
39
- sig = inspect.signature(write_deltalake)
40
-
41
- if 'max_rows_per_file' in sig.parameters:
42
- # Older deltalake version - use row group optimization
43
- args['max_rows_per_file'] = RG
44
- args['max_rows_per_group'] = RG
45
- args['min_rows_per_group'] = RG
46
- # For newer versions, just use default parameters
47
-
48
- return args
10
+ from .writer import _build_write_deltalake_args
49
11
 
50
12
 
51
13
  def run(duckrun_instance, pipeline: List[Tuple]) -> bool:
duckrun/writer.py CHANGED
@@ -3,6 +3,20 @@ Delta Lake writer functionality for duckrun - Spark-style write API
3
3
  """
4
4
  from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version
5
5
 
6
+ # Try to import WriterProperties for Rust engine (available in 0.18.2+)
7
+ try:
8
+ from deltalake.writer import WriterProperties
9
+ _HAS_WRITER_PROPERTIES = True
10
+ except ImportError:
11
+ _HAS_WRITER_PROPERTIES = False
12
+
13
+ # Try to import PyArrow dataset for old PyArrow engine
14
+ try:
15
+ import pyarrow.dataset as ds
16
+ _HAS_PYARROW_DATASET = True
17
+ except ImportError:
18
+ _HAS_PYARROW_DATASET = False
19
+
6
20
 
7
21
  # Row Group configuration for optimal Delta Lake performance
8
22
  RG = 8_000_000
@@ -23,12 +37,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
23
37
  - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
24
38
  - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
25
39
  - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
40
+ - COMPRESSION: Defaults to ZSTD via writer_properties (rust) or file_options (pyarrow)
26
41
 
27
42
  deltalake 0.20+:
28
43
  - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
29
44
  - Does NOT have max_rows_per_file (row group optimization removed)
30
45
  - When mergeSchema=True: must set schema_mode='merge'
31
46
  - When mergeSchema=False: just write normally (no special params)
47
+ - COMPRESSION: Defaults to ZSTD via writer_properties (rust only)
32
48
 
33
49
  Uses version detection for simpler logic.
34
50
  """
@@ -50,7 +66,13 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
50
66
  # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
51
67
  # Do NOT use row group params (they conflict with rust engine)
52
68
  args['engine'] = 'rust'
53
- # For version 0.20+: just schema_mode='merge' is enough, rust is default
69
+ # Set ZSTD compression for Rust engine
70
+ if _HAS_WRITER_PROPERTIES:
71
+ args['writer_properties'] = WriterProperties(compression='ZSTD')
72
+ else:
73
+ # Version 0.20+: rust is default, just add compression
74
+ if _HAS_WRITER_PROPERTIES:
75
+ args['writer_properties'] = WriterProperties(compression='ZSTD')
54
76
  else:
55
77
  # Normal write mode (no schema merging)
56
78
  if _IS_OLD_DELTALAKE:
@@ -59,7 +81,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
59
81
  args['max_rows_per_file'] = RG
60
82
  args['max_rows_per_group'] = RG
61
83
  args['min_rows_per_group'] = RG
62
- # For version 0.20+: no optimization available (rust by default, no row group params supported)
84
+ # Set ZSTD compression for PyArrow engine
85
+ if _HAS_PYARROW_DATASET:
86
+ args['file_options'] = ds.ParquetFileFormat().make_write_options(compression='ZSTD')
87
+ else:
88
+ # Version 0.20+: no optimization available (rust by default, no row group params supported)
89
+ # Set ZSTD compression for Rust engine
90
+ if _HAS_WRITER_PROPERTIES:
91
+ args['writer_properties'] = WriterProperties(compression='ZSTD')
63
92
 
64
93
  return args
65
94
 
@@ -135,14 +164,14 @@ class DeltaWriter:
135
164
  # Prepare info message based on version and settings
136
165
  if self._schema_mode == 'merge':
137
166
  if _IS_OLD_DELTALAKE:
138
- engine_info = " (engine=rust, schema_mode=merge)"
167
+ engine_info = " (engine=rust, schema_mode=merge, compression=ZSTD)"
139
168
  else:
140
- engine_info = " (schema_mode=merge, rust by default)"
169
+ engine_info = " (schema_mode=merge, rust by default, compression=ZSTD)"
141
170
  else:
142
171
  if _IS_OLD_DELTALAKE:
143
- engine_info = " (engine=pyarrow, optimized row groups)"
172
+ engine_info = " (engine=pyarrow, optimized row groups, compression=ZSTD)"
144
173
  else:
145
- engine_info = " (engine=rust by default)"
174
+ engine_info = " (engine=rust by default, compression=ZSTD)"
146
175
 
147
176
  partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
148
177
  print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.16.dev0
3
+ Version: 0.2.16.dev2
4
4
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
5
5
  Author: mim
6
6
  License: MIT
@@ -3,12 +3,12 @@ duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
3
3
  duckrun/core.py,sha256=c98sASAWlq0DDIR9gYbj5ZaKOa6MoO8Z09qhRhG4JWI,67097
4
4
  duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
5
5
  duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
6
- duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
6
+ duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
7
7
  duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
8
8
  duckrun/stats.py,sha256=EqrCN1xwGo5nZgwezBvb6RepXT6b8H7xgK0yJJGFLfE,15155
9
- duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
10
- duckrun-0.2.16.dev0.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
11
- duckrun-0.2.16.dev0.dist-info/METADATA,sha256=CembRLJLoYfx6NS_kmtRDVuORH-E32EYPrq7kQ2yHmY,20771
12
- duckrun-0.2.16.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- duckrun-0.2.16.dev0.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
14
- duckrun-0.2.16.dev0.dist-info/RECORD,,
9
+ duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
10
+ duckrun-0.2.16.dev2.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
11
+ duckrun-0.2.16.dev2.dist-info/METADATA,sha256=zx6oKCdIOOZuIF4yhK2u8UjgNh16KG03m2dmXGMf90c,20771
12
+ duckrun-0.2.16.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ duckrun-0.2.16.dev2.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
14
+ duckrun-0.2.16.dev2.dist-info/RECORD,,