duckrun 0.2.15.tar.gz → 0.2.16.dev1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckrun might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.15
+ Version: 0.2.16.dev1
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
@@ -193,22 +193,26 @@ def get_stats(duckrun_instance, source: str):
              0 as num_row_groups,
              0 as size,
              {vorder} as vorder,
+             '' as compression,
              '{timestamp}' as timestamp
          WHERE false
      ''')
  else:
-     # Get parquet metadata and create temp table
+     # Get parquet metadata and create temp table with compression info
      con.execute(f'''
          CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
          SELECT
              '{tbl}' as tbl,
-             file_name,
-             num_rows,
-             num_row_groups,
+             fm.file_name,
+             fm.num_rows,
+             fm.num_row_groups,
              CEIL({total_size}/(1024*1024)) as size,
              {vorder} as vorder,
+             COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
              '{timestamp}' as timestamp
-         FROM parquet_file_metadata({delta})
+         FROM parquet_file_metadata({delta}) fm
+         LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+         GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
      ''')

  except Exception as e:
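For reference, the new query can be reproduced standalone in DuckDB: parquet_file_metadata() returns one row per file (file_name, num_rows, num_row_groups), while parquet_metadata() returns one row per column chunk including its compression codec, so the LEFT JOIN plus STRING_AGG(DISTINCT ...) collapses the codecs used in each file into a single column. A minimal sketch, assuming a hypothetical ./stats_demo/*.parquet glob (not part of duckrun):

import duckdb

# Hypothetical glob; point it at any folder of parquet files to try the query.
files = "./stats_demo/*.parquet"

con = duckdb.connect()
stats = con.execute(f"""
    SELECT
        fm.file_name,
        fm.num_rows,
        fm.num_row_groups,
        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression),
                 'UNCOMPRESSED') AS compression
    FROM parquet_file_metadata('{files}') fm
    LEFT JOIN parquet_metadata('{files}') pm ON fm.file_name = pm.file_name
    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
""").df()
print(stats)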
@@ -239,6 +243,7 @@ def get_stats(duckrun_instance, source: str):
          0 as num_row_groups,
          0 as size,
          false as vorder,
+         '' as compression,
          '{timestamp}' as timestamp
      WHERE false
  ''')
@@ -255,18 +260,21 @@ def get_stats(duckrun_instance, source: str):
          filename = full_path
      filenames.append(table_path + "/" + filename)

-     # Use parquet_file_metadata to get actual parquet stats
+     # Use parquet_file_metadata to get actual parquet stats with compression
      con.execute(f'''
          CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
          SELECT
              '{tbl}' as tbl,
-             file_name,
-             num_rows,
-             num_row_groups,
+             fm.file_name,
+             fm.num_rows,
+             fm.num_row_groups,
              0 as size,
              false as vorder,
+             COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
              '{timestamp}' as timestamp
-         FROM parquet_file_metadata({filenames})
+         FROM parquet_file_metadata({filenames}) fm
+         LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+         GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
      ''')

      print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
@@ -284,7 +292,7 @@ def get_stats(duckrun_instance, source: str):
      print("⚠️ No tables could be processed successfully")
      import pandas as pd
      return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
-                                  'average_row_group', 'file_size_MB', 'vorder', 'timestamp'])
+                                  'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])

  # Union all successfully processed temp tables
  union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
@@ -300,6 +308,7 @@ def get_stats(duckrun_instance, source: str):
          CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
          MIN(size) as file_size_MB,
          ANY_VALUE(vorder) as vorder,
+         STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
          ANY_VALUE(timestamp) as timestamp
      FROM ({union_query})
      WHERE tbl IS NOT NULL
@@ -3,6 +3,20 @@ Delta Lake writer functionality for duckrun - Spark-style write API
  """
  from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version

+ # Try to import WriterProperties for Rust engine (available in 0.18.2+)
+ try:
+     from deltalake.writer import WriterProperties
+     _HAS_WRITER_PROPERTIES = True
+ except ImportError:
+     _HAS_WRITER_PROPERTIES = False
+
+ # Try to import PyArrow dataset for old PyArrow engine
+ try:
+     import pyarrow.dataset as ds
+     _HAS_PYARROW_DATASET = True
+ except ImportError:
+     _HAS_PYARROW_DATASET = False
+

  # Row Group configuration for optimal Delta Lake performance
  RG = 8_000_000
@@ -23,12 +37,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
      - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
      - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
      - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
+     - COMPRESSION: Defaults to ZSTD via writer_properties (rust) or file_options (pyarrow)

      deltalake 0.20+:
      - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
      - Does NOT have max_rows_per_file (row group optimization removed)
      - When mergeSchema=True: must set schema_mode='merge'
      - When mergeSchema=False: just write normally (no special params)
+     - COMPRESSION: Defaults to ZSTD via writer_properties (rust only)

      Uses version detection for simpler logic.
      """
@@ -50,7 +66,13 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
          # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
          # Do NOT use row group params (they conflict with rust engine)
          args['engine'] = 'rust'
-     # For version 0.20+: just schema_mode='merge' is enough, rust is default
+         # Set ZSTD compression for Rust engine
+         if _HAS_WRITER_PROPERTIES:
+             args['writer_properties'] = WriterProperties(compression='ZSTD')
+     else:
+         # Version 0.20+: rust is default, just add compression
+         if _HAS_WRITER_PROPERTIES:
+             args['writer_properties'] = WriterProperties(compression='ZSTD')
  else:
      # Normal write mode (no schema merging)
      if _IS_OLD_DELTALAKE:
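Taken on its own, the Rust-engine branch amounts to passing WriterProperties(compression='ZSTD') to write_deltalake. A minimal sketch, assuming deltalake 0.20+ (where the Rust writer is the default) and a hypothetical local path and DataFrame:

import pandas as pd
from deltalake import write_deltalake
from deltalake.writer import WriterProperties  # import path as used in the hunk above

# Hypothetical sample data and target path.
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

write_deltalake(
    "/tmp/zstd_demo_table",
    df,
    mode="overwrite",
    writer_properties=WriterProperties(compression="ZSTD"),  # ZSTD-compressed parquet files
)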
@@ -59,7 +81,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
          args['max_rows_per_file'] = RG
          args['max_rows_per_group'] = RG
          args['min_rows_per_group'] = RG
-     # For version 0.20+: no optimization available (rust by default, no row group params supported)
+         # Set ZSTD compression for PyArrow engine
+         if _HAS_PYARROW_DATASET:
+             args['file_options'] = ds.ParquetFileFormat().make_write_options(compression='ZSTD')
+     else:
+         # Version 0.20+: no optimization available (rust by default, no row group params supported)
+         # Set ZSTD compression for Rust engine
+         if _HAS_WRITER_PROPERTIES:
+             args['writer_properties'] = WriterProperties(compression='ZSTD')

  return args

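The PyArrow-engine branch instead routes the codec through pyarrow's dataset write options. A rough standalone equivalent, assuming deltalake 0.18.x/0.19.x (where the PyArrow writer and its row-group parameters still exist) and a hypothetical path and DataFrame:

import pandas as pd
import pyarrow.dataset as ds
from deltalake import write_deltalake

# Hypothetical sample data and target path.
df = pd.DataFrame({"id": range(5), "value": list("abcde")})

write_deltalake(
    "/tmp/zstd_demo_pyarrow",
    df,
    mode="append",
    max_rows_per_file=8_000_000,    # RG constant from the module above
    max_rows_per_group=8_000_000,
    min_rows_per_group=8_000_000,
    file_options=ds.ParquetFileFormat().make_write_options(compression="ZSTD"),
)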
@@ -135,14 +164,14 @@ class DeltaWriter:
  # Prepare info message based on version and settings
  if self._schema_mode == 'merge':
      if _IS_OLD_DELTALAKE:
-         engine_info = " (engine=rust, schema_mode=merge)"
+         engine_info = " (engine=rust, schema_mode=merge, compression=ZSTD)"
      else:
-         engine_info = " (schema_mode=merge, rust by default)"
+         engine_info = " (schema_mode=merge, rust by default, compression=ZSTD)"
  else:
      if _IS_OLD_DELTALAKE:
-         engine_info = " (engine=pyarrow, optimized row groups)"
+         engine_info = " (engine=pyarrow, optimized row groups, compression=ZSTD)"
      else:
-         engine_info = " (engine=rust by default)"
+         engine_info = " (engine=rust by default, compression=ZSTD)"

  partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
  print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.15
+ Version: 0.2.16.dev1
  Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
  Author: mim
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "duckrun"
- version = "0.2.15"
+ version = "0.2.16.dev1"
  description = "Lakehouse task runner powered by DuckDB for Microsoft Fabric"
  readme = "README.md"
  license = {text = "MIT"}
6 files without changes