duckrun 0.2.15__py3-none-any.whl → 0.2.16.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckrun might be problematic.
- duckrun/stats.py +20 -11
- duckrun/writer.py +35 -6
- {duckrun-0.2.15.dist-info → duckrun-0.2.16.dev1.dist-info}/METADATA +1 -1
- duckrun-0.2.16.dev1.dist-info/RECORD +14 -0
- duckrun-0.2.15.dist-info/RECORD +0 -14
- {duckrun-0.2.15.dist-info → duckrun-0.2.16.dev1.dist-info}/WHEEL +0 -0
- {duckrun-0.2.15.dist-info → duckrun-0.2.16.dev1.dist-info}/licenses/LICENSE +0 -0
- {duckrun-0.2.15.dist-info → duckrun-0.2.16.dev1.dist-info}/top_level.txt +0 -0
duckrun/stats.py
CHANGED

@@ -193,22 +193,26 @@ def get_stats(duckrun_instance, source: str):
                     0 as num_row_groups,
                     0 as size,
                     {vorder} as vorder,
+                    '' as compression,
                     '{timestamp}' as timestamp
                 WHERE false
             ''')
         else:
-            # Get parquet metadata and create temp table
+            # Get parquet metadata and create temp table with compression info
            con.execute(f'''
                CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                SELECT
                    '{tbl}' as tbl,
-                    file_name,
-                    num_rows,
-                    num_row_groups,
+                    fm.file_name,
+                    fm.num_rows,
+                    fm.num_row_groups,
                    CEIL({total_size}/(1024*1024)) as size,
                    {vorder} as vorder,
+                    COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
                    '{timestamp}' as timestamp
-                FROM parquet_file_metadata({delta})
+                FROM parquet_file_metadata({delta}) fm
+                LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+                GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
            ''')

        except Exception as e:
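For context on this hunk: DuckDB's parquet_file_metadata() returns one row per file (num_rows, num_row_groups, ...), while parquet_metadata() returns one row per row group and column, including the compression codec of each column chunk. The new LEFT JOIN plus GROUP BY/STRING_AGG therefore collapses the per-chunk codecs back into a single comma-separated label per file. A minimal standalone sketch of the same pattern, assuming an illustrative local file named data.parquet:

import duckdb

con = duckdb.connect()
# parquet_file_metadata: one row per file; parquet_metadata: one row per
# row group x column, carrying the 'compression' codec of each column chunk.
rows = con.execute("""
    SELECT
        fm.file_name,
        fm.num_rows,
        fm.num_row_groups,
        COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression),
                 'UNCOMPRESSED') AS compression
    FROM parquet_file_metadata('data.parquet') fm
    LEFT JOIN parquet_metadata('data.parquet') pm ON fm.file_name = pm.file_name
    GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
""").fetchall()
print(rows)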
@@ -239,6 +243,7 @@ def get_stats(duckrun_instance, source: str):
                     0 as num_row_groups,
                     0 as size,
                     false as vorder,
+                    '' as compression,
                     '{timestamp}' as timestamp
                 WHERE false
             ''')

@@ -255,18 +260,21 @@ def get_stats(duckrun_instance, source: str):
                     filename = full_path
                 filenames.append(table_path + "/" + filename)

-            # Use parquet_file_metadata to get actual parquet stats
+            # Use parquet_file_metadata to get actual parquet stats with compression
            con.execute(f'''
                CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                SELECT
                    '{tbl}' as tbl,
-                    file_name,
-                    num_rows,
-                    num_row_groups,
+                    fm.file_name,
+                    fm.num_rows,
+                    fm.num_row_groups,
                    0 as size,
                    false as vorder,
+                    COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
                    '{timestamp}' as timestamp
-                FROM parquet_file_metadata({filenames})
+                FROM parquet_file_metadata({filenames}) fm
+                LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+                GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
            ''')

            print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")

@@ -284,7 +292,7 @@ def get_stats(duckrun_instance, source: str):
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
         return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
-                                     'average_row_group', 'file_size_MB', 'vorder', 'timestamp'])
+                                     'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])

     # Union all successfully processed temp tables
     union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]

@@ -300,6 +308,7 @@ def get_stats(duckrun_instance, source: str):
             CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
             MIN(size) as file_size_MB,
             ANY_VALUE(vorder) as vorder,
+            STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
             ANY_VALUE(timestamp) as timestamp
         FROM ({union_query})
         WHERE tbl IS NOT NULL
duckrun/writer.py
CHANGED

@@ -3,6 +3,20 @@ Delta Lake writer functionality for duckrun - Spark-style write API
 """
 from deltalake import DeltaTable, write_deltalake, __version__ as deltalake_version

+# Try to import WriterProperties for Rust engine (available in 0.18.2+)
+try:
+    from deltalake.writer import WriterProperties
+    _HAS_WRITER_PROPERTIES = True
+except ImportError:
+    _HAS_WRITER_PROPERTIES = False
+
+# Try to import PyArrow dataset for old PyArrow engine
+try:
+    import pyarrow.dataset as ds
+    _HAS_PYARROW_DATASET = True
+except ImportError:
+    _HAS_PYARROW_DATASET = False
+

 # Row Group configuration for optimal Delta Lake performance
 RG = 8_000_000
@@ -23,12 +37,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
     - Has max_rows_per_file/max_rows_per_group/min_rows_per_group for optimization
     - When mergeSchema=True: must set schema_mode='merge' + engine='rust', NO row group params
     - When mergeSchema=False: use row group params, DON'T set engine (pyarrow is default)
+    - COMPRESSION: Defaults to ZSTD via writer_properties (rust) or file_options (pyarrow)

     deltalake 0.20+:
     - Does NOT have 'engine' parameter (everything is rust, pyarrow deprecated)
     - Does NOT have max_rows_per_file (row group optimization removed)
     - When mergeSchema=True: must set schema_mode='merge'
     - When mergeSchema=False: just write normally (no special params)
+    - COMPRESSION: Defaults to ZSTD via writer_properties (rust only)

     Uses version detection for simpler logic.
     """
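The docstring's version matrix, restated as a runnable toy (a sketch only: _is_old, sketch_build_args, and the version strings are illustrative, not duckrun's code, and the compression handling is omitted here because the hunks below add it):

RG = 8_000_000  # duckrun's row-group target, from the module above

def _is_old(version: str) -> bool:
    # Crude major.minor check, good enough for this sketch
    major, minor = (int(x) for x in version.split('.')[:2])
    return (major, minor) < (0, 20)

def sketch_build_args(deltalake_version: str, merge_schema: bool) -> dict:
    args = {}
    if merge_schema:
        args['schema_mode'] = 'merge'
        if _is_old(deltalake_version):
            args['engine'] = 'rust'  # 0.18.2-0.19.x needs rust for schema merging
    elif _is_old(deltalake_version):
        # pyarrow is the default engine on old versions, so row-group knobs apply
        args.update(max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
    return args

print(sketch_build_args('0.19.2', True))    # {'schema_mode': 'merge', 'engine': 'rust'}
print(sketch_build_args('0.19.2', False))   # row-group params only
print(sketch_build_args('0.20.1', False))   # {} (rust by default, no special params)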
@@ -50,7 +66,13 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
             # deltalake 0.18.2-0.19.x: must also set engine='rust' for schema merging
             # Do NOT use row group params (they conflict with rust engine)
             args['engine'] = 'rust'
-
+            # Set ZSTD compression for Rust engine
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')
+        else:
+            # Version 0.20+: rust is default, just add compression
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')
     else:
         # Normal write mode (no schema merging)
         if _IS_OLD_DELTALAKE:
@@ -59,7 +81,14 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
             args['max_rows_per_file'] = RG
             args['max_rows_per_group'] = RG
             args['min_rows_per_group'] = RG
-
+            # Set ZSTD compression for PyArrow engine
+            if _HAS_PYARROW_DATASET:
+                args['file_options'] = ds.ParquetFileFormat().make_write_options(compression='ZSTD')
+        else:
+            # Version 0.20+: no optimization available (rust by default, no row group params supported)
+            # Set ZSTD compression for Rust engine
+            if _HAS_WRITER_PROPERTIES:
+                args['writer_properties'] = WriterProperties(compression='ZSTD')

     return args

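For the write path itself, a minimal standalone sketch of what the rust-engine branch amounts to, assuming deltalake 0.20+ (where rust is the default engine); the DataFrame and table path are illustrative, not duckrun's API:

import pandas as pd
from deltalake import write_deltalake
from deltalake.writer import WriterProperties  # available in deltalake 0.18.2+

df = pd.DataFrame({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})

# Rust writer: request ZSTD-compressed parquet files, as the diff does
props = WriterProperties(compression='ZSTD')

# '/tmp/demo_table' is an illustrative local path
write_deltalake('/tmp/demo_table', df, mode='overwrite', writer_properties=props)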
@@ -135,14 +164,14 @@ class DeltaWriter:
         # Prepare info message based on version and settings
         if self._schema_mode == 'merge':
             if _IS_OLD_DELTALAKE:
-                engine_info = " (engine=rust, schema_mode=merge)"
+                engine_info = " (engine=rust, schema_mode=merge, compression=ZSTD)"
             else:
-                engine_info = " (schema_mode=merge, rust by default)"
+                engine_info = " (schema_mode=merge, rust by default, compression=ZSTD)"
         else:
             if _IS_OLD_DELTALAKE:
-                engine_info = " (engine=pyarrow, optimized row groups)"
+                engine_info = " (engine=pyarrow, optimized row groups, compression=ZSTD)"
             else:
-                engine_info = " (engine=rust by default)"
+                engine_info = " (engine=rust by default, compression=ZSTD)"

         partition_info = f" partitioned by {self._partition_by}" if self._partition_by else ""
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode}){engine_info}{partition_info}")
duckrun-0.2.16.dev1.dist-info/RECORD
ADDED

@@ -0,0 +1,14 @@
+duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
+duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
+duckrun/core.py,sha256=c98sASAWlq0DDIR9gYbj5ZaKOa6MoO8Z09qhRhG4JWI,67097
+duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
+duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
+duckrun/stats.py,sha256=EqrCN1xwGo5nZgwezBvb6RepXT6b8H7xgK0yJJGFLfE,15155
+duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
+duckrun-0.2.16.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.2.16.dev1.dist-info/METADATA,sha256=RmwiOuQMHpS94rUjY0Sveu8Yzwbwp8ekZfta0d8tNAg,20771
+duckrun-0.2.16.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.2.16.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.2.16.dev1.dist-info/RECORD,,
duckrun-0.2.15.dist-info/RECORD
DELETED

@@ -1,14 +0,0 @@
-duckrun/__init__.py,sha256=oPQXpJEgHpX_KgMrx_TWax9awIbr2B9z32cFuuG_p30,236
-duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
-duckrun/core.py,sha256=c98sASAWlq0DDIR9gYbj5ZaKOa6MoO8Z09qhRhG4JWI,67097
-duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
-duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
-duckrun/runner.py,sha256=JnRJoQ_Db__iXlhjTohplXR83NUJxItgyaa7AzrDxwE,14833
-duckrun/semantic_model.py,sha256=obzlN2-dbEW3JmDop-vrZGGGLi9u3ThhTbgtDjou7uY,29509
-duckrun/stats.py,sha256=xqgtW_HHAizom6E13_UjitNgmz6pzK10XdosPWJO1Ew,14282
-duckrun/writer.py,sha256=svUuPCYOhrz299NgnpTKhARKjfej0PxnoND2iPDSypk,8098
-duckrun-0.2.15.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.2.15.dist-info/METADATA,sha256=xExTRo--bAjK6Ioq7O6F_641ZkVGgHj3_d-jHO9tadE,20766
-duckrun-0.2.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.2.15.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.2.15.dist-info/RECORD,,
{duckrun-0.2.15.dist-info → duckrun-0.2.16.dev1.dist-info}/WHEEL
File without changes

{duckrun-0.2.15.dist-info → duckrun-0.2.16.dev1.dist-info}/licenses/LICENSE
File without changes

{duckrun-0.2.15.dist-info → duckrun-0.2.16.dev1.dist-info}/top_level.txt
File without changes