duckrun 0.2.14.dev40__tar.gz → 0.2.16.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of duckrun might be problematic. Click here for more details.
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/PKG-INFO +1 -1
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/stats.py +20 -11
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/pyproject.toml +1 -1
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/LICENSE +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/README.md +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/__init__.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/auth.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/core.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/files.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/runner.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun/writer.py +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.14.dev40 → duckrun-0.2.16.dev0}/setup.cfg +0 -0
|
@@ -193,22 +193,26 @@ def get_stats(duckrun_instance, source: str):
 0 as num_row_groups,
 0 as size,
 {vorder} as vorder,
+'' as compression,
 '{timestamp}' as timestamp
 WHERE false
 ''')
 else:
-# Get parquet metadata and create temp table
+# Get parquet metadata and create temp table with compression info
 con.execute(f'''
 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
 SELECT
 '{tbl}' as tbl,
-file_name,
-num_rows,
-num_row_groups,
+fm.file_name,
+fm.num_rows,
+fm.num_row_groups,
 CEIL({total_size}/(1024*1024)) as size,
 {vorder} as vorder,
+COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
 '{timestamp}' as timestamp
-FROM parquet_file_metadata({delta})
+FROM parquet_file_metadata({delta}) fm
+LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
 ''')

 except Exception as e:
@@ -239,6 +243,7 @@ def get_stats(duckrun_instance, source: str):
 0 as num_row_groups,
 0 as size,
 false as vorder,
+'' as compression,
 '{timestamp}' as timestamp
 WHERE false
 ''')
@@ -255,18 +260,21 @@ def get_stats(duckrun_instance, source: str):
 filename = full_path
 filenames.append(table_path + "/" + filename)

-# Use parquet_file_metadata to get actual parquet stats
+# Use parquet_file_metadata to get actual parquet stats with compression
 con.execute(f'''
 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
 SELECT
 '{tbl}' as tbl,
-file_name,
-num_rows,
-num_row_groups,
+fm.file_name,
+fm.num_rows,
+fm.num_row_groups,
 0 as size,
 false as vorder,
+COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
 '{timestamp}' as timestamp
-FROM parquet_file_metadata({filenames})
+FROM parquet_file_metadata({filenames}) fm
+LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
 ''')

 print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
@@ -284,7 +292,7 @@ def get_stats(duckrun_instance, source: str):
 print("⚠️ No tables could be processed successfully")
 import pandas as pd
 return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
-'average_row_group', 'file_size_MB', 'vorder', 'timestamp'])
+'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])

 # Union all successfully processed temp tables
 union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
@@ -300,6 +308,7 @@ def get_stats(duckrun_instance, source: str):
 CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
 MIN(size) as file_size_MB,
 ANY_VALUE(vorder) as vorder,
+STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
 ANY_VALUE(timestamp) as timestamp
 FROM ({union_query})
 WHERE tbl IS NOT NULL
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|