duckrun 0.2.19.dev0__tar.gz → 0.2.19.dev1__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev0
3
+ Version: 0.2.19.dev1
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -290,25 +290,16 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
290
290
  else:
291
291
  # Get parquet metadata and create temp table with compression info
292
292
  if detailed == True:
293
- # Detailed mode: Include row group level statistics
293
+ # Detailed mode: Include ALL parquet_metadata columns
294
294
  con.execute(f'''
295
295
  CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
296
296
  SELECT
297
297
  '{schema_name}' as schema,
298
298
  '{tbl}' as tbl,
299
- pm.file_name,
300
- pm.row_group_id,
301
- pm.row_group_num_rows,
302
- pm.row_group_num_columns,
303
- pm.row_group_bytes,
304
299
  {vorder} as vorder,
305
- pm.compression,
306
- pm.total_compressed_size,
307
- pm.total_uncompressed_size,
308
- ROUND(pm.total_compressed_size::DOUBLE / NULLIF(pm.total_uncompressed_size, 0), 4) as compression_ratio,
300
+ pm.*,
309
301
  '{timestamp}' as timestamp
310
302
  FROM parquet_metadata({delta}) pm
311
- WHERE pm.column_id = 0 -- Only include first column to avoid duplication per column
312
303
  ''')
313
304
  else:
314
305
  # Aggregated mode: Original summary statistics
@@ -377,25 +368,16 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
377
368
 
378
369
  # Use parquet_file_metadata to get actual parquet stats with compression
379
370
  if detailed == True:
380
- # Detailed mode: Include row group level statistics
371
+ # Detailed mode: Include ALL parquet_metadata columns
381
372
  con.execute(f'''
382
373
  CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
383
374
  SELECT
384
375
  '{schema_name}' as schema,
385
376
  '{tbl}' as tbl,
386
- pm.file_name,
387
- pm.row_group_id,
388
- pm.row_group_num_rows,
389
- pm.row_group_num_columns,
390
- pm.row_group_bytes,
391
377
  false as vorder,
392
- pm.compression,
393
- pm.total_compressed_size,
394
- pm.total_uncompressed_size,
395
- ROUND(pm.total_compressed_size::DOUBLE / NULLIF(pm.total_uncompressed_size, 0), 4) as compression_ratio,
378
+ pm.*,
396
379
  '{timestamp}' as timestamp
397
380
  FROM parquet_metadata({filenames}) pm
398
- WHERE pm.column_id = 0 -- Only include first column to avoid duplication per column
399
381
  ''')
400
382
  else:
401
383
  # Aggregated mode: Original summary statistics
@@ -431,9 +413,7 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
431
413
  print("⚠️ No tables could be processed successfully")
432
414
  import pandas as pd
433
415
  if detailed == True:
434
- return pd.DataFrame(columns=['schema', 'tbl', 'file_name', 'row_group_id', 'row_group_num_rows',
435
- 'row_group_num_columns', 'row_group_bytes', 'vorder', 'compression',
436
- 'total_compressed_size', 'total_uncompressed_size', 'compression_ratio', 'timestamp'])
416
+ return pd.DataFrame(columns=['schema', 'tbl', 'vorder', 'timestamp'])
437
417
  else:
438
418
  return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
439
419
  'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
@@ -444,25 +424,12 @@ def get_stats(duckrun_instance, source: str = None, detailed = False):
444
424
 
445
425
  # Generate final summary based on detailed flag
446
426
  if detailed == True:
447
- # Detailed mode: Return row group level data without aggregation
427
+ # Detailed mode: Return ALL parquet_metadata columns
448
428
  final_result = con.execute(f'''
449
- SELECT
450
- schema,
451
- tbl,
452
- file_name,
453
- row_group_id,
454
- row_group_num_rows,
455
- row_group_num_columns,
456
- row_group_bytes,
457
- vorder,
458
- compression,
459
- total_compressed_size,
460
- total_uncompressed_size,
461
- compression_ratio,
462
- timestamp
429
+ SELECT *
463
430
  FROM ({union_query})
464
431
  WHERE tbl IS NOT NULL
465
- ORDER BY schema, tbl, file_name, row_group_id
432
+ ORDER BY schema, tbl, file_name, row_group_id, column_id
466
433
  ''').df()
467
434
  else:
468
435
  # Aggregated mode: Original summary statistics
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev0
3
+ Version: 0.2.19.dev1
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev0"
7
+ version = "0.2.19.dev1"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
File without changes
File without changes
File without changes