duckrun 0.2.19.dev5__tar.gz → 0.2.19.dev7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/core.py +2 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/rle.py +83 -3
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/pyproject.toml +1 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/tests/test_rle.py +2 -2
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/LICENSE +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/README.md +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev7}/setup.cfg +0 -0
|
@@ -1425,7 +1425,8 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1425
1425
|
max_cardinality_pct=max_cardinality_pct,
|
|
1426
1426
|
max_ordering_depth=max_ordering_depth,
|
|
1427
1427
|
schema_name=schema_name,
|
|
1428
|
-
table_display_name=tbl
|
|
1428
|
+
table_display_name=tbl,
|
|
1429
|
+
duckrun_instance=self # Pass duckrun instance for detailed parquet stats
|
|
1429
1430
|
)
|
|
1430
1431
|
|
|
1431
1432
|
def close(self):
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
from typing import List, Dict, Tuple, Optional
|
|
3
3
|
import pandas as pd
|
|
4
|
+
from .stats import get_stats
|
|
4
5
|
|
|
5
6
|
def analyze_parquet_row_groups(con, parquet_path: str) -> pd.DataFrame:
|
|
6
7
|
"""
|
|
@@ -392,7 +393,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
392
393
|
max_cardinality_pct: float = 0.01,
|
|
393
394
|
max_ordering_depth: int = 3,
|
|
394
395
|
schema_name: str = None,
|
|
395
|
-
table_display_name: str = None
|
|
396
|
+
table_display_name: str = None,
|
|
397
|
+
duckrun_instance = None) -> pd.DataFrame:
|
|
396
398
|
"""
|
|
397
399
|
Test column orderings for RLE optimization.
|
|
398
400
|
|
|
@@ -412,6 +414,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
412
414
|
max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
|
|
413
415
|
schema_name: Optional schema name to include in results (default: None)
|
|
414
416
|
table_display_name: Optional table name to include in results (default: None)
|
|
417
|
+
duckrun_instance: Optional Duckrun instance to fetch detailed parquet stats (default: None)
|
|
415
418
|
|
|
416
419
|
Returns:
|
|
417
420
|
DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
|
|
@@ -472,6 +475,35 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
472
475
|
print(f"✓ Analysis complete!")
|
|
473
476
|
print(f"{'='*60}")
|
|
474
477
|
|
|
478
|
+
# Get detailed parquet stats if duckrun_instance is provided
|
|
479
|
+
parquet_stats = None
|
|
480
|
+
vorder_status = False
|
|
481
|
+
table_size_mb = None
|
|
482
|
+
if duckrun_instance and table_display_name:
|
|
483
|
+
print("\nFetching detailed parquet metadata...")
|
|
484
|
+
try:
|
|
485
|
+
# For single-schema connections, just use the table name
|
|
486
|
+
# For multi-schema connections, use schema.table format
|
|
487
|
+
if hasattr(duckrun_instance, 'scan_all_schemas') and duckrun_instance.scan_all_schemas and schema_name:
|
|
488
|
+
source_param = f"{schema_name}.{table_display_name}"
|
|
489
|
+
else:
|
|
490
|
+
source_param = table_display_name
|
|
491
|
+
|
|
492
|
+
parquet_stats = get_stats(duckrun_instance, source=source_param, detailed=True)
|
|
493
|
+
print(f"✓ Retrieved parquet metadata for {len(parquet_stats)} row groups/columns")
|
|
494
|
+
|
|
495
|
+
# Get vorder status from the stats if available
|
|
496
|
+
if 'vorder' in parquet_stats.columns:
|
|
497
|
+
vorder_status = parquet_stats['vorder'].iloc[0] if len(parquet_stats) > 0 else False
|
|
498
|
+
|
|
499
|
+
# Calculate total table size from compressed sizes
|
|
500
|
+
if 'total_compressed_size' in parquet_stats.columns:
|
|
501
|
+
total_bytes = parquet_stats['total_compressed_size'].sum()
|
|
502
|
+
table_size_mb = round(total_bytes / (1024 * 1024), 2) if total_bytes else None
|
|
503
|
+
except Exception as e:
|
|
504
|
+
print(f"⚠️ Could not fetch parquet stats: {e}")
|
|
505
|
+
parquet_stats = None
|
|
506
|
+
|
|
475
507
|
# Transform to long format
|
|
476
508
|
long_format_results = []
|
|
477
509
|
|
|
@@ -489,6 +521,45 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
489
521
|
# Get total rows and NDV from card_stats if available
|
|
490
522
|
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
491
523
|
|
|
524
|
+
# Aggregate parquet stats per column if available
|
|
525
|
+
parquet_by_column = {}
|
|
526
|
+
avg_row_group_size = None # Calculate once for the table
|
|
527
|
+
|
|
528
|
+
if parquet_stats is not None and not parquet_stats.empty:
|
|
529
|
+
# Calculate average row group size once (same for all columns)
|
|
530
|
+
if 'row_group_num_rows' in parquet_stats.columns:
|
|
531
|
+
# Get unique row groups to avoid counting duplicates (one entry per column per row group)
|
|
532
|
+
unique_rg_sizes = parquet_stats.drop_duplicates(subset=['row_group_id'])['row_group_num_rows']
|
|
533
|
+
avg_row_group_size = int(unique_rg_sizes.mean())
|
|
534
|
+
|
|
535
|
+
# Determine column name field - can be 'name' or 'path_in_schema'
|
|
536
|
+
col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
|
|
537
|
+
|
|
538
|
+
# Group by column name and aggregate
|
|
539
|
+
for col_name in data_columns:
|
|
540
|
+
col_stats = parquet_stats[parquet_stats[col_name_field] == col_name] if col_name_field in parquet_stats.columns else pd.DataFrame()
|
|
541
|
+
|
|
542
|
+
if not col_stats.empty:
|
|
543
|
+
# Aggregate stats across all row groups for this column
|
|
544
|
+
total_compressed_bytes = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
|
|
545
|
+
total_compressed_mb = round(total_compressed_bytes / (1024 * 1024), 2) if total_compressed_bytes else None
|
|
546
|
+
# Handle both 'encodings' (multiple) and 'encoding' (single) column names
|
|
547
|
+
if 'encodings' in col_stats.columns:
|
|
548
|
+
encodings = col_stats['encodings'].unique().tolist()
|
|
549
|
+
elif 'encoding' in col_stats.columns:
|
|
550
|
+
encodings = col_stats['encoding'].unique().tolist()
|
|
551
|
+
else:
|
|
552
|
+
encodings = []
|
|
553
|
+
compressions = col_stats['compression'].unique().tolist() if 'compression' in col_stats.columns else []
|
|
554
|
+
num_row_groups = col_stats['row_group_id'].nunique() if 'row_group_id' in col_stats.columns else len(col_stats)
|
|
555
|
+
|
|
556
|
+
parquet_by_column[col_name] = {
|
|
557
|
+
'encoding': ', '.join([str(e) for e in encodings if e is not None]),
|
|
558
|
+
'compression': ', '.join([str(c) for c in compressions if c is not None]),
|
|
559
|
+
'total_compressed_size_mb': total_compressed_mb,
|
|
560
|
+
'num_row_groups': num_row_groups
|
|
561
|
+
}
|
|
562
|
+
|
|
492
563
|
# Create one row per data column
|
|
493
564
|
for col in data_columns:
|
|
494
565
|
rle_value = row[col]
|
|
@@ -496,16 +567,25 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
496
567
|
# Get NDV from card_stats
|
|
497
568
|
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
498
569
|
|
|
570
|
+
# Get parquet stats for this column
|
|
571
|
+
col_parquet = parquet_by_column.get(col, {})
|
|
572
|
+
|
|
499
573
|
long_format_results.append({
|
|
500
574
|
'schema': schema_val,
|
|
501
575
|
'table': table_val,
|
|
502
576
|
'sort_type': sort_order,
|
|
503
|
-
'
|
|
577
|
+
'name': col,
|
|
504
578
|
'order': None,
|
|
505
579
|
'RLE': rle_value,
|
|
506
580
|
'NDV': ndv_value,
|
|
507
581
|
'total_rows': total_rows,
|
|
508
582
|
'total_RLE': total_rle_all_val,
|
|
583
|
+
'encoding': col_parquet.get('encoding', ''),
|
|
584
|
+
'compression': col_parquet.get('compression', ''),
|
|
585
|
+
'column_size_mb': col_parquet.get('total_compressed_size_mb', None),
|
|
586
|
+
'avg_row_group_size': avg_row_group_size,
|
|
587
|
+
'table_size_mb': table_size_mb,
|
|
588
|
+
'vorder': vorder_status,
|
|
509
589
|
'comments': ''
|
|
510
590
|
})
|
|
511
591
|
|
|
@@ -829,7 +909,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
829
909
|
'schema': schema_val,
|
|
830
910
|
'table': table_val,
|
|
831
911
|
'sort_type': sort_order,
|
|
832
|
-
'
|
|
912
|
+
'name': col,
|
|
833
913
|
'order': order_position,
|
|
834
914
|
'RLE': rle_value,
|
|
835
915
|
'NDV': ndv_value,
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.dev5"
|
|
7
|
+
version = "0.2.19.dev7"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
7
7
|
import duckrun
|
|
8
8
|
|
|
9
9
|
# Analyze multiple schemas/tables
|
|
10
|
-
conn = duckrun.connect("tmp/data.lakehouse/
|
|
10
|
+
conn = duckrun.connect("tmp/data.lakehouse/spark_vorder")
|
|
11
11
|
|
|
12
12
|
# Analyze tables - now returns long format automatically
|
|
13
|
-
result = conn.rle("
|
|
13
|
+
result = conn.rle("summary")
|
|
14
14
|
print(result)
|
|
15
15
|
conn.close()
|
|
16
16
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|