duckrun 0.2.19.dev5__tar.gz → 0.2.19.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/core.py +2 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/rle.py +75 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/pyproject.toml +1 -1
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/tests/test_rle.py +2 -2
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/LICENSE +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/README.md +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/setup.cfg +0 -0
|
@@ -1425,7 +1425,8 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1425
1425
|
max_cardinality_pct=max_cardinality_pct,
|
|
1426
1426
|
max_ordering_depth=max_ordering_depth,
|
|
1427
1427
|
schema_name=schema_name,
|
|
1428
|
-
table_display_name=tbl
|
|
1428
|
+
table_display_name=tbl,
|
|
1429
|
+
duckrun_instance=self # Pass duckrun instance for detailed parquet stats
|
|
1429
1430
|
)
|
|
1430
1431
|
|
|
1431
1432
|
def close(self):
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
from typing import List, Dict, Tuple, Optional
|
|
3
3
|
import pandas as pd
|
|
4
|
+
from .stats import get_stats
|
|
4
5
|
|
|
5
6
|
def analyze_parquet_row_groups(con, parquet_path: str) -> pd.DataFrame:
|
|
6
7
|
"""
|
|
@@ -392,7 +393,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
392
393
|
max_cardinality_pct: float = 0.01,
|
|
393
394
|
max_ordering_depth: int = 3,
|
|
394
395
|
schema_name: str = None,
|
|
395
|
-
table_display_name: str = None
|
|
396
|
+
table_display_name: str = None,
|
|
397
|
+
duckrun_instance = None) -> pd.DataFrame:
|
|
396
398
|
"""
|
|
397
399
|
Test column orderings for RLE optimization.
|
|
398
400
|
|
|
@@ -412,6 +414,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
412
414
|
max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
|
|
413
415
|
schema_name: Optional schema name to include in results (default: None)
|
|
414
416
|
table_display_name: Optional table name to include in results (default: None)
|
|
417
|
+
duckrun_instance: Optional Duckrun instance to fetch detailed parquet stats (default: None)
|
|
415
418
|
|
|
416
419
|
Returns:
|
|
417
420
|
DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
|
|
@@ -472,6 +475,35 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
472
475
|
print(f"✓ Analysis complete!")
|
|
473
476
|
print(f"{'='*60}")
|
|
474
477
|
|
|
478
|
+
# Get detailed parquet stats if duckrun_instance is provided
|
|
479
|
+
parquet_stats = None
|
|
480
|
+
vorder_status = False
|
|
481
|
+
table_size_mb = None
|
|
482
|
+
if duckrun_instance and table_display_name:
|
|
483
|
+
print("\nFetching detailed parquet metadata...")
|
|
484
|
+
try:
|
|
485
|
+
# For single-schema connections, just use the table name
|
|
486
|
+
# For multi-schema connections, use schema.table format
|
|
487
|
+
if hasattr(duckrun_instance, 'scan_all_schemas') and duckrun_instance.scan_all_schemas and schema_name:
|
|
488
|
+
source_param = f"{schema_name}.{table_display_name}"
|
|
489
|
+
else:
|
|
490
|
+
source_param = table_display_name
|
|
491
|
+
|
|
492
|
+
parquet_stats = get_stats(duckrun_instance, source=source_param, detailed=True)
|
|
493
|
+
print(f"✓ Retrieved parquet metadata for {len(parquet_stats)} row groups/columns")
|
|
494
|
+
|
|
495
|
+
# Get vorder status from the stats if available
|
|
496
|
+
if 'vorder' in parquet_stats.columns:
|
|
497
|
+
vorder_status = parquet_stats['vorder'].iloc[0] if len(parquet_stats) > 0 else False
|
|
498
|
+
|
|
499
|
+
# Calculate total table size from compressed sizes
|
|
500
|
+
if 'total_compressed_size' in parquet_stats.columns:
|
|
501
|
+
total_bytes = parquet_stats['total_compressed_size'].sum()
|
|
502
|
+
table_size_mb = round(total_bytes / (1024 * 1024), 2) if total_bytes else None
|
|
503
|
+
except Exception as e:
|
|
504
|
+
print(f"⚠️ Could not fetch parquet stats: {e}")
|
|
505
|
+
parquet_stats = None
|
|
506
|
+
|
|
475
507
|
# Transform to long format
|
|
476
508
|
long_format_results = []
|
|
477
509
|
|
|
@@ -489,6 +521,38 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
489
521
|
# Get total rows and NDV from card_stats if available
|
|
490
522
|
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
491
523
|
|
|
524
|
+
# Aggregate parquet stats per column if available
|
|
525
|
+
parquet_by_column = {}
|
|
526
|
+
if parquet_stats is not None and not parquet_stats.empty:
|
|
527
|
+
# Determine column name field - can be 'name' or 'path_in_schema'
|
|
528
|
+
col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
|
|
529
|
+
|
|
530
|
+
# Group by column name and aggregate
|
|
531
|
+
for col_name in data_columns:
|
|
532
|
+
col_stats = parquet_stats[parquet_stats[col_name_field] == col_name] if col_name_field in parquet_stats.columns else pd.DataFrame()
|
|
533
|
+
|
|
534
|
+
if not col_stats.empty:
|
|
535
|
+
# Aggregate stats across all row groups for this column
|
|
536
|
+
total_compressed = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
|
|
537
|
+
total_uncompressed = col_stats['total_uncompressed_size'].sum() if 'total_uncompressed_size' in col_stats.columns else None
|
|
538
|
+
# Handle both 'encodings' (multiple) and 'encoding' (single) column names
|
|
539
|
+
if 'encodings' in col_stats.columns:
|
|
540
|
+
encodings = col_stats['encodings'].unique().tolist()
|
|
541
|
+
elif 'encoding' in col_stats.columns:
|
|
542
|
+
encodings = col_stats['encoding'].unique().tolist()
|
|
543
|
+
else:
|
|
544
|
+
encodings = []
|
|
545
|
+
compressions = col_stats['compression'].unique().tolist() if 'compression' in col_stats.columns else []
|
|
546
|
+
num_row_groups = col_stats['row_group_id'].nunique() if 'row_group_id' in col_stats.columns else len(col_stats)
|
|
547
|
+
|
|
548
|
+
parquet_by_column[col_name] = {
|
|
549
|
+
'encoding': ', '.join([str(e) for e in encodings if e is not None]),
|
|
550
|
+
'compression': ', '.join([str(c) for c in compressions if c is not None]),
|
|
551
|
+
'total_compressed_size': total_compressed,
|
|
552
|
+
'total_uncompressed_size': total_uncompressed,
|
|
553
|
+
'num_row_groups': num_row_groups
|
|
554
|
+
}
|
|
555
|
+
|
|
492
556
|
# Create one row per data column
|
|
493
557
|
for col in data_columns:
|
|
494
558
|
rle_value = row[col]
|
|
@@ -496,6 +560,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
496
560
|
# Get NDV from card_stats
|
|
497
561
|
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
498
562
|
|
|
563
|
+
# Get parquet stats for this column
|
|
564
|
+
col_parquet = parquet_by_column.get(col, {})
|
|
565
|
+
|
|
499
566
|
long_format_results.append({
|
|
500
567
|
'schema': schema_val,
|
|
501
568
|
'table': table_val,
|
|
@@ -506,6 +573,13 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
506
573
|
'NDV': ndv_value,
|
|
507
574
|
'total_rows': total_rows,
|
|
508
575
|
'total_RLE': total_rle_all_val,
|
|
576
|
+
'encoding': col_parquet.get('encoding', ''),
|
|
577
|
+
'compression': col_parquet.get('compression', ''),
|
|
578
|
+
'total_compressed_size': col_parquet.get('total_compressed_size', None),
|
|
579
|
+
'total_uncompressed_size': col_parquet.get('total_uncompressed_size', None),
|
|
580
|
+
'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
|
|
581
|
+
'table_size_mb': table_size_mb,
|
|
582
|
+
'vorder': vorder_status,
|
|
509
583
|
'comments': ''
|
|
510
584
|
})
|
|
511
585
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.dev5"
|
|
7
|
+
version = "0.2.19.dev6"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
7
7
|
import duckrun
|
|
8
8
|
|
|
9
9
|
# Analyze multiple schemas/tables
|
|
10
|
-
conn = duckrun.connect("tmp/data.lakehouse
|
|
10
|
+
conn = duckrun.connect("tmp/data.lakehouse")
|
|
11
11
|
|
|
12
12
|
# Analyze tables - now returns long format automatically
|
|
13
|
-
result = conn.rle("calendar"
|
|
13
|
+
result = conn.rle("aemo.calendar")
|
|
14
14
|
print(result)
|
|
15
15
|
conn.close()
|
|
16
16
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|