duckrun 0.2.19.dev5__tar.gz → 0.2.19.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/core.py +2 -1
  3. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/rle.py +75 -1
  4. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/PKG-INFO +1 -1
  5. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/pyproject.toml +1 -1
  6. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/tests/test_rle.py +2 -2
  7. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/LICENSE +0 -0
  8. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/README.md +0 -0
  9. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/__init__.py +0 -0
  10. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/auth.py +0 -0
  11. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/files.py +0 -0
  12. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/lakehouse.py +0 -0
  13. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/notebook.py +0 -0
  14. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/runner.py +0 -0
  15. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/semantic_model.py +0 -0
  16. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/stats.py +0 -0
  17. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun/writer.py +0 -0
  18. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/SOURCES.txt +0 -0
  19. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.19.dev5 → duckrun-0.2.19.dev6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev5
3
+ Version: 0.2.19.dev6
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -1425,7 +1425,8 @@ class Duckrun(WorkspaceOperationsMixin):
1425
1425
  max_cardinality_pct=max_cardinality_pct,
1426
1426
  max_ordering_depth=max_ordering_depth,
1427
1427
  schema_name=schema_name,
1428
- table_display_name=tbl
1428
+ table_display_name=tbl,
1429
+ duckrun_instance=self # Pass duckrun instance for detailed parquet stats
1429
1430
  )
1430
1431
 
1431
1432
  def close(self):
@@ -1,6 +1,7 @@
1
1
  import itertools
2
2
  from typing import List, Dict, Tuple, Optional
3
3
  import pandas as pd
4
+ from .stats import get_stats
4
5
 
5
6
  def analyze_parquet_row_groups(con, parquet_path: str) -> pd.DataFrame:
6
7
  """
@@ -392,7 +393,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
392
393
  max_cardinality_pct: float = 0.01,
393
394
  max_ordering_depth: int = 3,
394
395
  schema_name: str = None,
395
- table_display_name: str = None) -> pd.DataFrame:
396
+ table_display_name: str = None,
397
+ duckrun_instance = None) -> pd.DataFrame:
396
398
  """
397
399
  Test column orderings for RLE optimization.
398
400
 
@@ -412,6 +414,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
412
414
  max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
413
415
  schema_name: Optional schema name to include in results (default: None)
414
416
  table_display_name: Optional table name to include in results (default: None)
417
+ duckrun_instance: Optional Duckrun instance to fetch detailed parquet stats (default: None)
415
418
 
416
419
  Returns:
417
420
  DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
@@ -472,6 +475,35 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
472
475
  print(f"✓ Analysis complete!")
473
476
  print(f"{'='*60}")
474
477
 
478
+ # Get detailed parquet stats if both duckrun_instance and table_display_name are provided
479
+ parquet_stats = None
480
+ vorder_status = False
481
+ table_size_mb = None
482
+ if duckrun_instance and table_display_name:
483
+ print("\nFetching detailed parquet metadata...")
484
+ try:
485
+ # For single-schema connections, just use the table name
486
+ # For multi-schema connections, use schema.table format
487
+ if hasattr(duckrun_instance, 'scan_all_schemas') and duckrun_instance.scan_all_schemas and schema_name:
488
+ source_param = f"{schema_name}.{table_display_name}"
489
+ else:
490
+ source_param = table_display_name
491
+
492
+ parquet_stats = get_stats(duckrun_instance, source=source_param, detailed=True)
493
+ print(f"✓ Retrieved parquet metadata for {len(parquet_stats)} row groups/columns")
494
+
495
+ # Get vorder status from the stats if available
496
+ if 'vorder' in parquet_stats.columns:
497
+ vorder_status = parquet_stats['vorder'].iloc[0] if len(parquet_stats) > 0 else False
498
+
499
+ # Calculate total table size from compressed sizes
500
+ if 'total_compressed_size' in parquet_stats.columns:
501
+ total_bytes = parquet_stats['total_compressed_size'].sum()
502
+ table_size_mb = round(total_bytes / (1024 * 1024), 2) if total_bytes else None
503
+ except Exception as e:
504
+ print(f"⚠️ Could not fetch parquet stats: {e}")
505
+ parquet_stats = None
506
+
475
507
  # Transform to long format
476
508
  long_format_results = []
477
509
 
@@ -489,6 +521,38 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
489
521
  # Get total rows and NDV from card_stats if available
490
522
  total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
491
523
 
524
+ # Aggregate parquet stats per column if available
525
+ parquet_by_column = {}
526
+ if parquet_stats is not None and not parquet_stats.empty:
527
+ # Determine column name field - can be 'name' or 'path_in_schema'
528
+ col_name_field = 'path_in_schema' if 'path_in_schema' in parquet_stats.columns else 'name'
529
+
530
+ # Group by column name and aggregate
531
+ for col_name in data_columns:
532
+ col_stats = parquet_stats[parquet_stats[col_name_field] == col_name] if col_name_field in parquet_stats.columns else pd.DataFrame()
533
+
534
+ if not col_stats.empty:
535
+ # Aggregate stats across all row groups for this column
536
+ total_compressed = col_stats['total_compressed_size'].sum() if 'total_compressed_size' in col_stats.columns else None
537
+ total_uncompressed = col_stats['total_uncompressed_size'].sum() if 'total_uncompressed_size' in col_stats.columns else None
538
+ # Handle both 'encodings' (multiple) and 'encoding' (single) column names
539
+ if 'encodings' in col_stats.columns:
540
+ encodings = col_stats['encodings'].unique().tolist()
541
+ elif 'encoding' in col_stats.columns:
542
+ encodings = col_stats['encoding'].unique().tolist()
543
+ else:
544
+ encodings = []
545
+ compressions = col_stats['compression'].unique().tolist() if 'compression' in col_stats.columns else []
546
+ num_row_groups = col_stats['row_group_id'].nunique() if 'row_group_id' in col_stats.columns else len(col_stats)
547
+
548
+ parquet_by_column[col_name] = {
549
+ 'encoding': ', '.join([str(e) for e in encodings if e is not None]),
550
+ 'compression': ', '.join([str(c) for c in compressions if c is not None]),
551
+ 'total_compressed_size': total_compressed,
552
+ 'total_uncompressed_size': total_uncompressed,
553
+ 'num_row_groups': num_row_groups
554
+ }
555
+
492
556
  # Create one row per data column
493
557
  for col in data_columns:
494
558
  rle_value = row[col]
@@ -496,6 +560,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
496
560
  # Get NDV from card_stats
497
561
  ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
498
562
 
563
+ # Get parquet stats for this column
564
+ col_parquet = parquet_by_column.get(col, {})
565
+
499
566
  long_format_results.append({
500
567
  'schema': schema_val,
501
568
  'table': table_val,
@@ -506,6 +573,13 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
506
573
  'NDV': ndv_value,
507
574
  'total_rows': total_rows,
508
575
  'total_RLE': total_rle_all_val,
576
+ 'encoding': col_parquet.get('encoding', ''),
577
+ 'compression': col_parquet.get('compression', ''),
578
+ 'total_compressed_size': col_parquet.get('total_compressed_size', None),
579
+ 'total_uncompressed_size': col_parquet.get('total_uncompressed_size', None),
580
+ 'avg_row_group_size': total_rows // col_parquet.get('num_row_groups', 1) if total_rows and col_parquet.get('num_row_groups') else None,
581
+ 'table_size_mb': table_size_mb,
582
+ 'vorder': vorder_status,
509
583
  'comments': ''
510
584
  })
511
585
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev5
3
+ Version: 0.2.19.dev6
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev5"
7
+ version = "0.2.19.dev6"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -7,10 +7,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7
7
  import duckrun
8
8
 
9
9
  # Analyze multiple schemas/tables
10
- conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
10
+ conn = duckrun.connect("tmp/data.lakehouse")
11
11
 
12
12
  # Analyze tables - now returns long format automatically
13
- result = conn.rle("calendar",'auto')
13
+ result = conn.rle("aemo.calendar")
14
14
  print(result)
15
15
  conn.close()
16
16
 
File without changes
File without changes
File without changes