duckrun 0.2.19.dev3__tar.gz → 0.2.19.dev4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/core.py +96 -5
  3. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/rle.py +210 -50
  4. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/PKG-INFO +1 -1
  5. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/SOURCES.txt +2 -1
  6. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/pyproject.toml +1 -1
  7. duckrun-0.2.19.dev4/tests/test_rle.py +16 -0
  8. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/LICENSE +0 -0
  9. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/README.md +0 -0
  10. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/__init__.py +0 -0
  11. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/auth.py +0 -0
  12. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/files.py +0 -0
  13. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/lakehouse.py +0 -0
  14. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/notebook.py +0 -0
  15. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/runner.py +0 -0
  16. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/semantic_model.py +0 -0
  17. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/stats.py +0 -0
  18. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/writer.py +0 -0
  19. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev3
3
+ Version: 0.2.19.dev4
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
1244
1244
  refresh=refresh
1245
1245
  )
1246
1246
 
1247
- def rle(self, table_name: str = None, mode: str = "natural",
1247
+ def rle(self, table_name: str = None, mode = "natural",
1248
1248
  min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
1249
- max_ordering_depth: int = 3):
1249
+ max_ordering_depth: int = 3, limit: int = None):
1250
1250
  """
1251
1251
  Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
1252
1252
 
@@ -1254,13 +1254,15 @@ class Duckrun(WorkspaceOperationsMixin):
1254
1254
  table_name: Name of the table to analyze. Can be:
1255
1255
  - 'table_name' (uses current schema)
1256
1256
  - 'schema.table_name' (specific schema)
1257
- mode: Analysis mode:
1258
- - "natural": Calculate RLE for natural order only (default, fastest)
1257
+ mode: Analysis mode or column ordering:
1258
+ - "natural": Calculate RLE for natural order only (fastest)
1259
1259
  - "auto": Natural order + cardinality-based ordering (recommended)
1260
1260
  - "advanced": Natural + cardinality + greedy incremental search (most thorough)
1261
+ - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
1261
1262
  min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
1262
1263
  max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
1263
1264
  max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
1265
+ limit: Optional row limit for testing/development (default: None, analyzes all rows)
1264
1266
 
1265
1267
  Returns:
1266
1268
  DataFrame with RLE analysis results
@@ -1276,6 +1278,10 @@ class Duckrun(WorkspaceOperationsMixin):
1276
1278
  # Advanced optimization (greedy incremental search)
1277
1279
  con.rle("mytable", "advanced")
1278
1280
 
1281
+ # Test specific column ordering
1282
+ con.rle("mytable", ["date", "duid"])
1283
+ con.rle("mytable", ["cutoff", "time", "DUID", "date"])
1284
+
1279
1285
  # Advanced with custom depth
1280
1286
  con.rle("mytable", "advanced", max_ordering_depth=4)
1281
1287
 
@@ -1284,6 +1290,9 @@ class Duckrun(WorkspaceOperationsMixin):
1284
1290
 
1285
1291
  # Custom thresholds for small tables
1286
1292
  con.rle("mytable", "auto", max_cardinality_pct=0.05)
1293
+
1294
+ # Limit rows for testing
1295
+ con.rle("mytable", "auto", limit=10000)
1287
1296
  """
1288
1297
  from .rle import (
1289
1298
  calculate_cardinality_ratio,
@@ -1326,15 +1335,97 @@ class Duckrun(WorkspaceOperationsMixin):
1326
1335
  print(f"❌ Error accessing Delta table: {e}")
1327
1336
  return None
1328
1337
 
1338
+ # Check if mode is a list of columns (custom ordering)
1339
+ if isinstance(mode, list):
1340
+ # User wants to test a specific column ordering
1341
+ print(f"Testing custom column ordering: {', '.join(mode)}")
1342
+
1343
+ # Calculate cardinality for NDV values
1344
+ card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
1345
+
1346
+ # Calculate RLE for the specified ordering
1347
+ rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
1348
+
1349
+ total_rle_all = sum(rle_counts.values())
1350
+
1351
+ print(f"\nResults:")
1352
+ print(f" Custom ordering: [{', '.join(mode)}]")
1353
+ print(f" Total RLE (all columns): {total_rle_all:,} runs")
1354
+
1355
+ # Return as DataFrame for consistency
1356
+ import pandas as pd
1357
+ results = [{
1358
+ 'schema': schema_name,
1359
+ 'table': tbl,
1360
+ 'sort_order': 'custom',
1361
+ 'columns_used': ', '.join(mode),
1362
+ 'total_rle_all': total_rle_all,
1363
+ **rle_counts
1364
+ }]
1365
+
1366
+ df = pd.DataFrame(results)
1367
+
1368
+ # Transform to long format
1369
+ long_format_results = []
1370
+
1371
+ for _, row in df.iterrows():
1372
+ schema_val = row['schema']
1373
+ table_val = row['table']
1374
+ sort_order = row['sort_order']
1375
+ columns_used = row['columns_used']
1376
+ total_rle_all_val = row['total_rle_all']
1377
+
1378
+ # Get all column names except metadata columns
1379
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
1380
+ data_columns = [col for col in df.columns if col not in metadata_cols]
1381
+
1382
+ # Get total rows from card_stats if available
1383
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
1384
+
1385
+ # Parse the columns_used to get ordering
1386
+ sort_columns_list = [c.strip() for c in columns_used.split(',')]
1387
+
1388
+ # Create one row per data column
1389
+ for col in data_columns:
1390
+ rle_value = row[col]
1391
+
1392
+ # Get NDV from card_stats
1393
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
1394
+
1395
+ # Determine if column was included in the sort and its position
1396
+ is_in_sort = col in sort_columns_list
1397
+ order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
1398
+ comment = '' if is_in_sort else 'not included in the sort'
1399
+
1400
+ long_format_results.append({
1401
+ 'schema': schema_val,
1402
+ 'table': table_val,
1403
+ 'sort_type': sort_order,
1404
+ 'column': col,
1405
+ 'order': order_position,
1406
+ 'RLE': rle_value,
1407
+ 'NDV': ndv_value,
1408
+ 'total_rows': total_rows,
1409
+ 'total_RLE': total_rle_all_val,
1410
+ 'comments': comment
1411
+ })
1412
+
1413
+ long_df = pd.DataFrame(long_format_results)
1414
+
1415
+ return long_df
1416
+
1329
1417
  # All modes now use test_column_orderings_smart with the mode parameter
1330
1418
  return test_column_orderings_smart(
1331
1419
  self.con,
1332
1420
  table_path,
1333
1421
  table_name=table_name, # Pass table name for cardinality calculation on full dataset
1334
1422
  mode=mode,
1423
+ limit=limit,
1335
1424
  min_distinct_threshold=min_distinct_threshold,
1336
1425
  max_cardinality_pct=max_cardinality_pct,
1337
- max_ordering_depth=max_ordering_depth
1426
+ max_ordering_depth=max_ordering_depth,
1427
+ schema_name=schema_name,
1428
+ table_display_name=tbl
1338
1429
  )
1339
1430
 
1340
1431
  def close(self):
@@ -198,7 +198,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
198
198
  if sort_columns:
199
199
  order_by = "ORDER BY " + ", ".join(sort_columns)
200
200
  else:
201
- order_by = "ORDER BY file_row_number ASC"
201
+ order_by = "ORDER BY filename, file_row_number ASC"
202
202
 
203
203
  limit_clause = f"LIMIT {limit}" if limit else ""
204
204
 
@@ -210,7 +210,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
210
210
  SELECT
211
211
  {column_name},
212
212
  ROW_NUMBER() OVER ({order_by}) as sort_order
213
- FROM delta_scan('{delta_path}', file_row_number = TRUE)
213
+ FROM delta_scan('{delta_path}', filename = TRUE, file_row_number = TRUE)
214
214
  {limit_clause}
215
215
  ),
216
216
  runs AS (
@@ -403,7 +403,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
403
403
  mode: str = "natural",
404
404
  min_distinct_threshold: int = 2,
405
405
  max_cardinality_pct: float = 0.01,
406
- max_ordering_depth: int = 3) -> pd.DataFrame:
406
+ max_ordering_depth: int = 3,
407
+ schema_name: str = None,
408
+ table_display_name: str = None) -> pd.DataFrame:
407
409
  """
408
410
  Test column orderings for RLE optimization.
409
411
 
@@ -421,12 +423,27 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
421
423
  min_distinct_threshold: Exclude columns with fewer distinct values (default: 2, i.e. only exclude constants with 1 value)
422
424
  max_cardinality_pct: Exclude columns with cardinality ratio above this % (default: 0.01 = 1%)
423
425
  max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
426
+ schema_name: Optional schema name to include in results (default: None)
427
+ table_display_name: Optional table name to include in results (default: None)
424
428
 
425
429
  Returns:
426
- DataFrame with columns: sort_order, columns_used, total_rle_all, and individual column RLE counts
430
+ DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
427
431
  """
428
432
  print("Analyzing column characteristics...")
429
433
 
434
+ # Calculate cardinality ratios first (for all modes)
435
+ print("\nCalculating cardinality ratios on full dataset...")
436
+ if table_name:
437
+ card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
438
+ else:
439
+ # Fallback: use delta_scan directly
440
+ card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)
441
+
442
+ print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
443
+ for col, stats in sorted(card_stats.items(), key=lambda x: x[1]['cardinality_ratio']):
444
+ card_pct = stats['cardinality_ratio'] * 100
445
+ print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")
446
+
430
447
  # For "natural" mode, just calculate RLE on natural order
431
448
  if mode == "natural":
432
449
  print("\n" + "="*60)
@@ -453,9 +470,10 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
453
470
 
454
471
  print(f"\nResults:")
455
472
  print(f" Total RLE (all columns): {total_rle_all:,}")
456
- print(f" Average RLE per column: {total_rle_all / len(column_names):.1f}")
457
473
 
458
474
  results = [{
475
+ 'schema': schema_name,
476
+ 'table': table_display_name,
459
477
  'sort_order': 'natural_order',
460
478
  'columns_used': 'file_row_number',
461
479
  'total_rle_all': total_rle_all,
@@ -467,21 +485,48 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
467
485
  print(f"✓ Analysis complete!")
468
486
  print(f"{'='*60}")
469
487
 
470
- return df
471
-
472
- # For "auto" and "advanced" modes, calculate cardinality ratios first
473
- print("\nCalculating cardinality ratios on full dataset...")
474
- if table_name:
475
- card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
476
- else:
477
- # Fallback: use delta_scan directly
478
- card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)
479
-
480
- print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
481
- for col, stats in sorted(card_stats.items(), key=lambda x: x[1]['cardinality_ratio']):
482
- card_pct = stats['cardinality_ratio'] * 100
483
- print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")
488
+ # Transform to long format
489
+ long_format_results = []
490
+
491
+ for _, row in df.iterrows():
492
+ schema_val = row['schema']
493
+ table_val = row['table']
494
+ sort_order = row['sort_order']
495
+ columns_used = row['columns_used']
496
+ total_rle_all_val = row['total_rle_all']
497
+
498
+ # Get all column names except metadata columns
499
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
500
+ data_columns = [col for col in df.columns if col not in metadata_cols]
501
+
502
+ # Get total rows and NDV from card_stats if available
503
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
504
+
505
+ # Create one row per data column
506
+ for col in data_columns:
507
+ rle_value = row[col]
508
+
509
+ # Get NDV from card_stats
510
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
511
+
512
+ long_format_results.append({
513
+ 'schema': schema_val,
514
+ 'table': table_val,
515
+ 'sort_type': sort_order,
516
+ 'column': col,
517
+ 'order': None,
518
+ 'RLE': rle_value,
519
+ 'NDV': ndv_value,
520
+ 'total_rows': total_rows,
521
+ 'total_RLE': total_rle_all_val,
522
+ 'comments': ''
523
+ })
524
+
525
+ long_df = pd.DataFrame(long_format_results)
526
+
527
+ return long_df
484
528
 
529
+ # For "auto" and "advanced" modes, continue with optimization
485
530
  # Extract just the ratios for easier handling
486
531
  cardinality_ratios = {col: stats['cardinality_ratio'] for col, stats in card_stats.items()}
487
532
  column_names = list(card_stats.keys())
@@ -532,9 +577,30 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
532
577
  # Filter baseline to only include good_for_reordering columns
533
578
  baseline_filtered = {col: rle for col, rle in baseline.items() if col in good_for_reordering}
534
579
 
535
- print(f"Baseline RLE runs (columns worth reordering):")
536
- for col in sorted(baseline_filtered.keys(), key=lambda c: baseline_filtered[c]):
537
- print(f" {col}: {baseline_filtered[col]:,} runs")
580
+ # Show column categorization upfront
581
+ print(f"\nColumn Analysis (baseline RLE in natural order):")
582
+
583
+ # Show columns worth reordering first
584
+ if baseline_filtered:
585
+ print(f" Columns included in optimization:")
586
+ for col in sorted(baseline_filtered.keys(), key=lambda c: baseline_filtered[c]):
587
+ print(f" {col}: {baseline_filtered[col]:,} runs")
588
+ print(f" ─────────────────────────")
589
+ print(f" Subtotal: {sum(baseline_filtered.values()):,} runs")
590
+
591
+ # Show excluded columns (constant or high-cardinality)
592
+ excluded_cols = {col: rle for col, rle in baseline.items()
593
+ if col in constant_cols or col in fragmented_cols}
594
+ if excluded_cols:
595
+ print(f" Columns excluded from optimization:")
596
+ for col in sorted(excluded_cols.keys(), key=lambda c: excluded_cols[c]):
597
+ reason = "constant" if col in constant_cols else "high-cardinality"
598
+ print(f" {col}: {excluded_cols[col]:,} runs ({reason})")
599
+ print(f" ─────────────────────────")
600
+ print(f" Subtotal: {sum(excluded_cols.values()):,} runs")
601
+
602
+ # Show total baseline RLE
603
+ print(f"\nBaseline Total RLE (all columns): {sum(baseline.values()):,} runs")
538
604
 
539
605
  # Define only the most promising orderings to test
540
606
  orderings_to_test = [
@@ -545,20 +611,22 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
545
611
  if mode in ["auto", "advanced"] and len(good_for_reordering) >= 2:
546
612
  orderings_to_test.append((good_for_reordering, 'by_cardinality'))
547
613
 
548
- print(f"\n✓ Testing {len(orderings_to_test)} orderings")
549
- print("="*60)
614
+ # Count only the actual reordering tests (exclude natural_order baseline)
615
+ num_tests = len(orderings_to_test) - 1
550
616
 
551
617
  results = []
552
618
 
553
619
  for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
554
- print(f"\n[{i}/{len(orderings_to_test)}] Testing: {label}")
555
- if sort_cols:
556
- print(f" Order: {', '.join(sort_cols)}")
557
-
558
620
  if i == 1:
559
- # Use baseline for natural order (already calculated)
621
+ # Use baseline for natural order (already calculated and displayed)
560
622
  rle_counts = baseline
561
623
  else:
624
+ # This is an actual reordering test
625
+ test_num = i - 1
626
+ print(f"\n[{test_num}/{num_tests}] Testing: {label}")
627
+ if sort_cols:
628
+ print(f" Order: {', '.join(sort_cols)}")
629
+
562
630
  # Calculate RLE for this ordering
563
631
  rle_counts = calculate_rle_for_columns(con, delta_path, sort_cols, limit)
564
632
 
@@ -572,11 +640,11 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
572
640
  # Calculate weighted score (considering both RLE and cardinality - lower cardinality = better)
573
641
  cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
574
642
 
575
- print(f" Total RLE (all columns): {total_rle_all:,}")
576
- print(f" Optimizable columns RLE: {total_rle_optimizable:,}")
577
- print(f" Avg RLE (optimizable): {total_rle_optimizable / len(rle_filtered):.1f}")
643
+ print(f" Total RLE: {total_rle_all:,} runs")
578
644
 
579
645
  results.append({
646
+ 'schema': schema_name,
647
+ 'table': table_display_name,
580
648
  'sort_order': label,
581
649
  'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
582
650
  'total_rle_all': total_rle_all, # All columns (must be >= row_count)
@@ -599,42 +667,82 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
599
667
  current_best_rle = sum(baseline_filtered.values())
600
668
  remaining_columns = list(good_for_reordering)
601
669
 
670
+ # Get the cardinality-based RLE as the target to beat (both total and optimizable)
671
+ cardinality_rle = results[-1]['optimizable_rle'] if len(results) > 1 else float('inf')
672
+ cardinality_total_rle = results[-1]['total_rle_all'] if len(results) > 1 else float('inf')
673
+
602
674
  for depth in range(1, min(max_ordering_depth + 1, len(good_for_reordering) + 1)):
603
- print(f"\n--- Depth {depth}: Testing {len(remaining_columns)} candidate columns ---")
675
+ num_candidates = len(remaining_columns)
676
+ num_positions = len(current_best_ordering) + 1
677
+ total_tests = num_candidates * num_positions
678
+ print(f"\n--- Depth {depth}: Testing {num_candidates} candidate columns × {num_positions} positions = {total_tests} tests ---")
679
+ print(f" Target to beat: {cardinality_total_rle:,} runs (cardinality ordering)")
604
680
 
605
681
  best_depth_ordering = None
606
682
  best_depth_rle = float('inf')
607
683
  best_depth_col = None
608
684
  best_depth_position = None
685
+ early_exit = False
686
+
687
+ # Sort remaining candidates by baseline RLE (HIGHER first = test worse candidates first)
688
+ # This way we test DUID, time, date before cutoff (which we know is good from cardinality test)
689
+ candidates_sorted = sorted(remaining_columns, key=lambda c: baseline_filtered[c], reverse=True)
609
690
 
610
- # Try adding each remaining column
611
- for candidate_col in remaining_columns:
691
+ test_num = 0
692
+ # Try adding each remaining column (sorted by baseline RLE - worse first)
693
+ for candidate_col in candidates_sorted:
612
694
  # Try inserting at each possible position (including end)
613
695
  for insert_pos in range(len(current_best_ordering) + 1):
696
+ test_num += 1
697
+
614
698
  # Build test ordering: insert candidate at position
615
699
  test_ordering = current_best_ordering[:insert_pos] + [candidate_col] + current_best_ordering[insert_pos:]
616
700
 
701
+ print(f" [{test_num}/{total_tests}] Testing '{candidate_col}' at position {insert_pos}: [{', '.join(test_ordering)}]", end='', flush=True)
702
+
617
703
  # Calculate RLE for this ordering
618
704
  rle_counts = calculate_rle_for_columns(con, delta_path, test_ordering, limit)
619
705
 
620
706
  # Sum RLE for optimizable columns only
621
707
  rle_filtered = {col: rle for col, rle in rle_counts.items() if col in good_for_reordering}
622
708
  total_rle = sum(rle_filtered.values())
709
+ total_rle_all = sum(rle_counts.values())
710
+
711
+ is_best = total_rle < best_depth_rle
712
+ beats_cardinality = total_rle < cardinality_rle
713
+
714
+ status = ""
715
+ if beats_cardinality:
716
+ status = " 🎯 Beats cardinality!"
717
+
718
+ print(f" → Total: {total_rle_all:,}{status}")
623
719
 
624
720
  # Track best at this depth
625
- if total_rle < best_depth_rle:
721
+ if is_best:
626
722
  best_depth_rle = total_rle
627
723
  best_depth_ordering = test_ordering
628
724
  best_depth_col = candidate_col
629
725
  best_depth_position = insert_pos
630
726
  best_depth_rle_counts = rle_counts
727
+
728
+ # Early exit if we beat cardinality ordering!
729
+ if beats_cardinality:
730
+ print(f"\n ⚡ Early exit! Found ordering better than cardinality. Moving to next depth.")
731
+ early_exit = True
732
+ break
733
+
734
+ if early_exit:
735
+ break
631
736
 
632
737
  # Check if we found improvement
633
738
  if best_depth_rle < current_best_rle:
634
- improvement_pct = ((current_best_rle - best_depth_rle) / current_best_rle) * 100
635
- print(f"✓ Best at depth {depth}: Add '{best_depth_col}' at position {best_depth_position}")
636
- print(f" Ordering: {', '.join(best_depth_ordering)}")
637
- print(f" RLE: {best_depth_rle:,} runs (improved {improvement_pct:.1f}% from previous depth)")
739
+ current_total_rle_all = sum(best_depth_rle_counts.values())
740
+ baseline_total_rle_all = sum(baseline.values())
741
+ improvement_pct = ((baseline_total_rle_all - current_total_rle_all) / baseline_total_rle_all) * 100
742
+ print(f"\n✓ Best at depth {depth}: [{', '.join(best_depth_ordering)}]")
743
+ print(f" Total RLE (all columns): {current_total_rle_all:,} runs")
744
+ print(f" Optimizable RLE: {best_depth_rle:,} runs")
745
+ print(f" Improvement: {improvement_pct:.1f}% better than baseline (total RLE)")
638
746
 
639
747
  # Update for next depth
640
748
  current_best_ordering = best_depth_ordering
@@ -647,6 +755,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
647
755
  cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
648
756
 
649
757
  results.append({
758
+ 'schema': schema_name,
759
+ 'table': table_display_name,
650
760
  'sort_order': f'greedy_depth_{depth}',
651
761
  'columns_used': ', '.join(best_depth_ordering),
652
762
  'total_rle_all': total_rle_all,
@@ -657,8 +767,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
657
767
  **best_depth_rle_counts
658
768
  })
659
769
  else:
660
- print(f"✗ No improvement found at depth {depth} - stopping early")
661
- print(f" Best RLE remains: {current_best_rle:,} runs")
770
+ print(f"\n✗ No improvement found at depth {depth} - stopping early")
771
+ print(f" Best RLE (all columns): {sum(best_depth_rle_counts.values()) if best_depth_rle_counts else sum(baseline.values()):,} runs")
772
+ print(f" Best optimizable RLE: {best_depth_rle if best_depth_rle != float('inf') else current_best_rle:,} runs")
662
773
  break
663
774
 
664
775
  print(f"\n{'='*60}")
@@ -666,7 +777,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
666
777
  print(f"{'='*60}")
667
778
  if current_best_ordering:
668
779
  print(f"Final greedy ordering: {', '.join(current_best_ordering)}")
669
- print(f"Final RLE: {current_best_rle:,} runs")
780
+ print(f"Final optimizable RLE: {current_best_rle:,} runs")
670
781
 
671
782
 
672
783
  # Convert to DataFrame and sort by optimizable RLE (lower is better)
@@ -677,14 +788,14 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
677
788
  print(f"✓ Analysis complete!")
678
789
  print(f"{'='*60}")
679
790
  print(f"Best ordering: {df.iloc[0]['sort_order']}")
680
- print(f"Best optimizable RLE: {df.iloc[0]['optimizable_rle']:,} runs (lower is better)")
681
- print(f"Total RLE (all columns): {df.iloc[0]['total_rle_all']:,} runs")
791
+ print(f"Best total RLE: {df.iloc[0]['total_rle_all']:,} runs (lower is better)")
682
792
 
683
793
 
684
- improvement = baseline_filtered[list(baseline_filtered.keys())[0]] if baseline_filtered else 0
685
- best_rle = df.iloc[0]['optimizable_rle']
686
- if len(df) > 1 and improvement > 0:
687
- pct = ((sum(baseline_filtered.values()) - best_rle) / sum(baseline_filtered.values())) * 100
794
+ # Calculate improvement using total RLE (all columns) for meaningful comparison
795
+ baseline_total_rle = sum(baseline.values())
796
+ best_total_rle = df.iloc[0]['total_rle_all']
797
+ if len(df) > 1 and baseline_total_rle > 0:
798
+ pct = ((baseline_total_rle - best_total_rle) / baseline_total_rle) * 100
688
799
  if pct > 0:
689
800
  print(f"Improvement: {pct:.1f}% fewer runs vs natural order")
690
801
 
@@ -693,7 +804,56 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
693
804
  # Remove: optimizable_rle, avg_rle, cardinality_weighted_score, method
694
805
  display_df = df.drop(columns=['optimizable_rle', 'avg_rle', 'cardinality_weighted_score', 'method'], errors='ignore')
695
806
 
696
- return display_df
807
+ # Transform to long format
808
+ long_format_results = []
809
+
810
+ for _, row in display_df.iterrows():
811
+ schema_val = row['schema']
812
+ table_val = row['table']
813
+ sort_order = row['sort_order']
814
+ columns_used = row['columns_used']
815
+ total_rle_all = row['total_rle_all']
816
+
817
+ # Get all column names except metadata columns
818
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
819
+ data_columns = [col for col in display_df.columns if col not in metadata_cols]
820
+
821
+ # Get total rows and NDV from card_stats if available
822
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
823
+
824
+ # Parse the columns_used to get ordering
825
+ sort_columns_list = []
826
+ if columns_used != 'file_row_number':
827
+ sort_columns_list = [c.strip() for c in columns_used.split(',')]
828
+
829
+ # Create one row per data column
830
+ for col in data_columns:
831
+ rle_value = row[col]
832
+
833
+ # Get NDV from card_stats
834
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
835
+
836
+ # Determine if column was included in the sort and its position
837
+ is_in_sort = col in sort_columns_list
838
+ order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
839
+ comment = '' if is_in_sort or columns_used == 'file_row_number' else 'not included in the sort'
840
+
841
+ long_format_results.append({
842
+ 'schema': schema_val,
843
+ 'table': table_val,
844
+ 'sort_type': sort_order,
845
+ 'column': col,
846
+ 'order': order_position,
847
+ 'RLE': rle_value,
848
+ 'NDV': ndv_value,
849
+ 'total_rows': total_rows,
850
+ 'total_RLE': total_rle_all,
851
+ 'comments': comment
852
+ })
853
+
854
+ long_df = pd.DataFrame(long_format_results)
855
+
856
+ return long_df
697
857
 
698
858
 
699
859
  # Example usage:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev3
3
+ Version: 0.2.19.dev4
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -16,4 +16,5 @@ duckrun.egg-info/PKG-INFO
16
16
  duckrun.egg-info/SOURCES.txt
17
17
  duckrun.egg-info/dependency_links.txt
18
18
  duckrun.egg-info/requires.txt
19
- duckrun.egg-info/top_level.txt
19
+ duckrun.egg-info/top_level.txt
20
+ tests/test_rle.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev3"
7
+ version = "0.2.19.dev4"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,16 @@
1
+ import sys
2
+ import os
3
+ import pandas as pd
4
+
5
+ # Add the parent directory to Python path to use local package source
6
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7
+ import duckrun
8
+
9
+ # Analyze multiple schemas/tables
10
+ conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
11
+
12
+ # Analyze tables - now returns long format automatically
13
+ result = conn.rle("calendar",'auto')
14
+ print(result)
15
+ conn.close()
16
+
File without changes
File without changes
File without changes