duckrun 0.2.16.dev2__py3-none-any.whl → 0.2.19.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/rle.py ADDED
@@ -0,0 +1,860 @@
1
+ import itertools
2
+ from typing import List, Dict, Tuple, Optional
3
+ import pandas as pd
4
+
5
+ def analyze_parquet_row_groups(con, parquet_path: str) -> Optional[pd.DataFrame]:
6
+ """
7
+ Analyze Parquet row group statistics to identify columns with constant values.
8
+ This is much faster than reading all data.
9
+
10
+ Returns:
11
+ DataFrame with row group / column chunk statistics, or None if the metadata could not be read
12
+ """
13
+ try:
14
+ # Get row group metadata
15
+ metadata = con.sql(f"""
16
+ SELECT * FROM parquet_metadata('{parquet_path}')
17
+ """).df()
18
+
19
+ return metadata
20
+ except Exception as e:
21
+ print(f"Could not read parquet metadata: {e}")
22
+ return None
23
+
24
+
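+ # A minimal usage sketch (illustrative only, not called by the module): inspect
+ # row-group metadata for a local Parquet file. The path 'sales.parquet' is a
+ # hypothetical placeholder; any readable Parquet file works. If min == max for a
+ # column within a row group, that column is constant inside that group.
+ #
+ # import duckdb
+ # con = duckdb.connect()
+ # meta = analyze_parquet_row_groups(con, 'sales.parquet')
+ # if meta is not None:
+ #     # one row per (row group, column chunk)
+ #     print(meta.head())
+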
25
+ def estimate_rle_from_row_groups(con, parquet_path: str) -> Optional[pd.DataFrame]:
26
+ """
27
+ Estimate RLE potential from Parquet row group statistics.
28
+ If min == max in a row group, that entire group is one RLE run.
29
+
30
+ Returns:
31
+ DataFrame of row group / column chunk metadata if available, otherwise None
32
+ """
33
+ try:
34
+ # Get row group statistics - this varies by DuckDB version
35
+ # Try to get column chunk stats
36
+ stats_query = f"""
37
+ SELECT
38
+ row_group_id,
39
+ column_id,
40
+ file_offset,
41
+ num_values,
42
+ total_compressed_size,
43
+ total_uncompressed_size
44
+ FROM parquet_metadata('{parquet_path}')
45
+ """
46
+
47
+ stats = con.sql(stats_query).df()
48
+ print("Row group metadata available!")
49
+ return stats
50
+
51
+ except Exception as e:
52
+ print(f"Parquet metadata not available in this DuckDB version: {e}")
53
+ print("Falling back to stratified sampling...")
54
+ return None
55
+
56
+
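+ # Illustrative fallback chain (a sketch with hypothetical paths): try the
+ # metadata-based estimate first and fall back to stratified sampling when the
+ # metadata query is not supported. The Delta path requires DuckDB's delta extension.
+ #
+ # import duckdb
+ # con = duckdb.connect()
+ # stats = estimate_rle_from_row_groups(con, 'sales.parquet')
+ # if stats is None:
+ #     runs, densities = stratified_rle_sampling(con, '/lake/Tables/sales', num_segments=5)
+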
57
+ def stratified_rle_sampling(con, delta_path: str, sort_columns: List[str] = None,
58
+ num_segments: int = 5, segment_size: int = 1000) -> Tuple[Dict[str, int], Dict[str, dict]]:
59
+ """
60
+ Sample RLE density across multiple segments of the file.
61
+
62
+ Args:
63
+ con: DuckDB connection
64
+ delta_path: Path to Delta table
65
+ sort_columns: List of columns to sort by before calculating RLE. If None, uses natural order.
66
+ num_segments: Number of segments to sample across the file
67
+ segment_size: Number of rows per segment
68
+
69
+ Returns:
70
+ Tuple of (estimated RLE runs per column for the full file, per-column density statistics)
71
+ """
72
+ # Get total row count
73
+ total_rows = con.sql(f"""
74
+ SELECT COUNT(*) FROM delta_scan('{delta_path}')
75
+ """).fetchone()[0]
76
+
77
+ # Get column names
78
+ columns = con.sql(f"""
79
+ SELECT column_name
80
+ FROM (
81
+ DESCRIBE
82
+ SELECT * FROM delta_scan('{delta_path}', file_row_number = TRUE)
83
+ )
84
+ WHERE column_name != 'file_row_number'
85
+ """).fetchall()
86
+
87
+ column_names = [col[0] for col in columns]
88
+
89
+ # Build ORDER BY clause
90
+ if sort_columns:
91
+ order_by_clause = "ORDER BY " + ", ".join(sort_columns)
92
+ sort_desc = f"sorted by [{', '.join(sort_columns)}]"
93
+ else:
94
+ order_by_clause = "ORDER BY file_row_number"
95
+ sort_desc = "natural order"
96
+
97
+ # Calculate segment positions spread across the file
98
+ segment_positions = []
99
+ if num_segments == 1:
100
+ segment_positions = [0]
101
+ else:
102
+ step = total_rows // (num_segments + 1)
103
+ segment_positions = [step * (i + 1) for i in range(num_segments)]
104
+
105
+ # Sample each segment and calculate RLE density
106
+ all_densities = {col: [] for col in column_names}
107
+
108
+ for seg_idx, start_pos in enumerate(segment_positions, 1):
109
+ for col in column_names:
110
+ # Sort the ENTIRE dataset first, then take the segment from the sorted result
111
+ # This is expensive but necessary for accurate results
112
+ rle_count = con.sql(f"""
113
+ WITH sorted_data AS (
114
+ SELECT
115
+ *,
116
+ ROW_NUMBER() OVER ({order_by_clause}) as sorted_row_num
117
+ FROM delta_scan('{delta_path}', file_row_number = TRUE)
118
+ ),
119
+ segment_data AS (
120
+ SELECT
121
+ {col},
122
+ sorted_row_num
123
+ FROM sorted_data
124
+ WHERE sorted_row_num >= {start_pos}
125
+ ORDER BY sorted_row_num
126
+ LIMIT {segment_size}
127
+ ),
128
+ runs AS (
129
+ SELECT
130
+ CASE
131
+ WHEN LAG({col}) OVER (ORDER BY sorted_row_num) != {col}
132
+ OR LAG({col}) OVER (ORDER BY sorted_row_num) IS NULL
133
+ THEN 1
134
+ ELSE 0
135
+ END AS new_run
136
+ FROM segment_data
137
+ )
138
+ SELECT SUM(new_run) AS rle_run_count
139
+ FROM runs
140
+ """).fetchone()[0]
141
+
142
+ # Calculate density (runs per row)
143
+ density = rle_count / segment_size
144
+ all_densities[col].append(density)
145
+
146
+ # Estimate total runs for full file
147
+ estimated_runs = {}
148
+ density_stats = {}
149
+
150
+ for col in column_names:
151
+ avg_density = sum(all_densities[col]) / len(all_densities[col])
152
+ min_density = min(all_densities[col])
153
+ max_density = max(all_densities[col])
154
+ std_density = (sum((d - avg_density)**2 for d in all_densities[col]) / len(all_densities[col]))**0.5
155
+
156
+ estimated_total = int(avg_density * total_rows)
157
+ estimated_runs[col] = estimated_total
158
+
159
+ density_stats[col] = {
160
+ 'avg_density': avg_density,
161
+ 'min_density': min_density,
162
+ 'max_density': max_density,
163
+ 'std_density': std_density,
164
+ 'estimated_runs': estimated_total,
165
+ 'variance_coefficient': std_density / avg_density if avg_density > 0 else 0
166
+ }
167
+
168
+ return estimated_runs, density_stats
169
+
170
+
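+ # Usage sketch with placeholder values: estimate per-column RLE runs for a Delta
+ # table when sorted by a hypothetical 'region' column. `con` is a DuckDB connection
+ # with the delta extension loaded. Each estimate is the average sampled run density
+ # multiplied by the total row count.
+ #
+ # runs, stats = stratified_rle_sampling(con, '/lake/Tables/sales',
+ #                                       sort_columns=['region'],
+ #                                       num_segments=5, segment_size=1000)
+ # for col, s in stats.items():
+ #     print(col, s['estimated_runs'], round(s['variance_coefficient'], 3))
+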
171
+ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = None, limit: int = None) -> Dict[str, int]:
172
+ """
173
+ Calculate RLE runs for all columns in a Delta table, optionally after sorting.
174
+
175
+ Args:
176
+ con: DuckDB connection
177
+ delta_path: Path to Delta table
178
+ sort_columns: List of columns to sort by (in order). If None, uses natural file order.
179
+ limit: Optional limit on number of rows to analyze
180
+
181
+ Returns:
182
+ Dictionary mapping column names to RLE run counts
183
+ """
184
+ # Get all column names
185
+ columns = con.sql(f"""
186
+ SELECT column_name
187
+ FROM (
188
+ DESCRIBE
189
+ SELECT *
190
+ FROM delta_scan('{delta_path}', file_row_number = TRUE)
191
+ )
192
+ WHERE column_name != 'file_row_number'
193
+ """).fetchall()
194
+
195
+ column_names = [col[0] for col in columns]
196
+
197
+ # Build ORDER BY clause
198
+ if sort_columns:
199
+ order_by = "ORDER BY " + ", ".join(sort_columns)
200
+ else:
201
+ order_by = "ORDER BY filename, file_row_number ASC"
202
+
203
+ limit_clause = f"LIMIT {limit}" if limit else ""
204
+
205
+ # Calculate RLE for each column
206
+ results = {}
207
+ for column_name in column_names:
208
+ rle_count = con.sql(f"""
209
+ WITH ordered_data AS (
210
+ SELECT
211
+ {column_name},
212
+ ROW_NUMBER() OVER ({order_by}) as sort_order
213
+ FROM delta_scan('{delta_path}', filename = TRUE, file_row_number = TRUE)
214
+ {limit_clause}
215
+ ),
216
+ runs AS (
217
+ SELECT
218
+ CASE
219
+ WHEN LAG({column_name}) OVER (ORDER BY sort_order) != {column_name}
220
+ OR LAG({column_name}) OVER (ORDER BY sort_order) IS NULL
221
+ THEN 1
222
+ ELSE 0
223
+ END AS new_run
224
+ FROM ordered_data
225
+ )
226
+ SELECT SUM(new_run) AS rle_run_count
227
+ FROM runs
228
+ """).fetchone()[0]
229
+
230
+ results[column_name] = rle_count
231
+
232
+ return results
233
+
234
+
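+ # For intuition: the SQL above counts, per column, how many consecutive-value runs
+ # remain after sorting. Over an already-sorted, NULL-free sequence this is
+ # equivalent to the following pure-Python sketch.
+ #
+ # import itertools
+ # def count_rle_runs(values):
+ #     return sum(1 for _ in itertools.groupby(values))
+ #
+ # assert count_rle_runs(['a', 'a', 'b', 'b', 'b', 'a']) == 3
+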
235
+ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet: bool = False,
236
+ use_approx: bool = None, approx_threshold: int = 100_000_000) -> Dict[str, dict]:
237
+ """
238
+ Calculate cardinality ratio for each column (distinct_values / total_rows).
239
+ Lower ratio = better for RLE compression (more repetition).
240
+
241
+ NEVER uses sampling - always scans full dataset with exact distinct counts.
242
+
243
+ Args:
244
+ con: DuckDB connection
245
+ source: Either a table name (default) or parquet file path
246
+ limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
247
+ is_parquet: If True, source is a parquet file path; if False, source is a table name
248
+ use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
249
+ approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
250
+
251
+ Returns:
252
+ Dictionary mapping column names to dict with keys:
253
+ - 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
254
+ - 'total_rows': total row count
255
+ - 'distinct_values': number of distinct values (exact)
256
+ """
257
+ # Build the FROM clause based on source type
258
+ if is_parquet:
259
+ from_clause = f"read_parquet('{source}', file_row_number = TRUE)"
260
+ column_filter = "WHERE column_name != 'file_row_number'"
261
+ else:
262
+ from_clause = source # Table name
263
+ column_filter = ""
264
+
265
+ columns = con.sql(f"""
266
+ SELECT column_name
267
+ FROM (DESCRIBE SELECT * FROM {from_clause})
268
+ {column_filter}
269
+ """).fetchall()
270
+
271
+ column_names = [col[0] for col in columns]
272
+
273
+ if not column_names:
274
+ return {}
275
+
276
+ # Get row count
277
+ total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
278
+ print(f" Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
279
+
280
+ # Build a single query that calculates all cardinality in one pass
281
+ # This scans the data only ONCE instead of once per column
282
+ select_clauses = []
283
+ for col in column_names:
284
+ select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
285
+
286
+ query = f"""
287
+ SELECT
288
+ COUNT(*)::BIGINT as total_rows,
289
+ {', '.join(select_clauses)}
290
+ FROM {from_clause}
291
+ """
292
+
293
+ result = con.sql(query).fetchone()
294
+
295
+ if not result:
296
+ return {}
297
+
298
+ total_rows = result[0]
299
+
300
+ nfv_stats = {}
301
+
302
+ # Parse results (total_rows, distinct_col1, distinct_col2, ...)
303
+ for idx, col in enumerate(column_names, start=1):
304
+ distinct_values = result[idx]
305
+ cardinality_ratio = (distinct_values / total_rows) if total_rows > 0 else 0.0
306
+
307
+ nfv_stats[col] = {
308
+ 'total_rows': total_rows,
309
+ 'distinct_values': distinct_values,
310
+ 'cardinality_ratio': cardinality_ratio
311
+ }
312
+
313
+ return nfv_stats
314
+
315
+
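+ # Usage sketch against an already-registered DuckDB table; the table name 'sales'
+ # is a placeholder and `con` is a DuckDB connection. All distinct counts come from
+ # a single scan.
+ #
+ # card = calculate_cardinality_ratio(con, 'sales')
+ # for col, s in sorted(card.items(), key=lambda kv: kv[1]['cardinality_ratio']):
+ #     print(f"{col}: {s['cardinality_ratio']:.4%} "
+ #           f"({s['distinct_values']:,} of {s['total_rows']:,} rows distinct)")
+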
316
+ def filter_promising_combinations(columns: List[str], nfv_scores: Dict[str, float],
317
+ max_combinations: int = 20) -> List[List[str]]:
318
+ """
319
+ Apply heuristics to filter down to the most promising column orderings.
320
+
321
+ Heuristics based on research:
322
+ 1. Time/date columns first (temporal ordering)
323
+ 2. High NFV score columns before low NFV score (more repetition = better RLE)
324
+ 3. Correlated columns together (e.g., date + time)
325
+ 4. Avoid starting with low-NFV columns (high cardinality)
326
+
327
+ Args:
328
+ columns: List of all column names
329
+ nfv_scores: NFV score for each column (higher = more repetition, better for RLE)
330
+ max_combinations: Maximum number of combinations to return
331
+
332
+ Returns:
333
+ List of promising column orderings to test
334
+ """
335
+ # Sort columns by NFV (higher first = better for RLE)
336
+ sorted_by_nfv = sorted(columns, key=lambda c: nfv_scores[c], reverse=True)
337
+
338
+ promising = []
339
+
340
+ # Rule 1: Natural order baseline
341
+ promising.append([])
342
+
343
+ # Rule 2: NFV-based ordering (highest to lowest)
344
+ promising.append(sorted_by_nfv)
345
+
346
+ # Rule 3: Single best column (highest NFV)
347
+ promising.append([sorted_by_nfv[0]])
348
+
349
+ # Rule 4: Time-based patterns (common column names)
350
+ time_cols = [c for c in columns if any(t in c.lower() for t in ['date', 'time', 'timestamp', 'year', 'month', 'day'])]
351
+ if time_cols:
352
+ promising.append(time_cols)
353
+ # Time columns + high NFV columns
354
+ non_time = [c for c in sorted_by_nfv if c not in time_cols]
355
+ if non_time:
356
+ promising.append(time_cols + non_time[:2])
357
+
358
+ # Rule 5: Top 2-3 highest NFV columns in different orders
359
+ top_high_nfv = sorted_by_nfv[:min(3, len(sorted_by_nfv))]
360
+ for perm in itertools.permutations(top_high_nfv, min(2, len(top_high_nfv))):
361
+ promising.append(list(perm))
362
+
363
+ # Rule 6: ID-like columns first (common patterns)
364
+ id_cols = [c for c in columns if any(t in c.lower() for t in ['id', 'key', 'code'])]
365
+ if id_cols:
366
+ promising.append(id_cols)
367
+
368
+ # Rule 7: Categorical/enum-like columns (very low NFV < 0.1)
369
+ categorical = [c for c in sorted_by_nfv if nfv_scores[c] < 0.1]
370
+ if categorical:
371
+ promising.append(categorical)
372
+ # Categorical + time
373
+ if time_cols:
374
+ promising.append(categorical + time_cols)
375
+
376
+ # Remove duplicates while preserving order
377
+ seen = set()
378
+ unique_promising = []
379
+ for combo in promising:
380
+ key = tuple(combo)
381
+ if key not in seen:
382
+ seen.add(key)
383
+ unique_promising.append(combo)
384
+
385
+ # Limit to max_combinations
386
+ return unique_promising[:max_combinations]
387
+
388
+
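+ # Illustrative call with made-up column names and NFV scores (higher score = more
+ # repetition). The first candidates returned are the natural-order baseline, the
+ # full NFV ordering, and the single best column.
+ #
+ # combos = filter_promising_combinations(
+ #     ['order_date', 'region', 'customer_id'],
+ #     {'order_date': 0.6, 'region': 0.9, 'customer_id': 0.05},
+ #     max_combinations=10)
+ # # -> [[], ['region', 'order_date', 'customer_id'], ['region'], ...]
+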
389
+ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, limit: int = None,
390
+ mode: str = "natural",
391
+ min_distinct_threshold: int = 2,
392
+ max_cardinality_pct: float = 0.01,
393
+ max_ordering_depth: int = 3,
394
+ schema_name: str = None,
395
+ table_display_name: str = None) -> pd.DataFrame:
396
+ """
397
+ Test column orderings for RLE optimization.
398
+
399
+ Modes:
400
+ - "natural": Calculate RLE for natural order only (baseline)
401
+ - "auto": Natural order + cardinality-based ordering (low to high)
402
+ - "advanced": Natural + cardinality + greedy incremental search
403
+
404
+ Args:
405
+ con: DuckDB connection
406
+ delta_path: Path to Delta table (used for RLE calculation with file_row_number via delta_scan)
407
+ table_name: Optional table name for cardinality calculation on full dataset (if None, uses delta_path)
408
+ limit: Optional limit on number of rows to analyze (for testing only)
409
+ mode: Analysis mode - "natural", "auto", or "advanced" (default: "natural")
410
+ min_distinct_threshold: Exclude columns with fewer distinct values than this (default: 2, i.e. only single-value constant columns are excluded)
411
+ max_cardinality_pct: Exclude columns whose cardinality ratio exceeds this fraction (default: 0.01 = 1%)
412
+ max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
413
+ schema_name: Optional schema name to include in results (default: None)
414
+ table_display_name: Optional table name to include in results (default: None)
415
+
416
+ Returns:
417
+ DataFrame in long format with one row per (sort_type, column): schema, table, sort_type, column, order, RLE, NDV, total_rows, total_RLE, comments
418
+ """
419
+ print("Analyzing column characteristics...")
420
+
421
+ # Calculate cardinality ratios first (for all modes)
422
+ print("\nCalculating cardinality ratios on full dataset...")
423
+ if table_name:
424
+ card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
425
+ else:
426
+ # Fallback: use delta_scan directly
427
+ card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)
428
+
429
+ print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
430
+ for col, stats in sorted(card_stats.items(), key=lambda x: x[1]['cardinality_ratio']):
431
+ card_pct = stats['cardinality_ratio'] * 100
432
+ print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")
433
+
434
+ # For "natural" mode, just calculate RLE on natural order
435
+ if mode == "natural":
436
+ print("\n" + "="*60)
437
+ print("Mode: NATURAL ORDER (baseline)")
438
+ print("="*60)
439
+ print("Calculating RLE for natural file order (single pass)...")
440
+
441
+ # Get all column names
442
+ columns = con.sql(f"""
443
+ SELECT column_name
444
+ FROM (
445
+ DESCRIBE
446
+ SELECT * FROM delta_scan('{delta_path}', file_row_number = TRUE)
447
+ )
448
+ WHERE column_name != 'file_row_number'
449
+ """).fetchall()
450
+
451
+ column_names = [col[0] for col in columns]
452
+
453
+ # Calculate RLE for natural order
454
+ rle_counts = calculate_rle_for_columns(con, delta_path, None, limit)
455
+
456
+ total_rle_all = sum(rle_counts.values())
457
+
458
+ print(f"\nResults:")
459
+ print(f" Total RLE (all columns): {total_rle_all:,}")
460
+
461
+ results = [{
462
+ 'schema': schema_name,
463
+ 'table': table_display_name,
464
+ 'sort_order': 'natural_order',
465
+ 'columns_used': 'file_row_number',
466
+ 'total_rle_all': total_rle_all,
467
+ **rle_counts
468
+ }]
469
+
470
+ df = pd.DataFrame(results)
471
+ print(f"\n{'='*60}")
472
+ print(f"✓ Analysis complete!")
473
+ print(f"{'='*60}")
474
+
475
+ # Transform to long format
476
+ long_format_results = []
477
+
478
+ for _, row in df.iterrows():
479
+ schema_val = row['schema']
480
+ table_val = row['table']
481
+ sort_order = row['sort_order']
482
+ columns_used = row['columns_used']
483
+ total_rle_all_val = row['total_rle_all']
484
+
485
+ # Get all column names except metadata columns
486
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
487
+ data_columns = [col for col in df.columns if col not in metadata_cols]
488
+
489
+ # Get total rows and NDV from card_stats if available
490
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
491
+
492
+ # Create one row per data column
493
+ for col in data_columns:
494
+ rle_value = row[col]
495
+
496
+ # Get NDV from card_stats
497
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
498
+
499
+ long_format_results.append({
500
+ 'schema': schema_val,
501
+ 'table': table_val,
502
+ 'sort_type': sort_order,
503
+ 'column': col,
504
+ 'order': None,
505
+ 'RLE': rle_value,
506
+ 'NDV': ndv_value,
507
+ 'total_rows': total_rows,
508
+ 'total_RLE': total_rle_all_val,
509
+ 'comments': ''
510
+ })
511
+
512
+ long_df = pd.DataFrame(long_format_results)
513
+
514
+ return long_df
515
+
516
+ # For "auto" and "advanced" modes, continue with optimization
517
+ # Extract just the ratios for easier handling
518
+ cardinality_ratios = {col: stats['cardinality_ratio'] for col, stats in card_stats.items()}
519
+ column_names = list(card_stats.keys())
520
+
521
+ # Sort columns by cardinality for ordering (lower cardinality = better for RLE)
522
+ sorted_by_cardinality = sorted(column_names, key=lambda c: cardinality_ratios[c])
523
+
524
+ # OPTIMIZATION: Filter columns based on configurable thresholds
525
+ # Exclude columns that won't benefit from reordering:
526
+ # 1. Too constant: < min_distinct_threshold (default: 2, only excludes single-value columns)
527
+ # 2. Too fragmented: cardinality_ratio > max_cardinality_pct (default: 1%)
528
+ total_rows = next(iter(card_stats.values()))['total_rows']
529
+
530
+ constant_cols = [c for c in sorted_by_cardinality
531
+ if card_stats[c]['distinct_values'] < min_distinct_threshold]
532
+
533
+ fragmented_cols = [c for c in sorted_by_cardinality
534
+ if cardinality_ratios[c] > max_cardinality_pct]
535
+
536
+ good_for_reordering = [c for c in sorted_by_cardinality
537
+ if c not in constant_cols and c not in fragmented_cols]
538
+
539
+ if constant_cols:
540
+ print(f"\n✓ Skipping constant columns (< {min_distinct_threshold} distinct values): {', '.join(constant_cols)}")
541
+ print(f" These compress perfectly regardless of ordering.")
542
+
543
+ if fragmented_cols:
544
+ print(f"✓ Skipping high-cardinality columns (cardinality > {max_cardinality_pct*100:.0f}%): {', '.join(fragmented_cols)}")
545
+ print(f" These are too fragmented to benefit from reordering.")
546
+
547
+ if not good_for_reordering:
548
+ print("\n⚠️ No columns suitable for reordering optimization!")
549
+ print(" All columns are either nearly constant or have too many unique values.")
550
+ return None
551
+
552
+ print(f"\n✓ Analyzing {len(good_for_reordering)} columns suitable for reordering")
553
+
554
+ # Get total row count from cardinality calculation
555
+ total_rows = next(iter(card_stats.values()))['total_rows'] if card_stats else 0
556
+ print(f"✓ Table size: {total_rows:,} rows")
557
+
558
+ # Calculate RLE ONLY on natural order for baseline (single pass - fast!)
559
+ print("\n" + "="*60)
560
+ print("Calculating baseline RLE (natural order - single pass)")
561
+ print("="*60)
562
+ baseline = calculate_rle_for_columns(con, delta_path, None, limit)
563
+
564
+ # Filter baseline to only include good_for_reordering columns
565
+ baseline_filtered = {col: rle for col, rle in baseline.items() if col in good_for_reordering}
566
+
567
+ # Show column categorization upfront
568
+ print(f"\nColumn Analysis (baseline RLE in natural order):")
569
+
570
+ # Show columns worth reordering first
571
+ if baseline_filtered:
572
+ print(f" Columns included in optimization:")
573
+ for col in sorted(baseline_filtered.keys(), key=lambda c: baseline_filtered[c]):
574
+ print(f" {col}: {baseline_filtered[col]:,} runs")
575
+ print(f" ─────────────────────────")
576
+ print(f" Subtotal: {sum(baseline_filtered.values()):,} runs")
577
+
578
+ # Show excluded columns (constant or high-cardinality)
579
+ excluded_cols = {col: rle for col, rle in baseline.items()
580
+ if col in constant_cols or col in fragmented_cols}
581
+ if excluded_cols:
582
+ print(f" Columns excluded from optimization:")
583
+ for col in sorted(excluded_cols.keys(), key=lambda c: excluded_cols[c]):
584
+ reason = "constant" if col in constant_cols else "high-cardinality"
585
+ print(f" {col}: {excluded_cols[col]:,} runs ({reason})")
586
+ print(f" ─────────────────────────")
587
+ print(f" Subtotal: {sum(excluded_cols.values()):,} runs")
588
+
589
+ # Show total baseline RLE
590
+ print(f"\nBaseline Total RLE (all columns): {sum(baseline.values()):,} runs")
591
+
592
+ # Define only the most promising orderings to test
593
+ orderings_to_test = [
594
+ ([], 'natural_order'), # Baseline
595
+ ]
596
+
597
+ # Add cardinality-based ordering for "auto" and "advanced" modes
598
+ if mode in ["auto", "advanced"] and len(good_for_reordering) >= 2:
599
+ orderings_to_test.append((good_for_reordering, 'by_cardinality'))
600
+
601
+ # Count only the actual reordering tests (exclude natural_order baseline)
602
+ num_tests = len(orderings_to_test) - 1
603
+
604
+ results = []
605
+
606
+ for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
607
+ if i == 1:
608
+ # Use baseline for natural order (already calculated and displayed)
609
+ rle_counts = baseline
610
+ else:
611
+ # This is an actual reordering test
612
+ test_num = i - 1
613
+ print(f"\n[{test_num}/{num_tests}] Testing: {label}")
614
+ if sort_cols:
615
+ print(f" Order: {', '.join(sort_cols)}")
616
+
617
+ # Calculate RLE for this ordering
618
+ rle_counts = calculate_rle_for_columns(con, delta_path, sort_cols, limit)
619
+
620
+ # Calculate metrics for ALL columns and optimizable subset
621
+ total_rle_all = sum(rle_counts.values())
622
+
623
+ # Filter to only good_for_reordering columns for scoring/comparison
624
+ rle_filtered = {col: rle for col, rle in rle_counts.items() if col in good_for_reordering}
625
+ total_rle_optimizable = sum(rle_filtered.values())
626
+
627
+ # Calculate weighted score (considering both RLE and cardinality - lower cardinality = better)
628
+ cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
629
+
630
+ print(f" Total RLE: {total_rle_all:,} runs")
631
+
632
+ results.append({
633
+ 'schema': schema_name,
634
+ 'table': table_display_name,
635
+ 'sort_order': label,
636
+ 'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
637
+ 'total_rle_all': total_rle_all, # All columns, including those excluded from optimization
638
+ 'optimizable_rle': total_rle_optimizable, # Only columns we're optimizing
639
+ 'avg_rle': total_rle_optimizable / len(rle_filtered),
640
+ 'cardinality_weighted_score': cardinality_weighted,
641
+ 'method': 'single_pass',
642
+ **rle_counts # Include individual column RLE counts
643
+ })
644
+
645
+ # Greedy incremental search (only in "advanced" mode)
646
+ if mode == "advanced" and max_ordering_depth > 0 and len(good_for_reordering) >= 2:
647
+ print(f"\n{'='*60}")
648
+ print(f"ADVANCED MODE: Greedy Incremental Search (max depth: {max_ordering_depth})")
649
+ print(f"{'='*60}")
650
+ print(f"Building optimal ordering column-by-column, testing all positions")
651
+ print(f"at each depth to find the best incremental improvement.\n")
652
+
653
+ current_best_ordering = []
654
+ current_best_rle = sum(baseline_filtered.values())
655
+ remaining_columns = list(good_for_reordering)
656
+
657
+ # Get the cardinality-based RLE as the target to beat (both total and optimizable)
658
+ cardinality_rle = results[-1]['optimizable_rle'] if len(results) > 1 else float('inf')
659
+ cardinality_total_rle = results[-1]['total_rle_all'] if len(results) > 1 else float('inf')
660
+
661
+ for depth in range(1, min(max_ordering_depth + 1, len(good_for_reordering) + 1)):
662
+ num_candidates = len(remaining_columns)
663
+ num_positions = len(current_best_ordering) + 1
664
+ total_tests = num_candidates * num_positions
665
+ print(f"\n--- Depth {depth}: Testing {num_candidates} candidate columns × {num_positions} positions = {total_tests} tests ---")
666
+ print(f" Target to beat: {cardinality_total_rle:,} runs (cardinality ordering)")
667
+
668
+ best_depth_ordering = None
669
+ best_depth_rle = float('inf')
670
+ best_depth_col = None
671
+ best_depth_position = None
672
+ early_exit = False
673
+
674
+ # Sort remaining candidates by baseline RLE (HIGHER first = test worse candidates first)
675
+ # This way we test DUID, time, date before cutoff (which we know is good from cardinality test)
676
+ candidates_sorted = sorted(remaining_columns, key=lambda c: baseline_filtered[c], reverse=True)
677
+
678
+ test_num = 0
679
+ # Try adding each remaining column (sorted by baseline RLE - worse first)
680
+ for candidate_col in candidates_sorted:
681
+ # Try inserting at each possible position (including end)
682
+ for insert_pos in range(len(current_best_ordering) + 1):
683
+ test_num += 1
684
+
685
+ # Build test ordering: insert candidate at position
686
+ test_ordering = current_best_ordering[:insert_pos] + [candidate_col] + current_best_ordering[insert_pos:]
687
+
688
+ print(f" [{test_num}/{total_tests}] Testing '{candidate_col}' at position {insert_pos}: [{', '.join(test_ordering)}]", end='', flush=True)
689
+
690
+ # Calculate RLE for this ordering
691
+ rle_counts = calculate_rle_for_columns(con, delta_path, test_ordering, limit)
692
+
693
+ # Sum RLE for optimizable columns only
694
+ rle_filtered = {col: rle for col, rle in rle_counts.items() if col in good_for_reordering}
695
+ total_rle = sum(rle_filtered.values())
696
+ total_rle_all = sum(rle_counts.values())
697
+
698
+ is_best = total_rle < best_depth_rle
699
+ beats_cardinality = total_rle < cardinality_rle
700
+
701
+ status = ""
702
+ if beats_cardinality:
703
+ status = " 🎯 Beats cardinality!"
704
+
705
+ print(f" → Total: {total_rle_all:,}{status}")
706
+
707
+ # Track best at this depth
708
+ if is_best:
709
+ best_depth_rle = total_rle
710
+ best_depth_ordering = test_ordering
711
+ best_depth_col = candidate_col
712
+ best_depth_position = insert_pos
713
+ best_depth_rle_counts = rle_counts
714
+
715
+ # Early exit if we beat cardinality ordering!
716
+ if beats_cardinality:
717
+ print(f"\n ⚡ Early exit! Found ordering better than cardinality. Moving to next depth.")
718
+ early_exit = True
719
+ break
720
+
721
+ if early_exit:
722
+ break
723
+
724
+ # Check if we found improvement
725
+ if best_depth_rle < current_best_rle:
726
+ current_total_rle_all = sum(best_depth_rle_counts.values())
727
+ baseline_total_rle_all = sum(baseline.values())
728
+ improvement_pct = ((baseline_total_rle_all - current_total_rle_all) / baseline_total_rle_all) * 100
729
+ print(f"\n✓ Best at depth {depth}: [{', '.join(best_depth_ordering)}]")
730
+ print(f" Total RLE (all columns): {current_total_rle_all:,} runs")
731
+ print(f" Optimizable RLE: {best_depth_rle:,} runs")
732
+ print(f" Improvement: {improvement_pct:.1f}% better than baseline (total RLE)")
733
+
734
+ # Update for next depth
735
+ current_best_ordering = best_depth_ordering
736
+ current_best_rle = best_depth_rle
737
+ remaining_columns.remove(best_depth_col)
738
+
739
+ # Store this result
740
+ rle_filtered = {col: rle for col, rle in best_depth_rle_counts.items() if col in good_for_reordering}
741
+ total_rle_all = sum(best_depth_rle_counts.values())
742
+ cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
743
+
744
+ results.append({
745
+ 'schema': schema_name,
746
+ 'table': table_display_name,
747
+ 'sort_order': f'greedy_depth_{depth}',
748
+ 'columns_used': ', '.join(best_depth_ordering),
749
+ 'total_rle_all': total_rle_all,
750
+ 'optimizable_rle': best_depth_rle,
751
+ 'avg_rle': best_depth_rle / len(rle_filtered),
752
+ 'cardinality_weighted_score': cardinality_weighted,
753
+ 'method': 'greedy_incremental',
754
+ **best_depth_rle_counts
755
+ })
756
+ else:
757
+ print(f"\n✗ No improvement found at depth {depth} - stopping early")
758
+ print(f" Best RLE (all columns): {sum(best_depth_rle_counts.values()) if best_depth_rle_counts else sum(baseline.values()):,} runs")
759
+ print(f" Best optimizable RLE: {best_depth_rle if best_depth_rle != float('inf') else current_best_rle:,} runs")
760
+ break
761
+
762
+ print(f"\n{'='*60}")
763
+ print(f"Greedy Search Complete")
764
+ print(f"{'='*60}")
765
+ if current_best_ordering:
766
+ print(f"Final greedy ordering: {', '.join(current_best_ordering)}")
767
+ print(f"Final optimizable RLE: {current_best_rle:,} runs")
768
+
769
+
770
+ # Convert to DataFrame and sort by optimizable RLE (lower is better)
771
+ df = pd.DataFrame(results)
772
+ df = df.sort_values('optimizable_rle')
773
+
774
+ print(f"\n{'='*60}")
775
+ print(f"✓ Analysis complete!")
776
+ print(f"{'='*60}")
777
+ print(f"Best ordering: {df.iloc[0]['sort_order']}")
778
+ print(f"Best total RLE: {df.iloc[0]['total_rle_all']:,} runs (lower is better)")
779
+
780
+
781
+ # Calculate improvement using total RLE (all columns) for meaningful comparison
782
+ baseline_total_rle = sum(baseline.values())
783
+ best_total_rle = df.iloc[0]['total_rle_all']
784
+ if len(df) > 1 and baseline_total_rle > 0:
785
+ pct = ((baseline_total_rle - best_total_rle) / baseline_total_rle) * 100
786
+ if pct > 0:
787
+ print(f"Improvement: {pct:.1f}% fewer runs vs natural order")
788
+
789
+ # Remove confusing internal columns from displayed output
790
+ # Keep: sort_order, columns_used, total_rle_all, and individual column RLE counts
791
+ # Remove: optimizable_rle, avg_rle, cardinality_weighted_score, method
792
+ display_df = df.drop(columns=['optimizable_rle', 'avg_rle', 'cardinality_weighted_score', 'method'], errors='ignore')
793
+
794
+ # Transform to long format
795
+ long_format_results = []
796
+
797
+ for _, row in display_df.iterrows():
798
+ schema_val = row['schema']
799
+ table_val = row['table']
800
+ sort_order = row['sort_order']
801
+ columns_used = row['columns_used']
802
+ total_rle_all = row['total_rle_all']
803
+
804
+ # Get all column names except metadata columns
805
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
806
+ data_columns = [col for col in display_df.columns if col not in metadata_cols]
807
+
808
+ # Get total rows and NDV from card_stats if available
809
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
810
+
811
+ # Parse the columns_used to get ordering
812
+ sort_columns_list = []
813
+ if columns_used != 'file_row_number':
814
+ sort_columns_list = [c.strip() for c in columns_used.split(',')]
815
+
816
+ # Create one row per data column
817
+ for col in data_columns:
818
+ rle_value = row[col]
819
+
820
+ # Get NDV from card_stats
821
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
822
+
823
+ # Determine if column was included in the sort and its position
824
+ is_in_sort = col in sort_columns_list
825
+ order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
826
+ comment = '' if is_in_sort or columns_used == 'file_row_number' else 'not included in the sort'
827
+
828
+ long_format_results.append({
829
+ 'schema': schema_val,
830
+ 'table': table_val,
831
+ 'sort_type': sort_order,
832
+ 'column': col,
833
+ 'order': order_position,
834
+ 'RLE': rle_value,
835
+ 'NDV': ndv_value,
836
+ 'total_rows': total_rows,
837
+ 'total_RLE': total_rle_all,
838
+ 'comments': comment
839
+ })
840
+
841
+ long_df = pd.DataFrame(long_format_results)
842
+
843
+ return long_df
844
+
845
+
846
+ # Example usage:
847
+ # delta_path = 'abfss://tmp@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Tables/unsorted/summary'
848
+ #
849
+ # # Fast single-pass analysis (recommended for all table sizes)
850
+ # results_df = test_column_orderings_smart(con, delta_path, table_name='summary')
851
+ #
852
+ # # Show results
853
+ # print("\nBest orderings:")
854
+ # print(results_df[['sort_type', 'column', 'RLE', 'NDV', 'total_RLE']].head())
855
+ #
856
+ # # The function automatically:
857
+ # # - Calculates exact cardinality ratios on the full dataset
858
+ # # - Excludes columns that won't benefit from reordering
859
+ # # - In "auto"/"advanced" modes, tests only the most promising orderings (natural order, then low-to-high cardinality)
860
+ # # - Uses single-pass RLE calculation (fast!)
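+ #
+ # # Advanced mode sketch (same placeholder path; schema/table names are placeholders):
+ # # additionally runs the greedy incremental search up to 3 sort columns and tags
+ # # each result row with the supplied schema and table names.
+ # adv_df = test_column_orderings_smart(
+ #     con, delta_path, table_name='summary', mode='advanced',
+ #     max_ordering_depth=3, schema_name='dbo', table_display_name='summary')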