duckrun 0.2.19.dev3__tar.gz → 0.2.19.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/core.py +96 -5
  3. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/rle.py +220 -73
  4. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/PKG-INFO +1 -1
  5. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/SOURCES.txt +2 -1
  6. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/pyproject.toml +1 -1
  7. duckrun-0.2.19.dev5/tests/test_rle.py +16 -0
  8. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/LICENSE +0 -0
  9. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/README.md +0 -0
  10. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/__init__.py +0 -0
  11. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/auth.py +0 -0
  12. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/files.py +0 -0
  13. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/lakehouse.py +0 -0
  14. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/notebook.py +0 -0
  15. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/runner.py +0 -0
  16. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/semantic_model.py +0 -0
  17. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/stats.py +0 -0
  18. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/writer.py +0 -0
  19. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev3
3
+ Version: 0.2.19.dev5
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
1244
1244
  refresh=refresh
1245
1245
  )
1246
1246
 
1247
- def rle(self, table_name: str = None, mode: str = "natural",
1247
+ def rle(self, table_name: str = None, mode = "natural",
1248
1248
  min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
1249
- max_ordering_depth: int = 3):
1249
+ max_ordering_depth: int = 3, limit: int = None):
1250
1250
  """
1251
1251
  Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
1252
1252
 
@@ -1254,13 +1254,15 @@ class Duckrun(WorkspaceOperationsMixin):
1254
1254
  table_name: Name of the table to analyze. Can be:
1255
1255
  - 'table_name' (uses current schema)
1256
1256
  - 'schema.table_name' (specific schema)
1257
- mode: Analysis mode:
1258
- - "natural": Calculate RLE for natural order only (default, fastest)
1257
+ mode: Analysis mode or column ordering:
1258
+ - "natural": Calculate RLE for natural order only (fastest)
1259
1259
  - "auto": Natural order + cardinality-based ordering (recommended)
1260
1260
  - "advanced": Natural + cardinality + greedy incremental search (most thorough)
1261
+ - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
1261
1262
  min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
1262
1263
  max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
1263
1264
  max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
1265
+ limit: Optional row limit for testing/development (default: None, analyzes all rows)
1264
1266
 
1265
1267
  Returns:
1266
1268
  DataFrame with RLE analysis results
@@ -1276,6 +1278,10 @@ class Duckrun(WorkspaceOperationsMixin):
1276
1278
  # Advanced optimization (greedy incremental search)
1277
1279
  con.rle("mytable", "advanced")
1278
1280
 
1281
+ # Test specific column ordering
1282
+ con.rle("mytable", ["date", "duid"])
1283
+ con.rle("mytable", ["cutoff", "time", "DUID", "date"])
1284
+
1279
1285
  # Advanced with custom depth
1280
1286
  con.rle("mytable", "advanced", max_ordering_depth=4)
1281
1287
 
@@ -1284,6 +1290,9 @@ class Duckrun(WorkspaceOperationsMixin):
1284
1290
 
1285
1291
  # Custom thresholds for small tables
1286
1292
  con.rle("mytable", "auto", max_cardinality_pct=0.05)
1293
+
1294
+ # Limit rows for testing
1295
+ con.rle("mytable", "auto", limit=10000)
1287
1296
  """
1288
1297
  from .rle import (
1289
1298
  calculate_cardinality_ratio,
@@ -1326,15 +1335,97 @@ class Duckrun(WorkspaceOperationsMixin):
1326
1335
  print(f"❌ Error accessing Delta table: {e}")
1327
1336
  return None
1328
1337
 
1338
+ # Check if mode is a list of columns (custom ordering)
1339
+ if isinstance(mode, list):
1340
+ # User wants to test a specific column ordering
1341
+ print(f"Testing custom column ordering: {', '.join(mode)}")
1342
+
1343
+ # Calculate cardinality for NDV values
1344
+ card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
1345
+
1346
+ # Calculate RLE for the specified ordering
1347
+ rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
1348
+
1349
+ total_rle_all = sum(rle_counts.values())
1350
+
1351
+ print(f"\nResults:")
1352
+ print(f" Custom ordering: [{', '.join(mode)}]")
1353
+ print(f" Total RLE (all columns): {total_rle_all:,} runs")
1354
+
1355
+ # Return as DataFrame for consistency
1356
+ import pandas as pd
1357
+ results = [{
1358
+ 'schema': schema_name,
1359
+ 'table': tbl,
1360
+ 'sort_order': 'custom',
1361
+ 'columns_used': ', '.join(mode),
1362
+ 'total_rle_all': total_rle_all,
1363
+ **rle_counts
1364
+ }]
1365
+
1366
+ df = pd.DataFrame(results)
1367
+
1368
+ # Transform to long format
1369
+ long_format_results = []
1370
+
1371
+ for _, row in df.iterrows():
1372
+ schema_val = row['schema']
1373
+ table_val = row['table']
1374
+ sort_order = row['sort_order']
1375
+ columns_used = row['columns_used']
1376
+ total_rle_all_val = row['total_rle_all']
1377
+
1378
+ # Get all column names except metadata columns
1379
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
1380
+ data_columns = [col for col in df.columns if col not in metadata_cols]
1381
+
1382
+ # Get total rows from card_stats if available
1383
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
1384
+
1385
+ # Parse the columns_used to get ordering
1386
+ sort_columns_list = [c.strip() for c in columns_used.split(',')]
1387
+
1388
+ # Create one row per data column
1389
+ for col in data_columns:
1390
+ rle_value = row[col]
1391
+
1392
+ # Get NDV from card_stats
1393
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
1394
+
1395
+ # Determine if column was included in the sort and its position
1396
+ is_in_sort = col in sort_columns_list
1397
+ order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
1398
+ comment = '' if is_in_sort else 'not included in the sort'
1399
+
1400
+ long_format_results.append({
1401
+ 'schema': schema_val,
1402
+ 'table': table_val,
1403
+ 'sort_type': sort_order,
1404
+ 'column': col,
1405
+ 'order': order_position,
1406
+ 'RLE': rle_value,
1407
+ 'NDV': ndv_value,
1408
+ 'total_rows': total_rows,
1409
+ 'total_RLE': total_rle_all_val,
1410
+ 'comments': comment
1411
+ })
1412
+
1413
+ long_df = pd.DataFrame(long_format_results)
1414
+
1415
+ return long_df
1416
+
1329
1417
  # All modes now use test_column_orderings_smart with the mode parameter
1330
1418
  return test_column_orderings_smart(
1331
1419
  self.con,
1332
1420
  table_path,
1333
1421
  table_name=table_name, # Pass table name for cardinality calculation on full dataset
1334
1422
  mode=mode,
1423
+ limit=limit,
1335
1424
  min_distinct_threshold=min_distinct_threshold,
1336
1425
  max_cardinality_pct=max_cardinality_pct,
1337
- max_ordering_depth=max_ordering_depth
1426
+ max_ordering_depth=max_ordering_depth,
1427
+ schema_name=schema_name,
1428
+ table_display_name=tbl
1338
1429
  )
1339
1430
 
1340
1431
  def close(self):
@@ -198,7 +198,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
198
198
  if sort_columns:
199
199
  order_by = "ORDER BY " + ", ".join(sort_columns)
200
200
  else:
201
- order_by = "ORDER BY file_row_number ASC"
201
+ order_by = "ORDER BY filename, file_row_number ASC"
202
202
 
203
203
  limit_clause = f"LIMIT {limit}" if limit else ""
204
204
 
@@ -210,7 +210,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
210
210
  SELECT
211
211
  {column_name},
212
212
  ROW_NUMBER() OVER ({order_by}) as sort_order
213
- FROM delta_scan('{delta_path}', file_row_number = TRUE)
213
+ FROM delta_scan('{delta_path}', filename = TRUE, file_row_number = TRUE)
214
214
  {limit_clause}
215
215
  ),
216
216
  runs AS (
@@ -238,22 +238,21 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
238
238
  Calculate cardinality ratio for each column (distinct_values / total_rows).
239
239
  Lower ratio = better for RLE compression (more repetition).
240
240
 
241
- NEVER uses sampling - always scans full dataset with exact or approximate distinct counts.
241
+ NEVER uses sampling - always scans full dataset with exact distinct counts.
242
242
 
243
243
  Args:
244
244
  con: DuckDB connection
245
245
  source: Either a table name (default) or parquet file path
246
246
  limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
247
247
  is_parquet: If True, source is a parquet file path; if False, source is a table name
248
- use_approx: If True, use HyperLogLog (approx). If False, use exact COUNT(DISTINCT).
249
- If None (default), auto-decide based on table size threshold.
250
- approx_threshold: Row count threshold for using HyperLogLog (default: 100M rows)
248
+ use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
249
+ approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
251
250
 
252
251
  Returns:
253
252
  Dictionary mapping column names to dict with keys:
254
253
  - 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
255
254
  - 'total_rows': total row count
256
- - 'distinct_values': number of distinct values (exact or approximate)
255
+ - 'distinct_values': number of distinct values (exact)
257
256
  """
258
257
  # Build the FROM clause based on source type
259
258
  if is_parquet:
@@ -274,26 +273,15 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
274
273
  if not column_names:
275
274
  return {}
276
275
 
277
- # Auto-decide whether to use approximate or exact based on table size
278
- if use_approx is None:
279
- # Quick row count check
280
- total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
281
- use_approx = total_rows > approx_threshold
282
- if use_approx:
283
- print(f" Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
284
- else:
285
- print(f" Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
286
- else:
287
- total_rows = None # Will be calculated in main query
276
+ # Get row count
277
+ total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
278
+ print(f" Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
288
279
 
289
- # Build a single query that calculates all NFV scores in one pass
280
+ # Build a single query that calculates all cardinality in one pass
290
281
  # This scans the data only ONCE instead of once per column
291
282
  select_clauses = []
292
283
  for col in column_names:
293
- if use_approx:
294
- select_clauses.append(f"approx_count_distinct({col}) as distinct_{col}")
295
- else:
296
- select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
284
+ select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
297
285
 
298
286
  query = f"""
299
287
  SELECT
@@ -307,8 +295,7 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
307
295
  if not result:
308
296
  return {}
309
297
 
310
- if total_rows is None:
311
- total_rows = result[0]
298
+ total_rows = result[0]
312
299
 
313
300
  nfv_stats = {}
314
301
 
@@ -403,7 +390,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
403
390
  mode: str = "natural",
404
391
  min_distinct_threshold: int = 2,
405
392
  max_cardinality_pct: float = 0.01,
406
- max_ordering_depth: int = 3) -> pd.DataFrame:
393
+ max_ordering_depth: int = 3,
394
+ schema_name: str = None,
395
+ table_display_name: str = None) -> pd.DataFrame:
407
396
  """
408
397
  Test column orderings for RLE optimization.
409
398
 
@@ -421,12 +410,27 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
421
410
  min_distinct_threshold: Exclude columns with fewer distinct values (default: 2, i.e. only exclude constants with 1 value)
422
411
  max_cardinality_pct: Exclude columns with cardinality ratio above this % (default: 0.01 = 1%)
423
412
  max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
413
+ schema_name: Optional schema name to include in results (default: None)
414
+ table_display_name: Optional table name to include in results (default: None)
424
415
 
425
416
  Returns:
426
- DataFrame with columns: sort_order, columns_used, total_rle_all, and individual column RLE counts
417
+ DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
427
418
  """
428
419
  print("Analyzing column characteristics...")
429
420
 
421
+ # Calculate cardinality ratios first (for all modes)
422
+ print("\nCalculating cardinality ratios on full dataset...")
423
+ if table_name:
424
+ card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
425
+ else:
426
+ # Fallback: use delta_scan directly
427
+ card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)
428
+
429
+ print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
430
+ for col, stats in sorted(card_stats.items(), key=lambda x: x[1]['cardinality_ratio']):
431
+ card_pct = stats['cardinality_ratio'] * 100
432
+ print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")
433
+
430
434
  # For "natural" mode, just calculate RLE on natural order
431
435
  if mode == "natural":
432
436
  print("\n" + "="*60)
@@ -453,9 +457,10 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
453
457
 
454
458
  print(f"\nResults:")
455
459
  print(f" Total RLE (all columns): {total_rle_all:,}")
456
- print(f" Average RLE per column: {total_rle_all / len(column_names):.1f}")
457
460
 
458
461
  results = [{
462
+ 'schema': schema_name,
463
+ 'table': table_display_name,
459
464
  'sort_order': 'natural_order',
460
465
  'columns_used': 'file_row_number',
461
466
  'total_rle_all': total_rle_all,
@@ -467,21 +472,48 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
467
472
  print(f"✓ Analysis complete!")
468
473
  print(f"{'='*60}")
469
474
 
470
- return df
471
-
472
- # For "auto" and "advanced" modes, calculate cardinality ratios first
473
- print("\nCalculating cardinality ratios on full dataset...")
474
- if table_name:
475
- card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
476
- else:
477
- # Fallback: use delta_scan directly
478
- card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)
479
-
480
- print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
481
- for col, stats in sorted(card_stats.items(), key=lambda x: x[1]['cardinality_ratio']):
482
- card_pct = stats['cardinality_ratio'] * 100
483
- print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")
475
+ # Transform to long format
476
+ long_format_results = []
477
+
478
+ for _, row in df.iterrows():
479
+ schema_val = row['schema']
480
+ table_val = row['table']
481
+ sort_order = row['sort_order']
482
+ columns_used = row['columns_used']
483
+ total_rle_all_val = row['total_rle_all']
484
+
485
+ # Get all column names except metadata columns
486
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
487
+ data_columns = [col for col in df.columns if col not in metadata_cols]
488
+
489
+ # Get total rows and NDV from card_stats if available
490
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
491
+
492
+ # Create one row per data column
493
+ for col in data_columns:
494
+ rle_value = row[col]
495
+
496
+ # Get NDV from card_stats
497
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
498
+
499
+ long_format_results.append({
500
+ 'schema': schema_val,
501
+ 'table': table_val,
502
+ 'sort_type': sort_order,
503
+ 'column': col,
504
+ 'order': None,
505
+ 'RLE': rle_value,
506
+ 'NDV': ndv_value,
507
+ 'total_rows': total_rows,
508
+ 'total_RLE': total_rle_all_val,
509
+ 'comments': ''
510
+ })
511
+
512
+ long_df = pd.DataFrame(long_format_results)
513
+
514
+ return long_df
484
515
 
516
+ # For "auto" and "advanced" modes, continue with optimization
485
517
  # Extract just the ratios for easier handling
486
518
  cardinality_ratios = {col: stats['cardinality_ratio'] for col, stats in card_stats.items()}
487
519
  column_names = list(card_stats.keys())
@@ -532,9 +564,30 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
532
564
  # Filter baseline to only include good_for_reordering columns
533
565
  baseline_filtered = {col: rle for col, rle in baseline.items() if col in good_for_reordering}
534
566
 
535
- print(f"Baseline RLE runs (columns worth reordering):")
536
- for col in sorted(baseline_filtered.keys(), key=lambda c: baseline_filtered[c]):
537
- print(f" {col}: {baseline_filtered[col]:,} runs")
567
+ # Show column categorization upfront
568
+ print(f"\nColumn Analysis (baseline RLE in natural order):")
569
+
570
+ # Show columns worth reordering first
571
+ if baseline_filtered:
572
+ print(f" Columns included in optimization:")
573
+ for col in sorted(baseline_filtered.keys(), key=lambda c: baseline_filtered[c]):
574
+ print(f" {col}: {baseline_filtered[col]:,} runs")
575
+ print(f" ─────────────────────────")
576
+ print(f" Subtotal: {sum(baseline_filtered.values()):,} runs")
577
+
578
+ # Show excluded columns (constant or high-cardinality)
579
+ excluded_cols = {col: rle for col, rle in baseline.items()
580
+ if col in constant_cols or col in fragmented_cols}
581
+ if excluded_cols:
582
+ print(f" Columns excluded from optimization:")
583
+ for col in sorted(excluded_cols.keys(), key=lambda c: excluded_cols[c]):
584
+ reason = "constant" if col in constant_cols else "high-cardinality"
585
+ print(f" {col}: {excluded_cols[col]:,} runs ({reason})")
586
+ print(f" ─────────────────────────")
587
+ print(f" Subtotal: {sum(excluded_cols.values()):,} runs")
588
+
589
+ # Show total baseline RLE
590
+ print(f"\nBaseline Total RLE (all columns): {sum(baseline.values()):,} runs")
538
591
 
539
592
  # Define only the most promising orderings to test
540
593
  orderings_to_test = [
@@ -545,20 +598,22 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
545
598
  if mode in ["auto", "advanced"] and len(good_for_reordering) >= 2:
546
599
  orderings_to_test.append((good_for_reordering, 'by_cardinality'))
547
600
 
548
- print(f"\n✓ Testing {len(orderings_to_test)} orderings")
549
- print("="*60)
601
+ # Count only the actual reordering tests (exclude natural_order baseline)
602
+ num_tests = len(orderings_to_test) - 1
550
603
 
551
604
  results = []
552
605
 
553
606
  for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
554
- print(f"\n[{i}/{len(orderings_to_test)}] Testing: {label}")
555
- if sort_cols:
556
- print(f" Order: {', '.join(sort_cols)}")
557
-
558
607
  if i == 1:
559
- # Use baseline for natural order (already calculated)
608
+ # Use baseline for natural order (already calculated and displayed)
560
609
  rle_counts = baseline
561
610
  else:
611
+ # This is an actual reordering test
612
+ test_num = i - 1
613
+ print(f"\n[{test_num}/{num_tests}] Testing: {label}")
614
+ if sort_cols:
615
+ print(f" Order: {', '.join(sort_cols)}")
616
+
562
617
  # Calculate RLE for this ordering
563
618
  rle_counts = calculate_rle_for_columns(con, delta_path, sort_cols, limit)
564
619
 
@@ -572,11 +627,11 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
572
627
  # Calculate weighted score (considering both RLE and cardinality - lower cardinality = better)
573
628
  cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
574
629
 
575
- print(f" Total RLE (all columns): {total_rle_all:,}")
576
- print(f" Optimizable columns RLE: {total_rle_optimizable:,}")
577
- print(f" Avg RLE (optimizable): {total_rle_optimizable / len(rle_filtered):.1f}")
630
+ print(f" Total RLE: {total_rle_all:,} runs")
578
631
 
579
632
  results.append({
633
+ 'schema': schema_name,
634
+ 'table': table_display_name,
580
635
  'sort_order': label,
581
636
  'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
582
637
  'total_rle_all': total_rle_all, # All columns (must be >= row_count)
@@ -599,42 +654,82 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
599
654
  current_best_rle = sum(baseline_filtered.values())
600
655
  remaining_columns = list(good_for_reordering)
601
656
 
657
+ # Get the cardinality-based RLE as the target to beat (both total and optimizable)
658
+ cardinality_rle = results[-1]['optimizable_rle'] if len(results) > 1 else float('inf')
659
+ cardinality_total_rle = results[-1]['total_rle_all'] if len(results) > 1 else float('inf')
660
+
602
661
  for depth in range(1, min(max_ordering_depth + 1, len(good_for_reordering) + 1)):
603
- print(f"\n--- Depth {depth}: Testing {len(remaining_columns)} candidate columns ---")
662
+ num_candidates = len(remaining_columns)
663
+ num_positions = len(current_best_ordering) + 1
664
+ total_tests = num_candidates * num_positions
665
+ print(f"\n--- Depth {depth}: Testing {num_candidates} candidate columns × {num_positions} positions = {total_tests} tests ---")
666
+ print(f" Target to beat: {cardinality_total_rle:,} runs (cardinality ordering)")
604
667
 
605
668
  best_depth_ordering = None
606
669
  best_depth_rle = float('inf')
607
670
  best_depth_col = None
608
671
  best_depth_position = None
672
+ early_exit = False
673
+
674
+ # Sort remaining candidates by baseline RLE (HIGHER first = test worse candidates first)
675
+ # This way we test DUID, time, date before cutoff (which we know is good from cardinality test)
676
+ candidates_sorted = sorted(remaining_columns, key=lambda c: baseline_filtered[c], reverse=True)
609
677
 
610
- # Try adding each remaining column
611
- for candidate_col in remaining_columns:
678
+ test_num = 0
679
+ # Try adding each remaining column (sorted by baseline RLE - worse first)
680
+ for candidate_col in candidates_sorted:
612
681
  # Try inserting at each possible position (including end)
613
682
  for insert_pos in range(len(current_best_ordering) + 1):
683
+ test_num += 1
684
+
614
685
  # Build test ordering: insert candidate at position
615
686
  test_ordering = current_best_ordering[:insert_pos] + [candidate_col] + current_best_ordering[insert_pos:]
616
687
 
688
+ print(f" [{test_num}/{total_tests}] Testing '{candidate_col}' at position {insert_pos}: [{', '.join(test_ordering)}]", end='', flush=True)
689
+
617
690
  # Calculate RLE for this ordering
618
691
  rle_counts = calculate_rle_for_columns(con, delta_path, test_ordering, limit)
619
692
 
620
693
  # Sum RLE for optimizable columns only
621
694
  rle_filtered = {col: rle for col, rle in rle_counts.items() if col in good_for_reordering}
622
695
  total_rle = sum(rle_filtered.values())
696
+ total_rle_all = sum(rle_counts.values())
697
+
698
+ is_best = total_rle < best_depth_rle
699
+ beats_cardinality = total_rle < cardinality_rle
700
+
701
+ status = ""
702
+ if beats_cardinality:
703
+ status = " 🎯 Beats cardinality!"
704
+
705
+ print(f" → Total: {total_rle_all:,}{status}")
623
706
 
624
707
  # Track best at this depth
625
- if total_rle < best_depth_rle:
708
+ if is_best:
626
709
  best_depth_rle = total_rle
627
710
  best_depth_ordering = test_ordering
628
711
  best_depth_col = candidate_col
629
712
  best_depth_position = insert_pos
630
713
  best_depth_rle_counts = rle_counts
714
+
715
+ # Early exit if we beat cardinality ordering!
716
+ if beats_cardinality:
717
+ print(f"\n ⚡ Early exit! Found ordering better than cardinality. Moving to next depth.")
718
+ early_exit = True
719
+ break
720
+
721
+ if early_exit:
722
+ break
631
723
 
632
724
  # Check if we found improvement
633
725
  if best_depth_rle < current_best_rle:
634
- improvement_pct = ((current_best_rle - best_depth_rle) / current_best_rle) * 100
635
- print(f"✓ Best at depth {depth}: Add '{best_depth_col}' at position {best_depth_position}")
636
- print(f" Ordering: {', '.join(best_depth_ordering)}")
637
- print(f" RLE: {best_depth_rle:,} runs (improved {improvement_pct:.1f}% from previous depth)")
726
+ current_total_rle_all = sum(best_depth_rle_counts.values())
727
+ baseline_total_rle_all = sum(baseline.values())
728
+ improvement_pct = ((baseline_total_rle_all - current_total_rle_all) / baseline_total_rle_all) * 100
729
+ print(f"\n✓ Best at depth {depth}: [{', '.join(best_depth_ordering)}]")
730
+ print(f" Total RLE (all columns): {current_total_rle_all:,} runs")
731
+ print(f" Optimizable RLE: {best_depth_rle:,} runs")
732
+ print(f" Improvement: {improvement_pct:.1f}% better than baseline (total RLE)")
638
733
 
639
734
  # Update for next depth
640
735
  current_best_ordering = best_depth_ordering
@@ -647,6 +742,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
647
742
  cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
648
743
 
649
744
  results.append({
745
+ 'schema': schema_name,
746
+ 'table': table_display_name,
650
747
  'sort_order': f'greedy_depth_{depth}',
651
748
  'columns_used': ', '.join(best_depth_ordering),
652
749
  'total_rle_all': total_rle_all,
@@ -657,8 +754,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
657
754
  **best_depth_rle_counts
658
755
  })
659
756
  else:
660
- print(f"✗ No improvement found at depth {depth} - stopping early")
661
- print(f" Best RLE remains: {current_best_rle:,} runs")
757
+ print(f"\n✗ No improvement found at depth {depth} - stopping early")
758
+ print(f" Best RLE (all columns): {sum(best_depth_rle_counts.values()) if best_depth_rle_counts else sum(baseline.values()):,} runs")
759
+ print(f" Best optimizable RLE: {best_depth_rle if best_depth_rle != float('inf') else current_best_rle:,} runs")
662
760
  break
663
761
 
664
762
  print(f"\n{'='*60}")
@@ -666,7 +764,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
666
764
  print(f"{'='*60}")
667
765
  if current_best_ordering:
668
766
  print(f"Final greedy ordering: {', '.join(current_best_ordering)}")
669
- print(f"Final RLE: {current_best_rle:,} runs")
767
+ print(f"Final optimizable RLE: {current_best_rle:,} runs")
670
768
 
671
769
 
672
770
  # Convert to DataFrame and sort by optimizable RLE (lower is better)
@@ -677,14 +775,14 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
677
775
  print(f"✓ Analysis complete!")
678
776
  print(f"{'='*60}")
679
777
  print(f"Best ordering: {df.iloc[0]['sort_order']}")
680
- print(f"Best optimizable RLE: {df.iloc[0]['optimizable_rle']:,} runs (lower is better)")
681
- print(f"Total RLE (all columns): {df.iloc[0]['total_rle_all']:,} runs")
778
+ print(f"Best total RLE: {df.iloc[0]['total_rle_all']:,} runs (lower is better)")
682
779
 
683
780
 
684
- improvement = baseline_filtered[list(baseline_filtered.keys())[0]] if baseline_filtered else 0
685
- best_rle = df.iloc[0]['optimizable_rle']
686
- if len(df) > 1 and improvement > 0:
687
- pct = ((sum(baseline_filtered.values()) - best_rle) / sum(baseline_filtered.values())) * 100
781
+ # Calculate improvement using total RLE (all columns) for meaningful comparison
782
+ baseline_total_rle = sum(baseline.values())
783
+ best_total_rle = df.iloc[0]['total_rle_all']
784
+ if len(df) > 1 and baseline_total_rle > 0:
785
+ pct = ((baseline_total_rle - best_total_rle) / baseline_total_rle) * 100
688
786
  if pct > 0:
689
787
  print(f"Improvement: {pct:.1f}% fewer runs vs natural order")
690
788
 
@@ -693,7 +791,56 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
693
791
  # Remove: optimizable_rle, avg_rle, cardinality_weighted_score, method
694
792
  display_df = df.drop(columns=['optimizable_rle', 'avg_rle', 'cardinality_weighted_score', 'method'], errors='ignore')
695
793
 
696
- return display_df
794
+ # Transform to long format
795
+ long_format_results = []
796
+
797
+ for _, row in display_df.iterrows():
798
+ schema_val = row['schema']
799
+ table_val = row['table']
800
+ sort_order = row['sort_order']
801
+ columns_used = row['columns_used']
802
+ total_rle_all = row['total_rle_all']
803
+
804
+ # Get all column names except metadata columns
805
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
806
+ data_columns = [col for col in display_df.columns if col not in metadata_cols]
807
+
808
+ # Get total rows and NDV from card_stats if available
809
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
810
+
811
+ # Parse the columns_used to get ordering
812
+ sort_columns_list = []
813
+ if columns_used != 'file_row_number':
814
+ sort_columns_list = [c.strip() for c in columns_used.split(',')]
815
+
816
+ # Create one row per data column
817
+ for col in data_columns:
818
+ rle_value = row[col]
819
+
820
+ # Get NDV from card_stats
821
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
822
+
823
+ # Determine if column was included in the sort and its position
824
+ is_in_sort = col in sort_columns_list
825
+ order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
826
+ comment = '' if is_in_sort or columns_used == 'file_row_number' else 'not included in the sort'
827
+
828
+ long_format_results.append({
829
+ 'schema': schema_val,
830
+ 'table': table_val,
831
+ 'sort_type': sort_order,
832
+ 'column': col,
833
+ 'order': order_position,
834
+ 'RLE': rle_value,
835
+ 'NDV': ndv_value,
836
+ 'total_rows': total_rows,
837
+ 'total_RLE': total_rle_all,
838
+ 'comments': comment
839
+ })
840
+
841
+ long_df = pd.DataFrame(long_format_results)
842
+
843
+ return long_df
697
844
 
698
845
 
699
846
  # Example usage:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev3
3
+ Version: 0.2.19.dev5
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -16,4 +16,5 @@ duckrun.egg-info/PKG-INFO
16
16
  duckrun.egg-info/SOURCES.txt
17
17
  duckrun.egg-info/dependency_links.txt
18
18
  duckrun.egg-info/requires.txt
19
- duckrun.egg-info/top_level.txt
19
+ duckrun.egg-info/top_level.txt
20
+ tests/test_rle.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev3"
7
+ version = "0.2.19.dev5"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,16 @@
1
+ import sys
2
+ import os
3
+ import pandas as pd
4
+
5
+ # Add the parent directory to Python path to use local package source
6
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7
+ import duckrun
8
+
9
+ # Analyze multiple schemas/tables
10
+ conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
11
+
12
+ # Analyze tables - now returns long format automatically
13
+ result = conn.rle("calendar",'auto')
14
+ print(result)
15
+ conn.close()
16
+
File without changes
File without changes
File without changes