duckrun 0.2.19.dev3__tar.gz → 0.2.19.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/core.py +96 -5
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/rle.py +210 -50
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/SOURCES.txt +2 -1
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/pyproject.toml +1 -1
- duckrun-0.2.19.dev4/tests/test_rle.py +16 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/LICENSE +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/README.md +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev4}/setup.cfg +0 -0
|
@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1244
1244
|
refresh=refresh
|
|
1245
1245
|
)
|
|
1246
1246
|
|
|
1247
|
-
def rle(self, table_name: str = None, mode
|
|
1247
|
+
def rle(self, table_name: str = None, mode = "natural",
|
|
1248
1248
|
min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
|
|
1249
|
-
max_ordering_depth: int = 3):
|
|
1249
|
+
max_ordering_depth: int = 3, limit: int = None):
|
|
1250
1250
|
"""
|
|
1251
1251
|
Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
|
|
1252
1252
|
|
|
@@ -1254,13 +1254,15 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1254
1254
|
table_name: Name of the table to analyze. Can be:
|
|
1255
1255
|
- 'table_name' (uses current schema)
|
|
1256
1256
|
- 'schema.table_name' (specific schema)
|
|
1257
|
-
mode: Analysis mode:
|
|
1258
|
-
- "natural": Calculate RLE for natural order only (
|
|
1257
|
+
mode: Analysis mode or column ordering:
|
|
1258
|
+
- "natural": Calculate RLE for natural order only (fastest)
|
|
1259
1259
|
- "auto": Natural order + cardinality-based ordering (recommended)
|
|
1260
1260
|
- "advanced": Natural + cardinality + greedy incremental search (most thorough)
|
|
1261
|
+
- List[str]: Specific column ordering to test, e.g., ['date', 'duid']
|
|
1261
1262
|
min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
|
|
1262
1263
|
max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
|
|
1263
1264
|
max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
|
|
1265
|
+
limit: Optional row limit for testing/development (default: None, analyzes all rows)
|
|
1264
1266
|
|
|
1265
1267
|
Returns:
|
|
1266
1268
|
DataFrame with RLE analysis results
|
|
@@ -1276,6 +1278,10 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1276
1278
|
# Advanced optimization (greedy incremental search)
|
|
1277
1279
|
con.rle("mytable", "advanced")
|
|
1278
1280
|
|
|
1281
|
+
# Test specific column ordering
|
|
1282
|
+
con.rle("mytable", ["date", "duid"])
|
|
1283
|
+
con.rle("mytable", ["cutoff", "time", "DUID", "date"])
|
|
1284
|
+
|
|
1279
1285
|
# Advanced with custom depth
|
|
1280
1286
|
con.rle("mytable", "advanced", max_ordering_depth=4)
|
|
1281
1287
|
|
|
@@ -1284,6 +1290,9 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1284
1290
|
|
|
1285
1291
|
# Custom thresholds for small tables
|
|
1286
1292
|
con.rle("mytable", "auto", max_cardinality_pct=0.05)
|
|
1293
|
+
|
|
1294
|
+
# Limit rows for testing
|
|
1295
|
+
con.rle("mytable", "auto", limit=10000)
|
|
1287
1296
|
"""
|
|
1288
1297
|
from .rle import (
|
|
1289
1298
|
calculate_cardinality_ratio,
|
|
@@ -1326,15 +1335,97 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1326
1335
|
print(f"❌ Error accessing Delta table: {e}")
|
|
1327
1336
|
return None
|
|
1328
1337
|
|
|
1338
|
+
# Check if mode is a list of columns (custom ordering)
|
|
1339
|
+
if isinstance(mode, list):
|
|
1340
|
+
# User wants to test a specific column ordering
|
|
1341
|
+
print(f"Testing custom column ordering: {', '.join(mode)}")
|
|
1342
|
+
|
|
1343
|
+
# Calculate cardinality for NDV values
|
|
1344
|
+
card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
|
|
1345
|
+
|
|
1346
|
+
# Calculate RLE for the specified ordering
|
|
1347
|
+
rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
|
|
1348
|
+
|
|
1349
|
+
total_rle_all = sum(rle_counts.values())
|
|
1350
|
+
|
|
1351
|
+
print(f"\nResults:")
|
|
1352
|
+
print(f" Custom ordering: [{', '.join(mode)}]")
|
|
1353
|
+
print(f" Total RLE (all columns): {total_rle_all:,} runs")
|
|
1354
|
+
|
|
1355
|
+
# Return as DataFrame for consistency
|
|
1356
|
+
import pandas as pd
|
|
1357
|
+
results = [{
|
|
1358
|
+
'schema': schema_name,
|
|
1359
|
+
'table': tbl,
|
|
1360
|
+
'sort_order': 'custom',
|
|
1361
|
+
'columns_used': ', '.join(mode),
|
|
1362
|
+
'total_rle_all': total_rle_all,
|
|
1363
|
+
**rle_counts
|
|
1364
|
+
}]
|
|
1365
|
+
|
|
1366
|
+
df = pd.DataFrame(results)
|
|
1367
|
+
|
|
1368
|
+
# Transform to long format
|
|
1369
|
+
long_format_results = []
|
|
1370
|
+
|
|
1371
|
+
for _, row in df.iterrows():
|
|
1372
|
+
schema_val = row['schema']
|
|
1373
|
+
table_val = row['table']
|
|
1374
|
+
sort_order = row['sort_order']
|
|
1375
|
+
columns_used = row['columns_used']
|
|
1376
|
+
total_rle_all_val = row['total_rle_all']
|
|
1377
|
+
|
|
1378
|
+
# Get all column names except metadata columns
|
|
1379
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
1380
|
+
data_columns = [col for col in df.columns if col not in metadata_cols]
|
|
1381
|
+
|
|
1382
|
+
# Get total rows from card_stats if available
|
|
1383
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
1384
|
+
|
|
1385
|
+
# Parse the columns_used to get ordering
|
|
1386
|
+
sort_columns_list = [c.strip() for c in columns_used.split(',')]
|
|
1387
|
+
|
|
1388
|
+
# Create one row per data column
|
|
1389
|
+
for col in data_columns:
|
|
1390
|
+
rle_value = row[col]
|
|
1391
|
+
|
|
1392
|
+
# Get NDV from card_stats
|
|
1393
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
1394
|
+
|
|
1395
|
+
# Determine if column was included in the sort and its position
|
|
1396
|
+
is_in_sort = col in sort_columns_list
|
|
1397
|
+
order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
|
|
1398
|
+
comment = '' if is_in_sort else 'not included in the sort'
|
|
1399
|
+
|
|
1400
|
+
long_format_results.append({
|
|
1401
|
+
'schema': schema_val,
|
|
1402
|
+
'table': table_val,
|
|
1403
|
+
'sort_type': sort_order,
|
|
1404
|
+
'column': col,
|
|
1405
|
+
'order': order_position,
|
|
1406
|
+
'RLE': rle_value,
|
|
1407
|
+
'NDV': ndv_value,
|
|
1408
|
+
'total_rows': total_rows,
|
|
1409
|
+
'total_RLE': total_rle_all_val,
|
|
1410
|
+
'comments': comment
|
|
1411
|
+
})
|
|
1412
|
+
|
|
1413
|
+
long_df = pd.DataFrame(long_format_results)
|
|
1414
|
+
|
|
1415
|
+
return long_df
|
|
1416
|
+
|
|
1329
1417
|
# All modes now use test_column_orderings_smart with the mode parameter
|
|
1330
1418
|
return test_column_orderings_smart(
|
|
1331
1419
|
self.con,
|
|
1332
1420
|
table_path,
|
|
1333
1421
|
table_name=table_name, # Pass table name for cardinality calculation on full dataset
|
|
1334
1422
|
mode=mode,
|
|
1423
|
+
limit=limit,
|
|
1335
1424
|
min_distinct_threshold=min_distinct_threshold,
|
|
1336
1425
|
max_cardinality_pct=max_cardinality_pct,
|
|
1337
|
-
max_ordering_depth=max_ordering_depth
|
|
1426
|
+
max_ordering_depth=max_ordering_depth,
|
|
1427
|
+
schema_name=schema_name,
|
|
1428
|
+
table_display_name=tbl
|
|
1338
1429
|
)
|
|
1339
1430
|
|
|
1340
1431
|
def close(self):
|
|
@@ -198,7 +198,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
|
|
|
198
198
|
if sort_columns:
|
|
199
199
|
order_by = "ORDER BY " + ", ".join(sort_columns)
|
|
200
200
|
else:
|
|
201
|
-
order_by = "ORDER BY file_row_number ASC"
|
|
201
|
+
order_by = "ORDER BY filename, file_row_number ASC"
|
|
202
202
|
|
|
203
203
|
limit_clause = f"LIMIT {limit}" if limit else ""
|
|
204
204
|
|
|
@@ -210,7 +210,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
|
|
|
210
210
|
SELECT
|
|
211
211
|
{column_name},
|
|
212
212
|
ROW_NUMBER() OVER ({order_by}) as sort_order
|
|
213
|
-
FROM delta_scan('{delta_path}', file_row_number = TRUE)
|
|
213
|
+
FROM delta_scan('{delta_path}', filename = TRUE, file_row_number = TRUE)
|
|
214
214
|
{limit_clause}
|
|
215
215
|
),
|
|
216
216
|
runs AS (
|
|
@@ -403,7 +403,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
403
403
|
mode: str = "natural",
|
|
404
404
|
min_distinct_threshold: int = 2,
|
|
405
405
|
max_cardinality_pct: float = 0.01,
|
|
406
|
-
max_ordering_depth: int = 3
|
|
406
|
+
max_ordering_depth: int = 3,
|
|
407
|
+
schema_name: str = None,
|
|
408
|
+
table_display_name: str = None) -> pd.DataFrame:
|
|
407
409
|
"""
|
|
408
410
|
Test column orderings for RLE optimization.
|
|
409
411
|
|
|
@@ -421,12 +423,27 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
421
423
|
min_distinct_threshold: Exclude columns with fewer distinct values (default: 2, i.e. only exclude constants with 1 value)
|
|
422
424
|
max_cardinality_pct: Exclude columns with cardinality ratio above this % (default: 0.01 = 1%)
|
|
423
425
|
max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
|
|
426
|
+
schema_name: Optional schema name to include in results (default: None)
|
|
427
|
+
table_display_name: Optional table name to include in results (default: None)
|
|
424
428
|
|
|
425
429
|
Returns:
|
|
426
|
-
DataFrame with columns: sort_order, columns_used, total_rle_all, and individual column RLE counts
|
|
430
|
+
DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
|
|
427
431
|
"""
|
|
428
432
|
print("Analyzing column characteristics...")
|
|
429
433
|
|
|
434
|
+
# Calculate cardinality ratios first (for all modes)
|
|
435
|
+
print("\nCalculating cardinality ratios on full dataset...")
|
|
436
|
+
if table_name:
|
|
437
|
+
card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
|
|
438
|
+
else:
|
|
439
|
+
# Fallback: use delta_scan directly
|
|
440
|
+
card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)
|
|
441
|
+
|
|
442
|
+
print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
|
|
443
|
+
for col, stats in sorted(card_stats.items(), key=lambda x: x[1]['cardinality_ratio']):
|
|
444
|
+
card_pct = stats['cardinality_ratio'] * 100
|
|
445
|
+
print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")
|
|
446
|
+
|
|
430
447
|
# For "natural" mode, just calculate RLE on natural order
|
|
431
448
|
if mode == "natural":
|
|
432
449
|
print("\n" + "="*60)
|
|
@@ -453,9 +470,10 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
453
470
|
|
|
454
471
|
print(f"\nResults:")
|
|
455
472
|
print(f" Total RLE (all columns): {total_rle_all:,}")
|
|
456
|
-
print(f" Average RLE per column: {total_rle_all / len(column_names):.1f}")
|
|
457
473
|
|
|
458
474
|
results = [{
|
|
475
|
+
'schema': schema_name,
|
|
476
|
+
'table': table_display_name,
|
|
459
477
|
'sort_order': 'natural_order',
|
|
460
478
|
'columns_used': 'file_row_number',
|
|
461
479
|
'total_rle_all': total_rle_all,
|
|
@@ -467,21 +485,48 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
467
485
|
print(f"✓ Analysis complete!")
|
|
468
486
|
print(f"{'='*60}")
|
|
469
487
|
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
488
|
+
# Transform to long format
|
|
489
|
+
long_format_results = []
|
|
490
|
+
|
|
491
|
+
for _, row in df.iterrows():
|
|
492
|
+
schema_val = row['schema']
|
|
493
|
+
table_val = row['table']
|
|
494
|
+
sort_order = row['sort_order']
|
|
495
|
+
columns_used = row['columns_used']
|
|
496
|
+
total_rle_all_val = row['total_rle_all']
|
|
497
|
+
|
|
498
|
+
# Get all column names except metadata columns
|
|
499
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
500
|
+
data_columns = [col for col in df.columns if col not in metadata_cols]
|
|
501
|
+
|
|
502
|
+
# Get total rows and NDV from card_stats if available
|
|
503
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
504
|
+
|
|
505
|
+
# Create one row per data column
|
|
506
|
+
for col in data_columns:
|
|
507
|
+
rle_value = row[col]
|
|
508
|
+
|
|
509
|
+
# Get NDV from card_stats
|
|
510
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
511
|
+
|
|
512
|
+
long_format_results.append({
|
|
513
|
+
'schema': schema_val,
|
|
514
|
+
'table': table_val,
|
|
515
|
+
'sort_type': sort_order,
|
|
516
|
+
'column': col,
|
|
517
|
+
'order': None,
|
|
518
|
+
'RLE': rle_value,
|
|
519
|
+
'NDV': ndv_value,
|
|
520
|
+
'total_rows': total_rows,
|
|
521
|
+
'total_RLE': total_rle_all_val,
|
|
522
|
+
'comments': ''
|
|
523
|
+
})
|
|
524
|
+
|
|
525
|
+
long_df = pd.DataFrame(long_format_results)
|
|
526
|
+
|
|
527
|
+
return long_df
|
|
484
528
|
|
|
529
|
+
# For "auto" and "advanced" modes, continue with optimization
|
|
485
530
|
# Extract just the ratios for easier handling
|
|
486
531
|
cardinality_ratios = {col: stats['cardinality_ratio'] for col, stats in card_stats.items()}
|
|
487
532
|
column_names = list(card_stats.keys())
|
|
@@ -532,9 +577,30 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
532
577
|
# Filter baseline to only include good_for_reordering columns
|
|
533
578
|
baseline_filtered = {col: rle for col, rle in baseline.items() if col in good_for_reordering}
|
|
534
579
|
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
580
|
+
# Show column categorization upfront
|
|
581
|
+
print(f"\nColumn Analysis (baseline RLE in natural order):")
|
|
582
|
+
|
|
583
|
+
# Show columns worth reordering first
|
|
584
|
+
if baseline_filtered:
|
|
585
|
+
print(f" Columns included in optimization:")
|
|
586
|
+
for col in sorted(baseline_filtered.keys(), key=lambda c: baseline_filtered[c]):
|
|
587
|
+
print(f" {col}: {baseline_filtered[col]:,} runs")
|
|
588
|
+
print(f" ─────────────────────────")
|
|
589
|
+
print(f" Subtotal: {sum(baseline_filtered.values()):,} runs")
|
|
590
|
+
|
|
591
|
+
# Show excluded columns (constant or high-cardinality)
|
|
592
|
+
excluded_cols = {col: rle for col, rle in baseline.items()
|
|
593
|
+
if col in constant_cols or col in fragmented_cols}
|
|
594
|
+
if excluded_cols:
|
|
595
|
+
print(f" Columns excluded from optimization:")
|
|
596
|
+
for col in sorted(excluded_cols.keys(), key=lambda c: excluded_cols[c]):
|
|
597
|
+
reason = "constant" if col in constant_cols else "high-cardinality"
|
|
598
|
+
print(f" {col}: {excluded_cols[col]:,} runs ({reason})")
|
|
599
|
+
print(f" ─────────────────────────")
|
|
600
|
+
print(f" Subtotal: {sum(excluded_cols.values()):,} runs")
|
|
601
|
+
|
|
602
|
+
# Show total baseline RLE
|
|
603
|
+
print(f"\nBaseline Total RLE (all columns): {sum(baseline.values()):,} runs")
|
|
538
604
|
|
|
539
605
|
# Define only the most promising orderings to test
|
|
540
606
|
orderings_to_test = [
|
|
@@ -545,20 +611,22 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
545
611
|
if mode in ["auto", "advanced"] and len(good_for_reordering) >= 2:
|
|
546
612
|
orderings_to_test.append((good_for_reordering, 'by_cardinality'))
|
|
547
613
|
|
|
548
|
-
|
|
549
|
-
|
|
614
|
+
# Count only the actual reordering tests (exclude natural_order baseline)
|
|
615
|
+
num_tests = len(orderings_to_test) - 1
|
|
550
616
|
|
|
551
617
|
results = []
|
|
552
618
|
|
|
553
619
|
for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
|
|
554
|
-
print(f"\n[{i}/{len(orderings_to_test)}] Testing: {label}")
|
|
555
|
-
if sort_cols:
|
|
556
|
-
print(f" Order: {', '.join(sort_cols)}")
|
|
557
|
-
|
|
558
620
|
if i == 1:
|
|
559
|
-
# Use baseline for natural order (already calculated)
|
|
621
|
+
# Use baseline for natural order (already calculated and displayed)
|
|
560
622
|
rle_counts = baseline
|
|
561
623
|
else:
|
|
624
|
+
# This is an actual reordering test
|
|
625
|
+
test_num = i - 1
|
|
626
|
+
print(f"\n[{test_num}/{num_tests}] Testing: {label}")
|
|
627
|
+
if sort_cols:
|
|
628
|
+
print(f" Order: {', '.join(sort_cols)}")
|
|
629
|
+
|
|
562
630
|
# Calculate RLE for this ordering
|
|
563
631
|
rle_counts = calculate_rle_for_columns(con, delta_path, sort_cols, limit)
|
|
564
632
|
|
|
@@ -572,11 +640,11 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
572
640
|
# Calculate weighted score (considering both RLE and cardinality - lower cardinality = better)
|
|
573
641
|
cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
|
|
574
642
|
|
|
575
|
-
print(f" Total RLE
|
|
576
|
-
print(f" Optimizable columns RLE: {total_rle_optimizable:,}")
|
|
577
|
-
print(f" Avg RLE (optimizable): {total_rle_optimizable / len(rle_filtered):.1f}")
|
|
643
|
+
print(f" Total RLE: {total_rle_all:,} runs")
|
|
578
644
|
|
|
579
645
|
results.append({
|
|
646
|
+
'schema': schema_name,
|
|
647
|
+
'table': table_display_name,
|
|
580
648
|
'sort_order': label,
|
|
581
649
|
'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
|
|
582
650
|
'total_rle_all': total_rle_all, # All columns (must be >= row_count)
|
|
@@ -599,42 +667,82 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
599
667
|
current_best_rle = sum(baseline_filtered.values())
|
|
600
668
|
remaining_columns = list(good_for_reordering)
|
|
601
669
|
|
|
670
|
+
# Get the cardinality-based RLE as the target to beat (both total and optimizable)
|
|
671
|
+
cardinality_rle = results[-1]['optimizable_rle'] if len(results) > 1 else float('inf')
|
|
672
|
+
cardinality_total_rle = results[-1]['total_rle_all'] if len(results) > 1 else float('inf')
|
|
673
|
+
|
|
602
674
|
for depth in range(1, min(max_ordering_depth + 1, len(good_for_reordering) + 1)):
|
|
603
|
-
|
|
675
|
+
num_candidates = len(remaining_columns)
|
|
676
|
+
num_positions = len(current_best_ordering) + 1
|
|
677
|
+
total_tests = num_candidates * num_positions
|
|
678
|
+
print(f"\n--- Depth {depth}: Testing {num_candidates} candidate columns × {num_positions} positions = {total_tests} tests ---")
|
|
679
|
+
print(f" Target to beat: {cardinality_total_rle:,} runs (cardinality ordering)")
|
|
604
680
|
|
|
605
681
|
best_depth_ordering = None
|
|
606
682
|
best_depth_rle = float('inf')
|
|
607
683
|
best_depth_col = None
|
|
608
684
|
best_depth_position = None
|
|
685
|
+
early_exit = False
|
|
686
|
+
|
|
687
|
+
# Sort remaining candidates by baseline RLE (HIGHER first = test worse candidates first)
|
|
688
|
+
# This way we test DUID, time, date before cutoff (which we know is good from cardinality test)
|
|
689
|
+
candidates_sorted = sorted(remaining_columns, key=lambda c: baseline_filtered[c], reverse=True)
|
|
609
690
|
|
|
610
|
-
|
|
611
|
-
|
|
691
|
+
test_num = 0
|
|
692
|
+
# Try adding each remaining column (sorted by baseline RLE - worse first)
|
|
693
|
+
for candidate_col in candidates_sorted:
|
|
612
694
|
# Try inserting at each possible position (including end)
|
|
613
695
|
for insert_pos in range(len(current_best_ordering) + 1):
|
|
696
|
+
test_num += 1
|
|
697
|
+
|
|
614
698
|
# Build test ordering: insert candidate at position
|
|
615
699
|
test_ordering = current_best_ordering[:insert_pos] + [candidate_col] + current_best_ordering[insert_pos:]
|
|
616
700
|
|
|
701
|
+
print(f" [{test_num}/{total_tests}] Testing '{candidate_col}' at position {insert_pos}: [{', '.join(test_ordering)}]", end='', flush=True)
|
|
702
|
+
|
|
617
703
|
# Calculate RLE for this ordering
|
|
618
704
|
rle_counts = calculate_rle_for_columns(con, delta_path, test_ordering, limit)
|
|
619
705
|
|
|
620
706
|
# Sum RLE for optimizable columns only
|
|
621
707
|
rle_filtered = {col: rle for col, rle in rle_counts.items() if col in good_for_reordering}
|
|
622
708
|
total_rle = sum(rle_filtered.values())
|
|
709
|
+
total_rle_all = sum(rle_counts.values())
|
|
710
|
+
|
|
711
|
+
is_best = total_rle < best_depth_rle
|
|
712
|
+
beats_cardinality = total_rle < cardinality_rle
|
|
713
|
+
|
|
714
|
+
status = ""
|
|
715
|
+
if beats_cardinality:
|
|
716
|
+
status = " 🎯 Beats cardinality!"
|
|
717
|
+
|
|
718
|
+
print(f" → Total: {total_rle_all:,}{status}")
|
|
623
719
|
|
|
624
720
|
# Track best at this depth
|
|
625
|
-
if total_rle < best_depth_rle:
|
|
721
|
+
if is_best:
|
|
626
722
|
best_depth_rle = total_rle
|
|
627
723
|
best_depth_ordering = test_ordering
|
|
628
724
|
best_depth_col = candidate_col
|
|
629
725
|
best_depth_position = insert_pos
|
|
630
726
|
best_depth_rle_counts = rle_counts
|
|
727
|
+
|
|
728
|
+
# Early exit if we beat cardinality ordering!
|
|
729
|
+
if beats_cardinality:
|
|
730
|
+
print(f"\n ⚡ Early exit! Found ordering better than cardinality. Moving to next depth.")
|
|
731
|
+
early_exit = True
|
|
732
|
+
break
|
|
733
|
+
|
|
734
|
+
if early_exit:
|
|
735
|
+
break
|
|
631
736
|
|
|
632
737
|
# Check if we found improvement
|
|
633
738
|
if best_depth_rle < current_best_rle:
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
print(f"
|
|
739
|
+
current_total_rle_all = sum(best_depth_rle_counts.values())
|
|
740
|
+
baseline_total_rle_all = sum(baseline.values())
|
|
741
|
+
improvement_pct = ((baseline_total_rle_all - current_total_rle_all) / baseline_total_rle_all) * 100
|
|
742
|
+
print(f"\n✓ Best at depth {depth}: [{', '.join(best_depth_ordering)}]")
|
|
743
|
+
print(f" Total RLE (all columns): {current_total_rle_all:,} runs")
|
|
744
|
+
print(f" Optimizable RLE: {best_depth_rle:,} runs")
|
|
745
|
+
print(f" Improvement: {improvement_pct:.1f}% better than baseline (total RLE)")
|
|
638
746
|
|
|
639
747
|
# Update for next depth
|
|
640
748
|
current_best_ordering = best_depth_ordering
|
|
@@ -647,6 +755,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
647
755
|
cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
|
|
648
756
|
|
|
649
757
|
results.append({
|
|
758
|
+
'schema': schema_name,
|
|
759
|
+
'table': table_display_name,
|
|
650
760
|
'sort_order': f'greedy_depth_{depth}',
|
|
651
761
|
'columns_used': ', '.join(best_depth_ordering),
|
|
652
762
|
'total_rle_all': total_rle_all,
|
|
@@ -657,8 +767,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
657
767
|
**best_depth_rle_counts
|
|
658
768
|
})
|
|
659
769
|
else:
|
|
660
|
-
print(f"✗ No improvement found at depth {depth} - stopping early")
|
|
661
|
-
print(f" Best RLE
|
|
770
|
+
print(f"\n✗ No improvement found at depth {depth} - stopping early")
|
|
771
|
+
print(f" Best RLE (all columns): {sum(best_depth_rle_counts.values()) if best_depth_rle_counts else sum(baseline.values()):,} runs")
|
|
772
|
+
print(f" Best optimizable RLE: {best_depth_rle if best_depth_rle != float('inf') else current_best_rle:,} runs")
|
|
662
773
|
break
|
|
663
774
|
|
|
664
775
|
print(f"\n{'='*60}")
|
|
@@ -666,7 +777,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
666
777
|
print(f"{'='*60}")
|
|
667
778
|
if current_best_ordering:
|
|
668
779
|
print(f"Final greedy ordering: {', '.join(current_best_ordering)}")
|
|
669
|
-
print(f"Final RLE: {current_best_rle:,} runs")
|
|
780
|
+
print(f"Final optimizable RLE: {current_best_rle:,} runs")
|
|
670
781
|
|
|
671
782
|
|
|
672
783
|
# Convert to DataFrame and sort by optimizable RLE (lower is better)
|
|
@@ -677,14 +788,14 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
677
788
|
print(f"✓ Analysis complete!")
|
|
678
789
|
print(f"{'='*60}")
|
|
679
790
|
print(f"Best ordering: {df.iloc[0]['sort_order']}")
|
|
680
|
-
print(f"Best
|
|
681
|
-
print(f"Total RLE (all columns): {df.iloc[0]['total_rle_all']:,} runs")
|
|
791
|
+
print(f"Best total RLE: {df.iloc[0]['total_rle_all']:,} runs (lower is better)")
|
|
682
792
|
|
|
683
793
|
|
|
684
|
-
improvement
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
794
|
+
# Calculate improvement using total RLE (all columns) for meaningful comparison
|
|
795
|
+
baseline_total_rle = sum(baseline.values())
|
|
796
|
+
best_total_rle = df.iloc[0]['total_rle_all']
|
|
797
|
+
if len(df) > 1 and baseline_total_rle > 0:
|
|
798
|
+
pct = ((baseline_total_rle - best_total_rle) / baseline_total_rle) * 100
|
|
688
799
|
if pct > 0:
|
|
689
800
|
print(f"Improvement: {pct:.1f}% fewer runs vs natural order")
|
|
690
801
|
|
|
@@ -693,7 +804,56 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
693
804
|
# Remove: optimizable_rle, avg_rle, cardinality_weighted_score, method
|
|
694
805
|
display_df = df.drop(columns=['optimizable_rle', 'avg_rle', 'cardinality_weighted_score', 'method'], errors='ignore')
|
|
695
806
|
|
|
696
|
-
|
|
807
|
+
# Transform to long format
|
|
808
|
+
long_format_results = []
|
|
809
|
+
|
|
810
|
+
for _, row in display_df.iterrows():
|
|
811
|
+
schema_val = row['schema']
|
|
812
|
+
table_val = row['table']
|
|
813
|
+
sort_order = row['sort_order']
|
|
814
|
+
columns_used = row['columns_used']
|
|
815
|
+
total_rle_all = row['total_rle_all']
|
|
816
|
+
|
|
817
|
+
# Get all column names except metadata columns
|
|
818
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
819
|
+
data_columns = [col for col in display_df.columns if col not in metadata_cols]
|
|
820
|
+
|
|
821
|
+
# Get total rows and NDV from card_stats if available
|
|
822
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
823
|
+
|
|
824
|
+
# Parse the columns_used to get ordering
|
|
825
|
+
sort_columns_list = []
|
|
826
|
+
if columns_used != 'file_row_number':
|
|
827
|
+
sort_columns_list = [c.strip() for c in columns_used.split(',')]
|
|
828
|
+
|
|
829
|
+
# Create one row per data column
|
|
830
|
+
for col in data_columns:
|
|
831
|
+
rle_value = row[col]
|
|
832
|
+
|
|
833
|
+
# Get NDV from card_stats
|
|
834
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
835
|
+
|
|
836
|
+
# Determine if column was included in the sort and its position
|
|
837
|
+
is_in_sort = col in sort_columns_list
|
|
838
|
+
order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
|
|
839
|
+
comment = '' if is_in_sort or columns_used == 'file_row_number' else 'not included in the sort'
|
|
840
|
+
|
|
841
|
+
long_format_results.append({
|
|
842
|
+
'schema': schema_val,
|
|
843
|
+
'table': table_val,
|
|
844
|
+
'sort_type': sort_order,
|
|
845
|
+
'column': col,
|
|
846
|
+
'order': order_position,
|
|
847
|
+
'RLE': rle_value,
|
|
848
|
+
'NDV': ndv_value,
|
|
849
|
+
'total_rows': total_rows,
|
|
850
|
+
'total_RLE': total_rle_all,
|
|
851
|
+
'comments': comment
|
|
852
|
+
})
|
|
853
|
+
|
|
854
|
+
long_df = pd.DataFrame(long_format_results)
|
|
855
|
+
|
|
856
|
+
return long_df
|
|
697
857
|
|
|
698
858
|
|
|
699
859
|
# Example usage:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.dev3"
|
|
7
|
+
version = "0.2.19.dev4"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
# Add the parent directory to Python path to use local package source
|
|
6
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
7
|
+
import duckrun
|
|
8
|
+
|
|
9
|
+
# Analyze multiple schemas/tables
|
|
10
|
+
conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
|
|
11
|
+
|
|
12
|
+
# Analyze tables - now returns long format automatically
|
|
13
|
+
result = conn.rle("calendar",'auto')
|
|
14
|
+
print(result)
|
|
15
|
+
conn.close()
|
|
16
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|