duckrun 0.2.19.dev3.tar.gz → 0.2.19.dev5.tar.gz
This diff shows the contents of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in that public registry.
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/core.py +96 -5
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/rle.py +220 -73
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/SOURCES.txt +2 -1
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/pyproject.toml +1 -1
- duckrun-0.2.19.dev5/tests/test_rle.py +16 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/LICENSE +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/README.md +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev3 → duckrun-0.2.19.dev5}/setup.cfg +0 -0
|
@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1244
1244
|
refresh=refresh
|
|
1245
1245
|
)
|
|
1246
1246
|
|
|
1247
|
-
def rle(self, table_name: str = None, mode
|
|
1247
|
+
def rle(self, table_name: str = None, mode = "natural",
|
|
1248
1248
|
min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
|
|
1249
|
-
max_ordering_depth: int = 3):
|
|
1249
|
+
max_ordering_depth: int = 3, limit: int = None):
|
|
1250
1250
|
"""
|
|
1251
1251
|
Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
|
|
1252
1252
|
|
|
@@ -1254,13 +1254,15 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1254
1254
|
table_name: Name of the table to analyze. Can be:
|
|
1255
1255
|
- 'table_name' (uses current schema)
|
|
1256
1256
|
- 'schema.table_name' (specific schema)
|
|
1257
|
-
mode: Analysis mode:
|
|
1258
|
-
- "natural": Calculate RLE for natural order only (
|
|
1257
|
+
mode: Analysis mode or column ordering:
|
|
1258
|
+
- "natural": Calculate RLE for natural order only (fastest)
|
|
1259
1259
|
- "auto": Natural order + cardinality-based ordering (recommended)
|
|
1260
1260
|
- "advanced": Natural + cardinality + greedy incremental search (most thorough)
|
|
1261
|
+
- List[str]: Specific column ordering to test, e.g., ['date', 'duid']
|
|
1261
1262
|
min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
|
|
1262
1263
|
max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
|
|
1263
1264
|
max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
|
|
1265
|
+
limit: Optional row limit for testing/development (default: None, analyzes all rows)
|
|
1264
1266
|
|
|
1265
1267
|
Returns:
|
|
1266
1268
|
DataFrame with RLE analysis results
|
|
@@ -1276,6 +1278,10 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1276
1278
|
# Advanced optimization (greedy incremental search)
|
|
1277
1279
|
con.rle("mytable", "advanced")
|
|
1278
1280
|
|
|
1281
|
+
# Test specific column ordering
|
|
1282
|
+
con.rle("mytable", ["date", "duid"])
|
|
1283
|
+
con.rle("mytable", ["cutoff", "time", "DUID", "date"])
|
|
1284
|
+
|
|
1279
1285
|
# Advanced with custom depth
|
|
1280
1286
|
con.rle("mytable", "advanced", max_ordering_depth=4)
|
|
1281
1287
|
|
|
@@ -1284,6 +1290,9 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1284
1290
|
|
|
1285
1291
|
# Custom thresholds for small tables
|
|
1286
1292
|
con.rle("mytable", "auto", max_cardinality_pct=0.05)
|
|
1293
|
+
|
|
1294
|
+
# Limit rows for testing
|
|
1295
|
+
con.rle("mytable", "auto", limit=10000)
|
|
1287
1296
|
"""
|
|
1288
1297
|
from .rle import (
|
|
1289
1298
|
calculate_cardinality_ratio,
|
|
@@ -1326,15 +1335,97 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1326
1335
|
print(f"❌ Error accessing Delta table: {e}")
|
|
1327
1336
|
return None
|
|
1328
1337
|
|
|
1338
|
+
# Check if mode is a list of columns (custom ordering)
|
|
1339
|
+
if isinstance(mode, list):
|
|
1340
|
+
# User wants to test a specific column ordering
|
|
1341
|
+
print(f"Testing custom column ordering: {', '.join(mode)}")
|
|
1342
|
+
|
|
1343
|
+
# Calculate cardinality for NDV values
|
|
1344
|
+
card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
|
|
1345
|
+
|
|
1346
|
+
# Calculate RLE for the specified ordering
|
|
1347
|
+
rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
|
|
1348
|
+
|
|
1349
|
+
total_rle_all = sum(rle_counts.values())
|
|
1350
|
+
|
|
1351
|
+
print(f"\nResults:")
|
|
1352
|
+
print(f" Custom ordering: [{', '.join(mode)}]")
|
|
1353
|
+
print(f" Total RLE (all columns): {total_rle_all:,} runs")
|
|
1354
|
+
|
|
1355
|
+
# Return as DataFrame for consistency
|
|
1356
|
+
import pandas as pd
|
|
1357
|
+
results = [{
|
|
1358
|
+
'schema': schema_name,
|
|
1359
|
+
'table': tbl,
|
|
1360
|
+
'sort_order': 'custom',
|
|
1361
|
+
'columns_used': ', '.join(mode),
|
|
1362
|
+
'total_rle_all': total_rle_all,
|
|
1363
|
+
**rle_counts
|
|
1364
|
+
}]
|
|
1365
|
+
|
|
1366
|
+
df = pd.DataFrame(results)
|
|
1367
|
+
|
|
1368
|
+
# Transform to long format
|
|
1369
|
+
long_format_results = []
|
|
1370
|
+
|
|
1371
|
+
for _, row in df.iterrows():
|
|
1372
|
+
schema_val = row['schema']
|
|
1373
|
+
table_val = row['table']
|
|
1374
|
+
sort_order = row['sort_order']
|
|
1375
|
+
columns_used = row['columns_used']
|
|
1376
|
+
total_rle_all_val = row['total_rle_all']
|
|
1377
|
+
|
|
1378
|
+
# Get all column names except metadata columns
|
|
1379
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
1380
|
+
data_columns = [col for col in df.columns if col not in metadata_cols]
|
|
1381
|
+
|
|
1382
|
+
# Get total rows from card_stats if available
|
|
1383
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
1384
|
+
|
|
1385
|
+
# Parse the columns_used to get ordering
|
|
1386
|
+
sort_columns_list = [c.strip() for c in columns_used.split(',')]
|
|
1387
|
+
|
|
1388
|
+
# Create one row per data column
|
|
1389
|
+
for col in data_columns:
|
|
1390
|
+
rle_value = row[col]
|
|
1391
|
+
|
|
1392
|
+
# Get NDV from card_stats
|
|
1393
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
1394
|
+
|
|
1395
|
+
# Determine if column was included in the sort and its position
|
|
1396
|
+
is_in_sort = col in sort_columns_list
|
|
1397
|
+
order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
|
|
1398
|
+
comment = '' if is_in_sort else 'not included in the sort'
|
|
1399
|
+
|
|
1400
|
+
long_format_results.append({
|
|
1401
|
+
'schema': schema_val,
|
|
1402
|
+
'table': table_val,
|
|
1403
|
+
'sort_type': sort_order,
|
|
1404
|
+
'column': col,
|
|
1405
|
+
'order': order_position,
|
|
1406
|
+
'RLE': rle_value,
|
|
1407
|
+
'NDV': ndv_value,
|
|
1408
|
+
'total_rows': total_rows,
|
|
1409
|
+
'total_RLE': total_rle_all_val,
|
|
1410
|
+
'comments': comment
|
|
1411
|
+
})
|
|
1412
|
+
|
|
1413
|
+
long_df = pd.DataFrame(long_format_results)
|
|
1414
|
+
|
|
1415
|
+
return long_df
|
|
1416
|
+
|
|
1329
1417
|
# All modes now use test_column_orderings_smart with the mode parameter
|
|
1330
1418
|
return test_column_orderings_smart(
|
|
1331
1419
|
self.con,
|
|
1332
1420
|
table_path,
|
|
1333
1421
|
table_name=table_name, # Pass table name for cardinality calculation on full dataset
|
|
1334
1422
|
mode=mode,
|
|
1423
|
+
limit=limit,
|
|
1335
1424
|
min_distinct_threshold=min_distinct_threshold,
|
|
1336
1425
|
max_cardinality_pct=max_cardinality_pct,
|
|
1337
|
-
max_ordering_depth=max_ordering_depth
|
|
1426
|
+
max_ordering_depth=max_ordering_depth,
|
|
1427
|
+
schema_name=schema_name,
|
|
1428
|
+
table_display_name=tbl
|
|
1338
1429
|
)
|
|
1339
1430
|
|
|
1340
1431
|
def close(self):
|
|
@@ -198,7 +198,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
|
|
|
198
198
|
if sort_columns:
|
|
199
199
|
order_by = "ORDER BY " + ", ".join(sort_columns)
|
|
200
200
|
else:
|
|
201
|
-
order_by = "ORDER BY file_row_number ASC"
|
|
201
|
+
order_by = "ORDER BY filename, file_row_number ASC"
|
|
202
202
|
|
|
203
203
|
limit_clause = f"LIMIT {limit}" if limit else ""
|
|
204
204
|
|
|
@@ -210,7 +210,7 @@ def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = No
|
|
|
210
210
|
SELECT
|
|
211
211
|
{column_name},
|
|
212
212
|
ROW_NUMBER() OVER ({order_by}) as sort_order
|
|
213
|
-
FROM delta_scan('{delta_path}', file_row_number = TRUE)
|
|
213
|
+
FROM delta_scan('{delta_path}', filename = TRUE, file_row_number = TRUE)
|
|
214
214
|
{limit_clause}
|
|
215
215
|
),
|
|
216
216
|
runs AS (
|
|
@@ -238,22 +238,21 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
238
238
|
Calculate cardinality ratio for each column (distinct_values / total_rows).
|
|
239
239
|
Lower ratio = better for RLE compression (more repetition).
|
|
240
240
|
|
|
241
|
-
NEVER uses sampling - always scans full dataset with exact
|
|
241
|
+
NEVER uses sampling - always scans full dataset with exact distinct counts.
|
|
242
242
|
|
|
243
243
|
Args:
|
|
244
244
|
con: DuckDB connection
|
|
245
245
|
source: Either a table name (default) or parquet file path
|
|
246
246
|
limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
|
|
247
247
|
is_parquet: If True, source is a parquet file path; if False, source is a table name
|
|
248
|
-
use_approx:
|
|
249
|
-
|
|
250
|
-
approx_threshold: Row count threshold for using HyperLogLog (default: 100M rows)
|
|
248
|
+
use_approx: DEPRECATED - always uses exact COUNT(DISTINCT)
|
|
249
|
+
approx_threshold: DEPRECATED - always uses exact COUNT(DISTINCT)
|
|
251
250
|
|
|
252
251
|
Returns:
|
|
253
252
|
Dictionary mapping column names to dict with keys:
|
|
254
253
|
- 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
|
|
255
254
|
- 'total_rows': total row count
|
|
256
|
-
- 'distinct_values': number of distinct values (exact
|
|
255
|
+
- 'distinct_values': number of distinct values (exact)
|
|
257
256
|
"""
|
|
258
257
|
# Build the FROM clause based on source type
|
|
259
258
|
if is_parquet:
|
|
@@ -274,26 +273,15 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
274
273
|
if not column_names:
|
|
275
274
|
return {}
|
|
276
275
|
|
|
277
|
-
#
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
281
|
-
use_approx = total_rows > approx_threshold
|
|
282
|
-
if use_approx:
|
|
283
|
-
print(f" Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
|
|
284
|
-
else:
|
|
285
|
-
print(f" Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
|
|
286
|
-
else:
|
|
287
|
-
total_rows = None # Will be calculated in main query
|
|
276
|
+
# Get row count
|
|
277
|
+
total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
|
|
278
|
+
print(f" Table has {total_rows:,} rows - using exact COUNT(DISTINCT)")
|
|
288
279
|
|
|
289
|
-
# Build a single query that calculates all
|
|
280
|
+
# Build a single query that calculates all cardinality in one pass
|
|
290
281
|
# This scans the data only ONCE instead of once per column
|
|
291
282
|
select_clauses = []
|
|
292
283
|
for col in column_names:
|
|
293
|
-
|
|
294
|
-
select_clauses.append(f"approx_count_distinct({col}) as distinct_{col}")
|
|
295
|
-
else:
|
|
296
|
-
select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
|
|
284
|
+
select_clauses.append(f"COUNT(DISTINCT {col}) as distinct_{col}")
|
|
297
285
|
|
|
298
286
|
query = f"""
|
|
299
287
|
SELECT
|
|
@@ -307,8 +295,7 @@ def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet:
|
|
|
307
295
|
if not result:
|
|
308
296
|
return {}
|
|
309
297
|
|
|
310
|
-
|
|
311
|
-
total_rows = result[0]
|
|
298
|
+
total_rows = result[0]
|
|
312
299
|
|
|
313
300
|
nfv_stats = {}
|
|
314
301
|
|
|
@@ -403,7 +390,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
403
390
|
mode: str = "natural",
|
|
404
391
|
min_distinct_threshold: int = 2,
|
|
405
392
|
max_cardinality_pct: float = 0.01,
|
|
406
|
-
max_ordering_depth: int = 3
|
|
393
|
+
max_ordering_depth: int = 3,
|
|
394
|
+
schema_name: str = None,
|
|
395
|
+
table_display_name: str = None) -> pd.DataFrame:
|
|
407
396
|
"""
|
|
408
397
|
Test column orderings for RLE optimization.
|
|
409
398
|
|
|
@@ -421,12 +410,27 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
421
410
|
min_distinct_threshold: Exclude columns with fewer distinct values (default: 2, i.e. only exclude constants with 1 value)
|
|
422
411
|
max_cardinality_pct: Exclude columns with cardinality ratio above this % (default: 0.01 = 1%)
|
|
423
412
|
max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)
|
|
413
|
+
schema_name: Optional schema name to include in results (default: None)
|
|
414
|
+
table_display_name: Optional table name to include in results (default: None)
|
|
424
415
|
|
|
425
416
|
Returns:
|
|
426
|
-
DataFrame with columns: sort_order, columns_used, total_rle_all, and individual column RLE counts
|
|
417
|
+
DataFrame with columns: schema, table, sort_order, columns_used, total_rle_all, and individual column RLE counts
|
|
427
418
|
"""
|
|
428
419
|
print("Analyzing column characteristics...")
|
|
429
420
|
|
|
421
|
+
# Calculate cardinality ratios first (for all modes)
|
|
422
|
+
print("\nCalculating cardinality ratios on full dataset...")
|
|
423
|
+
if table_name:
|
|
424
|
+
card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
|
|
425
|
+
else:
|
|
426
|
+
# Fallback: use delta_scan directly
|
|
427
|
+
card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)
|
|
428
|
+
|
|
429
|
+
print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
|
|
430
|
+
for col, stats in sorted(card_stats.items(), key=lambda x: x[1]['cardinality_ratio']):
|
|
431
|
+
card_pct = stats['cardinality_ratio'] * 100
|
|
432
|
+
print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")
|
|
433
|
+
|
|
430
434
|
# For "natural" mode, just calculate RLE on natural order
|
|
431
435
|
if mode == "natural":
|
|
432
436
|
print("\n" + "="*60)
|
|
@@ -453,9 +457,10 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
453
457
|
|
|
454
458
|
print(f"\nResults:")
|
|
455
459
|
print(f" Total RLE (all columns): {total_rle_all:,}")
|
|
456
|
-
print(f" Average RLE per column: {total_rle_all / len(column_names):.1f}")
|
|
457
460
|
|
|
458
461
|
results = [{
|
|
462
|
+
'schema': schema_name,
|
|
463
|
+
'table': table_display_name,
|
|
459
464
|
'sort_order': 'natural_order',
|
|
460
465
|
'columns_used': 'file_row_number',
|
|
461
466
|
'total_rle_all': total_rle_all,
|
|
@@ -467,21 +472,48 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
467
472
|
print(f"✓ Analysis complete!")
|
|
468
473
|
print(f"{'='*60}")
|
|
469
474
|
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
475
|
+
# Transform to long format
|
|
476
|
+
long_format_results = []
|
|
477
|
+
|
|
478
|
+
for _, row in df.iterrows():
|
|
479
|
+
schema_val = row['schema']
|
|
480
|
+
table_val = row['table']
|
|
481
|
+
sort_order = row['sort_order']
|
|
482
|
+
columns_used = row['columns_used']
|
|
483
|
+
total_rle_all_val = row['total_rle_all']
|
|
484
|
+
|
|
485
|
+
# Get all column names except metadata columns
|
|
486
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
487
|
+
data_columns = [col for col in df.columns if col not in metadata_cols]
|
|
488
|
+
|
|
489
|
+
# Get total rows and NDV from card_stats if available
|
|
490
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
491
|
+
|
|
492
|
+
# Create one row per data column
|
|
493
|
+
for col in data_columns:
|
|
494
|
+
rle_value = row[col]
|
|
495
|
+
|
|
496
|
+
# Get NDV from card_stats
|
|
497
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
498
|
+
|
|
499
|
+
long_format_results.append({
|
|
500
|
+
'schema': schema_val,
|
|
501
|
+
'table': table_val,
|
|
502
|
+
'sort_type': sort_order,
|
|
503
|
+
'column': col,
|
|
504
|
+
'order': None,
|
|
505
|
+
'RLE': rle_value,
|
|
506
|
+
'NDV': ndv_value,
|
|
507
|
+
'total_rows': total_rows,
|
|
508
|
+
'total_RLE': total_rle_all_val,
|
|
509
|
+
'comments': ''
|
|
510
|
+
})
|
|
511
|
+
|
|
512
|
+
long_df = pd.DataFrame(long_format_results)
|
|
513
|
+
|
|
514
|
+
return long_df
|
|
484
515
|
|
|
516
|
+
# For "auto" and "advanced" modes, continue with optimization
|
|
485
517
|
# Extract just the ratios for easier handling
|
|
486
518
|
cardinality_ratios = {col: stats['cardinality_ratio'] for col, stats in card_stats.items()}
|
|
487
519
|
column_names = list(card_stats.keys())
|
|
@@ -532,9 +564,30 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
532
564
|
# Filter baseline to only include good_for_reordering columns
|
|
533
565
|
baseline_filtered = {col: rle for col, rle in baseline.items() if col in good_for_reordering}
|
|
534
566
|
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
567
|
+
# Show column categorization upfront
|
|
568
|
+
print(f"\nColumn Analysis (baseline RLE in natural order):")
|
|
569
|
+
|
|
570
|
+
# Show columns worth reordering first
|
|
571
|
+
if baseline_filtered:
|
|
572
|
+
print(f" Columns included in optimization:")
|
|
573
|
+
for col in sorted(baseline_filtered.keys(), key=lambda c: baseline_filtered[c]):
|
|
574
|
+
print(f" {col}: {baseline_filtered[col]:,} runs")
|
|
575
|
+
print(f" ─────────────────────────")
|
|
576
|
+
print(f" Subtotal: {sum(baseline_filtered.values()):,} runs")
|
|
577
|
+
|
|
578
|
+
# Show excluded columns (constant or high-cardinality)
|
|
579
|
+
excluded_cols = {col: rle for col, rle in baseline.items()
|
|
580
|
+
if col in constant_cols or col in fragmented_cols}
|
|
581
|
+
if excluded_cols:
|
|
582
|
+
print(f" Columns excluded from optimization:")
|
|
583
|
+
for col in sorted(excluded_cols.keys(), key=lambda c: excluded_cols[c]):
|
|
584
|
+
reason = "constant" if col in constant_cols else "high-cardinality"
|
|
585
|
+
print(f" {col}: {excluded_cols[col]:,} runs ({reason})")
|
|
586
|
+
print(f" ─────────────────────────")
|
|
587
|
+
print(f" Subtotal: {sum(excluded_cols.values()):,} runs")
|
|
588
|
+
|
|
589
|
+
# Show total baseline RLE
|
|
590
|
+
print(f"\nBaseline Total RLE (all columns): {sum(baseline.values()):,} runs")
|
|
538
591
|
|
|
539
592
|
# Define only the most promising orderings to test
|
|
540
593
|
orderings_to_test = [
|
|
@@ -545,20 +598,22 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
545
598
|
if mode in ["auto", "advanced"] and len(good_for_reordering) >= 2:
|
|
546
599
|
orderings_to_test.append((good_for_reordering, 'by_cardinality'))
|
|
547
600
|
|
|
548
|
-
|
|
549
|
-
|
|
601
|
+
# Count only the actual reordering tests (exclude natural_order baseline)
|
|
602
|
+
num_tests = len(orderings_to_test) - 1
|
|
550
603
|
|
|
551
604
|
results = []
|
|
552
605
|
|
|
553
606
|
for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
|
|
554
|
-
print(f"\n[{i}/{len(orderings_to_test)}] Testing: {label}")
|
|
555
|
-
if sort_cols:
|
|
556
|
-
print(f" Order: {', '.join(sort_cols)}")
|
|
557
|
-
|
|
558
607
|
if i == 1:
|
|
559
|
-
# Use baseline for natural order (already calculated)
|
|
608
|
+
# Use baseline for natural order (already calculated and displayed)
|
|
560
609
|
rle_counts = baseline
|
|
561
610
|
else:
|
|
611
|
+
# This is an actual reordering test
|
|
612
|
+
test_num = i - 1
|
|
613
|
+
print(f"\n[{test_num}/{num_tests}] Testing: {label}")
|
|
614
|
+
if sort_cols:
|
|
615
|
+
print(f" Order: {', '.join(sort_cols)}")
|
|
616
|
+
|
|
562
617
|
# Calculate RLE for this ordering
|
|
563
618
|
rle_counts = calculate_rle_for_columns(con, delta_path, sort_cols, limit)
|
|
564
619
|
|
|
@@ -572,11 +627,11 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
572
627
|
# Calculate weighted score (considering both RLE and cardinality - lower cardinality = better)
|
|
573
628
|
cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
|
|
574
629
|
|
|
575
|
-
print(f" Total RLE
|
|
576
|
-
print(f" Optimizable columns RLE: {total_rle_optimizable:,}")
|
|
577
|
-
print(f" Avg RLE (optimizable): {total_rle_optimizable / len(rle_filtered):.1f}")
|
|
630
|
+
print(f" Total RLE: {total_rle_all:,} runs")
|
|
578
631
|
|
|
579
632
|
results.append({
|
|
633
|
+
'schema': schema_name,
|
|
634
|
+
'table': table_display_name,
|
|
580
635
|
'sort_order': label,
|
|
581
636
|
'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
|
|
582
637
|
'total_rle_all': total_rle_all, # All columns (must be >= row_count)
|
|
@@ -599,42 +654,82 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
599
654
|
current_best_rle = sum(baseline_filtered.values())
|
|
600
655
|
remaining_columns = list(good_for_reordering)
|
|
601
656
|
|
|
657
|
+
# Get the cardinality-based RLE as the target to beat (both total and optimizable)
|
|
658
|
+
cardinality_rle = results[-1]['optimizable_rle'] if len(results) > 1 else float('inf')
|
|
659
|
+
cardinality_total_rle = results[-1]['total_rle_all'] if len(results) > 1 else float('inf')
|
|
660
|
+
|
|
602
661
|
for depth in range(1, min(max_ordering_depth + 1, len(good_for_reordering) + 1)):
|
|
603
|
-
|
|
662
|
+
num_candidates = len(remaining_columns)
|
|
663
|
+
num_positions = len(current_best_ordering) + 1
|
|
664
|
+
total_tests = num_candidates * num_positions
|
|
665
|
+
print(f"\n--- Depth {depth}: Testing {num_candidates} candidate columns × {num_positions} positions = {total_tests} tests ---")
|
|
666
|
+
print(f" Target to beat: {cardinality_total_rle:,} runs (cardinality ordering)")
|
|
604
667
|
|
|
605
668
|
best_depth_ordering = None
|
|
606
669
|
best_depth_rle = float('inf')
|
|
607
670
|
best_depth_col = None
|
|
608
671
|
best_depth_position = None
|
|
672
|
+
early_exit = False
|
|
673
|
+
|
|
674
|
+
# Sort remaining candidates by baseline RLE (HIGHER first = test worse candidates first)
|
|
675
|
+
# This way we test DUID, time, date before cutoff (which we know is good from cardinality test)
|
|
676
|
+
candidates_sorted = sorted(remaining_columns, key=lambda c: baseline_filtered[c], reverse=True)
|
|
609
677
|
|
|
610
|
-
|
|
611
|
-
|
|
678
|
+
test_num = 0
|
|
679
|
+
# Try adding each remaining column (sorted by baseline RLE - worse first)
|
|
680
|
+
for candidate_col in candidates_sorted:
|
|
612
681
|
# Try inserting at each possible position (including end)
|
|
613
682
|
for insert_pos in range(len(current_best_ordering) + 1):
|
|
683
|
+
test_num += 1
|
|
684
|
+
|
|
614
685
|
# Build test ordering: insert candidate at position
|
|
615
686
|
test_ordering = current_best_ordering[:insert_pos] + [candidate_col] + current_best_ordering[insert_pos:]
|
|
616
687
|
|
|
688
|
+
print(f" [{test_num}/{total_tests}] Testing '{candidate_col}' at position {insert_pos}: [{', '.join(test_ordering)}]", end='', flush=True)
|
|
689
|
+
|
|
617
690
|
# Calculate RLE for this ordering
|
|
618
691
|
rle_counts = calculate_rle_for_columns(con, delta_path, test_ordering, limit)
|
|
619
692
|
|
|
620
693
|
# Sum RLE for optimizable columns only
|
|
621
694
|
rle_filtered = {col: rle for col, rle in rle_counts.items() if col in good_for_reordering}
|
|
622
695
|
total_rle = sum(rle_filtered.values())
|
|
696
|
+
total_rle_all = sum(rle_counts.values())
|
|
697
|
+
|
|
698
|
+
is_best = total_rle < best_depth_rle
|
|
699
|
+
beats_cardinality = total_rle < cardinality_rle
|
|
700
|
+
|
|
701
|
+
status = ""
|
|
702
|
+
if beats_cardinality:
|
|
703
|
+
status = " 🎯 Beats cardinality!"
|
|
704
|
+
|
|
705
|
+
print(f" → Total: {total_rle_all:,}{status}")
|
|
623
706
|
|
|
624
707
|
# Track best at this depth
|
|
625
|
-
if
|
|
708
|
+
if is_best:
|
|
626
709
|
best_depth_rle = total_rle
|
|
627
710
|
best_depth_ordering = test_ordering
|
|
628
711
|
best_depth_col = candidate_col
|
|
629
712
|
best_depth_position = insert_pos
|
|
630
713
|
best_depth_rle_counts = rle_counts
|
|
714
|
+
|
|
715
|
+
# Early exit if we beat cardinality ordering!
|
|
716
|
+
if beats_cardinality:
|
|
717
|
+
print(f"\n ⚡ Early exit! Found ordering better than cardinality. Moving to next depth.")
|
|
718
|
+
early_exit = True
|
|
719
|
+
break
|
|
720
|
+
|
|
721
|
+
if early_exit:
|
|
722
|
+
break
|
|
631
723
|
|
|
632
724
|
# Check if we found improvement
|
|
633
725
|
if best_depth_rle < current_best_rle:
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
print(f"
|
|
726
|
+
current_total_rle_all = sum(best_depth_rle_counts.values())
|
|
727
|
+
baseline_total_rle_all = sum(baseline.values())
|
|
728
|
+
improvement_pct = ((baseline_total_rle_all - current_total_rle_all) / baseline_total_rle_all) * 100
|
|
729
|
+
print(f"\n✓ Best at depth {depth}: [{', '.join(best_depth_ordering)}]")
|
|
730
|
+
print(f" Total RLE (all columns): {current_total_rle_all:,} runs")
|
|
731
|
+
print(f" Optimizable RLE: {best_depth_rle:,} runs")
|
|
732
|
+
print(f" Improvement: {improvement_pct:.1f}% better than baseline (total RLE)")
|
|
638
733
|
|
|
639
734
|
# Update for next depth
|
|
640
735
|
current_best_ordering = best_depth_ordering
|
|
@@ -647,6 +742,8 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
647
742
|
cardinality_weighted = sum(rle_filtered[col] * cardinality_ratios[col] for col in rle_filtered.keys())
|
|
648
743
|
|
|
649
744
|
results.append({
|
|
745
|
+
'schema': schema_name,
|
|
746
|
+
'table': table_display_name,
|
|
650
747
|
'sort_order': f'greedy_depth_{depth}',
|
|
651
748
|
'columns_used': ', '.join(best_depth_ordering),
|
|
652
749
|
'total_rle_all': total_rle_all,
|
|
@@ -657,8 +754,9 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
657
754
|
**best_depth_rle_counts
|
|
658
755
|
})
|
|
659
756
|
else:
|
|
660
|
-
print(f"✗ No improvement found at depth {depth} - stopping early")
|
|
661
|
-
print(f" Best RLE
|
|
757
|
+
print(f"\n✗ No improvement found at depth {depth} - stopping early")
|
|
758
|
+
print(f" Best RLE (all columns): {sum(best_depth_rle_counts.values()) if best_depth_rle_counts else sum(baseline.values()):,} runs")
|
|
759
|
+
print(f" Best optimizable RLE: {best_depth_rle if best_depth_rle != float('inf') else current_best_rle:,} runs")
|
|
662
760
|
break
|
|
663
761
|
|
|
664
762
|
print(f"\n{'='*60}")
|
|
@@ -666,7 +764,7 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
666
764
|
print(f"{'='*60}")
|
|
667
765
|
if current_best_ordering:
|
|
668
766
|
print(f"Final greedy ordering: {', '.join(current_best_ordering)}")
|
|
669
|
-
print(f"Final RLE: {current_best_rle:,} runs")
|
|
767
|
+
print(f"Final optimizable RLE: {current_best_rle:,} runs")
|
|
670
768
|
|
|
671
769
|
|
|
672
770
|
# Convert to DataFrame and sort by optimizable RLE (lower is better)
|
|
@@ -677,14 +775,14 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
677
775
|
print(f"✓ Analysis complete!")
|
|
678
776
|
print(f"{'='*60}")
|
|
679
777
|
print(f"Best ordering: {df.iloc[0]['sort_order']}")
|
|
680
|
-
print(f"Best
|
|
681
|
-
print(f"Total RLE (all columns): {df.iloc[0]['total_rle_all']:,} runs")
|
|
778
|
+
print(f"Best total RLE: {df.iloc[0]['total_rle_all']:,} runs (lower is better)")
|
|
682
779
|
|
|
683
780
|
|
|
684
|
-
improvement
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
781
|
+
# Calculate improvement using total RLE (all columns) for meaningful comparison
|
|
782
|
+
baseline_total_rle = sum(baseline.values())
|
|
783
|
+
best_total_rle = df.iloc[0]['total_rle_all']
|
|
784
|
+
if len(df) > 1 and baseline_total_rle > 0:
|
|
785
|
+
pct = ((baseline_total_rle - best_total_rle) / baseline_total_rle) * 100
|
|
688
786
|
if pct > 0:
|
|
689
787
|
print(f"Improvement: {pct:.1f}% fewer runs vs natural order")
|
|
690
788
|
|
|
@@ -693,7 +791,56 @@ def test_column_orderings_smart(con, delta_path: str, table_name: str = None, li
|
|
|
693
791
|
# Remove: optimizable_rle, avg_rle, cardinality_weighted_score, method
|
|
694
792
|
display_df = df.drop(columns=['optimizable_rle', 'avg_rle', 'cardinality_weighted_score', 'method'], errors='ignore')
|
|
695
793
|
|
|
696
|
-
|
|
794
|
+
# Transform to long format
|
|
795
|
+
long_format_results = []
|
|
796
|
+
|
|
797
|
+
for _, row in display_df.iterrows():
|
|
798
|
+
schema_val = row['schema']
|
|
799
|
+
table_val = row['table']
|
|
800
|
+
sort_order = row['sort_order']
|
|
801
|
+
columns_used = row['columns_used']
|
|
802
|
+
total_rle_all = row['total_rle_all']
|
|
803
|
+
|
|
804
|
+
# Get all column names except metadata columns
|
|
805
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
806
|
+
data_columns = [col for col in display_df.columns if col not in metadata_cols]
|
|
807
|
+
|
|
808
|
+
# Get total rows and NDV from card_stats if available
|
|
809
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
810
|
+
|
|
811
|
+
# Parse the columns_used to get ordering
|
|
812
|
+
sort_columns_list = []
|
|
813
|
+
if columns_used != 'file_row_number':
|
|
814
|
+
sort_columns_list = [c.strip() for c in columns_used.split(',')]
|
|
815
|
+
|
|
816
|
+
# Create one row per data column
|
|
817
|
+
for col in data_columns:
|
|
818
|
+
rle_value = row[col]
|
|
819
|
+
|
|
820
|
+
# Get NDV from card_stats
|
|
821
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
822
|
+
|
|
823
|
+
# Determine if column was included in the sort and its position
|
|
824
|
+
is_in_sort = col in sort_columns_list
|
|
825
|
+
order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
|
|
826
|
+
comment = '' if is_in_sort or columns_used == 'file_row_number' else 'not included in the sort'
|
|
827
|
+
|
|
828
|
+
long_format_results.append({
|
|
829
|
+
'schema': schema_val,
|
|
830
|
+
'table': table_val,
|
|
831
|
+
'sort_type': sort_order,
|
|
832
|
+
'column': col,
|
|
833
|
+
'order': order_position,
|
|
834
|
+
'RLE': rle_value,
|
|
835
|
+
'NDV': ndv_value,
|
|
836
|
+
'total_rows': total_rows,
|
|
837
|
+
'total_RLE': total_rle_all,
|
|
838
|
+
'comments': comment
|
|
839
|
+
})
|
|
840
|
+
|
|
841
|
+
long_df = pd.DataFrame(long_format_results)
|
|
842
|
+
|
|
843
|
+
return long_df
|
|
697
844
|
|
|
698
845
|
|
|
699
846
|
# Example usage:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.
|
|
7
|
+
version = "0.2.19.dev5"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
# Add the parent directory to Python path to use local package source
|
|
6
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
7
|
+
import duckrun
|
|
8
|
+
|
|
9
|
+
# Analyze multiple schemas/tables
|
|
10
|
+
conn = duckrun.connect("tmp/data.lakehouse/deltars_sorted")
|
|
11
|
+
|
|
12
|
+
# Analyze tables - now returns long format automatically
|
|
13
|
+
result = conn.rle("calendar",'auto')
|
|
14
|
+
print(result)
|
|
15
|
+
conn.close()
|
|
16
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|