duckrun 0.2.19.dev2__tar.gz → 0.2.19.dev4__tar.gz
This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/core.py +125 -53
- duckrun-0.2.19.dev4/duckrun/rle.py +873 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/pyproject.toml +1 -1
- duckrun-0.2.19.dev4/tests/test_rle.py +16 -0
- duckrun-0.2.19.dev2/duckrun/rle.py +0 -521
- duckrun-0.2.19.dev2/tests/test_rle.py +0 -10
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/LICENSE +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/README.md +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/setup.cfg +0 -0
|
@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1244
1244
|
refresh=refresh
|
|
1245
1245
|
)
|
|
1246
1246
|
|
|
1247
|
-
def rle(self, table_name: str = None, mode
|
|
1248
|
-
|
|
1249
|
-
|
|
1247
|
+
def rle(self, table_name: str = None, mode = "natural",
|
|
1248
|
+
min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
|
|
1249
|
+
max_ordering_depth: int = 3, limit: int = None):
|
|
1250
1250
|
"""
|
|
1251
1251
|
Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
|
|
1252
1252
|
|
|
@@ -1254,38 +1254,48 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1254
1254
|
table_name: Name of the table to analyze. Can be:
|
|
1255
1255
|
- 'table_name' (uses current schema)
|
|
1256
1256
|
- 'schema.table_name' (specific schema)
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
- "
|
|
1260
|
-
- "
|
|
1261
|
-
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
num_segments: Number of segments for stratified sampling
|
|
1267
|
-
segment_size: Size of each segment for sampling
|
|
1257
|
+
mode: Analysis mode or column ordering:
|
|
1258
|
+
- "natural": Calculate RLE for natural order only (fastest)
|
|
1259
|
+
- "auto": Natural order + cardinality-based ordering (recommended)
|
|
1260
|
+
- "advanced": Natural + cardinality + greedy incremental search (most thorough)
|
|
1261
|
+
- List[str]: Specific column ordering to test, e.g., ['date', 'duid']
|
|
1262
|
+
min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
|
|
1263
|
+
max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
|
|
1264
|
+
max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
|
|
1265
|
+
limit: Optional row limit for testing/development (default: None, analyzes all rows)
|
|
1268
1266
|
|
|
1269
1267
|
Returns:
|
|
1270
1268
|
DataFrame with RLE analysis results
|
|
1271
1269
|
|
|
1272
1270
|
Examples:
|
|
1273
|
-
#
|
|
1271
|
+
# Natural order only (baseline)
|
|
1274
1272
|
con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
|
|
1275
|
-
con.rle("mytable") #
|
|
1276
|
-
con.rle("mytable", "summary")
|
|
1273
|
+
con.rle("mytable") # same as con.rle("mytable", "natural")
|
|
1277
1274
|
|
|
1278
|
-
#
|
|
1279
|
-
con.rle("mytable", "
|
|
1275
|
+
# Auto optimization (natural + cardinality-based)
|
|
1276
|
+
con.rle("mytable", "auto")
|
|
1277
|
+
|
|
1278
|
+
# Advanced optimization (greedy incremental search)
|
|
1279
|
+
con.rle("mytable", "advanced")
|
|
1280
|
+
|
|
1281
|
+
# Test specific column ordering
|
|
1282
|
+
con.rle("mytable", ["date", "duid"])
|
|
1283
|
+
con.rle("mytable", ["cutoff", "time", "DUID", "date"])
|
|
1284
|
+
|
|
1285
|
+
# Advanced with custom depth
|
|
1286
|
+
con.rle("mytable", "advanced", max_ordering_depth=4)
|
|
1280
1287
|
|
|
1281
1288
|
# Analyze table from different schema
|
|
1282
|
-
con.rle("otherschema.mytable", "
|
|
1289
|
+
con.rle("otherschema.mytable", "auto")
|
|
1283
1290
|
|
|
1284
|
-
#
|
|
1285
|
-
con.rle("mytable", "
|
|
1291
|
+
# Custom thresholds for small tables
|
|
1292
|
+
con.rle("mytable", "auto", max_cardinality_pct=0.05)
|
|
1293
|
+
|
|
1294
|
+
# Limit rows for testing
|
|
1295
|
+
con.rle("mytable", "auto", limit=10000)
|
|
1286
1296
|
"""
|
|
1287
1297
|
from .rle import (
|
|
1288
|
-
|
|
1298
|
+
calculate_cardinality_ratio,
|
|
1289
1299
|
test_column_orderings_smart,
|
|
1290
1300
|
calculate_rle_for_columns
|
|
1291
1301
|
)
|
|
@@ -1310,7 +1320,7 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1310
1320
|
# Construct the full table path using the same logic as get_stats
|
|
1311
1321
|
table_path = f"{self.table_base_url}{schema_name}/{tbl}"
|
|
1312
1322
|
|
|
1313
|
-
#
|
|
1323
|
+
# Verify table exists and is not empty
|
|
1314
1324
|
print(f"📊 Analyzing table: {schema_name}.{tbl}")
|
|
1315
1325
|
|
|
1316
1326
|
try:
|
|
@@ -1321,40 +1331,102 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1321
1331
|
print("⚠️ Table is empty (no files)")
|
|
1322
1332
|
return None
|
|
1323
1333
|
|
|
1324
|
-
# Construct full paths for parquet files
|
|
1325
|
-
parquet_paths = [table_path + "/" + f for f in delta_files]
|
|
1326
|
-
|
|
1327
1334
|
except Exception as e:
|
|
1328
1335
|
print(f"❌ Error accessing Delta table: {e}")
|
|
1329
1336
|
return None
|
|
1330
1337
|
|
|
1331
|
-
#
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1338
|
+
# Check if mode is a list of columns (custom ordering)
|
|
1339
|
+
if isinstance(mode, list):
|
|
1340
|
+
# User wants to test a specific column ordering
|
|
1341
|
+
print(f"Testing custom column ordering: {', '.join(mode)}")
|
|
1342
|
+
|
|
1343
|
+
# Calculate cardinality for NDV values
|
|
1344
|
+
card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
|
|
1345
|
+
|
|
1346
|
+
# Calculate RLE for the specified ordering
|
|
1347
|
+
rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
|
|
1348
|
+
|
|
1349
|
+
total_rle_all = sum(rle_counts.values())
|
|
1350
|
+
|
|
1351
|
+
print(f"\nResults:")
|
|
1352
|
+
print(f" Custom ordering: [{', '.join(mode)}]")
|
|
1353
|
+
print(f" Total RLE (all columns): {total_rle_all:,} runs")
|
|
1354
|
+
|
|
1355
|
+
# Return as DataFrame for consistency
|
|
1337
1356
|
import pandas as pd
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1357
|
+
results = [{
|
|
1358
|
+
'schema': schema_name,
|
|
1359
|
+
'table': tbl,
|
|
1360
|
+
'sort_order': 'custom',
|
|
1361
|
+
'columns_used': ', '.join(mode),
|
|
1362
|
+
'total_rle_all': total_rle_all,
|
|
1363
|
+
**rle_counts
|
|
1364
|
+
}]
|
|
1365
|
+
|
|
1366
|
+
df = pd.DataFrame(results)
|
|
1367
|
+
|
|
1368
|
+
# Transform to long format
|
|
1369
|
+
long_format_results = []
|
|
1370
|
+
|
|
1371
|
+
for _, row in df.iterrows():
|
|
1372
|
+
schema_val = row['schema']
|
|
1373
|
+
table_val = row['table']
|
|
1374
|
+
sort_order = row['sort_order']
|
|
1375
|
+
columns_used = row['columns_used']
|
|
1376
|
+
total_rle_all_val = row['total_rle_all']
|
|
1377
|
+
|
|
1378
|
+
# Get all column names except metadata columns
|
|
1379
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
1380
|
+
data_columns = [col for col in df.columns if col not in metadata_cols]
|
|
1381
|
+
|
|
1382
|
+
# Get total rows from card_stats if available
|
|
1383
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
1384
|
+
|
|
1385
|
+
# Parse the columns_used to get ordering
|
|
1386
|
+
sort_columns_list = [c.strip() for c in columns_used.split(',')]
|
|
1387
|
+
|
|
1388
|
+
# Create one row per data column
|
|
1389
|
+
for col in data_columns:
|
|
1390
|
+
rle_value = row[col]
|
|
1391
|
+
|
|
1392
|
+
# Get NDV from card_stats
|
|
1393
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
1394
|
+
|
|
1395
|
+
# Determine if column was included in the sort and its position
|
|
1396
|
+
is_in_sort = col in sort_columns_list
|
|
1397
|
+
order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
|
|
1398
|
+
comment = '' if is_in_sort else 'not included in the sort'
|
|
1399
|
+
|
|
1400
|
+
long_format_results.append({
|
|
1401
|
+
'schema': schema_val,
|
|
1402
|
+
'table': table_val,
|
|
1403
|
+
'sort_type': sort_order,
|
|
1404
|
+
'column': col,
|
|
1405
|
+
'order': order_position,
|
|
1406
|
+
'RLE': rle_value,
|
|
1407
|
+
'NDV': ndv_value,
|
|
1408
|
+
'total_rows': total_rows,
|
|
1409
|
+
'total_RLE': total_rle_all_val,
|
|
1410
|
+
'comments': comment
|
|
1411
|
+
})
|
|
1412
|
+
|
|
1413
|
+
long_df = pd.DataFrame(long_format_results)
|
|
1414
|
+
|
|
1415
|
+
return long_df
|
|
1416
|
+
|
|
1417
|
+
# All modes now use test_column_orderings_smart with the mode parameter
|
|
1418
|
+
return test_column_orderings_smart(
|
|
1419
|
+
self.con,
|
|
1420
|
+
table_path,
|
|
1421
|
+
table_name=table_name, # Pass table name for cardinality calculation on full dataset
|
|
1422
|
+
mode=mode,
|
|
1423
|
+
limit=limit,
|
|
1424
|
+
min_distinct_threshold=min_distinct_threshold,
|
|
1425
|
+
max_cardinality_pct=max_cardinality_pct,
|
|
1426
|
+
max_ordering_depth=max_ordering_depth,
|
|
1427
|
+
schema_name=schema_name,
|
|
1428
|
+
table_display_name=tbl
|
|
1429
|
+
)
|
|
1358
1430
|
|
|
1359
1431
|
def close(self):
|
|
1360
1432
|
"""Close DuckDB connection"""
|