duckrun 0.2.19.dev2__tar.gz → 0.2.19.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/core.py +34 -53
- duckrun-0.2.19.dev3/duckrun/rle.py +713 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun.egg-info/SOURCES.txt +1 -2
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/pyproject.toml +1 -1
- duckrun-0.2.19.dev2/duckrun/rle.py +0 -521
- duckrun-0.2.19.dev2/tests/test_rle.py +0 -10
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/LICENSE +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/README.md +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/__init__.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev3}/setup.cfg +0 -0
|
@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1244
1244
|
refresh=refresh
|
|
1245
1245
|
)
|
|
1246
1246
|
|
|
1247
|
-
def rle(self, table_name: str = None, mode: str = "
|
|
1248
|
-
|
|
1249
|
-
|
|
1247
|
+
def rle(self, table_name: str = None, mode: str = "natural",
|
|
1248
|
+
min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
|
|
1249
|
+
max_ordering_depth: int = 3):
|
|
1250
1250
|
"""
|
|
1251
1251
|
Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
|
|
1252
1252
|
|
|
@@ -1254,38 +1254,39 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1254
1254
|
table_name: Name of the table to analyze. Can be:
|
|
1255
1255
|
- 'table_name' (uses current schema)
|
|
1256
1256
|
- 'schema.table_name' (specific schema)
|
|
1257
|
-
- None (analyzes all tables in current schema - summary only)
|
|
1258
1257
|
mode: Analysis mode:
|
|
1259
|
-
- "
|
|
1260
|
-
- "
|
|
1261
|
-
- "
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
use_stratified_sampling: If True, use stratified sampling across entire file (recommended)
|
|
1266
|
-
num_segments: Number of segments for stratified sampling
|
|
1267
|
-
segment_size: Size of each segment for sampling
|
|
1258
|
+
- "natural": Calculate RLE for natural order only (default, fastest)
|
|
1259
|
+
- "auto": Natural order + cardinality-based ordering (recommended)
|
|
1260
|
+
- "advanced": Natural + cardinality + greedy incremental search (most thorough)
|
|
1261
|
+
min_distinct_threshold: Exclude columns with fewer distinct values than this threshold (default: 2)
|
|
1262
|
+
max_cardinality_pct: Exclude columns whose cardinality exceeds this fraction (default: 0.01, i.e. 1%)
|
|
1263
|
+
max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
|
|
1268
1264
|
|
|
1269
1265
|
Returns:
|
|
1270
1266
|
DataFrame with RLE analysis results
|
|
1271
1267
|
|
|
1272
1268
|
Examples:
|
|
1273
|
-
#
|
|
1269
|
+
# Natural order only (baseline)
|
|
1274
1270
|
con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
|
|
1275
|
-
con.rle("mytable") #
|
|
1276
|
-
con.rle("mytable", "summary")
|
|
1271
|
+
con.rle("mytable") # same as con.rle("mytable", "natural")
|
|
1277
1272
|
|
|
1278
|
-
#
|
|
1279
|
-
con.rle("mytable", "
|
|
1273
|
+
# Auto optimization (natural + cardinality-based)
|
|
1274
|
+
con.rle("mytable", "auto")
|
|
1275
|
+
|
|
1276
|
+
# Advanced optimization (greedy incremental search)
|
|
1277
|
+
con.rle("mytable", "advanced")
|
|
1278
|
+
|
|
1279
|
+
# Advanced with custom depth
|
|
1280
|
+
con.rle("mytable", "advanced", max_ordering_depth=4)
|
|
1280
1281
|
|
|
1281
1282
|
# Analyze table from different schema
|
|
1282
|
-
con.rle("otherschema.mytable", "
|
|
1283
|
+
con.rle("otherschema.mytable", "auto")
|
|
1283
1284
|
|
|
1284
|
-
#
|
|
1285
|
-
con.rle("mytable", "
|
|
1285
|
+
# Custom thresholds for small tables
|
|
1286
|
+
con.rle("mytable", "auto", max_cardinality_pct=0.05)
|
|
1286
1287
|
"""
|
|
1287
1288
|
from .rle import (
|
|
1288
|
-
|
|
1289
|
+
calculate_cardinality_ratio,
|
|
1289
1290
|
test_column_orderings_smart,
|
|
1290
1291
|
calculate_rle_for_columns
|
|
1291
1292
|
)
|
|
@@ -1310,7 +1311,7 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1310
1311
|
# Construct the full table path using the same logic as get_stats
|
|
1311
1312
|
table_path = f"{self.table_base_url}{schema_name}/{tbl}"
|
|
1312
1313
|
|
|
1313
|
-
#
|
|
1314
|
+
# Verify table exists and is not empty
|
|
1314
1315
|
print(f"📊 Analyzing table: {schema_name}.{tbl}")
|
|
1315
1316
|
|
|
1316
1317
|
try:
|
|
@@ -1321,40 +1322,20 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1321
1322
|
print("⚠️ Table is empty (no files)")
|
|
1322
1323
|
return None
|
|
1323
1324
|
|
|
1324
|
-
# Construct full paths for parquet files
|
|
1325
|
-
parquet_paths = [table_path + "/" + f for f in delta_files]
|
|
1326
|
-
|
|
1327
1325
|
except Exception as e:
|
|
1328
1326
|
print(f"❌ Error accessing Delta table: {e}")
|
|
1329
1327
|
return None
|
|
1330
1328
|
|
|
1331
|
-
#
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
#
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
])
|
|
1342
|
-
return df
|
|
1343
|
-
|
|
1344
|
-
elif mode in ["smart", "full"]:
|
|
1345
|
-
# Smart or full RLE analysis
|
|
1346
|
-
return test_column_orderings_smart(
|
|
1347
|
-
self.con,
|
|
1348
|
-
parquet_path,
|
|
1349
|
-
limit=limit,
|
|
1350
|
-
max_combinations=max_combinations,
|
|
1351
|
-
use_stratified_sampling=use_stratified_sampling,
|
|
1352
|
-
num_segments=num_segments,
|
|
1353
|
-
segment_size=segment_size
|
|
1354
|
-
)
|
|
1355
|
-
else:
|
|
1356
|
-
print(f"❌ Unknown mode: {mode}. Use 'summary', 'smart', or 'full'")
|
|
1357
|
-
return None
|
|
1329
|
+
# All modes now use test_column_orderings_smart with the mode parameter
|
|
1330
|
+
return test_column_orderings_smart(
|
|
1331
|
+
self.con,
|
|
1332
|
+
table_path,
|
|
1333
|
+
table_name=table_name, # Pass table name for cardinality calculation on full dataset
|
|
1334
|
+
mode=mode,
|
|
1335
|
+
min_distinct_threshold=min_distinct_threshold,
|
|
1336
|
+
max_cardinality_pct=max_cardinality_pct,
|
|
1337
|
+
max_ordering_depth=max_ordering_depth
|
|
1338
|
+
)
|
|
1358
1339
|
|
|
1359
1340
|
def close(self):
|
|
1360
1341
|
"""Close DuckDB connection"""
|