duckrun 0.2.18.dev4__tar.gz → 0.2.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/PKG-INFO +1 -1
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/__init__.py +3 -2
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/core.py +214 -6
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/notebook.py +2 -0
- duckrun-0.2.19/duckrun/rle.py +940 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/stats.py +194 -67
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun.egg-info/SOURCES.txt +4 -1
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/pyproject.toml +1 -1
- duckrun-0.2.19/tests/test_register.py +275 -0
- duckrun-0.2.19/tests/test_rle.py +16 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/LICENSE +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/README.md +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/auth.py +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/files.py +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/runner.py +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun/writer.py +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.18.dev4 → duckrun-0.2.19}/setup.cfg +0 -0
|
@@ -2,10 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
from duckrun.core import Duckrun
|
|
4
4
|
from duckrun.notebook import import_notebook_from_web, import_notebook
|
|
5
|
+
from duckrun import rle
|
|
5
6
|
|
|
6
|
-
__version__ = "0.2.18.dev4"
|
|
7
|
+
__version__ = "0.2.18"
|
|
7
8
|
|
|
8
9
|
# Expose unified connect method at module level
|
|
9
10
|
connect = Duckrun.connect
|
|
10
11
|
|
|
11
|
-
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
|
|
12
|
+
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
|
|
@@ -1035,7 +1035,22 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1035
1035
|
"""Get underlying DuckDB connection"""
|
|
1036
1036
|
return self.con
|
|
1037
1037
|
|
|
1038
|
-
def get_stats(self, source: str = None):
|
|
1038
|
+
def register(self, name: str, df):
|
|
1039
|
+
"""
|
|
1040
|
+
Register a pandas DataFrame as a virtual table in DuckDB.
|
|
1041
|
+
|
|
1042
|
+
Args:
|
|
1043
|
+
name: Name for the virtual table
|
|
1044
|
+
df: pandas DataFrame to register
|
|
1045
|
+
|
|
1046
|
+
Example:
|
|
1047
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse")
|
|
1048
|
+
con.register("tb", df)
|
|
1049
|
+
con.sql("SELECT * FROM tb").show()
|
|
1050
|
+
"""
|
|
1051
|
+
self.con.register(name, df)
|
|
1052
|
+
|
|
1053
|
+
def get_stats(self, source: str = None, detailed = False):
|
|
1039
1054
|
"""
|
|
1040
1055
|
Get comprehensive statistics for Delta Lake tables.
|
|
1041
1056
|
|
|
@@ -1045,27 +1060,34 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1045
1060
|
- Table name: 'table_name' (uses current schema)
|
|
1046
1061
|
- Schema.table: 'schema.table_name' (specific table in schema)
|
|
1047
1062
|
- Schema only: 'schema' (all tables in schema)
|
|
1063
|
+
detailed: Optional. Controls the level of detail in statistics:
|
|
1064
|
+
- False (default): Aggregated table-level stats
|
|
1065
|
+
- True: Row group level statistics with compression details
|
|
1048
1066
|
|
|
1049
1067
|
Returns:
|
|
1050
|
-
|
|
1051
|
-
|
|
1068
|
+
DataFrame with statistics based on detailed parameter:
|
|
1069
|
+
- If detailed=False: Aggregated table-level summary
|
|
1070
|
+
- If detailed=True: Granular file and row group level stats
|
|
1052
1071
|
|
|
1053
1072
|
Examples:
|
|
1054
1073
|
con = duckrun.connect("tmp/data.lakehouse/aemo")
|
|
1055
1074
|
|
|
1056
|
-
# All tables in current schema (aemo)
|
|
1075
|
+
# All tables in current schema (aemo) - aggregated
|
|
1057
1076
|
stats = con.get_stats()
|
|
1058
1077
|
|
|
1059
|
-
# Single table in current schema
|
|
1078
|
+
# Single table in current schema - aggregated
|
|
1060
1079
|
stats = con.get_stats('price')
|
|
1061
1080
|
|
|
1081
|
+
# Single table with detailed row group statistics
|
|
1082
|
+
stats_detailed = con.get_stats('price', detailed=True)
|
|
1083
|
+
|
|
1062
1084
|
# Specific table in different schema
|
|
1063
1085
|
stats = con.get_stats('aemo.price')
|
|
1064
1086
|
|
|
1065
1087
|
# All tables in a schema
|
|
1066
1088
|
stats = con.get_stats('aemo')
|
|
1067
1089
|
"""
|
|
1068
|
-
return _get_stats(self, source)
|
|
1090
|
+
return _get_stats(self, source, detailed)
|
|
1069
1091
|
|
|
1070
1092
|
def list_lakehouses(self) -> List[str]:
|
|
1071
1093
|
"""
|
|
@@ -1237,8 +1259,194 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1237
1259
|
refresh=refresh
|
|
1238
1260
|
)
|
|
1239
1261
|
|
|
1262
|
+
def rle(self, table_name: str = None, mode = "natural",
|
|
1263
|
+
min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
|
|
1264
|
+
max_ordering_depth: int = 3, limit: int = None):
|
|
1265
|
+
"""
|
|
1266
|
+
Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
|
|
1267
|
+
|
|
1268
|
+
Args:
|
|
1269
|
+
table_name: Name of the table to analyze. Can be:
|
|
1270
|
+
- 'table_name' (uses current schema)
|
|
1271
|
+
- 'schema.table_name' (specific schema)
|
|
1272
|
+
mode: Analysis mode or column ordering:
|
|
1273
|
+
- "natural": Calculate RLE for natural order only (fastest)
|
|
1274
|
+
- "auto": Natural order + cardinality-based ordering (recommended)
|
|
1275
|
+
- "advanced": Natural + cardinality + greedy incremental search (most thorough)
|
|
1276
|
+
- List[str]: Specific column ordering to test, e.g., ['date', 'duid']
|
|
1277
|
+
min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
|
|
1278
|
+
max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
|
|
1279
|
+
max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
|
|
1280
|
+
limit: Optional row limit for testing/development (default: None, analyzes all rows)
|
|
1281
|
+
|
|
1282
|
+
Returns:
|
|
1283
|
+
DataFrame with RLE analysis results
|
|
1284
|
+
|
|
1285
|
+
Examples:
|
|
1286
|
+
# Natural order only (baseline)
|
|
1287
|
+
con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
|
|
1288
|
+
con.rle("mytable") # same as con.rle("mytable", "natural")
|
|
1289
|
+
|
|
1290
|
+
# Auto optimization (natural + cardinality-based)
|
|
1291
|
+
con.rle("mytable", "auto")
|
|
1292
|
+
|
|
1293
|
+
# Advanced optimization (greedy incremental search)
|
|
1294
|
+
con.rle("mytable", "advanced")
|
|
1295
|
+
|
|
1296
|
+
# Test specific column ordering
|
|
1297
|
+
con.rle("mytable", ["date", "duid"])
|
|
1298
|
+
con.rle("mytable", ["cutoff", "time", "DUID", "date"])
|
|
1299
|
+
|
|
1300
|
+
# Advanced with custom depth
|
|
1301
|
+
con.rle("mytable", "advanced", max_ordering_depth=4)
|
|
1302
|
+
|
|
1303
|
+
# Analyze table from different schema
|
|
1304
|
+
con.rle("otherschema.mytable", "auto")
|
|
1305
|
+
|
|
1306
|
+
# Custom thresholds for small tables
|
|
1307
|
+
con.rle("mytable", "auto", max_cardinality_pct=0.05)
|
|
1308
|
+
|
|
1309
|
+
# Limit rows for testing
|
|
1310
|
+
con.rle("mytable", "auto", limit=10000)
|
|
1311
|
+
"""
|
|
1312
|
+
from .rle import (
|
|
1313
|
+
calculate_cardinality_ratio,
|
|
1314
|
+
test_column_orderings_smart,
|
|
1315
|
+
calculate_rle_for_columns
|
|
1316
|
+
)
|
|
1317
|
+
from deltalake import DeltaTable
|
|
1318
|
+
|
|
1319
|
+
# Parse table name and construct path
|
|
1320
|
+
if table_name is None:
|
|
1321
|
+
if mode != "summary":
|
|
1322
|
+
print("⚠️ Table name is required for 'smart' and 'full' modes")
|
|
1323
|
+
return None
|
|
1324
|
+
# TODO: Implement all-tables summary
|
|
1325
|
+
print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
|
|
1326
|
+
return None
|
|
1327
|
+
|
|
1328
|
+
# Parse schema.table or just table
|
|
1329
|
+
if '.' in table_name:
|
|
1330
|
+
schema_name, tbl = table_name.split('.', 1)
|
|
1331
|
+
else:
|
|
1332
|
+
schema_name = self.schema
|
|
1333
|
+
tbl = table_name
|
|
1334
|
+
|
|
1335
|
+
# Construct the full table path using the same logic as get_stats
|
|
1336
|
+
table_path = f"{self.table_base_url}{schema_name}/{tbl}"
|
|
1337
|
+
|
|
1338
|
+
# Verify table exists and is not empty
|
|
1339
|
+
print(f"📊 Analyzing table: {schema_name}.{tbl}")
|
|
1340
|
+
|
|
1341
|
+
try:
|
|
1342
|
+
dt = DeltaTable(table_path)
|
|
1343
|
+
delta_files = dt.files()
|
|
1344
|
+
|
|
1345
|
+
if not delta_files:
|
|
1346
|
+
print("⚠️ Table is empty (no files)")
|
|
1347
|
+
return None
|
|
1348
|
+
|
|
1349
|
+
except Exception as e:
|
|
1350
|
+
print(f"❌ Error accessing Delta table: {e}")
|
|
1351
|
+
return None
|
|
1352
|
+
|
|
1353
|
+
# Check if mode is a list of columns (custom ordering)
|
|
1354
|
+
if isinstance(mode, list):
|
|
1355
|
+
# User wants to test a specific column ordering
|
|
1356
|
+
print(f"Testing custom column ordering: {', '.join(mode)}")
|
|
1357
|
+
|
|
1358
|
+
# Calculate cardinality for NDV values
|
|
1359
|
+
card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
|
|
1360
|
+
|
|
1361
|
+
# Calculate RLE for the specified ordering
|
|
1362
|
+
rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
|
|
1363
|
+
|
|
1364
|
+
total_rle_all = sum(rle_counts.values())
|
|
1365
|
+
|
|
1366
|
+
print(f"\nResults:")
|
|
1367
|
+
print(f" Custom ordering: [{', '.join(mode)}]")
|
|
1368
|
+
print(f" Total RLE (all columns): {total_rle_all:,} runs")
|
|
1369
|
+
|
|
1370
|
+
# Return as DataFrame for consistency
|
|
1371
|
+
import pandas as pd
|
|
1372
|
+
results = [{
|
|
1373
|
+
'schema': schema_name,
|
|
1374
|
+
'table': tbl,
|
|
1375
|
+
'sort_order': 'custom',
|
|
1376
|
+
'columns_used': ', '.join(mode),
|
|
1377
|
+
'total_rle_all': total_rle_all,
|
|
1378
|
+
**rle_counts
|
|
1379
|
+
}]
|
|
1380
|
+
|
|
1381
|
+
df = pd.DataFrame(results)
|
|
1382
|
+
|
|
1383
|
+
# Transform to long format
|
|
1384
|
+
long_format_results = []
|
|
1385
|
+
|
|
1386
|
+
for _, row in df.iterrows():
|
|
1387
|
+
schema_val = row['schema']
|
|
1388
|
+
table_val = row['table']
|
|
1389
|
+
sort_order = row['sort_order']
|
|
1390
|
+
columns_used = row['columns_used']
|
|
1391
|
+
total_rle_all_val = row['total_rle_all']
|
|
1392
|
+
|
|
1393
|
+
# Get all column names except metadata columns
|
|
1394
|
+
metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
|
|
1395
|
+
data_columns = [col for col in df.columns if col not in metadata_cols]
|
|
1396
|
+
|
|
1397
|
+
# Get total rows from card_stats if available
|
|
1398
|
+
total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
|
|
1399
|
+
|
|
1400
|
+
# Parse the columns_used to get ordering
|
|
1401
|
+
sort_columns_list = [c.strip() for c in columns_used.split(',')]
|
|
1402
|
+
|
|
1403
|
+
# Create one row per data column
|
|
1404
|
+
for col in data_columns:
|
|
1405
|
+
rle_value = row[col]
|
|
1406
|
+
|
|
1407
|
+
# Get NDV from card_stats
|
|
1408
|
+
ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
|
|
1409
|
+
|
|
1410
|
+
# Determine if column was included in the sort and its position
|
|
1411
|
+
is_in_sort = col in sort_columns_list
|
|
1412
|
+
order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
|
|
1413
|
+
comment = '' if is_in_sort else 'not included in the sort'
|
|
1414
|
+
|
|
1415
|
+
long_format_results.append({
|
|
1416
|
+
'schema': schema_val,
|
|
1417
|
+
'table': table_val,
|
|
1418
|
+
'sort_type': sort_order,
|
|
1419
|
+
'column': col,
|
|
1420
|
+
'order': order_position,
|
|
1421
|
+
'RLE': rle_value,
|
|
1422
|
+
'NDV': ndv_value,
|
|
1423
|
+
'total_rows': total_rows,
|
|
1424
|
+
'total_RLE': total_rle_all_val,
|
|
1425
|
+
'comments': comment
|
|
1426
|
+
})
|
|
1427
|
+
|
|
1428
|
+
long_df = pd.DataFrame(long_format_results)
|
|
1429
|
+
|
|
1430
|
+
return long_df
|
|
1431
|
+
|
|
1432
|
+
# All modes now use test_column_orderings_smart with the mode parameter
|
|
1433
|
+
return test_column_orderings_smart(
|
|
1434
|
+
self.con,
|
|
1435
|
+
table_path,
|
|
1436
|
+
table_name=table_name, # Pass table name for cardinality calculation on full dataset
|
|
1437
|
+
mode=mode,
|
|
1438
|
+
limit=limit,
|
|
1439
|
+
min_distinct_threshold=min_distinct_threshold,
|
|
1440
|
+
max_cardinality_pct=max_cardinality_pct,
|
|
1441
|
+
max_ordering_depth=max_ordering_depth,
|
|
1442
|
+
schema_name=schema_name,
|
|
1443
|
+
table_display_name=tbl,
|
|
1444
|
+
duckrun_instance=self # Pass duckrun instance for detailed parquet stats
|
|
1445
|
+
)
|
|
1446
|
+
|
|
1240
1447
|
def close(self):
|
|
1241
1448
|
"""Close DuckDB connection"""
|
|
1449
|
+
|
|
1242
1450
|
if self.con:
|
|
1243
1451
|
self.con.close()
|
|
1244
1452
|
print("Connection closed")
|
|
@@ -160,6 +160,7 @@ def import_notebook_from_web(
|
|
|
160
160
|
update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
|
|
161
161
|
payload = {
|
|
162
162
|
"definition": {
|
|
163
|
+
"format": "ipynb",
|
|
163
164
|
"parts": [
|
|
164
165
|
{
|
|
165
166
|
"path": "notebook-content.py",
|
|
@@ -192,6 +193,7 @@ def import_notebook_from_web(
|
|
|
192
193
|
payload = {
|
|
193
194
|
"displayName": notebook_name,
|
|
194
195
|
"definition": {
|
|
196
|
+
"format": "ipynb",
|
|
195
197
|
"parts": [
|
|
196
198
|
{
|
|
197
199
|
"path": "notebook-content.py",
|