duckrun 0.2.18.dev5__tar.gz → 0.2.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.18.dev5
3
+ Version: 0.2.19
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -2,10 +2,11 @@
2
2
 
3
3
  from duckrun.core import Duckrun
4
4
  from duckrun.notebook import import_notebook_from_web, import_notebook
5
+ from duckrun import rle
5
6
 
6
- __version__ = "0.2.18.dev2"
7
# Keep in sync with the "Version" field of the package metadata for this release.
__version__ = "0.2.19"
7
8
 
8
9
  # Expose unified connect method at module level
9
10
  connect = Duckrun.connect
10
11
 
11
- __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
12
+ __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
@@ -1035,7 +1035,22 @@ class Duckrun(WorkspaceOperationsMixin):
1035
1035
  """Get underlying DuckDB connection"""
1036
1036
  return self.con
1037
1037
 
1038
- def get_stats(self, source: str = None):
1038
def register(self, name: str, df):
    """
    Make a pandas DataFrame queryable from DuckDB under the given name.

    The call is forwarded unchanged to the underlying DuckDB connection
    held in ``self.con``; nothing is returned.

    Args:
        name: Name under which the virtual table is exposed
        df: pandas DataFrame backing the virtual table

    Example:
        con = duckrun.connect("workspace/lakehouse.lakehouse")
        con.register("tb", df)
        con.sql("SELECT * FROM tb").show()
    """
    duck_connection = self.con
    duck_connection.register(name, df)
1052
+
1053
+ def get_stats(self, source: str = None, detailed = False):
1039
1054
  """
1040
1055
  Get comprehensive statistics for Delta Lake tables.
1041
1056
 
@@ -1045,27 +1060,34 @@ class Duckrun(WorkspaceOperationsMixin):
1045
1060
  - Table name: 'table_name' (uses current schema)
1046
1061
  - Schema.table: 'schema.table_name' (specific table in schema)
1047
1062
  - Schema only: 'schema' (all tables in schema)
1063
+ detailed: Optional. Controls the level of detail in statistics:
1064
+ - False (default): Aggregated table-level stats
1065
+ - True: Row group level statistics with compression details
1048
1066
 
1049
1067
  Returns:
1050
- Arrow table with statistics including total rows, file count, row groups,
1051
- average row group size, file sizes, VORDER status, and timestamp
1068
+ DataFrame with statistics based on detailed parameter:
1069
+ - If detailed=False: Aggregated table-level summary
1070
+ - If detailed=True: Granular file and row group level stats
1052
1071
 
1053
1072
  Examples:
1054
1073
  con = duckrun.connect("tmp/data.lakehouse/aemo")
1055
1074
 
1056
- # All tables in current schema (aemo)
1075
+ # All tables in current schema (aemo) - aggregated
1057
1076
  stats = con.get_stats()
1058
1077
 
1059
- # Single table in current schema
1078
+ # Single table in current schema - aggregated
1060
1079
  stats = con.get_stats('price')
1061
1080
 
1081
+ # Single table with detailed row group statistics
1082
+ stats_detailed = con.get_stats('price', detailed=True)
1083
+
1062
1084
  # Specific table in different schema
1063
1085
  stats = con.get_stats('aemo.price')
1064
1086
 
1065
1087
  # All tables in a schema
1066
1088
  stats = con.get_stats('aemo')
1067
1089
  """
1068
- return _get_stats(self, source)
1090
+ return _get_stats(self, source, detailed)
1069
1091
 
1070
1092
  def list_lakehouses(self) -> List[str]:
1071
1093
  """
@@ -1237,8 +1259,194 @@ class Duckrun(WorkspaceOperationsMixin):
1237
1259
  refresh=refresh
1238
1260
  )
1239
1261
 
1262
def rle(self, table_name: str = None, mode="natural",
        min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
        max_ordering_depth: int = 3, limit: int = None):
    """
    Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.

    Args:
        table_name: Name of the table to analyze. Can be:
            - 'table_name' (uses current schema)
            - 'schema.table_name' (specific schema)
        mode: Analysis mode or column ordering:
            - "natural": Calculate RLE for natural order only (fastest)
            - "auto": Natural order + cardinality-based ordering (recommended)
            - "advanced": Natural + cardinality + greedy incremental search (most thorough)
            - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
        min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
        max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
        max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
        limit: Optional row limit for testing/development (default: None, analyzes all rows)

    Returns:
        DataFrame with RLE analysis results. For a custom column ordering
        (``mode`` given as a list) the result is in long format, one row per
        column, with the keys: schema, table, sort_type, column, order, RLE,
        NDV, total_rows, total_RLE, comments.
        Returns None (after printing a message) when no table name is given,
        when the Delta table has no files, or when the Delta table cannot
        be opened.

    Examples:
        # Natural order only (baseline)
        con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
        con.rle("mytable")  # same as con.rle("mytable", "natural")

        # Auto optimization (natural + cardinality-based)
        con.rle("mytable", "auto")

        # Advanced optimization (greedy incremental search)
        con.rle("mytable", "advanced")

        # Test specific column ordering
        con.rle("mytable", ["date", "duid"])
        con.rle("mytable", ["cutoff", "time", "DUID", "date"])

        # Advanced with custom depth
        con.rle("mytable", "advanced", max_ordering_depth=4)

        # Analyze table from different schema
        con.rle("otherschema.mytable", "auto")

        # Custom thresholds for small tables
        con.rle("mytable", "auto", max_cardinality_pct=0.05)

        # Limit rows for testing
        con.rle("mytable", "auto", limit=10000)
    """
    # Validate arguments first so a missing table name is reported without
    # paying for (or depending on) the heavier imports below.
    if table_name is None:
        if mode != "summary":
            print("⚠️ Table name is required. Please specify a table name.")
            return None
        # TODO: Implement all-tables summary
        print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
        return None

    from .rle import (
        calculate_cardinality_ratio,
        test_column_orderings_smart,
        calculate_rle_for_columns
    )
    from deltalake import DeltaTable

    # Parse schema.table or just table
    if '.' in table_name:
        schema_name, tbl = table_name.split('.', 1)
    else:
        schema_name = self.schema
        tbl = table_name

    # Construct the full table path using the same logic as get_stats
    table_path = f"{self.table_base_url}{schema_name}/{tbl}"

    # Verify table exists and is not empty
    print(f"📊 Analyzing table: {schema_name}.{tbl}")

    try:
        dt = DeltaTable(table_path)
        delta_files = dt.files()

        if not delta_files:
            print("⚠️ Table is empty (no files)")
            return None

    except Exception as e:
        # Best effort by design: report the problem and return None
        # instead of propagating storage/Delta errors to the caller.
        print(f"❌ Error accessing Delta table: {e}")
        return None

    # Check if mode is a list of columns (custom ordering)
    if isinstance(mode, list):
        # User wants to test a specific column ordering
        print(f"Testing custom column ordering: {', '.join(mode)}")

        # Calculate cardinality for NDV values
        card_stats = calculate_cardinality_ratio(
            self.con,
            table_name if table_name else f"delta_scan('{table_path}')",
            is_parquet=False
        )

        # Calculate RLE for the specified ordering
        rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)

        total_rle_all = sum(rle_counts.values())

        print("\nResults:")
        print(f" Custom ordering: [{', '.join(mode)}]")
        print(f" Total RLE (all columns): {total_rle_all:,} runs")

        # Return as DataFrame for consistency
        import pandas as pd

        # Iterate the RLE counts directly. Going through a wide one-row
        # DataFrame would let a table column named like a metadata field
        # (e.g. 'schema' or 'table') overwrite that field and vanish from
        # the result.
        data_columns = list(rle_counts)
        columns_used = ', '.join(mode)

        # Get total rows from card_stats if available
        first_col = data_columns[0] if data_columns else None
        total_rows = (
            card_stats[first_col]['total_rows']
            if card_stats and first_col in card_stats
            else None
        )

        # Create one row per data column (long format)
        long_format_results = []
        for col in data_columns:
            # Get NDV from card_stats
            ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None

            # Determine if column was included in the sort and its position.
            # Use the caller's list itself so names containing commas or
            # surrounding whitespace are matched exactly.
            is_in_sort = col in mode
            order_position = mode.index(col) + 1 if is_in_sort else None
            comment = '' if is_in_sort else 'not included in the sort'

            long_format_results.append({
                'schema': schema_name,
                'table': tbl,
                'sort_type': 'custom',
                'column': col,
                'order': order_position,
                'RLE': rle_counts[col],
                'NDV': ndv_value,
                'total_rows': total_rows,
                'total_RLE': total_rle_all,
                'comments': comment
            })

        return pd.DataFrame(long_format_results)

    # All modes now use test_column_orderings_smart with the mode parameter
    return test_column_orderings_smart(
        self.con,
        table_path,
        table_name=table_name,  # Pass table name for cardinality calculation on full dataset
        mode=mode,
        limit=limit,
        min_distinct_threshold=min_distinct_threshold,
        max_cardinality_pct=max_cardinality_pct,
        max_ordering_depth=max_ordering_depth,
        schema_name=schema_name,
        table_display_name=tbl,
        duckrun_instance=self  # Pass duckrun instance for detailed parquet stats
    )
1446
+
1240
1447
  def close(self):
1241
1448
  """Close DuckDB connection"""
1449
+
1242
1450
  if self.con:
1243
1451
  self.con.close()
1244
1452
  print("Connection closed")
@@ -160,6 +160,7 @@ def import_notebook_from_web(
160
160
  update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
161
161
  payload = {
162
162
  "definition": {
163
+ "format": "ipynb",
163
164
  "parts": [
164
165
  {
165
166
  "path": "notebook-content.py",
@@ -192,6 +193,7 @@ def import_notebook_from_web(
192
193
  payload = {
193
194
  "displayName": notebook_name,
194
195
  "definition": {
196
+ "format": "ipynb",
195
197
  "parts": [
196
198
  {
197
199
  "path": "notebook-content.py",