duckrun 0.2.19.dev1.tar.gz → 0.2.19.dev8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/__init__.py +2 -1
  3. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/core.py +201 -0
  4. duckrun-0.2.19.dev8/duckrun/rle.py +940 -0
  5. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun.egg-info/PKG-INFO +1 -1
  6. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun.egg-info/SOURCES.txt +4 -1
  7. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/pyproject.toml +1 -1
  8. duckrun-0.2.19.dev8/tests/test_register.py +275 -0
  9. duckrun-0.2.19.dev8/tests/test_rle.py +16 -0
  10. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/LICENSE +0 -0
  11. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/README.md +0 -0
  12. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/auth.py +0 -0
  13. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/files.py +0 -0
  14. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/lakehouse.py +0 -0
  15. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/notebook.py +0 -0
  16. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/runner.py +0 -0
  17. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/semantic_model.py +0 -0
  18. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/stats.py +0 -0
  19. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun/writer.py +0 -0
  20. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun.egg-info/dependency_links.txt +0 -0
  21. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun.egg-info/requires.txt +0 -0
  22. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/duckrun.egg-info/top_level.txt +0 -0
  23. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev8}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.19.dev1
+Version: 0.2.19.dev8
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT

duckrun/__init__.py
@@ -2,10 +2,11 @@
 
 from duckrun.core import Duckrun
 from duckrun.notebook import import_notebook_from_web, import_notebook
+from duckrun import rle
 
 __version__ = "0.2.18"
 
 # Expose unified connect method at module level
 connect = Duckrun.connect
 
-__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
+__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]

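With this change the new rle module is re-exported at package level. A minimal sketch of the resulting import surface, assuming 0.2.19.dev8 is installed:

    import duckrun

    # "rle" now appears in the public API list alongside connect()
    print(duckrun.__all__)
    print(duckrun.rle)  # <module 'duckrun.rle' ...>
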
duckrun/core.py
@@ -1035,6 +1035,21 @@ class Duckrun(WorkspaceOperationsMixin):
         """Get underlying DuckDB connection"""
         return self.con
 
+    def register(self, name: str, df):
+        """
+        Register a pandas DataFrame as a virtual table in DuckDB.
+
+        Args:
+            name: Name for the virtual table
+            df: pandas DataFrame to register
+
+        Example:
+            con = duckrun.connect("workspace/lakehouse.lakehouse")
+            con.register("tb", df)
+            con.sql("SELECT * FROM tb").show()
+        """
+        self.con.register(name, df)
+
     def get_stats(self, source: str = None, detailed = False):
         """
         Get comprehensive statistics for Delta Lake tables.

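The new register() method is a thin wrapper over DuckDB's native connection.register(). A hedged usage sketch based on the docstring example above; the workspace/lakehouse path and the DataFrame contents are placeholders:

    import pandas as pd
    import duckrun

    df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    con = duckrun.connect("workspace/lakehouse.lakehouse")
    con.register("tb", df)  # df becomes queryable as virtual table "tb"
    con.sql("SELECT COUNT(*) AS n FROM tb").show()
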
@@ -1244,8 +1259,194 @@ class Duckrun(WorkspaceOperationsMixin):
             refresh=refresh
         )
 
+    def rle(self, table_name: str = None, mode = "natural",
+            min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
+            max_ordering_depth: int = 3, limit: int = None):
+        """
+        Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
+
+        Args:
+            table_name: Name of the table to analyze. Can be:
+                - 'table_name' (uses current schema)
+                - 'schema.table_name' (specific schema)
+            mode: Analysis mode or column ordering:
+                - "natural": Calculate RLE for natural order only (fastest)
+                - "auto": Natural order + cardinality-based ordering (recommended)
+                - "advanced": Natural + cardinality + greedy incremental search (most thorough)
+                - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
+            min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
+            max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
+            max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
+            limit: Optional row limit for testing/development (default: None, analyzes all rows)
+
+        Returns:
+            DataFrame with RLE analysis results
+
+        Examples:
+            # Natural order only (baseline)
+            con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
+            con.rle("mytable")  # same as con.rle("mytable", "natural")
+
+            # Auto optimization (natural + cardinality-based)
+            con.rle("mytable", "auto")
+
+            # Advanced optimization (greedy incremental search)
+            con.rle("mytable", "advanced")
+
+            # Test specific column ordering
+            con.rle("mytable", ["date", "duid"])
+            con.rle("mytable", ["cutoff", "time", "DUID", "date"])
+
+            # Advanced with custom depth
+            con.rle("mytable", "advanced", max_ordering_depth=4)
+
+            # Analyze table from different schema
+            con.rle("otherschema.mytable", "auto")
+
+            # Custom thresholds for small tables
+            con.rle("mytable", "auto", max_cardinality_pct=0.05)
+
+            # Limit rows for testing
+            con.rle("mytable", "auto", limit=10000)
+        """
+        from .rle import (
+            calculate_cardinality_ratio,
+            test_column_orderings_smart,
+            calculate_rle_for_columns
+        )
+        from deltalake import DeltaTable
+
+        # Parse table name and construct path
+        if table_name is None:
+            if mode != "summary":
+                print("⚠️ Table name is required for 'natural', 'auto', and 'advanced' modes")
+                return None
+            # TODO: Implement all-tables summary
+            print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
+            return None
+
+        # Parse schema.table or just table
+        if '.' in table_name:
+            schema_name, tbl = table_name.split('.', 1)
+        else:
+            schema_name = self.schema
+            tbl = table_name
+
+        # Construct the full table path using the same logic as get_stats
+        table_path = f"{self.table_base_url}{schema_name}/{tbl}"
+
+        # Verify table exists and is not empty
+        print(f"📊 Analyzing table: {schema_name}.{tbl}")
+
+        try:
+            dt = DeltaTable(table_path)
+            delta_files = dt.files()
+
+            if not delta_files:
+                print("⚠️ Table is empty (no files)")
+                return None
+
+        except Exception as e:
+            print(f"❌ Error accessing Delta table: {e}")
+            return None
+
+        # Check if mode is a list of columns (custom ordering)
+        if isinstance(mode, list):
+            # User wants to test a specific column ordering
+            print(f"Testing custom column ordering: {', '.join(mode)}")
+
+            # Calculate cardinality for NDV values
+            card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
+
+            # Calculate RLE for the specified ordering
+            rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
+
+            total_rle_all = sum(rle_counts.values())
+
+            print(f"\nResults:")
+            print(f" Custom ordering: [{', '.join(mode)}]")
+            print(f" Total RLE (all columns): {total_rle_all:,} runs")
+
+            # Return as DataFrame for consistency
+            import pandas as pd
+            results = [{
+                'schema': schema_name,
+                'table': tbl,
+                'sort_order': 'custom',
+                'columns_used': ', '.join(mode),
+                'total_rle_all': total_rle_all,
+                **rle_counts
+            }]
+
+            df = pd.DataFrame(results)
+
+            # Transform to long format
+            long_format_results = []
+
+            for _, row in df.iterrows():
+                schema_val = row['schema']
+                table_val = row['table']
+                sort_order = row['sort_order']
+                columns_used = row['columns_used']
+                total_rle_all_val = row['total_rle_all']
+
+                # Get all column names except metadata columns
+                metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
+                data_columns = [col for col in df.columns if col not in metadata_cols]
+
+                # Get total rows from card_stats if available
+                total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
+
+                # Parse the columns_used to get ordering
+                sort_columns_list = [c.strip() for c in columns_used.split(',')]
+
+                # Create one row per data column
+                for col in data_columns:
+                    rle_value = row[col]
+
+                    # Get NDV from card_stats
+                    ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
+
+                    # Determine if column was included in the sort and its position
+                    is_in_sort = col in sort_columns_list
+                    order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
+                    comment = '' if is_in_sort else 'not included in the sort'
+
+                    long_format_results.append({
+                        'schema': schema_val,
+                        'table': table_val,
+                        'sort_type': sort_order,
+                        'column': col,
+                        'order': order_position,
+                        'RLE': rle_value,
+                        'NDV': ndv_value,
+                        'total_rows': total_rows,
+                        'total_RLE': total_rle_all_val,
+                        'comments': comment
+                    })
+
+            long_df = pd.DataFrame(long_format_results)
+
+            return long_df
+
+        # All modes now use test_column_orderings_smart with the mode parameter
+        return test_column_orderings_smart(
+            self.con,
+            table_path,
+            table_name=table_name,  # Pass table name for cardinality calculation on full dataset
+            mode=mode,
+            limit=limit,
+            min_distinct_threshold=min_distinct_threshold,
+            max_cardinality_pct=max_cardinality_pct,
+            max_ordering_depth=max_ordering_depth,
+            schema_name=schema_name,
+            table_display_name=tbl,
+            duckrun_instance=self  # Pass duckrun instance for detailed parquet stats
+        )
+
     def close(self):
         """Close DuckDB connection"""
+
         if self.con:
             self.con.close()
             print("Connection closed")