duckrun 0.2.19.dev2__tar.gz → 0.2.19.dev4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/core.py +125 -53
  3. duckrun-0.2.19.dev4/duckrun/rle.py +873 -0
  4. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/PKG-INFO +1 -1
  5. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/pyproject.toml +1 -1
  6. duckrun-0.2.19.dev4/tests/test_rle.py +16 -0
  7. duckrun-0.2.19.dev2/duckrun/rle.py +0 -521
  8. duckrun-0.2.19.dev2/tests/test_rle.py +0 -10
  9. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/LICENSE +0 -0
  10. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/README.md +0 -0
  11. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/__init__.py +0 -0
  12. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/auth.py +0 -0
  13. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/files.py +0 -0
  14. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/lakehouse.py +0 -0
  15. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/notebook.py +0 -0
  16. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/runner.py +0 -0
  17. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/semantic_model.py +0 -0
  18. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/stats.py +0 -0
  19. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun/writer.py +0 -0
  20. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/SOURCES.txt +0 -0
  21. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
  22. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/requires.txt +0 -0
  23. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/duckrun.egg-info/top_level.txt +0 -0
  24. {duckrun-0.2.19.dev2 → duckrun-0.2.19.dev4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev2
3
+ Version: 0.2.19.dev4
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -1244,9 +1244,9 @@ class Duckrun(WorkspaceOperationsMixin):
1244
1244
  refresh=refresh
1245
1245
  )
1246
1246
 
1247
- def rle(self, table_name: str = None, mode: str = "summary", sort_columns: List[str] = None,
1248
- limit: int = None, max_combinations: int = 20, use_stratified_sampling: bool = True,
1249
- num_segments: int = 5, segment_size: int = 1000):
1247
+ def rle(self, table_name: str = None, mode = "natural",
1248
+ min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
1249
+ max_ordering_depth: int = 3, limit: int = None):
1250
1250
  """
1251
1251
  Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.
1252
1252
 
@@ -1254,38 +1254,48 @@ class Duckrun(WorkspaceOperationsMixin):
1254
1254
  table_name: Name of the table to analyze. Can be:
1255
1255
  - 'table_name' (uses current schema)
1256
1256
  - 'schema.table_name' (specific schema)
1257
- - None (analyzes all tables in current schema - summary only)
1258
- mode: Analysis mode:
1259
- - "summary": Quick NFV (Number of Distinct Values) analysis (default)
1260
- - "smart": Smart heuristic-based analysis (recommended)
1261
- - "full": Full RLE analysis with all column orderings
1262
- sort_columns: Optional list of columns to sort by for RLE calculation
1263
- limit: Optional limit on number of rows to analyze (ignored if using stratified sampling)
1264
- max_combinations: Maximum number of orderings to test (for smart mode)
1265
- use_stratified_sampling: If True, use stratified sampling across entire file (recommended)
1266
- num_segments: Number of segments for stratified sampling
1267
- segment_size: Size of each segment for sampling
1257
+ mode: Analysis mode or column ordering:
1258
+ - "natural": Calculate RLE for natural order only (fastest)
1259
+ - "auto": Natural order + cardinality-based ordering (recommended)
1260
+ - "advanced": Natural + cardinality + greedy incremental search (most thorough)
1261
+ - List[str]: Specific column ordering to test, e.g., ['date', 'duid']
1262
+ min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
1263
+ max_cardinality_pct: Exclude columns with cardinality above this % (default: 0.01 = 1%)
1264
+ max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)
1265
+ limit: Optional row limit for testing/development (default: None, analyzes all rows)
1268
1266
 
1269
1267
  Returns:
1270
1268
  DataFrame with RLE analysis results
1271
1269
 
1272
1270
  Examples:
1273
- # Quick summary of a specific table
1271
+ # Natural order only (baseline)
1274
1272
  con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
1275
- con.rle("mytable") # defaults to summary mode
1276
- con.rle("mytable", "summary")
1273
+ con.rle("mytable") # same as con.rle("mytable", "natural")
1277
1274
 
1278
- # Smart analysis (finds optimal column ordering)
1279
- con.rle("mytable", "smart")
1275
+ # Auto optimization (natural + cardinality-based)
1276
+ con.rle("mytable", "auto")
1277
+
1278
+ # Advanced optimization (greedy incremental search)
1279
+ con.rle("mytable", "advanced")
1280
+
1281
+ # Test specific column ordering
1282
+ con.rle("mytable", ["date", "duid"])
1283
+ con.rle("mytable", ["cutoff", "time", "DUID", "date"])
1284
+
1285
+ # Advanced with custom depth
1286
+ con.rle("mytable", "advanced", max_ordering_depth=4)
1280
1287
 
1281
1288
  # Analyze table from different schema
1282
- con.rle("otherschema.mytable", "smart")
1289
+ con.rle("otherschema.mytable", "auto")
1283
1290
 
1284
- # Full analysis with custom parameters
1285
- con.rle("mytable", "full", use_stratified_sampling=True, num_segments=10)
1291
+ # Custom thresholds for small tables
1292
+ con.rle("mytable", "auto", max_cardinality_pct=0.05)
1293
+
1294
+ # Limit rows for testing
1295
+ con.rle("mytable", "auto", limit=10000)
1286
1296
  """
1287
1297
  from .rle import (
1288
- calculate_nfv_score,
1298
+ calculate_cardinality_ratio,
1289
1299
  test_column_orderings_smart,
1290
1300
  calculate_rle_for_columns
1291
1301
  )
@@ -1310,7 +1320,7 @@ class Duckrun(WorkspaceOperationsMixin):
1310
1320
  # Construct the full table path using the same logic as get_stats
1311
1321
  table_path = f"{self.table_base_url}{schema_name}/{tbl}"
1312
1322
 
1313
- # Get the actual parquet files from Delta table
1323
+ # Verify table exists and is not empty
1314
1324
  print(f"📊 Analyzing table: {schema_name}.{tbl}")
1315
1325
 
1316
1326
  try:
@@ -1321,40 +1331,102 @@ class Duckrun(WorkspaceOperationsMixin):
1321
1331
  print("⚠️ Table is empty (no files)")
1322
1332
  return None
1323
1333
 
1324
- # Construct full paths for parquet files
1325
- parquet_paths = [table_path + "/" + f for f in delta_files]
1326
-
1327
1334
  except Exception as e:
1328
1335
  print(f"❌ Error accessing Delta table: {e}")
1329
1336
  return None
1330
1337
 
1331
- # For now, analyze the first file (can be extended to analyze all files)
1332
- parquet_path = parquet_paths[0]
1333
-
1334
- if mode == "summary":
1335
- # Quick NFV analysis
1336
- nfv_scores = calculate_nfv_score(self.con, parquet_path, limit)
1338
+ # Check if mode is a list of columns (custom ordering)
1339
+ if isinstance(mode, list):
1340
+ # User wants to test a specific column ordering
1341
+ print(f"Testing custom column ordering: {', '.join(mode)}")
1342
+
1343
+ # Calculate cardinality for NDV values
1344
+ card_stats = calculate_cardinality_ratio(self.con, table_name if table_name else f"delta_scan('{table_path}')", is_parquet=False)
1345
+
1346
+ # Calculate RLE for the specified ordering
1347
+ rle_counts = calculate_rle_for_columns(self.con, table_path, mode, limit)
1348
+
1349
+ total_rle_all = sum(rle_counts.values())
1350
+
1351
+ print(f"\nResults:")
1352
+ print(f" Custom ordering: [{', '.join(mode)}]")
1353
+ print(f" Total RLE (all columns): {total_rle_all:,} runs")
1354
+
1355
+ # Return as DataFrame for consistency
1337
1356
  import pandas as pd
1338
- df = pd.DataFrame([
1339
- {"column": col, "nfv_score": score}
1340
- for col, score in sorted(nfv_scores.items(), key=lambda x: x[1])
1341
- ])
1342
- return df
1343
-
1344
- elif mode in ["smart", "full"]:
1345
- # Smart or full RLE analysis
1346
- return test_column_orderings_smart(
1347
- self.con,
1348
- parquet_path,
1349
- limit=limit,
1350
- max_combinations=max_combinations,
1351
- use_stratified_sampling=use_stratified_sampling,
1352
- num_segments=num_segments,
1353
- segment_size=segment_size
1354
- )
1355
- else:
1356
- print(f"❌ Unknown mode: {mode}. Use 'summary', 'smart', or 'full'")
1357
- return None
1357
+ results = [{
1358
+ 'schema': schema_name,
1359
+ 'table': tbl,
1360
+ 'sort_order': 'custom',
1361
+ 'columns_used': ', '.join(mode),
1362
+ 'total_rle_all': total_rle_all,
1363
+ **rle_counts
1364
+ }]
1365
+
1366
+ df = pd.DataFrame(results)
1367
+
1368
+ # Transform to long format
1369
+ long_format_results = []
1370
+
1371
+ for _, row in df.iterrows():
1372
+ schema_val = row['schema']
1373
+ table_val = row['table']
1374
+ sort_order = row['sort_order']
1375
+ columns_used = row['columns_used']
1376
+ total_rle_all_val = row['total_rle_all']
1377
+
1378
+ # Get all column names except metadata columns
1379
+ metadata_cols = ['schema', 'table', 'sort_order', 'columns_used', 'total_rle_all']
1380
+ data_columns = [col for col in df.columns if col not in metadata_cols]
1381
+
1382
+ # Get total rows from card_stats if available
1383
+ total_rows = card_stats[data_columns[0]]['total_rows'] if card_stats and data_columns else None
1384
+
1385
+ # Parse the columns_used to get ordering
1386
+ sort_columns_list = [c.strip() for c in columns_used.split(',')]
1387
+
1388
+ # Create one row per data column
1389
+ for col in data_columns:
1390
+ rle_value = row[col]
1391
+
1392
+ # Get NDV from card_stats
1393
+ ndv_value = card_stats[col]['distinct_values'] if card_stats and col in card_stats else None
1394
+
1395
+ # Determine if column was included in the sort and its position
1396
+ is_in_sort = col in sort_columns_list
1397
+ order_position = sort_columns_list.index(col) + 1 if is_in_sort else None
1398
+ comment = '' if is_in_sort else 'not included in the sort'
1399
+
1400
+ long_format_results.append({
1401
+ 'schema': schema_val,
1402
+ 'table': table_val,
1403
+ 'sort_type': sort_order,
1404
+ 'column': col,
1405
+ 'order': order_position,
1406
+ 'RLE': rle_value,
1407
+ 'NDV': ndv_value,
1408
+ 'total_rows': total_rows,
1409
+ 'total_RLE': total_rle_all_val,
1410
+ 'comments': comment
1411
+ })
1412
+
1413
+ long_df = pd.DataFrame(long_format_results)
1414
+
1415
+ return long_df
1416
+
1417
+ # All modes now use test_column_orderings_smart with the mode parameter
1418
+ return test_column_orderings_smart(
1419
+ self.con,
1420
+ table_path,
1421
+ table_name=table_name, # Pass table name for cardinality calculation on full dataset
1422
+ mode=mode,
1423
+ limit=limit,
1424
+ min_distinct_threshold=min_distinct_threshold,
1425
+ max_cardinality_pct=max_cardinality_pct,
1426
+ max_ordering_depth=max_ordering_depth,
1427
+ schema_name=schema_name,
1428
+ table_display_name=tbl
1429
+ )
1358
1430
 
1359
1431
  def close(self):
1360
1432
  """Close DuckDB connection"""