duckrun 0.2.19.dev1__tar.gz → 0.2.19.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/__init__.py +2 -1
  3. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/core.py +94 -0
  4. duckrun-0.2.19.dev3/duckrun/rle.py +713 -0
  5. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun.egg-info/PKG-INFO +1 -1
  6. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun.egg-info/SOURCES.txt +1 -0
  7. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/pyproject.toml +1 -1
  8. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/LICENSE +0 -0
  9. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/README.md +0 -0
  10. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/auth.py +0 -0
  11. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/files.py +0 -0
  12. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/lakehouse.py +0 -0
  13. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/notebook.py +0 -0
  14. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/runner.py +0 -0
  15. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/semantic_model.py +0 -0
  16. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/stats.py +0 -0
  17. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun/writer.py +0 -0
  18. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun.egg-info/dependency_links.txt +0 -0
  19. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun.egg-info/requires.txt +0 -0
  20. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/duckrun.egg-info/top_level.txt +0 -0
  21. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev1
3
+ Version: 0.2.19.dev3
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -2,10 +2,11 @@
2
2
 
3
3
  from duckrun.core import Duckrun
4
4
  from duckrun.notebook import import_notebook_from_web, import_notebook
5
+ from duckrun import rle
5
6
 
6
7
  __version__ = "0.2.18"
7
8
 
8
9
  # Expose unified connect method at module level
9
10
  connect = Duckrun.connect
10
11
 
11
- __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
12
+ __all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
@@ -1244,8 +1244,102 @@ class Duckrun(WorkspaceOperationsMixin):
1244
1244
  refresh=refresh
1245
1245
  )
1246
1246
 
1247
def rle(self, table_name: str = None, mode: str = "natural",
        min_distinct_threshold: int = 2, max_cardinality_pct: float = 0.01,
        max_ordering_depth: int = 3):
    """
    Analyze RLE (Run-Length Encoding) compression potential for a Delta Lake table.

    Args:
        table_name: Name of the table to analyze (required). Can be:
            - 'table_name' (uses current schema)
            - 'schema.table_name' (specific schema)
        mode: Analysis mode:
            - "natural": Calculate RLE for natural order only (default, fastest)
            - "auto": Natural order + cardinality-based ordering (recommended)
            - "advanced": Natural + cardinality + greedy incremental search (most thorough)
        min_distinct_threshold: Exclude columns with fewer distinct values (default: 2)
        max_cardinality_pct: Exclude columns with cardinality above this ratio (default: 0.01 = 1%)
        max_ordering_depth: Maximum depth for greedy search in "advanced" mode (default: 3)

    Returns:
        DataFrame with RLE analysis results, or None when the arguments are
        invalid, the Delta table cannot be opened, or the table has no files.

    Examples:
        # Natural order only (baseline)
        con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
        con.rle("mytable")  # same as con.rle("mytable", "natural")

        # Auto optimization (natural + cardinality-based)
        con.rle("mytable", "auto")

        # Advanced optimization (greedy incremental search)
        con.rle("mytable", "advanced")

        # Advanced with custom depth
        con.rle("mytable", "advanced", max_ordering_depth=4)

        # Analyze table from different schema
        con.rle("otherschema.mytable", "auto")

        # Custom thresholds for small tables
        con.rle("mytable", "auto", max_cardinality_pct=0.05)
    """
    # Validate the arguments first so a bad call fails fast, before deltalake
    # is imported or any storage is touched.
    if table_name is None:
        print("⚠️ Table name is required. Please specify a table name.")
        return None

    valid_modes = ("natural", "auto", "advanced")
    if mode not in valid_modes:
        # A mistyped mode used to fall through and silently run a partial analysis.
        print(f"⚠️ Unknown mode '{mode}'. Expected one of: {', '.join(valid_modes)}")
        return None

    from .rle import test_column_orderings_smart
    from deltalake import DeltaTable

    # Parse schema.table or just table
    if '.' in table_name:
        schema_name, tbl = table_name.split('.', 1)
    else:
        schema_name = self.schema
        tbl = table_name

    # Full storage path of the Delta table
    table_path = f"{self.table_base_url}{schema_name}/{tbl}"

    print(f"📊 Analyzing table: {schema_name}.{tbl}")

    # Keep the try body to the calls that actually reach storage
    try:
        dt = DeltaTable(table_path)
        delta_files = dt.files()
    except Exception as e:
        print(f"❌ Error accessing Delta table: {e}")
        return None

    if not delta_files:
        print("⚠️ Table is empty (no files)")
        return None

    # All modes go through test_column_orderings_smart; table_name is passed
    # through for the cardinality query (presumably it resolves to a relation
    # on self.con - verify against how tables/views are registered).
    return test_column_orderings_smart(
        self.con,
        table_path,
        table_name=table_name,
        mode=mode,
        min_distinct_threshold=min_distinct_threshold,
        max_cardinality_pct=max_cardinality_pct,
        max_ordering_depth=max_ordering_depth
    )
1339
+
1247
1340
  def close(self):
1248
1341
  """Close DuckDB connection"""
1342
+
1249
1343
  if self.con:
1250
1344
  self.con.close()
1251
1345
  print("Connection closed")
@@ -0,0 +1,713 @@
1
+ import itertools
2
+ from typing import List, Dict, Tuple, Optional
3
+ import pandas as pd
4
+
5
def analyze_parquet_row_groups(con, parquet_path: str) -> Optional[pd.DataFrame]:
    """
    Read the Parquet metadata of a file without scanning its data.

    Args:
        con: DuckDB connection
        parquet_path: Path of the Parquet file to inspect

    Returns:
        DataFrame holding the rows returned by DuckDB's parquet_metadata()
        for the file, or None if the metadata could not be read.
    """
    try:
        # Double single quotes so the path remains one SQL string literal
        safe_path = parquet_path.replace("'", "''")
        return con.sql(f"""
            SELECT * FROM parquet_metadata('{safe_path}')
        """).df()
    except Exception as e:
        print(f"Could not read parquet metadata: {e}")
        return None
23
+
24
+
25
def estimate_rle_from_row_groups(con, parquet_path: str) -> Optional[pd.DataFrame]:
    """
    Fetch per-row-group, per-column chunk metadata for a Parquet file.

    The result is the raw material for an RLE estimate (sizes and value
    counts per column chunk); no estimate is computed here.

    Args:
        con: DuckDB connection
        parquet_path: Path of the Parquet file to inspect

    Returns:
        DataFrame with columns row_group_id, column_id, file_offset,
        num_values, total_compressed_size and total_uncompressed_size,
        or None if the metadata could not be read.
    """
    try:
        # Double single quotes so the path remains one SQL string literal
        safe_path = parquet_path.replace("'", "''")

        # Row-group/column-chunk columns come from parquet_metadata();
        # parquet_file_metadata() only exposes file-level fields, so querying
        # it for these columns always failed.
        stats_query = f"""
            SELECT
                row_group_id,
                column_id,
                file_offset,
                num_values,
                total_compressed_size,
                total_uncompressed_size
            FROM parquet_metadata('{safe_path}')
        """

        stats = con.sql(stats_query).df()
        print("Row group metadata available!")
        return stats

    except Exception as e:
        print(f"Parquet metadata not available in this DuckDB version: {e}")
        print("Falling back to stratified sampling...")
        return None
55
+
56
+
57
def stratified_rle_sampling(con, delta_path: str, sort_columns: List[str] = None,
                            num_segments: int = 5, segment_size: int = 1000) -> Tuple[Dict[str, int], Dict[str, dict]]:
    """
    Sample RLE density across multiple segments of a Delta table.

    For every segment the whole table is ordered, a window of up to
    `segment_size` rows is taken, and the number of value runs in that
    window is counted for all columns in a single query.

    Args:
        con: DuckDB connection
        delta_path: Path to Delta table
        sort_columns: List of columns to sort by before calculating RLE. If None, uses natural order.
        num_segments: Number of segments to sample across the file
        segment_size: Number of rows per segment

    Returns:
        Tuple (estimated_runs, density_stats):
            - estimated_runs: column -> estimated run count for the full table
            - density_stats: column -> dict with avg/min/max/std density,
              estimated_runs and variance_coefficient
        Both dicts are empty when the table has no rows or columns, or when
        num_segments / segment_size is not positive.
    """
    def quote(name: str) -> str:
        # Quote identifiers so names with spaces or reserved words stay valid SQL
        return '"' + name.replace('"', '""') + '"'

    safe_path = delta_path.replace("'", "''")

    # Get total row count
    total_rows = con.sql(f"""
        SELECT COUNT(*) FROM delta_scan('{safe_path}')
    """).fetchone()[0]

    # Get column names
    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT * FROM delta_scan('{safe_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()
    column_names = [row[0] for row in columns]

    if not column_names or not total_rows or num_segments < 1 or segment_size < 1:
        return {}, {}

    # Build ORDER BY clause; entries that are not plain column names
    # (e.g. "col DESC") are passed through untouched.
    known_columns = set(column_names)
    if sort_columns:
        order_by_clause = "ORDER BY " + ", ".join(
            quote(c) if c in known_columns else c for c in sort_columns
        )
    else:
        order_by_clause = "ORDER BY file_row_number"

    # Calculate segment positions spread across the file
    if num_segments == 1:
        segment_positions = [0]
    else:
        step = total_rows // (num_segments + 1)
        segment_positions = [step * (i + 1) for i in range(num_segments)]

    # One run-flag expression per column. The first row of a segment always
    # opens a run; IS DISTINCT FROM treats NULL as an ordinary value so a
    # block of NULLs is one run and value -> NULL transitions are counted.
    run_flags = ",\n                ".join(
        f"CASE WHEN LAG(sorted_row_num) OVER (ORDER BY sorted_row_num) IS NULL "
        f"OR LAG({quote(col)}) OVER (ORDER BY sorted_row_num) IS DISTINCT FROM {quote(col)} "
        f"THEN 1 ELSE 0 END AS new_run_{idx}"
        for idx, col in enumerate(column_names)
    )
    run_sums = ", ".join(f"SUM(new_run_{idx})" for idx in range(len(column_names)))

    all_densities = {col: [] for col in column_names}

    for start_pos in segment_positions:
        # The entire dataset must be ordered before a window is cut out of it;
        # doing all columns in one query means one sort per segment.
        row = con.sql(f"""
            WITH sorted_data AS (
                SELECT
                    *,
                    ROW_NUMBER() OVER ({order_by_clause}) as sorted_row_num
                FROM delta_scan('{safe_path}', file_row_number = TRUE)
            ),
            segment_data AS (
                SELECT *
                FROM sorted_data
                WHERE sorted_row_num >= {start_pos}
                ORDER BY sorted_row_num
                LIMIT {segment_size}
            ),
            runs AS (
                SELECT
                {run_flags}
                FROM segment_data
            )
            SELECT COUNT(*) AS rows_in_segment, {run_sums}
            FROM runs
        """).fetchone()

        rows_in_segment = row[0]
        if not rows_in_segment:
            continue  # nothing sampled at this position

        for col, run_count in zip(column_names, row[1:]):
            # Density is runs per row actually sampled (the last segment may be short)
            all_densities[col].append(run_count / rows_in_segment)

    # Estimate total runs for full file
    estimated_runs = {}
    density_stats = {}

    for col in column_names:
        densities = all_densities[col]
        if not densities:
            continue

        avg_density = sum(densities) / len(densities)
        min_density = min(densities)
        max_density = max(densities)
        std_density = (sum((d - avg_density) ** 2 for d in densities) / len(densities)) ** 0.5

        estimated_total = int(avg_density * total_rows)
        estimated_runs[col] = estimated_total

        density_stats[col] = {
            'avg_density': avg_density,
            'min_density': min_density,
            'max_density': max_density,
            'std_density': std_density,
            'estimated_runs': estimated_total,
            'variance_coefficient': std_density / avg_density if avg_density > 0 else 0
        }

    return estimated_runs, density_stats
169
+
170
+
171
def calculate_rle_for_columns(con, delta_path: str, sort_columns: List[str] = None, limit: int = None) -> Dict[str, int]:
    """
    Calculate RLE runs for all columns in a Delta table, optionally after sorting.

    A run is a maximal stretch of consecutive rows holding the same value;
    NULL counts as a value, so a block of NULLs is a single run.

    Args:
        con: DuckDB connection
        delta_path: Path to Delta table
        sort_columns: List of columns to sort by (in order). If None, uses natural file order.
        limit: Optional limit on number of rows to analyze; the first `limit`
            rows of the chosen ordering are used. A falsy value means no limit.

    Returns:
        Dictionary mapping column names to RLE run counts (0 for an empty table)
    """
    def quote(name: str) -> str:
        # Quote identifiers so names with spaces or reserved words stay valid SQL
        return '"' + name.replace('"', '""') + '"'

    safe_path = delta_path.replace("'", "''")

    # Get all column names
    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT *
            FROM delta_scan('{safe_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()
    column_names = [row[0] for row in columns]

    # Build ORDER BY clause; entries that are not plain column names
    # (e.g. "col DESC") are passed through untouched.
    known_columns = set(column_names)
    if sort_columns:
        order_by = "ORDER BY " + ", ".join(
            quote(c) if c in known_columns else c for c in sort_columns
        )
    else:
        order_by = "ORDER BY file_row_number ASC"

    # Filter on the computed position instead of a bare LIMIT, which would
    # pick an arbitrary subset of rows rather than the first ones in order.
    limit_filter = f"WHERE __rle_sort_order <= {int(limit)}" if limit else ""

    # Calculate RLE for each column
    results = {}
    for column_name in column_names:
        row = con.sql(f"""
            WITH ordered_data AS (
                SELECT
                    {quote(column_name)} AS __rle_value,
                    ROW_NUMBER() OVER ({order_by}) AS __rle_sort_order
                FROM delta_scan('{safe_path}', file_row_number = TRUE)
            ),
            runs AS (
                SELECT
                    CASE
                        WHEN __rle_sort_order = 1
                            OR LAG(__rle_value) OVER (ORDER BY __rle_sort_order) IS DISTINCT FROM __rle_value
                        THEN 1
                        ELSE 0
                    END AS new_run
                FROM ordered_data
                {limit_filter}
            )
            SELECT SUM(new_run) AS rle_run_count
            FROM runs
        """).fetchone()

        # SUM over zero rows is NULL; report that as zero runs
        results[column_name] = int(row[0] or 0)

    return results
233
+
234
+
235
def calculate_cardinality_ratio(con, source: str, limit: int = None, is_parquet: bool = False,
                                use_approx: bool = None, approx_threshold: int = 100_000_000) -> Dict[str, dict]:
    """
    Calculate cardinality ratio for each column (distinct_values / total_rows).
    Lower ratio = better for RLE compression (more repetition).

    NEVER uses sampling - always scans full dataset with exact or approximate distinct counts.

    Args:
        con: DuckDB connection
        source: Either a table name / relation expression (default) or parquet file path
        limit: DEPRECATED - kept for backward compatibility but ignored. Always scans full dataset.
        is_parquet: If True, source is a parquet file path; if False, source is used
            verbatim as the FROM clause
        use_approx: If True, use approx_count_distinct. If False, use exact COUNT(DISTINCT).
            If None (default), auto-decide based on table size threshold.
        approx_threshold: Row count threshold for using the approximation (default: 100M rows)

    Returns:
        Dictionary mapping column names to dict with keys:
            - 'cardinality_ratio': distinct/total, range 0-1, lower is better for RLE
            - 'total_rows': total row count
            - 'distinct_values': number of distinct values (exact or approximate)
        Empty when the source has no columns.
    """
    def quote(name: str) -> str:
        # Quote identifiers so names with spaces or reserved words stay valid SQL
        return '"' + name.replace('"', '""') + '"'

    # Build the FROM clause based on source type
    if is_parquet:
        safe_source = source.replace("'", "''")
        from_clause = f"read_parquet('{safe_source}', file_row_number = TRUE)"
        column_filter = "WHERE column_name != 'file_row_number'"
    else:
        from_clause = source  # Table name or relation expression, used as-is
        column_filter = ""

    columns = con.sql(f"""
        SELECT column_name
        FROM (DESCRIBE SELECT * FROM {from_clause})
        {column_filter}
    """).fetchall()
    column_names = [row[0] for row in columns]

    if not column_names:
        return {}

    # Auto-decide whether to use approximate or exact based on table size
    if use_approx is None:
        total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
        use_approx = total_rows > approx_threshold
        if use_approx:
            print(f" Table has {total_rows:,} rows (>{approx_threshold:,}) - using HyperLogLog approximation")
        else:
            print(f" Table has {total_rows:,} rows (<={approx_threshold:,}) - using exact COUNT(DISTINCT)")
    else:
        total_rows = None  # Taken from the main query below

    # One query computes every distinct count, so the data is scanned once.
    # Results are read by position, so aliases are index-based and never
    # depend on the (possibly awkward) column names.
    if use_approx:
        select_clauses = [
            f"approx_count_distinct({quote(col)}) AS distinct_{idx}"
            for idx, col in enumerate(column_names)
        ]
    else:
        select_clauses = [
            f"COUNT(DISTINCT {quote(col)}) AS distinct_{idx}"
            for idx, col in enumerate(column_names)
        ]

    query = f"""
        SELECT
            COUNT(*)::BIGINT as total_rows,
            {', '.join(select_clauses)}
        FROM {from_clause}
    """

    result = con.sql(query).fetchone()

    if not result:
        return {}

    if total_rows is None:
        total_rows = result[0]

    nfv_stats = {}

    # Result layout: (total_rows, distinct_col1, distinct_col2, ...)
    for idx, col in enumerate(column_names, start=1):
        distinct_values = result[idx]
        cardinality_ratio = (distinct_values / total_rows) if total_rows > 0 else 0.0

        nfv_stats[col] = {
            'total_rows': total_rows,
            'distinct_values': distinct_values,
            'cardinality_ratio': cardinality_ratio
        }

    return nfv_stats
327
+
328
+
329
def filter_promising_combinations(columns: List[str], nfv_scores: Dict[str, float],
                                  max_combinations: int = 20) -> List[List[str]]:
    """
    Apply heuristics to filter down to the most promising column orderings.

    Candidates, in the order they are generated:
        1. Natural order (empty sort list) as the baseline
        2. All columns ordered by NFV score, highest first
        3. The single highest-scoring column
        4. Time-like columns (by name), alone and followed by the two
           highest-scoring other columns
        5. Every ordered pair drawn from the three highest-scoring columns
        6. ID-like columns (by name)
        7. Columns whose score is below 0.1, alone and followed by the
           time-like columns

    Name matching is by substring ('date', 'time', ... / 'id', 'key', 'code').

    Args:
        columns: List of all column names
        nfv_scores: NFV score for each column (higher = more repetition, better for RLE);
            must contain every name in `columns`
        max_combinations: Maximum number of combinations to return

    Returns:
        De-duplicated list of column orderings to test, truncated to
        max_combinations. With no columns only the natural-order baseline
        ([[]]) is produced.
    """
    time_tags = ['date', 'time', 'timestamp', 'year', 'month', 'day']
    id_tags = ['id', 'key', 'code']

    # Rule 1: natural order baseline
    candidates: List[List[str]] = [[]]

    # Everything else needs at least one column (indexing ranked[0] would fail)
    if columns:
        ranked = sorted(columns, key=lambda c: nfv_scores[c], reverse=True)

        # Rules 2 and 3: full ranking, then the single best column
        candidates.append(ranked)
        candidates.append([ranked[0]])

        # Rule 4: time-based patterns
        time_cols = [c for c in columns if any(tag in c.lower() for tag in time_tags)]
        if time_cols:
            candidates.append(time_cols)
            non_time = [c for c in ranked if c not in time_cols]
            if non_time:
                candidates.append(time_cols + non_time[:2])

        # Rule 5: ordered pairs from the top three columns
        top = ranked[:3]
        candidates.extend(list(pair) for pair in itertools.permutations(top, min(2, len(top))))

        # Rule 6: ID-like columns
        id_cols = [c for c in columns if any(tag in c.lower() for tag in id_tags)]
        if id_cols:
            candidates.append(id_cols)

        # Rule 7: columns scoring below 0.1, optionally followed by time columns
        low_score = [c for c in ranked if nfv_scores[c] < 0.1]
        if low_score:
            candidates.append(low_score)
            if time_cols:
                candidates.append(low_score + time_cols)

    # Remove duplicates while preserving first-seen order
    seen = set()
    unique_candidates = []
    for combo in candidates:
        key = tuple(combo)
        if key not in seen:
            seen.add(key)
            unique_candidates.append(combo)

    return unique_candidates[:max_combinations]
400
+
401
+
402
def test_column_orderings_smart(con, delta_path: str, table_name: str = None, limit: int = None,
                                mode: str = "natural",
                                min_distinct_threshold: int = 2,
                                max_cardinality_pct: float = 0.01,
                                max_ordering_depth: int = 3) -> Optional[pd.DataFrame]:
    """
    Test column orderings for RLE optimization.

    Modes:
        - "natural": Calculate RLE for natural order only (baseline)
        - "auto": Natural order + cardinality-based ordering (low to high)
        - "advanced": Natural + cardinality + greedy incremental search

    Any other mode value runs the cardinality analysis but only reports the
    natural-order baseline.

    Args:
        con: DuckDB connection
        delta_path: Path to Delta table (used for RLE calculation with file_row_number via delta_scan)
        table_name: Optional table name for cardinality calculation on full dataset (if None, uses delta_path)
        limit: Optional limit on number of rows to analyze (for testing only)
        mode: Analysis mode - "natural", "auto", or "advanced" (default: "natural")
        min_distinct_threshold: Exclude columns with fewer distinct values (default: 2, i.e. only exclude constants with 1 value)
        max_cardinality_pct: Exclude columns with cardinality ratio above this value (default: 0.01 = 1%)
        max_ordering_depth: Maximum depth for greedy incremental search in "advanced" mode (default: 3)

    Returns:
        DataFrame with columns sort_order, columns_used, total_rle_all and one
        RLE count per table column, best ordering first. None when there are
        no columns to analyze or none is suitable for reordering.
    """
    banner = "=" * 60

    def total_runs(counts):
        # A missing count (None) is treated as zero runs
        return sum((runs or 0) for runs in counts.values())

    print("Analyzing column characteristics...")

    # For "natural" mode, just calculate RLE on natural order
    if mode == "natural":
        print("\n" + banner)
        print("Mode: NATURAL ORDER (baseline)")
        print(banner)
        print("Calculating RLE for natural file order (single pass)...")

        rle_counts = calculate_rle_for_columns(con, delta_path, None, limit)

        total_rle_all = total_runs(rle_counts)
        # The result keys are the table's columns, so no extra DESCRIBE is needed
        column_count = len(rle_counts)
        average_rle = total_rle_all / column_count if column_count else 0.0

        print(f"\nResults:")
        print(f" Total RLE (all columns): {total_rle_all:,}")
        print(f" Average RLE per column: {average_rle:.1f}")

        df = pd.DataFrame([{
            'sort_order': 'natural_order',
            'columns_used': 'file_row_number',
            'total_rle_all': total_rle_all,
            **rle_counts
        }])
        print(f"\n{banner}")
        print(f"✓ Analysis complete!")
        print(f"{banner}")

        return df

    # For "auto" and "advanced" modes, calculate cardinality ratios first
    print("\nCalculating cardinality ratios on full dataset...")
    if table_name:
        card_stats = calculate_cardinality_ratio(con, table_name, is_parquet=False)
    else:
        # Fallback: use delta_scan directly
        card_stats = calculate_cardinality_ratio(con, f"delta_scan('{delta_path}')", is_parquet=False)

    if not card_stats:
        # Without this guard the row-count lookup below raises StopIteration
        print("\n⚠️ No columns found - nothing to analyze.")
        return None

    print(f"\nColumn Cardinality Ratios (lower = better for RLE):")
    for col, stats in sorted(card_stats.items(), key=lambda item: item[1]['cardinality_ratio']):
        card_pct = stats['cardinality_ratio'] * 100
        print(f" {col}: {card_pct:.3f}% (distinct: {stats['distinct_values']:,}, rows: {stats['total_rows']:,})")

    # Extract just the ratios for easier handling
    cardinality_ratios = {col: stats['cardinality_ratio'] for col, stats in card_stats.items()}

    # Sort columns by cardinality for ordering (lower cardinality = better for RLE)
    sorted_by_cardinality = sorted(card_stats, key=lambda c: cardinality_ratios[c])

    # Exclude columns that won't benefit from reordering:
    # 1. Too constant: < min_distinct_threshold (default: 2, only excludes single-value columns)
    # 2. Too fragmented: cardinality_ratio > max_cardinality_pct (default: 0.01 = 1%)
    constant_cols = [c for c in sorted_by_cardinality
                     if card_stats[c]['distinct_values'] < min_distinct_threshold]

    fragmented_cols = [c for c in sorted_by_cardinality
                       if cardinality_ratios[c] > max_cardinality_pct]

    skipped = set(constant_cols) | set(fragmented_cols)
    good_for_reordering = [c for c in sorted_by_cardinality if c not in skipped]
    optimizable = set(good_for_reordering)

    if constant_cols:
        print(f"\n✓ Skipping constant columns (< {min_distinct_threshold} distinct values): {', '.join(constant_cols)}")
        print(f" These compress perfectly regardless of ordering.")

    if fragmented_cols:
        print(f"✓ Skipping high-cardinality columns (cardinality > {max_cardinality_pct*100:.0f}%): {', '.join(fragmented_cols)}")
        print(f" These are too fragmented to benefit from reordering.")

    if not good_for_reordering:
        print("\n⚠️ No columns suitable for reordering optimization!")
        print(" All columns are either nearly constant or have too many unique values.")
        return None

    print(f"\n✓ Analyzing {len(good_for_reordering)} columns suitable for reordering")

    # Every entry carries the same total row count
    total_rows = next(iter(card_stats.values()))['total_rows']
    print(f"✓ Table size: {total_rows:,} rows")

    def only_optimizable(counts):
        # Restrict run counts to the columns worth reordering
        return {col: (runs or 0) for col, runs in counts.items() if col in optimizable}

    def result_row(label, sort_cols, counts, method):
        # Build one result record (summary metrics + per-column counts)
        filtered = only_optimizable(counts)
        optimizable_total = sum(filtered.values())
        return {
            'sort_order': label,
            'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
            'total_rle_all': total_runs(counts),  # All columns
            'optimizable_rle': optimizable_total,  # Only columns being optimized
            # Guard: the scanned columns may not overlap the analyzed ones
            'avg_rle': optimizable_total / len(filtered) if filtered else 0.0,
            # Lower cardinality columns weigh less in this score
            'cardinality_weighted_score': sum(filtered[col] * cardinality_ratios[col] for col in filtered),
            'method': method,
            **counts  # Include individual column RLE counts
        }

    # Calculate RLE ONLY on natural order for baseline (single pass - fast!)
    print("\n" + banner)
    print("Calculating baseline RLE (natural order - single pass)")
    print(banner)
    baseline = calculate_rle_for_columns(con, delta_path, None, limit)

    # Filter baseline to only include good_for_reordering columns
    baseline_filtered = only_optimizable(baseline)
    baseline_total = sum(baseline_filtered.values())

    print(f"Baseline RLE runs (columns worth reordering):")
    for col in sorted(baseline_filtered, key=lambda c: baseline_filtered[c]):
        print(f" {col}: {baseline_filtered[col]:,} runs")

    # Define only the most promising orderings to test
    orderings_to_test = [
        ([], 'natural_order'),  # Baseline
    ]

    # Add cardinality-based ordering for "auto" and "advanced" modes
    if mode in ["auto", "advanced"] and len(good_for_reordering) >= 2:
        orderings_to_test.append((good_for_reordering, 'by_cardinality'))

    print(f"\n✓ Testing {len(orderings_to_test)} orderings")
    print(banner)

    results = []

    for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
        print(f"\n[{i}/{len(orderings_to_test)}] Testing: {label}")
        if sort_cols:
            print(f" Order: {', '.join(sort_cols)}")

        if i == 1:
            # Use baseline for natural order (already calculated)
            rle_counts = baseline
        else:
            rle_counts = calculate_rle_for_columns(con, delta_path, sort_cols, limit)

        row = result_row(label, sort_cols, rle_counts, 'single_pass')

        print(f" Total RLE (all columns): {row['total_rle_all']:,}")
        print(f" Optimizable columns RLE: {row['optimizable_rle']:,}")
        print(f" Avg RLE (optimizable): {row['avg_rle']:.1f}")

        results.append(row)

    # Greedy incremental search (only in "advanced" mode)
    if mode == "advanced" and max_ordering_depth > 0 and len(good_for_reordering) >= 2:
        print(f"\n{banner}")
        print(f"ADVANCED MODE: Greedy Incremental Search (max depth: {max_ordering_depth})")
        print(f"{banner}")
        print(f"Building optimal ordering column-by-column, testing all positions")
        print(f"at each depth to find the best incremental improvement.\n")

        current_best_ordering = []
        current_best_rle = baseline_total
        remaining_columns = list(good_for_reordering)
        max_depth = min(max_ordering_depth, len(good_for_reordering))

        for depth in range(1, max_depth + 1):
            print(f"\n--- Depth {depth}: Testing {len(remaining_columns)} candidate columns ---")

            best_depth_ordering = None
            best_depth_rle = float('inf')
            best_depth_col = None
            best_depth_position = None
            best_depth_rle_counts = None

            # Try adding each remaining column at each possible position (including end)
            for candidate_col in remaining_columns:
                for insert_pos in range(len(current_best_ordering) + 1):
                    test_ordering = (current_best_ordering[:insert_pos]
                                     + [candidate_col]
                                     + current_best_ordering[insert_pos:])

                    rle_counts = calculate_rle_for_columns(con, delta_path, test_ordering, limit)

                    # Score on optimizable columns only
                    total_rle = sum(only_optimizable(rle_counts).values())

                    # Strict comparison keeps the first ordering on ties
                    if total_rle < best_depth_rle:
                        best_depth_rle = total_rle
                        best_depth_ordering = test_ordering
                        best_depth_col = candidate_col
                        best_depth_position = insert_pos
                        best_depth_rle_counts = rle_counts

            if best_depth_rle < current_best_rle:
                improvement_pct = ((current_best_rle - best_depth_rle) / current_best_rle) * 100
                print(f"✓ Best at depth {depth}: Add '{best_depth_col}' at position {best_depth_position}")
                print(f" Ordering: {', '.join(best_depth_ordering)}")
                print(f" RLE: {best_depth_rle:,} runs (improved {improvement_pct:.1f}% from previous depth)")

                # Update for next depth
                current_best_ordering = best_depth_ordering
                current_best_rle = best_depth_rle
                remaining_columns.remove(best_depth_col)

                results.append(result_row(f'greedy_depth_{depth}', best_depth_ordering,
                                          best_depth_rle_counts, 'greedy_incremental'))
            else:
                print(f"✗ No improvement found at depth {depth} - stopping early")
                print(f" Best RLE remains: {current_best_rle:,} runs")
                break

        print(f"\n{banner}")
        print(f"Greedy Search Complete")
        print(f"{banner}")
        if current_best_ordering:
            print(f"Final greedy ordering: {', '.join(current_best_ordering)}")
            print(f"Final RLE: {current_best_rle:,} runs")

    # Convert to DataFrame and sort by optimizable RLE (lower is better)
    df = pd.DataFrame(results)
    df = df.sort_values('optimizable_rle')

    print(f"\n{banner}")
    print(f"✓ Analysis complete!")
    print(f"{banner}")
    print(f"Best ordering: {df.iloc[0]['sort_order']}")
    print(f"Best optimizable RLE: {df.iloc[0]['optimizable_rle']:,} runs (lower is better)")
    print(f"Total RLE (all columns): {df.iloc[0]['total_rle_all']:,} runs")

    best_rle = df.iloc[0]['optimizable_rle']
    if len(df) > 1 and baseline_total > 0:
        pct = ((baseline_total - best_rle) / baseline_total) * 100
        if pct > 0:
            print(f"Improvement: {pct:.1f}% fewer runs vs natural order")

    # Remove internal scoring columns from the returned frame
    # Keep: sort_order, columns_used, total_rle_all, and individual column RLE counts
    display_df = df.drop(columns=['optimizable_rle', 'avg_rle', 'cardinality_weighted_score', 'method'], errors='ignore')

    return display_df


# The name starts with "test_", so pytest would try to collect this function
# whenever it is imported into a test module; opt out explicitly.
test_column_orderings_smart.__test__ = False
697
+
698
+
699
+ # Example usage:
700
+ # delta_path = 'abfss://tmp@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Tables/unsorted/summary'
701
+ #
702
+ # # Fast single-pass analysis (recommended for all table sizes)
703
+ # results_df = test_column_orderings_smart(con, delta_path, table_name='summary')
704
+ #
705
+ # # Show results
706
+ # print("\nBest orderings:")
707
+ # print(results_df[['sort_order', 'columns_used', 'total_rle_all']].head())
708
+ #
709
+ # # The function automatically:
710
+ # # - Calculates exact cardinality ratios (or approximate for >100M rows)
711
+ # # - Excludes columns that won't benefit from reordering
712
+ # # - In "auto" mode, tests the natural order plus one cardinality-based ordering (lowest cardinality first)
713
+ # # - Uses single-pass RLE calculation (fast!)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev1
3
+ Version: 0.2.19.dev3
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -7,6 +7,7 @@ duckrun/core.py
7
7
  duckrun/files.py
8
8
  duckrun/lakehouse.py
9
9
  duckrun/notebook.py
10
+ duckrun/rle.py
10
11
  duckrun/runner.py
11
12
  duckrun/semantic_model.py
12
13
  duckrun/stats.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev1"
7
+ version = "0.2.19.dev3"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
File without changes
File without changes
File without changes