duckrun 0.2.19.dev1__tar.gz → 0.2.19.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/PKG-INFO +1 -1
  2. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/__init__.py +2 -1
  3. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/core.py +113 -0
  4. duckrun-0.2.19.dev2/duckrun/rle.py +521 -0
  5. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/PKG-INFO +1 -1
  6. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/SOURCES.txt +3 -1
  7. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/pyproject.toml +1 -1
  8. duckrun-0.2.19.dev2/tests/test_rle.py +10 -0
  9. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/LICENSE +0 -0
  10. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/README.md +0 -0
  11. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/auth.py +0 -0
  12. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/files.py +0 -0
  13. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/lakehouse.py +0 -0
  14. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/notebook.py +0 -0
  15. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/runner.py +0 -0
  16. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/semantic_model.py +0 -0
  17. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/stats.py +0 -0
  18. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/writer.py +0 -0
  19. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev1
3
+ Version: 0.2.19.dev2
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -2,10 +2,11 @@
2
2
 
3
3
  from duckrun.core import Duckrun
4
4
  from duckrun.notebook import import_notebook_from_web, import_notebook
5
+ from duckrun import rle
5
6
 
6
7
# Keep in sync with the version declared in pyproject.toml / PKG-INFO;
# it previously lagged behind at "0.2.18".
__version__ = "0.2.19.dev2"

# Expose unified connect method at module level
connect = Duckrun.connect

__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
@@ -1244,8 +1244,121 @@ class Duckrun(WorkspaceOperationsMixin):
1244
1244
  refresh=refresh
1245
1245
  )
1246
1246
 
1247
def rle(self, table_name: str = None, mode: str = "summary", sort_columns: List[str] = None,
        limit: int = None, max_combinations: int = 20, use_stratified_sampling: bool = True,
        num_segments: int = 5, segment_size: int = 1000):
    """
    Analyze RLE (Run-Length Encoding) compression potential for a Delta Lake table.

    Only the first data file listed by the Delta table is analyzed.

    Args:
        table_name: Name of the table to analyze. Can be:
            - 'table_name' (uses current schema)
            - 'schema.table_name' (specific schema)
            - None (all-tables summary; not implemented yet, returns None)
        mode: Analysis mode:
            - "summary": per-column NFV ratio (distinct values / rows), the default
            - "smart": heuristic search over candidate sort orders
            - "full": currently runs the same analysis as "smart"
        sort_columns: Optional list of columns to sort by. When given in
            "smart"/"full" mode, only this ordering is evaluated and the
            per-column run counts for it are returned.
        limit: Optional limit on the number of rows to analyze. In
            "smart"/"full" mode a limit switches off stratified sampling.
        max_combinations: Maximum number of orderings to test (smart mode)
        use_stratified_sampling: If True (and no limit is set), sample
            segments spread across the whole file
        num_segments: Number of segments for stratified sampling
        segment_size: Size of each segment for sampling

    Returns:
        A pandas DataFrame with the analysis results, or None when the
        request is invalid or the table cannot be read.

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
        con.rle("mytable")                      # summary mode
        con.rle("mytable", "smart")             # search for a good ordering
        con.rle("otherschema.mytable", "smart") # table from another schema
        con.rle("mytable", "full", sort_columns=["date", "region"])
    """
    # Validate the request before importing anything or touching storage,
    # so a bad call fails fast and cheaply.
    if table_name is None:
        if mode != "summary":
            print("⚠️ Table name is required for 'smart' and 'full' modes")
            return None
        # TODO: Implement all-tables summary
        print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
        return None

    if mode not in ("summary", "smart", "full"):
        print(f"❌ Unknown mode: {mode}. Use 'summary', 'smart', or 'full'")
        return None

    import pandas as pd
    from deltalake import DeltaTable
    from .rle import (
        calculate_nfv_score,
        test_column_orderings_smart,
        calculate_rle_for_columns
    )

    # Parse schema.table or just table
    if '.' in table_name:
        schema_name, tbl = table_name.split('.', 1)
    else:
        schema_name = self.schema
        tbl = table_name

    # Construct the full table path using the same logic as get_stats
    table_path = f"{self.table_base_url}{schema_name}/{tbl}"

    print(f"📊 Analyzing table: {schema_name}.{tbl}")

    # Resolve the actual parquet files from the Delta log.
    try:
        delta_files = DeltaTable(table_path).files()
    except Exception as e:
        print(f"❌ Error accessing Delta table: {e}")
        return None

    if not delta_files:
        print("⚠️ Table is empty (no files)")
        return None

    # For now, analyze the first file (can be extended to analyze all files)
    parquet_path = table_path + "/" + delta_files[0]

    if mode == "summary":
        # Quick NFV analysis, best (lowest) ratio first.
        nfv_scores = calculate_nfv_score(self.con, parquet_path, limit)
        return pd.DataFrame(
            [
                {"column": col, "nfv_score": score}
                for col, score in sorted(nfv_scores.items(), key=lambda x: x[1])
            ],
            columns=["column", "nfv_score"],
        )

    if sort_columns:
        # The caller asked about one specific ordering: measure just that.
        run_counts = calculate_rle_for_columns(self.con, parquet_path, sort_columns, limit)
        return pd.DataFrame(
            [{"column": col, "rle_runs": runs} for col, runs in run_counts.items()],
            columns=["column", "rle_runs"],
        )

    # "smart" and "full" currently share the heuristic ordering search.
    return test_column_orderings_smart(
        self.con,
        parquet_path,
        limit=limit,
        max_combinations=max_combinations,
        use_stratified_sampling=use_stratified_sampling,
        num_segments=num_segments,
        segment_size=segment_size
    )
1358
+
1247
1359
  def close(self):
1248
1360
  """Close DuckDB connection"""
1361
+
1249
1362
  if self.con:
1250
1363
  self.con.close()
1251
1364
  print("Connection closed")
@@ -0,0 +1,521 @@
1
+ import itertools
2
+ from typing import List, Dict, Tuple, Optional
3
+ import pandas as pd
4
+
5
def analyze_parquet_row_groups(con, parquet_path: str) -> Optional[pd.DataFrame]:
    """
    Read the Parquet column-chunk metadata of a file without scanning its data.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file

    Returns:
        DataFrame with the rows of DuckDB's parquet_metadata() for the file,
        or None if the metadata could not be read.
    """
    # Double any single quote so the path cannot terminate the SQL string literal.
    safe_path = parquet_path.replace("'", "''")
    try:
        return con.sql(f"""
            SELECT * FROM parquet_metadata('{safe_path}')
        """).df()
    except Exception as e:
        # Best effort: metadata is an optional extra for the caller.
        print(f"Could not read parquet metadata: {e}")
        return None
23
+
24
+
25
def estimate_rle_from_row_groups(con, parquet_path: str) -> Optional[Dict[str, dict]]:
    """
    Estimate RLE potential from Parquet row group statistics.

    A column chunk whose min equals its max (and that reports no NULLs)
    holds a single value, so the whole row group is one RLE run for that
    column.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file

    Returns:
        Dictionary with column stats:
        {col: {'constant_groups': N, 'total_groups': M, 'constant_ratio': ratio}},
        or None if the metadata could not be read.
    """
    # Double any single quote so the path cannot terminate the SQL string literal.
    safe_path = parquet_path.replace("'", "''")

    # Per-row-group statistics live in parquet_metadata();
    # parquet_file_metadata() only exposes file-level fields.
    stats_query = f"""
        SELECT
            path_in_schema,
            COUNT(*) AS total_groups,
            SUM(CASE
                    WHEN stats_min_value IS NOT NULL
                     AND stats_min_value = stats_max_value
                     AND COALESCE(stats_null_count, 0) = 0
                    THEN 1
                    ELSE 0
                END) AS constant_groups
        FROM parquet_metadata('{safe_path}')
        GROUP BY path_in_schema
    """

    try:
        rows = con.sql(stats_query).fetchall()
    except Exception as e:
        print(f"Parquet metadata not available in this DuckDB version: {e}")
        print("Falling back to stratified sampling...")
        return None

    print("Row group metadata available!")

    result = {}
    for column, total_groups, constant_groups in rows:
        total_groups = int(total_groups or 0)
        constant_groups = int(constant_groups or 0)
        result[column] = {
            'constant_groups': constant_groups,
            'total_groups': total_groups,
            'constant_ratio': constant_groups / total_groups if total_groups else 0.0,
        }
    return result
55
+
56
+
57
def stratified_rle_sampling(con, parquet_path: str, sort_columns: List[str] = None,
                            num_segments: int = 5, segment_size: int = 1000) -> Tuple[Dict[str, int], Dict[str, dict]]:
    """
    Sample RLE density across multiple segments of the file.

    The whole file is ordered first (by sort_columns, or by file position),
    then contiguous segments are cut from that ordering and the number of
    value runs per column is counted inside each segment.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        sort_columns: List of columns to sort by before calculating RLE. If None, uses natural order.
        num_segments: Number of segments to sample across the file
        segment_size: Number of rows per segment

    Returns:
        A tuple (estimated_runs, density_stats):
        - estimated_runs: {column: estimated run count for the full file}
        - density_stats: {column: {'avg_density', 'min_density', 'max_density',
          'std_density', 'estimated_runs', 'variance_coefficient'}}
    """
    # Double any single quote so the path cannot terminate the SQL string literal.
    safe_path = parquet_path.replace("'", "''")

    def _quote(name: str) -> str:
        # Quote identifiers so names with spaces or keywords stay valid SQL.
        return '"' + name.replace('"', '""') + '"'

    # Get total row count
    total_rows = con.sql(f"""
        SELECT COUNT(*) FROM read_parquet('{safe_path}')
    """).fetchone()[0]

    # Get column names
    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT * FROM read_parquet('{safe_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()

    column_names = [row[0] for row in columns]

    # Build ORDER BY clause. file_row_number is appended as a tie-breaker so
    # the ordering (and therefore the sample) is deterministic.
    if sort_columns:
        known_columns = set(column_names)
        sort_terms = [_quote(c) if c in known_columns else c for c in sort_columns]
        order_by_clause = "ORDER BY " + ", ".join(sort_terms) + ", file_row_number"
    else:
        order_by_clause = "ORDER BY file_row_number"

    # Calculate segment positions spread across the file
    if num_segments == 1:
        segment_positions = [0]
    else:
        step = total_rows // (num_segments + 1)
        segment_positions = [step * (i + 1) for i in range(num_segments)]

    # Sample each segment and calculate RLE density
    all_densities = {col: [] for col in column_names}

    for start_pos in segment_positions:
        for col in column_names:
            quoted = _quote(col)
            # The ENTIRE dataset is ordered first, then a window is cut from
            # it; sampling before sorting would not reflect the sorted layout.
            run_count, sampled_rows = con.sql(f"""
                WITH sorted_data AS (
                    SELECT
                        {quoted},
                        ROW_NUMBER() OVER ({order_by_clause}) AS sorted_row_num
                    FROM read_parquet('{safe_path}', file_row_number = TRUE)
                ),
                segment_data AS (
                    SELECT
                        {quoted},
                        sorted_row_num
                    FROM sorted_data
                    WHERE sorted_row_num >= {start_pos}
                    ORDER BY sorted_row_num
                    LIMIT {segment_size}
                ),
                runs AS (
                    SELECT
                        CASE
                            -- IS DISTINCT FROM treats NULL as a comparable value,
                            -- so NULL runs are counted exactly once.
                            WHEN ROW_NUMBER() OVER (ORDER BY sorted_row_num) = 1
                              OR LAG({quoted}) OVER (ORDER BY sorted_row_num) IS DISTINCT FROM {quoted}
                            THEN 1
                            ELSE 0
                        END AS new_run
                    FROM segment_data
                )
                SELECT COALESCE(SUM(new_run), 0) AS rle_run_count,
                       COUNT(*) AS sampled_rows
                FROM runs
            """).fetchone()

            if not sampled_rows:
                # Segment fell past the end of the file: nothing to measure.
                continue

            # Density = runs per row actually sampled (a short final segment
            # must not be diluted by the requested segment_size).
            all_densities[col].append(run_count / sampled_rows)

    # Estimate total runs for full file
    estimated_runs = {}
    density_stats = {}

    for col in column_names:
        densities = all_densities[col]
        if densities:
            avg_density = sum(densities) / len(densities)
            min_density = min(densities)
            max_density = max(densities)
            std_density = (sum((d - avg_density) ** 2 for d in densities) / len(densities)) ** 0.5
        else:
            avg_density = min_density = max_density = std_density = 0.0

        estimated_total = int(avg_density * total_rows)
        estimated_runs[col] = estimated_total

        density_stats[col] = {
            'avg_density': avg_density,
            'min_density': min_density,
            'max_density': max_density,
            'std_density': std_density,
            'estimated_runs': estimated_total,
            'variance_coefficient': std_density / avg_density if avg_density > 0 else 0
        }

    return estimated_runs, density_stats
169
+
170
+
171
def calculate_rle_for_columns(con, parquet_path: str, sort_columns: List[str] = None, limit: int = None) -> Dict[str, int]:
    """
    Calculate RLE runs for all columns in a parquet file, optionally after sorting.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        sort_columns: List of columns to sort by (in order). If None, uses natural file order.
        limit: Optional limit on number of rows to analyze (the first rows of the chosen ordering)

    Returns:
        Dictionary mapping column names to RLE run counts
    """
    # Double any single quote so the path cannot terminate the SQL string literal.
    safe_path = parquet_path.replace("'", "''")

    def _quote(name: str) -> str:
        # Quote identifiers so names with spaces or keywords stay valid SQL.
        return '"' + name.replace('"', '""') + '"'

    # Get all column names
    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT *
            FROM read_parquet('{safe_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()

    column_names = [row[0] for row in columns]

    # Build ORDER BY clause. file_row_number breaks ties deterministically.
    if sort_columns:
        known_columns = set(column_names)
        sort_terms = [_quote(c) if c in known_columns else c for c in sort_columns]
        order_by = "ORDER BY " + ", ".join(sort_terms) + ", file_row_number"
    else:
        order_by = "ORDER BY file_row_number ASC"

    limit_clause = f"LIMIT {limit}" if limit else ""

    # Calculate RLE for each column
    results = {}
    for column_name in column_names:
        quoted = _quote(column_name)
        # Runs must be detected along the REQUESTED ordering, so each row gets
        # its position in that ordering and LAG walks those positions.
        # Walking file_row_number here would silently ignore the sort.
        rle_count = con.sql(f"""
            WITH ordered_data AS (
                SELECT
                    {quoted},
                    ROW_NUMBER() OVER ({order_by}) AS sorted_row_num
                FROM read_parquet('{safe_path}', file_row_number = TRUE)
                ORDER BY sorted_row_num
                {limit_clause}
            ),
            runs AS (
                SELECT
                    CASE
                        -- IS DISTINCT FROM treats NULL as a comparable value,
                        -- so NULL runs are counted exactly once.
                        WHEN sorted_row_num = 1
                          OR LAG({quoted}) OVER (ORDER BY sorted_row_num) IS DISTINCT FROM {quoted}
                        THEN 1
                        ELSE 0
                    END AS new_run
                FROM ordered_data
            )
            SELECT COALESCE(SUM(new_run), 0) AS rle_run_count
            FROM runs
        """).fetchone()[0]

        results[column_name] = int(rle_count)

    return results
234
+
235
+
236
def calculate_nfv_score(con, parquet_path: str, limit: int = None) -> Dict[str, float]:
    """
    Calculate the NFV ratio (distinct values / rows) for each column.
    Lower NFV = better for RLE compression.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        limit: Optional limit on the number of rows read per column

    Returns:
        Dictionary mapping column names to NFV ratios (0-1, lower is better).
        A column whose ratio cannot be computed (no rows) scores 1.0.
    """
    # Double any single quote so the path cannot terminate the SQL string literal.
    safe_path = parquet_path.replace("'", "''")

    def _quote(name: str) -> str:
        # Quote identifiers so names with spaces or keywords stay valid SQL.
        return '"' + name.replace('"', '""') + '"'

    limit_clause = f"LIMIT {limit}" if limit else ""

    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT *
            FROM read_parquet('{safe_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()

    column_names = [row[0] for row in columns]
    nfv_scores = {}

    for col in column_names:
        quoted = _quote(col)
        result = con.sql(f"""
            WITH data AS (
                SELECT {quoted}
                FROM read_parquet('{safe_path}')
                {limit_clause}
            )
            SELECT
                COUNT(DISTINCT {quoted})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT as nfv_ratio
            FROM data
        """).fetchone()

        # With zero rows the ratio is NULL; fall back to the worst score so
        # callers can always sort and format the values as floats.
        ratio = result[0] if result else None
        nfv_scores[col] = ratio if ratio is not None else 1.0

    return nfv_scores
274
+
275
+
276
def filter_promising_combinations(columns: List[str], nfv_scores: Dict[str, float],
                                  max_combinations: int = 20) -> List[List[str]]:
    """
    Apply heuristics to filter down to the most promising column orderings.

    Heuristics used:
    1. Time/date columns first (temporal ordering)
    2. Low cardinality columns before high cardinality
    3. Correlated columns together (e.g., date + time)
    4. Avoid starting with high-cardinality columns

    Args:
        columns: List of all column names
        nfv_scores: NFV ratio for each column (lower = fewer distinct values).
            Columns missing from this mapping are treated as high cardinality (1.0).
        max_combinations: Maximum number of combinations to return

    Returns:
        List of promising column orderings to test. The empty ordering
        (natural file order) always comes first.
    """
    def _nfv(column: str) -> float:
        return nfv_scores.get(column, 1.0)

    # Sort columns by NFV (lower first = better for RLE)
    sorted_by_nfv = sorted(columns, key=_nfv)

    # Rule 1: Natural order baseline
    promising = [[]]

    if sorted_by_nfv:
        # Rule 2: NFV-based ordering (lowest to highest); copied so callers
        # mutating one ordering cannot affect another.
        promising.append(list(sorted_by_nfv))

        # Rule 3: Single best column (lowest NFV)
        promising.append([sorted_by_nfv[0]])

    # Rule 4: Time-based patterns (common column names)
    time_markers = ('date', 'time', 'timestamp', 'year', 'month', 'day')
    time_cols = [c for c in columns if any(t in c.lower() for t in time_markers)]
    if time_cols:
        promising.append(time_cols)
        # Time columns + low NFV columns
        non_time = [c for c in sorted_by_nfv if c not in time_cols]
        if non_time:
            promising.append(time_cols + non_time[:2])

    # Rule 5: Top 2-3 lowest NFV columns in different orders
    top_low_nfv = sorted_by_nfv[:3]
    for perm in itertools.permutations(top_low_nfv, min(2, len(top_low_nfv))):
        promising.append(list(perm))

    # Rule 6: ID-like columns first (common patterns)
    id_cols = [c for c in columns if any(t in c.lower() for t in ('id', 'key', 'code'))]
    if id_cols:
        promising.append(id_cols)

    # Rule 7: Categorical/enum-like columns (very low NFV < 0.1)
    categorical = [c for c in sorted_by_nfv if _nfv(c) < 0.1]
    if categorical:
        promising.append(categorical)
        # Categorical + time; a column that is both must appear only once,
        # otherwise the ordering would name the same sort key twice.
        if time_cols:
            promising.append(categorical + [c for c in time_cols if c not in categorical])

    # Remove duplicates while preserving order
    seen = set()
    unique_promising = []
    for combo in promising:
        key = tuple(combo)
        if key not in seen:
            seen.add(key)
            unique_promising.append(combo)

    # Limit to max_combinations
    return unique_promising[:max_combinations]
347
+
348
+
349
def test_column_orderings_smart(con, parquet_path: str, limit: int = None,
                                max_combinations: int = 20, use_stratified_sampling: bool = True,
                                num_segments: int = 5, segment_size: int = 1000) -> pd.DataFrame:
    """
    Test column orderings using heuristics to avoid testing all combinations.

    Orderings tested: the natural file order, all columns ordered by NFV
    (low to high), and every permutation of the (up to) three lowest-NFV
    columns that are not near-constant.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        limit: Optional limit on number of rows to analyze. Setting a limit
            disables stratified sampling and analyzes the first `limit` rows.
        max_combinations: Maximum number of orderings to test (at least the
            natural-order baseline is always tested)
        use_stratified_sampling: If True and no limit is set, estimate run
            counts from segments sampled across the entire file
        num_segments: Number of segments for stratified sampling
        segment_size: Size of each segment for sampling

    Returns:
        DataFrame sorted by total_rle with columns: sort_order, columns_used,
        total_rle, avg_rle, nfv_weighted_score, estimation_method, plus one
        column of RLE counts per data column
    """
    print("Analyzing column characteristics...")

    # Try to get row group metadata first
    print("\nAttempting to read Parquet row group metadata...")
    row_group_stats = analyze_parquet_row_groups(con, parquet_path)
    if row_group_stats is not None:
        print("✓ Row group metadata available")
        print(row_group_stats.head())

    # Get NFV scores for all columns (still use sampling for this as it's cheap)
    sample_size = limit if limit else 100000
    nfv_scores = calculate_nfv_score(con, parquet_path, sample_size)

    print(f"\nColumn NFV Scores (lower = better for RLE):")
    for col, score in sorted(nfv_scores.items(), key=lambda x: x[1]):
        print(f" {col}: {score:.4f}")

    # Decide whether to use stratified sampling or simple limit
    use_estimation = bool(use_stratified_sampling and not limit)

    if use_estimation:
        print("\n" + "="*60)
        print("Using STRATIFIED SAMPLING across entire file")
        print("="*60)

        # Get total row count
        safe_path = parquet_path.replace("'", "''")
        total_rows = con.sql(f"SELECT COUNT(*) FROM read_parquet('{safe_path}')").fetchone()[0]
        print(f"Total rows in file: {total_rows:,}")
        print(f"Sampling strategy: {num_segments} segments of {segment_size} rows each")

        # Get baseline with natural order
        print("\nCalculating baseline (natural order)...")
        baseline_counts, density_stats = stratified_rle_sampling(
            con, parquet_path, None, num_segments, segment_size
        )

        print("\nBaseline RLE Density Statistics:")
        for col, stats in sorted(density_stats.items(), key=lambda x: x[1]['estimated_runs']):
            cv = stats['variance_coefficient']
            warning = " ⚠️ HIGH VARIANCE" if cv > 0.3 else ""
            print(f" {col}: {stats['estimated_runs']:,} runs (density: {stats['avg_density']:.4f}, CV: {cv:.2f}){warning}")

        column_names = list(nfv_scores.keys())
    else:
        print("\n" + "="*60)
        print(f"Using simple sampling (first {limit or 'all'} rows)")
        print("="*60)

        # Get baseline (natural file order)
        print("\nCalculating baseline (natural file order)...")
        baseline_counts = calculate_rle_for_columns(con, parquet_path, None, limit)
        column_names = list(baseline_counts.keys())

    # Sort columns by NFV for the NFV-based ordering
    sorted_by_nfv = sorted(column_names, key=lambda c: nfv_scores[c])

    # Exclude obvious columns (very low NFV < 0.0001) from permutations
    # These are likely constant columns that compress perfectly anywhere
    nfv_threshold = 0.0001
    non_trivial_cols = [c for c in sorted_by_nfv if nfv_scores[c] >= nfv_threshold]
    trivial_cols = [c for c in sorted_by_nfv if nfv_scores[c] < nfv_threshold]

    if trivial_cols:
        print(f"\nExcluding trivial columns from permutations (NFV < {nfv_threshold}): {', '.join(trivial_cols)}")

    # Define specific orderings to test
    orderings_to_test = [
        ([], 'current_order'),  # Natural file order
        (sorted_by_nfv, 'order_by_nfv')  # Sorted by NFV (low to high)
    ]

    # Add permutations of top N lowest NFV columns (excluding trivial ones)
    top_n = min(3, len(non_trivial_cols))  # Top 3 non-trivial or fewer
    print(f"\nGenerating permutations of top {top_n} lowest non-trivial NFV columns...")
    for perm in itertools.permutations(non_trivial_cols[:top_n]):
        orderings_to_test.append((list(perm), f"perm_{', '.join(perm)}"))

    # Honor the caller's budget, but always keep the natural-order baseline
    # so the result is never empty.
    orderings_to_test = orderings_to_test[:max(1, max_combinations)]

    print(f"Testing {len(orderings_to_test)} orderings...")
    results = []

    for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
        print(f"\n[{i}/{len(orderings_to_test)}] Testing: {label}")

        if use_estimation:
            print(f" Sort order: {', '.join(sort_cols) if sort_cols else 'natural (file_row_number)'}")

        if not sort_cols:
            # Natural order was already measured as the baseline; reuse it
            # instead of scanning the file a second time.
            rle_counts = baseline_counts
        elif use_estimation:
            rle_counts, _ = stratified_rle_sampling(
                con, parquet_path, sort_cols, num_segments, segment_size
            )
        else:
            rle_counts = calculate_rle_for_columns(con, parquet_path, sort_cols, limit)

        # Calculate weighted score (considering both RLE and NFV)
        nfv_weighted = sum(rle_counts[col] * nfv_scores[col] for col in rle_counts)
        total_rle = sum(rle_counts.values())

        results.append({
            'sort_order': label,
            'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
            'total_rle': total_rle,
            # A file without data columns has no meaningful average.
            'avg_rle': total_rle / len(rle_counts) if rle_counts else 0,
            'nfv_weighted_score': nfv_weighted,
            'estimation_method': 'stratified' if use_estimation else 'sequential',
            **rle_counts
        })

    # Convert to DataFrame and sort by total RLE
    df = pd.DataFrame(results)
    df = df.sort_values('total_rle')

    print(f"\n✓ Analysis complete! Tested {len(orderings_to_test)} orderings.")

    if use_estimation:
        print("\n⚠️ Note: RLE counts are ESTIMATES based on stratified sampling.")
        print(" Use these for relative comparison. Run full analysis on best candidate.")

    return df
492
+
493
+
494
+ # Example usage:
495
+ # parquet_path = 'abfss://tmp@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Tables/unsorted/summary/0-1c557fc2-59fe-487f-a3ee-67b5e63257df-0.parquet'
496
+ #
497
+ # # OPTION 1: Fast stratified sampling across entire file (recommended for large files)
498
+ # results_df = test_column_orderings_smart(
499
+ # con,
500
+ # parquet_path,
501
+ # use_stratified_sampling=True,
502
+ # num_segments=5, # Sample 5 segments across the file
503
+ # segment_size=1000 # 1000 rows per segment
504
+ # )
505
+ #
506
+ # # OPTION 2: Traditional approach with limited rows (faster but less accurate)
507
+ # results_df = test_column_orderings_smart(
508
+ # con,
509
+ # parquet_path,
510
+ # limit=10000,
511
+ # use_stratified_sampling=False
512
+ # )
513
+ #
514
+ # # Show results
515
+ # print("\nTop 5 best orderings:")
516
+ # print(results_df[['sort_order', 'columns_used', 'total_rle', 'estimation_method']].head(5))
517
+ #
518
+ # # Once you identify the best ordering, verify with full file scan:
519
+ # best_ordering = results_df.iloc[0]['columns_used'].split(', ')
520
+ # print(f"\nVerifying best ordering on FULL file: {best_ordering}")
521
+ # full_rle = calculate_rle_for_columns(con, parquet_path, best_ordering if best_ordering[0] != 'file_row_number' else None, limit=None)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.19.dev1
3
+ Version: 0.2.19.dev2
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -7,6 +7,7 @@ duckrun/core.py
7
7
  duckrun/files.py
8
8
  duckrun/lakehouse.py
9
9
  duckrun/notebook.py
10
+ duckrun/rle.py
10
11
  duckrun/runner.py
11
12
  duckrun/semantic_model.py
12
13
  duckrun/stats.py
@@ -15,4 +16,5 @@ duckrun.egg-info/PKG-INFO
15
16
  duckrun.egg-info/SOURCES.txt
16
17
  duckrun.egg-info/dependency_links.txt
17
18
  duckrun.egg-info/requires.txt
18
- duckrun.egg-info/top_level.txt
19
+ duckrun.egg-info/top_level.txt
20
+ tests/test_rle.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.19.dev1"
7
+ version = "0.2.19.dev2"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,10 @@
1
import duckrun


def main():
    """Manual smoke check of the RLE integration against a live lakehouse.

    Kept behind a __main__ guard so that merely importing (or collecting)
    this file does not open a remote connection.
    """
    con = duckrun.connect("tmp/data.lakehouse/unsorted")
    try:
        # Test smart mode on calendar table
        print("Testing RLE smart mode on calendar table...")
        result = con.rle("calendar", "smart")

        # rle() returns None when the table cannot be read or analyzed.
        if result is None:
            print("RLE analysis returned no result")
            return

        print("\nTop 5 best orderings:")
        print(result[['sort_order', 'columns_used', 'total_rle', 'estimation_method']].head())
    finally:
        con.close()


if __name__ == "__main__":
    main()
File without changes
File without changes
File without changes