duckrun 0.2.21.dev2__tar.gz → 0.2.22.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/PKG-INFO +1 -1
  2. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/core.py +92 -0
  3. duckrun-0.2.22.dev0/duckrun/rle.py +362 -0
  4. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/PKG-INFO +1 -1
  5. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/SOURCES.txt +1 -0
  6. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/pyproject.toml +1 -1
  7. duckrun-0.2.22.dev0/tests/test_rle_analysis.py +149 -0
  8. duckrun-0.2.21.dev2/duckrun/rle.py +0 -940
  9. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/LICENSE +0 -0
  10. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/README.md +0 -0
  11. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/__init__.py +0 -0
  12. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/auth.py +0 -0
  13. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/ducklake_metadata.py +0 -0
  14. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/files.py +0 -0
  15. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/lakehouse.py +0 -0
  16. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/notebook.py +0 -0
  17. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/runner.py +0 -0
  18. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/semantic_model.py +0 -0
  19. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/stats.py +0 -0
  20. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/writer.py +0 -0
  21. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/dependency_links.txt +0 -0
  22. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/requires.txt +0 -0
  23. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/top_level.txt +0 -0
  24. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/setup.cfg +0 -0
  25. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_checkpoint_format.py +0 -0
  26. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_deploy_fresh.py +0 -0
  27. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_ducklake_export.py +0 -0
  28. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_filename.py +0 -0
  29. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_register.py +0 -0
  30. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_rle.py +0 -0
  31. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_writer_dictionary.py +0 -0
  32. {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_writer_integration.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.21.dev2
3
+ Version: 0.2.22.dev0
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -1050,6 +1050,98 @@ class Duckrun(WorkspaceOperationsMixin):
1050
1050
  """
1051
1051
  self.con.register(name, df)
1052
1052
 
1053
def get_rle_stats(self, table_name: str, top_n_values: int = 10):
    """
    Profile every column of a table for run-length-encoding (RLE) potential.

    This is a thin convenience wrapper: all work is delegated to
    ``duckrun.rle.get_table_stats``, called with this connection object.

    Args:
        table_name: Name of the table to analyze.
        top_n_values: How many of the most frequent values per column are
            taken into account for the coverage figure (default: 10).

    Returns:
        DataFrame with one row per column, ordered by ``repetition_score``
        (best RLE candidates first), containing:
        - column_name, data_type
        - total_rows
        - null_count, null_pct
        - ndv: exact number of distinct values
        - cardinality_ratio: ndv / total_rows (lower = better for RLE)
        - top_value, top_value_count, top_value_pct: most frequent value
        - top_n_coverage: percentage of rows covered by the top N values
        - repetition_score: RLE potential score (higher = better)

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Analyze a table
        stats = con.get_rle_stats('sales')
        print(stats)

        # Consider the 20 most frequent values per column
        stats = con.get_rle_stats('sales', top_n_values=20)
    """
    # Imported lazily so the analysis module is only loaded when it is used.
    from .rle import get_table_stats as _rle_table_stats

    return _rle_table_stats(self, table_name, top_n_values)
1087
+
1088
def get_value_frequency(self, table_name: str, column_name: str, limit: int = 20):
    """
    Return the value frequency distribution of a single column.

    Thin convenience wrapper around
    ``duckrun.rle.get_value_frequency_details``, called with this
    connection object.

    Args:
        table_name: Name of the table.
        column_name: Name of the column to analyze.
        limit: Maximum number of distinct values to return (default: 20).

    Returns:
        DataFrame ordered from most to least frequent value, with columns:
        - value: the distinct value
        - count: number of occurrences
        - percentage: share of the table's total rows
        - cumulative_pct: running total of ``percentage``

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Get the 20 most frequent values of a column
        freq = con.get_value_frequency('sales', 'status')
        print(freq)
    """
    # Imported lazily so the analysis module is only loaded when it is used.
    from .rle import get_value_frequency_details as _rle_value_frequency

    return _rle_value_frequency(self, table_name, column_name, limit)
1113
+
1114
def find_optimal_sort_order(self, table_name: str, max_combinations: int = 10):
    """
    Search for the column sort order that yields the fewest RLE runs.

    Candidate orderings are tried one by one and the number of RLE runs
    each would produce is measured, in the spirit of V-Order/VertiPaq
    layout optimization. The work is delegated to
    ``duckrun.rle.find_optimal_sort_order``, called with this connection
    object.

    Args:
        table_name: Name of the table to analyze.
        max_combinations: Maximum sort orderings to test (default: 10).

    Returns:
        DataFrame with the tested orderings, best compression first:
        - sort_order: column ordering (e.g., "date → DUID → time")
        - total_runs: total RLE runs (fewer = better compression)
        - compression_score: compression effectiveness (higher = better)
        - one additional column per table column with its RLE run count

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Find optimal sort order
        optimal = con.find_optimal_sort_order('energy_data')
        print(optimal)

        # Test more combinations
        optimal = con.find_optimal_sort_order('energy_data', max_combinations=20)
    """
    # Imported lazily so the analysis module is only loaded when it is used.
    from .rle import find_optimal_sort_order as _rle_find_optimal_sort_order

    return _rle_find_optimal_sort_order(self, table_name, max_combinations)
1144
+
1053
1145
  def get_stats(self, source: str = None, detailed = False):
1054
1146
  """
1055
1147
  Get comprehensive statistics for Delta Lake tables.
@@ -0,0 +1,362 @@
1
+ from typing import List, Dict, Tuple, Optional
2
+ import pandas as pd
3
+
4
+
5
def get_table_stats(duckrun_con, table_name: str,
                    top_n_values: int = 10) -> pd.DataFrame:
    """
    Get comprehensive table statistics including NDV and value frequency analysis.

    The theory: if a value appears frequently (high repetition), it may
    provide better RLE compression even if the column has higher NDV. This
    function helps identify such patterns.

    Args:
        duckrun_con: Object exposing the underlying DuckDB connection as
            its ``con`` attribute (e.g. the result of ``duckrun.connect()``).
        table_name: Name of the table to analyze. It is interpolated
            verbatim into the ``FROM`` clause of every query.
        top_n_values: Number of most frequent values per column used for
            ``top_n_coverage`` (default: 10).

    Returns:
        An empty DataFrame when the table exposes no columns; otherwise a
        DataFrame sorted by ``repetition_score`` (descending) with columns:
        - column_name: Name of the column
        - data_type: Data type of the column
        - total_rows: Total number of rows
        - null_count: Number of NULL values
        - null_pct: Percentage of NULL values
        - ndv: Number of distinct values (exact)
        - cardinality_ratio: NDV / total_rows (lower = better for RLE)
        - top_value: Most frequent value
        - top_value_count: Count of most frequent value
        - top_value_pct: Percentage of most frequent value
        - top_n_coverage: Percentage covered by top N values
        - repetition_score: Custom score indicating RLE potential (higher = better)
    """
    con = duckrun_con.con  # Get underlying DuckDB connection
    from_clause = table_name
    top_n = int(top_n_values)  # guarantees a plain integer in the LIMIT clause

    # Get column names and types
    schema_info = con.sql(f"""
        SELECT column_name, column_type
        FROM (DESCRIBE SELECT * FROM {from_clause})
    """).df()

    if schema_info.empty:
        return pd.DataFrame()

    # The row count is identical for every column, so compute it only once.
    total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
    column_total = len(schema_info)
    print(f"Analyzing {column_total} columns across {total_rows:,} rows...")

    results = []
    columns = zip(schema_info['column_name'], schema_info['column_type'])

    for position, (col_name, col_type) in enumerate(columns, start=1):
        print(f" [{position}/{column_total}] Analyzing column: {col_name}")

        # Quote the identifier so names containing spaces, punctuation or
        # reserved words still produce valid SQL.
        quoted = '"' + str(col_name).replace('"', '""') + '"'

        non_null, ndv = con.sql(f"""
            SELECT
                COUNT({quoted}) AS non_null,
                COUNT(DISTINCT {quoted}) AS ndv
            FROM {from_clause}
        """).fetchone()

        null_count = total_rows - non_null
        null_pct = (null_count / total_rows * 100) if total_rows > 0 else 0
        cardinality_ratio = (ndv / total_rows) if total_rows > 0 else 0

        # Get top N values with their frequencies. The percentage is only
        # evaluated for existing groups, i.e. never when total_rows is 0.
        top_values = con.sql(f"""
            SELECT
                {quoted} AS value,
                COUNT(*) AS count,
                COUNT(*) * 100.0 / {total_rows} AS percentage
            FROM {from_clause}
            WHERE {quoted} IS NOT NULL
            GROUP BY {quoted}
            ORDER BY count DESC
            LIMIT {top_n}
        """).df()

        if top_values.empty:
            top_value = None
            top_value_count = 0
            top_value_pct = 0
            top_n_coverage = 0
        else:
            # Select the column first, then the row: taking the row first
            # (``iloc[0][...]``) would coerce value/count/percentage to one
            # common dtype and turn integer counts into floats.
            top_value = top_values['value'].iloc[0]
            top_value_count = int(top_values['count'].iloc[0])
            top_value_pct = float(top_values['percentage'].iloc[0])
            top_n_coverage = float(top_values['percentage'].sum())

        # Calculate repetition score: higher means better for RLE
        # Score considers:
        # 1. How much the top value covers (higher = better)
        # 2. How much top N values cover (higher = better)
        # 3. Inverse of cardinality ratio (lower cardinality = better);
        #    the 0.01 offset keeps the division defined for a ratio of 0.
        repetition_score = (top_value_pct * 2 + top_n_coverage) / 3 / (cardinality_ratio + 0.01)

        results.append({
            'column_name': col_name,
            'data_type': col_type,
            'total_rows': total_rows,
            'null_count': null_count,
            'null_pct': round(null_pct, 2),
            'ndv': ndv,
            'cardinality_ratio': round(cardinality_ratio, 4),
            'top_value': top_value,
            'top_value_count': top_value_count,
            'top_value_pct': round(top_value_pct, 2),
            'top_n_coverage': round(top_n_coverage, 2),
            'repetition_score': round(repetition_score, 2),
        })

    # Sort by repetition score (best RLE candidates first)
    df = pd.DataFrame(results)
    df = df.sort_values('repetition_score', ascending=False).reset_index(drop=True)

    print(f"\n✓ Analysis complete!")
    print(f"\nTop columns by repetition score (best RLE candidates):")
    for rank, record in enumerate(df.head(5).to_dict('records'), start=1):
        print(f" {rank}. {record['column_name']}: score={record['repetition_score']}, "
              f"top_value_pct={record['top_value_pct']}%, ndv={record['ndv']:,}")

    return df
135
+
136
+
137
def get_value_frequency_details(duckrun_con, table_name: str, column_name: str,
                                limit: int = 20) -> pd.DataFrame:
    """
    Return the most frequent non-NULL values of one column.

    Useful for inspecting the repetition patterns that drive RLE
    compression. NULLs are excluded from the listed values, but the
    percentages are computed against the table's total row count.

    Args:
        duckrun_con: Object exposing the underlying DuckDB connection as
            its ``con`` attribute (e.g. the result of ``duckrun.connect()``).
        table_name: Name of the table to analyze; interpolated verbatim
            into the query.
        column_name: Name of the column to analyze; interpolated verbatim
            into the query, so a name that needs quoting must already be
            quoted by the caller.
        limit: Maximum number of values to return (default: 20).

    Returns:
        DataFrame ordered by descending count with columns:
        - value: The distinct value
        - count: Number of occurrences
        - percentage: Percentage of total rows (rounded to 2 decimals)
        - cumulative_pct: Running total of percentage (rounded to 2 decimals)
    """
    db = duckrun_con.con  # underlying DuckDB connection
    source = table_name

    # Denominator for the percentages: every row of the table, NULLs included.
    row_total = db.sql(f"SELECT COUNT(*) FROM {source}").fetchone()[0]

    frequency_sql = f"""
        WITH value_counts AS (
            SELECT
                {column_name} as value,
                COUNT(*) as count,
                COUNT(*) * 100.0 / {row_total} as percentage
            FROM {source}
            WHERE {column_name} IS NOT NULL
            GROUP BY {column_name}
            ORDER BY count DESC
            LIMIT {limit}
        )
        SELECT
            value,
            count,
            percentage,
            SUM(percentage) OVER (ORDER BY count DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as cumulative_pct
        FROM value_counts
        ORDER BY count DESC
    """
    frequencies = db.sql(frequency_sql).df()

    # Nothing to round when no value qualified.
    if frequencies.empty:
        return frequencies

    for pct_column in ('percentage', 'cumulative_pct'):
        frequencies[pct_column] = frequencies[pct_column].round(2)

    return frequencies
194
+
195
+
196
def find_optimal_sort_order(duckrun_con, table_name: str,
                            max_combinations: int = 10) -> pd.DataFrame:
    """
    Determine optimal sort order using V-Order-like logic: pure compression testing.

    Procedure:
    1. Calculate the number of distinct values (NDV) of each column.
    2. Keep only low-cardinality columns (NDV / rows < 1%) as sort keys.
    3. Build candidate orderings: ascending NDV, descending NDV, and the
       permutations of the three lowest-NDV columns.
    4. For each candidate, sort the table and count the RLE runs of every
       column; rank candidates by the total number of runs.

    NO semantic understanding, NO query pattern assumptions.
    Pure mechanical testing of compression effectiveness.

    Args:
        duckrun_con: Object exposing the underlying DuckDB connection as
            its ``con`` attribute (e.g. the result of ``duckrun.connect()``).
        table_name: Name of the table to analyze; interpolated verbatim
            into the ``FROM`` clause of every query.
        max_combinations: Maximum sort orderings to test (default: 10).
            Values below 1 are treated as 1.

    Returns:
        An empty DataFrame when the table has no rows or fewer than two
        low-cardinality columns. Otherwise a DataFrame sorted by
        ``total_runs`` (ascending) with columns ``sort_order``,
        ``total_runs``, ``compression_score`` (rows / total runs) and one
        run-count column per table column.
    """
    from itertools import permutations

    con = duckrun_con.con  # Get underlying DuckDB connection
    from_clause = table_name

    def quote(identifier):
        # Double-quote an identifier so unusual column names stay valid SQL.
        return '"' + str(identifier).replace('"', '""') + '"'

    # Get column names and cardinalities
    print("Step 1: Analyzing column cardinalities...")
    schema_info = con.sql(f"""
        SELECT column_name, column_type
        FROM (DESCRIBE SELECT * FROM {from_clause})
    """).df()
    all_columns = schema_info['column_name'].tolist()

    total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
    if total_rows == 0:
        # Every ratio below divides by the row count; an empty table has
        # nothing to optimize anyway.
        print("Table is empty - nothing to analyze!")
        return pd.DataFrame()

    # Calculate NDV for each column
    cardinality_map = {}
    for col in all_columns:
        ndv = con.sql(
            f"SELECT COUNT(DISTINCT {quote(col)}) FROM {from_clause}"
        ).fetchone()[0]
        cardinality_ratio = ndv / total_rows
        cardinality_map[col] = {'ndv': ndv, 'ratio': cardinality_ratio}
        print(f" {col}: {ndv:,} distinct ({cardinality_ratio*100:.4f}%)")

    # Filter to low-cardinality columns only (< 1% cardinality)
    # High cardinality columns won't benefit from reordering
    low_card_cols = [col for col, stats in cardinality_map.items()
                     if stats['ratio'] < 0.01]

    print(f"\nStep 2: Testing sort orderings for {len(low_card_cols)} low-cardinality columns...")
    print(f"Columns to test: {', '.join(low_card_cols)}")

    if len(low_card_cols) < 2:
        print("Not enough columns to test different orderings!")
        return pd.DataFrame()

    # Generate candidate orderings, starting with cardinality-based ones
    sorted_by_card = sorted(low_card_cols, key=lambda c: cardinality_map[c]['ndv'])
    candidates = [
        sorted_by_card,        # Lowest cardinality first
        sorted_by_card[::-1],  # Highest cardinality first
    ]
    # Add the permutations of the three lowest-cardinality columns
    if len(low_card_cols) >= 3:
        candidates.extend(list(perm) for perm in permutations(sorted_by_card[:3]))

    # De-duplicate while enforcing the cap on *all* candidates, including
    # the two seed orderings, so max_combinations is never exceeded.
    cap = max(int(max_combinations), 1)
    test_orderings = []
    for candidate in candidates:
        if len(test_orderings) >= cap:
            break
        if candidate not in test_orderings:
            test_orderings.append(candidate)

    # Test each ordering by calculating actual RLE runs
    print(f"\nStep 3: Testing {len(test_orderings)} different orderings...")
    results = []

    for idx, ordering in enumerate(test_orderings, 1):
        print(f"\n[{idx}/{len(test_orderings)}] Testing: {' → '.join(ordering)}")

        # Sort the data by this ordering and count the runs of every column.
        # NOTE(review): rows that tie on all sort keys have no defined
        # relative order, so run counts of non-key columns may vary
        # between executions - confirm whether a tiebreaker is wanted.
        order_clause = ', '.join(quote(col) for col in ordering)

        column_rle = {}
        for col in all_columns:
            # A new run starts on the first row and whenever the value
            # differs from the previous row. IS DISTINCT FROM treats NULL
            # as an ordinary value, so consecutive NULLs form a single run.
            # The cur_val/row_pos/prev_val aliases cannot clash with table
            # columns because only the aliased value is selected.
            rle_query = f"""
                WITH sorted_data AS (
                    SELECT
                        {quote(col)} AS cur_val,
                        ROW_NUMBER() OVER (ORDER BY {order_clause}) AS row_pos
                    FROM {from_clause}
                ),
                with_prev AS (
                    SELECT
                        cur_val,
                        row_pos,
                        LAG(cur_val) OVER (ORDER BY row_pos) AS prev_val
                    FROM sorted_data
                )
                SELECT COUNT(*) AS runs
                FROM with_prev
                WHERE row_pos = 1 OR cur_val IS DISTINCT FROM prev_val
            """

            runs = con.sql(rle_query).fetchone()[0]
            column_rle[col] = runs
            print(f" {col}: {runs:,} runs")

        total_runs = sum(column_rle.values())
        print(f" TOTAL: {total_runs:,} runs")

        results.append({
            'sort_order': ' → '.join(ordering),
            'total_runs': total_runs,
            # Higher = better compression. total_runs >= 1 here because the
            # table is non-empty and has at least two columns.
            'compression_score': total_rows / total_runs,
            **column_rle
        })

    # Create results DataFrame, best (fewest runs) first
    df = pd.DataFrame(results)
    df = df.sort_values('total_runs').reset_index(drop=True)

    print("\n" + "=" * 80)
    print("RESULTS: Best to Worst Compression")
    print("=" * 80)

    for rank, record in enumerate(df.to_dict('records'), start=1):
        print(f"\n{rank}. {record['sort_order']}")
        print(f" Total runs: {record['total_runs']:,}")
        print(f" Compression score: {record['compression_score']:.2f}x")
        if rank == 1:
            print(" ⭐ BEST COMPRESSION")

    print("\n" + "=" * 80)
    print("CONCLUSION")
    print("=" * 80)
    best = df.iloc[0]
    print(f"\nOptimal sort order: {best['sort_order']}")
    print(f"This ordering achieves the fewest total RLE runs ({best['total_runs']:,})")
    print(f"\nThis is how V-Order actually works:")
    print("✓ No query pattern assumptions")
    print("✓ No semantic understanding")
    print("✓ Pure compression effectiveness testing")
    print("✓ Mechanical optimization based on data patterns")

    return df
344
+
345
+
346
+ # Example usage:
347
+ #
348
+ # import duckrun
349
+ #
350
+ # con = duckrun.connect('workspace/lakehouse.lakehouse')
351
+ #
352
+ # # Get RLE statistics:
353
+ # stats_df = con.get_rle_stats('my_table', top_n_values=10)
354
+ # print(stats_df)
355
+ #
356
+ # # Detailed frequency distribution for a specific column:
357
+ # freq_df = con.get_value_frequency('my_table', 'status_column', limit=20)
358
+ # print(freq_df)
359
+ #
360
+ # # Find optimal sort order (V-Order simulation):
361
+ # optimal_df = con.find_optimal_sort_order('my_table', max_combinations=10)
362
+ # print(optimal_df)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.21.dev2
3
+ Version: 0.2.22.dev0
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -24,5 +24,6 @@ tests/test_ducklake_export.py
24
24
  tests/test_filename.py
25
25
  tests/test_register.py
26
26
  tests/test_rle.py
27
+ tests/test_rle_analysis.py
27
28
  tests/test_writer_dictionary.py
28
29
  tests/test_writer_integration.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.21.dev2"
7
+ version = "0.2.22.dev0"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,149 @@
1
+ """
2
+ Test: RLE Analysis with Real Parquet Data
3
+
4
+ This test demonstrates the refactored RLE module using real parquet data.
5
+ It analyzes table statistics, NDV, and value frequency patterns.
6
+ """
7
+
8
+ import duckdb
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ # Add parent directory to path to import duckrun
13
+ sys.path.insert(0, str(Path(__file__).parent.parent))
14
+
15
+ from duckrun.rle import get_table_stats, get_value_frequency_details
16
+
17
+
18
def test_rle_with_real_parquet():
    """
    Run the RLE analysis helpers against a real parquet file.

    The parquet file is exposed to DuckDB as a view, because
    ``get_table_stats`` and ``get_value_frequency_details`` take a table
    name plus an object that carries the DuckDB connection in its ``con``
    attribute; neither accepts an ``is_parquet`` argument.

    Returns:
        True when the analysis ran to completion, False when the parquet
        file is missing or any step raised (the traceback is printed).
    """
    from types import SimpleNamespace

    # Path to the test parquet file
    parquet_path = Path(__file__).parent / "part-00000-19052469-6a9d-4faa-86ac-60efce3e4443-c000.snappy.parquet"

    if not parquet_path.exists():
        print(f"❌ Error: Parquet file not found at {parquet_path}")
        return False

    print("=" * 80)
    print("RLE ANALYSIS TEST: Real Parquet Data")
    print("=" * 80)
    print(f"File: {parquet_path.name}")
    print(f"Size: {parquet_path.stat().st_size:,} bytes")

    # Escape single quotes so the path is a valid SQL string literal.
    parquet_literal = str(parquet_path).replace("'", "''")
    view_name = "rle_analysis_source"

    # Connect to DuckDB
    con = duckdb.connect(':memory:')
    # The rle helpers read the DuckDB connection from ``<obj>.con``.
    duckrun_like = SimpleNamespace(con=con)

    try:
        # First, let's see the schema
        print("\n" + "=" * 80)
        print("SCHEMA INSPECTION")
        print("=" * 80)

        schema_df = con.sql(f"""
            SELECT * FROM parquet_schema('{parquet_literal}')
        """).df()

        print(f"\nColumns found: {len(schema_df)}")
        print(schema_df.to_string(index=False))

        # Expose the file under a plain table name for the helpers.
        con.execute(f"""
            CREATE OR REPLACE VIEW {view_name} AS
            SELECT * FROM read_parquet('{parquet_literal}')
        """)

        # Get row count
        row_count = con.sql(f"SELECT COUNT(*) FROM {view_name}").fetchone()[0]

        print(f"\nTotal rows: {row_count:,}")

        # Run comprehensive RLE analysis
        print("\n" + "=" * 80)
        print("COMPREHENSIVE RLE ANALYSIS")
        print("=" * 80)

        stats_df = get_table_stats(duckrun_like, view_name, top_n_values=10)

        # Display results
        print("\n" + "=" * 80)
        print("RESULTS: Columns Ranked by RLE Potential")
        print("=" * 80)

        print("\n" + stats_df[['column_name', 'data_type', 'ndv', 'cardinality_ratio',
                               'top_value_pct', 'top_n_coverage', 'repetition_score']].to_string(index=False))

        # Detailed analysis of top 3 columns
        print("\n" + "=" * 80)
        print("DETAILED VALUE FREQUENCY ANALYSIS")
        print("=" * 80)

        for idx in range(min(3, len(stats_df))):
            col_name = stats_df.iloc[idx]['column_name']
            score = stats_df.iloc[idx]['repetition_score']

            print(f"\n[{idx+1}] Column: {col_name} (repetition_score: {score})")
            print("-" * 80)

            freq_df = get_value_frequency_details(duckrun_like, view_name, col_name,
                                                  limit=15)
            print(freq_df.to_string(index=False))

            if not freq_df.empty:
                print(f"\n✓ Top value appears {freq_df.iloc[0]['percentage']:.2f}% of the time")
                print(f"✓ Top 15 values cover {freq_df['cumulative_pct'].iloc[-1]:.2f}% of all data")

        # Summary and recommendations
        print("\n" + "=" * 80)
        print("SUMMARY & RECOMMENDATIONS")
        print("=" * 80)

        # Categorize columns
        excellent = stats_df[stats_df['repetition_score'] > 100]
        good = stats_df[(stats_df['repetition_score'] >= 10) & (stats_df['repetition_score'] <= 100)]
        poor = stats_df[stats_df['repetition_score'] < 10]

        print(f"\n📊 RLE Compression Potential:")
        print(f" Excellent (score > 100): {len(excellent)} columns")
        if len(excellent) > 0:
            print(f" {', '.join(excellent['column_name'].tolist())}")

        print(f" Good (score 10-100): {len(good)} columns")
        if len(good) > 0:
            print(f" {', '.join(good['column_name'].tolist())}")

        print(f" Poor (score < 10): {len(poor)} columns")
        if len(poor) > 0:
            print(f" {', '.join(poor['column_name'].tolist())}")

        print(f"\n💡 Sorting Recommendation:")
        top_3 = stats_df.head(3)['column_name'].tolist()
        print(f" For optimal RLE compression, consider sorting by:")
        for i, col in enumerate(top_3, 1):
            print(f" {i}. {col}")

        print(f"\n✅ Test completed successfully!")

        return True

    except Exception as e:
        # Top-level boundary of the script: report the failure in full and
        # signal it through the return value.
        print(f"\n❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        con.close()
133
+
134
+
135
if __name__ == "__main__":
    # Script entry point: run the analysis once, print a framed verdict and
    # mirror the outcome in the process exit code (0 = passed, 1 = failed).
    rule = "=" * 80

    print("\n" + rule)
    print("STARTING RLE ANALYSIS TEST")
    print(rule)

    success = test_rle_with_real_parquet()

    print("\n" + rule)
    print("✅ TEST PASSED" if success else "❌ TEST FAILED")
    print(rule)

    sys.exit(0 if success else 1)