duckrun 0.2.21.dev2__tar.gz → 0.2.22.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/PKG-INFO +1 -1
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/core.py +92 -0
- duckrun-0.2.22.dev0/duckrun/rle.py +362 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/SOURCES.txt +1 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/pyproject.toml +1 -1
- duckrun-0.2.22.dev0/tests/test_rle_analysis.py +149 -0
- duckrun-0.2.21.dev2/duckrun/rle.py +0 -940
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/LICENSE +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/README.md +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/__init__.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/auth.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/ducklake_metadata.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/files.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/notebook.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/runner.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/stats.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun/writer.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/setup.cfg +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_checkpoint_format.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_deploy_fresh.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_ducklake_export.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_filename.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_register.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_rle.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_writer_dictionary.py +0 -0
- {duckrun-0.2.21.dev2 → duckrun-0.2.22.dev0}/tests/test_writer_integration.py +0 -0
|
@@ -1050,6 +1050,98 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1050
1050
|
"""
|
|
1051
1051
|
self.con.register(name, df)
|
|
1052
1052
|
|
|
1053
|
+
def get_rle_stats(self, table_name: str, top_n_values: int = 10):
    """
    Profile each column of a table for run-length-encoding (RLE) suitability.

    Convenience wrapper: the work is delegated to
    ``duckrun.rle.get_table_stats``, which receives this connection object.

    Args:
        table_name: Name of the table to analyze.
        top_n_values: Number of most frequent values considered per column
            (default: 10).

    Returns:
        The DataFrame produced by ``get_table_stats``: one row per column
        with column_name, data_type, total_rows, null_count, null_pct, ndv,
        cardinality_ratio, top_value, top_value_count, top_value_pct,
        top_n_coverage and repetition_score (higher = better RLE candidate).

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Analyze a table
        stats = con.get_rle_stats('sales')
        print(stats)

        # Consider the 20 most frequent values per column
        stats = con.get_rle_stats('sales', top_n_values=20)
    """
    # Imported lazily so the analysis module is only loaded when it is used.
    from . import rle as _rle

    return _rle.get_table_stats(self, table_name, top_n_values)
|
|
1087
|
+
|
|
1088
|
+
def get_value_frequency(self, table_name: str, column_name: str, limit: int = 20):
    """
    Return the most frequent values of one column with their frequencies.

    Convenience wrapper around ``duckrun.rle.get_value_frequency_details``,
    which receives this connection object.

    Args:
        table_name: Name of the table.
        column_name: Name of the column to analyze.
        limit: Maximum number of distinct values to return (default: 20).

    Returns:
        DataFrame with one row per value:
        - value: The distinct value
        - count: Number of occurrences
        - percentage: Percentage of total rows
        - cumulative_pct: Cumulative percentage

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Top 20 values of a column
        freq = con.get_value_frequency('sales', 'status')
        print(freq)
    """
    # Imported lazily so the analysis module is only loaded when it is used.
    from . import rle as _rle

    return _rle.get_value_frequency_details(self, table_name, column_name, limit)
|
|
1113
|
+
|
|
1114
|
+
def find_optimal_sort_order(self, table_name: str, max_combinations: int = 10):
    """
    Search for the column sort order that yields the fewest RLE runs.

    Convenience wrapper around ``duckrun.rle.find_optimal_sort_order``, which
    tries several column orderings and counts the resulting RLE runs
    (a V-Order/VertiPaq-like, purely mechanical test).

    Args:
        table_name: Name of the table to analyze.
        max_combinations: Maximum sort orderings to test (default: 10).

    Returns:
        DataFrame with the tested orderings ranked by compression:
        - sort_order: Column ordering (e.g., "date → DUID → time")
        - total_runs: Total RLE runs (fewer = better compression)
        - compression_score: Compression effectiveness (higher = better)
        - one column per table column holding its individual run count

    Examples:
        con = duckrun.connect("workspace/lakehouse.lakehouse")

        # Find optimal sort order
        optimal = con.find_optimal_sort_order('energy_data')
        print(optimal)

        # Test more combinations
        optimal = con.find_optimal_sort_order('energy_data', max_combinations=20)
    """
    # Imported lazily so the analysis module is only loaded when it is used.
    from . import rle as _rle

    return _rle.find_optimal_sort_order(self, table_name, max_combinations)
|
|
1144
|
+
|
|
1053
1145
|
def get_stats(self, source: str = None, detailed = False):
|
|
1054
1146
|
"""
|
|
1055
1147
|
Get comprehensive statistics for Delta Lake tables.
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
from typing import List, Dict, Tuple, Optional
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_table_stats(duckrun_con, table_name: str,
                    top_n_values: int = 10) -> pd.DataFrame:
    """
    Get comprehensive table statistics including NDV and value frequency analysis.

    The theory: a value that appears very frequently may give good RLE
    compression even when the column has a higher NDV. This function helps
    identify such patterns by scoring every column.

    Args:
        duckrun_con: Object exposing the underlying DuckDB connection as
            ``.con`` (e.g. the result of ``duckrun.connect()``).
        table_name: Name of the table to analyze. It is interpolated verbatim
            after ``FROM``, so it must be a trusted identifier/expression.
        top_n_values: Number of top frequent values considered per column
            (default: 10).

    Returns:
        DataFrame sorted by ``repetition_score`` (descending) with columns:
        - column_name: Name of the column
        - data_type: Data type of the column
        - total_rows: Total number of rows
        - null_count: Number of NULL values
        - null_pct: Percentage of NULL values
        - ndv: Number of distinct values (exact)
        - cardinality_ratio: NDV / total_rows (lower = better for RLE)
        - top_value: Most frequent value
        - top_value_count: Count of most frequent value
        - top_value_pct: Percentage of most frequent value
        - top_n_coverage: Percentage covered by top N values
        - repetition_score: Custom score indicating RLE potential (higher = better)
        An empty DataFrame is returned when the table has no columns.
    """
    con = duckrun_con.con  # Get underlying DuckDB connection
    from_clause = table_name

    # Get column names and types
    schema_info = con.sql(f"""
        SELECT column_name, column_type
        FROM (DESCRIBE SELECT * FROM {from_clause})
    """).df()

    if schema_info.empty:
        return pd.DataFrame()

    # Get total row count once
    total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]
    n_columns = len(schema_info)
    print(f"Analyzing {n_columns} columns across {total_rows:,} rows...")

    # Coerce once so a non-integer argument cannot end up inside the SQL text.
    top_n = int(top_n_values)
    results = []

    columns = zip(schema_info['column_name'], schema_info['column_type'])
    for position, (col_name, col_type) in enumerate(columns, 1):
        print(f" [{position}/{n_columns}] Analyzing column: {col_name}")

        # Column names come from DESCRIBE and may contain spaces, punctuation
        # or reserved words; double-quote them (doubling embedded quotes).
        quoted = '"' + str(col_name).replace('"', '""') + '"'

        # Get basic stats in one query
        total, non_null, ndv = con.sql(f"""
            SELECT
                COUNT(*) as total,
                COUNT({quoted}) as non_null,
                COUNT(DISTINCT {quoted}) as ndv
            FROM {from_clause}
        """).fetchone()

        null_count = total - non_null
        null_pct = (null_count / total * 100) if total > 0 else 0
        cardinality_ratio = (ndv / total) if total > 0 else 0

        top_value = None
        top_value_count = 0
        top_value_pct = 0
        top_n_coverage = 0

        # An all-NULL (or empty) column has no frequent values: skip the query.
        if non_null > 0:
            top_values = con.sql(f"""
                SELECT
                    {quoted} as value,
                    COUNT(*) as count,
                    COUNT(*) * 100.0 / {total} as percentage
                FROM {from_clause}
                WHERE {quoted} IS NOT NULL
                GROUP BY {quoted}
                ORDER BY count DESC
                LIMIT {top_n}
            """).df()

            if not top_values.empty:
                # Read each cell through its own column. Taking the whole row
                # (``iloc[0]``) upcasts numeric rows to float64, which would
                # turn integer values and counts into floats.
                top_value = top_values['value'].iloc[0]
                top_value_count = int(top_values['count'].iloc[0])
                top_value_pct = float(top_values['percentage'].iloc[0])
                top_n_coverage = float(top_values['percentage'].sum())

        # Repetition score: higher means better for RLE. It rewards
        #   1. coverage of the single most frequent value (weighted twice),
        #   2. coverage of the top N values,
        # and divides by the cardinality ratio (+0.01 to avoid division by 0).
        repetition_score = (top_value_pct * 2 + top_n_coverage) / 3 / (cardinality_ratio + 0.01)

        results.append({
            'column_name': col_name,
            'data_type': col_type,
            'total_rows': total_rows,
            'null_count': null_count,
            'null_pct': round(null_pct, 2),
            'ndv': ndv,
            'cardinality_ratio': round(cardinality_ratio, 4),
            'top_value': top_value,
            'top_value_count': top_value_count,
            'top_value_pct': round(top_value_pct, 2),
            'top_n_coverage': round(top_n_coverage, 2),
            'repetition_score': round(repetition_score, 2),
        })

    df = pd.DataFrame(results)

    # Sort by repetition score (best RLE candidates first)
    df = df.sort_values('repetition_score', ascending=False).reset_index(drop=True)

    print(f"\n✓ Analysis complete!")
    print(f"\nTop columns by repetition score (best RLE candidates):")
    for idx, row in df.head(5).iterrows():
        print(f" {idx+1}. {row['column_name']}: score={row['repetition_score']}, "
              f"top_value_pct={row['top_value_pct']}%, ndv={row['ndv']:,}")

    return df
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def get_value_frequency_details(duckrun_con, table_name: str, column_name: str,
                                limit: int = 20) -> pd.DataFrame:
    """
    Get detailed value frequency distribution for a specific column.

    Shows the most frequent non-NULL values and their counts/percentages.
    Useful for understanding repetition patterns that drive RLE compression.

    Args:
        duckrun_con: Object exposing the underlying DuckDB connection as
            ``.con`` (e.g. the result of ``duckrun.connect()``).
        table_name: Name of the table to analyze. It is interpolated verbatim
            after ``FROM``, so it must be a trusted identifier/expression.
        column_name: Name of the column to analyze. A plain name is
            double-quoted automatically; a name that is already wrapped in
            double quotes is used as given.
        limit: Maximum number of values to return (default: 20).

    Returns:
        DataFrame ordered by descending count with columns:
        - value: The distinct value
        - count: Number of occurrences
        - percentage: Percentage of total rows (rounded to 2 decimals)
        - cumulative_pct: Running sum of the percentages, in the row order
          of the returned frame (rounded to 2 decimals)
    """
    con = duckrun_con.con  # Get underlying DuckDB connection
    from_clause = table_name

    # Quote the identifier so names with spaces, punctuation or reserved
    # words work; leave caller-quoted names untouched for compatibility.
    if len(column_name) >= 2 and column_name.startswith('"') and column_name.endswith('"'):
        quoted = column_name
    else:
        quoted = '"' + column_name.replace('"', '""') + '"'

    # Coerce once so a non-integer argument cannot end up inside the SQL text.
    max_rows = int(limit)

    # Get total row count (denominator for the percentages)
    total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]

    # Get value frequencies
    query = f"""
        SELECT
            {quoted} as value,
            COUNT(*) as count,
            COUNT(*) * 100.0 / {total_rows} as percentage
        FROM {from_clause}
        WHERE {quoted} IS NOT NULL
        GROUP BY {quoted}
        ORDER BY count DESC
        LIMIT {max_rows}
    """

    df = con.sql(query).df()

    # Accumulate in pandas, over the rows exactly as returned. A SQL window
    # ordered only by count may break ties differently from the final
    # ORDER BY, making the running total inconsistent with the row order.
    df['cumulative_pct'] = df['percentage'].cumsum()

    # Round percentages (after accumulating, so rounding errors do not add up)
    if not df.empty:
        df['percentage'] = df['percentage'].round(2)
        df['cumulative_pct'] = df['cumulative_pct'].round(2)

    return df
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def find_optimal_sort_order(duckrun_con, table_name: str,
                            max_combinations: int = 10) -> pd.DataFrame:
    """
    Determine optimal sort order using V-Order-like logic: pure compression testing.

    Procedure:
    1. Calculate the cardinality (NDV) of each column.
    2. Build candidate orderings from the low-cardinality columns
       (NDV / rows < 1%): ascending NDV, descending NDV, then permutations
       of the three lowest-NDV columns.
    3. For each ordering, sort the table and count the RLE runs of EVERY
       column (a run is a maximal stretch of equal consecutive values;
       consecutive NULLs form a single run).
    4. Rank the orderings by total run count (fewest first).

    NO semantic understanding, NO query pattern assumptions.
    Pure mechanical testing of compression effectiveness.

    Args:
        duckrun_con: Object exposing the underlying DuckDB connection as
            ``.con`` (e.g. the result of ``duckrun.connect()``).
        table_name: Name of the table to analyze. It is interpolated verbatim
            after ``FROM``, so it must be a trusted identifier/expression.
        max_combinations: Maximum sort orderings to test (default: 10).
            The two cardinality-based orderings are always tested, so the
            effective minimum is 2.

    Returns:
        DataFrame with tested orderings ranked by compression effectiveness
        (columns: sort_order, total_runs, compression_score, plus one run
        count per table column). An empty DataFrame is returned when the
        table has no rows or fewer than two low-cardinality columns.
    """
    from itertools import permutations

    def _quote(name) -> str:
        # Double-quote an identifier so unusual column names stay valid SQL.
        return '"' + str(name).replace('"', '""') + '"'

    con = duckrun_con.con  # Get underlying DuckDB connection
    from_clause = table_name

    # Get column names and cardinalities
    print("Step 1: Analyzing column cardinalities...")
    schema_info = con.sql(f"""
        SELECT column_name, column_type
        FROM (DESCRIBE SELECT * FROM {from_clause})
    """).df()
    column_names = schema_info['column_name'].tolist()

    total_rows = con.sql(f"SELECT COUNT(*) FROM {from_clause}").fetchone()[0]

    # Without rows every ratio below would divide by zero and no ordering
    # can be measured, so report "nothing to test" like the other early exit.
    if total_rows == 0:
        print("Table is empty - nothing to analyze!")
        return pd.DataFrame()

    # Calculate NDV for each column
    cardinality_map = {}
    for col in column_names:
        ndv = con.sql(f"SELECT COUNT(DISTINCT {_quote(col)}) FROM {from_clause}").fetchone()[0]
        cardinality_ratio = ndv / total_rows
        cardinality_map[col] = {'ndv': ndv, 'ratio': cardinality_ratio}
        print(f" {col}: {ndv:,} distinct ({cardinality_ratio*100:.4f}%)")

    # Filter to low-cardinality columns only (< 1% cardinality)
    # High cardinality columns won't benefit from reordering
    low_card_cols = [col for col, stats in cardinality_map.items()
                     if stats['ratio'] < 0.01]

    print(f"\nStep 2: Testing sort orderings for {len(low_card_cols)} low-cardinality columns...")
    print(f"Columns to test: {', '.join(low_card_cols)}")

    if len(low_card_cols) < 2:
        print("Not enough columns to test different orderings!")
        return pd.DataFrame()

    # Generate candidate orderings, starting with cardinality-based ones
    sorted_by_card = sorted(low_card_cols, key=lambda c: cardinality_map[c]['ndv'])

    test_orderings = [
        sorted_by_card,        # Lowest cardinality first
        sorted_by_card[::-1],  # Highest cardinality first
    ]

    # Add some permutations of top 3 columns. The limit is checked BEFORE
    # appending so the list never grows past max_combinations.
    if len(low_card_cols) >= 3:
        for perm in permutations(sorted_by_card[:3]):
            if len(test_orderings) >= max_combinations:
                break
            candidate = list(perm)
            if candidate not in test_orderings:
                test_orderings.append(candidate)

    # Test each ordering by calculating actual RLE runs
    print(f"\nStep 3: Testing {len(test_orderings)} different orderings...")
    results = []

    for idx, ordering in enumerate(test_orderings, 1):
        print(f"\n[{idx}/{len(test_orderings)}] Testing: {' → '.join(ordering)}")

        # Sort the data by this ordering and count runs for every column
        order_clause = ', '.join(_quote(col) for col in ordering)

        column_rle = {}
        for col in column_names:
            # A new run starts on the first row and whenever the value
            # differs from the previous row. IS DISTINCT FROM treats NULL as
            # a comparable value, so a stretch of NULLs counts as one run
            # instead of one run per NULL row.
            rle_query = f"""
                WITH sorted_data AS (
                    SELECT
                        {_quote(col)} AS __val,
                        ROW_NUMBER() OVER (ORDER BY {order_clause}) AS __rn
                    FROM {from_clause}
                ),
                with_prev AS (
                    SELECT
                        __val,
                        __rn,
                        LAG(__val) OVER (ORDER BY __rn) AS __prev
                    FROM sorted_data
                )
                SELECT COUNT(*) AS runs
                FROM with_prev
                WHERE __rn = 1 OR __val IS DISTINCT FROM __prev
            """

            runs = con.sql(rle_query).fetchone()[0]
            column_rle[col] = runs
            print(f" {col}: {runs:,} runs")

        total_runs = sum(column_rle.values())
        print(f" TOTAL: {total_runs:,} runs")

        results.append({
            'sort_order': ' → '.join(ordering),
            'total_runs': total_runs,
            # Higher = better compression; guarded against a zero run count.
            'compression_score': (total_rows / total_runs) if total_runs else 0.0,
            **column_rle
        })

    # Create results DataFrame, best (fewest runs) first
    df = pd.DataFrame(results)
    df = df.sort_values('total_runs').reset_index(drop=True)

    print("\n" + "=" * 80)
    print("RESULTS: Best to Worst Compression")
    print("=" * 80)

    for idx, row in df.iterrows():
        print(f"\n{idx + 1}. {row['sort_order']}")
        print(f" Total runs: {row['total_runs']:,}")
        print(f" Compression score: {row['compression_score']:.2f}x")
        if idx == 0:
            print(" ⭐ BEST COMPRESSION")

    print("\n" + "=" * 80)
    print("CONCLUSION")
    print("=" * 80)
    best = df.iloc[0]
    print(f"\nOptimal sort order: {best['sort_order']}")
    print(f"This ordering achieves the fewest total RLE runs ({best['total_runs']:,})")
    print(f"\nThis is how V-Order actually works:")
    print("✓ No query pattern assumptions")
    print("✓ No semantic understanding")
    print("✓ Pure compression effectiveness testing")
    print("✓ Mechanical optimization based on data patterns")

    return df
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# Example usage:
|
|
347
|
+
#
|
|
348
|
+
# import duckrun
|
|
349
|
+
#
|
|
350
|
+
# con = duckrun.connect('workspace/lakehouse.lakehouse')
|
|
351
|
+
#
|
|
352
|
+
# # Get RLE statistics:
|
|
353
|
+
# stats_df = con.get_rle_stats('my_table', top_n_values=10)
|
|
354
|
+
# print(stats_df)
|
|
355
|
+
#
|
|
356
|
+
# # Detailed frequency distribution for a specific column:
|
|
357
|
+
# freq_df = con.get_value_frequency('my_table', 'status_column', limit=20)
|
|
358
|
+
# print(freq_df)
|
|
359
|
+
#
|
|
360
|
+
# # Find optimal sort order (V-Order simulation):
|
|
361
|
+
# optimal_df = con.find_optimal_sort_order('my_table', max_combinations=10)
|
|
362
|
+
# print(optimal_df)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.21.dev2"
|
|
7
|
+
version = "0.2.22.dev0"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test: RLE Analysis with Real Parquet Data
|
|
3
|
+
|
|
4
|
+
This test demonstrates the refactored RLE module using real parquet data.
|
|
5
|
+
It analyzes table statistics, NDV, and value frequency patterns.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import duckdb
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
# Add parent directory to path to import duckrun
|
|
13
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
14
|
+
|
|
15
|
+
from duckrun.rle import get_table_stats, get_value_frequency_details
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_rle_with_real_parquet():
    """Test RLE analysis with real parquet file.

    Runs ``get_table_stats`` and ``get_value_frequency_details`` against a
    parquet file stored next to this test and prints a report.

    Returns:
        True when the analysis ran to completion, False when the parquet
        file is missing or any step raised (the traceback is printed).
        The ``__main__`` block uses this value as the exit status.
    """
    from types import SimpleNamespace

    # Path to the test parquet file
    parquet_path = Path(__file__).parent / "part-00000-19052469-6a9d-4faa-86ac-60efce3e4443-c000.snappy.parquet"

    if not parquet_path.exists():
        print(f"❌ Error: Parquet file not found at {parquet_path}")
        return False

    banner = "=" * 80

    print(banner)
    print("RLE ANALYSIS TEST: Real Parquet Data")
    print(banner)
    print(f"File: {parquet_path.name}")
    print(f"Size: {parquet_path.stat().st_size:,} bytes")

    # Connect to DuckDB
    con = duckdb.connect(':memory:')

    try:
        # Escape single quotes so the path is a valid SQL string literal.
        sql_path = str(parquet_path).replace("'", "''")

        # The rle functions interpolate ``table_name`` after FROM and read the
        # DuckDB connection from ``.con``; they no longer take ``is_parquet``.
        # So pass a table function as the source and wrap the raw connection.
        source = f"read_parquet('{sql_path}')"
        duckrun_like = SimpleNamespace(con=con)

        # First, let's see the schema
        print("\n" + banner)
        print("SCHEMA INSPECTION")
        print(banner)

        schema_df = con.sql(f"""
            SELECT * FROM parquet_schema('{sql_path}')
        """).df()

        print(f"\nColumns found: {len(schema_df)}")
        print(schema_df.to_string(index=False))

        # Get row count
        row_count = con.sql(f"""
            SELECT COUNT(*) FROM {source}
        """).fetchone()[0]

        print(f"\nTotal rows: {row_count:,}")

        # Run comprehensive RLE analysis
        print("\n" + banner)
        print("COMPREHENSIVE RLE ANALYSIS")
        print(banner)

        stats_df = get_table_stats(duckrun_like, source, top_n_values=10)

        # Display results
        print("\n" + banner)
        print("RESULTS: Columns Ranked by RLE Potential")
        print(banner)

        print("\n" + stats_df[['column_name', 'data_type', 'ndv', 'cardinality_ratio',
                               'top_value_pct', 'top_n_coverage', 'repetition_score']].to_string(index=False))

        # Detailed analysis of top 3 columns
        print("\n" + banner)
        print("DETAILED VALUE FREQUENCY ANALYSIS")
        print(banner)

        for idx in range(min(3, len(stats_df))):
            col_name = stats_df.iloc[idx]['column_name']
            score = stats_df.iloc[idx]['repetition_score']

            print(f"\n[{idx+1}] Column: {col_name} (repetition_score: {score})")
            print("-" * 80)

            freq_df = get_value_frequency_details(duckrun_like, source, col_name,
                                                  limit=15)
            print(freq_df.to_string(index=False))

            if not freq_df.empty:
                print(f"\n✓ Top value appears {freq_df.iloc[0]['percentage']:.2f}% of the time")
                print(f"✓ Top 15 values cover {freq_df['cumulative_pct'].iloc[-1]:.2f}% of all data")

        # Summary and recommendations
        print("\n" + banner)
        print("SUMMARY & RECOMMENDATIONS")
        print(banner)

        # Categorize columns by repetition score
        scores = stats_df['repetition_score']
        categories = (
            (" Excellent (score > 100)", stats_df[scores > 100]),
            (" Good (score 10-100)", stats_df[(scores >= 10) & (scores <= 100)]),
            (" Poor (score < 10)", stats_df[scores < 10]),
        )

        print(f"\n📊 RLE Compression Potential:")
        for label, subset in categories:
            print(f"{label}: {len(subset)} columns")
            if len(subset) > 0:
                print(f" {', '.join(subset['column_name'].tolist())}")

        print(f"\n💡 Sorting Recommendation:")
        top_3 = stats_df.head(3)['column_name'].tolist()
        print(f" For optimal RLE compression, consider sorting by:")
        for i, col in enumerate(top_3, 1):
            print(f" {i}. {col}")

        print(f"\n✅ Test completed successfully!")

        return True

    except Exception as e:
        # Deliberate catch-all: this doubles as a script whose exit status
        # comes from the return value, so report the failure and return False.
        print(f"\n❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        con.close()
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
    # Script entry point: run the analysis, print a verdict framed by banner
    # lines, and exit with status 0 on success or 1 on failure.
    separator = "=" * 80

    print("\n" + separator)
    print("STARTING RLE ANALYSIS TEST")
    print(separator)

    success = test_rle_with_real_parquet()

    verdict = "✅ TEST PASSED" if success else "❌ TEST FAILED"
    print("\n" + separator)
    print(verdict)
    print(separator)

    sys.exit(0 if success else 1)
|