duckrun 0.2.19.dev1__tar.gz → 0.2.19.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/PKG-INFO +1 -1
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/__init__.py +2 -1
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/core.py +113 -0
- duckrun-0.2.19.dev2/duckrun/rle.py +521 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/SOURCES.txt +3 -1
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/pyproject.toml +1 -1
- duckrun-0.2.19.dev2/tests/test_rle.py +10 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/LICENSE +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/README.md +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/auth.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/files.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/notebook.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/runner.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/semantic_model.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/stats.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun/writer.py +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.19.dev1 → duckrun-0.2.19.dev2}/setup.cfg +0 -0
|
@@ -2,10 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
from duckrun.core import Duckrun
|
|
4
4
|
from duckrun.notebook import import_notebook_from_web, import_notebook
|
|
5
|
+
from duckrun import rle
|
|
5
6
|
|
|
6
7
|
__version__ = "0.2.18"
|
|
7
8
|
|
|
8
9
|
# Expose unified connect method at module level
|
|
9
10
|
connect = Duckrun.connect
|
|
10
11
|
|
|
11
|
-
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook"]
|
|
12
|
+
__all__ = ["Duckrun", "connect", "import_notebook_from_web", "import_notebook", "rle"]
|
|
@@ -1244,8 +1244,121 @@ class Duckrun(WorkspaceOperationsMixin):
|
|
|
1244
1244
|
refresh=refresh
|
|
1245
1245
|
)
|
|
1246
1246
|
|
|
1247
|
+
def rle(self, table_name: str = None, mode: str = "summary", sort_columns: List[str] = None,
        limit: int = None, max_combinations: int = 20, use_stratified_sampling: bool = True,
        num_segments: int = 5, segment_size: int = 1000):
    """
    Analyze RLE (Run-Length Encoding) compression potential for Delta Lake tables.

    Only the first data file listed by the Delta table is analyzed.

    Args:
        table_name: Name of the table to analyze. Can be:
            - 'table_name' (uses current schema)
            - 'schema.table_name' (specific schema)
            - None (all-tables summary; not implemented yet, returns None)
        mode: Analysis mode:
            - "summary": Quick NFV (Number of Distinct Values) analysis (default)
            - "smart": Heuristic-based test of candidate column orderings
            - "full": Currently handled exactly like "smart"
        sort_columns: Accepted for API compatibility; not used by any mode of this method yet
        limit: Optional limit on number of rows to analyze (when set, smart/full
            fall back from stratified sampling to the first ``limit`` rows)
        max_combinations: Maximum number of orderings to test (for smart mode)
        use_stratified_sampling: If True, use stratified sampling across entire file (recommended)
        num_segments: Number of segments for stratified sampling
        segment_size: Size of each segment for sampling

    Returns:
        DataFrame with RLE analysis results, or None when the request cannot be
        served (unknown mode, missing table name, empty or unreadable table).
        In "summary" mode the DataFrame has columns ``column`` and ``nfv_score``,
        sorted by ascending score.

    Examples:
        # Quick summary of a specific table
        con = duckrun.connect("workspace/lakehouse.lakehouse/schema")
        con.rle("mytable")  # defaults to summary mode
        con.rle("mytable", "summary")

        # Smart analysis (finds optimal column ordering)
        con.rle("mytable", "smart")

        # Analyze table from different schema
        con.rle("otherschema.mytable", "smart")

        # Full analysis with custom parameters
        con.rle("mytable", "full", use_stratified_sampling=True, num_segments=10)
    """
    from .rle import (
        calculate_nfv_score,
        test_column_orderings_smart
    )
    from deltalake import DeltaTable

    # Reject an unknown mode up front, before any remote table access is attempted
    if mode not in ("summary", "smart", "full"):
        print(f"❌ Unknown mode: {mode}. Use 'summary', 'smart', or 'full'")
        return None

    # Parse table name and construct path
    if table_name is None:
        if mode != "summary":
            print("⚠️ Table name is required for 'smart' and 'full' modes")
            return None
        # TODO: Implement all-tables summary
        print("⚠️ All-tables summary not yet implemented. Please specify a table name.")
        return None

    # Parse schema.table or just table
    if '.' in table_name:
        schema_name, tbl = table_name.split('.', 1)
    else:
        schema_name = self.schema
        tbl = table_name

    # Construct the full table path from the connection's base URL
    table_path = f"{self.table_base_url}{schema_name}/{tbl}"

    # Get the actual parquet files from Delta table
    print(f"📊 Analyzing table: {schema_name}.{tbl}")

    try:
        dt = DeltaTable(table_path)
        delta_files = dt.files()

        if not delta_files:
            print("⚠️ Table is empty (no files)")
            return None

        # Construct full paths for parquet files
        parquet_paths = [table_path + "/" + f for f in delta_files]

    except Exception as e:
        # Best effort: report the problem and let the caller carry on
        print(f"❌ Error accessing Delta table: {e}")
        return None

    # For now, analyze the first file (can be extended to analyze all files)
    parquet_path = parquet_paths[0]

    if mode == "summary":
        # Quick NFV analysis
        nfv_scores = calculate_nfv_score(self.con, parquet_path, limit)
        import pandas as pd
        return pd.DataFrame([
            {"column": col, "nfv_score": score}
            for col, score in sorted(nfv_scores.items(), key=lambda x: x[1])
        ])

    # mode is "smart" or "full": both run the heuristic ordering test
    return test_column_orderings_smart(
        self.con,
        parquet_path,
        limit=limit,
        max_combinations=max_combinations,
        use_stratified_sampling=use_stratified_sampling,
        num_segments=num_segments,
        segment_size=segment_size
    )
|
|
1358
|
+
|
|
1247
1359
|
def close(self):
|
|
1248
1360
|
"""Close DuckDB connection"""
|
|
1361
|
+
|
|
1249
1362
|
if self.con:
|
|
1250
1363
|
self.con.close()
|
|
1251
1364
|
print("Connection closed")
|
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
from typing import List, Dict, Tuple, Optional
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
def analyze_parquet_row_groups(con, parquet_path: str) -> Optional[pd.DataFrame]:
    """
    Read the Parquet metadata of a file through DuckDB's ``parquet_metadata``.

    Only metadata is queried, so this is much faster than reading all data.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file

    Returns:
        DataFrame with the rows returned by ``parquet_metadata``, or None if
        the query fails (the error is printed, not raised)
    """
    # Double embedded single quotes so the path stays a single SQL string literal
    escaped_path = parquet_path.replace("'", "''")
    try:
        # Get row group metadata
        return con.sql(f"""
            SELECT * FROM parquet_metadata('{escaped_path}')
        """).df()
    except Exception as e:
        # Metadata is optional for callers, so report and carry on
        print(f"Could not read parquet metadata: {e}")
        return None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def estimate_rle_from_row_groups(con, parquet_path: str) -> Optional[pd.DataFrame]:
    """
    Fetch per-row-group, per-column chunk sizes from the Parquet metadata.

    The columns selected here (row_group_id, column_id, file_offset,
    num_values, total_compressed_size, total_uncompressed_size) are exposed
    by DuckDB's ``parquet_metadata`` table function; ``parquet_file_metadata``
    only carries file-level fields, so querying it for them always failed.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file

    Returns:
        DataFrame with one row per (row group, column) chunk, or None if the
        metadata query fails (the error is printed, not raised)
    """
    # Double embedded single quotes so the path stays a single SQL string literal
    escaped_path = parquet_path.replace("'", "''")
    try:
        # Get column chunk stats for every row group
        stats_query = f"""
            SELECT
                row_group_id,
                column_id,
                file_offset,
                num_values,
                total_compressed_size,
                total_uncompressed_size
            FROM parquet_metadata('{escaped_path}')
        """

        stats = con.sql(stats_query).df()
        print("Row group metadata available!")
        return stats

    except Exception as e:
        # Callers treat None as "use another estimation strategy"
        print(f"Parquet metadata not available in this DuckDB version: {e}")
        print("Falling back to stratified sampling...")
        return None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def stratified_rle_sampling(con, parquet_path: str, sort_columns: List[str] = None,
                            num_segments: int = 5, segment_size: int = 1000) -> Tuple[Dict[str, int], Dict[str, dict]]:
    """
    Sample RLE density across multiple segments of the file.

    The whole file is ordered (by ``sort_columns`` or by ``file_row_number``)
    and numbered; ``num_segments`` windows of up to ``segment_size``
    consecutive rows are then inspected. For each column the run density
    (runs per sampled row) of every window is measured, and the average
    density is scaled by the total row count.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        sort_columns: ORDER BY terms applied before sampling, inserted into the
            query as given. If None or empty, uses natural order.
        num_segments: Number of segments to sample across the file
        segment_size: Number of rows per segment

    Returns:
        Tuple ``(estimated_runs, density_stats)``:
            - estimated_runs: {column: estimated RLE runs for the full file}
            - density_stats: {column: {'avg_density', 'min_density',
              'max_density', 'std_density', 'estimated_runs',
              'variance_coefficient'}}
        Columns for which no rows could be sampled report zeros.

    Raises:
        ValueError: If ``num_segments`` or ``segment_size`` is less than 1
    """
    if num_segments < 1:
        raise ValueError("num_segments must be at least 1")
    if segment_size < 1:
        raise ValueError("segment_size must be at least 1")

    def quote_ident(name: str) -> str:
        # Quote a column name so spaces, keywords and embedded quotes are safe in SQL
        return '"' + name.replace('"', '""') + '"'

    # Double embedded single quotes so the path stays a single SQL string literal
    escaped_path = parquet_path.replace("'", "''")

    # Get total row count
    total_rows = con.sql(f"""
        SELECT COUNT(*) FROM read_parquet('{escaped_path}')
    """).fetchone()[0]

    # Get column names
    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT * FROM read_parquet('{escaped_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()

    column_names = [col[0] for col in columns]

    # Build ORDER BY clause
    if sort_columns:
        order_by_clause = "ORDER BY " + ", ".join(sort_columns)
    else:
        order_by_clause = "ORDER BY file_row_number"

    # Calculate segment positions spread across the file
    if num_segments == 1:
        segment_positions = [0]
    else:
        step = total_rows // (num_segments + 1)
        segment_positions = [step * (i + 1) for i in range(num_segments)]

    # Sample each segment and calculate RLE density
    all_densities = {col: [] for col in column_names}

    for start_pos in segment_positions:
        for col in column_names:
            ident = quote_ident(col)
            # The ENTIRE dataset is sorted first, then the segment is taken from it.
            # This is expensive but necessary for accurate results.
            # A row starts a run when it is the first sampled row (no previous
            # position) or its value differs from the previous one; IS DISTINCT
            # FROM treats NULL as an ordinary value, so a NULL streak is one run.
            run_count, sampled_rows = con.sql(f"""
                WITH sorted_data AS (
                    SELECT
                        *,
                        ROW_NUMBER() OVER ({order_by_clause}) as sorted_row_num
                    FROM read_parquet('{escaped_path}', file_row_number = TRUE)
                ),
                segment_data AS (
                    SELECT
                        {ident},
                        sorted_row_num
                    FROM sorted_data
                    WHERE sorted_row_num >= {start_pos}
                    ORDER BY sorted_row_num
                    LIMIT {segment_size}
                ),
                runs AS (
                    SELECT
                        CASE
                            WHEN LAG(sorted_row_num) OVER (ORDER BY sorted_row_num) IS NULL
                                OR LAG({ident}) OVER (ORDER BY sorted_row_num) IS DISTINCT FROM {ident}
                            THEN 1
                            ELSE 0
                        END AS new_run
                    FROM segment_data
                )
                SELECT COALESCE(SUM(new_run), 0) AS rle_run_count, COUNT(*) AS sampled_rows
                FROM runs
            """).fetchone()

            if not sampled_rows:
                # Segment lies beyond the end of the data: nothing to measure
                continue

            # Density is runs per row actually sampled (a short file may yield
            # fewer than segment_size rows)
            all_densities[col].append(run_count / sampled_rows)

    # Estimate total runs for full file
    estimated_runs = {}
    density_stats = {}

    for col in column_names:
        densities = all_densities[col]
        if densities:
            avg_density = sum(densities) / len(densities)
            min_density = min(densities)
            max_density = max(densities)
            std_density = (sum((d - avg_density)**2 for d in densities) / len(densities))**0.5
        else:
            avg_density = min_density = max_density = std_density = 0.0

        estimated_total = int(avg_density * total_rows)
        estimated_runs[col] = estimated_total

        density_stats[col] = {
            'avg_density': avg_density,
            'min_density': min_density,
            'max_density': max_density,
            'std_density': std_density,
            'estimated_runs': estimated_total,
            'variance_coefficient': std_density / avg_density if avg_density > 0 else 0
        }

    return estimated_runs, density_stats
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def calculate_rle_for_columns(con, parquet_path: str, sort_columns: List[str] = None, limit: int = None) -> Dict[str, int]:
    """
    Calculate RLE runs for all columns in a parquet file, optionally after sorting.

    Rows are numbered in the requested order and runs are counted along that
    numbering, so the result reflects the sorted layout rather than the
    original file order.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        sort_columns: ORDER BY terms to sort by (in order), inserted into the
            query as given. If None or empty, uses natural file order.
        limit: Optional limit on number of rows to analyze (the first ``limit``
            rows of the chosen order)

    Returns:
        Dictionary mapping column names to RLE run counts (0 when no rows were read)
    """
    def quote_ident(name: str) -> str:
        # Quote a column name so spaces, keywords and embedded quotes are safe in SQL
        return '"' + name.replace('"', '""') + '"'

    # Double embedded single quotes so the path stays a single SQL string literal
    escaped_path = parquet_path.replace("'", "''")

    # Get all column names
    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT *
            FROM read_parquet('{escaped_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()

    column_names = [col[0] for col in columns]

    # Build ORDER BY clause
    if sort_columns:
        order_by = "ORDER BY " + ", ".join(sort_columns)
    else:
        order_by = "ORDER BY file_row_number ASC"

    limit_clause = f"LIMIT {limit}" if limit else ""

    # Calculate RLE for each column
    results = {}
    for column_name in column_names:
        ident = quote_ident(column_name)
        # sorted_row_num records the row's position in the requested order; the
        # run detection below must follow it (not file_row_number), otherwise
        # the sort would have no effect on the count.
        # A row starts a run when it has no predecessor or its value differs
        # from the previous one; IS DISTINCT FROM treats NULL as a normal value.
        rle_count = con.sql(f"""
            WITH ordered_data AS (
                SELECT
                    {ident},
                    ROW_NUMBER() OVER ({order_by}) AS sorted_row_num
                FROM read_parquet('{escaped_path}', file_row_number = TRUE)
            ),
            limited_data AS (
                SELECT
                    {ident},
                    sorted_row_num
                FROM ordered_data
                ORDER BY sorted_row_num
                {limit_clause}
            ),
            runs AS (
                SELECT
                    CASE
                        WHEN LAG(sorted_row_num) OVER (ORDER BY sorted_row_num) IS NULL
                            OR LAG({ident}) OVER (ORDER BY sorted_row_num) IS DISTINCT FROM {ident}
                        THEN 1
                        ELSE 0
                    END AS new_run
                FROM limited_data
            )
            SELECT COALESCE(SUM(new_run), 0) AS rle_run_count
            FROM runs
        """).fetchone()[0]

        results[column_name] = rle_count

    return results
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def calculate_nfv_score(con, parquet_path: str, limit: int = None) -> Dict[str, float]:
    """
    Calculate the distinct-value ratio (called NFV in this module) for each column.

    The ratio is ``COUNT(DISTINCT col) / COUNT(*)`` over the rows read.
    Lower NFV = better for RLE compression.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        limit: Optional limit on number of rows to read for the estimate

    Returns:
        Dictionary mapping column names to NFV ratios (0-1, lower is better).
        A column whose ratio cannot be computed (no rows read) scores 1.0.
    """
    def quote_ident(name: str) -> str:
        # Quote a column name so spaces, keywords and embedded quotes are safe in SQL
        return '"' + name.replace('"', '""') + '"'

    limit_clause = f"LIMIT {limit}" if limit else ""
    # Double embedded single quotes so the path stays a single SQL string literal
    escaped_path = parquet_path.replace("'", "''")

    columns = con.sql(f"""
        SELECT column_name
        FROM (
            DESCRIBE
            SELECT *
            FROM read_parquet('{escaped_path}', file_row_number = TRUE)
        )
        WHERE column_name != 'file_row_number'
    """).fetchall()

    column_names = [col[0] for col in columns]
    nfv_scores = {}

    for col in column_names:
        ident = quote_ident(col)
        # NULLIF turns a zero row count into NULL so an empty input yields NULL
        # instead of a division by zero
        result = con.sql(f"""
            WITH data AS (
                SELECT {ident}
                FROM read_parquet('{escaped_path}', file_row_number = TRUE)
                {limit_clause}
            )
            SELECT
                COUNT(DISTINCT {ident})::FLOAT / NULLIF(COUNT(*), 0)::FLOAT as nfv_ratio
            FROM data
        """).fetchone()

        # fetchone() returns a one-element row, so the NULL check must look inside it
        ratio = result[0] if result else None
        nfv_scores[col] = 1.0 if ratio is None else ratio

    return nfv_scores
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def filter_promising_combinations(columns: List[str], nfv_scores: Dict[str, float],
                                  max_combinations: int = 20) -> List[List[str]]:
    """
    Apply heuristics to filter down to the most promising column orderings.

    Candidates are generated in this order, then de-duplicated:
        1. Natural order baseline (empty ordering)
        2. All columns from lowest to highest NFV
        3. The single lowest-NFV column
        4. Columns whose name looks temporal, alone and followed by the two
           lowest-NFV remaining columns
        5. Ordered pairs drawn from the three lowest-NFV columns
        6. Columns whose name looks like an id/key/code
        7. Very low NFV (< 0.1) columns, alone and followed by temporal columns

    Args:
        columns: List of all column names
        nfv_scores: NFV ratio for each column (lower = fewer distinct values)
        max_combinations: Maximum number of combinations to return

    Returns:
        List of promising column orderings to test. With no columns, only the
        natural-order baseline is returned.

    Raises:
        KeyError: If a column has no entry in ``nfv_scores``
    """
    # Rule 1: Natural order baseline
    promising = [[]]

    if columns:
        # Sort columns by NFV (lower first = better for RLE)
        sorted_by_nfv = sorted(columns, key=lambda c: nfv_scores[c])

        # Rule 2: NFV-based ordering (lowest to highest)
        promising.append(list(sorted_by_nfv))

        # Rule 3: Single best column (lowest NFV)
        promising.append([sorted_by_nfv[0]])

        # Rule 4: Time-based patterns (common column names)
        time_keywords = ('date', 'time', 'timestamp', 'year', 'month', 'day')
        time_cols = [c for c in columns if any(t in c.lower() for t in time_keywords)]
        if time_cols:
            promising.append(list(time_cols))
            # Time columns + low NFV columns
            non_time = [c for c in sorted_by_nfv if c not in time_cols]
            if non_time:
                promising.append(time_cols + non_time[:2])

        # Rule 5: Top 2-3 lowest NFV columns in different orders
        top_low_nfv = sorted_by_nfv[:3]
        for perm in itertools.permutations(top_low_nfv, min(2, len(top_low_nfv))):
            promising.append(list(perm))

        # Rule 6: ID-like columns first (common patterns)
        id_cols = [c for c in columns if any(t in c.lower() for t in ('id', 'key', 'code'))]
        if id_cols:
            promising.append(id_cols)

        # Rule 7: Categorical/enum-like columns (very low NFV < 0.1)
        categorical = [c for c in sorted_by_nfv if nfv_scores[c] < 0.1]
        if categorical:
            promising.append(list(categorical))
            # Categorical + time; a time column that is already categorical is
            # not repeated, so no ordering lists the same column twice
            if time_cols:
                promising.append(categorical + [c for c in time_cols if c not in categorical])

    # Remove duplicates while preserving order
    seen = set()
    unique_promising = []
    for combo in promising:
        key = tuple(combo)
        if key not in seen:
            seen.add(key)
            unique_promising.append(combo)

    # Limit to max_combinations
    return unique_promising[:max_combinations]
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def test_column_orderings_smart(con, parquet_path: str, limit: int = None,
                                max_combinations: int = 20, use_stratified_sampling: bool = True,
                                num_segments: int = 5, segment_size: int = 1000) -> pd.DataFrame:
    """
    Test column orderings using heuristics to avoid testing all combinations.

    Orderings tested: the natural file order, all columns by ascending NFV,
    and every permutation of the (up to) three lowest-NFV columns whose NFV
    is at least 0.0001. Duplicate orderings are tested once and the list is
    capped at ``max_combinations``. The natural-order baseline is measured
    once and reused for the 'current_order' row.

    Args:
        con: DuckDB connection
        parquet_path: Path to parquet file
        limit: Optional limit on number of rows to analyze (when set, stratified sampling is not used)
        max_combinations: Maximum number of orderings to test
        use_stratified_sampling: If True (and no limit is given), use stratified sampling across entire file
        num_segments: Number of segments for stratified sampling
        segment_size: Size of each segment for sampling

    Returns:
        DataFrame sorted by total_rle with columns: sort_order, columns_used,
        total_rle, avg_rle, nfv_weighted_score, estimation_method, and one
        RLE count column per data column
    """
    def quote_ident(name: str) -> str:
        # Quote a column name so it is a valid ORDER BY term whatever it contains
        return '"' + name.replace('"', '""') + '"'

    print("Analyzing column characteristics...")

    # Try to get row group metadata first
    print("\nAttempting to read Parquet row group metadata...")
    row_group_stats = analyze_parquet_row_groups(con, parquet_path)
    if row_group_stats is not None:
        print("✓ Row group metadata available")
        print(row_group_stats.head())

    # Get NFV scores for all columns (still use sampling for this as it's cheap)
    sample_size = limit if limit else 100000
    # A missing score is treated as the worst case so sorting/formatting never sees None
    nfv_scores = {
        col: (1.0 if score is None else score)
        for col, score in calculate_nfv_score(con, parquet_path, sample_size).items()
    }

    print(f"\nColumn NFV Scores (lower = better for RLE):")
    for col, score in sorted(nfv_scores.items(), key=lambda x: x[1]):
        print(f" {col}: {score:.4f}")

    # Decide whether to use stratified sampling or simple limit
    use_estimation = bool(use_stratified_sampling and not limit)

    if use_estimation:
        print("\n" + "="*60)
        print("Using STRATIFIED SAMPLING across entire file")
        print("="*60)

        # Get total row count
        escaped_path = parquet_path.replace("'", "''")
        total_rows = con.sql(f"SELECT COUNT(*) FROM read_parquet('{escaped_path}')").fetchone()[0]
        print(f"Total rows in file: {total_rows:,}")
        print(f"Sampling strategy: {num_segments} segments of {segment_size} rows each")

        # Get baseline with natural order
        print("\nCalculating baseline (natural order)...")
        baseline_counts, density_stats = stratified_rle_sampling(
            con, parquet_path, None, num_segments, segment_size
        )

        print("\nBaseline RLE Density Statistics:")
        for col, stats in sorted(density_stats.items(), key=lambda x: x[1]['estimated_runs']):
            cv = stats['variance_coefficient']
            warning = " ⚠️ HIGH VARIANCE" if cv > 0.3 else ""
            print(f" {col}: {stats['estimated_runs']:,} runs (density: {stats['avg_density']:.4f}, CV: {cv:.2f}){warning}")

        column_names = list(nfv_scores.keys())
    else:
        print("\n" + "="*60)
        print(f"Using simple sampling (first {limit or 'all'} rows)")
        print("="*60)

        # Get baseline (natural file order)
        print("\nCalculating baseline (natural file order)...")
        baseline_counts = calculate_rle_for_columns(con, parquet_path, None, limit)
        column_names = list(baseline_counts.keys())

    # Sort columns by NFV for the NFV-based ordering
    sorted_by_nfv = sorted(column_names, key=lambda c: nfv_scores[c])

    # Exclude obvious columns (very low NFV < 0.0001) from permutations
    # These are likely constant columns that compress perfectly anywhere
    nfv_threshold = 0.0001
    non_trivial_cols = [c for c in sorted_by_nfv if nfv_scores[c] >= nfv_threshold]
    trivial_cols = [c for c in sorted_by_nfv if nfv_scores[c] < nfv_threshold]

    if trivial_cols:
        print(f"\nExcluding trivial columns from permutations (NFV < {nfv_threshold}): {', '.join(trivial_cols)}")

    # Define specific orderings to test
    candidates = [
        ([], 'current_order'),             # Natural file order
        (sorted_by_nfv, 'order_by_nfv')    # Sorted by NFV (low to high)
    ]

    # Add permutations of top N lowest NFV columns (excluding trivial ones)
    top_n = min(3, len(non_trivial_cols))  # Top 3 non-trivial or fewer
    print(f"\nGenerating permutations of top {top_n} lowest non-trivial NFV columns...")
    for perm in itertools.permutations(non_trivial_cols[:top_n]):
        candidates.append((list(perm), f"perm_{', '.join(perm)}"))

    # Each ordering costs a sort of the file, so identical column sequences are
    # tested once (first label wins) and the list honours max_combinations
    seen = set()
    orderings_to_test = []
    for sort_cols, label in candidates:
        key = tuple(sort_cols)
        if key not in seen:
            seen.add(key)
            orderings_to_test.append((sort_cols, label))
    orderings_to_test = orderings_to_test[:max(max_combinations, 0)]

    print(f"Testing {len(orderings_to_test)} orderings...")
    results = []

    for i, (sort_cols, label) in enumerate(orderings_to_test, 1):
        print(f"\n[{i}/{len(orderings_to_test)}] Testing: {label}")

        if use_estimation:
            print(f" Sort order: {', '.join(sort_cols) if sort_cols else 'natural (file_row_number)'}")

        if not sort_cols:
            # Natural order was already measured as the baseline above
            rle_counts = baseline_counts
        elif use_estimation:
            # Use stratified sampling for this ordering
            rle_counts, _ = stratified_rle_sampling(
                con, parquet_path, [quote_ident(c) for c in sort_cols], num_segments, segment_size
            )
        else:
            # Use regular calculation
            rle_counts = calculate_rle_for_columns(
                con, parquet_path, [quote_ident(c) for c in sort_cols], limit
            )

        # A missing count (no rows read) is treated as zero runs
        rle_counts = {col: (count or 0) for col, count in rle_counts.items()}
        total_rle = sum(rle_counts.values())

        # Calculate weighted score (considering both RLE and NFV)
        nfv_weighted = sum(rle_counts[col] * nfv_scores[col] for col in rle_counts)

        results.append({
            'sort_order': label,
            'columns_used': ', '.join(sort_cols) if sort_cols else 'file_row_number',
            'total_rle': total_rle,
            'avg_rle': total_rle / len(rle_counts) if rle_counts else 0,
            'nfv_weighted_score': nfv_weighted,
            'estimation_method': 'stratified' if use_estimation else 'sequential',
            **rle_counts
        })

    # Convert to DataFrame and sort by total RLE
    df = pd.DataFrame(results)
    if not df.empty:
        df = df.sort_values('total_rle')

    print(f"\n✓ Analysis complete! Tested {len(orderings_to_test)} orderings.")

    if use_estimation:
        print("\n⚠️ Note: RLE counts are ESTIMATES based on stratified sampling.")
        print(" Use these for relative comparison. Run full analysis on best candidate.")

    return df


# The name starts with "test_", so pytest would try to collect this library
# function whenever it is imported into a test module; opt out explicitly.
test_column_orderings_smart.__test__ = False
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
# Example usage:
|
|
495
|
+
# parquet_path = 'abfss://tmp@onelake.dfs.fabric.microsoft.com/data.Lakehouse/Tables/unsorted/summary/0-1c557fc2-59fe-487f-a3ee-67b5e63257df-0.parquet'
|
|
496
|
+
#
|
|
497
|
+
# # OPTION 1: Fast stratified sampling across entire file (recommended for large files)
|
|
498
|
+
# results_df = test_column_orderings_smart(
|
|
499
|
+
# con,
|
|
500
|
+
# parquet_path,
|
|
501
|
+
# use_stratified_sampling=True,
|
|
502
|
+
# num_segments=5, # Sample 5 segments across the file
|
|
503
|
+
# segment_size=1000 # 1000 rows per segment
|
|
504
|
+
# )
|
|
505
|
+
#
|
|
506
|
+
# # OPTION 2: Traditional approach with limited rows (faster but less accurate)
|
|
507
|
+
# results_df = test_column_orderings_smart(
|
|
508
|
+
# con,
|
|
509
|
+
# parquet_path,
|
|
510
|
+
# limit=10000,
|
|
511
|
+
# use_stratified_sampling=False
|
|
512
|
+
# )
|
|
513
|
+
#
|
|
514
|
+
# # Show results
|
|
515
|
+
# print("\nTop 5 best orderings:")
|
|
516
|
+
# print(results_df[['sort_order', 'columns_used', 'total_rle', 'estimation_method']].head(5))
|
|
517
|
+
#
|
|
518
|
+
# # Once you identify the best ordering, verify with full file scan:
|
|
519
|
+
# best_ordering = results_df.iloc[0]['columns_used'].split(', ')
|
|
520
|
+
# print(f"\nVerifying best ordering on FULL file: {best_ordering}")
|
|
521
|
+
# full_rle = calculate_rle_for_columns(con, parquet_path, best_ordering if best_ordering[0] != 'file_row_number' else None, limit=None)
|
|
@@ -7,6 +7,7 @@ duckrun/core.py
|
|
|
7
7
|
duckrun/files.py
|
|
8
8
|
duckrun/lakehouse.py
|
|
9
9
|
duckrun/notebook.py
|
|
10
|
+
duckrun/rle.py
|
|
10
11
|
duckrun/runner.py
|
|
11
12
|
duckrun/semantic_model.py
|
|
12
13
|
duckrun/stats.py
|
|
@@ -15,4 +16,5 @@ duckrun.egg-info/PKG-INFO
|
|
|
15
16
|
duckrun.egg-info/SOURCES.txt
|
|
16
17
|
duckrun.egg-info/dependency_links.txt
|
|
17
18
|
duckrun.egg-info/requires.txt
|
|
18
|
-
duckrun.egg-info/top_level.txt
|
|
19
|
+
duckrun.egg-info/top_level.txt
|
|
20
|
+
tests/test_rle.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.2.19.dev1"
|
|
7
|
+
version = "0.2.19.dev2"
|
|
8
8
|
description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import duckrun


def main():
    """Smoke-test the RLE integration against a live lakehouse (opens a real connection)."""
    mode = "full"
    con = duckrun.connect("tmp/data.lakehouse/unsorted")
    try:
        # Run the analysis on the calendar table; the message names the mode actually used
        print(f"Testing RLE {mode} mode on calendar table...")
        result = con.rle("calendar", mode)
        if result is None:
            # No DataFrame came back, so there is nothing to subscript
            print("\nNo RLE results returned")
            return
        print("\nTop 5 best orderings:")
        print(result[['sort_order', 'columns_used', 'total_rle', 'estimation_method']].head())
    finally:
        con.close()


if __name__ == "__main__":
    # Guarded so that importing or collecting this module does not open a connection
    main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|