iparq 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq/__init__.py +1 -1
- iparq/source.py +72 -0
- {iparq-0.3.0.dist-info → iparq-0.4.0.dist-info}/METADATA +2 -2
- iparq-0.4.0.dist-info/RECORD +8 -0
- iparq-0.3.0.dist-info/RECORD +0 -8
- {iparq-0.3.0.dist-info → iparq-0.4.0.dist-info}/WHEEL +0 -0
- {iparq-0.3.0.dist-info → iparq-0.4.0.dist-info}/entry_points.txt +0 -0
- {iparq-0.3.0.dist-info → iparq-0.4.0.dist-info}/licenses/LICENSE +0 -0
iparq/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.3.0"
|
|
1
|
+
__version__ = "0.4.0"
|
iparq/source.py
CHANGED
|
@@ -54,6 +54,9 @@ class ColumnInfo(BaseModel):
|
|
|
54
54
|
column_index (int): The index of the column.
|
|
55
55
|
compression_type (str): The compression type used for the column.
|
|
56
56
|
has_bloom_filter (bool): Whether the column has a bloom filter.
|
|
57
|
+
has_min_max (bool): Whether min/max statistics are available.
|
|
58
|
+
min_value (Optional[str]): The minimum value in the column (as string for display).
|
|
59
|
+
max_value (Optional[str]): The maximum value in the column (as string for display).
|
|
57
60
|
"""
|
|
58
61
|
|
|
59
62
|
row_group: int
|
|
@@ -61,6 +64,9 @@ class ColumnInfo(BaseModel):
|
|
|
61
64
|
column_index: int
|
|
62
65
|
compression_type: str
|
|
63
66
|
has_bloom_filter: Optional[bool] = False
|
|
67
|
+
has_min_max: Optional[bool] = False
|
|
68
|
+
min_value: Optional[str] = None
|
|
69
|
+
max_value: Optional[str] = None
|
|
64
70
|
|
|
65
71
|
|
|
66
72
|
class ParquetColumnInfo(BaseModel):
|
|
@@ -203,6 +209,59 @@ def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) ->
|
|
|
203
209
|
)
|
|
204
210
|
|
|
205
211
|
|
|
212
|
+
def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -> None:
|
|
213
|
+
"""
|
|
214
|
+
Updates the column_info model with min/max statistics information.
|
|
215
|
+
|
|
216
|
+
Args:
|
|
217
|
+
parquet_metadata: The Parquet file metadata.
|
|
218
|
+
column_info: The ParquetColumnInfo model to update.
|
|
219
|
+
"""
|
|
220
|
+
try:
|
|
221
|
+
num_row_groups = parquet_metadata.num_row_groups
|
|
222
|
+
num_columns = parquet_metadata.num_columns
|
|
223
|
+
|
|
224
|
+
for i in range(num_row_groups):
|
|
225
|
+
row_group = parquet_metadata.row_group(i)
|
|
226
|
+
|
|
227
|
+
for j in range(num_columns):
|
|
228
|
+
column_chunk = row_group.column(j)
|
|
229
|
+
|
|
230
|
+
# Find the corresponding column in our model
|
|
231
|
+
for col in column_info.columns:
|
|
232
|
+
if col.row_group == i and col.column_index == j:
|
|
233
|
+
# Check if this column has statistics
|
|
234
|
+
if column_chunk.is_stats_set:
|
|
235
|
+
stats = column_chunk.statistics
|
|
236
|
+
col.has_min_max = stats.has_min_max
|
|
237
|
+
|
|
238
|
+
if stats.has_min_max:
|
|
239
|
+
# Convert values to string for display, handling potential None values
|
|
240
|
+
try:
|
|
241
|
+
col.min_value = (
|
|
242
|
+
str(stats.min)
|
|
243
|
+
if stats.min is not None
|
|
244
|
+
else "null"
|
|
245
|
+
)
|
|
246
|
+
col.max_value = (
|
|
247
|
+
str(stats.max)
|
|
248
|
+
if stats.max is not None
|
|
249
|
+
else "null"
|
|
250
|
+
)
|
|
251
|
+
except Exception:
|
|
252
|
+
# Fallback for complex types that might not stringify well
|
|
253
|
+
col.min_value = "<unable to display>"
|
|
254
|
+
col.max_value = "<unable to display>"
|
|
255
|
+
else:
|
|
256
|
+
col.has_min_max = False
|
|
257
|
+
break
|
|
258
|
+
except Exception as e:
|
|
259
|
+
console.print(
|
|
260
|
+
f"Error while collecting min/max statistics: {e}",
|
|
261
|
+
style="blink bold red underline on white",
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
|
|
206
265
|
def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
207
266
|
"""
|
|
208
267
|
Prints the column information using a Rich table.
|
|
@@ -218,15 +277,27 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
|
218
277
|
table.add_column("Index", justify="center")
|
|
219
278
|
table.add_column("Compression", style="magenta")
|
|
220
279
|
table.add_column("Bloom Filter", justify="center")
|
|
280
|
+
table.add_column("Min Value", style="yellow")
|
|
281
|
+
table.add_column("Max Value", style="yellow")
|
|
221
282
|
|
|
222
283
|
# Add rows to the table
|
|
223
284
|
for col in column_info.columns:
|
|
285
|
+
# Format min/max values for display
|
|
286
|
+
min_display = (
|
|
287
|
+
col.min_value if col.has_min_max and col.min_value is not None else "N/A"
|
|
288
|
+
)
|
|
289
|
+
max_display = (
|
|
290
|
+
col.max_value if col.has_min_max and col.max_value is not None else "N/A"
|
|
291
|
+
)
|
|
292
|
+
|
|
224
293
|
table.add_row(
|
|
225
294
|
str(col.row_group),
|
|
226
295
|
col.column_name,
|
|
227
296
|
str(col.column_index),
|
|
228
297
|
col.compression_type,
|
|
229
298
|
"✅" if col.has_bloom_filter else "❌",
|
|
299
|
+
min_display,
|
|
300
|
+
max_display,
|
|
230
301
|
)
|
|
231
302
|
|
|
232
303
|
# Print the table
|
|
@@ -290,6 +361,7 @@ def inspect_single_file(
|
|
|
290
361
|
# Collect information
|
|
291
362
|
print_compression_types(parquet_metadata, column_info)
|
|
292
363
|
print_bloom_filter_info(parquet_metadata, column_info)
|
|
364
|
+
print_min_max_statistics(parquet_metadata, column_info)
|
|
293
365
|
|
|
294
366
|
# Filter columns if requested
|
|
295
367
|
if column_filter:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: iparq
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Display version compression and bloom filter information about a parquet file
|
|
5
5
|
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -134,7 +134,7 @@ iparq inspect important.parquet temp_*.parquet
|
|
|
134
134
|
|
|
135
135
|
When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
|
|
136
136
|
|
|
137
|
-
## Example
|
|
137
|
+
## Example output - Bloom Filters
|
|
138
138
|
|
|
139
139
|
```log
|
|
140
140
|
ParquetMetaModel(
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
iparq/__init__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
|
|
2
|
+
iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
iparq/source.py,sha256=HT76AP19e-TheGB5cHlGqsyJGvdsPVMbAYuSo2xH0Bk,15476
|
|
4
|
+
iparq-0.4.0.dist-info/METADATA,sha256=D6parmkBDsraB3C57YX9yistPxOYK-XHi_nBKJ2BGB0,6019
|
|
5
|
+
iparq-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
+
iparq-0.4.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
7
|
+
iparq-0.4.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
8
|
+
iparq-0.4.0.dist-info/RECORD,,
|
iparq-0.3.0.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
iparq/__init__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
|
|
2
|
-
iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
iparq/source.py,sha256=wMg7c2lw1TSoCd5h3zvMNT_bAqWJMObRq5aPyXrVBik,12288
|
|
4
|
-
iparq-0.3.0.dist-info/METADATA,sha256=mWSvqv57D5PmtzqarAm-vAQoHFDPoiMkHS-nJcnoM_g,6018
|
|
5
|
-
iparq-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
-
iparq-0.3.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
7
|
-
iparq-0.3.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
8
|
-
iparq-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|