iparq 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq/source.py +121 -7
- {iparq-0.4.1.dist-info → iparq-0.5.0.dist-info}/METADATA +37 -14
- iparq-0.5.0.dist-info/RECORD +8 -0
- {iparq-0.4.1.dist-info → iparq-0.5.0.dist-info}/WHEEL +1 -1
- iparq-0.4.1.dist-info/RECORD +0 -8
- {iparq-0.4.1.dist-info → iparq-0.5.0.dist-info}/entry_points.txt +0 -0
- {iparq-0.4.1.dist-info → iparq-0.5.0.dist-info}/licenses/LICENSE +0 -0
iparq/source.py
CHANGED
|
@@ -57,6 +57,12 @@ class ColumnInfo(BaseModel):
|
|
|
57
57
|
has_min_max (bool): Whether min/max statistics are available.
|
|
58
58
|
min_value (Optional[str]): The minimum value in the column (as string for display).
|
|
59
59
|
max_value (Optional[str]): The maximum value in the column (as string for display).
|
|
60
|
+
is_min_exact (Optional[bool]): Whether the min value is exact (PyArrow 22+).
|
|
61
|
+
is_max_exact (Optional[bool]): Whether the max value is exact (PyArrow 22+).
|
|
62
|
+
is_encrypted (Optional[bool]): Whether the column is encrypted.
|
|
63
|
+
num_values (Optional[int]): Number of values in this column chunk.
|
|
64
|
+
total_compressed_size (Optional[int]): Total compressed size in bytes.
|
|
65
|
+
total_uncompressed_size (Optional[int]): Total uncompressed size in bytes.
|
|
60
66
|
"""
|
|
61
67
|
|
|
62
68
|
row_group: int
|
|
@@ -67,6 +73,12 @@ class ColumnInfo(BaseModel):
|
|
|
67
73
|
has_min_max: Optional[bool] = False
|
|
68
74
|
min_value: Optional[str] = None
|
|
69
75
|
max_value: Optional[str] = None
|
|
76
|
+
is_min_exact: Optional[bool] = None
|
|
77
|
+
is_max_exact: Optional[bool] = None
|
|
78
|
+
is_encrypted: Optional[bool] = None
|
|
79
|
+
num_values: Optional[int] = None
|
|
80
|
+
total_compressed_size: Optional[int] = None
|
|
81
|
+
total_uncompressed_size: Optional[int] = None
|
|
70
82
|
|
|
71
83
|
|
|
72
84
|
class ParquetColumnInfo(BaseModel):
|
|
@@ -158,6 +170,28 @@ def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) ->
|
|
|
158
170
|
compression = column_chunk.compression
|
|
159
171
|
column_name = parquet_metadata.schema.names[j]
|
|
160
172
|
|
|
173
|
+
# Get additional column chunk metadata
|
|
174
|
+
num_values = (
|
|
175
|
+
column_chunk.num_values
|
|
176
|
+
if hasattr(column_chunk, "num_values")
|
|
177
|
+
else None
|
|
178
|
+
)
|
|
179
|
+
total_compressed = (
|
|
180
|
+
column_chunk.total_compressed_size
|
|
181
|
+
if hasattr(column_chunk, "total_compressed_size")
|
|
182
|
+
else None
|
|
183
|
+
)
|
|
184
|
+
total_uncompressed = (
|
|
185
|
+
column_chunk.total_uncompressed_size
|
|
186
|
+
if hasattr(column_chunk, "total_uncompressed_size")
|
|
187
|
+
else None
|
|
188
|
+
)
|
|
189
|
+
is_encrypted = (
|
|
190
|
+
column_chunk.is_crypto_metadata_set()
|
|
191
|
+
if hasattr(column_chunk, "is_crypto_metadata_set")
|
|
192
|
+
else None
|
|
193
|
+
)
|
|
194
|
+
|
|
161
195
|
# Create or update column info
|
|
162
196
|
column_info.columns.append(
|
|
163
197
|
ColumnInfo(
|
|
@@ -165,6 +199,10 @@ def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) ->
|
|
|
165
199
|
column_name=column_name,
|
|
166
200
|
column_index=j,
|
|
167
201
|
compression_type=compression,
|
|
202
|
+
num_values=num_values,
|
|
203
|
+
total_compressed_size=total_compressed,
|
|
204
|
+
total_uncompressed_size=total_uncompressed,
|
|
205
|
+
is_encrypted=is_encrypted,
|
|
168
206
|
)
|
|
169
207
|
)
|
|
170
208
|
except Exception as e:
|
|
@@ -252,6 +290,16 @@ def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -
|
|
|
252
290
|
# Fallback for complex types that might not stringify well
|
|
253
291
|
col.min_value = "<unable to display>"
|
|
254
292
|
col.max_value = "<unable to display>"
|
|
293
|
+
|
|
294
|
+
# PyArrow 22+ feature: check if min/max values are exact
|
|
295
|
+
# This helps users understand if statistics can be trusted for query optimization
|
|
296
|
+
try:
|
|
297
|
+
if hasattr(stats, "is_min_value_exact"):
|
|
298
|
+
col.is_min_exact = stats.is_min_value_exact
|
|
299
|
+
if hasattr(stats, "is_max_value_exact"):
|
|
300
|
+
col.is_max_exact = stats.is_max_value_exact
|
|
301
|
+
except Exception:
|
|
302
|
+
pass # Not available in older PyArrow versions
|
|
255
303
|
else:
|
|
256
304
|
col.has_min_max = False
|
|
257
305
|
break
|
|
@@ -262,12 +310,27 @@ def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -
|
|
|
262
310
|
)
|
|
263
311
|
|
|
264
312
|
|
|
265
|
-
def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
313
|
+
def format_size(size_bytes: Optional[int]) -> str:
|
|
314
|
+
"""Format bytes into human-readable size."""
|
|
315
|
+
if size_bytes is None:
|
|
316
|
+
return "N/A"
|
|
317
|
+
size: float = float(size_bytes)
|
|
318
|
+
for unit in ["B", "KB", "MB", "GB"]:
|
|
319
|
+
if abs(size) < 1024.0:
|
|
320
|
+
return f"{size:.1f}{unit}"
|
|
321
|
+
size /= 1024.0
|
|
322
|
+
return f"{size:.1f}TB"
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def print_column_info_table(
|
|
326
|
+
column_info: ParquetColumnInfo, show_sizes: bool = False
|
|
327
|
+
) -> None:
|
|
266
328
|
"""
|
|
267
329
|
Prints the column information using a Rich table.
|
|
268
330
|
|
|
269
331
|
Args:
|
|
270
332
|
column_info: The ParquetColumnInfo model to display.
|
|
333
|
+
show_sizes: Whether to show compressed/uncompressed size columns.
|
|
271
334
|
"""
|
|
272
335
|
table = Table(title="Parquet Column Information")
|
|
273
336
|
|
|
@@ -276,9 +339,18 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
|
276
339
|
table.add_column("Column Name", style="green")
|
|
277
340
|
table.add_column("Index", justify="center")
|
|
278
341
|
table.add_column("Compression", style="magenta")
|
|
279
|
-
table.add_column("Bloom
|
|
342
|
+
table.add_column("Bloom", justify="center")
|
|
343
|
+
table.add_column("Encrypted", justify="center")
|
|
280
344
|
table.add_column("Min Value", style="yellow")
|
|
281
345
|
table.add_column("Max Value", style="yellow")
|
|
346
|
+
table.add_column(
|
|
347
|
+
"Exact", justify="center", style="dim"
|
|
348
|
+
) # Shows if min/max are exact
|
|
349
|
+
|
|
350
|
+
if show_sizes:
|
|
351
|
+
table.add_column("Values", justify="right")
|
|
352
|
+
table.add_column("Compressed", justify="right", style="blue")
|
|
353
|
+
table.add_column("Ratio", justify="right", style="blue")
|
|
282
354
|
|
|
283
355
|
# Add rows to the table
|
|
284
356
|
for col in column_info.columns:
|
|
@@ -290,15 +362,48 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
|
290
362
|
col.max_value if col.has_min_max and col.max_value is not None else "N/A"
|
|
291
363
|
)
|
|
292
364
|
|
|
293
|
-
|
|
365
|
+
# Format exactness indicator (PyArrow 22+ feature)
|
|
366
|
+
exact_display = "N/A"
|
|
367
|
+
if col.is_min_exact is not None and col.is_max_exact is not None:
|
|
368
|
+
if col.is_min_exact and col.is_max_exact:
|
|
369
|
+
exact_display = "✅"
|
|
370
|
+
elif col.is_min_exact or col.is_max_exact:
|
|
371
|
+
exact_display = "~" # Partially exact
|
|
372
|
+
else:
|
|
373
|
+
exact_display = "❌"
|
|
374
|
+
|
|
375
|
+
# Format encryption status
|
|
376
|
+
encrypted_display = "🔒" if col.is_encrypted else "—"
|
|
377
|
+
|
|
378
|
+
row_data = [
|
|
294
379
|
str(col.row_group),
|
|
295
380
|
col.column_name,
|
|
296
381
|
str(col.column_index),
|
|
297
382
|
col.compression_type,
|
|
298
383
|
"✅" if col.has_bloom_filter else "❌",
|
|
384
|
+
encrypted_display,
|
|
299
385
|
min_display,
|
|
300
386
|
max_display,
|
|
301
|
-
|
|
387
|
+
exact_display,
|
|
388
|
+
]
|
|
389
|
+
|
|
390
|
+
if show_sizes:
|
|
391
|
+
# Calculate compression ratio
|
|
392
|
+
ratio = "N/A"
|
|
393
|
+
if col.total_compressed_size and col.total_uncompressed_size:
|
|
394
|
+
ratio = (
|
|
395
|
+
f"{col.total_uncompressed_size / col.total_compressed_size:.1f}x"
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
row_data.extend(
|
|
399
|
+
[
|
|
400
|
+
str(col.num_values) if col.num_values else "N/A",
|
|
401
|
+
format_size(col.total_compressed_size),
|
|
402
|
+
ratio,
|
|
403
|
+
]
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
table.add_row(*row_data)
|
|
302
407
|
|
|
303
408
|
# Print the table
|
|
304
409
|
console.print(table)
|
|
@@ -331,6 +436,7 @@ def inspect_single_file(
|
|
|
331
436
|
format: OutputFormat,
|
|
332
437
|
metadata_only: bool,
|
|
333
438
|
column_filter: Optional[str],
|
|
439
|
+
show_sizes: bool = False,
|
|
334
440
|
) -> None:
|
|
335
441
|
"""
|
|
336
442
|
Inspect a single Parquet file and display its metadata, compression settings, and bloom filter information.
|
|
@@ -339,7 +445,7 @@ def inspect_single_file(
|
|
|
339
445
|
Exception: If the file cannot be processed.
|
|
340
446
|
"""
|
|
341
447
|
try:
|
|
342
|
-
|
|
448
|
+
parquet_metadata, compression = read_parquet_metadata(filename)
|
|
343
449
|
except FileNotFoundError:
|
|
344
450
|
raise Exception(f"Cannot open: {filename}.")
|
|
345
451
|
except Exception as e:
|
|
@@ -382,7 +488,7 @@ def inspect_single_file(
|
|
|
382
488
|
|
|
383
489
|
# Print column details if not metadata only
|
|
384
490
|
if not metadata_only:
|
|
385
|
-
print_column_info_table(column_info)
|
|
491
|
+
print_column_info_table(column_info, show_sizes=show_sizes)
|
|
386
492
|
console.print(f"Compression codecs: {compression}")
|
|
387
493
|
|
|
388
494
|
|
|
@@ -404,6 +510,12 @@ def inspect(
|
|
|
404
510
|
column_filter: Optional[str] = typer.Option(
|
|
405
511
|
None, "--column", "-c", help="Filter results to show only specific column"
|
|
406
512
|
),
|
|
513
|
+
show_sizes: bool = typer.Option(
|
|
514
|
+
False,
|
|
515
|
+
"--sizes",
|
|
516
|
+
"-s",
|
|
517
|
+
help="Show column sizes and compression ratios",
|
|
518
|
+
),
|
|
407
519
|
):
|
|
408
520
|
"""
|
|
409
521
|
Inspect Parquet files and display their metadata, compression settings, and bloom filter information.
|
|
@@ -436,7 +548,9 @@ def inspect(
|
|
|
436
548
|
console.print("─" * (len(filename) + 6))
|
|
437
549
|
|
|
438
550
|
try:
|
|
439
|
-
inspect_single_file(filename, format, metadata_only, column_filter)
|
|
551
|
+
inspect_single_file(
|
|
552
|
+
filename, format, metadata_only, column_filter, show_sizes
|
|
553
|
+
)
|
|
440
554
|
except Exception as e:
|
|
441
555
|
console.print(f"Error processing {filename}: {e}", style="red")
|
|
442
556
|
continue
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: iparq
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Display version compression and bloom filter information about a parquet file
|
|
5
5
|
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -30,8 +30,12 @@ Description-Content-Type: text/markdown
|
|
|
30
30
|

|
|
31
31
|
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there's no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
32
32
|
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- **Bloom filters**: Displays if columns have bloom filters. Read more in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
36
|
+
- **Encryption detection**: Shows if columns are encrypted (🔒)
|
|
37
|
+
- **Statistics exactness**: Indicates if min/max statistics are exact or approximate (PyArrow 22+)
|
|
38
|
+
- **Compression ratios**: Optional display of column sizes and compression efficiency
|
|
35
39
|
|
|
36
40
|
## Installation
|
|
37
41
|
|
|
@@ -102,11 +106,12 @@ Options include:
|
|
|
102
106
|
- `--format`, `-f`: Output format, either `rich` (default) or `json`
|
|
103
107
|
- `--metadata-only`, `-m`: Show only file metadata without column details
|
|
104
108
|
- `--column`, `-c`: Filter results to show only a specific column
|
|
109
|
+
- `--sizes`, `-s`: Show column sizes and compression ratios
|
|
105
110
|
|
|
106
111
|
### Single File Examples:
|
|
107
112
|
|
|
108
113
|
```sh
|
|
109
|
-
# Basic inspection
|
|
114
|
+
# Basic inspection .
|
|
110
115
|
iparq inspect yourfile.parquet
|
|
111
116
|
|
|
112
117
|
# Output in JSON format
|
|
@@ -117,6 +122,9 @@ iparq inspect yourfile.parquet --metadata-only
|
|
|
117
122
|
|
|
118
123
|
# Filter to show only a specific column
|
|
119
124
|
iparq inspect yourfile.parquet --column column_name
|
|
125
|
+
|
|
126
|
+
# Show column sizes and compression ratios
|
|
127
|
+
iparq inspect yourfile.parquet --sizes
|
|
120
128
|
```
|
|
121
129
|
|
|
122
130
|
### Multiple Files and Glob Patterns:
|
|
@@ -137,7 +145,7 @@ iparq inspect important.parquet temp_*.parquet
|
|
|
137
145
|
|
|
138
146
|
When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
|
|
139
147
|
|
|
140
|
-
## Example output
|
|
148
|
+
## Example output
|
|
141
149
|
|
|
142
150
|
```log
|
|
143
151
|
ParquetMetaModel(
|
|
@@ -148,14 +156,29 @@ ParquetMetaModel(
|
|
|
148
156
|
format_version='2.6',
|
|
149
157
|
serialized_size=2223
|
|
150
158
|
)
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
┃
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
│ 0 │
|
|
157
|
-
│ 0 │
|
|
158
|
-
|
|
159
|
-
└───────────┴────────────┴───────┴───────────┴────────────┴───────────┴───────────┘
|
|
159
|
+
Parquet Column Information
|
|
160
|
+
┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
|
|
161
|
+
┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom ┃ Encrypted ┃ Min Value ┃ Max Value ┃ Exact ┃
|
|
162
|
+
┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
|
|
163
|
+
│ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │
|
|
164
|
+
│ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │
|
|
165
|
+
│ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │
|
|
166
|
+
└───────────┴─────────────┴───────┴─────────────┴───────┴───────────┴───────────┴───────────┴───────┘
|
|
160
167
|
Compression codecs: {'SNAPPY'}
|
|
161
168
|
```
|
|
169
|
+
|
|
170
|
+
### With `--sizes` flag
|
|
171
|
+
|
|
172
|
+
```log
|
|
173
|
+
iparq inspect yourfile.parquet --sizes
|
|
174
|
+
|
|
175
|
+
Parquet Column Information
|
|
176
|
+
┏━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
|
|
177
|
+
┃ Row ┃ Column ┃ ┃ ┃ ┃ ┃ Min ┃ Max ┃ ┃ ┃ ┃ ┃
|
|
178
|
+
┃ Group ┃ Name ┃ Index ┃ Compr… ┃ Bloom ┃ Encryp… ┃ Value ┃ Value ┃ Exact ┃ Values ┃ Compr… ┃ Ratio ┃
|
|
179
|
+
┡━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
|
|
180
|
+
│ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │ 3 │ 104.0B │ 1.0x │
|
|
181
|
+
│ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │ 3 │ 80.0B │ 0.9x │
|
|
182
|
+
│ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │ 3 │ 42.0B │ 1.0x │
|
|
183
|
+
└────────┴─────────┴───────┴────────┴───────┴─────────┴────────┴─────────┴───────┴────────┴────────┴───────┘
|
|
184
|
+
```
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
iparq/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
|
|
2
|
+
iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
iparq/source.py,sha256=6NG2-5aD0hswZ14BEfezoLXNo_D3GDnFWGi0WnY-wuo,20210
|
|
4
|
+
iparq-0.5.0.dist-info/METADATA,sha256=CO4Aj7pjbnFJeinzY7x25d1biLHw_EyHwxBH25LI_Lw,8401
|
|
5
|
+
iparq-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
iparq-0.5.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
7
|
+
iparq-0.5.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
8
|
+
iparq-0.5.0.dist-info/RECORD,,
|
iparq-0.4.1.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
iparq/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
|
|
2
|
-
iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
iparq/source.py,sha256=HT76AP19e-TheGB5cHlGqsyJGvdsPVMbAYuSo2xH0Bk,15476
|
|
4
|
-
iparq-0.4.1.dist-info/METADATA,sha256=aKFzoI4pVaAa_D-fW-FeicRHO6PepqOfMuKBXySYS4I,6018
|
|
5
|
-
iparq-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
-
iparq-0.4.1.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
7
|
-
iparq-0.4.1.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
8
|
-
iparq-0.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|