iparq 0.2.6__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq/__init__.py +1 -1
- iparq/source.py +153 -33
- {iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/METADATA +25 -6
- iparq-0.4.0.dist-info/RECORD +8 -0
- iparq-0.2.6.dist-info/RECORD +0 -8
- {iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/WHEEL +0 -0
- {iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/entry_points.txt +0 -0
- {iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/licenses/LICENSE +0 -0
iparq/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.6"
+__version__ = "0.4.0"
iparq/source.py
CHANGED
@@ -1,3 +1,4 @@
+import glob
 import json
 from enum import Enum
 from typing import List, Optional
@@ -53,6 +54,9 @@ class ColumnInfo(BaseModel):
         column_index (int): The index of the column.
         compression_type (str): The compression type used for the column.
         has_bloom_filter (bool): Whether the column has a bloom filter.
+        has_min_max (bool): Whether min/max statistics are available.
+        min_value (Optional[str]): The minimum value in the column (as string for display).
+        max_value (Optional[str]): The maximum value in the column (as string for display).
     """
 
     row_group: int
@@ -60,6 +64,9 @@ class ColumnInfo(BaseModel):
     column_index: int
     compression_type: str
     has_bloom_filter: Optional[bool] = False
+    has_min_max: Optional[bool] = False
+    min_value: Optional[str] = None
+    max_value: Optional[str] = None
 
 
 class ParquetColumnInfo(BaseModel):
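Taken together, the two hunks above extend the `ColumnInfo` Pydantic model with three optional statistics fields. A minimal sketch of the resulting model, for orientation only; field names are taken from the diff, and `column_name` is inferred from its use in `print_column_info_table` further down:

```python
# Rough sketch of ColumnInfo after this change (docstring omitted).
from typing import Optional

from pydantic import BaseModel


class ColumnInfo(BaseModel):
    row_group: int
    column_name: str              # used by the table-printing code below
    column_index: int
    compression_type: str
    has_bloom_filter: Optional[bool] = False
    has_min_max: Optional[bool] = False   # new in 0.4.0
    min_value: Optional[str] = None       # new in 0.4.0
    max_value: Optional[str] = None       # new in 0.4.0
```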
@@ -84,22 +91,16 @@ def read_parquet_metadata(filename: str):
         tuple: A tuple containing:
             - parquet_metadata (pyarrow.parquet.FileMetaData): The metadata of the Parquet file.
             - compression_codecs (set): A set of compression codecs used in the Parquet file.
-    """
-    try:
-        compression_codecs = set([])
-        parquet_metadata = pq.ParquetFile(filename).metadata
 
-
-
-
-
-
+
+    Raises:
+        FileNotFoundError: If the file cannot be found or opened.
+    """
+    compression_codecs = set([])
+    parquet_metadata = pq.ParquetFile(filename).metadata
 
-
-
-
-        )
-        exit(1)
+    for i in range(parquet_metadata.num_row_groups):
+        for j in range(parquet_metadata.num_columns):
+            compression_codecs.add(parquet_metadata.row_group(i).column(j).compression)
 
     return parquet_metadata, compression_codecs
 
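The rewrite drops the old `try`/`exit(1)` handling around metadata loading and lets `FileNotFoundError` propagate to the caller, which now decides how to report it. For reference, a standalone sketch of the same pyarrow calls; `"example.parquet"` is a placeholder path:

```python
# Read Parquet file metadata and collect the compression codec of every column chunk.
import pyarrow.parquet as pq

parquet_metadata = pq.ParquetFile("example.parquet").metadata

compression_codecs = set()
for i in range(parquet_metadata.num_row_groups):
    for j in range(parquet_metadata.num_columns):
        # ColumnChunkMetaData.compression is the codec name, e.g. "SNAPPY" or "ZSTD"
        compression_codecs.add(parquet_metadata.row_group(i).column(j).compression)

print(parquet_metadata.num_rows, compression_codecs)
```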
@@ -208,6 +209,59 @@ def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) -> None:
         )
 
 
+def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -> None:
+    """
+    Updates the column_info model with min/max statistics information.
+
+    Args:
+        parquet_metadata: The Parquet file metadata.
+        column_info: The ParquetColumnInfo model to update.
+    """
+    try:
+        num_row_groups = parquet_metadata.num_row_groups
+        num_columns = parquet_metadata.num_columns
+
+        for i in range(num_row_groups):
+            row_group = parquet_metadata.row_group(i)
+
+            for j in range(num_columns):
+                column_chunk = row_group.column(j)
+
+                # Find the corresponding column in our model
+                for col in column_info.columns:
+                    if col.row_group == i and col.column_index == j:
+                        # Check if this column has statistics
+                        if column_chunk.is_stats_set:
+                            stats = column_chunk.statistics
+                            col.has_min_max = stats.has_min_max
+
+                            if stats.has_min_max:
+                                # Convert values to string for display, handling potential None values
+                                try:
+                                    col.min_value = (
+                                        str(stats.min)
+                                        if stats.min is not None
+                                        else "null"
+                                    )
+                                    col.max_value = (
+                                        str(stats.max)
+                                        if stats.max is not None
+                                        else "null"
+                                    )
+                                except Exception:
+                                    # Fallback for complex types that might not stringify well
+                                    col.min_value = "<unable to display>"
+                                    col.max_value = "<unable to display>"
+                        else:
+                            col.has_min_max = False
+                        break
+    except Exception as e:
+        console.print(
+            f"Error while collecting min/max statistics: {e}",
+            style="blink bold red underline on white",
+        )
+
+
 def print_column_info_table(column_info: ParquetColumnInfo) -> None:
     """
     Prints the column information using a Rich table.
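The new `print_min_max_statistics` walks every column chunk and, where pyarrow reports statistics, copies the min/max values onto the matching `ColumnInfo` entry as display strings. A condensed sketch of the same pyarrow access pattern, with `"example.parquet"` as a placeholder path:

```python
# Iterate column-chunk statistics and print min/max where available.
import pyarrow.parquet as pq

meta = pq.ParquetFile("example.parquet").metadata

for i in range(meta.num_row_groups):
    for j in range(meta.num_columns):
        chunk = meta.row_group(i).column(j)
        # Statistics are optional; check before touching min/max
        if chunk.is_stats_set and chunk.statistics.has_min_max:
            stats = chunk.statistics
            print(i, chunk.path_in_schema, str(stats.min), str(stats.max))
```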
@@ -223,15 +277,27 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
     table.add_column("Index", justify="center")
     table.add_column("Compression", style="magenta")
     table.add_column("Bloom Filter", justify="center")
+    table.add_column("Min Value", style="yellow")
+    table.add_column("Max Value", style="yellow")
 
     # Add rows to the table
     for col in column_info.columns:
+        # Format min/max values for display
+        min_display = (
+            col.min_value if col.has_min_max and col.min_value is not None else "N/A"
+        )
+        max_display = (
+            col.max_value if col.has_min_max and col.max_value is not None else "N/A"
+        )
+
         table.add_row(
             str(col.row_group),
             col.column_name,
             str(col.column_index),
             col.compression_type,
             "✅" if col.has_bloom_filter else "❌",
+            min_display,
+            max_display,
         )
 
     # Print the table
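The table gains Min Value and Max Value columns, with "N/A" shown whenever a chunk has no usable statistics. A minimal Rich sketch of the new layout; the column set is abbreviated and the title and sample row are made up, not the package's exact output:

```python
# Minimal Rich table illustrating the two new statistics columns.
from rich.console import Console
from rich.table import Table

table = Table(title="Columns")
table.add_column("Row Group", justify="center")
table.add_column("Column", style="cyan")
table.add_column("Bloom Filter", justify="center")
table.add_column("Min Value", style="yellow")
table.add_column("Max Value", style="yellow")

table.add_row("0", "passenger_count", "✅", "0", "9")  # illustrative values only
Console().print(table)
```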
@@ -260,27 +326,24 @@ def output_json(
     print(json.dumps(result, indent=2))
 
 
-
-
-
-
-
-
-    ),
-    metadata_only: bool = typer.Option(
-        False,
-        "--metadata-only",
-        "-m",
-        help="Show only file metadata without column details",
-    ),
-    column_filter: Optional[str] = typer.Option(
-        None, "--column", "-c", help="Filter results to show only specific column"
-    ),
-):
+def inspect_single_file(
+    filename: str,
+    format: OutputFormat,
+    metadata_only: bool,
+    column_filter: Optional[str],
+) -> None:
     """
-    Inspect a Parquet file and display its metadata, compression settings, and bloom filter information.
+    Inspect a single Parquet file and display its metadata, compression settings, and bloom filter information.
+
+    Raises:
+        Exception: If the file cannot be processed.
     """
-
+    try:
+        (parquet_metadata, compression) = read_parquet_metadata(filename)
+    except FileNotFoundError:
+        raise Exception(f"Cannot open: {filename}.")
+    except Exception as e:
+        raise Exception(f"Failed to read metadata: {e}")
 
     # Create metadata model
     meta_model = ParquetMetaModel(
@@ -298,6 +361,7 @@ def inspect(
     # Collect information
     print_compression_types(parquet_metadata, column_info)
     print_bloom_filter_info(parquet_metadata, column_info)
+    print_min_max_statistics(parquet_metadata, column_info)
 
     # Filter columns if requested
     if column_filter:
@@ -322,5 +386,61 @@
     console.print(f"Compression codecs: {compression}")
 
 
+@app.command(name="")
+@app.command(name="inspect")
+def inspect(
+    filenames: List[str] = typer.Argument(
+        ..., help="Path(s) or pattern(s) to Parquet files to inspect"
+    ),
+    format: OutputFormat = typer.Option(
+        OutputFormat.RICH, "--format", "-f", help="Output format (rich or json)"
+    ),
+    metadata_only: bool = typer.Option(
+        False,
+        "--metadata-only",
+        "-m",
+        help="Show only file metadata without column details",
+    ),
+    column_filter: Optional[str] = typer.Option(
+        None, "--column", "-c", help="Filter results to show only specific column"
+    ),
+):
+    """
+    Inspect Parquet files and display their metadata, compression settings, and bloom filter information.
+    """
+    # Expand glob patterns and collect all matching files
+    all_files = []
+    for pattern in filenames:
+        matches = glob.glob(pattern)
+        if matches:
+            all_files.extend(matches)
+        else:
+            # If no matches found, treat as literal filename (for better error reporting)
+            all_files.append(pattern)
+
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_files = []
+    for file in all_files:
+        if file not in seen:
+            seen.add(file)
+            unique_files.append(file)
+
+    # Process each file
+    for i, filename in enumerate(unique_files):
+        # For multiple files, add a header to separate results
+        if len(unique_files) > 1:
+            if i > 0:
+                console.print()  # Add blank line between files
+            console.print(f"[bold blue]File: {filename}[/bold blue]")
+            console.print("─" * (len(filename) + 6))
+
+        try:
+            inspect_single_file(filename, format, metadata_only, column_filter)
+        except Exception as e:
+            console.print(f"Error processing {filename}: {e}", style="red")
+            continue
+
+
 if __name__ == "__main__":
     app()
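The new `inspect` command expands each argument with `glob.glob`, falls back to the literal name when nothing matches (so a missing file still produces a per-file error message), and de-duplicates the result while preserving order so each file is processed once. A compact, equivalent formulation of that expansion step, shown only for illustration; `expand_patterns` is not a function in the package:

```python
# Equivalent glob expansion and order-preserving dedup in compact form.
import glob
from typing import List


def expand_patterns(patterns: List[str]) -> List[str]:
    matched: List[str] = []
    for pattern in patterns:
        matched.extend(glob.glob(pattern) or [pattern])  # keep literal name if no match
    # dict.fromkeys keeps first-seen order while dropping duplicates
    return list(dict.fromkeys(matched))


print(expand_patterns(["*.parquet", "important.parquet"]))
```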
{iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: iparq
-Version: 0.2.6
+Version: 0.4.0
 Summary: Display version compression and bloom filter information about a parquet file
 Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
 License-File: LICENSE
@@ -88,10 +88,10 @@ Read more about bloom filters in this [great article](https://duckdb.org/2025/03
 
 ## Usage
 
-iparq
+iparq supports inspecting single files, multiple files, and glob patterns:
 
 ```sh
-iparq inspect <filename> [OPTIONS]
+iparq inspect <filename(s)> [OPTIONS]
 ```
 
 Options include:
@@ -100,9 +100,12 @@ Options include:
 - `--metadata-only`, `-m`: Show only file metadata without column details
 - `--column`, `-c`: Filter results to show only a specific column
 
-Examples:
+### Single File Examples:
 
 ```sh
+# Basic inspection
+iparq inspect yourfile.parquet
+
 # Output in JSON format
 iparq inspect yourfile.parquet --format json
 
@@ -113,9 +116,25 @@ iparq inspect yourfile.parquet --metadata-only
 iparq inspect yourfile.parquet --column column_name
 ```
 
-
+### Multiple Files and Glob Patterns:
+
+```sh
+# Inspect multiple specific files
+iparq inspect file1.parquet file2.parquet file3.parquet
+
+# Use glob patterns to inspect all parquet files
+iparq inspect *.parquet
+
+# Use specific patterns
+iparq inspect yellow*.parquet data_*.parquet
+
+# Combine patterns and specific files
+iparq inspect important.parquet temp_*.parquet
+```
+
+When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
 
-## Example
+## Example output - Bloom Filters
 
 ```log
 ParquetMetaModel(
iparq-0.4.0.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+iparq/__init__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
+iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+iparq/source.py,sha256=HT76AP19e-TheGB5cHlGqsyJGvdsPVMbAYuSo2xH0Bk,15476
+iparq-0.4.0.dist-info/METADATA,sha256=D6parmkBDsraB3C57YX9yistPxOYK-XHi_nBKJ2BGB0,6019
+iparq-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+iparq-0.4.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
+iparq-0.4.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
+iparq-0.4.0.dist-info/RECORD,,
iparq-0.2.6.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-iparq/__init__.py,sha256=Oz5HbwHMyE87nmwV80AZzpkJPf-wBg7eDuJr_BXZkhU,22
-iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-iparq/source.py,sha256=huC6I0hqwyv4BZ5xjI6FMZs9KH60xVHEKbmX6X8hhiA,10721
-iparq-0.2.6.dist-info/METADATA,sha256=LtiLJlVCHuOlx0gOOTqJ97S8baPFhnd_lOmKG2a-94g,5496
-iparq-0.2.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-iparq-0.2.6.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
-iparq-0.2.6.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
-iparq-0.2.6.dist-info/RECORD,,
{iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/WHEEL
File without changes
{iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/entry_points.txt
File without changes
{iparq-0.2.6.dist-info → iparq-0.4.0.dist-info}/licenses/LICENSE
File without changes