iparq 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq/source.py +40 -0
- {iparq-0.1.5.dist-info → iparq-0.1.7.dist-info}/METADATA +62 -2
- iparq-0.1.7.dist-info/RECORD +7 -0
- iparq-0.1.5.dist-info/RECORD +0 -7
- {iparq-0.1.5.dist-info → iparq-0.1.7.dist-info}/WHEEL +0 -0
- {iparq-0.1.5.dist-info → iparq-0.1.7.dist-info}/entry_points.txt +0 -0
- {iparq-0.1.5.dist-info → iparq-0.1.7.dist-info}/licenses/LICENSE +0 -0
iparq/source.py
CHANGED
|
@@ -120,6 +120,45 @@ def print_compression_types(parquet_metadata) -> None:
|
|
|
120
120
|
pass
|
|
121
121
|
|
|
122
122
|
|
|
123
|
+
def print_bloom_filter_info(parquet_metadata) -> None:
|
|
124
|
+
"""
|
|
125
|
+
Prints information about bloom filters for each column in each row group of the Parquet file.
|
|
126
|
+
"""
|
|
127
|
+
try:
|
|
128
|
+
num_row_groups = parquet_metadata.num_row_groups
|
|
129
|
+
num_columns = parquet_metadata.num_columns
|
|
130
|
+
has_bloom_filters = False
|
|
131
|
+
|
|
132
|
+
console.print("[bold underline]Bloom Filter Info:[/bold underline]")
|
|
133
|
+
|
|
134
|
+
for i in range(num_row_groups):
|
|
135
|
+
row_group = parquet_metadata.row_group(i)
|
|
136
|
+
bloom_filters_in_group = False
|
|
137
|
+
|
|
138
|
+
for j in range(num_columns):
|
|
139
|
+
column_chunk = row_group.column(j)
|
|
140
|
+
column_name = parquet_metadata.schema.column(j).name
|
|
141
|
+
|
|
142
|
+
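# Check if this column has bloom filters using is_stats_set (NOTE(review): is_stats_set reports whether column statistics are present, so this is only a proxy for bloom filters - TODO confirm it reflects actual bloom filter presence)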
|
|
143
|
+
if hasattr(column_chunk, "is_stats_set") and column_chunk.is_stats_set:
|
|
144
|
+
if not bloom_filters_in_group:
|
|
145
|
+
console.print(f"[bold]Row Group {i}:[/bold]")
|
|
146
|
+
bloom_filters_in_group = True
|
|
147
|
+
has_bloom_filters = True
|
|
148
|
+
console.print(
|
|
149
|
+
f" Column '{column_name}' (Index {j}): [green]Has bloom filter[/green]"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
if not has_bloom_filters:
|
|
153
|
+
console.print(" [italic]No bloom filters found in any column[/italic]")
|
|
154
|
+
|
|
155
|
+
except Exception as e:
|
|
156
|
+
console.print(
|
|
157
|
+
f"Error while printing bloom filter information: {e}",
|
|
158
|
+
style="blink bold red underline on white",
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
|
|
123
162
|
@app.command()
|
|
124
163
|
def main(filename: str):
|
|
125
164
|
"""
|
|
@@ -135,6 +174,7 @@ def main(filename: str):
|
|
|
135
174
|
|
|
136
175
|
print_parquet_metadata(parquet_metadata)
|
|
137
176
|
print_compression_types(parquet_metadata)
|
|
177
|
+
print_bloom_filter_info(parquet_metadata)
|
|
138
178
|
print(f"Compression codecs: {compression}")
|
|
139
179
|
|
|
140
180
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: iparq
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.7
|
|
4
4
|
Summary: Display version and compression information about a parquet file
|
|
5
5
|
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -26,6 +26,10 @@ Description-Content-Type: text/markdown
|
|
|
26
26
|

|
|
27
27
|
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
28
28
|
|
|
29
|
+
***New*** Bloom filter information: displays whether bloom filters are present.
|
|
30
|
+
Read more about bloom filters in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
31
|
+
|
|
32
|
+
|
|
29
33
|
## Installation
|
|
30
34
|
|
|
31
35
|
### Using pip
|
|
@@ -80,7 +84,63 @@ iparq <filename>
|
|
|
80
84
|
|
|
81
85
|
Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
|
|
82
86
|
|
|
83
|
-
## Example
|
|
87
|
+
## Example output - Bloom Filters
|
|
88
|
+
|
|
89
|
+
```log
|
|
90
|
+
ParquetMetaModel(
|
|
91
|
+
created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
|
|
92
|
+
num_columns=1,
|
|
93
|
+
num_rows=100000000,
|
|
94
|
+
num_row_groups=10,
|
|
95
|
+
format_version='1.0',
|
|
96
|
+
serialized_size=1196
|
|
97
|
+
)
|
|
98
|
+
Column Compression Info:
|
|
99
|
+
Row Group 0:
|
|
100
|
+
Column 'r' (Index 0): SNAPPY
|
|
101
|
+
Row Group 1:
|
|
102
|
+
Column 'r' (Index 0): SNAPPY
|
|
103
|
+
Row Group 2:
|
|
104
|
+
Column 'r' (Index 0): SNAPPY
|
|
105
|
+
Row Group 3:
|
|
106
|
+
Column 'r' (Index 0): SNAPPY
|
|
107
|
+
Row Group 4:
|
|
108
|
+
Column 'r' (Index 0): SNAPPY
|
|
109
|
+
Row Group 5:
|
|
110
|
+
Column 'r' (Index 0): SNAPPY
|
|
111
|
+
Row Group 6:
|
|
112
|
+
Column 'r' (Index 0): SNAPPY
|
|
113
|
+
Row Group 7:
|
|
114
|
+
Column 'r' (Index 0): SNAPPY
|
|
115
|
+
Row Group 8:
|
|
116
|
+
Column 'r' (Index 0): SNAPPY
|
|
117
|
+
Row Group 9:
|
|
118
|
+
Column 'r' (Index 0): SNAPPY
|
|
119
|
+
Bloom Filter Info:
|
|
120
|
+
Row Group 0:
|
|
121
|
+
Column 'r' (Index 0): Has bloom filter
|
|
122
|
+
Row Group 1:
|
|
123
|
+
Column 'r' (Index 0): Has bloom filter
|
|
124
|
+
Row Group 2:
|
|
125
|
+
Column 'r' (Index 0): Has bloom filter
|
|
126
|
+
Row Group 3:
|
|
127
|
+
Column 'r' (Index 0): Has bloom filter
|
|
128
|
+
Row Group 4:
|
|
129
|
+
Column 'r' (Index 0): Has bloom filter
|
|
130
|
+
Row Group 5:
|
|
131
|
+
Column 'r' (Index 0): Has bloom filter
|
|
132
|
+
Row Group 6:
|
|
133
|
+
Column 'r' (Index 0): Has bloom filter
|
|
134
|
+
Row Group 7:
|
|
135
|
+
Column 'r' (Index 0): Has bloom filter
|
|
136
|
+
Row Group 8:
|
|
137
|
+
Column 'r' (Index 0): Has bloom filter
|
|
138
|
+
Row Group 9:
|
|
139
|
+
Column 'r' (Index 0): Has bloom filter
|
|
140
|
+
Compression codecs: {'SNAPPY'}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Example output
|
|
84
144
|
|
|
85
145
|
```log
|
|
86
146
|
ParquetMetaModel(
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
+
iparq/source.py,sha256=crKRTuZY6W2zEhFfAzb4XWopaVy9qnEkFqz4jbyGmeM,6439
|
|
3
|
+
iparq-0.1.7.dist-info/METADATA,sha256=ku4ZsLQ1Iq2ovPzKqv8aGqBGBkn3nTviW6hFzFsP6bw,6884
|
|
4
|
+
iparq-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
+
iparq-0.1.7.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
6
|
+
iparq-0.1.7.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
7
|
+
iparq-0.1.7.dist-info/RECORD,,
|
iparq-0.1.5.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
-
iparq/source.py,sha256=7ocTpA7j5C-oSyLkMPhDifpH3cPhqyK3LBu0CjjG83s,4851
|
|
3
|
-
iparq-0.1.5.dist-info/METADATA,sha256=7kLNc40ROyYot5N37GXjSErzpp5WPDWa9_Y2BQpmr6o,5387
|
|
4
|
-
iparq-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
-
iparq-0.1.5.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
6
|
-
iparq-0.1.5.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
7
|
-
iparq-0.1.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|