iparq 0.1.7__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq/py.typed +0 -0
- iparq/source.py +187 -43
- iparq-0.2.5.dist-info/METADATA +145 -0
- iparq-0.2.5.dist-info/RECORD +8 -0
- iparq-0.1.7.dist-info/METADATA +0 -216
- iparq-0.1.7.dist-info/RECORD +0 -7
- {iparq-0.1.7.dist-info → iparq-0.2.5.dist-info}/WHEEL +0 -0
- {iparq-0.1.7.dist-info → iparq-0.2.5.dist-info}/entry_points.txt +0 -0
- {iparq-0.1.7.dist-info → iparq-0.2.5.dist-info}/licenses/LICENSE +0 -0
iparq/py.typed
ADDED
|
File without changes
|
iparq/source.py
CHANGED
|
@@ -1,13 +1,27 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
|
|
1
5
|
import pyarrow.parquet as pq
|
|
2
6
|
import typer
|
|
3
7
|
from pydantic import BaseModel
|
|
4
8
|
from rich import print
|
|
5
9
|
from rich.console import Console
|
|
10
|
+
from rich.table import Table
|
|
6
11
|
|
|
7
|
-
app = typer.Typer(
|
|
12
|
+
app = typer.Typer(
|
|
13
|
+
help="Inspect Parquet files for metadata, compression, and bloom filters"
|
|
14
|
+
)
|
|
8
15
|
console = Console()
|
|
9
16
|
|
|
10
17
|
|
|
18
|
+
class OutputFormat(str, Enum):
|
|
19
|
+
"""Enum for output format options."""
|
|
20
|
+
|
|
21
|
+
RICH = "rich"
|
|
22
|
+
JSON = "json"
|
|
23
|
+
|
|
24
|
+
|
|
11
25
|
class ParquetMetaModel(BaseModel):
|
|
12
26
|
"""
|
|
13
27
|
ParquetMetaModel is a data model representing metadata for a Parquet file.
|
|
@@ -29,6 +43,36 @@ class ParquetMetaModel(BaseModel):
|
|
|
29
43
|
serialized_size: int
|
|
30
44
|
|
|
31
45
|
|
|
46
|
+
class ColumnInfo(BaseModel):
|
|
47
|
+
"""
|
|
48
|
+
ColumnInfo is a data model representing information about a column in a Parquet file.
|
|
49
|
+
|
|
50
|
+
Attributes:
|
|
51
|
+
row_group (int): The row group index.
|
|
52
|
+
column_name (str): The name of the column.
|
|
53
|
+
column_index (int): The index of the column.
|
|
54
|
+
compression_type (str): The compression type used for the column.
|
|
55
|
+
has_bloom_filter (bool): Whether the column has a bloom filter.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
row_group: int
|
|
59
|
+
column_name: str
|
|
60
|
+
column_index: int
|
|
61
|
+
compression_type: str
|
|
62
|
+
has_bloom_filter: Optional[bool] = False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class ParquetColumnInfo(BaseModel):
|
|
66
|
+
"""
|
|
67
|
+
ParquetColumnInfo is a data model representing information about all columns in a Parquet file.
|
|
68
|
+
|
|
69
|
+
Attributes:
|
|
70
|
+
columns (List[ColumnInfo]): List of column information.
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
columns: List[ColumnInfo] = []
|
|
74
|
+
|
|
75
|
+
|
|
32
76
|
def read_parquet_metadata(filename: str):
|
|
33
77
|
"""
|
|
34
78
|
Reads the metadata of a Parquet file and extracts the compression codecs used.
|
|
@@ -94,88 +138,188 @@ def print_parquet_metadata(parquet_metadata):
|
|
|
94
138
|
pass
|
|
95
139
|
|
|
96
140
|
|
|
97
|
-
def print_compression_types(parquet_metadata) -> None:
|
|
141
|
+
def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) -> None:
|
|
98
142
|
"""
|
|
99
|
-
|
|
143
|
+
Collects compression type information for each column and adds it to the column_info model.
|
|
144
|
+
|
|
145
|
+
Args:
|
|
146
|
+
parquet_metadata: The Parquet file metadata.
|
|
147
|
+
column_info: The ParquetColumnInfo model to update.
|
|
100
148
|
"""
|
|
101
149
|
try:
|
|
102
150
|
num_row_groups = parquet_metadata.num_row_groups
|
|
103
151
|
num_columns = parquet_metadata.num_columns
|
|
104
|
-
|
|
152
|
+
|
|
105
153
|
for i in range(num_row_groups):
|
|
106
|
-
|
|
154
|
+
row_group = parquet_metadata.row_group(i)
|
|
107
155
|
for j in range(num_columns):
|
|
108
|
-
column_chunk =
|
|
156
|
+
column_chunk = row_group.column(j)
|
|
109
157
|
compression = column_chunk.compression
|
|
110
|
-
column_name = parquet_metadata.schema.
|
|
111
|
-
|
|
112
|
-
|
|
158
|
+
column_name = parquet_metadata.schema.names[j]
|
|
159
|
+
|
|
160
|
+
# Create or update column info
|
|
161
|
+
column_info.columns.append(
|
|
162
|
+
ColumnInfo(
|
|
163
|
+
row_group=i,
|
|
164
|
+
column_name=column_name,
|
|
165
|
+
column_index=j,
|
|
166
|
+
compression_type=compression,
|
|
167
|
+
)
|
|
113
168
|
)
|
|
114
169
|
except Exception as e:
|
|
115
170
|
console.print(
|
|
116
|
-
f"Error while
|
|
171
|
+
f"Error while collecting compression types: {e}",
|
|
117
172
|
style="blink bold red underline on white",
|
|
118
173
|
)
|
|
119
|
-
finally:
|
|
120
|
-
pass
|
|
121
174
|
|
|
122
175
|
|
|
123
|
-
def print_bloom_filter_info(parquet_metadata) -> None:
|
|
176
|
+
def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) -> None:
|
|
124
177
|
"""
|
|
125
|
-
|
|
178
|
+
Updates the column_info model with bloom filter information.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
parquet_metadata: The Parquet file metadata.
|
|
182
|
+
column_info: The ParquetColumnInfo model to update.
|
|
126
183
|
"""
|
|
127
184
|
try:
|
|
128
185
|
num_row_groups = parquet_metadata.num_row_groups
|
|
129
186
|
num_columns = parquet_metadata.num_columns
|
|
130
|
-
has_bloom_filters = False
|
|
131
|
-
|
|
132
|
-
console.print("[bold underline]Bloom Filter Info:[/bold underline]")
|
|
133
187
|
|
|
134
188
|
for i in range(num_row_groups):
|
|
135
189
|
row_group = parquet_metadata.row_group(i)
|
|
136
|
-
bloom_filters_in_group = False
|
|
137
190
|
|
|
138
191
|
for j in range(num_columns):
|
|
139
192
|
column_chunk = row_group.column(j)
|
|
140
|
-
column_name = parquet_metadata.schema.column(j).name
|
|
141
|
-
|
|
142
|
-
# Check if this column has bloom filters using is_stats_set
|
|
143
|
-
if hasattr(column_chunk, "is_stats_set") and column_chunk.is_stats_set:
|
|
144
|
-
if not bloom_filters_in_group:
|
|
145
|
-
console.print(f"[bold]Row Group {i}:[/bold]")
|
|
146
|
-
bloom_filters_in_group = True
|
|
147
|
-
has_bloom_filters = True
|
|
148
|
-
console.print(
|
|
149
|
-
f" Column '{column_name}' (Index {j}): [green]Has bloom filter[/green]"
|
|
150
|
-
)
|
|
151
|
-
|
|
152
|
-
if not has_bloom_filters:
|
|
153
|
-
console.print(" [italic]No bloom filters found in any column[/italic]")
|
|
154
193
|
|
|
194
|
+
# Find the corresponding column in our model
|
|
195
|
+
for col in column_info.columns:
|
|
196
|
+
if col.row_group == i and col.column_index == j:
|
|
197
|
+
# Check if this column has bloom filters
|
|
198
|
+
has_bloom_filter = (
|
|
199
|
+
hasattr(column_chunk, "is_stats_set")
|
|
200
|
+
and column_chunk.is_stats_set
|
|
201
|
+
)
|
|
202
|
+
col.has_bloom_filter = has_bloom_filter
|
|
203
|
+
break
|
|
155
204
|
except Exception as e:
|
|
156
205
|
console.print(
|
|
157
|
-
f"Error while
|
|
206
|
+
f"Error while collecting bloom filter information: {e}",
|
|
158
207
|
style="blink bold red underline on white",
|
|
159
208
|
)
|
|
160
209
|
|
|
161
210
|
|
|
162
|
-
|
|
163
|
-
def main(filename: str):
|
|
211
|
+
def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
164
212
|
"""
|
|
165
|
-
|
|
213
|
+
Prints the column information using a Rich table.
|
|
166
214
|
|
|
167
215
|
Args:
|
|
168
|
-
|
|
216
|
+
column_info: The ParquetColumnInfo model to display.
|
|
217
|
+
"""
|
|
218
|
+
table = Table(title="Parquet Column Information")
|
|
219
|
+
|
|
220
|
+
# Add table columns
|
|
221
|
+
table.add_column("Row Group", justify="center", style="cyan")
|
|
222
|
+
table.add_column("Column Name", style="green")
|
|
223
|
+
table.add_column("Index", justify="center")
|
|
224
|
+
table.add_column("Compression", style="magenta")
|
|
225
|
+
table.add_column("Bloom Filter", justify="center")
|
|
226
|
+
|
|
227
|
+
# Add rows to the table
|
|
228
|
+
for col in column_info.columns:
|
|
229
|
+
table.add_row(
|
|
230
|
+
str(col.row_group),
|
|
231
|
+
col.column_name,
|
|
232
|
+
str(col.column_index),
|
|
233
|
+
col.compression_type,
|
|
234
|
+
"✅" if col.has_bloom_filter else "❌",
|
|
235
|
+
)
|
|
169
236
|
|
|
170
|
-
|
|
171
|
-
|
|
237
|
+
# Print the table
|
|
238
|
+
console.print(table)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def output_json(
|
|
242
|
+
meta_model: ParquetMetaModel,
|
|
243
|
+
column_info: ParquetColumnInfo,
|
|
244
|
+
compression_codecs: set,
|
|
245
|
+
) -> None:
|
|
246
|
+
"""
|
|
247
|
+
Outputs the parquet information in JSON format.
|
|
248
|
+
|
|
249
|
+
Args:
|
|
250
|
+
meta_model: The Parquet metadata model
|
|
251
|
+
column_info: The column information model
|
|
252
|
+
compression_codecs: Set of compression codecs used
|
|
253
|
+
"""
|
|
254
|
+
result = {
|
|
255
|
+
"metadata": meta_model.model_dump(),
|
|
256
|
+
"columns": [column.model_dump() for column in column_info.columns],
|
|
257
|
+
"compression_codecs": list(compression_codecs),
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
print(json.dumps(result, indent=2))
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@app.command(name="")
|
|
264
|
+
@app.command(name="inspect")
|
|
265
|
+
def inspect(
|
|
266
|
+
filename: str = typer.Argument(..., help="Path to the Parquet file to inspect"),
|
|
267
|
+
format: OutputFormat = typer.Option(
|
|
268
|
+
OutputFormat.RICH, "--format", "-f", help="Output format (rich or json)"
|
|
269
|
+
),
|
|
270
|
+
metadata_only: bool = typer.Option(
|
|
271
|
+
False,
|
|
272
|
+
"--metadata-only",
|
|
273
|
+
"-m",
|
|
274
|
+
help="Show only file metadata without column details",
|
|
275
|
+
),
|
|
276
|
+
column_filter: Optional[str] = typer.Option(
|
|
277
|
+
None, "--column", "-c", help="Filter results to show only specific column"
|
|
278
|
+
),
|
|
279
|
+
):
|
|
280
|
+
"""
|
|
281
|
+
Inspect a Parquet file and display its metadata, compression settings, and bloom filter information.
|
|
172
282
|
"""
|
|
173
283
|
(parquet_metadata, compression) = read_parquet_metadata(filename)
|
|
174
284
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
285
|
+
# Create metadata model
|
|
286
|
+
meta_model = ParquetMetaModel(
|
|
287
|
+
created_by=parquet_metadata.created_by,
|
|
288
|
+
num_columns=parquet_metadata.num_columns,
|
|
289
|
+
num_rows=parquet_metadata.num_rows,
|
|
290
|
+
num_row_groups=parquet_metadata.num_row_groups,
|
|
291
|
+
format_version=str(parquet_metadata.format_version),
|
|
292
|
+
serialized_size=parquet_metadata.serialized_size,
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
# Create a model to store column information
|
|
296
|
+
column_info = ParquetColumnInfo()
|
|
297
|
+
|
|
298
|
+
# Collect information
|
|
299
|
+
print_compression_types(parquet_metadata, column_info)
|
|
300
|
+
print_bloom_filter_info(parquet_metadata, column_info)
|
|
301
|
+
|
|
302
|
+
# Filter columns if requested
|
|
303
|
+
if column_filter:
|
|
304
|
+
column_info.columns = [
|
|
305
|
+
col for col in column_info.columns if col.column_name == column_filter
|
|
306
|
+
]
|
|
307
|
+
if not column_info.columns:
|
|
308
|
+
console.print(
|
|
309
|
+
f"No columns match the filter: {column_filter}", style="yellow"
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
# Output based on format selection
|
|
313
|
+
if format == OutputFormat.JSON:
|
|
314
|
+
output_json(meta_model, column_info, compression)
|
|
315
|
+
else: # Rich format
|
|
316
|
+
# Print the metadata
|
|
317
|
+
console.print(meta_model)
|
|
318
|
+
|
|
319
|
+
# Print column details if not metadata only
|
|
320
|
+
if not metadata_only:
|
|
321
|
+
print_column_info_table(column_info)
|
|
322
|
+
console.print(f"Compression codecs: {compression}")
|
|
179
323
|
|
|
180
324
|
|
|
181
325
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: iparq
|
|
3
|
+
Version: 0.2.5
|
|
4
|
+
Summary: Display version compression and bloom filter information about a parquet file
|
|
5
|
+
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Requires-Dist: pyarrow
|
|
9
|
+
Requires-Dist: pydantic
|
|
10
|
+
Requires-Dist: rich
|
|
11
|
+
Requires-Dist: typer[all]
|
|
12
|
+
Provides-Extra: checks
|
|
13
|
+
Requires-Dist: mypy>=1.14.1; extra == 'checks'
|
|
14
|
+
Requires-Dist: ruff>=0.9.3; extra == 'checks'
|
|
15
|
+
Provides-Extra: test
|
|
16
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# iparq
|
|
20
|
+
|
|
21
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-package.yml)
|
|
22
|
+
|
|
23
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/dependabot/dependabot-updates)
|
|
24
|
+
|
|
25
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
|
|
26
|
+
|
|
27
|
+

|
|
28
|
+
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there's no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
29
|
+
|
|
30
|
+
***New*** Bloom filters information: Displays if there are bloom filters.
|
|
31
|
+
Read more about bloom filters in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
### Zero installation - Recommended
|
|
36
|
+
|
|
37
|
+
1) Make sure to have Astral's UV installed by following the steps here:
|
|
38
|
+
|
|
39
|
+
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
40
|
+
|
|
41
|
+
2) Execute the following command:
|
|
42
|
+
|
|
43
|
+
```sh
|
|
44
|
+
uvx --refresh iparq inspect yourparquet.parquet
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Using pip
|
|
48
|
+
|
|
49
|
+
1) Install the package using pip:
|
|
50
|
+
|
|
51
|
+
```sh
|
|
52
|
+
pip install iparq
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
2) Verify the installation by running:
|
|
56
|
+
|
|
57
|
+
```sh
|
|
58
|
+
iparq --help
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Using uv
|
|
62
|
+
|
|
63
|
+
1) Make sure to have Astral's UV installed by following the steps here:
|
|
64
|
+
|
|
65
|
+
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
66
|
+
|
|
67
|
+
2) Execute the following command:
|
|
68
|
+
|
|
69
|
+
```sh
|
|
70
|
+
uv pip install iparq
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
3) Verify the installation by running:
|
|
74
|
+
|
|
75
|
+
```sh
|
|
76
|
+
iparq --help
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Using Homebrew in a MAC
|
|
80
|
+
|
|
81
|
+
1) Run the following:
|
|
82
|
+
|
|
83
|
+
```sh
|
|
84
|
+
brew tap MiguelElGallo/tap https://github.com/MiguelElGallo/homebrew-iparq.git
|
|
85
|
+
brew install MiguelElGallo/tap/iparq
|
|
86
|
+
iparq --help
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Usage
|
|
90
|
+
|
|
91
|
+
iparq now supports additional options:
|
|
92
|
+
|
|
93
|
+
```sh
|
|
94
|
+
iparq inspect <filename> [OPTIONS]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Options include:
|
|
98
|
+
|
|
99
|
+
- `--format`, `-f`: Output format, either `rich` (default) or `json`
|
|
100
|
+
- `--metadata-only`, `-m`: Show only file metadata without column details
|
|
101
|
+
- `--column`, `-c`: Filter results to show only a specific column
|
|
102
|
+
|
|
103
|
+
Examples:
|
|
104
|
+
|
|
105
|
+
```sh
|
|
106
|
+
# Output in JSON format
|
|
107
|
+
iparq inspect yourfile.parquet --format json
|
|
108
|
+
|
|
109
|
+
# Show only metadata
|
|
110
|
+
iparq inspect yourfile.parquet --metadata-only
|
|
111
|
+
|
|
112
|
+
# Filter to show only a specific column
|
|
113
|
+
iparq inspect yourfile.parquet --column column_name
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
|
|
117
|
+
|
|
118
|
+
## Example output - Bloom Filters
|
|
119
|
+
|
|
120
|
+
```log
|
|
121
|
+
ParquetMetaModel(
|
|
122
|
+
created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
|
|
123
|
+
num_columns=1,
|
|
124
|
+
num_rows=100000000,
|
|
125
|
+
num_row_groups=10,
|
|
126
|
+
format_version='1.0',
|
|
127
|
+
serialized_size=1196
|
|
128
|
+
)
|
|
129
|
+
Parquet Column Information
|
|
130
|
+
┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
|
|
131
|
+
┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom Filter ┃
|
|
132
|
+
┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
|
|
133
|
+
│ 0 │ r │ 0 │ SNAPPY │ ✅ │
|
|
134
|
+
│ 1 │ r │ 0 │ SNAPPY │ ✅ │
|
|
135
|
+
│ 2 │ r │ 0 │ SNAPPY │ ✅ │
|
|
136
|
+
│ 3 │ r │ 0 │ SNAPPY │ ✅ │
|
|
137
|
+
│ 4 │ r │ 0 │ SNAPPY │ ✅ │
|
|
138
|
+
│ 5 │ r │ 0 │ SNAPPY │ ✅ │
|
|
139
|
+
│ 6 │ r │ 0 │ SNAPPY │ ✅ │
|
|
140
|
+
│ 7 │ r │ 0 │ SNAPPY │ ✅ │
|
|
141
|
+
│ 8 │ r │ 0 │ SNAPPY │ ✅ │
|
|
142
|
+
│ 9 │ r │ 0 │ SNAPPY │ ✅ │
|
|
143
|
+
└───────────┴─────────────┴───────┴─────────────┴──────────────┘
|
|
144
|
+
Compression codecs: {'SNAPPY'}
|
|
145
|
+
```
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
+
iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
iparq/source.py,sha256=huC6I0hqwyv4BZ5xjI6FMZs9KH60xVHEKbmX6X8hhiA,10721
|
|
4
|
+
iparq-0.2.5.dist-info/METADATA,sha256=QpkD25vwzqlQo9e2JQKFx9VHdyV0So9u20TtZHkf4LY,5501
|
|
5
|
+
iparq-0.2.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
+
iparq-0.2.5.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
7
|
+
iparq-0.2.5.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
8
|
+
iparq-0.2.5.dist-info/RECORD,,
|
iparq-0.1.7.dist-info/METADATA
DELETED
|
@@ -1,216 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: iparq
|
|
3
|
-
Version: 0.1.7
|
|
4
|
-
Summary: Display version and compression information about a parquet file
|
|
5
|
-
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
|
-
License-File: LICENSE
|
|
7
|
-
Requires-Python: >=3.9
|
|
8
|
-
Requires-Dist: pyarrow>=19.0.0
|
|
9
|
-
Requires-Dist: pydantic>=2.10.6
|
|
10
|
-
Requires-Dist: typer>=0.15.1
|
|
11
|
-
Provides-Extra: checks
|
|
12
|
-
Requires-Dist: mypy>=1.14.1; extra == 'checks'
|
|
13
|
-
Requires-Dist: ruff>=0.9.3; extra == 'checks'
|
|
14
|
-
Provides-Extra: test
|
|
15
|
-
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
16
|
-
Description-Content-Type: text/markdown
|
|
17
|
-
|
|
18
|
-
# iparq
|
|
19
|
-
|
|
20
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-package.yml)
|
|
21
|
-
|
|
22
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/dependabot/dependabot-updates)
|
|
23
|
-
|
|
24
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
|
|
25
|
-
|
|
26
|
-

|
|
27
|
-
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
28
|
-
|
|
29
|
-
***New*** Bloom filters information: Displays if there are bloom filters.
|
|
30
|
-
Read more about bloom filters in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
## Installation
|
|
34
|
-
|
|
35
|
-
### Using pip
|
|
36
|
-
|
|
37
|
-
1) Install the package using pip:
|
|
38
|
-
|
|
39
|
-
```sh
|
|
40
|
-
pip install iparq
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
2) Verify the installation by running:
|
|
44
|
-
|
|
45
|
-
```sh
|
|
46
|
-
iparq --help
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
### Using uv
|
|
50
|
-
|
|
51
|
-
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
52
|
-
|
|
53
|
-
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
54
|
-
|
|
55
|
-
2) Execute the following command:
|
|
56
|
-
|
|
57
|
-
```sh
|
|
58
|
-
uv pip install iparq
|
|
59
|
-
```
|
|
60
|
-
|
|
61
|
-
3) Verify the installation by running:
|
|
62
|
-
|
|
63
|
-
```sh
|
|
64
|
-
iparq --help
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
### Using Homebrew in a MAC
|
|
68
|
-
|
|
69
|
-
1) Run the following:
|
|
70
|
-
|
|
71
|
-
```sh
|
|
72
|
-
brew tap MiguelElGallo/tap https://github.com/MiguelElGallo//homebrew-iparq.git
|
|
73
|
-
brew install MiguelElGallo/tap/iparq
|
|
74
|
-
iparq —help
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
## Usage
|
|
78
|
-
|
|
79
|
-
Run
|
|
80
|
-
|
|
81
|
-
```sh
|
|
82
|
-
iparq <filename>
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
|
|
86
|
-
|
|
87
|
-
## Example ouput - Bloom Filters
|
|
88
|
-
|
|
89
|
-
```log
|
|
90
|
-
ParquetMetaModel(
|
|
91
|
-
created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
|
|
92
|
-
num_columns=1,
|
|
93
|
-
num_rows=100000000,
|
|
94
|
-
num_row_groups=10,
|
|
95
|
-
format_version='1.0',
|
|
96
|
-
serialized_size=1196
|
|
97
|
-
)
|
|
98
|
-
Column Compression Info:
|
|
99
|
-
Row Group 0:
|
|
100
|
-
Column 'r' (Index 0): SNAPPY
|
|
101
|
-
Row Group 1:
|
|
102
|
-
Column 'r' (Index 0): SNAPPY
|
|
103
|
-
Row Group 2:
|
|
104
|
-
Column 'r' (Index 0): SNAPPY
|
|
105
|
-
Row Group 3:
|
|
106
|
-
Column 'r' (Index 0): SNAPPY
|
|
107
|
-
Row Group 4:
|
|
108
|
-
Column 'r' (Index 0): SNAPPY
|
|
109
|
-
Row Group 5:
|
|
110
|
-
Column 'r' (Index 0): SNAPPY
|
|
111
|
-
Row Group 6:
|
|
112
|
-
Column 'r' (Index 0): SNAPPY
|
|
113
|
-
Row Group 7:
|
|
114
|
-
Column 'r' (Index 0): SNAPPY
|
|
115
|
-
Row Group 8:
|
|
116
|
-
Column 'r' (Index 0): SNAPPY
|
|
117
|
-
Row Group 9:
|
|
118
|
-
Column 'r' (Index 0): SNAPPY
|
|
119
|
-
Bloom Filter Info:
|
|
120
|
-
Row Group 0:
|
|
121
|
-
Column 'r' (Index 0): Has bloom filter
|
|
122
|
-
Row Group 1:
|
|
123
|
-
Column 'r' (Index 0): Has bloom filter
|
|
124
|
-
Row Group 2:
|
|
125
|
-
Column 'r' (Index 0): Has bloom filter
|
|
126
|
-
Row Group 3:
|
|
127
|
-
Column 'r' (Index 0): Has bloom filter
|
|
128
|
-
Row Group 4:
|
|
129
|
-
Column 'r' (Index 0): Has bloom filter
|
|
130
|
-
Row Group 5:
|
|
131
|
-
Column 'r' (Index 0): Has bloom filter
|
|
132
|
-
Row Group 6:
|
|
133
|
-
Column 'r' (Index 0): Has bloom filter
|
|
134
|
-
Row Group 7:
|
|
135
|
-
Column 'r' (Index 0): Has bloom filter
|
|
136
|
-
Row Group 8:
|
|
137
|
-
Column 'r' (Index 0): Has bloom filter
|
|
138
|
-
Row Group 9:
|
|
139
|
-
Column 'r' (Index 0): Has bloom filter
|
|
140
|
-
Compression codecs: {'SNAPPY'}
|
|
141
|
-
```
|
|
142
|
-
|
|
143
|
-
## Example output
|
|
144
|
-
|
|
145
|
-
```log
|
|
146
|
-
ParquetMetaModel(
|
|
147
|
-
created_by='parquet-cpp-arrow version 14.0.2',
|
|
148
|
-
num_columns=19,
|
|
149
|
-
num_rows=2964624,
|
|
150
|
-
num_row_groups=3,
|
|
151
|
-
format_version='2.6',
|
|
152
|
-
serialized_size=6357
|
|
153
|
-
)
|
|
154
|
-
Column Compression Info:
|
|
155
|
-
Row Group 0:
|
|
156
|
-
Column 'VendorID' (Index 0): ZSTD
|
|
157
|
-
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
158
|
-
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
159
|
-
Column 'passenger_count' (Index 3): ZSTD
|
|
160
|
-
Column 'trip_distance' (Index 4): ZSTD
|
|
161
|
-
Column 'RatecodeID' (Index 5): ZSTD
|
|
162
|
-
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
163
|
-
Column 'PULocationID' (Index 7): ZSTD
|
|
164
|
-
Column 'DOLocationID' (Index 8): ZSTD
|
|
165
|
-
Column 'payment_type' (Index 9): ZSTD
|
|
166
|
-
Column 'fare_amount' (Index 10): ZSTD
|
|
167
|
-
Column 'extra' (Index 11): ZSTD
|
|
168
|
-
Column 'mta_tax' (Index 12): ZSTD
|
|
169
|
-
Column 'tip_amount' (Index 13): ZSTD
|
|
170
|
-
Column 'tolls_amount' (Index 14): ZSTD
|
|
171
|
-
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
172
|
-
Column 'total_amount' (Index 16): ZSTD
|
|
173
|
-
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
174
|
-
Column 'Airport_fee' (Index 18): ZSTD
|
|
175
|
-
Row Group 1:
|
|
176
|
-
Column 'VendorID' (Index 0): ZSTD
|
|
177
|
-
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
178
|
-
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
179
|
-
Column 'passenger_count' (Index 3): ZSTD
|
|
180
|
-
Column 'trip_distance' (Index 4): ZSTD
|
|
181
|
-
Column 'RatecodeID' (Index 5): ZSTD
|
|
182
|
-
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
183
|
-
Column 'PULocationID' (Index 7): ZSTD
|
|
184
|
-
Column 'DOLocationID' (Index 8): ZSTD
|
|
185
|
-
Column 'payment_type' (Index 9): ZSTD
|
|
186
|
-
Column 'fare_amount' (Index 10): ZSTD
|
|
187
|
-
Column 'extra' (Index 11): ZSTD
|
|
188
|
-
Column 'mta_tax' (Index 12): ZSTD
|
|
189
|
-
Column 'tip_amount' (Index 13): ZSTD
|
|
190
|
-
Column 'tolls_amount' (Index 14): ZSTD
|
|
191
|
-
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
192
|
-
Column 'total_amount' (Index 16): ZSTD
|
|
193
|
-
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
194
|
-
Column 'Airport_fee' (Index 18): ZSTD
|
|
195
|
-
Row Group 2:
|
|
196
|
-
Column 'VendorID' (Index 0): ZSTD
|
|
197
|
-
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
198
|
-
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
199
|
-
Column 'passenger_count' (Index 3): ZSTD
|
|
200
|
-
Column 'trip_distance' (Index 4): ZSTD
|
|
201
|
-
Column 'RatecodeID' (Index 5): ZSTD
|
|
202
|
-
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
203
|
-
Column 'PULocationID' (Index 7): ZSTD
|
|
204
|
-
Column 'DOLocationID' (Index 8): ZSTD
|
|
205
|
-
Column 'payment_type' (Index 9): ZSTD
|
|
206
|
-
Column 'fare_amount' (Index 10): ZSTD
|
|
207
|
-
Column 'extra' (Index 11): ZSTD
|
|
208
|
-
Column 'mta_tax' (Index 12): ZSTD
|
|
209
|
-
Column 'tip_amount' (Index 13): ZSTD
|
|
210
|
-
Column 'tolls_amount' (Index 14): ZSTD
|
|
211
|
-
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
212
|
-
Column 'total_amount' (Index 16): ZSTD
|
|
213
|
-
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
214
|
-
Column 'Airport_fee' (Index 18): ZSTD
|
|
215
|
-
Compression codecs: {'ZSTD'}
|
|
216
|
-
```
|
iparq-0.1.7.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
-
iparq/source.py,sha256=crKRTuZY6W2zEhFfAzb4XWopaVy9qnEkFqz4jbyGmeM,6439
|
|
3
|
-
iparq-0.1.7.dist-info/METADATA,sha256=ku4ZsLQ1Iq2ovPzKqv8aGqBGBkn3nTviW6hFzFsP6bw,6884
|
|
4
|
-
iparq-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
-
iparq-0.1.7.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
6
|
-
iparq-0.1.7.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
7
|
-
iparq-0.1.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|