iparq 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq/source.py +48 -12
- iparq-0.1.5.dist-info/METADATA +156 -0
- iparq-0.1.5.dist-info/RECORD +7 -0
- iparq-0.1.3.dist-info/METADATA +0 -73
- iparq-0.1.3.dist-info/RECORD +0 -7
- {iparq-0.1.3.dist-info → iparq-0.1.5.dist-info}/WHEEL +0 -0
- {iparq-0.1.3.dist-info → iparq-0.1.5.dist-info}/entry_points.txt +0 -0
- {iparq-0.1.3.dist-info → iparq-0.1.5.dist-info}/licenses/LICENSE +0 -0
iparq/source.py
CHANGED
|
@@ -2,8 +2,10 @@ import pyarrow.parquet as pq
|
|
|
2
2
|
import typer
|
|
3
3
|
from pydantic import BaseModel
|
|
4
4
|
from rich import print
|
|
5
|
+
from rich.console import Console
|
|
5
6
|
|
|
6
7
|
app = typer.Typer()
|
|
8
|
+
console = Console()
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
class ParquetMetaModel(BaseModel):
|
|
@@ -39,12 +41,21 @@ def read_parquet_metadata(filename: str):
|
|
|
39
41
|
- parquet_metadata (pyarrow.parquet.FileMetaData): The metadata of the Parquet file.
|
|
40
42
|
- compression_codecs (set): A set of compression codecs used in the Parquet file.
|
|
41
43
|
"""
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
for
|
|
47
|
-
|
|
44
|
+
try:
|
|
45
|
+
compression_codecs = set([])
|
|
46
|
+
parquet_metadata = pq.ParquetFile(filename).metadata
|
|
47
|
+
|
|
48
|
+
for i in range(parquet_metadata.num_row_groups):
|
|
49
|
+
for j in range(parquet_metadata.num_columns):
|
|
50
|
+
compression_codecs.add(
|
|
51
|
+
parquet_metadata.row_group(i).column(j).compression
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
except FileNotFoundError:
|
|
55
|
+
console.print(
|
|
56
|
+
f"Cannot open: {filename}.", style="blink bold red underline on white"
|
|
57
|
+
)
|
|
58
|
+
exit(1)
|
|
48
59
|
|
|
49
60
|
return parquet_metadata, compression_codecs
|
|
50
61
|
|
|
@@ -75,10 +86,36 @@ def print_parquet_metadata(parquet_metadata):
|
|
|
75
86
|
format_version=str(parquet_metadata.format_version),
|
|
76
87
|
serialized_size=parquet_metadata.serialized_size,
|
|
77
88
|
)
|
|
78
|
-
print(meta)
|
|
89
|
+
console.print(meta)
|
|
79
90
|
|
|
80
91
|
except AttributeError as e:
|
|
81
|
-
print(f"Error: {e}")
|
|
92
|
+
console.print(f"Error: {e}", style="blink bold red underline on white")
|
|
93
|
+
finally:
|
|
94
|
+
pass
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def print_compression_types(parquet_metadata) -> None:
|
|
98
|
+
"""
|
|
99
|
+
Prints the compression type for each column in each row group of the Parquet file.
|
|
100
|
+
"""
|
|
101
|
+
try:
|
|
102
|
+
num_row_groups = parquet_metadata.num_row_groups
|
|
103
|
+
num_columns = parquet_metadata.num_columns
|
|
104
|
+
console.print("[bold underline]Column Compression Info:[/bold underline]")
|
|
105
|
+
for i in range(num_row_groups):
|
|
106
|
+
console.print(f"[bold]Row Group {i}:[/bold]")
|
|
107
|
+
for j in range(num_columns):
|
|
108
|
+
column_chunk = parquet_metadata.row_group(i).column(j)
|
|
109
|
+
compression = column_chunk.compression
|
|
110
|
+
column_name = parquet_metadata.schema.column(j).name
|
|
111
|
+
console.print(
|
|
112
|
+
f" Column '{column_name}' (Index {j}): [italic]{compression}[/italic]"
|
|
113
|
+
)
|
|
114
|
+
except Exception as e:
|
|
115
|
+
console.print(
|
|
116
|
+
f"Error while printing compression types: {e}",
|
|
117
|
+
style="blink bold red underline on white",
|
|
118
|
+
)
|
|
82
119
|
finally:
|
|
83
120
|
pass
|
|
84
121
|
|
|
@@ -92,13 +129,12 @@ def main(filename: str):
|
|
|
92
129
|
filename (str): The path to the Parquet file.
|
|
93
130
|
|
|
94
131
|
Returns:
|
|
95
|
-
|
|
132
|
+
Metadata of the Parquet file and the compression codecs used.
|
|
96
133
|
"""
|
|
97
134
|
(parquet_metadata, compression) = read_parquet_metadata(filename)
|
|
98
135
|
|
|
99
|
-
print_parquet_metadata(
|
|
100
|
-
|
|
101
|
-
)
|
|
136
|
+
print_parquet_metadata(parquet_metadata)
|
|
137
|
+
print_compression_types(parquet_metadata)
|
|
102
138
|
print(f"Compression codecs: {compression}")
|
|
103
139
|
|
|
104
140
|
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: iparq
|
|
3
|
+
Version: 0.1.5
|
|
4
|
+
Summary: Display version and compression information about a parquet file
|
|
5
|
+
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Requires-Dist: pyarrow>=19.0.0
|
|
9
|
+
Requires-Dist: pydantic>=2.10.6
|
|
10
|
+
Requires-Dist: typer>=0.15.1
|
|
11
|
+
Provides-Extra: checks
|
|
12
|
+
Requires-Dist: mypy>=1.14.1; extra == 'checks'
|
|
13
|
+
Requires-Dist: ruff>=0.9.3; extra == 'checks'
|
|
14
|
+
Provides-Extra: test
|
|
15
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# iparq
|
|
19
|
+
|
|
20
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-package.yml)
|
|
21
|
+
|
|
22
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/dependabot/dependabot-updates)
|
|
23
|
+
|
|
24
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
|
|
25
|
+
|
|
26
|
+

|
|
27
|
+
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
### Using pip
|
|
32
|
+
|
|
33
|
+
1) Install the package using pip:
|
|
34
|
+
|
|
35
|
+
```sh
|
|
36
|
+
pip install iparq
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
2) Verify the installation by running:
|
|
40
|
+
|
|
41
|
+
```sh
|
|
42
|
+
iparq --help
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Using uv
|
|
46
|
+
|
|
47
|
+
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
48
|
+
|
|
49
|
+
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
50
|
+
|
|
51
|
+
2) Execute the following command:
|
|
52
|
+
|
|
53
|
+
```sh
|
|
54
|
+
uv pip install iparq
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
3) Verify the installation by running:
|
|
58
|
+
|
|
59
|
+
```sh
|
|
60
|
+
iparq --help
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Using Homebrew on a Mac
|
|
64
|
+
|
|
65
|
+
1) Run the following:
|
|
66
|
+
|
|
67
|
+
```sh
|
|
68
|
+
brew tap MiguelElGallo/tap https://github.com/MiguelElGallo/homebrew-iparq.git
|
|
69
|
+
brew install MiguelElGallo/tap/iparq
|
|
70
|
+
iparq --help
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Usage
|
|
74
|
+
|
|
75
|
+
Run
|
|
76
|
+
|
|
77
|
+
```sh
|
|
78
|
+
iparq <filename>
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Replace `<filename>` with the path to your .parquet file. The utility will read the file's metadata and print a summary of it, followed by the compression type of each column in each row group and the set of compression codecs used in the file.
|
|
82
|
+
|
|
83
|
+
## Example output
|
|
84
|
+
|
|
85
|
+
```log
|
|
86
|
+
ParquetMetaModel(
|
|
87
|
+
created_by='parquet-cpp-arrow version 14.0.2',
|
|
88
|
+
num_columns=19,
|
|
89
|
+
num_rows=2964624,
|
|
90
|
+
num_row_groups=3,
|
|
91
|
+
format_version='2.6',
|
|
92
|
+
serialized_size=6357
|
|
93
|
+
)
|
|
94
|
+
Column Compression Info:
|
|
95
|
+
Row Group 0:
|
|
96
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
97
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
98
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
99
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
100
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
101
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
102
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
103
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
104
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
105
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
106
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
107
|
+
Column 'extra' (Index 11): ZSTD
|
|
108
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
109
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
110
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
111
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
112
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
113
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
114
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
115
|
+
Row Group 1:
|
|
116
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
117
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
118
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
119
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
120
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
121
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
122
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
123
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
124
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
125
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
126
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
127
|
+
Column 'extra' (Index 11): ZSTD
|
|
128
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
129
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
130
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
131
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
132
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
133
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
134
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
135
|
+
Row Group 2:
|
|
136
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
137
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
138
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
139
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
140
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
141
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
142
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
143
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
144
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
145
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
146
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
147
|
+
Column 'extra' (Index 11): ZSTD
|
|
148
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
149
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
150
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
151
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
152
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
153
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
154
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
155
|
+
Compression codecs: {'ZSTD'}
|
|
156
|
+
```
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
+
iparq/source.py,sha256=7ocTpA7j5C-oSyLkMPhDifpH3cPhqyK3LBu0CjjG83s,4851
|
|
3
|
+
iparq-0.1.5.dist-info/METADATA,sha256=7kLNc40ROyYot5N37GXjSErzpp5WPDWa9_Y2BQpmr6o,5387
|
|
4
|
+
iparq-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
+
iparq-0.1.5.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
6
|
+
iparq-0.1.5.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
7
|
+
iparq-0.1.5.dist-info/RECORD,,
|
iparq-0.1.3.dist-info/METADATA
DELETED
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: iparq
|
|
3
|
-
Version: 0.1.3
|
|
4
|
-
Summary: Display version and compression information about a parquet file
|
|
5
|
-
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
|
-
License-File: LICENSE
|
|
7
|
-
Requires-Python: >=3.9
|
|
8
|
-
Requires-Dist: pyarrow>=19.0.0
|
|
9
|
-
Requires-Dist: pydantic>=2.10.6
|
|
10
|
-
Requires-Dist: typer>=0.15.1
|
|
11
|
-
Description-Content-Type: text/markdown
|
|
12
|
-
|
|
13
|
-
# iparq
|
|
14
|
-
|
|
15
|
-
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create ipq (Information Parquet). My goal with ipq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
16
|
-
|
|
17
|
-
## Installation
|
|
18
|
-
|
|
19
|
-
### Using pip
|
|
20
|
-
|
|
21
|
-
1) Install the package using pip:
|
|
22
|
-
|
|
23
|
-
```sh
|
|
24
|
-
pip install iparq
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
2) Verify the installation by running:
|
|
28
|
-
|
|
29
|
-
```sh
|
|
30
|
-
iparq --help
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
### Using uv
|
|
34
|
-
|
|
35
|
-
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
36
|
-
|
|
37
|
-
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
38
|
-
|
|
39
|
-
2) Execute the following command:
|
|
40
|
-
|
|
41
|
-
```sh
|
|
42
|
-
uv pip install iparq
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
3) Verify the installation by running:
|
|
46
|
-
|
|
47
|
-
```sh
|
|
48
|
-
iparq --help
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
## Usage
|
|
52
|
-
|
|
53
|
-
Run
|
|
54
|
-
|
|
55
|
-
```sh
|
|
56
|
-
iparq <filename>
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
|
|
60
|
-
|
|
61
|
-
## Example output
|
|
62
|
-
|
|
63
|
-
```log
|
|
64
|
-
ParquetMetaModel(
|
|
65
|
-
created_by='parquet-cpp-arrow version 14.0.2',
|
|
66
|
-
num_columns=3,
|
|
67
|
-
num_rows=3,
|
|
68
|
-
num_row_groups=1,
|
|
69
|
-
format_version='2.6',
|
|
70
|
-
serialized_size=2223
|
|
71
|
-
)
|
|
72
|
-
Compression codecs: {'SNAPPY'}
|
|
73
|
-
```
|
iparq-0.1.3.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
-
iparq/source.py,sha256=I0K9HQ294PGhsBIS-op_ZNzKvG3J0rkrS5ESQAVPibM,3385
|
|
3
|
-
iparq-0.1.3.dist-info/METADATA,sha256=bSJ6uJO20vBvsw57pdaFxCj2762W8OAiINqnl8ABDmM,1929
|
|
4
|
-
iparq-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
-
iparq-0.1.3.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
6
|
-
iparq-0.1.3.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
7
|
-
iparq-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|