iparq 0.1.4__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq-0.1.7/.github/copilot-instructions.md +12 -0
- {iparq-0.1.4 → iparq-0.1.7}/.gitignore +2 -0
- iparq-0.1.7/CONTRIBUTING.md +32 -0
- iparq-0.1.7/PKG-INFO +216 -0
- iparq-0.1.7/README.md +199 -0
- {iparq-0.1.4 → iparq-0.1.7}/pyproject.toml +1 -1
- {iparq-0.1.4 → iparq-0.1.7}/src/iparq/source.py +68 -3
- iparq-0.1.7/tests/test_cli.py +2 -0
- iparq-0.1.4/PKG-INFO +0 -96
- iparq-0.1.4/README.md +0 -79
- iparq-0.1.4/tests/test_cli.py +0 -46
- {iparq-0.1.4 → iparq-0.1.7}/.github/dependabot.yml +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/.github/workflows/merge.yml +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/.github/workflows/python-package.yml +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/.github/workflows/python-publish.yml +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/.python-version +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/.vscode/launch.json +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/.vscode/settings.json +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/LICENSE +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/dummy.parquet +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/media/iparq.png +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/src/iparq/__init__.py +0 -0
- {iparq-0.1.4 → iparq-0.1.7}/uv.lock +0 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Contributing to iparq
|
|
2
|
+
|
|
3
|
+
Thank you for considering contributing to iparq! We're excited to collaborate with you. Here are some guidelines to help you get started:
|
|
4
|
+
|
|
5
|
+
## How to Contribute
|
|
6
|
+
|
|
7
|
+
1. **Fork the repository**: Click the "Fork" button at the top right of this page to create a copy of the repository.
|
|
8
|
+
2. **Clone your fork**: Use `git clone <your-fork-url>` to clone your forked repository to your local machine.
|
|
9
|
+
3. **Create a branch**: Use `git checkout -b <branch-name>` to create a new branch for your changes.
|
|
10
|
+
4. **Make your changes**: Make the necessary changes in your local repository.
|
|
11
|
+
5. **Commit your changes**: Use `git commit -m "Description of changes"` to commit your changes.
|
|
12
|
+
6. **Push your changes**: Use `git push origin <branch-name>` to push your changes to your forked repository.
|
|
13
|
+
7. **Create a pull request**: Go to the original repository and create a pull request from your forked repository.
|
|
14
|
+
|
|
15
|
+
## Guidelines
|
|
16
|
+
|
|
17
|
+
- **Code of Conduct**: Please adhere to our [Code of Conduct](CODE_OF_CONDUCT.md) to ensure a welcoming and friendly environment.
|
|
18
|
+
- **Documentation**: Ensure your code changes are well-documented. Update any relevant documentation in the `docs` folder.
|
|
19
|
+
- **Tests**: Include tests for your changes to ensure functionality and avoid regressions.
|
|
20
|
+
- **Commit Messages**: Write clear and concise commit messages. Follow the format: `type(scope): message`.
|
|
21
|
+
|
|
22
|
+
## Reporting Issues
|
|
23
|
+
|
|
24
|
+
If you encounter any issues or bugs, please open an issue in the repository. Provide as much detail as possible, including steps to reproduce the issue and any relevant logs or screenshots.
|
|
25
|
+
|
|
26
|
+
## License
|
|
27
|
+
|
|
28
|
+
By contributing to this project, you agree that your contributions will be licensed under the [MIT License](LICENSE).
|
|
29
|
+
|
|
30
|
+
Thank you for your contributions and support!
|
|
31
|
+
|
|
32
|
+
Happy coding!
|
iparq-0.1.7/PKG-INFO
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: iparq
|
|
3
|
+
Version: 0.1.7
|
|
4
|
+
Summary: Display version and compression information about a parquet file
|
|
5
|
+
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Requires-Dist: pyarrow>=19.0.0
|
|
9
|
+
Requires-Dist: pydantic>=2.10.6
|
|
10
|
+
Requires-Dist: typer>=0.15.1
|
|
11
|
+
Provides-Extra: checks
|
|
12
|
+
Requires-Dist: mypy>=1.14.1; extra == 'checks'
|
|
13
|
+
Requires-Dist: ruff>=0.9.3; extra == 'checks'
|
|
14
|
+
Provides-Extra: test
|
|
15
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# iparq
|
|
19
|
+
|
|
20
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-package.yml)
|
|
21
|
+
|
|
22
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/dependabot/dependabot-updates)
|
|
23
|
+
|
|
24
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
|
|
25
|
+
|
|
26
|
+

|
|
27
|
+
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
28
|
+
|
|
29
|
+
***New*** Bloom filters information: Displays if there are bloom filters.
|
|
30
|
+
Read more about bloom filters in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
### Using pip
|
|
36
|
+
|
|
37
|
+
1) Install the package using pip:
|
|
38
|
+
|
|
39
|
+
```sh
|
|
40
|
+
pip install iparq
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
2) Verify the installation by running:
|
|
44
|
+
|
|
45
|
+
```sh
|
|
46
|
+
iparq --help
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Using uv
|
|
50
|
+
|
|
51
|
+
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
52
|
+
|
|
53
|
+
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
54
|
+
|
|
55
|
+
2) Execute the following command:
|
|
56
|
+
|
|
57
|
+
```sh
|
|
58
|
+
uv pip install iparq
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
3) Verify the installation by running:
|
|
62
|
+
|
|
63
|
+
```sh
|
|
64
|
+
iparq --help
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Using Homebrew on a Mac
|
|
68
|
+
|
|
69
|
+
1) Run the following:
|
|
70
|
+
|
|
71
|
+
```sh
|
|
72
|
+
brew tap MiguelElGallo/tap https://github.com/MiguelElGallo/homebrew-iparq.git
|
|
73
|
+
brew install MiguelElGallo/tap/iparq
|
|
74
|
+
iparq --help
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Usage
|
|
78
|
+
|
|
79
|
+
Run
|
|
80
|
+
|
|
81
|
+
```sh
|
|
82
|
+
iparq <filename>
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Replace `<filename>` with the path to your .parquet file. The utility reads the file's metadata and prints it, followed by the compression codec of every column in each row group, the bloom filter information, and the set of compression codecs used in the file.
|
|
86
|
+
|
|
87
|
+
## Example output - Bloom Filters
|
|
88
|
+
|
|
89
|
+
```log
|
|
90
|
+
ParquetMetaModel(
|
|
91
|
+
created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
|
|
92
|
+
num_columns=1,
|
|
93
|
+
num_rows=100000000,
|
|
94
|
+
num_row_groups=10,
|
|
95
|
+
format_version='1.0',
|
|
96
|
+
serialized_size=1196
|
|
97
|
+
)
|
|
98
|
+
Column Compression Info:
|
|
99
|
+
Row Group 0:
|
|
100
|
+
Column 'r' (Index 0): SNAPPY
|
|
101
|
+
Row Group 1:
|
|
102
|
+
Column 'r' (Index 0): SNAPPY
|
|
103
|
+
Row Group 2:
|
|
104
|
+
Column 'r' (Index 0): SNAPPY
|
|
105
|
+
Row Group 3:
|
|
106
|
+
Column 'r' (Index 0): SNAPPY
|
|
107
|
+
Row Group 4:
|
|
108
|
+
Column 'r' (Index 0): SNAPPY
|
|
109
|
+
Row Group 5:
|
|
110
|
+
Column 'r' (Index 0): SNAPPY
|
|
111
|
+
Row Group 6:
|
|
112
|
+
Column 'r' (Index 0): SNAPPY
|
|
113
|
+
Row Group 7:
|
|
114
|
+
Column 'r' (Index 0): SNAPPY
|
|
115
|
+
Row Group 8:
|
|
116
|
+
Column 'r' (Index 0): SNAPPY
|
|
117
|
+
Row Group 9:
|
|
118
|
+
Column 'r' (Index 0): SNAPPY
|
|
119
|
+
Bloom Filter Info:
|
|
120
|
+
Row Group 0:
|
|
121
|
+
Column 'r' (Index 0): Has bloom filter
|
|
122
|
+
Row Group 1:
|
|
123
|
+
Column 'r' (Index 0): Has bloom filter
|
|
124
|
+
Row Group 2:
|
|
125
|
+
Column 'r' (Index 0): Has bloom filter
|
|
126
|
+
Row Group 3:
|
|
127
|
+
Column 'r' (Index 0): Has bloom filter
|
|
128
|
+
Row Group 4:
|
|
129
|
+
Column 'r' (Index 0): Has bloom filter
|
|
130
|
+
Row Group 5:
|
|
131
|
+
Column 'r' (Index 0): Has bloom filter
|
|
132
|
+
Row Group 6:
|
|
133
|
+
Column 'r' (Index 0): Has bloom filter
|
|
134
|
+
Row Group 7:
|
|
135
|
+
Column 'r' (Index 0): Has bloom filter
|
|
136
|
+
Row Group 8:
|
|
137
|
+
Column 'r' (Index 0): Has bloom filter
|
|
138
|
+
Row Group 9:
|
|
139
|
+
Column 'r' (Index 0): Has bloom filter
|
|
140
|
+
Compression codecs: {'SNAPPY'}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Example output
|
|
144
|
+
|
|
145
|
+
```log
|
|
146
|
+
ParquetMetaModel(
|
|
147
|
+
created_by='parquet-cpp-arrow version 14.0.2',
|
|
148
|
+
num_columns=19,
|
|
149
|
+
num_rows=2964624,
|
|
150
|
+
num_row_groups=3,
|
|
151
|
+
format_version='2.6',
|
|
152
|
+
serialized_size=6357
|
|
153
|
+
)
|
|
154
|
+
Column Compression Info:
|
|
155
|
+
Row Group 0:
|
|
156
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
157
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
158
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
159
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
160
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
161
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
162
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
163
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
164
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
165
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
166
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
167
|
+
Column 'extra' (Index 11): ZSTD
|
|
168
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
169
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
170
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
171
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
172
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
173
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
174
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
175
|
+
Row Group 1:
|
|
176
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
177
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
178
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
179
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
180
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
181
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
182
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
183
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
184
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
185
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
186
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
187
|
+
Column 'extra' (Index 11): ZSTD
|
|
188
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
189
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
190
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
191
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
192
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
193
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
194
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
195
|
+
Row Group 2:
|
|
196
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
197
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
198
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
199
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
200
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
201
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
202
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
203
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
204
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
205
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
206
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
207
|
+
Column 'extra' (Index 11): ZSTD
|
|
208
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
209
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
210
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
211
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
212
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
213
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
214
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
215
|
+
Compression codecs: {'ZSTD'}
|
|
216
|
+
```
|
iparq-0.1.7/README.md
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# iparq
|
|
2
|
+
|
|
3
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-package.yml)
|
|
4
|
+
|
|
5
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/dependabot/dependabot-updates)
|
|
6
|
+
|
|
7
|
+
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
|
|
8
|
+
|
|
9
|
+

|
|
10
|
+
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
11
|
+
|
|
12
|
+
***New*** Bloom filters information: Displays if there are bloom filters.
|
|
13
|
+
Read more about bloom filters in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
### Using pip
|
|
19
|
+
|
|
20
|
+
1) Install the package using pip:
|
|
21
|
+
|
|
22
|
+
```sh
|
|
23
|
+
pip install iparq
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
2) Verify the installation by running:
|
|
27
|
+
|
|
28
|
+
```sh
|
|
29
|
+
iparq --help
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Using uv
|
|
33
|
+
|
|
34
|
+
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
35
|
+
|
|
36
|
+
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
37
|
+
|
|
38
|
+
2) Execute the following command:
|
|
39
|
+
|
|
40
|
+
```sh
|
|
41
|
+
uv pip install iparq
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
3) Verify the installation by running:
|
|
45
|
+
|
|
46
|
+
```sh
|
|
47
|
+
iparq --help
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Using Homebrew on a Mac
|
|
51
|
+
|
|
52
|
+
1) Run the following:
|
|
53
|
+
|
|
54
|
+
```sh
|
|
55
|
+
brew tap MiguelElGallo/tap https://github.com/MiguelElGallo/homebrew-iparq.git
|
|
56
|
+
brew install MiguelElGallo/tap/iparq
|
|
57
|
+
iparq --help
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Usage
|
|
61
|
+
|
|
62
|
+
Run
|
|
63
|
+
|
|
64
|
+
```sh
|
|
65
|
+
iparq <filename>
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Replace `<filename>` with the path to your .parquet file. The utility reads the file's metadata and prints it, followed by the compression codec of every column in each row group, the bloom filter information, and the set of compression codecs used in the file.
|
|
69
|
+
|
|
70
|
+
## Example output - Bloom Filters
|
|
71
|
+
|
|
72
|
+
```log
|
|
73
|
+
ParquetMetaModel(
|
|
74
|
+
created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
|
|
75
|
+
num_columns=1,
|
|
76
|
+
num_rows=100000000,
|
|
77
|
+
num_row_groups=10,
|
|
78
|
+
format_version='1.0',
|
|
79
|
+
serialized_size=1196
|
|
80
|
+
)
|
|
81
|
+
Column Compression Info:
|
|
82
|
+
Row Group 0:
|
|
83
|
+
Column 'r' (Index 0): SNAPPY
|
|
84
|
+
Row Group 1:
|
|
85
|
+
Column 'r' (Index 0): SNAPPY
|
|
86
|
+
Row Group 2:
|
|
87
|
+
Column 'r' (Index 0): SNAPPY
|
|
88
|
+
Row Group 3:
|
|
89
|
+
Column 'r' (Index 0): SNAPPY
|
|
90
|
+
Row Group 4:
|
|
91
|
+
Column 'r' (Index 0): SNAPPY
|
|
92
|
+
Row Group 5:
|
|
93
|
+
Column 'r' (Index 0): SNAPPY
|
|
94
|
+
Row Group 6:
|
|
95
|
+
Column 'r' (Index 0): SNAPPY
|
|
96
|
+
Row Group 7:
|
|
97
|
+
Column 'r' (Index 0): SNAPPY
|
|
98
|
+
Row Group 8:
|
|
99
|
+
Column 'r' (Index 0): SNAPPY
|
|
100
|
+
Row Group 9:
|
|
101
|
+
Column 'r' (Index 0): SNAPPY
|
|
102
|
+
Bloom Filter Info:
|
|
103
|
+
Row Group 0:
|
|
104
|
+
Column 'r' (Index 0): Has bloom filter
|
|
105
|
+
Row Group 1:
|
|
106
|
+
Column 'r' (Index 0): Has bloom filter
|
|
107
|
+
Row Group 2:
|
|
108
|
+
Column 'r' (Index 0): Has bloom filter
|
|
109
|
+
Row Group 3:
|
|
110
|
+
Column 'r' (Index 0): Has bloom filter
|
|
111
|
+
Row Group 4:
|
|
112
|
+
Column 'r' (Index 0): Has bloom filter
|
|
113
|
+
Row Group 5:
|
|
114
|
+
Column 'r' (Index 0): Has bloom filter
|
|
115
|
+
Row Group 6:
|
|
116
|
+
Column 'r' (Index 0): Has bloom filter
|
|
117
|
+
Row Group 7:
|
|
118
|
+
Column 'r' (Index 0): Has bloom filter
|
|
119
|
+
Row Group 8:
|
|
120
|
+
Column 'r' (Index 0): Has bloom filter
|
|
121
|
+
Row Group 9:
|
|
122
|
+
Column 'r' (Index 0): Has bloom filter
|
|
123
|
+
Compression codecs: {'SNAPPY'}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Example output
|
|
127
|
+
|
|
128
|
+
```log
|
|
129
|
+
ParquetMetaModel(
|
|
130
|
+
created_by='parquet-cpp-arrow version 14.0.2',
|
|
131
|
+
num_columns=19,
|
|
132
|
+
num_rows=2964624,
|
|
133
|
+
num_row_groups=3,
|
|
134
|
+
format_version='2.6',
|
|
135
|
+
serialized_size=6357
|
|
136
|
+
)
|
|
137
|
+
Column Compression Info:
|
|
138
|
+
Row Group 0:
|
|
139
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
140
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
141
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
142
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
143
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
144
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
145
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
146
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
147
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
148
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
149
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
150
|
+
Column 'extra' (Index 11): ZSTD
|
|
151
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
152
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
153
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
154
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
155
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
156
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
157
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
158
|
+
Row Group 1:
|
|
159
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
160
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
161
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
162
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
163
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
164
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
165
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
166
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
167
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
168
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
169
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
170
|
+
Column 'extra' (Index 11): ZSTD
|
|
171
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
172
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
173
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
174
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
175
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
176
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
177
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
178
|
+
Row Group 2:
|
|
179
|
+
Column 'VendorID' (Index 0): ZSTD
|
|
180
|
+
Column 'tpep_pickup_datetime' (Index 1): ZSTD
|
|
181
|
+
Column 'tpep_dropoff_datetime' (Index 2): ZSTD
|
|
182
|
+
Column 'passenger_count' (Index 3): ZSTD
|
|
183
|
+
Column 'trip_distance' (Index 4): ZSTD
|
|
184
|
+
Column 'RatecodeID' (Index 5): ZSTD
|
|
185
|
+
Column 'store_and_fwd_flag' (Index 6): ZSTD
|
|
186
|
+
Column 'PULocationID' (Index 7): ZSTD
|
|
187
|
+
Column 'DOLocationID' (Index 8): ZSTD
|
|
188
|
+
Column 'payment_type' (Index 9): ZSTD
|
|
189
|
+
Column 'fare_amount' (Index 10): ZSTD
|
|
190
|
+
Column 'extra' (Index 11): ZSTD
|
|
191
|
+
Column 'mta_tax' (Index 12): ZSTD
|
|
192
|
+
Column 'tip_amount' (Index 13): ZSTD
|
|
193
|
+
Column 'tolls_amount' (Index 14): ZSTD
|
|
194
|
+
Column 'improvement_surcharge' (Index 15): ZSTD
|
|
195
|
+
Column 'total_amount' (Index 16): ZSTD
|
|
196
|
+
Column 'congestion_surcharge' (Index 17): ZSTD
|
|
197
|
+
Column 'Airport_fee' (Index 18): ZSTD
|
|
198
|
+
Compression codecs: {'ZSTD'}
|
|
199
|
+
```
|
|
@@ -94,6 +94,71 @@ def print_parquet_metadata(parquet_metadata):
|
|
|
94
94
|
pass
|
|
95
95
|
|
|
96
96
|
|
|
97
|
+
def print_compression_types(parquet_metadata) -> None:
    """
    Print the compression codec of every column chunk, grouped by row group.

    Args:
        parquet_metadata: Metadata object exposing ``num_row_groups``,
            ``num_columns``, ``row_group(i)`` and ``schema.column(j).name``
            (presumably a ``pyarrow.parquet.FileMetaData`` -- confirm against
            the caller).

    Returns:
        None. Output is written through the module-level ``console``.

    Any exception raised while reading the metadata is caught and reported
    through ``console`` instead of propagating to the caller.
    """
    try:
        num_row_groups = parquet_metadata.num_row_groups
        num_columns = parquet_metadata.num_columns
        console.print("[bold underline]Column Compression Info:[/bold underline]")
        for i in range(num_row_groups):
            console.print(f"[bold]Row Group {i}:[/bold]")
            # Look the row group up once per group rather than once per column.
            row_group = parquet_metadata.row_group(i)
            for j in range(num_columns):
                compression = row_group.column(j).compression
                column_name = parquet_metadata.schema.column(j).name
                console.print(
                    f" Column '{column_name}' (Index {j}): [italic]{compression}[/italic]"
                )
    except Exception as e:
        # Best-effort display helper: report the problem and keep the CLI alive.
        console.print(
            f"Error while printing compression types: {e}",
            style="blink bold red underline on white",
        )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def print_bloom_filter_info(parquet_metadata) -> None:
    """
    Print, per row group, the columns that are flagged as having a bloom filter.

    A row group header is only printed for groups with at least one flagged
    column. If no column in any row group is flagged, a single
    "No bloom filters found" line is printed instead.

    Args:
        parquet_metadata: Metadata object exposing ``num_row_groups``,
            ``num_columns``, ``row_group(i)`` and ``schema.column(j).name``.

    Returns:
        None. Output is written through the module-level ``console``.

    Any exception raised while reading the metadata is caught and reported
    through ``console`` instead of propagating to the caller.
    """
    try:
        row_group_count = parquet_metadata.num_row_groups
        column_count = parquet_metadata.num_columns
        any_filter_found = False

        console.print("[bold underline]Bloom Filter Info:[/bold underline]")

        for group_idx in range(row_group_count):
            group_meta = parquet_metadata.row_group(group_idx)
            header_printed = False

            for col_idx in range(column_count):
                chunk = group_meta.column(col_idx)
                name = parquet_metadata.schema.column(col_idx).name

                # NOTE(review): the flag tested here is ``is_stats_set``; by its
                # name it reports column *statistics*, not bloom filters --
                # confirm it is a valid proxy before trusting this output.
                flagged = hasattr(chunk, "is_stats_set") and chunk.is_stats_set
                if not flagged:
                    continue

                any_filter_found = True
                if not header_printed:
                    # Emit the group header lazily, on the first flagged column.
                    console.print(f"[bold]Row Group {group_idx}:[/bold]")
                    header_printed = True
                console.print(
                    f" Column '{name}' (Index {col_idx}): [green]Has bloom filter[/green]"
                )

        if not any_filter_found:
            console.print(" [italic]No bloom filters found in any column[/italic]")

    except Exception as e:
        # Best-effort display helper: report the problem and keep the CLI alive.
        console.print(
            f"Error while printing bloom filter information: {e}",
            style="blink bold red underline on white",
        )
|
|
160
|
+
|
|
161
|
+
|
|
97
162
|
@app.command()
|
|
98
163
|
def main(filename: str):
|
|
99
164
|
"""
|
|
@@ -107,9 +172,9 @@ def main(filename: str):
|
|
|
107
172
|
"""
|
|
108
173
|
(parquet_metadata, compression) = read_parquet_metadata(filename)
|
|
109
174
|
|
|
110
|
-
print_parquet_metadata(
|
|
111
|
-
|
|
112
|
-
)
|
|
175
|
+
print_parquet_metadata(parquet_metadata)
|
|
176
|
+
print_compression_types(parquet_metadata)
|
|
177
|
+
print_bloom_filter_info(parquet_metadata)
|
|
113
178
|
print(f"Compression codecs: {compression}")
|
|
114
179
|
|
|
115
180
|
|
iparq-0.1.4/PKG-INFO
DELETED
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: iparq
|
|
3
|
-
Version: 0.1.4
|
|
4
|
-
Summary: Display version and compression information about a parquet file
|
|
5
|
-
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
|
-
License-File: LICENSE
|
|
7
|
-
Requires-Python: >=3.9
|
|
8
|
-
Requires-Dist: pyarrow>=19.0.0
|
|
9
|
-
Requires-Dist: pydantic>=2.10.6
|
|
10
|
-
Requires-Dist: typer>=0.15.1
|
|
11
|
-
Provides-Extra: checks
|
|
12
|
-
Requires-Dist: mypy>=1.14.1; extra == 'checks'
|
|
13
|
-
Requires-Dist: ruff>=0.9.3; extra == 'checks'
|
|
14
|
-
Provides-Extra: test
|
|
15
|
-
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
16
|
-
Description-Content-Type: text/markdown
|
|
17
|
-
|
|
18
|
-
# iparq
|
|
19
|
-
|
|
20
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-package.yml)
|
|
21
|
-
|
|
22
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/dependabot/dependabot-updates)
|
|
23
|
-
|
|
24
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
|
|
25
|
-
|
|
26
|
-

|
|
27
|
-
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
28
|
-
|
|
29
|
-
## Installation
|
|
30
|
-
|
|
31
|
-
### Using pip
|
|
32
|
-
|
|
33
|
-
1) Install the package using pip:
|
|
34
|
-
|
|
35
|
-
```sh
|
|
36
|
-
pip install iparq
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
2) Verify the installation by running:
|
|
40
|
-
|
|
41
|
-
```sh
|
|
42
|
-
iparq --help
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### Using uv
|
|
46
|
-
|
|
47
|
-
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
48
|
-
|
|
49
|
-
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
50
|
-
|
|
51
|
-
2) Execute the following command:
|
|
52
|
-
|
|
53
|
-
```sh
|
|
54
|
-
uv pip install iparq
|
|
55
|
-
```
|
|
56
|
-
|
|
57
|
-
3) Verify the installation by running:
|
|
58
|
-
|
|
59
|
-
```sh
|
|
60
|
-
iparq --help
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
### Using Homebrew in a MAC
|
|
64
|
-
|
|
65
|
-
1) Run the following:
|
|
66
|
-
|
|
67
|
-
```sh
|
|
68
|
-
brew tap MiguelElGallo/tap https://github.com/MiguelElGallo//homebrew-iparq.git
|
|
69
|
-
brew install MiguelElGallo/tap/iparq
|
|
70
|
-
iparq —help
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
## Usage
|
|
74
|
-
|
|
75
|
-
Run
|
|
76
|
-
|
|
77
|
-
```sh
|
|
78
|
-
iparq <filename>
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
## Example output
|
|
85
|
-
|
|
86
|
-
```log
|
|
87
|
-
ParquetMetaModel(
|
|
88
|
-
created_by='parquet-cpp-arrow version 14.0.2',
|
|
89
|
-
num_columns=3,
|
|
90
|
-
num_rows=3,
|
|
91
|
-
num_row_groups=1,
|
|
92
|
-
format_version='2.6',
|
|
93
|
-
serialized_size=2223
|
|
94
|
-
)
|
|
95
|
-
Compression codecs: {'SNAPPY'}
|
|
96
|
-
```
|
iparq-0.1.4/README.md
DELETED
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
# iparq
|
|
2
|
-
|
|
3
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-package.yml)
|
|
4
|
-
|
|
5
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/dependabot/dependabot-updates)
|
|
6
|
-
|
|
7
|
-
[](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
|
|
8
|
-
|
|
9
|
-

|
|
10
|
-
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
11
|
-
|
|
12
|
-
## Installation
|
|
13
|
-
|
|
14
|
-
### Using pip
|
|
15
|
-
|
|
16
|
-
1) Install the package using pip:
|
|
17
|
-
|
|
18
|
-
```sh
|
|
19
|
-
pip install iparq
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
2) Verify the installation by running:
|
|
23
|
-
|
|
24
|
-
```sh
|
|
25
|
-
iparq --help
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
### Using uv
|
|
29
|
-
|
|
30
|
-
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
31
|
-
|
|
32
|
-
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
33
|
-
|
|
34
|
-
2) Execute the following command:
|
|
35
|
-
|
|
36
|
-
```sh
|
|
37
|
-
uv pip install iparq
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
3) Verify the installation by running:
|
|
41
|
-
|
|
42
|
-
```sh
|
|
43
|
-
iparq --help
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
### Using Homebrew in a MAC
|
|
47
|
-
|
|
48
|
-
1) Run the following:
|
|
49
|
-
|
|
50
|
-
```sh
|
|
51
|
-
brew tap MiguelElGallo/tap https://github.com/MiguelElGallo//homebrew-iparq.git
|
|
52
|
-
brew install MiguelElGallo/tap/iparq
|
|
53
|
-
iparq —help
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
## Usage
|
|
57
|
-
|
|
58
|
-
Run
|
|
59
|
-
|
|
60
|
-
```sh
|
|
61
|
-
iparq <filename>
|
|
62
|
-
```
|
|
63
|
-
|
|
64
|
-
Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
## Example output
|
|
68
|
-
|
|
69
|
-
```log
|
|
70
|
-
ParquetMetaModel(
|
|
71
|
-
created_by='parquet-cpp-arrow version 14.0.2',
|
|
72
|
-
num_columns=3,
|
|
73
|
-
num_rows=3,
|
|
74
|
-
num_row_groups=1,
|
|
75
|
-
format_version='2.6',
|
|
76
|
-
serialized_size=2223
|
|
77
|
-
)
|
|
78
|
-
Compression codecs: {'SNAPPY'}
|
|
79
|
-
```
|
iparq-0.1.4/tests/test_cli.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import shutil
|
|
2
|
-
import subprocess
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
from pydantic import BaseModel
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class FileCopyConfig(BaseModel):
|
|
10
|
-
source: Path
|
|
11
|
-
destination: Path
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@pytest.fixture
|
|
15
|
-
def copy_file(tmp_path: Path) -> Path:
|
|
16
|
-
config = FileCopyConfig(
|
|
17
|
-
source=Path("../dummy.parquet"), destination=tmp_path / "dummy.parquet"
|
|
18
|
-
)
|
|
19
|
-
try:
|
|
20
|
-
shutil.copy(config.source, config.destination)
|
|
21
|
-
except FileNotFoundError:
|
|
22
|
-
print("Source file not found.")
|
|
23
|
-
finally:
|
|
24
|
-
print("Copy operation complete.")
|
|
25
|
-
return config.destination
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def test_empty():
|
|
29
|
-
assert True
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def test_dummy_parquet(copy_file: Path) -> None:
|
|
33
|
-
try:
|
|
34
|
-
result = subprocess.run(
|
|
35
|
-
["iparq", str(copy_file)],
|
|
36
|
-
capture_output=True,
|
|
37
|
-
text=True,
|
|
38
|
-
check=True,
|
|
39
|
-
)
|
|
40
|
-
data = result.stdout
|
|
41
|
-
assert "SNAPPY" in data
|
|
42
|
-
assert "2.6" in data
|
|
43
|
-
except subprocess.CalledProcessError as e:
|
|
44
|
-
print(f"Test failed with error: {e}")
|
|
45
|
-
finally:
|
|
46
|
-
print("Test execution complete.")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|