parquet-analyzer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parquet_analyzer-0.1.0/.gitignore +134 -0
- parquet_analyzer-0.1.0/LICENSE +21 -0
- parquet_analyzer-0.1.0/PKG-INFO +273 -0
- parquet_analyzer-0.1.0/README.md +223 -0
- parquet_analyzer-0.1.0/examples/example-segments.json +1481 -0
- parquet_analyzer-0.1.0/examples/example.json +354 -0
- parquet_analyzer-0.1.0/parquet.thrift +1311 -0
- parquet_analyzer-0.1.0/pyproject.toml +67 -0
- parquet_analyzer-0.1.0/src/parquet/__init__.py +1 -0
- parquet_analyzer-0.1.0/src/parquet/constants.py +14 -0
- parquet_analyzer-0.1.0/src/parquet/ttypes.py +5916 -0
- parquet_analyzer-0.1.0/src/parquet_analyzer/__init__.py +31 -0
- parquet_analyzer-0.1.0/src/parquet_analyzer/__main__.py +7 -0
- parquet_analyzer-0.1.0/src/parquet_analyzer/_core.py +747 -0
- parquet_analyzer-0.1.0/src/parquet_analyzer/cli.py +62 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
# Usually these files are written by a python script from a template
|
|
32
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
33
|
+
*.manifest
|
|
34
|
+
*.spec
|
|
35
|
+
|
|
36
|
+
# Installer logs
|
|
37
|
+
pip-log.txt
|
|
38
|
+
pip-delete-this-directory.txt
|
|
39
|
+
|
|
40
|
+
# Unit test / coverage reports
|
|
41
|
+
htmlcov/
|
|
42
|
+
.tox/
|
|
43
|
+
.nox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.cache
|
|
47
|
+
nosetests.xml
|
|
48
|
+
coverage.xml
|
|
49
|
+
*.cover
|
|
50
|
+
*.py,cover
|
|
51
|
+
.hypothesis/
|
|
52
|
+
.pytest_cache/
|
|
53
|
+
cover/
|
|
54
|
+
|
|
55
|
+
# Translations
|
|
56
|
+
*.mo
|
|
57
|
+
*.pot
|
|
58
|
+
|
|
59
|
+
# Django stuff:
|
|
60
|
+
*.log
|
|
61
|
+
local_settings.py
|
|
62
|
+
db.sqlite3
|
|
63
|
+
db.sqlite3-journal
|
|
64
|
+
|
|
65
|
+
# Flask stuff:
|
|
66
|
+
instance/
|
|
67
|
+
.webassets-cache
|
|
68
|
+
|
|
69
|
+
# Scrapy stuff:
|
|
70
|
+
.scrapy
|
|
71
|
+
|
|
72
|
+
# Sphinx documentation
|
|
73
|
+
docs/_build/
|
|
74
|
+
|
|
75
|
+
# PyBuilder
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
.python-version
|
|
87
|
+
|
|
88
|
+
# pipenv
|
|
89
|
+
# According to Python official guidance, this file should not be ignored.
|
|
90
|
+
# https://pipenv.pypa.io/en/latest/basics/#pipfile-lock
|
|
91
|
+
# Pipfile.lock
|
|
92
|
+
|
|
93
|
+
# PEP 582; __pypackages__ directory
|
|
94
|
+
__pypackages__/
|
|
95
|
+
|
|
96
|
+
# Celery stuff
|
|
97
|
+
celerybeat-schedule
|
|
98
|
+
celerybeat.pid
|
|
99
|
+
|
|
100
|
+
# SageMath files
|
|
101
|
+
*.sage.py
|
|
102
|
+
|
|
103
|
+
# Environments
|
|
104
|
+
.env
|
|
105
|
+
.venv
|
|
106
|
+
env/
|
|
107
|
+
venv/
|
|
108
|
+
ENV/
|
|
109
|
+
env.bak/
|
|
110
|
+
venv.bak/
|
|
111
|
+
|
|
112
|
+
# Spyder project settings
|
|
113
|
+
.spyderproject
|
|
114
|
+
.spyproject
|
|
115
|
+
|
|
116
|
+
# Rope project settings
|
|
117
|
+
.ropeproject
|
|
118
|
+
|
|
119
|
+
# mkdocs documentation
|
|
120
|
+
/site
|
|
121
|
+
|
|
122
|
+
# mypy
|
|
123
|
+
.mypy_cache/
|
|
124
|
+
.dmypy.json
|
|
125
|
+
dmypy.json
|
|
126
|
+
|
|
127
|
+
# Pyre type checker
|
|
128
|
+
.pyre/
|
|
129
|
+
|
|
130
|
+
# pytype static analyzer
|
|
131
|
+
.pytype/
|
|
132
|
+
|
|
133
|
+
# Cython debug symbols
|
|
134
|
+
cython_debug/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Chungmin Lee
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: parquet-analyzer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Inspect the on-disk layout and metadata of Parquet files.
|
|
5
|
+
Project-URL: Homepage, https://github.com/clee704/parquet-analyzer
|
|
6
|
+
Project-URL: Issues, https://github.com/clee704/parquet-analyzer/issues
|
|
7
|
+
Project-URL: Source, https://github.com/clee704/parquet-analyzer
|
|
8
|
+
Author: Chungmin Lee
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2025 Chungmin Lee
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Keywords: data-engineering,debugging,parquet,thrift
|
|
32
|
+
Classifier: Development Status :: 3 - Alpha
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
43
|
+
Classifier: Topic :: Software Development :: Debuggers
|
|
44
|
+
Classifier: Topic :: System :: Filesystems
|
|
45
|
+
Requires-Python: >=3.11
|
|
46
|
+
Requires-Dist: thrift>=0.16
|
|
47
|
+
Provides-Extra: dev
|
|
48
|
+
Requires-Dist: hatch; extra == 'dev'
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
|
|
51
|
+
# Parquet Analyzer
|
|
52
|
+
|
|
53
|
+
A Python tool for deep inspection and analysis of Apache Parquet files, providing detailed insights into file structure, metadata, and binary layout.
|
|
54
|
+
|
|
55
|
+
## Features
|
|
56
|
+
|
|
57
|
+
- **File Structure Analysis**: Parse and visualize the complete binary structure of Parquet files
|
|
58
|
+
- **Metadata Inspection**: Extract and display schema, row group, and column metadata
|
|
59
|
+
- **Page-Level Details**: Analyze data pages, dictionary pages, and their headers
|
|
60
|
+
- **Offset Tracking**: Show exact byte offsets and lengths of all file components
|
|
61
|
+
- **Statistics Summary**: Generate comprehensive file statistics and size breakdowns
|
|
62
|
+
- **Thrift Protocol Support**: Deep dive into Thrift-encoded metadata structures
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install parquet-analyzer
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
To work from a local clone instead, install in editable mode:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install -e .
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Requirements
|
|
77
|
+
|
|
78
|
+
- Python 3.11+
|
|
79
|
+
- thrift>=0.16 (installed automatically)
|
|
80
|
+
|
|
81
|
+
## Usage
|
|
82
|
+
|
|
83
|
+
### Basic Usage
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Analyze a Parquet file and get summary information
|
|
87
|
+
parquet-analyzer example.parquet
|
|
88
|
+
|
|
89
|
+
# Show detailed offset and Thrift structure information
|
|
90
|
+
parquet-analyzer -s example.parquet
|
|
91
|
+
|
|
92
|
+
# Enable debug logging
|
|
93
|
+
parquet-analyzer --log-level DEBUG example.parquet
|
|
94
|
+
|
|
95
|
+
# Run via python -m if the console script is unavailable
|
|
96
|
+
python -m parquet_analyzer example.parquet
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Command Line Options
|
|
100
|
+
|
|
101
|
+
- `parquet_file`: Path to the Parquet file to analyze (required)
|
|
102
|
+
- `-s, --show-offsets-and-thrift-details`: Show detailed byte offsets and Thrift structure information
|
|
103
|
+
- `--log-level LOG_LEVEL`: Set logging level (DEBUG, INFO, WARNING, ERROR)
|
|
104
|
+
|
|
105
|
+
## Output Formats
|
|
106
|
+
|
|
107
|
+
### Standard Output (Default)
|
|
108
|
+
|
|
109
|
+
The default output provides a structured JSON view with three main sections:
|
|
110
|
+
|
|
111
|
+
#### 1. Summary Statistics
|
|
112
|
+
```json
|
|
113
|
+
{
|
|
114
|
+
"summary": {
|
|
115
|
+
"num_rows": 10,
|
|
116
|
+
"num_row_groups": 1,
|
|
117
|
+
"num_columns": 2,
|
|
118
|
+
"num_pages": 2,
|
|
119
|
+
"num_data_pages": 2,
|
|
120
|
+
"num_v1_data_pages": 2,
|
|
121
|
+
"num_v2_data_pages": 0,
|
|
122
|
+
"num_dict_pages": 0,
|
|
123
|
+
"page_header_size": 47,
|
|
124
|
+
"uncompressed_page_data_size": 130,
|
|
125
|
+
"compressed_page_data_size": 96,
|
|
126
|
+
"uncompressed_page_size": 177,
|
|
127
|
+
"compressed_page_size": 143,
|
|
128
|
+
"column_index_size": 48,
|
|
129
|
+
"offset_index_size": 23,
|
|
130
|
+
"bloom_fitler_size": 0,
|
|
131
|
+
"footer_size": 527,
|
|
132
|
+
"file_size": 753
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
#### 2. Footer Metadata
|
|
138
|
+
Complete Parquet file metadata including:
|
|
139
|
+
- Schema definition with column types and repetition levels
|
|
140
|
+
- Row group information
|
|
141
|
+
- Column chunk metadata
|
|
142
|
+
- Encoding and compression details
|
|
143
|
+
|
|
144
|
+
#### 3. Page Information
|
|
145
|
+
Detailed breakdown of all pages organized by column and row group:
|
|
146
|
+
- Data pages with encoding and statistics
|
|
147
|
+
- Dictionary pages
|
|
148
|
+
- Column indexes
|
|
149
|
+
- Offset indexes
|
|
150
|
+
- Bloom filters
|
|
151
|
+
|
|
152
|
+
### Detailed Output (`-s` flag)
|
|
153
|
+
|
|
154
|
+
When using the `-s` flag, the tool outputs a detailed segment-by-segment breakdown of the file, giving the byte offset, length, name, and decoded value of each segment:
|
|
155
|
+
|
|
156
|
+
```json
|
|
157
|
+
[
|
|
158
|
+
{
|
|
159
|
+
"offset": 0,
|
|
160
|
+
"length": 4,
|
|
161
|
+
"name": "magic_number",
|
|
162
|
+
"value": "PAR1"
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
"offset": 4,
|
|
166
|
+
"length": 24,
|
|
167
|
+
"name": "page",
|
|
168
|
+
"value": [
|
|
169
|
+
{
|
|
170
|
+
"offset": 5,
|
|
171
|
+
"length": 1,
|
|
172
|
+
"name": "type",
|
|
173
|
+
"value": 0,
|
|
174
|
+
"metadata": {
|
|
175
|
+
"type": "i32",
|
|
176
|
+
"enum_type": "PageType",
|
|
177
|
+
"enum_name": "DATA_PAGE"
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
]
|
|
181
|
+
}
|
|
182
|
+
]
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
This mode is useful for:
|
|
186
|
+
- Debugging Parquet file corruption
|
|
187
|
+
- Understanding exact binary layout
|
|
188
|
+
- Analyzing file format compliance
|
|
189
|
+
- Optimizing file structure
|
|
190
|
+
|
|
191
|
+
## Understanding the Output
|
|
192
|
+
|
|
193
|
+
### File Structure Components
|
|
194
|
+
|
|
195
|
+
- **Magic Numbers**: 4-byte `PAR1` markers at the start and end of the file
|
|
196
|
+
- **Page Headers**: Thrift-encoded metadata for each data/dictionary page
|
|
197
|
+
- **Page Data**: Compressed/uncompressed column data
|
|
198
|
+
- **Column Indexes**: Statistics for data pages (optional)
|
|
199
|
+
- **Offset Indexes**: Byte offsets for data pages (optional)
|
|
200
|
+
- **Bloom Filters**: Bloom filter data for columns (optional)
|
|
201
|
+
- **Footer**: File metadata including schema and row group information
|
|
202
|
+
- **Footer Length**: 4-byte little-endian footer size
|
|
203
|
+
|
|
204
|
+
### Statistics Explained
|
|
205
|
+
|
|
206
|
+
- `num_rows`: Total number of rows across all row groups
|
|
207
|
+
- `num_row_groups`: Number of row groups in the file
|
|
208
|
+
- `num_columns`: Number of columns in the schema
|
|
209
|
+
- `num_pages`: Total pages (data + dictionary)
|
|
210
|
+
- `num_v1_data_pages`: Data pages using format v1
|
|
211
|
+
- `num_v2_data_pages`: Data pages using format v2
|
|
212
|
+
- `page_header_size`: Total bytes used by page headers
|
|
213
|
+
- `compressed_page_size`: Total size of all pages as stored in the file (page headers plus compressed page data)
|
|
214
|
+
- `uncompressed_page_size`: Total size of all pages after decompression (page headers plus uncompressed page data)
|
|
215
|
+
|
|
216
|
+
## Technical Details
|
|
217
|
+
|
|
218
|
+
### Architecture
|
|
219
|
+
|
|
220
|
+
The tool uses a custom Thrift protocol implementation (`OffsetRecordingProtocol`) that wraps the standard Thrift compact protocol to track byte offsets and lengths of all decoded structures. This enables precise mapping of logical Parquet structures to their binary representation.
|
|
221
|
+
|
|
222
|
+
### Key Components
|
|
223
|
+
|
|
224
|
+
- **OffsetRecordingProtocol**: Tracks byte positions during Thrift deserialization
|
|
225
|
+
- **TFileTransport**: File-based transport supporting seeking and offset tracking
|
|
226
|
+
- **Segment Creation**: Converts offset information into structured output
|
|
227
|
+
- **Gap Filling**: Identifies unknown or unaccounted byte ranges
|
|
228
|
+
|
|
229
|
+
### Supported Parquet Features
|
|
230
|
+
|
|
231
|
+
- All Parquet data types (primitive and logical)
|
|
232
|
+
- Compression codecs
|
|
233
|
+
- Encoding types
|
|
234
|
+
- Page formats (v1 and v2)
|
|
235
|
+
- Column indexes and offset indexes
|
|
236
|
+
- Bloom filters
|
|
237
|
+
- Nested schemas
|
|
238
|
+
|
|
239
|
+
## Use Cases
|
|
240
|
+
|
|
241
|
+
### Performance Analysis
|
|
242
|
+
- Identify compression efficiency across columns
|
|
243
|
+
- Analyze page sizes and distribution
|
|
244
|
+
- Understand storage overhead from metadata
|
|
245
|
+
|
|
246
|
+
### File Debugging
|
|
247
|
+
- Locate corrupted segments
|
|
248
|
+
- Verify file format compliance
|
|
249
|
+
- Analyze encoding choices
|
|
250
|
+
|
|
251
|
+
### Schema Evolution
|
|
252
|
+
- Compare file structures across versions
|
|
253
|
+
- Understand metadata changes
|
|
254
|
+
- Analyze backward compatibility
|
|
255
|
+
|
|
256
|
+
### Storage Optimization
|
|
257
|
+
- Identify opportunities for better compression
|
|
258
|
+
- Analyze row group sizing
|
|
259
|
+
- Optimize column ordering
|
|
260
|
+
|
|
261
|
+
## Contributing
|
|
262
|
+
|
|
263
|
+
Contributions are welcome! Please feel free to submit issues, feature requests, or pull requests.
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
Released under the [MIT License](LICENSE).
|
|
268
|
+
|
|
269
|
+
## Related Projects
|
|
270
|
+
|
|
271
|
+
- [Apache Parquet](https://parquet.apache.org/) - The Apache Parquet file format
|
|
272
|
+
- [fastparquet](https://github.com/dask/fastparquet) - A Python implementation of the Parquet format
|
|
273
|
+
- [parquet-tools](https://github.com/apache/parquet-mr/tree/master/parquet-tools) - Official Parquet command-line tools
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# Parquet Analyzer
|
|
2
|
+
|
|
3
|
+
A Python tool for deep inspection and analysis of Apache Parquet files, providing detailed insights into file structure, metadata, and binary layout.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **File Structure Analysis**: Parse and visualize the complete binary structure of Parquet files
|
|
8
|
+
- **Metadata Inspection**: Extract and display schema, row group, and column metadata
|
|
9
|
+
- **Page-Level Details**: Analyze data pages, dictionary pages, and their headers
|
|
10
|
+
- **Offset Tracking**: Show exact byte offsets and lengths of all file components
|
|
11
|
+
- **Statistics Summary**: Generate comprehensive file statistics and size breakdowns
|
|
12
|
+
- **Thrift Protocol Support**: Deep dive into Thrift-encoded metadata structures
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install parquet-analyzer
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
To work from a local clone instead, install in editable mode:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Requirements
|
|
27
|
+
|
|
28
|
+
- Python 3.11+
|
|
29
|
+
- thrift>=0.16 (installed automatically)
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
### Basic Usage
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Analyze a Parquet file and get summary information
|
|
37
|
+
parquet-analyzer example.parquet
|
|
38
|
+
|
|
39
|
+
# Show detailed offset and Thrift structure information
|
|
40
|
+
parquet-analyzer -s example.parquet
|
|
41
|
+
|
|
42
|
+
# Enable debug logging
|
|
43
|
+
parquet-analyzer --log-level DEBUG example.parquet
|
|
44
|
+
|
|
45
|
+
# Run via python -m if the console script is unavailable
|
|
46
|
+
python -m parquet_analyzer example.parquet
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Command Line Options
|
|
50
|
+
|
|
51
|
+
- `parquet_file`: Path to the Parquet file to analyze (required)
|
|
52
|
+
- `-s, --show-offsets-and-thrift-details`: Show detailed byte offsets and Thrift structure information
|
|
53
|
+
- `--log-level LOG_LEVEL`: Set logging level (DEBUG, INFO, WARNING, ERROR)
|
|
54
|
+
|
|
55
|
+
## Output Formats
|
|
56
|
+
|
|
57
|
+
### Standard Output (Default)
|
|
58
|
+
|
|
59
|
+
The default output provides a structured JSON view with three main sections:
|
|
60
|
+
|
|
61
|
+
#### 1. Summary Statistics
|
|
62
|
+
```json
|
|
63
|
+
{
|
|
64
|
+
"summary": {
|
|
65
|
+
"num_rows": 10,
|
|
66
|
+
"num_row_groups": 1,
|
|
67
|
+
"num_columns": 2,
|
|
68
|
+
"num_pages": 2,
|
|
69
|
+
"num_data_pages": 2,
|
|
70
|
+
"num_v1_data_pages": 2,
|
|
71
|
+
"num_v2_data_pages": 0,
|
|
72
|
+
"num_dict_pages": 0,
|
|
73
|
+
"page_header_size": 47,
|
|
74
|
+
"uncompressed_page_data_size": 130,
|
|
75
|
+
"compressed_page_data_size": 96,
|
|
76
|
+
"uncompressed_page_size": 177,
|
|
77
|
+
"compressed_page_size": 143,
|
|
78
|
+
"column_index_size": 48,
|
|
79
|
+
"offset_index_size": 23,
|
|
80
|
+
"bloom_fitler_size": 0,
|
|
81
|
+
"footer_size": 527,
|
|
82
|
+
"file_size": 753
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
#### 2. Footer Metadata
|
|
88
|
+
Complete Parquet file metadata including:
|
|
89
|
+
- Schema definition with column types and repetition levels
|
|
90
|
+
- Row group information
|
|
91
|
+
- Column chunk metadata
|
|
92
|
+
- Encoding and compression details
|
|
93
|
+
|
|
94
|
+
#### 3. Page Information
|
|
95
|
+
Detailed breakdown of all pages organized by column and row group:
|
|
96
|
+
- Data pages with encoding and statistics
|
|
97
|
+
- Dictionary pages
|
|
98
|
+
- Column indexes
|
|
99
|
+
- Offset indexes
|
|
100
|
+
- Bloom filters
|
|
101
|
+
|
|
102
|
+
### Detailed Output (`-s` flag)
|
|
103
|
+
|
|
104
|
+
When using the `-s` flag, the tool outputs a detailed segment-by-segment breakdown of the file, giving the byte offset, length, name, and decoded value of each segment:
|
|
105
|
+
|
|
106
|
+
```json
|
|
107
|
+
[
|
|
108
|
+
{
|
|
109
|
+
"offset": 0,
|
|
110
|
+
"length": 4,
|
|
111
|
+
"name": "magic_number",
|
|
112
|
+
"value": "PAR1"
|
|
113
|
+
},
|
|
114
|
+
{
|
|
115
|
+
"offset": 4,
|
|
116
|
+
"length": 24,
|
|
117
|
+
"name": "page",
|
|
118
|
+
"value": [
|
|
119
|
+
{
|
|
120
|
+
"offset": 5,
|
|
121
|
+
"length": 1,
|
|
122
|
+
"name": "type",
|
|
123
|
+
"value": 0,
|
|
124
|
+
"metadata": {
|
|
125
|
+
"type": "i32",
|
|
126
|
+
"enum_type": "PageType",
|
|
127
|
+
"enum_name": "DATA_PAGE"
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
]
|
|
131
|
+
}
|
|
132
|
+
]
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
This mode is useful for:
|
|
136
|
+
- Debugging Parquet file corruption
|
|
137
|
+
- Understanding exact binary layout
|
|
138
|
+
- Analyzing file format compliance
|
|
139
|
+
- Optimizing file structure
|
|
140
|
+
|
|
141
|
+
## Understanding the Output
|
|
142
|
+
|
|
143
|
+
### File Structure Components
|
|
144
|
+
|
|
145
|
+
- **Magic Numbers**: 4-byte `PAR1` markers at the start and end of the file
|
|
146
|
+
- **Page Headers**: Thrift-encoded metadata for each data/dictionary page
|
|
147
|
+
- **Page Data**: Compressed/uncompressed column data
|
|
148
|
+
- **Column Indexes**: Statistics for data pages (optional)
|
|
149
|
+
- **Offset Indexes**: Byte offsets for data pages (optional)
|
|
150
|
+
- **Bloom Filters**: Bloom filter data for columns (optional)
|
|
151
|
+
- **Footer**: File metadata including schema and row group information
|
|
152
|
+
- **Footer Length**: 4-byte little-endian footer size
|
|
153
|
+
|
|
154
|
+
### Statistics Explained
|
|
155
|
+
|
|
156
|
+
- `num_rows`: Total number of rows across all row groups
|
|
157
|
+
- `num_row_groups`: Number of row groups in the file
|
|
158
|
+
- `num_columns`: Number of columns in the schema
|
|
159
|
+
- `num_pages`: Total pages (data + dictionary)
|
|
160
|
+
- `num_v1_data_pages`: Data pages using format v1
|
|
161
|
+
- `num_v2_data_pages`: Data pages using format v2
|
|
162
|
+
- `page_header_size`: Total bytes used by page headers
|
|
163
|
+
- `compressed_page_size`: Total size of all pages as stored in the file (page headers plus compressed page data)
|
|
164
|
+
- `uncompressed_page_size`: Total size of all pages after decompression (page headers plus uncompressed page data)
|
|
165
|
+
|
|
166
|
+
## Technical Details
|
|
167
|
+
|
|
168
|
+
### Architecture
|
|
169
|
+
|
|
170
|
+
The tool uses a custom Thrift protocol implementation (`OffsetRecordingProtocol`) that wraps the standard Thrift compact protocol to track byte offsets and lengths of all decoded structures. This enables precise mapping of logical Parquet structures to their binary representation.
|
|
171
|
+
|
|
172
|
+
### Key Components
|
|
173
|
+
|
|
174
|
+
- **OffsetRecordingProtocol**: Tracks byte positions during Thrift deserialization
|
|
175
|
+
- **TFileTransport**: File-based transport supporting seeking and offset tracking
|
|
176
|
+
- **Segment Creation**: Converts offset information into structured output
|
|
177
|
+
- **Gap Filling**: Identifies unknown or unaccounted byte ranges
|
|
178
|
+
|
|
179
|
+
### Supported Parquet Features
|
|
180
|
+
|
|
181
|
+
- All Parquet data types (primitive and logical)
|
|
182
|
+
- Compression codecs
|
|
183
|
+
- Encoding types
|
|
184
|
+
- Page formats (v1 and v2)
|
|
185
|
+
- Column indexes and offset indexes
|
|
186
|
+
- Bloom filters
|
|
187
|
+
- Nested schemas
|
|
188
|
+
|
|
189
|
+
## Use Cases
|
|
190
|
+
|
|
191
|
+
### Performance Analysis
|
|
192
|
+
- Identify compression efficiency across columns
|
|
193
|
+
- Analyze page sizes and distribution
|
|
194
|
+
- Understand storage overhead from metadata
|
|
195
|
+
|
|
196
|
+
### File Debugging
|
|
197
|
+
- Locate corrupted segments
|
|
198
|
+
- Verify file format compliance
|
|
199
|
+
- Analyze encoding choices
|
|
200
|
+
|
|
201
|
+
### Schema Evolution
|
|
202
|
+
- Compare file structures across versions
|
|
203
|
+
- Understand metadata changes
|
|
204
|
+
- Analyze backward compatibility
|
|
205
|
+
|
|
206
|
+
### Storage Optimization
|
|
207
|
+
- Identify opportunities for better compression
|
|
208
|
+
- Analyze row group sizing
|
|
209
|
+
- Optimize column ordering
|
|
210
|
+
|
|
211
|
+
## Contributing
|
|
212
|
+
|
|
213
|
+
Contributions are welcome! Please feel free to submit issues, feature requests, or pull requests.
|
|
214
|
+
|
|
215
|
+
## License
|
|
216
|
+
|
|
217
|
+
Released under the [MIT License](LICENSE).
|
|
218
|
+
|
|
219
|
+
## Related Projects
|
|
220
|
+
|
|
221
|
+
- [Apache Parquet](https://parquet.apache.org/) - The Apache Parquet file format
|
|
222
|
+
- [fastparquet](https://github.com/dask/fastparquet) - A Python implementation of the Parquet format
|
|
223
|
+
- [parquet-tools](https://github.com/apache/parquet-mr/tree/master/parquet-tools) - Official Parquet command-line tools
|