iparq 0.4.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {iparq-0.4.1 → iparq-0.5.0}/PKG-INFO +37 -14
- {iparq-0.4.1 → iparq-0.5.0}/README.md +36 -13
- {iparq-0.4.1 → iparq-0.5.0}/pyproject.toml +1 -1
- {iparq-0.4.1 → iparq-0.5.0}/src/iparq/source.py +121 -7
- {iparq-0.4.1 → iparq-0.5.0}/tests/test_cli.py +194 -6
- iparq-0.5.0/uv.lock +1042 -0
- iparq-0.4.1/uv.lock +0 -923
- {iparq-0.4.1 → iparq-0.5.0}/.github/FUNDING.yml +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.github/copilot-instructions.md +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.github/dependabot.yml +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.github/workflows/copilot-setup-steps.yml +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.github/workflows/merge.yml +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.github/workflows/python-package.yml +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.github/workflows/python-publish.yml +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.github/workflows/test.yml +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.gitignore +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.python-version +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.vscode/launch.json +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/.vscode/settings.json +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/CONTRIBUTING.md +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/LICENSE +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/dummy.parquet +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/media/iparq.png +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/src/iparq/__init__.py +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/src/iparq/py.typed +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/tests/conftest.py +0 -0
- {iparq-0.4.1 → iparq-0.5.0}/tests/dummy.parquet +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: iparq
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Display version compression and bloom filter information about a parquet file
|
|
5
5
|
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -30,8 +30,12 @@ Description-Content-Type: text/markdown
|
|
|
30
30
|

|
|
31
31
|
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there's no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
32
32
|
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- **Bloom filters**: Displays if columns have bloom filters. Read more in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
36
|
+
- **Encryption detection**: Shows if columns are encrypted (🔒)
|
|
37
|
+
- **Statistics exactness**: Indicates if min/max statistics are exact or approximate (PyArrow 22+)
|
|
38
|
+
- **Compression ratios**: Optional display of column sizes and compression efficiency
|
|
35
39
|
|
|
36
40
|
## Installation
|
|
37
41
|
|
|
@@ -102,11 +106,12 @@ Options include:
|
|
|
102
106
|
- `--format`, `-f`: Output format, either `rich` (default) or `json`
|
|
103
107
|
- `--metadata-only`, `-m`: Show only file metadata without column details
|
|
104
108
|
- `--column`, `-c`: Filter results to show only a specific column
|
|
109
|
+
- `--sizes`, `-s`: Show column sizes and compression ratios
|
|
105
110
|
|
|
106
111
|
### Single File Examples:
|
|
107
112
|
|
|
108
113
|
```sh
|
|
109
|
-
# Basic inspection
|
|
114
|
+
# Basic inspection .
|
|
110
115
|
iparq inspect yourfile.parquet
|
|
111
116
|
|
|
112
117
|
# Output in JSON format
|
|
@@ -117,6 +122,9 @@ iparq inspect yourfile.parquet --metadata-only
|
|
|
117
122
|
|
|
118
123
|
# Filter to show only a specific column
|
|
119
124
|
iparq inspect yourfile.parquet --column column_name
|
|
125
|
+
|
|
126
|
+
# Show column sizes and compression ratios
|
|
127
|
+
iparq inspect yourfile.parquet --sizes
|
|
120
128
|
```
|
|
121
129
|
|
|
122
130
|
### Multiple Files and Glob Patterns:
|
|
@@ -137,7 +145,7 @@ iparq inspect important.parquet temp_*.parquet
|
|
|
137
145
|
|
|
138
146
|
When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
|
|
139
147
|
|
|
140
|
-
## Example output
|
|
148
|
+
## Example output
|
|
141
149
|
|
|
142
150
|
```log
|
|
143
151
|
ParquetMetaModel(
|
|
@@ -148,14 +156,29 @@ ParquetMetaModel(
|
|
|
148
156
|
format_version='2.6',
|
|
149
157
|
serialized_size=2223
|
|
150
158
|
)
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
┃
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
│ 0 │
|
|
157
|
-
│ 0 │
|
|
158
|
-
|
|
159
|
-
└───────────┴────────────┴───────┴───────────┴────────────┴───────────┴───────────┘
|
|
159
|
+
Parquet Column Information
|
|
160
|
+
┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
|
|
161
|
+
┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom ┃ Encrypted ┃ Min Value ┃ Max Value ┃ Exact ┃
|
|
162
|
+
┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
|
|
163
|
+
│ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │
|
|
164
|
+
│ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │
|
|
165
|
+
│ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │
|
|
166
|
+
└───────────┴─────────────┴───────┴─────────────┴───────┴───────────┴───────────┴───────────┴───────┘
|
|
160
167
|
Compression codecs: {'SNAPPY'}
|
|
161
168
|
```
|
|
169
|
+
|
|
170
|
+
### With `--sizes` flag
|
|
171
|
+
|
|
172
|
+
```log
|
|
173
|
+
iparq inspect yourfile.parquet --sizes
|
|
174
|
+
|
|
175
|
+
Parquet Column Information
|
|
176
|
+
┏━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
|
|
177
|
+
┃ Row ┃ Column ┃ ┃ ┃ ┃ ┃ Min ┃ Max ┃ ┃ ┃ ┃ ┃
|
|
178
|
+
┃ Group ┃ Name ┃ Index ┃ Compr… ┃ Bloom ┃ Encryp… ┃ Value ┃ Value ┃ Exact ┃ Values ┃ Compr… ┃ Ratio ┃
|
|
179
|
+
┡━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
|
|
180
|
+
│ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │ 3 │ 104.0B │ 1.0x │
|
|
181
|
+
│ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │ 3 │ 80.0B │ 0.9x │
|
|
182
|
+
│ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │ 3 │ 42.0B │ 1.0x │
|
|
183
|
+
└────────┴─────────┴───────┴────────┴───────┴─────────┴────────┴─────────┴───────┴────────┴────────┴───────┘
|
|
184
|
+
```
|
|
@@ -11,8 +11,12 @@
|
|
|
11
11
|

|
|
12
12
|
After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there's no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
- **Bloom filters**: Displays if columns have bloom filters. Read more in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
|
|
17
|
+
- **Encryption detection**: Shows if columns are encrypted (🔒)
|
|
18
|
+
- **Statistics exactness**: Indicates if min/max statistics are exact or approximate (PyArrow 22+)
|
|
19
|
+
- **Compression ratios**: Optional display of column sizes and compression efficiency
|
|
16
20
|
|
|
17
21
|
## Installation
|
|
18
22
|
|
|
@@ -83,11 +87,12 @@ Options include:
|
|
|
83
87
|
- `--format`, `-f`: Output format, either `rich` (default) or `json`
|
|
84
88
|
- `--metadata-only`, `-m`: Show only file metadata without column details
|
|
85
89
|
- `--column`, `-c`: Filter results to show only a specific column
|
|
90
|
+
- `--sizes`, `-s`: Show column sizes and compression ratios
|
|
86
91
|
|
|
87
92
|
### Single File Examples:
|
|
88
93
|
|
|
89
94
|
```sh
|
|
90
|
-
# Basic inspection
|
|
95
|
+
# Basic inspection .
|
|
91
96
|
iparq inspect yourfile.parquet
|
|
92
97
|
|
|
93
98
|
# Output in JSON format
|
|
@@ -98,6 +103,9 @@ iparq inspect yourfile.parquet --metadata-only
|
|
|
98
103
|
|
|
99
104
|
# Filter to show only a specific column
|
|
100
105
|
iparq inspect yourfile.parquet --column column_name
|
|
106
|
+
|
|
107
|
+
# Show column sizes and compression ratios
|
|
108
|
+
iparq inspect yourfile.parquet --sizes
|
|
101
109
|
```
|
|
102
110
|
|
|
103
111
|
### Multiple Files and Glob Patterns:
|
|
@@ -118,7 +126,7 @@ iparq inspect important.parquet temp_*.parquet
|
|
|
118
126
|
|
|
119
127
|
When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
|
|
120
128
|
|
|
121
|
-
## Example output
|
|
129
|
+
## Example output
|
|
122
130
|
|
|
123
131
|
```log
|
|
124
132
|
ParquetMetaModel(
|
|
@@ -129,14 +137,29 @@ ParquetMetaModel(
|
|
|
129
137
|
format_version='2.6',
|
|
130
138
|
serialized_size=2223
|
|
131
139
|
)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
┃
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
│ 0 │
|
|
138
|
-
│ 0 │
|
|
139
|
-
|
|
140
|
-
└───────────┴────────────┴───────┴───────────┴────────────┴───────────┴───────────┘
|
|
140
|
+
Parquet Column Information
|
|
141
|
+
┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
|
|
142
|
+
┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom ┃ Encrypted ┃ Min Value ┃ Max Value ┃ Exact ┃
|
|
143
|
+
┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
|
|
144
|
+
│ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │
|
|
145
|
+
│ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │
|
|
146
|
+
│ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │
|
|
147
|
+
└───────────┴─────────────┴───────┴─────────────┴───────┴───────────┴───────────┴───────────┴───────┘
|
|
141
148
|
Compression codecs: {'SNAPPY'}
|
|
142
149
|
```
|
|
150
|
+
|
|
151
|
+
### With `--sizes` flag
|
|
152
|
+
|
|
153
|
+
```log
|
|
154
|
+
iparq inspect yourfile.parquet --sizes
|
|
155
|
+
|
|
156
|
+
Parquet Column Information
|
|
157
|
+
┏━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
|
|
158
|
+
┃ Row ┃ Column ┃ ┃ ┃ ┃ ┃ Min ┃ Max ┃ ┃ ┃ ┃ ┃
|
|
159
|
+
┃ Group ┃ Name ┃ Index ┃ Compr… ┃ Bloom ┃ Encryp… ┃ Value ┃ Value ┃ Exact ┃ Values ┃ Compr… ┃ Ratio ┃
|
|
160
|
+
┡━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
|
|
161
|
+
│ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │ 3 │ 104.0B │ 1.0x │
|
|
162
|
+
│ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │ 3 │ 80.0B │ 0.9x │
|
|
163
|
+
│ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │ 3 │ 42.0B │ 1.0x │
|
|
164
|
+
└────────┴─────────┴───────┴────────┴───────┴─────────┴────────┴─────────┴───────┴────────┴────────┴───────┘
|
|
165
|
+
```
|
|
@@ -57,6 +57,12 @@ class ColumnInfo(BaseModel):
|
|
|
57
57
|
has_min_max (bool): Whether min/max statistics are available.
|
|
58
58
|
min_value (Optional[str]): The minimum value in the column (as string for display).
|
|
59
59
|
max_value (Optional[str]): The maximum value in the column (as string for display).
|
|
60
|
+
is_min_exact (Optional[bool]): Whether the min value is exact (PyArrow 22+).
|
|
61
|
+
is_max_exact (Optional[bool]): Whether the max value is exact (PyArrow 22+).
|
|
62
|
+
is_encrypted (Optional[bool]): Whether the column is encrypted.
|
|
63
|
+
num_values (Optional[int]): Number of values in this column chunk.
|
|
64
|
+
total_compressed_size (Optional[int]): Total compressed size in bytes.
|
|
65
|
+
total_uncompressed_size (Optional[int]): Total uncompressed size in bytes.
|
|
60
66
|
"""
|
|
61
67
|
|
|
62
68
|
row_group: int
|
|
@@ -67,6 +73,12 @@ class ColumnInfo(BaseModel):
|
|
|
67
73
|
has_min_max: Optional[bool] = False
|
|
68
74
|
min_value: Optional[str] = None
|
|
69
75
|
max_value: Optional[str] = None
|
|
76
|
+
is_min_exact: Optional[bool] = None
|
|
77
|
+
is_max_exact: Optional[bool] = None
|
|
78
|
+
is_encrypted: Optional[bool] = None
|
|
79
|
+
num_values: Optional[int] = None
|
|
80
|
+
total_compressed_size: Optional[int] = None
|
|
81
|
+
total_uncompressed_size: Optional[int] = None
|
|
70
82
|
|
|
71
83
|
|
|
72
84
|
class ParquetColumnInfo(BaseModel):
|
|
@@ -158,6 +170,28 @@ def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) ->
|
|
|
158
170
|
compression = column_chunk.compression
|
|
159
171
|
column_name = parquet_metadata.schema.names[j]
|
|
160
172
|
|
|
173
|
+
# Get additional column chunk metadata
|
|
174
|
+
num_values = (
|
|
175
|
+
column_chunk.num_values
|
|
176
|
+
if hasattr(column_chunk, "num_values")
|
|
177
|
+
else None
|
|
178
|
+
)
|
|
179
|
+
total_compressed = (
|
|
180
|
+
column_chunk.total_compressed_size
|
|
181
|
+
if hasattr(column_chunk, "total_compressed_size")
|
|
182
|
+
else None
|
|
183
|
+
)
|
|
184
|
+
total_uncompressed = (
|
|
185
|
+
column_chunk.total_uncompressed_size
|
|
186
|
+
if hasattr(column_chunk, "total_uncompressed_size")
|
|
187
|
+
else None
|
|
188
|
+
)
|
|
189
|
+
is_encrypted = (
|
|
190
|
+
column_chunk.is_crypto_metadata_set()
|
|
191
|
+
if hasattr(column_chunk, "is_crypto_metadata_set")
|
|
192
|
+
else None
|
|
193
|
+
)
|
|
194
|
+
|
|
161
195
|
# Create or update column info
|
|
162
196
|
column_info.columns.append(
|
|
163
197
|
ColumnInfo(
|
|
@@ -165,6 +199,10 @@ def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) ->
|
|
|
165
199
|
column_name=column_name,
|
|
166
200
|
column_index=j,
|
|
167
201
|
compression_type=compression,
|
|
202
|
+
num_values=num_values,
|
|
203
|
+
total_compressed_size=total_compressed,
|
|
204
|
+
total_uncompressed_size=total_uncompressed,
|
|
205
|
+
is_encrypted=is_encrypted,
|
|
168
206
|
)
|
|
169
207
|
)
|
|
170
208
|
except Exception as e:
|
|
@@ -252,6 +290,16 @@ def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -
|
|
|
252
290
|
# Fallback for complex types that might not stringify well
|
|
253
291
|
col.min_value = "<unable to display>"
|
|
254
292
|
col.max_value = "<unable to display>"
|
|
293
|
+
|
|
294
|
+
# PyArrow 22+ feature: check if min/max values are exact
|
|
295
|
+
# This helps users understand if statistics can be trusted for query optimization
|
|
296
|
+
try:
|
|
297
|
+
if hasattr(stats, "is_min_value_exact"):
|
|
298
|
+
col.is_min_exact = stats.is_min_value_exact
|
|
299
|
+
if hasattr(stats, "is_max_value_exact"):
|
|
300
|
+
col.is_max_exact = stats.is_max_value_exact
|
|
301
|
+
except Exception:
|
|
302
|
+
pass # Not available in older PyArrow versions
|
|
255
303
|
else:
|
|
256
304
|
col.has_min_max = False
|
|
257
305
|
break
|
|
@@ -262,12 +310,27 @@ def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -
|
|
|
262
310
|
)
|
|
263
311
|
|
|
264
312
|
|
|
265
|
-
def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
313
|
+
def format_size(size_bytes: Optional[int]) -> str:
|
|
314
|
+
"""Format bytes into human-readable size."""
|
|
315
|
+
if size_bytes is None:
|
|
316
|
+
return "N/A"
|
|
317
|
+
size: float = float(size_bytes)
|
|
318
|
+
for unit in ["B", "KB", "MB", "GB"]:
|
|
319
|
+
if abs(size) < 1024.0:
|
|
320
|
+
return f"{size:.1f}{unit}"
|
|
321
|
+
size /= 1024.0
|
|
322
|
+
return f"{size:.1f}TB"
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def print_column_info_table(
|
|
326
|
+
column_info: ParquetColumnInfo, show_sizes: bool = False
|
|
327
|
+
) -> None:
|
|
266
328
|
"""
|
|
267
329
|
Prints the column information using a Rich table.
|
|
268
330
|
|
|
269
331
|
Args:
|
|
270
332
|
column_info: The ParquetColumnInfo model to display.
|
|
333
|
+
show_sizes: Whether to show compressed/uncompressed size columns.
|
|
271
334
|
"""
|
|
272
335
|
table = Table(title="Parquet Column Information")
|
|
273
336
|
|
|
@@ -276,9 +339,18 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
|
276
339
|
table.add_column("Column Name", style="green")
|
|
277
340
|
table.add_column("Index", justify="center")
|
|
278
341
|
table.add_column("Compression", style="magenta")
|
|
279
|
-
table.add_column("Bloom
|
|
342
|
+
table.add_column("Bloom", justify="center")
|
|
343
|
+
table.add_column("Encrypted", justify="center")
|
|
280
344
|
table.add_column("Min Value", style="yellow")
|
|
281
345
|
table.add_column("Max Value", style="yellow")
|
|
346
|
+
table.add_column(
|
|
347
|
+
"Exact", justify="center", style="dim"
|
|
348
|
+
) # Shows if min/max are exact
|
|
349
|
+
|
|
350
|
+
if show_sizes:
|
|
351
|
+
table.add_column("Values", justify="right")
|
|
352
|
+
table.add_column("Compressed", justify="right", style="blue")
|
|
353
|
+
table.add_column("Ratio", justify="right", style="blue")
|
|
282
354
|
|
|
283
355
|
# Add rows to the table
|
|
284
356
|
for col in column_info.columns:
|
|
@@ -290,15 +362,48 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
|
290
362
|
col.max_value if col.has_min_max and col.max_value is not None else "N/A"
|
|
291
363
|
)
|
|
292
364
|
|
|
293
|
-
|
|
365
|
+
# Format exactness indicator (PyArrow 22+ feature)
|
|
366
|
+
exact_display = "N/A"
|
|
367
|
+
if col.is_min_exact is not None and col.is_max_exact is not None:
|
|
368
|
+
if col.is_min_exact and col.is_max_exact:
|
|
369
|
+
exact_display = "✅"
|
|
370
|
+
elif col.is_min_exact or col.is_max_exact:
|
|
371
|
+
exact_display = "~" # Partially exact
|
|
372
|
+
else:
|
|
373
|
+
exact_display = "❌"
|
|
374
|
+
|
|
375
|
+
# Format encryption status
|
|
376
|
+
encrypted_display = "🔒" if col.is_encrypted else "—"
|
|
377
|
+
|
|
378
|
+
row_data = [
|
|
294
379
|
str(col.row_group),
|
|
295
380
|
col.column_name,
|
|
296
381
|
str(col.column_index),
|
|
297
382
|
col.compression_type,
|
|
298
383
|
"✅" if col.has_bloom_filter else "❌",
|
|
384
|
+
encrypted_display,
|
|
299
385
|
min_display,
|
|
300
386
|
max_display,
|
|
301
|
-
|
|
387
|
+
exact_display,
|
|
388
|
+
]
|
|
389
|
+
|
|
390
|
+
if show_sizes:
|
|
391
|
+
# Calculate compression ratio
|
|
392
|
+
ratio = "N/A"
|
|
393
|
+
if col.total_compressed_size and col.total_uncompressed_size:
|
|
394
|
+
ratio = (
|
|
395
|
+
f"{col.total_uncompressed_size / col.total_compressed_size:.1f}x"
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
row_data.extend(
|
|
399
|
+
[
|
|
400
|
+
str(col.num_values) if col.num_values else "N/A",
|
|
401
|
+
format_size(col.total_compressed_size),
|
|
402
|
+
ratio,
|
|
403
|
+
]
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
table.add_row(*row_data)
|
|
302
407
|
|
|
303
408
|
# Print the table
|
|
304
409
|
console.print(table)
|
|
@@ -331,6 +436,7 @@ def inspect_single_file(
|
|
|
331
436
|
format: OutputFormat,
|
|
332
437
|
metadata_only: bool,
|
|
333
438
|
column_filter: Optional[str],
|
|
439
|
+
show_sizes: bool = False,
|
|
334
440
|
) -> None:
|
|
335
441
|
"""
|
|
336
442
|
Inspect a single Parquet file and display its metadata, compression settings, and bloom filter information.
|
|
@@ -339,7 +445,7 @@ def inspect_single_file(
|
|
|
339
445
|
Exception: If the file cannot be processed.
|
|
340
446
|
"""
|
|
341
447
|
try:
|
|
342
|
-
|
|
448
|
+
parquet_metadata, compression = read_parquet_metadata(filename)
|
|
343
449
|
except FileNotFoundError:
|
|
344
450
|
raise Exception(f"Cannot open: {filename}.")
|
|
345
451
|
except Exception as e:
|
|
@@ -382,7 +488,7 @@ def inspect_single_file(
|
|
|
382
488
|
|
|
383
489
|
# Print column details if not metadata only
|
|
384
490
|
if not metadata_only:
|
|
385
|
-
print_column_info_table(column_info)
|
|
491
|
+
print_column_info_table(column_info, show_sizes=show_sizes)
|
|
386
492
|
console.print(f"Compression codecs: {compression}")
|
|
387
493
|
|
|
388
494
|
|
|
@@ -404,6 +510,12 @@ def inspect(
|
|
|
404
510
|
column_filter: Optional[str] = typer.Option(
|
|
405
511
|
None, "--column", "-c", help="Filter results to show only specific column"
|
|
406
512
|
),
|
|
513
|
+
show_sizes: bool = typer.Option(
|
|
514
|
+
False,
|
|
515
|
+
"--sizes",
|
|
516
|
+
"-s",
|
|
517
|
+
help="Show column sizes and compression ratios",
|
|
518
|
+
),
|
|
407
519
|
):
|
|
408
520
|
"""
|
|
409
521
|
Inspect Parquet files and display their metadata, compression settings, and bloom filter information.
|
|
@@ -436,7 +548,9 @@ def inspect(
|
|
|
436
548
|
console.print("─" * (len(filename) + 6))
|
|
437
549
|
|
|
438
550
|
try:
|
|
439
|
-
inspect_single_file(filename, format, metadata_only, column_filter)
|
|
551
|
+
inspect_single_file(
|
|
552
|
+
filename, format, metadata_only, column_filter, show_sizes
|
|
553
|
+
)
|
|
440
554
|
except Exception as e:
|
|
441
555
|
console.print(f"Error processing {filename}: {e}", style="red")
|
|
442
556
|
continue
|
|
@@ -3,7 +3,14 @@ from pathlib import Path
|
|
|
3
3
|
|
|
4
4
|
from typer.testing import CliRunner
|
|
5
5
|
|
|
6
|
-
from iparq.source import
|
|
6
|
+
from iparq.source import (
|
|
7
|
+
ColumnInfo,
|
|
8
|
+
ParquetColumnInfo,
|
|
9
|
+
ParquetMetaModel,
|
|
10
|
+
app,
|
|
11
|
+
format_size,
|
|
12
|
+
output_json,
|
|
13
|
+
)
|
|
7
14
|
|
|
8
15
|
# Define path to test fixtures
|
|
9
16
|
FIXTURES_DIR = Path(__file__).parent
|
|
@@ -23,10 +30,7 @@ def test_parquet_info():
|
|
|
23
30
|
assert "num_columns=3" in result.stdout
|
|
24
31
|
assert "num_rows=3" in result.stdout
|
|
25
32
|
assert "Parquet Column Information" in result.stdout
|
|
26
|
-
|
|
27
|
-
assert (
|
|
28
|
-
"Value" in result.stdout
|
|
29
|
-
) # This covers "Max Value" which is split across lines
|
|
33
|
+
# Check for data values (these are more reliable than table headers which may be truncated)
|
|
30
34
|
assert "one" in result.stdout and "-1.0" in result.stdout and "2.5" in result.stdout
|
|
31
35
|
assert "two" in result.stdout and "bar" in result.stdout and "foo" in result.stdout
|
|
32
36
|
assert (
|
|
@@ -34,7 +38,7 @@ def test_parquet_info():
|
|
|
34
38
|
and "False" in result.stdout
|
|
35
39
|
and "True" in result.stdout
|
|
36
40
|
)
|
|
37
|
-
assert "
|
|
41
|
+
assert "SNAPPY" in result.stdout
|
|
38
42
|
|
|
39
43
|
|
|
40
44
|
def test_metadata_only_flag():
|
|
@@ -171,3 +175,187 @@ def test_error_handling_with_multiple_files():
|
|
|
171
175
|
# Should show error for bad file
|
|
172
176
|
assert "Error processing" in result.stdout
|
|
173
177
|
assert "nonexistent.parquet" in result.stdout
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def test_sizes_flag():
|
|
181
|
+
"""Test that the --sizes flag displays column size information."""
|
|
182
|
+
runner = CliRunner()
|
|
183
|
+
result = runner.invoke(app, ["inspect", "--sizes", str(fixture_path)])
|
|
184
|
+
|
|
185
|
+
assert result.exit_code == 0
|
|
186
|
+
assert "ParquetMetaModel" in result.stdout
|
|
187
|
+
# Check for size-related output (Values, compressed size, ratio)
|
|
188
|
+
# The actual values depend on the test file
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def test_sizes_flag_with_json():
|
|
192
|
+
"""Test that --sizes flag works with JSON output and includes size fields."""
|
|
193
|
+
runner = CliRunner()
|
|
194
|
+
result = runner.invoke(
|
|
195
|
+
app, ["inspect", "--format", "json", "--sizes", str(fixture_path)]
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
assert result.exit_code == 0
|
|
199
|
+
data = json.loads(result.stdout)
|
|
200
|
+
|
|
201
|
+
# Check that size fields are present in columns
|
|
202
|
+
for column in data["columns"]:
|
|
203
|
+
assert "num_values" in column
|
|
204
|
+
assert "total_compressed_size" in column
|
|
205
|
+
assert "total_uncompressed_size" in column
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def test_format_size_bytes():
|
|
209
|
+
"""Test format_size function with bytes."""
|
|
210
|
+
assert format_size(100) == "100.0B"
|
|
211
|
+
assert format_size(0) == "0.0B"
|
|
212
|
+
assert format_size(None) == "N/A"
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def test_format_size_kilobytes():
|
|
216
|
+
"""Test format_size function with kilobytes."""
|
|
217
|
+
assert format_size(1024) == "1.0KB"
|
|
218
|
+
assert format_size(2048) == "2.0KB"
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def test_format_size_megabytes():
|
|
222
|
+
"""Test format_size function with megabytes."""
|
|
223
|
+
assert format_size(1024 * 1024) == "1.0MB"
|
|
224
|
+
assert format_size(5 * 1024 * 1024) == "5.0MB"
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def test_format_size_gigabytes():
|
|
228
|
+
"""Test format_size function with gigabytes."""
|
|
229
|
+
assert format_size(1024 * 1024 * 1024) == "1.0GB"
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def test_format_size_terabytes():
|
|
233
|
+
"""Test format_size function with terabytes."""
|
|
234
|
+
assert format_size(1024 * 1024 * 1024 * 1024) == "1.0TB"
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def test_column_info_model():
|
|
238
|
+
"""Test ColumnInfo model with new fields."""
|
|
239
|
+
col = ColumnInfo(
|
|
240
|
+
row_group=0,
|
|
241
|
+
column_name="test_col",
|
|
242
|
+
column_index=0,
|
|
243
|
+
compression_type="SNAPPY",
|
|
244
|
+
has_bloom_filter=True,
|
|
245
|
+
has_min_max=True,
|
|
246
|
+
min_value="1",
|
|
247
|
+
max_value="100",
|
|
248
|
+
is_min_exact=True,
|
|
249
|
+
is_max_exact=True,
|
|
250
|
+
is_encrypted=False,
|
|
251
|
+
num_values=1000,
|
|
252
|
+
total_compressed_size=512,
|
|
253
|
+
total_uncompressed_size=1024,
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
assert col.is_min_exact is True
|
|
257
|
+
assert col.is_max_exact is True
|
|
258
|
+
assert col.is_encrypted is False
|
|
259
|
+
assert col.num_values == 1000
|
|
260
|
+
assert col.total_compressed_size == 512
|
|
261
|
+
assert col.total_uncompressed_size == 1024
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def test_column_info_model_defaults():
|
|
265
|
+
"""Test ColumnInfo model with default values for new fields."""
|
|
266
|
+
col = ColumnInfo(
|
|
267
|
+
row_group=0,
|
|
268
|
+
column_name="test_col",
|
|
269
|
+
column_index=0,
|
|
270
|
+
compression_type="SNAPPY",
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
assert col.is_min_exact is None
|
|
274
|
+
assert col.is_max_exact is None
|
|
275
|
+
assert col.is_encrypted is None
|
|
276
|
+
assert col.num_values is None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def test_output_json_function():
|
|
280
|
+
"""Test the output_json function directly."""
|
|
281
|
+
import io
|
|
282
|
+
import sys
|
|
283
|
+
|
|
284
|
+
meta = ParquetMetaModel(
|
|
285
|
+
created_by="test",
|
|
286
|
+
num_columns=2,
|
|
287
|
+
num_rows=100,
|
|
288
|
+
num_row_groups=1,
|
|
289
|
+
format_version="2.6",
|
|
290
|
+
serialized_size=1000,
|
|
291
|
+
)
|
|
292
|
+
|
|
293
|
+
columns = ParquetColumnInfo(
|
|
294
|
+
columns=[
|
|
295
|
+
ColumnInfo(
|
|
296
|
+
row_group=0,
|
|
297
|
+
column_name="col1",
|
|
298
|
+
column_index=0,
|
|
299
|
+
compression_type="ZSTD",
|
|
300
|
+
has_min_max=True,
|
|
301
|
+
min_value="0",
|
|
302
|
+
max_value="99",
|
|
303
|
+
is_min_exact=True,
|
|
304
|
+
is_max_exact=False,
|
|
305
|
+
num_values=100,
|
|
306
|
+
total_compressed_size=256,
|
|
307
|
+
total_uncompressed_size=512,
|
|
308
|
+
)
|
|
309
|
+
]
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
compression_codecs = {"ZSTD"}
|
|
313
|
+
|
|
314
|
+
# Capture stdout
|
|
315
|
+
captured = io.StringIO()
|
|
316
|
+
old_stdout = sys.stdout
|
|
317
|
+
sys.stdout = captured
|
|
318
|
+
|
|
319
|
+
try:
|
|
320
|
+
output_json(meta, columns, compression_codecs)
|
|
321
|
+
finally:
|
|
322
|
+
sys.stdout = old_stdout
|
|
323
|
+
|
|
324
|
+
output = captured.getvalue()
|
|
325
|
+
data = json.loads(output)
|
|
326
|
+
|
|
327
|
+
assert data["metadata"]["num_columns"] == 2
|
|
328
|
+
assert data["columns"][0]["is_min_exact"] is True
|
|
329
|
+
assert data["columns"][0]["is_max_exact"] is False
|
|
330
|
+
assert "ZSTD" in data["compression_codecs"]
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def test_column_filter_no_match():
|
|
334
|
+
"""Test filtering by a column name that doesn't exist."""
|
|
335
|
+
runner = CliRunner()
|
|
336
|
+
result = runner.invoke(
|
|
337
|
+
app, ["inspect", "--column", "nonexistent_column", str(fixture_path)]
|
|
338
|
+
)
|
|
339
|
+
|
|
340
|
+
assert result.exit_code == 0
|
|
341
|
+
assert "No columns match the filter" in result.stdout
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def test_nonexistent_file():
|
|
345
|
+
"""Test error handling for non-existent file."""
|
|
346
|
+
runner = CliRunner()
|
|
347
|
+
result = runner.invoke(app, ["inspect", "totally_fake_file.parquet"])
|
|
348
|
+
|
|
349
|
+
assert result.exit_code == 0 # CLI should handle error gracefully
|
|
350
|
+
assert "Error" in result.stdout
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def test_default_command():
|
|
354
|
+
"""Test that the empty command name works as default."""
|
|
355
|
+
runner = CliRunner()
|
|
356
|
+
# The app has both @app.command(name="") and @app.command(name="inspect")
|
|
357
|
+
# So 'inspect' is required but maps to the same function
|
|
358
|
+
result = runner.invoke(app, ["inspect", str(fixture_path)])
|
|
359
|
+
|
|
360
|
+
assert result.exit_code == 0
|
|
361
|
+
assert "ParquetMetaModel" in result.stdout
|