iparq 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq/py.typed +1 -0
- iparq/source.py +111 -34
- {iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/METADATA +18 -5
- iparq-0.2.0.dist-info/RECORD +8 -0
- iparq-0.1.7.dist-info/RECORD +0 -7
- {iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/WHEEL +0 -0
- {iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/entry_points.txt +0 -0
- {iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/licenses/LICENSE +0 -0
iparq/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This empty file marks the package as typed for mypy
|
iparq/source.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
1
3
|
import pyarrow.parquet as pq
|
|
2
4
|
import typer
|
|
3
5
|
from pydantic import BaseModel
|
|
4
6
|
from rich import print
|
|
5
7
|
from rich.console import Console
|
|
8
|
+
from rich.table import Table
|
|
6
9
|
|
|
7
10
|
app = typer.Typer()
|
|
8
11
|
console = Console()
|
|
@@ -29,6 +32,36 @@ class ParquetMetaModel(BaseModel):
|
|
|
29
32
|
serialized_size: int
|
|
30
33
|
|
|
31
34
|
|
|
35
|
+
class ColumnInfo(BaseModel):
|
|
36
|
+
"""
|
|
37
|
+
ColumnInfo is a data model representing information about a column in a Parquet file.
|
|
38
|
+
|
|
39
|
+
Attributes:
|
|
40
|
+
row_group (int): The row group index.
|
|
41
|
+
column_name (str): The name of the column.
|
|
42
|
+
column_index (int): The index of the column.
|
|
43
|
+
compression_type (str): The compression type used for the column.
|
|
44
|
+
has_bloom_filter (bool): Whether the column has a bloom filter.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
row_group: int
|
|
48
|
+
column_name: str
|
|
49
|
+
column_index: int
|
|
50
|
+
compression_type: str
|
|
51
|
+
has_bloom_filter: Optional[bool] = False
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ParquetColumnInfo(BaseModel):
|
|
55
|
+
"""
|
|
56
|
+
ParquetColumnInfo is a data model representing information about all columns in a Parquet file.
|
|
57
|
+
|
|
58
|
+
Attributes:
|
|
59
|
+
columns (List[ColumnInfo]): List of column information.
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
columns: List[ColumnInfo] = []
|
|
63
|
+
|
|
64
|
+
|
|
32
65
|
def read_parquet_metadata(filename: str):
|
|
33
66
|
"""
|
|
34
67
|
Reads the metadata of a Parquet file and extracts the compression codecs used.
|
|
@@ -94,71 +127,106 @@ def print_parquet_metadata(parquet_metadata):
|
|
|
94
127
|
pass
|
|
95
128
|
|
|
96
129
|
|
|
97
|
-
def print_compression_types(parquet_metadata) -> None:
|
|
130
|
+
def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) -> None:
|
|
98
131
|
"""
|
|
99
|
-
|
|
132
|
+
Collects compression type information for each column and adds it to the column_info model.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
parquet_metadata: The Parquet file metadata.
|
|
136
|
+
column_info: The ParquetColumnInfo model to update.
|
|
100
137
|
"""
|
|
101
138
|
try:
|
|
102
139
|
num_row_groups = parquet_metadata.num_row_groups
|
|
103
140
|
num_columns = parquet_metadata.num_columns
|
|
104
|
-
|
|
141
|
+
|
|
105
142
|
for i in range(num_row_groups):
|
|
106
|
-
|
|
143
|
+
row_group = parquet_metadata.row_group(i)
|
|
107
144
|
for j in range(num_columns):
|
|
108
|
-
column_chunk =
|
|
145
|
+
column_chunk = row_group.column(j)
|
|
109
146
|
compression = column_chunk.compression
|
|
110
|
-
column_name = parquet_metadata.schema.
|
|
111
|
-
|
|
112
|
-
|
|
147
|
+
column_name = parquet_metadata.schema.names[j]
|
|
148
|
+
|
|
149
|
+
# Create or update column info
|
|
150
|
+
column_info.columns.append(
|
|
151
|
+
ColumnInfo(
|
|
152
|
+
row_group=i,
|
|
153
|
+
column_name=column_name,
|
|
154
|
+
column_index=j,
|
|
155
|
+
compression_type=compression,
|
|
156
|
+
)
|
|
113
157
|
)
|
|
114
158
|
except Exception as e:
|
|
115
159
|
console.print(
|
|
116
|
-
f"Error while
|
|
160
|
+
f"Error while collecting compression types: {e}",
|
|
117
161
|
style="blink bold red underline on white",
|
|
118
162
|
)
|
|
119
|
-
finally:
|
|
120
|
-
pass
|
|
121
163
|
|
|
122
164
|
|
|
123
|
-
def print_bloom_filter_info(parquet_metadata) -> None:
|
|
165
|
+
def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) -> None:
|
|
124
166
|
"""
|
|
125
|
-
|
|
167
|
+
Updates the column_info model with bloom filter information.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
parquet_metadata: The Parquet file metadata.
|
|
171
|
+
column_info: The ParquetColumnInfo model to update.
|
|
126
172
|
"""
|
|
127
173
|
try:
|
|
128
174
|
num_row_groups = parquet_metadata.num_row_groups
|
|
129
175
|
num_columns = parquet_metadata.num_columns
|
|
130
|
-
has_bloom_filters = False
|
|
131
|
-
|
|
132
|
-
console.print("[bold underline]Bloom Filter Info:[/bold underline]")
|
|
133
176
|
|
|
134
177
|
for i in range(num_row_groups):
|
|
135
178
|
row_group = parquet_metadata.row_group(i)
|
|
136
|
-
bloom_filters_in_group = False
|
|
137
179
|
|
|
138
180
|
for j in range(num_columns):
|
|
139
181
|
column_chunk = row_group.column(j)
|
|
140
|
-
column_name = parquet_metadata.schema.column(j).name
|
|
141
|
-
|
|
142
|
-
# Check if this column has bloom filters using is_stats_set
|
|
143
|
-
if hasattr(column_chunk, "is_stats_set") and column_chunk.is_stats_set:
|
|
144
|
-
if not bloom_filters_in_group:
|
|
145
|
-
console.print(f"[bold]Row Group {i}:[/bold]")
|
|
146
|
-
bloom_filters_in_group = True
|
|
147
|
-
has_bloom_filters = True
|
|
148
|
-
console.print(
|
|
149
|
-
f" Column '{column_name}' (Index {j}): [green]Has bloom filter[/green]"
|
|
150
|
-
)
|
|
151
|
-
|
|
152
|
-
if not has_bloom_filters:
|
|
153
|
-
console.print(" [italic]No bloom filters found in any column[/italic]")
|
|
154
182
|
|
|
183
|
+
# Find the corresponding column in our model
|
|
184
|
+
for col in column_info.columns:
|
|
185
|
+
if col.row_group == i and col.column_index == j:
|
|
186
|
+
# Check if this column has bloom filters
|
|
187
|
+
has_bloom_filter = (
|
|
188
|
+
hasattr(column_chunk, "is_stats_set")
|
|
189
|
+
and column_chunk.is_stats_set
|
|
190
|
+
)
|
|
191
|
+
col.has_bloom_filter = has_bloom_filter
|
|
192
|
+
break
|
|
155
193
|
except Exception as e:
|
|
156
194
|
console.print(
|
|
157
|
-
f"Error while
|
|
195
|
+
f"Error while collecting bloom filter information: {e}",
|
|
158
196
|
style="blink bold red underline on white",
|
|
159
197
|
)
|
|
160
198
|
|
|
161
199
|
|
|
200
|
+
def print_column_info_table(column_info: ParquetColumnInfo) -> None:
|
|
201
|
+
"""
|
|
202
|
+
Prints the column information using a Rich table.
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
column_info: The ParquetColumnInfo model to display.
|
|
206
|
+
"""
|
|
207
|
+
table = Table(title="Parquet Column Information")
|
|
208
|
+
|
|
209
|
+
# Add table columns
|
|
210
|
+
table.add_column("Row Group", justify="center", style="cyan")
|
|
211
|
+
table.add_column("Column Name", style="green")
|
|
212
|
+
table.add_column("Index", justify="center")
|
|
213
|
+
table.add_column("Compression", style="magenta")
|
|
214
|
+
table.add_column("Bloom Filter", justify="center")
|
|
215
|
+
|
|
216
|
+
# Add rows to the table
|
|
217
|
+
for col in column_info.columns:
|
|
218
|
+
table.add_row(
|
|
219
|
+
str(col.row_group),
|
|
220
|
+
col.column_name,
|
|
221
|
+
str(col.column_index),
|
|
222
|
+
col.compression_type,
|
|
223
|
+
"✅" if col.has_bloom_filter else "❌",
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
# Print the table
|
|
227
|
+
console.print(table)
|
|
228
|
+
|
|
229
|
+
|
|
162
230
|
@app.command()
|
|
163
231
|
def main(filename: str):
|
|
164
232
|
"""
|
|
@@ -173,8 +241,17 @@ def main(filename: str):
|
|
|
173
241
|
(parquet_metadata, compression) = read_parquet_metadata(filename)
|
|
174
242
|
|
|
175
243
|
print_parquet_metadata(parquet_metadata)
|
|
176
|
-
|
|
177
|
-
|
|
244
|
+
|
|
245
|
+
# Create a model to store column information
|
|
246
|
+
column_info = ParquetColumnInfo()
|
|
247
|
+
|
|
248
|
+
# Collect information
|
|
249
|
+
print_compression_types(parquet_metadata, column_info)
|
|
250
|
+
print_bloom_filter_info(parquet_metadata, column_info)
|
|
251
|
+
|
|
252
|
+
# Print the information as a table
|
|
253
|
+
print_column_info_table(column_info)
|
|
254
|
+
|
|
178
255
|
print(f"Compression codecs: {compression}")
|
|
179
256
|
|
|
180
257
|
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: iparq
|
|
3
|
-
Version: 0.
|
|
4
|
-
Summary: Display version and
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Display version compression and bloom filter information about a parquet file
|
|
5
5
|
Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Python: >=3.9
|
|
8
|
-
Requires-Dist: pyarrow
|
|
9
|
-
Requires-Dist: pydantic
|
|
10
|
-
Requires-Dist:
|
|
8
|
+
Requires-Dist: pyarrow
|
|
9
|
+
Requires-Dist: pydantic
|
|
10
|
+
Requires-Dist: rich
|
|
11
|
+
Requires-Dist: typer[all]
|
|
11
12
|
Provides-Extra: checks
|
|
12
13
|
Requires-Dist: mypy>=1.14.1; extra == 'checks'
|
|
13
14
|
Requires-Dist: ruff>=0.9.3; extra == 'checks'
|
|
@@ -32,6 +33,18 @@ Read more about bloom filters in this [great article](https://duckdb.org/2025/03
|
|
|
32
33
|
|
|
33
34
|
## Installation
|
|
34
35
|
|
|
36
|
+
### Zero installation - Recommended
|
|
37
|
+
|
|
38
|
+
1) Make sure to have Astral’s UV installed by following the steps here:
|
|
39
|
+
|
|
40
|
+
<https://docs.astral.sh/uv/getting-started/installation/>
|
|
41
|
+
|
|
42
|
+
2) Execute the following command:
|
|
43
|
+
|
|
44
|
+
```sh
|
|
45
|
+
uvx iparq yourparquet.parquet
|
|
46
|
+
```
|
|
47
|
+
|
|
35
48
|
### Using pip
|
|
36
49
|
|
|
37
50
|
1) Install the package using pip:
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
+
iparq/py.typed,sha256=bOHAx3O6ryp453lBypAaF78WipxsJDO9hH0PZFTAWYs,54
|
|
3
|
+
iparq/source.py,sha256=qyBNysMLX0FkjZVw5dPSRuhswX3GuRXvM79v7g7emWM,8482
|
|
4
|
+
iparq-0.2.0.dist-info/METADATA,sha256=TwQ7wiLtdEwh3NwY2talWs4BM-oeJw55LCzgMnlr5Gc,7163
|
|
5
|
+
iparq-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
+
iparq-0.2.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
7
|
+
iparq-0.2.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
8
|
+
iparq-0.2.0.dist-info/RECORD,,
|
iparq-0.1.7.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
|
|
2
|
-
iparq/source.py,sha256=crKRTuZY6W2zEhFfAzb4XWopaVy9qnEkFqz4jbyGmeM,6439
|
|
3
|
-
iparq-0.1.7.dist-info/METADATA,sha256=ku4ZsLQ1Iq2ovPzKqv8aGqBGBkn3nTviW6hFzFsP6bw,6884
|
|
4
|
-
iparq-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
5
|
-
iparq-0.1.7.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
|
|
6
|
-
iparq-0.1.7.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
|
|
7
|
-
iparq-0.1.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|