iparq 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
iparq/py.typed ADDED
@@ -0,0 +1 @@
+ # This empty file marks the package as typed for mypy
iparq/source.py CHANGED
@@ -1,8 +1,11 @@
+ from typing import List, Optional
+
  import pyarrow.parquet as pq
  import typer
  from pydantic import BaseModel
  from rich import print
  from rich.console import Console
+ from rich.table import Table

  app = typer.Typer()
  console = Console()
@@ -29,6 +32,36 @@ class ParquetMetaModel(BaseModel):
      serialized_size: int


+ class ColumnInfo(BaseModel):
+     """
+     ColumnInfo is a data model representing information about a column in a Parquet file.
+
+     Attributes:
+         row_group (int): The row group index.
+         column_name (str): The name of the column.
+         column_index (int): The index of the column.
+         compression_type (str): The compression type used for the column.
+         has_bloom_filter (bool): Whether the column has a bloom filter.
+     """
+
+     row_group: int
+     column_name: str
+     column_index: int
+     compression_type: str
+     has_bloom_filter: Optional[bool] = False
+
+
+ class ParquetColumnInfo(BaseModel):
+     """
+     ParquetColumnInfo is a data model representing information about all columns in a Parquet file.
+
+     Attributes:
+         columns (List[ColumnInfo]): List of column information.
+     """
+
+     columns: List[ColumnInfo] = []
+
+
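Since these are pydantic models, the mutable default `columns: List[ColumnInfo] = []` is safe: pydantic gives each instance its own copy of the default rather than sharing one list the way a plain class attribute would. A minimal sketch of the models in use (all values made up for illustration):

```python
from iparq.source import ColumnInfo, ParquetColumnInfo

info = ParquetColumnInfo()
info.columns.append(
    ColumnInfo(
        row_group=0,
        column_name="r",
        column_index=0,
        compression_type="SNAPPY",
        # has_bloom_filter defaults to False when omitted
    )
)
print(info.columns[0].has_bloom_filter)  # False
```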
  def read_parquet_metadata(filename: str):
      """
      Reads the metadata of a Parquet file and extracts the compression codecs used.
@@ -94,30 +127,104 @@ def print_parquet_metadata(parquet_metadata):
          pass


- def print_compression_types(parquet_metadata) -> None:
+ def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) -> None:
      """
-     Prints the compression type for each column in each row group of the Parquet file.
+     Collects compression type information for each column and adds it to the column_info model.
+
+     Args:
+         parquet_metadata: The Parquet file metadata.
+         column_info: The ParquetColumnInfo model to update.
      """
      try:
          num_row_groups = parquet_metadata.num_row_groups
          num_columns = parquet_metadata.num_columns
-         console.print("[bold underline]Column Compression Info:[/bold underline]")
+
          for i in range(num_row_groups):
-             console.print(f"[bold]Row Group {i}:[/bold]")
+             row_group = parquet_metadata.row_group(i)
              for j in range(num_columns):
-                 column_chunk = parquet_metadata.row_group(i).column(j)
+                 column_chunk = row_group.column(j)
                  compression = column_chunk.compression
-                 column_name = parquet_metadata.schema.column(j).name
-                 console.print(
-                     f" Column '{column_name}' (Index {j}): [italic]{compression}[/italic]"
+                 column_name = parquet_metadata.schema.names[j]
+
+                 # Append one ColumnInfo entry per column chunk
+                 column_info.columns.append(
+                     ColumnInfo(
+                         row_group=i,
+                         column_name=column_name,
+                         column_index=j,
+                         compression_type=compression,
+                     )
                  )
      except Exception as e:
          console.print(
-             f"Error while printing compression types: {e}",
+             f"Error while collecting compression types: {e}",
              style="blink bold red underline on white",
          )
-     finally:
-         pass
+
+
+ def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) -> None:
+     """
+     Updates the column_info model with bloom filter information.
+
+     Args:
+         parquet_metadata: The Parquet file metadata.
+         column_info: The ParquetColumnInfo model to update.
+     """
+     try:
+         num_row_groups = parquet_metadata.num_row_groups
+         num_columns = parquet_metadata.num_columns
+
+         for i in range(num_row_groups):
+             row_group = parquet_metadata.row_group(i)
+
+             for j in range(num_columns):
+                 column_chunk = row_group.column(j)
+
+                 # Find the corresponding column in our model
+                 for col in column_info.columns:
+                     if col.row_group == i and col.column_index == j:
+                         # Proxy check: is_stats_set reports whether statistics
+                         # are present; it is used here as a stand-in for
+                         # bloom filter presence
+                         has_bloom_filter = (
+                             hasattr(column_chunk, "is_stats_set")
+                             and column_chunk.is_stats_set
+                         )
+                         col.has_bloom_filter = has_bloom_filter
+                         break
+     except Exception as e:
+         console.print(
+             f"Error while collecting bloom filter information: {e}",
+             style="blink bold red underline on white",
+         )
+
+
+ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
+     """
+     Prints the column information using a Rich table.
+
+     Args:
+         column_info: The ParquetColumnInfo model to display.
+     """
+     table = Table(title="Parquet Column Information")
+
+     # Add table columns
+     table.add_column("Row Group", justify="center", style="cyan")
+     table.add_column("Column Name", style="green")
+     table.add_column("Index", justify="center")
+     table.add_column("Compression", style="magenta")
+     table.add_column("Bloom Filter", justify="center")
+
+     # Add rows to the table
+     for col in column_info.columns:
+         table.add_row(
+             str(col.row_group),
+             col.column_name,
+             str(col.column_index),
+             col.compression_type,
+             "✅" if col.has_bloom_filter else "❌",
+         )
+
+     # Print the table
+     console.print(table)


  @app.command()
@@ -134,7 +241,17 @@ def main(filename: str):
      (parquet_metadata, compression) = read_parquet_metadata(filename)

      print_parquet_metadata(parquet_metadata)
-     print_compression_types(parquet_metadata)
+
+     # Create a model to store column information
+     column_info = ParquetColumnInfo()
+
+     # Collect information
+     print_compression_types(parquet_metadata, column_info)
+     print_bloom_filter_info(parquet_metadata, column_info)
+
+     # Print the information as a table
+     print_column_info_table(column_info)
+
      print(f"Compression codecs: {compression}")


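Taken together, `main` in 0.2.0 collects everything into one `ParquetColumnInfo` and renders it once, instead of printing while it scans. A sketch of the equivalent flow outside the CLI, assuming the functions above are imported from `iparq.source` (the file name is a placeholder):

```python
import pyarrow.parquet as pq

from iparq.source import (
    ParquetColumnInfo,
    print_bloom_filter_info,
    print_column_info_table,
    print_compression_types,
)

# Only the footer metadata is read; data pages are never loaded.
metadata = pq.ParquetFile("yourparquet.parquet").metadata

info = ParquetColumnInfo()
print_compression_types(metadata, info)  # one ColumnInfo per column chunk
print_bloom_filter_info(metadata, info)  # sets the has_bloom_filter flag
print_column_info_table(info)            # renders the Rich table
```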
iparq-0.1.5.dist-info/METADATA → iparq-0.2.0.dist-info/METADATA CHANGED
@@ -1,13 +1,14 @@
  Metadata-Version: 2.4
  Name: iparq
- Version: 0.1.5
- Summary: Display version and compression information about a parquet file
+ Version: 0.2.0
+ Summary: Display version, compression and bloom filter information about a parquet file
  Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
  License-File: LICENSE
  Requires-Python: >=3.9
- Requires-Dist: pyarrow>=19.0.0
- Requires-Dist: pydantic>=2.10.6
- Requires-Dist: typer>=0.15.1
+ Requires-Dist: pyarrow
+ Requires-Dist: pydantic
+ Requires-Dist: rich
+ Requires-Dist: typer[all]
  Provides-Extra: checks
  Requires-Dist: mypy>=1.14.1; extra == 'checks'
  Requires-Dist: ruff>=0.9.3; extra == 'checks'
@@ -26,8 +27,24 @@ Description-Content-Type: text/markdown
  ![alt text](media/iparq.png)
  After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there’s no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet file is using.

+ ***New*** Bloom filter information: displays whether bloom filters are present.
+ Read more about bloom filters in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
+
+
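For readers who want a file to test this on: per the article above, DuckDB v1.2+ writes bloom filters for suitable (dictionary-encoded) columns when exporting Parquet. A hedged sketch using the `duckdb` Python package; the query and file name are made up:

```python
import duckdb

# A low-cardinality column, which DuckDB dictionary-encodes and
# therefore equips with a bloom filter (file name is hypothetical).
duckdb.execute("""
    COPY (SELECT range % 100 AS r FROM range(1000000))
    TO 'bloom_example.parquet' (FORMAT PARQUET)
""")
```

Running `iparq bloom_example.parquet` on the result should then flag the column in the table's Bloom Filter column.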
  ## Installation

+ ### Zero installation - Recommended
+
+ 1) Make sure you have Astral’s uv installed by following the steps here:
+
+ <https://docs.astral.sh/uv/getting-started/installation/>
+
+ 2) Execute the following command:
+
+ ```sh
+ uvx iparq yourparquet.parquet
+ ```
+
  ### Using pip

  1) Install the package using pip:
@@ -80,7 +97,63 @@ iparq <filename>

  Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
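If you want the same details programmatically rather than through the CLI, pyarrow exposes the footer metadata directly; a minimal sketch (the file name is a placeholder):

```python
import pyarrow.parquet as pq

meta = pq.ParquetFile("yourparquet.parquet").metadata
print(meta.created_by, meta.format_version, meta.num_rows)

# Compression is recorded per column chunk, i.e. per (row group, column).
for i in range(meta.num_row_groups):
    row_group = meta.row_group(i)
    for j in range(meta.num_columns):
        print(i, meta.schema.names[j], row_group.column(j).compression)
```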

- ## Example output
+ ## Example output - Bloom Filters
+
+ ```log
+ ParquetMetaModel(
+     created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
+     num_columns=1,
+     num_rows=100000000,
+     num_row_groups=10,
+     format_version='1.0',
+     serialized_size=1196
+ )
+ Column Compression Info:
+ Row Group 0:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 1:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 2:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 3:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 4:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 5:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 6:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 7:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 8:
+   Column 'r' (Index 0): SNAPPY
+ Row Group 9:
+   Column 'r' (Index 0): SNAPPY
+ Bloom Filter Info:
+ Row Group 0:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 1:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 2:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 3:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 4:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 5:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 6:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 7:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 8:
+   Column 'r' (Index 0): Has bloom filter
+ Row Group 9:
+   Column 'r' (Index 0): Has bloom filter
+ Compression codecs: {'SNAPPY'}
+ ```
+
+ ## Example output

  ```log
  ParquetMetaModel(
iparq-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+ iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
+ iparq/py.typed,sha256=bOHAx3O6ryp453lBypAaF78WipxsJDO9hH0PZFTAWYs,54
+ iparq/source.py,sha256=qyBNysMLX0FkjZVw5dPSRuhswX3GuRXvM79v7g7emWM,8482
+ iparq-0.2.0.dist-info/METADATA,sha256=TwQ7wiLtdEwh3NwY2talWs4BM-oeJw55LCzgMnlr5Gc,7163
+ iparq-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ iparq-0.2.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
+ iparq-0.2.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
+ iparq-0.2.0.dist-info/RECORD,,
iparq-0.1.5.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
- iparq/source.py,sha256=7ocTpA7j5C-oSyLkMPhDifpH3cPhqyK3LBu0CjjG83s,4851
- iparq-0.1.5.dist-info/METADATA,sha256=7kLNc40ROyYot5N37GXjSErzpp5WPDWa9_Y2BQpmr6o,5387
- iparq-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- iparq-0.1.5.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
- iparq-0.1.5.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
- iparq-0.1.5.dist-info/RECORD,,