PyPI - iparq - Versions diffs - 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

iparq 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

iparq/py.typed +1 -0
iparq/source.py +111 -34
{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/METADATA +18 -5
iparq-0.2.0.dist-info/RECORD +8 -0
iparq-0.1.7.dist-info/RECORD +0 -7
{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/WHEEL +0 -0
{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/entry_points.txt +0 -0
{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/licenses/LICENSE +0 -0

iparq/py.typed ADDED Viewed

	@@ -0,0 +1 @@
1	+ # This empty file marks the package as typed for mypy

iparq/source.py CHANGED Viewed

@@ -1,8 +1,11 @@
+from typing import List, Optional
 import pyarrow.parquet as pq
 import typer
 from pydantic import BaseModel
 from rich import print
 from rich.console import Console
+from rich.table import Table
 app = typer.Typer()
 console = Console()
@@ -29,6 +32,36 @@ class ParquetMetaModel(BaseModel):
     serialized_size: int
+class ColumnInfo(BaseModel):
+    """
+    ColumnInfo is a data model representing information about a column in a Parquet file.
+    Attributes:
+        row_group (int): The row group index.
+        column_name (str): The name of the column.
+        column_index (int): The index of the column.
+        compression_type (str): The compression type used for the column.
+        has_bloom_filter (bool): Whether the column has a bloom filter.
+    """
+    row_group: int
+    column_name: str
+    column_index: int
+    compression_type: str
+    has_bloom_filter: Optional[bool] = False
+class ParquetColumnInfo(BaseModel):
+    """
+    ParquetColumnInfo is a data model representing information about all columns in a Parquet file.
+    Attributes:
+        columns (List[ColumnInfo]): List of column information.
+    """
+    columns: List[ColumnInfo] = []
 def read_parquet_metadata(filename: str):
     """
     Reads the metadata of a Parquet file and extracts the compression codecs used.
@@ -94,71 +127,106 @@ def print_parquet_metadata(parquet_metadata):
         pass
-def print_compression_types(parquet_metadata) -> None:
+def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) -> None:
     """
-    Prints the compression type for each column in each row group of the Parquet file.
+    Collects compression type information for each column and adds it to the column_info model.
+    Args:
+        parquet_metadata: The Parquet file metadata.
+        column_info: The ParquetColumnInfo model to update.
     """
     try:
         num_row_groups = parquet_metadata.num_row_groups
         num_columns = parquet_metadata.num_columns
-        console.print("[bold underline]Column Compression Info:[/bold underline]")
         for i in range(num_row_groups):
-            console.print(f"[bold]Row Group {i}:[/bold]")
+            row_group = parquet_metadata.row_group(i)
             for j in range(num_columns):
-                column_chunk = parquet_metadata.row_group(i).column(j)
+                column_chunk = row_group.column(j)
                 compression = column_chunk.compression
-                column_name = parquet_metadata.schema.column(j).name
-                console.print(
-                    f"  Column '{column_name}' (Index {j}): [italic]{compression}[/italic]"
+                column_name = parquet_metadata.schema.names[j]
+                # Create or update column info
+                column_info.columns.append(
+                    ColumnInfo(
+                        row_group=i,
+                        column_name=column_name,
+                        column_index=j,
+                        compression_type=compression,
+                    )
                 )
     except Exception as e:
         console.print(
-            f"Error while printing compression types: {e}",
+            f"Error while collecting compression types: {e}",
             style="blink bold red underline on white",
         )
-    finally:
-        pass
-def print_bloom_filter_info(parquet_metadata) -> None:
+def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) -> None:
     """
-    Prints information about bloom filters for each column in each row group of the Parquet file.
+    Updates the column_info model with bloom filter information.
+    Args:
+        parquet_metadata: The Parquet file metadata.
+        column_info: The ParquetColumnInfo model to update.
     """
     try:
         num_row_groups = parquet_metadata.num_row_groups
         num_columns = parquet_metadata.num_columns
-        has_bloom_filters = False
-        console.print("[bold underline]Bloom Filter Info:[/bold underline]")
         for i in range(num_row_groups):
             row_group = parquet_metadata.row_group(i)
-            bloom_filters_in_group = False
             for j in range(num_columns):
                 column_chunk = row_group.column(j)
-                column_name = parquet_metadata.schema.column(j).name
-                # Check if this column has bloom filters using is_stats_set
-                if hasattr(column_chunk, "is_stats_set") and column_chunk.is_stats_set:
-                    if not bloom_filters_in_group:
-                        console.print(f"[bold]Row Group {i}:[/bold]")
-                        bloom_filters_in_group = True
-                    has_bloom_filters = True
-                    console.print(
-                        f"  Column '{column_name}' (Index {j}): [green]Has bloom filter[/green]"
-                    )
-        if not has_bloom_filters:
-            console.print("  [italic]No bloom filters found in any column[/italic]")
+                # Find the corresponding column in our model
+                for col in column_info.columns:
+                    if col.row_group == i and col.column_index == j:
+                        # Check if this column has bloom filters
+                        has_bloom_filter = (
+                            hasattr(column_chunk, "is_stats_set")
+                            and column_chunk.is_stats_set
+                        )
+                        col.has_bloom_filter = has_bloom_filter
+                        break
     except Exception as e:
         console.print(
-            f"Error while printing bloom filter information: {e}",
+            f"Error while collecting bloom filter information: {e}",
             style="blink bold red underline on white",
         )
+def print_column_info_table(column_info: ParquetColumnInfo) -> None:
+    """
+    Prints the column information using a Rich table.
+    Args:
+        column_info: The ParquetColumnInfo model to display.
+    """
+    table = Table(title="Parquet Column Information")
+    # Add table columns
+    table.add_column("Row Group", justify="center", style="cyan")
+    table.add_column("Column Name", style="green")
+    table.add_column("Index", justify="center")
+    table.add_column("Compression", style="magenta")
+    table.add_column("Bloom Filter", justify="center")
+    # Add rows to the table
+    for col in column_info.columns:
+        table.add_row(
+            str(col.row_group),
+            col.column_name,
+            str(col.column_index),
+            col.compression_type,
+            "✅" if col.has_bloom_filter else "❌",
+        )
+    # Print the table
+    console.print(table)
 @app.command()
 def main(filename: str):
     """
@@ -173,8 +241,17 @@ def main(filename: str):
     (parquet_metadata, compression) = read_parquet_metadata(filename)
     print_parquet_metadata(parquet_metadata)
-    print_compression_types(parquet_metadata)
-    print_bloom_filter_info(parquet_metadata)
+    # Create a model to store column information
+    column_info = ParquetColumnInfo()
+    # Collect information
+    print_compression_types(parquet_metadata, column_info)
+    print_bloom_filter_info(parquet_metadata, column_info)
+    # Print the information as a table
+    print_column_info_table(column_info)
     print(f"Compression codecs: {compression}")

{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,13 +1,14 @@
 Metadata-Version: 2.4
 Name: iparq
-Version: 0.1.7
-Summary: Display version and compression information about a parquet file
+Version: 0.2.0
+Summary: Display version compression and bloom filter information about a parquet file
 Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
 License-File: LICENSE
 Requires-Python: >=3.9
-Requires-Dist: pyarrow>=19.0.0
-Requires-Dist: pydantic>=2.10.6
-Requires-Dist: typer>=0.15.1
+Requires-Dist: pyarrow
+Requires-Dist: pydantic
+Requires-Dist: rich
+Requires-Dist: typer[all]
 Provides-Extra: checks
 Requires-Dist: mypy>=1.14.1; extra == 'checks'
 Requires-Dist: ruff>=0.9.3; extra == 'checks'
@@ -32,6 +33,18 @@ Read more about bloom filters in this [great article](https://duckdb.org/2025/03
 ## Installation
+### Zero installation - Recommended
+1) Make sure to have Astral’s UV installed by following the steps here:
+    <https://docs.astral.sh/uv/getting-started/installation/>
+2) Execute the following command:
+    ```sh
+    uvx iparq yourparquet.parquet
+    ```
 ### Using pip
 1) Install the package using pip:

iparq-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
+iparq/py.typed,sha256=bOHAx3O6ryp453lBypAaF78WipxsJDO9hH0PZFTAWYs,54
+iparq/source.py,sha256=qyBNysMLX0FkjZVw5dPSRuhswX3GuRXvM79v7g7emWM,8482
+iparq-0.2.0.dist-info/METADATA,sha256=TwQ7wiLtdEwh3NwY2talWs4BM-oeJw55LCzgMnlr5Gc,7163
+iparq-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+iparq-0.2.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
+iparq-0.2.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
+iparq-0.2.0.dist-info/RECORD,,

iparq-0.1.7.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-iparq/__init__.py,sha256=sXLh7g3KC4QCFxcZGBTpG2scR7hmmBsMjq6LqRptkRg,22
-iparq/source.py,sha256=crKRTuZY6W2zEhFfAzb4XWopaVy9qnEkFqz4jbyGmeM,6439
-iparq-0.1.7.dist-info/METADATA,sha256=ku4ZsLQ1Iq2ovPzKqv8aGqBGBkn3nTviW6hFzFsP6bw,6884
-iparq-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-iparq-0.1.7.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
-iparq-0.1.7.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
-iparq-0.1.7.dist-info/RECORD,,

{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{iparq-0.1.7.dist-info → iparq-0.2.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

iparq 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl

iparq 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl