iparq 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
iparq/source.py CHANGED
@@ -57,6 +57,12 @@ class ColumnInfo(BaseModel):
57
57
  has_min_max (bool): Whether min/max statistics are available.
58
58
  min_value (Optional[str]): The minimum value in the column (as string for display).
59
59
  max_value (Optional[str]): The maximum value in the column (as string for display).
60
+ is_min_exact (Optional[bool]): Whether the min value is exact (PyArrow 22+).
61
+ is_max_exact (Optional[bool]): Whether the max value is exact (PyArrow 22+).
62
+ is_encrypted (Optional[bool]): Whether the column is encrypted.
63
+ num_values (Optional[int]): Number of values in this column chunk.
64
+ total_compressed_size (Optional[int]): Total compressed size in bytes.
65
+ total_uncompressed_size (Optional[int]): Total uncompressed size in bytes.
60
66
  """
61
67
 
62
68
  row_group: int
@@ -67,6 +73,12 @@ class ColumnInfo(BaseModel):
67
73
  has_min_max: Optional[bool] = False
68
74
  min_value: Optional[str] = None
69
75
  max_value: Optional[str] = None
76
+ is_min_exact: Optional[bool] = None
77
+ is_max_exact: Optional[bool] = None
78
+ is_encrypted: Optional[bool] = None
79
+ num_values: Optional[int] = None
80
+ total_compressed_size: Optional[int] = None
81
+ total_uncompressed_size: Optional[int] = None
70
82
 
71
83
 
72
84
  class ParquetColumnInfo(BaseModel):
@@ -158,6 +170,28 @@ def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) ->
158
170
  compression = column_chunk.compression
159
171
  column_name = parquet_metadata.schema.names[j]
160
172
 
173
+ # Get additional column chunk metadata
174
+ num_values = (
175
+ column_chunk.num_values
176
+ if hasattr(column_chunk, "num_values")
177
+ else None
178
+ )
179
+ total_compressed = (
180
+ column_chunk.total_compressed_size
181
+ if hasattr(column_chunk, "total_compressed_size")
182
+ else None
183
+ )
184
+ total_uncompressed = (
185
+ column_chunk.total_uncompressed_size
186
+ if hasattr(column_chunk, "total_uncompressed_size")
187
+ else None
188
+ )
189
+ is_encrypted = (
190
+ column_chunk.is_crypto_metadata_set()
191
+ if hasattr(column_chunk, "is_crypto_metadata_set")
192
+ else None
193
+ )
194
+
161
195
  # Create or update column info
162
196
  column_info.columns.append(
163
197
  ColumnInfo(
@@ -165,6 +199,10 @@ def print_compression_types(parquet_metadata, column_info: ParquetColumnInfo) ->
165
199
  column_name=column_name,
166
200
  column_index=j,
167
201
  compression_type=compression,
202
+ num_values=num_values,
203
+ total_compressed_size=total_compressed,
204
+ total_uncompressed_size=total_uncompressed,
205
+ is_encrypted=is_encrypted,
168
206
  )
169
207
  )
170
208
  except Exception as e:
@@ -252,6 +290,16 @@ def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -
252
290
  # Fallback for complex types that might not stringify well
253
291
  col.min_value = "<unable to display>"
254
292
  col.max_value = "<unable to display>"
293
+
294
+ # PyArrow 22+ feature: check if min/max values are exact
295
+ # This helps users understand if statistics can be trusted for query optimization
296
+ try:
297
+ if hasattr(stats, "is_min_value_exact"):
298
+ col.is_min_exact = stats.is_min_value_exact
299
+ if hasattr(stats, "is_max_value_exact"):
300
+ col.is_max_exact = stats.is_max_value_exact
301
+ except Exception:
302
+ pass # Not available in older PyArrow versions
255
303
  else:
256
304
  col.has_min_max = False
257
305
  break
@@ -262,12 +310,27 @@ def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -
262
310
  )
263
311
 
264
312
 
265
- def print_column_info_table(column_info: ParquetColumnInfo) -> None:
313
+ def format_size(size_bytes: Optional[int]) -> str:
314
+ """Format bytes into human-readable size."""
315
+ if size_bytes is None:
316
+ return "N/A"
317
+ size: float = float(size_bytes)
318
+ for unit in ["B", "KB", "MB", "GB"]:
319
+ if abs(size) < 1024.0:
320
+ return f"{size:.1f}{unit}"
321
+ size /= 1024.0
322
+ return f"{size:.1f}TB"
323
+
324
+
325
+ def print_column_info_table(
326
+ column_info: ParquetColumnInfo, show_sizes: bool = False
327
+ ) -> None:
266
328
  """
267
329
  Prints the column information using a Rich table.
268
330
 
269
331
  Args:
270
332
  column_info: The ParquetColumnInfo model to display.
333
+ show_sizes: Whether to show compressed/uncompressed size columns.
271
334
  """
272
335
  table = Table(title="Parquet Column Information")
273
336
 
@@ -276,9 +339,18 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
276
339
  table.add_column("Column Name", style="green")
277
340
  table.add_column("Index", justify="center")
278
341
  table.add_column("Compression", style="magenta")
279
- table.add_column("Bloom Filter", justify="center")
342
+ table.add_column("Bloom", justify="center")
343
+ table.add_column("Encrypted", justify="center")
280
344
  table.add_column("Min Value", style="yellow")
281
345
  table.add_column("Max Value", style="yellow")
346
+ table.add_column(
347
+ "Exact", justify="center", style="dim"
348
+ ) # Shows if min/max are exact
349
+
350
+ if show_sizes:
351
+ table.add_column("Values", justify="right")
352
+ table.add_column("Compressed", justify="right", style="blue")
353
+ table.add_column("Ratio", justify="right", style="blue")
282
354
 
283
355
  # Add rows to the table
284
356
  for col in column_info.columns:
@@ -290,15 +362,48 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
290
362
  col.max_value if col.has_min_max and col.max_value is not None else "N/A"
291
363
  )
292
364
 
293
- table.add_row(
365
+ # Format exactness indicator (PyArrow 22+ feature)
366
+ exact_display = "N/A"
367
+ if col.is_min_exact is not None and col.is_max_exact is not None:
368
+ if col.is_min_exact and col.is_max_exact:
369
+ exact_display = "✅"
370
+ elif col.is_min_exact or col.is_max_exact:
371
+ exact_display = "~" # Partially exact
372
+ else:
373
+ exact_display = "❌"
374
+
375
+ # Format encryption status
376
+ encrypted_display = "🔒" if col.is_encrypted else "—"
377
+
378
+ row_data = [
294
379
  str(col.row_group),
295
380
  col.column_name,
296
381
  str(col.column_index),
297
382
  col.compression_type,
298
383
  "✅" if col.has_bloom_filter else "❌",
384
+ encrypted_display,
299
385
  min_display,
300
386
  max_display,
301
- )
387
+ exact_display,
388
+ ]
389
+
390
+ if show_sizes:
391
+ # Calculate compression ratio
392
+ ratio = "N/A"
393
+ if col.total_compressed_size and col.total_uncompressed_size:
394
+ ratio = (
395
+ f"{col.total_uncompressed_size / col.total_compressed_size:.1f}x"
396
+ )
397
+
398
+ row_data.extend(
399
+ [
400
+ str(col.num_values) if col.num_values else "N/A",
401
+ format_size(col.total_compressed_size),
402
+ ratio,
403
+ ]
404
+ )
405
+
406
+ table.add_row(*row_data)
302
407
 
303
408
  # Print the table
304
409
  console.print(table)
@@ -331,6 +436,7 @@ def inspect_single_file(
331
436
  format: OutputFormat,
332
437
  metadata_only: bool,
333
438
  column_filter: Optional[str],
439
+ show_sizes: bool = False,
334
440
  ) -> None:
335
441
  """
336
442
  Inspect a single Parquet file and display its metadata, compression settings, and bloom filter information.
@@ -339,7 +445,7 @@ def inspect_single_file(
339
445
  Exception: If the file cannot be processed.
340
446
  """
341
447
  try:
342
- (parquet_metadata, compression) = read_parquet_metadata(filename)
448
+ parquet_metadata, compression = read_parquet_metadata(filename)
343
449
  except FileNotFoundError:
344
450
  raise Exception(f"Cannot open: {filename}.")
345
451
  except Exception as e:
@@ -382,7 +488,7 @@ def inspect_single_file(
382
488
 
383
489
  # Print column details if not metadata only
384
490
  if not metadata_only:
385
- print_column_info_table(column_info)
491
+ print_column_info_table(column_info, show_sizes=show_sizes)
386
492
  console.print(f"Compression codecs: {compression}")
387
493
 
388
494
 
@@ -404,6 +510,12 @@ def inspect(
404
510
  column_filter: Optional[str] = typer.Option(
405
511
  None, "--column", "-c", help="Filter results to show only specific column"
406
512
  ),
513
+ show_sizes: bool = typer.Option(
514
+ False,
515
+ "--sizes",
516
+ "-s",
517
+ help="Show column sizes and compression ratios",
518
+ ),
407
519
  ):
408
520
  """
409
521
  Inspect Parquet files and display their metadata, compression settings, and bloom filter information.
@@ -436,7 +548,9 @@ def inspect(
436
548
  console.print("─" * (len(filename) + 6))
437
549
 
438
550
  try:
439
- inspect_single_file(filename, format, metadata_only, column_filter)
551
+ inspect_single_file(
552
+ filename, format, metadata_only, column_filter, show_sizes
553
+ )
440
554
  except Exception as e:
441
555
  console.print(f"Error processing {filename}: {e}", style="red")
442
556
  continue
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: iparq
3
- Version: 0.4.1
3
+ Version: 0.5.0
4
4
  Summary: Display version compression and bloom filter information about a parquet file
5
5
  Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
6
6
  License-File: LICENSE
@@ -30,8 +30,12 @@ Description-Content-Type: text/markdown
30
30
  ![alt text](media/iparq.png)
31
31
  After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there's no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
32
32
 
33
- ***New*** Bloom filters information: Displays if there are bloom filters.
34
- Read more about bloom filters in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
33
+ ## Features
34
+
35
+ - **Bloom filters**: Displays if columns have bloom filters. Read more in this [great article](https://duckdb.org/2025/03/07/parquet-bloom-filters-in-duckdb.html).
36
+ - **Encryption detection**: Shows if columns are encrypted (🔒)
37
+ - **Statistics exactness**: Indicates if min/max statistics are exact or approximate (PyArrow 22+)
38
+ - **Compression ratios**: Optional display of column sizes and compression efficiency
35
39
 
36
40
  ## Installation
37
41
 
@@ -102,11 +106,12 @@ Options include:
102
106
  - `--format`, `-f`: Output format, either `rich` (default) or `json`
103
107
  - `--metadata-only`, `-m`: Show only file metadata without column details
104
108
  - `--column`, `-c`: Filter results to show only a specific column
109
+ - `--sizes`, `-s`: Show column sizes and compression ratios
105
110
 
106
111
  ### Single File Examples:
107
112
 
108
113
  ```sh
109
- # Basic inspection
114
+ # Basic inspection
110
115
  iparq inspect yourfile.parquet
111
116
 
112
117
  # Output in JSON format
@@ -117,6 +122,9 @@ iparq inspect yourfile.parquet --metadata-only
117
122
 
118
123
  # Filter to show only a specific column
119
124
  iparq inspect yourfile.parquet --column column_name
125
+
126
+ # Show column sizes and compression ratios
127
+ iparq inspect yourfile.parquet --sizes
120
128
  ```
121
129
 
122
130
  ### Multiple Files and Glob Patterns:
@@ -137,7 +145,7 @@ iparq inspect important.parquet temp_*.parquet
137
145
 
138
146
  When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
139
147
 
140
- ## Example output - Bloom Filters
148
+ ## Example output
141
149
 
142
150
  ```log
143
151
  ParquetMetaModel(
@@ -148,14 +156,29 @@ ParquetMetaModel(
148
156
  format_version='2.6',
149
157
  serialized_size=2223
150
158
  )
151
- Parquet Column Information
152
- ┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┓
153
- ┃ ┃ Column ┃ ┃ ┃ Bloom ┃ ┃ ┃
154
- ┃ Row Group ┃ Name ┃ Index ┃ Compress… ┃ Filter ┃ Min Value ┃ Max Value ┃
155
- ┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━┩
156
- │ 0 │ one │ 0 │ SNAPPY │ ✅ │ -1.0 │ 2.5 │
157
- │ 0 │ two │ 1 │ SNAPPY │ ✅ │ bar │ foo │
158
- │ 0 │ three │ 2 │ SNAPPY │ ✅ │ False │ True │
159
- └───────────┴────────────┴───────┴───────────┴────────────┴───────────┴───────────┘
159
+ Parquet Column Information
160
+ ┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
161
+ ┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom ┃ Encrypted ┃ Min Value ┃ Max Value ┃ Exact ┃
162
+ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
163
+ │ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │
164
+ │ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │
165
+ │ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │
166
+ └───────────┴─────────────┴───────┴─────────────┴───────┴───────────┴───────────┴───────────┴───────┘
160
167
  Compression codecs: {'SNAPPY'}
161
168
  ```
169
+
170
+ ### With `--sizes` flag
171
+
172
+ ```log
173
+ iparq inspect yourfile.parquet --sizes
174
+
175
+ Parquet Column Information
176
+ ┏━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
177
+ ┃ Row ┃ Column ┃ ┃ ┃ ┃ ┃ Min ┃ Max ┃ ┃ ┃ ┃ ┃
178
+ ┃ Group ┃ Name ┃ Index ┃ Compr… ┃ Bloom ┃ Encryp… ┃ Value ┃ Value ┃ Exact ┃ Values ┃ Compr… ┃ Ratio ┃
179
+ ┡━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
180
+ │ 0 │ one │ 0 │ SNAPPY │ ✅ │ — │ -1.0 │ 2.5 │ N/A │ 3 │ 104.0B │ 1.0x │
181
+ │ 0 │ two │ 1 │ SNAPPY │ ✅ │ — │ bar │ foo │ N/A │ 3 │ 80.0B │ 0.9x │
182
+ │ 0 │ three │ 2 │ SNAPPY │ ✅ │ — │ False │ True │ N/A │ 3 │ 42.0B │ 1.0x │
183
+ └────────┴─────────┴───────┴────────┴───────┴─────────┴────────┴─────────┴───────┴────────┴────────┴───────┘
184
+ ```
@@ -0,0 +1,8 @@
1
+ iparq/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
2
+ iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ iparq/source.py,sha256=6NG2-5aD0hswZ14BEfezoLXNo_D3GDnFWGi0WnY-wuo,20210
4
+ iparq-0.5.0.dist-info/METADATA,sha256=CO4Aj7pjbnFJeinzY7x25d1biLHw_EyHwxBH25LI_Lw,8401
5
+ iparq-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
6
+ iparq-0.5.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
7
+ iparq-0.5.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
8
+ iparq-0.5.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,8 +0,0 @@
1
- iparq/__init__.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
2
- iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- iparq/source.py,sha256=HT76AP19e-TheGB5cHlGqsyJGvdsPVMbAYuSo2xH0Bk,15476
4
- iparq-0.4.1.dist-info/METADATA,sha256=aKFzoI4pVaAa_D-fW-FeicRHO6PepqOfMuKBXySYS4I,6018
5
- iparq-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- iparq-0.4.1.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
7
- iparq-0.4.1.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
8
- iparq-0.4.1.dist-info/RECORD,,