iparq 0.2.6__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
iparq/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.6"
1
+ __version__ = "0.4.0"
iparq/source.py CHANGED
@@ -1,3 +1,4 @@
1
+ import glob
1
2
  import json
2
3
  from enum import Enum
3
4
  from typing import List, Optional
@@ -53,6 +54,9 @@ class ColumnInfo(BaseModel):
53
54
  column_index (int): The index of the column.
54
55
  compression_type (str): The compression type used for the column.
55
56
  has_bloom_filter (bool): Whether the column has a bloom filter.
57
+ has_min_max (bool): Whether min/max statistics are available.
58
+ min_value (Optional[str]): The minimum value in the column (as string for display).
59
+ max_value (Optional[str]): The maximum value in the column (as string for display).
56
60
  """
57
61
 
58
62
  row_group: int
@@ -60,6 +64,9 @@ class ColumnInfo(BaseModel):
60
64
  column_index: int
61
65
  compression_type: str
62
66
  has_bloom_filter: Optional[bool] = False
67
+ has_min_max: Optional[bool] = False
68
+ min_value: Optional[str] = None
69
+ max_value: Optional[str] = None
63
70
 
64
71
 
65
72
  class ParquetColumnInfo(BaseModel):
@@ -84,22 +91,16 @@ def read_parquet_metadata(filename: str):
84
91
  tuple: A tuple containing:
85
92
  - parquet_metadata (pyarrow.parquet.FileMetaData): The metadata of the Parquet file.
86
93
  - compression_codecs (set): A set of compression codecs used in the Parquet file.
87
- """
88
- try:
89
- compression_codecs = set([])
90
- parquet_metadata = pq.ParquetFile(filename).metadata
91
94
 
92
- for i in range(parquet_metadata.num_row_groups):
93
- for j in range(parquet_metadata.num_columns):
94
- compression_codecs.add(
95
- parquet_metadata.row_group(i).column(j).compression
96
- )
95
+ Raises:
96
+ FileNotFoundError: If the file cannot be found or opened.
97
+ """
98
+ compression_codecs = set([])
99
+ parquet_metadata = pq.ParquetFile(filename).metadata
97
100
 
98
- except FileNotFoundError:
99
- console.print(
100
- f"Cannot open: {filename}.", style="blink bold red underline on white"
101
- )
102
- exit(1)
101
+ for i in range(parquet_metadata.num_row_groups):
102
+ for j in range(parquet_metadata.num_columns):
103
+ compression_codecs.add(parquet_metadata.row_group(i).column(j).compression)
103
104
 
104
105
  return parquet_metadata, compression_codecs
105
106
 
@@ -208,6 +209,59 @@ def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) ->
208
209
  )
209
210
 
210
211
 
212
+ def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -> None:
213
+ """
214
+ Updates the column_info model with min/max statistics information.
215
+
216
+ Args:
217
+ parquet_metadata: The Parquet file metadata.
218
+ column_info: The ParquetColumnInfo model to update.
219
+ """
220
+ try:
221
+ num_row_groups = parquet_metadata.num_row_groups
222
+ num_columns = parquet_metadata.num_columns
223
+
224
+ for i in range(num_row_groups):
225
+ row_group = parquet_metadata.row_group(i)
226
+
227
+ for j in range(num_columns):
228
+ column_chunk = row_group.column(j)
229
+
230
+ # Find the corresponding column in our model
231
+ for col in column_info.columns:
232
+ if col.row_group == i and col.column_index == j:
233
+ # Check if this column has statistics
234
+ if column_chunk.is_stats_set:
235
+ stats = column_chunk.statistics
236
+ col.has_min_max = stats.has_min_max
237
+
238
+ if stats.has_min_max:
239
+ # Convert values to string for display, handling potential None values
240
+ try:
241
+ col.min_value = (
242
+ str(stats.min)
243
+ if stats.min is not None
244
+ else "null"
245
+ )
246
+ col.max_value = (
247
+ str(stats.max)
248
+ if stats.max is not None
249
+ else "null"
250
+ )
251
+ except Exception:
252
+ # Fallback for complex types that might not stringify well
253
+ col.min_value = "<unable to display>"
254
+ col.max_value = "<unable to display>"
255
+ else:
256
+ col.has_min_max = False
257
+ break
258
+ except Exception as e:
259
+ console.print(
260
+ f"Error while collecting min/max statistics: {e}",
261
+ style="blink bold red underline on white",
262
+ )
263
+
264
+
211
265
  def print_column_info_table(column_info: ParquetColumnInfo) -> None:
212
266
  """
213
267
  Prints the column information using a Rich table.
@@ -223,15 +277,27 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
223
277
  table.add_column("Index", justify="center")
224
278
  table.add_column("Compression", style="magenta")
225
279
  table.add_column("Bloom Filter", justify="center")
280
+ table.add_column("Min Value", style="yellow")
281
+ table.add_column("Max Value", style="yellow")
226
282
 
227
283
  # Add rows to the table
228
284
  for col in column_info.columns:
285
+ # Format min/max values for display
286
+ min_display = (
287
+ col.min_value if col.has_min_max and col.min_value is not None else "N/A"
288
+ )
289
+ max_display = (
290
+ col.max_value if col.has_min_max and col.max_value is not None else "N/A"
291
+ )
292
+
229
293
  table.add_row(
230
294
  str(col.row_group),
231
295
  col.column_name,
232
296
  str(col.column_index),
233
297
  col.compression_type,
234
298
  "✅" if col.has_bloom_filter else "❌",
299
+ min_display,
300
+ max_display,
235
301
  )
236
302
 
237
303
  # Print the table
@@ -260,27 +326,24 @@ def output_json(
260
326
  print(json.dumps(result, indent=2))
261
327
 
262
328
 
263
- @app.command(name="")
264
- @app.command(name="inspect")
265
- def inspect(
266
- filename: str = typer.Argument(..., help="Path to the Parquet file to inspect"),
267
- format: OutputFormat = typer.Option(
268
- OutputFormat.RICH, "--format", "-f", help="Output format (rich or json)"
269
- ),
270
- metadata_only: bool = typer.Option(
271
- False,
272
- "--metadata-only",
273
- "-m",
274
- help="Show only file metadata without column details",
275
- ),
276
- column_filter: Optional[str] = typer.Option(
277
- None, "--column", "-c", help="Filter results to show only specific column"
278
- ),
279
- ):
329
+ def inspect_single_file(
330
+ filename: str,
331
+ format: OutputFormat,
332
+ metadata_only: bool,
333
+ column_filter: Optional[str],
334
+ ) -> None:
280
335
  """
281
- Inspect a Parquet file and display its metadata, compression settings, and bloom filter information.
336
+ Inspect a single Parquet file and display its metadata, compression settings, and bloom filter information.
337
+
338
+ Raises:
339
+ Exception: If the file cannot be processed.
282
340
  """
283
- (parquet_metadata, compression) = read_parquet_metadata(filename)
341
+ try:
342
+ (parquet_metadata, compression) = read_parquet_metadata(filename)
343
+ except FileNotFoundError:
344
+ raise Exception(f"Cannot open: {filename}.")
345
+ except Exception as e:
346
+ raise Exception(f"Failed to read metadata: {e}")
284
347
 
285
348
  # Create metadata model
286
349
  meta_model = ParquetMetaModel(
@@ -298,6 +361,7 @@ def inspect(
298
361
  # Collect information
299
362
  print_compression_types(parquet_metadata, column_info)
300
363
  print_bloom_filter_info(parquet_metadata, column_info)
364
+ print_min_max_statistics(parquet_metadata, column_info)
301
365
 
302
366
  # Filter columns if requested
303
367
  if column_filter:
@@ -322,5 +386,61 @@ def inspect(
322
386
  console.print(f"Compression codecs: {compression}")
323
387
 
324
388
 
389
+ @app.command(name="")
390
+ @app.command(name="inspect")
391
+ def inspect(
392
+ filenames: List[str] = typer.Argument(
393
+ ..., help="Path(s) or pattern(s) to Parquet files to inspect"
394
+ ),
395
+ format: OutputFormat = typer.Option(
396
+ OutputFormat.RICH, "--format", "-f", help="Output format (rich or json)"
397
+ ),
398
+ metadata_only: bool = typer.Option(
399
+ False,
400
+ "--metadata-only",
401
+ "-m",
402
+ help="Show only file metadata without column details",
403
+ ),
404
+ column_filter: Optional[str] = typer.Option(
405
+ None, "--column", "-c", help="Filter results to show only specific column"
406
+ ),
407
+ ):
408
+ """
409
+ Inspect Parquet files and display their metadata, compression settings, and bloom filter information.
410
+ """
411
+ # Expand glob patterns and collect all matching files
412
+ all_files = []
413
+ for pattern in filenames:
414
+ matches = glob.glob(pattern)
415
+ if matches:
416
+ all_files.extend(matches)
417
+ else:
418
+ # If no matches found, treat as literal filename (for better error reporting)
419
+ all_files.append(pattern)
420
+
421
+ # Remove duplicates while preserving order
422
+ seen = set()
423
+ unique_files = []
424
+ for file in all_files:
425
+ if file not in seen:
426
+ seen.add(file)
427
+ unique_files.append(file)
428
+
429
+ # Process each file
430
+ for i, filename in enumerate(unique_files):
431
+ # For multiple files, add a header to separate results
432
+ if len(unique_files) > 1:
433
+ if i > 0:
434
+ console.print() # Add blank line between files
435
+ console.print(f"[bold blue]File: {filename}[/bold blue]")
436
+ console.print("─" * (len(filename) + 6))
437
+
438
+ try:
439
+ inspect_single_file(filename, format, metadata_only, column_filter)
440
+ except Exception as e:
441
+ console.print(f"Error processing {filename}: {e}", style="red")
442
+ continue
443
+
444
+
325
445
  if __name__ == "__main__":
326
446
  app()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: iparq
3
- Version: 0.2.6
3
+ Version: 0.4.0
4
4
  Summary: Display version, compression, and bloom filter information about a parquet file
5
5
  Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
6
6
  License-File: LICENSE
@@ -88,10 +88,10 @@ Read more about bloom filters in this [great article](https://duckdb.org/2025/03
88
88
 
89
89
  ## Usage
90
90
 
91
- iparq now supports additional options:
91
+ iparq supports inspecting single files, multiple files, and glob patterns:
92
92
 
93
93
  ```sh
94
- iparq inspect <filename> [OPTIONS]
94
+ iparq inspect <filename(s)> [OPTIONS]
95
95
  ```
96
96
 
97
97
  Options include:
@@ -100,9 +100,12 @@ Options include:
100
100
  - `--metadata-only`, `-m`: Show only file metadata without column details
101
101
  - `--column`, `-c`: Filter results to show only a specific column
102
102
 
103
- Examples:
103
+ ### Single File Examples:
104
104
 
105
105
  ```sh
106
+ # Basic inspection
107
+ iparq inspect yourfile.parquet
108
+
106
109
  # Output in JSON format
107
110
  iparq inspect yourfile.parquet --format json
108
111
 
@@ -113,9 +116,25 @@ iparq inspect yourfile.parquet --metadata-only
113
116
  iparq inspect yourfile.parquet --column column_name
114
117
  ```
115
118
 
116
- Replace `<filename>` with the path to your .parquet file. The utility will read the metadata of the file and print the compression codecs used in the parquet file.
119
+ ### Multiple Files and Glob Patterns:
120
+
121
+ ```sh
122
+ # Inspect multiple specific files
123
+ iparq inspect file1.parquet file2.parquet file3.parquet
124
+
125
+ # Use glob patterns to inspect all parquet files
126
+ iparq inspect *.parquet
127
+
128
+ # Use specific patterns
129
+ iparq inspect yellow*.parquet data_*.parquet
130
+
131
+ # Combine patterns and specific files
132
+ iparq inspect important.parquet temp_*.parquet
133
+ ```
134
+
135
+ When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
117
136
 
118
- ## Example ouput - Bloom Filters
137
+ ## Example output - Bloom Filters
119
138
 
120
139
  ```log
121
140
  ParquetMetaModel(
@@ -0,0 +1,8 @@
1
+ iparq/__init__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
2
+ iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ iparq/source.py,sha256=HT76AP19e-TheGB5cHlGqsyJGvdsPVMbAYuSo2xH0Bk,15476
4
+ iparq-0.4.0.dist-info/METADATA,sha256=D6parmkBDsraB3C57YX9yistPxOYK-XHi_nBKJ2BGB0,6019
5
+ iparq-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ iparq-0.4.0.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
7
+ iparq-0.4.0.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
8
+ iparq-0.4.0.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- iparq/__init__.py,sha256=Oz5HbwHMyE87nmwV80AZzpkJPf-wBg7eDuJr_BXZkhU,22
2
- iparq/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- iparq/source.py,sha256=huC6I0hqwyv4BZ5xjI6FMZs9KH60xVHEKbmX6X8hhiA,10721
4
- iparq-0.2.6.dist-info/METADATA,sha256=LtiLJlVCHuOlx0gOOTqJ97S8baPFhnd_lOmKG2a-94g,5496
5
- iparq-0.2.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- iparq-0.2.6.dist-info/entry_points.txt,sha256=vrE2lwvuheySWTOJdr_gh9AT47ck02WCHo0muRq5HS8,43
7
- iparq-0.2.6.dist-info/licenses/LICENSE,sha256=apqXCIYD_rrtbJVE-Ex1-1X7N0cBwZTOm4KL3TEFmYA,1067
8
- iparq-0.2.6.dist-info/RECORD,,
File without changes