iparq 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. iparq-0.4.1/.github/FUNDING.yml +4 -0
  2. {iparq-0.3.0 → iparq-0.4.1}/.github/dependabot.yml +1 -1
  3. {iparq-0.3.0 → iparq-0.4.1}/.github/workflows/python-package.yml +12 -2
  4. iparq-0.4.1/.github/workflows/test.yml +37 -0
  5. {iparq-0.3.0 → iparq-0.4.1}/.gitignore +1 -0
  6. {iparq-0.3.0 → iparq-0.4.1}/PKG-INFO +20 -23
  7. {iparq-0.3.0 → iparq-0.4.1}/README.md +18 -22
  8. {iparq-0.3.0 → iparq-0.4.1}/pyproject.toml +9 -2
  9. iparq-0.4.1/src/iparq/__init__.py +1 -0
  10. {iparq-0.3.0 → iparq-0.4.1}/src/iparq/source.py +72 -0
  11. {iparq-0.3.0 → iparq-0.4.1}/tests/test_cli.py +28 -19
  12. iparq-0.4.1/uv.lock +923 -0
  13. iparq-0.3.0/src/iparq/__init__.py +0 -1
  14. iparq-0.3.0/uv.lock +0 -547
  15. {iparq-0.3.0 → iparq-0.4.1}/.github/copilot-instructions.md +0 -0
  16. {iparq-0.3.0 → iparq-0.4.1}/.github/workflows/copilot-setup-steps.yml +0 -0
  17. {iparq-0.3.0 → iparq-0.4.1}/.github/workflows/merge.yml +0 -0
  18. {iparq-0.3.0 → iparq-0.4.1}/.github/workflows/python-publish.yml +0 -0
  19. {iparq-0.3.0 → iparq-0.4.1}/.python-version +0 -0
  20. {iparq-0.3.0 → iparq-0.4.1}/.vscode/launch.json +0 -0
  21. {iparq-0.3.0 → iparq-0.4.1}/.vscode/settings.json +0 -0
  22. {iparq-0.3.0 → iparq-0.4.1}/CONTRIBUTING.md +0 -0
  23. {iparq-0.3.0 → iparq-0.4.1}/LICENSE +0 -0
  24. {iparq-0.3.0 → iparq-0.4.1}/dummy.parquet +0 -0
  25. {iparq-0.3.0 → iparq-0.4.1}/media/iparq.png +0 -0
  26. {iparq-0.3.0 → iparq-0.4.1}/src/iparq/py.typed +0 -0
  27. {iparq-0.3.0 → iparq-0.4.1}/tests/conftest.py +0 -0
  28. {iparq-0.3.0 → iparq-0.4.1}/tests/dummy.parquet +0 -0
@@ -0,0 +1,4 @@
+# These are supported funding model platforms
+
+github: [MiguelElGallo]
+
@@ -5,7 +5,7 @@
 
 version: 2
 updates:
-  - package-ecosystem: "pip" # See documentation for possible values
+  - package-ecosystem: "uv" # See documentation for possible values
     directory: "/" # Location of package manifests
     schedule:
       interval: "weekly"
@@ -45,6 +45,16 @@ jobs:
         uv run mypy . --config-file=../../pyproject.toml
     - name: Check formatting with black
       run: uvx black . --check --verbose
-    - name: Run Python tests
+    - name: Run Python tests with coverage
       if: runner.os != 'Windows'
-      run: uv run pytest -vv
+      run: uv run pytest -vv --cov=src/iparq --cov-report=xml --cov-report=term-missing
+
+    - name: Upload coverage to Codecov
+      if: runner.os != 'Windows'
+      uses: codecov/codecov-action@v5
+      with:
+        files: ./coverage.xml
+        fail_ci_if_error: false
+        verbose: true
+      env:
+        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -0,0 +1,37 @@
+name: Run Tests
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+  workflow_dispatch:
+
+jobs:
+  test:
+    permissions:
+      contents: read
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install dependencies
+        run: uv sync --all-extras
+
+      - name: Run tests
+        run: uv run pytest -vv
@@ -172,3 +172,4 @@ cython_debug/
 .github/.DS_Store
 yellow_tripdata_2024-01.parquet
 filter.parquet
+.DS_Store
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: iparq
-Version: 0.3.0
+Version: 0.4.1
 Summary: Display version compression and bloom filter information about a parquet file
 Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
 License-File: LICENSE
@@ -13,6 +13,7 @@ Provides-Extra: checks
 Requires-Dist: mypy>=1.14.1; extra == 'checks'
 Requires-Dist: ruff>=0.9.3; extra == 'checks'
 Provides-Extra: test
+Requires-Dist: pytest-cov>=4.0.0; extra == 'test'
 Requires-Dist: pytest>=7.0; extra == 'test'
 Description-Content-Type: text/markdown
 
@@ -24,6 +25,8 @@ Description-Content-Type: text/markdown
 
 [![Upload Python Package](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml/badge.svg)](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
 
+[![codecov](https://codecov.io/gh/MiguelElGallo/iparq/branch/main/graph/badge.svg)](https://codecov.io/gh/MiguelElGallo/iparq)
+
 ![alt text](media/iparq.png)
 After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there's no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
 
@@ -134,31 +137,25 @@ iparq inspect important.parquet temp_*.parquet
 
 When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
 
-## Example ouput - Bloom Filters
+## Example output - Bloom Filters
 
 ```log
 ParquetMetaModel(
-    created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
-    num_columns=1,
-    num_rows=100000000,
-    num_row_groups=10,
-    format_version='1.0',
-    serialized_size=1196
+    created_by='parquet-cpp-arrow version 14.0.2',
+    num_columns=3,
+    num_rows=3,
+    num_row_groups=1,
+    format_version='2.6',
+    serialized_size=2223
 )
-                   Parquet Column Information
-┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
-┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom Filter ┃
-┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
-│ 0         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 1         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 2         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 3         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 4         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 5         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 6         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 7         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 8         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 9         │ r           │ 0     │ SNAPPY      │ ✅           │
-└───────────┴─────────────┴───────┴─────────────┴──────────────┘
+                            Parquet Column Information
+┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┓
+┃           ┃ Column     ┃       ┃           ┃ Bloom      ┃           ┃           ┃
+┃ Row Group ┃ Name       ┃ Index ┃ Compress… ┃ Filter     ┃ Min Value ┃ Max Value ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━┩
+│ 0         │ one        │ 0     │ SNAPPY    │ ✅         │ -1.0      │ 2.5       │
+│ 0         │ two        │ 1     │ SNAPPY    │ ✅         │ bar       │ foo       │
+│ 0         │ three      │ 2     │ SNAPPY    │ ✅         │ False     │ True      │
+└───────────┴────────────┴───────┴───────────┴────────────┴───────────┴───────────┘
 Compression codecs: {'SNAPPY'}
 ```
@@ -6,6 +6,8 @@
 
 [![Upload Python Package](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml/badge.svg)](https://github.com/MiguelElGallo/iparq/actions/workflows/python-publish.yml)
 
+[![codecov](https://codecov.io/gh/MiguelElGallo/iparq/branch/main/graph/badge.svg)](https://codecov.io/gh/MiguelElGallo/iparq)
+
 ![alt text](media/iparq.png)
 After reading [this blog](https://duckdb.org/2025/01/22/parquet-encodings.html), I began to wonder which Parquet version and compression methods the everyday tools we rely on actually use, only to find that there's no straightforward way to determine this. That curiosity and the difficulty of quickly discovering such details motivated me to create iparq (Information Parquet). My goal with iparq is to help users easily identify the specifics of the Parquet files generated by different engines, making it clear which features—like newer encodings or certain compression algorithms—the creator of the parquet is using.
 
@@ -116,31 +118,25 @@ iparq inspect important.parquet temp_*.parquet
 
 When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
 
-## Example ouput - Bloom Filters
+## Example output - Bloom Filters
 
 ```log
 ParquetMetaModel(
-    created_by='DuckDB version v1.2.1 (build 8e52ec4395)',
-    num_columns=1,
-    num_rows=100000000,
-    num_row_groups=10,
-    format_version='1.0',
-    serialized_size=1196
+    created_by='parquet-cpp-arrow version 14.0.2',
+    num_columns=3,
+    num_rows=3,
+    num_row_groups=1,
+    format_version='2.6',
+    serialized_size=2223
 )
-                   Parquet Column Information
-┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
-┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom Filter ┃
-┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
-│ 0         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 1         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 2         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 3         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 4         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 5         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 6         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 7         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 8         │ r           │ 0     │ SNAPPY      │ ✅           │
-│ 9         │ r           │ 0     │ SNAPPY      │ ✅           │
-└───────────┴─────────────┴───────┴─────────────┴──────────────┘
+                            Parquet Column Information
+┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┓
+┃           ┃ Column     ┃       ┃           ┃ Bloom      ┃           ┃           ┃
+┃ Row Group ┃ Name       ┃ Index ┃ Compress… ┃ Filter     ┃ Min Value ┃ Max Value ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━┩
+│ 0         │ one        │ 0     │ SNAPPY    │ ✅         │ -1.0      │ 2.5       │
+│ 0         │ two        │ 1     │ SNAPPY    │ ✅         │ bar       │ foo       │
+│ 0         │ three      │ 2     │ SNAPPY    │ ✅         │ False     │ True      │
+└───────────┴────────────┴───────┴───────────┴────────────┴───────────┴───────────┘
 Compression codecs: {'SNAPPY'}
 ```
@@ -1,6 +1,6 @@
 [project]
 name = "iparq"
-version = "0.3.0"
+version = "0.4.1"
 description = "Display version compression and bloom filter information about a parquet file"
 readme = "README.md"
 authors = [
@@ -17,6 +17,7 @@ dependencies = [
 [project.optional-dependencies]
 test = [
     "pytest>=7.0",
+    "pytest-cov>=4.0.0",
 ]
 checks = [
     "mypy>=1.14.1",
@@ -38,4 +39,10 @@ testpaths = [
 
 [[tool.mypy.overrides]]
 module = ["pyarrow.*"]
-ignore_missing_imports = true
+ignore_missing_imports = true
+
+[dependency-groups]
+dev = [
+    "pytest>=8.4.1",
+    "pytest-cov>=4.0.0",
+]
@@ -0,0 +1 @@
+__version__ = "0.4.1"
@@ -54,6 +54,9 @@ class ColumnInfo(BaseModel):
         column_index (int): The index of the column.
         compression_type (str): The compression type used for the column.
         has_bloom_filter (bool): Whether the column has a bloom filter.
+        has_min_max (bool): Whether min/max statistics are available.
+        min_value (Optional[str]): The minimum value in the column (as string for display).
+        max_value (Optional[str]): The maximum value in the column (as string for display).
     """
 
     row_group: int
@@ -61,6 +64,9 @@ class ColumnInfo(BaseModel):
     column_index: int
     compression_type: str
     has_bloom_filter: Optional[bool] = False
+    has_min_max: Optional[bool] = False
+    min_value: Optional[str] = None
+    max_value: Optional[str] = None
 
 
 class ParquetColumnInfo(BaseModel):
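
Aside on the hunk above: the three new fields ride along through pydantic serialization, which is what the `--json` assertions near the end of this diff depend on. A minimal standalone sketch (assuming pydantic v2 for `model_dump_json`; the field list mirrors the hunk, the sample values mirror the bundled dummy.parquet, everything else is illustrative):

```python
from typing import Optional

from pydantic import BaseModel


class ColumnInfo(BaseModel):
    row_group: int
    column_name: str
    column_index: int
    compression_type: str
    has_bloom_filter: Optional[bool] = False
    # New in 0.4.1: min/max statistics, stored as display strings.
    has_min_max: Optional[bool] = False
    min_value: Optional[str] = None
    max_value: Optional[str] = None


# Values mirror the "one" column of the repo's dummy.parquet fixture.
col = ColumnInfo(
    row_group=0,
    column_name="one",
    column_index=0,
    compression_type="SNAPPY",
    has_bloom_filter=True,
    has_min_max=True,
    min_value="-1.0",
    max_value="2.5",
)
print(col.model_dump_json())  # the new keys appear in iparq's JSON output
```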
@@ -203,6 +209,59 @@ def print_bloom_filter_info(parquet_metadata, column_info: ParquetColumnInfo) ->
     )
 
 
+def print_min_max_statistics(parquet_metadata, column_info: ParquetColumnInfo) -> None:
+    """
+    Updates the column_info model with min/max statistics information.
+
+    Args:
+        parquet_metadata: The Parquet file metadata.
+        column_info: The ParquetColumnInfo model to update.
+    """
+    try:
+        num_row_groups = parquet_metadata.num_row_groups
+        num_columns = parquet_metadata.num_columns
+
+        for i in range(num_row_groups):
+            row_group = parquet_metadata.row_group(i)
+
+            for j in range(num_columns):
+                column_chunk = row_group.column(j)
+
+                # Find the corresponding column in our model
+                for col in column_info.columns:
+                    if col.row_group == i and col.column_index == j:
+                        # Check if this column has statistics
+                        if column_chunk.is_stats_set:
+                            stats = column_chunk.statistics
+                            col.has_min_max = stats.has_min_max
+
+                            if stats.has_min_max:
+                                # Convert values to string for display, handling potential None values
+                                try:
+                                    col.min_value = (
+                                        str(stats.min)
+                                        if stats.min is not None
+                                        else "null"
+                                    )
+                                    col.max_value = (
+                                        str(stats.max)
+                                        if stats.max is not None
+                                        else "null"
+                                    )
+                                except Exception:
+                                    # Fallback for complex types that might not stringify well
+                                    col.min_value = "<unable to display>"
+                                    col.max_value = "<unable to display>"
+                        else:
+                            col.has_min_max = False
+                        break
+    except Exception as e:
+        console.print(
+            f"Error while collecting min/max statistics: {e}",
+            style="blink bold red underline on white",
+        )
+
+
 def print_column_info_table(column_info: ParquetColumnInfo) -> None:
     """
     Prints the column information using a Rich table.
@@ -218,15 +277,27 @@ def print_column_info_table(column_info: ParquetColumnInfo) -> None:
     table.add_column("Index", justify="center")
     table.add_column("Compression", style="magenta")
     table.add_column("Bloom Filter", justify="center")
+    table.add_column("Min Value", style="yellow")
+    table.add_column("Max Value", style="yellow")
 
     # Add rows to the table
     for col in column_info.columns:
+        # Format min/max values for display
+        min_display = (
+            col.min_value if col.has_min_max and col.min_value is not None else "N/A"
+        )
+        max_display = (
+            col.max_value if col.has_min_max and col.max_value is not None else "N/A"
+        )
+
         table.add_row(
             str(col.row_group),
             col.column_name,
             str(col.column_index),
             col.compression_type,
             "✅" if col.has_bloom_filter else "❌",
+            min_display,
+            max_display,
         )
 
     # Print the table
@@ -290,6 +361,7 @@ def inspect_single_file(
     # Collect information
     print_compression_types(parquet_metadata, column_info)
     print_bloom_filter_info(parquet_metadata, column_info)
+    print_min_max_statistics(parquet_metadata, column_info)
 
     # Filter columns if requested
     if column_filter:
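
For readers skimming the source.py hunks: the new `print_min_max_statistics` pass boils down to PyArrow's per-column-chunk statistics API. A minimal sketch of the same lookup outside iparq (the file name is just the repo's bundled fixture; any Parquet file works):

```python
import pyarrow.parquet as pq

# Read only the footer metadata; no column data is loaded.
metadata = pq.ParquetFile("dummy.parquet").metadata
for i in range(metadata.num_row_groups):
    row_group = metadata.row_group(i)
    for j in range(metadata.num_columns):
        chunk = row_group.column(j)
        # Writers may omit statistics, so guard before dereferencing.
        if chunk.is_stats_set and chunk.statistics.has_min_max:
            stats = chunk.statistics
            print(f"rg={i} col={chunk.path_in_schema}: min={stats.min} max={stats.max}")
```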
@@ -17,25 +17,24 @@ def test_parquet_info():
 
     assert result.exit_code == 0
 
-    expected_output = """ParquetMetaModel(
-    created_by='parquet-cpp-arrow version 14.0.2',
-    num_columns=3,
-    num_rows=3,
-    num_row_groups=1,
-    format_version='2.6',
-    serialized_size=2223
-)
-                   Parquet Column Information
-┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
-┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom Filter ┃
-┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
-│ 0         │ one         │ 0     │ SNAPPY      │ ✅           │
-│ 0         │ two         │ 1     │ SNAPPY      │ ✅           │
-│ 0         │ three       │ 2     │ SNAPPY      │ ✅           │
-└───────────┴─────────────┴───────┴─────────────┴──────────────┘
-Compression codecs: {'SNAPPY'}"""
-
-    assert expected_output in result.stdout
+    # Check for key components instead of exact table format
+    assert "ParquetMetaModel" in result.stdout
+    assert "created_by='parquet-cpp-arrow version 14.0.2'" in result.stdout
+    assert "num_columns=3" in result.stdout
+    assert "num_rows=3" in result.stdout
+    assert "Parquet Column Information" in result.stdout
+    assert "Min Value" in result.stdout
+    assert (
+        "Value" in result.stdout
+    )  # This covers "Max Value" which is split across lines
+    assert "one" in result.stdout and "-1.0" in result.stdout and "2.5" in result.stdout
+    assert "two" in result.stdout and "bar" in result.stdout and "foo" in result.stdout
+    assert (
+        "three" in result.stdout
+        and "False" in result.stdout
+        and "True" in result.stdout
+    )
+    assert "Compression codecs: {'SNAPPY'}" in result.stdout
 
 
 def test_metadata_only_flag():
@@ -77,6 +76,16 @@ def test_json_output():
     assert "compression_codecs" in data
     assert data["metadata"]["num_columns"] == 3
 
+    # Check that min/max statistics are included
+    for column in data["columns"]:
+        assert "has_min_max" in column
+        assert "min_value" in column
+        assert "max_value" in column
+        # For our test data, all columns should have min/max stats
+        assert column["has_min_max"] is True
+        assert column["min_value"] is not None
+        assert column["max_value"] is not None
+
 
 def test_multiple_files():
     """Test that multiple files can be inspected in a single command."""