iparq 0.2.5__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iparq-0.3.0/.github/workflows/copilot-setup-steps.yml +31 -0
- {iparq-0.2.5 → iparq-0.3.0}/.github/workflows/merge.yml +1 -1
- {iparq-0.2.5 → iparq-0.3.0}/.github/workflows/python-package.yml +1 -1
- {iparq-0.2.5 → iparq-0.3.0}/.github/workflows/python-publish.yml +67 -1
- {iparq-0.2.5 → iparq-0.3.0}/PKG-INFO +25 -6
- {iparq-0.2.5 → iparq-0.3.0}/README.md +23 -4
- {iparq-0.2.5 → iparq-0.3.0}/pyproject.toml +2 -2
- iparq-0.3.0/src/iparq/__init__.py +1 -0
- {iparq-0.2.5 → iparq-0.3.0}/src/iparq/source.py +81 -33
- iparq-0.3.0/tests/test_cli.py +164 -0
- iparq-0.3.0/uv.lock +547 -0
- iparq-0.2.5/src/iparq/__init__.py +0 -1
- iparq-0.2.5/tests/test_cli.py +0 -78
- iparq-0.2.5/uv.lock +0 -472
- {iparq-0.2.5 → iparq-0.3.0}/.github/copilot-instructions.md +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/.github/dependabot.yml +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/.gitignore +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/.python-version +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/.vscode/launch.json +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/.vscode/settings.json +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/CONTRIBUTING.md +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/LICENSE +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/dummy.parquet +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/media/iparq.png +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/src/iparq/py.typed +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/tests/conftest.py +0 -0
- {iparq-0.2.5 → iparq-0.3.0}/tests/dummy.parquet +0 -0
iparq-0.3.0/.github/workflows/copilot-setup-steps.yml

@@ -0,0 +1,31 @@
+name: "Copilot Setup Steps"
+
+# Allow testing of the setup steps from your repository's "Actions" tab.
+on: workflow_dispatch
+
+jobs:
+  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
+  copilot-setup-steps:
+    runs-on: ubuntu-latest
+
+    # Set the permissions to the lowest permissions possible needed for your steps.
+    # Copilot will be given its own token for its operations.
+    permissions:
+      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
+      contents: read
+
+    # You can define any steps you want, and they will run before the agent starts.
+    # If you do not check out your code, Copilot will do this for you.
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install UV (Python package manager)
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          export PATH="$HOME/.cargo/bin:$PATH"
+          echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+          uv --version
+
+      # Note: GitHub MCP server is not publicly available as npm package
+      # Remove this step until the package is officially released
{iparq-0.2.5 → iparq-0.3.0}/.github/workflows/python-publish.yml

@@ -15,11 +15,71 @@ permissions:
   contents: read
 
 jobs:
+  test-and-validate:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: "Set up Python"
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: "pyproject.toml"
+
+      - name: Update dependencies and sync
+        run: |
+          uv lock --upgrade
+          uv sync --all-extras
+
+      - name: Run linting
+        run: |
+          uv run ruff check .
+
+      - name: Run type checking
+        run: |
+          cd src/iparq
+          uv run mypy . --config-file=../../pyproject.toml
+
+      - name: Run tests
+        run: |
+          uv run pytest -v
+
+      - name: Test package build
+        run: |
+          uv build
+
+      - name: Test package installation in clean environment
+        run: |
+          # Test that the built package can be installed and imported
+          python -m venv test_install_env
+          source test_install_env/bin/activate
+          # Install the latest wheel file
+          pip install $(ls -t dist/*.whl | head -1)
+          python -c "import iparq; print(f'Successfully imported iparq version {iparq.__version__}')"
+          iparq --help
+          deactivate
+
+      - name: Upload test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results
+          path: |
+            .coverage
+            htmlcov/
+
   release-build:
     permissions:
       contents: read
       pull-requests: write
     runs-on: ubuntu-latest
+    needs: test-and-validate
 
     steps:
       - uses: actions/checkout@v4
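The final "clean environment" step doubles as an install smoke test: build a wheel, install it into a fresh venv, import the package, and invoke the console script. A rough local equivalent, sketched in Python (it assumes an iparq wheel has already been installed into the active environment):

```python
# Rough local equivalent of the workflow's install smoke test.
# Assumes an iparq wheel has already been installed into this environment.
import subprocess

import iparq  # the import itself fails loudly if the wheel is broken

print(f"Successfully imported iparq version {iparq.__version__}")

# The console script should exit 0 if entry points were installed correctly.
subprocess.run(["iparq", "--help"], check=True)
```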
@@ -32,9 +92,14 @@ jobs:
         with:
           python-version-file: "pyproject.toml"
 
+      - name: Update dependencies and sync
+        run: |
+          uv lock --upgrade
+          uv sync --all-extras
+
       - name: Build release distributions
         run: |
-
+          uv build
 
       - name: Upload distributions
         uses: actions/upload-artifact@v4
@@ -45,6 +110,7 @@ jobs:
   pypi-publish:
     runs-on: ubuntu-latest
     needs:
+      - test-and-validate
       - release-build
     permissions:
       # IMPORTANT: this permission is mandatory for trusted publishing
{iparq-0.2.5 → iparq-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: iparq
-Version: 0.2.5
+Version: 0.3.0
 Summary: Display version compression and bloom filter information about a parquet file
 Author-email: MiguelElGallo <miguel.zurcher@gmail.com>
 License-File: LICENSE
@@ -8,7 +8,7 @@ Requires-Python: >=3.9
 Requires-Dist: pyarrow
 Requires-Dist: pydantic
 Requires-Dist: rich
-Requires-Dist: typer
+Requires-Dist: typer
 Provides-Extra: checks
 Requires-Dist: mypy>=1.14.1; extra == 'checks'
 Requires-Dist: ruff>=0.9.3; extra == 'checks'
@@ -88,10 +88,10 @@ Read more about bloom filters in this [great article](https://duckdb.org/2025/03
 
 ## Usage
 
-iparq
+iparq supports inspecting single files, multiple files, and glob patterns:
 
 ```sh
-iparq inspect <filename> [OPTIONS]
+iparq inspect <filename(s)> [OPTIONS]
 ```
 
 Options include:
@@ -100,9 +100,12 @@ Options include:
 - `--metadata-only`, `-m`: Show only file metadata without column details
 - `--column`, `-c`: Filter results to show only a specific column
 
-Examples:
+### Single File Examples:
 
 ```sh
+# Basic inspection
+iparq inspect yourfile.parquet
+
 # Output in JSON format
 iparq inspect yourfile.parquet --format json
@@ -113,7 +116,23 @@ iparq inspect yourfile.parquet --metadata-only
 iparq inspect yourfile.parquet --column column_name
 ```
 
-
+### Multiple Files and Glob Patterns:
+
+```sh
+# Inspect multiple specific files
+iparq inspect file1.parquet file2.parquet file3.parquet
+
+# Use glob patterns to inspect all parquet files
+iparq inspect *.parquet
+
+# Use specific patterns
+iparq inspect yellow*.parquet data_*.parquet
+
+# Combine patterns and specific files
+iparq inspect important.parquet temp_*.parquet
+```
+
+When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
 
 ## Example ouput - Bloom Filters
{iparq-0.2.5 → iparq-0.3.0}/README.md

@@ -70,10 +70,10 @@ Read more about bloom filters in this [great article](https://duckdb.org/2025/03
 
 ## Usage
 
-iparq
+iparq supports inspecting single files, multiple files, and glob patterns:
 
 ```sh
-iparq inspect <filename> [OPTIONS]
+iparq inspect <filename(s)> [OPTIONS]
 ```
 
 Options include:
@@ -82,9 +82,12 @@ Options include:
 - `--metadata-only`, `-m`: Show only file metadata without column details
 - `--column`, `-c`: Filter results to show only a specific column
 
-Examples:
+### Single File Examples:
 
 ```sh
+# Basic inspection
+iparq inspect yourfile.parquet
+
 # Output in JSON format
 iparq inspect yourfile.parquet --format json
@@ -95,7 +98,23 @@ iparq inspect yourfile.parquet --metadata-only
 iparq inspect yourfile.parquet --column column_name
 ```
 
-
+### Multiple Files and Glob Patterns:
+
+```sh
+# Inspect multiple specific files
+iparq inspect file1.parquet file2.parquet file3.parquet
+
+# Use glob patterns to inspect all parquet files
+iparq inspect *.parquet
+
+# Use specific patterns
+iparq inspect yellow*.parquet data_*.parquet
+
+# Combine patterns and specific files
+iparq inspect important.parquet temp_*.parquet
+```
+
+When inspecting multiple files, each file's results are displayed with a header showing the filename. The utility will read the metadata of each file and print the compression codecs used in the parquet files.
 
 ## Example ouput - Bloom Filters
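A nuance behind these new README examples: most shells expand `*.parquet` themselves, so iparq receives an already-expanded list of filenames, while a quoted or unmatched pattern reaches the CLI as-is and is expanded by iparq's own glob handling (added in the source.py changes further down). A small Python sketch that bypasses the shell so the pattern arrives as a single argument — it assumes iparq is installed and matching files exist in the working directory:

```python
# Invoking iparq without a shell, so the glob pattern arrives as one argv
# entry and the CLI's internal glob expansion handles it.
# Assumes iparq is installed and dummy*.parquet files exist in the cwd.
import subprocess

subprocess.run(["iparq", "inspect", "dummy*.parquet"], check=True)
```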
{iparq-0.2.5 → iparq-0.3.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "iparq"
-version = "0.2.5"
+version = "0.3.0"
 description = "Display version compression and bloom filter information about a parquet file"
 readme = "README.md"
 authors = [
@@ -9,7 +9,7 @@ authors = [
 requires-python = ">=3.9"
 dependencies = [
     "pyarrow",
-    "typer
+    "typer",
     "pydantic",
     "rich",
 ]
iparq-0.3.0/src/iparq/__init__.py

@@ -0,0 +1 @@
+__version__ = "0.3.0"
{iparq-0.2.5 → iparq-0.3.0}/src/iparq/source.py

@@ -1,3 +1,4 @@
+import glob
 import json
 from enum import Enum
 from typing import List, Optional
@@ -84,22 +85,16 @@ def read_parquet_metadata(filename: str):
         tuple: A tuple containing:
             - parquet_metadata (pyarrow.parquet.FileMetaData): The metadata of the Parquet file.
             - compression_codecs (set): A set of compression codecs used in the Parquet file.
+
+    Raises:
+        FileNotFoundError: If the file cannot be found or opened.
     """
-    try:
-        compression_codecs = set([])
-        parquet_metadata = pq.ParquetFile(filename).metadata
+    compression_codecs = set([])
+    parquet_metadata = pq.ParquetFile(filename).metadata
 
-        for i in range(parquet_metadata.num_row_groups):
-            for j in range(parquet_metadata.num_columns):
-                compression_codecs.add(
-                    parquet_metadata.row_group(i).column(j).compression
-                )
-
-    except FileNotFoundError:
-        console.print(
-            f"Cannot open: {filename}.", style="blink bold red underline on white"
-        )
-        exit(1)
+    for i in range(parquet_metadata.num_row_groups):
+        for j in range(parquet_metadata.num_columns):
+            compression_codecs.add(parquet_metadata.row_group(i).column(j).compression)
 
     return parquet_metadata, compression_codecs
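The refactor above removes the print-and-exit error handling from `read_parquet_metadata` and instead lets `FileNotFoundError` propagate to the caller. For reference, a self-contained sketch of the same codec scan using only pyarrow — the file name is a placeholder:

```python
# Standalone sketch of the codec scan performed by read_parquet_metadata.
# "dummy.parquet" is a placeholder; pq.ParquetFile raises FileNotFoundError
# for missing paths, which the CLI now handles upstream.
import pyarrow.parquet as pq

metadata = pq.ParquetFile("dummy.parquet").metadata

codecs = set()
for rg in range(metadata.num_row_groups):
    for col in range(metadata.num_columns):
        codecs.add(metadata.row_group(rg).column(col).compression)

print(f"{metadata.num_rows} rows in {metadata.num_row_groups} row group(s); codecs: {codecs}")
```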
@@ -260,27 +255,24 @@ def output_json(
     print(json.dumps(result, indent=2))
 
 
-
-
-
-
-
-
-    ),
-    metadata_only: bool = typer.Option(
-        False,
-        "--metadata-only",
-        "-m",
-        help="Show only file metadata without column details",
-    ),
-    column_filter: Optional[str] = typer.Option(
-        None, "--column", "-c", help="Filter results to show only specific column"
-    ),
-):
+def inspect_single_file(
+    filename: str,
+    format: OutputFormat,
+    metadata_only: bool,
+    column_filter: Optional[str],
+) -> None:
     """
-    Inspect a Parquet file and display its metadata, compression settings, and bloom filter information.
+    Inspect a single Parquet file and display its metadata, compression settings, and bloom filter information.
+
+    Raises:
+        Exception: If the file cannot be processed.
     """
-
+    try:
+        (parquet_metadata, compression) = read_parquet_metadata(filename)
+    except FileNotFoundError:
+        raise Exception(f"Cannot open: {filename}.")
+    except Exception as e:
+        raise Exception(f"Failed to read metadata: {e}")
 
     # Create metadata model
     meta_model = ParquetMetaModel(
@@ -322,5 +314,61 @@ def inspect(
     console.print(f"Compression codecs: {compression}")
 
 
+@app.command(name="")
+@app.command(name="inspect")
+def inspect(
+    filenames: List[str] = typer.Argument(
+        ..., help="Path(s) or pattern(s) to Parquet files to inspect"
+    ),
+    format: OutputFormat = typer.Option(
+        OutputFormat.RICH, "--format", "-f", help="Output format (rich or json)"
+    ),
+    metadata_only: bool = typer.Option(
+        False,
+        "--metadata-only",
+        "-m",
+        help="Show only file metadata without column details",
+    ),
+    column_filter: Optional[str] = typer.Option(
+        None, "--column", "-c", help="Filter results to show only specific column"
+    ),
+):
+    """
+    Inspect Parquet files and display their metadata, compression settings, and bloom filter information.
+    """
+    # Expand glob patterns and collect all matching files
+    all_files = []
+    for pattern in filenames:
+        matches = glob.glob(pattern)
+        if matches:
+            all_files.extend(matches)
+        else:
+            # If no matches found, treat as literal filename (for better error reporting)
+            all_files.append(pattern)
+
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_files = []
+    for file in all_files:
+        if file not in seen:
+            seen.add(file)
+            unique_files.append(file)
+
+    # Process each file
+    for i, filename in enumerate(unique_files):
+        # For multiple files, add a header to separate results
+        if len(unique_files) > 1:
+            if i > 0:
+                console.print()  # Add blank line between files
+            console.print(f"[bold blue]File: {filename}[/bold blue]")
+            console.print("─" * (len(filename) + 6))
+
+        try:
+            inspect_single_file(filename, format, metadata_only, column_filter)
+        except Exception as e:
+            console.print(f"Error processing {filename}: {e}", style="red")
+            continue
+
+
 if __name__ == "__main__":
     app()
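The glob expansion and order-preserving de-duplication at the heart of the new `inspect` command are easy to exercise in isolation. A self-contained sketch with placeholder inputs:

```python
# Sketch of the new inspect command's argument handling: expand each pattern
# with glob, fall back to the literal name when nothing matches (so the
# per-file error handler can report it), then de-duplicate preserving order.
import glob

patterns = ["dummy.parquet", "dummy*.parquet", "missing.parquet"]  # placeholder inputs

all_files = []
for pattern in patterns:
    matches = glob.glob(pattern)
    all_files.extend(matches if matches else [pattern])

seen = set()
unique_files = []
for file in all_files:
    if file not in seen:
        seen.add(file)
        unique_files.append(file)

print(unique_files)
```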
iparq-0.3.0/tests/test_cli.py

@@ -0,0 +1,164 @@
+import json
+from pathlib import Path
+
+from typer.testing import CliRunner
+
+from iparq.source import app
+
+# Define path to test fixtures
+FIXTURES_DIR = Path(__file__).parent
+fixture_path = FIXTURES_DIR / "dummy.parquet"
+
+
+def test_parquet_info():
+    """Test that the CLI correctly displays parquet file information."""
+    runner = CliRunner()
+    result = runner.invoke(app, ["inspect", str(fixture_path)])
+
+    assert result.exit_code == 0
+
+    expected_output = """ParquetMetaModel(
+    created_by='parquet-cpp-arrow version 14.0.2',
+    num_columns=3,
+    num_rows=3,
+    num_row_groups=1,
+    format_version='2.6',
+    serialized_size=2223
+)
+Parquet Column Information
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
+┃ Row Group ┃ Column Name ┃ Index ┃ Compression ┃ Bloom Filter ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
+│ 0         │ one         │ 0     │ SNAPPY      │ ✅           │
+│ 0         │ two         │ 1     │ SNAPPY      │ ✅           │
+│ 0         │ three       │ 2     │ SNAPPY      │ ✅           │
+└───────────┴─────────────┴───────┴─────────────┴──────────────┘
+Compression codecs: {'SNAPPY'}"""
+
+    assert expected_output in result.stdout
+
+
+def test_metadata_only_flag():
+    """Test that the metadata-only flag works correctly."""
+    runner = CliRunner()
+    fixture_path = FIXTURES_DIR / "dummy.parquet"
+    result = runner.invoke(app, ["inspect", "--metadata-only", str(fixture_path)])
+
+    assert result.exit_code == 0
+    assert "ParquetMetaModel" in result.stdout
+    assert "Parquet Column Information" not in result.stdout
+
+
+def test_column_filter():
+    """Test that filtering by column name works correctly."""
+    runner = CliRunner()
+    fixture_path = FIXTURES_DIR / "dummy.parquet"
+    result = runner.invoke(app, ["inspect", "--column", "one", str(fixture_path)])
+
+    assert result.exit_code == 0
+    assert "one" in result.stdout
+    assert "two" not in result.stdout
+
+
+def test_json_output():
+    """Test JSON output format."""
+    runner = CliRunner()
+    fixture_path = FIXTURES_DIR / "dummy.parquet"
+    result = runner.invoke(app, ["inspect", "--format", "json", str(fixture_path)])
+
+    assert result.exit_code == 0
+
+    # Test that output is valid JSON
+    data = json.loads(result.stdout)
+
+    # Check JSON structure
+    assert "metadata" in data
+    assert "columns" in data
+    assert "compression_codecs" in data
+    assert data["metadata"]["num_columns"] == 3
+
+
+def test_multiple_files():
+    """Test that multiple files can be inspected in a single command."""
+    runner = CliRunner()
+    fixture_path = FIXTURES_DIR / "dummy.parquet"
+    # Use the same file twice to test deduplication behavior
+
+    result = runner.invoke(app, ["inspect", str(fixture_path), str(fixture_path)])
+
+    assert result.exit_code == 0
+    # Since both arguments are the same file, deduplication means only one file is processed
+    # and since there's only one unique file, no file header should be shown
+    assert (
+        "File:" not in result.stdout
+    )  # No header for single file (after deduplication)
+    assert result.stdout.count("ParquetMetaModel") == 1
+
+
+def test_multiple_different_files():
+    """Test multiple different files by creating a temporary copy."""
+    import shutil
+    import tempfile
+
+    runner = CliRunner()
+    fixture_path = FIXTURES_DIR / "dummy.parquet"
+
+    # Create a temporary file copy
+    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
+        shutil.copy2(fixture_path, tmp_file.name)
+        tmp_path = tmp_file.name
+
+    try:
+        result = runner.invoke(app, ["inspect", str(fixture_path), tmp_path])
+
+        assert result.exit_code == 0
+        # Should contain file headers for both files
+        assert f"File: {fixture_path}" in result.stdout
+        assert f"File: {tmp_path}" in result.stdout
+        # Should contain metadata for both files
+        assert result.stdout.count("ParquetMetaModel") == 2
+        assert result.stdout.count("Parquet Column Information") == 2
+    finally:
+        # Clean up temporary file
+        import os
+
+        os.unlink(tmp_path)
+
+
+def test_glob_pattern():
+    """Test that glob patterns work correctly."""
+    runner = CliRunner()
+    # Test with a pattern that should match dummy files
+    result = runner.invoke(app, ["inspect", str(FIXTURES_DIR / "dummy*.parquet")])
+
+    assert result.exit_code == 0
+    # Should process at least one file
+    assert "ParquetMetaModel" in result.stdout
+
+
+def test_single_file_no_header():
+    """Test that single files don't show file headers."""
+    runner = CliRunner()
+    fixture_path = FIXTURES_DIR / "dummy.parquet"
+    result = runner.invoke(app, ["inspect", str(fixture_path)])
+
+    assert result.exit_code == 0
+    # Should not contain file header for single file
+    assert "File:" not in result.stdout
+    assert "ParquetMetaModel" in result.stdout
+
+
+def test_error_handling_with_multiple_files():
+    """Test that errors in one file don't stop processing of other files."""
+    runner = CliRunner()
+    fixture_path = FIXTURES_DIR / "dummy.parquet"
+    nonexistent_path = FIXTURES_DIR / "nonexistent.parquet"
+
+    result = runner.invoke(app, ["inspect", str(fixture_path), str(nonexistent_path)])
+
+    assert result.exit_code == 0
+    # Should process the good file
+    assert "ParquetMetaModel" in result.stdout
+    # Should show error for bad file
+    assert "Error processing" in result.stdout
+    assert "nonexistent.parquet" in result.stdout