tha-csv-runner 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tha_csv_runner-0.1.0/.github/workflows/ci.yml +36 -0
- tha_csv_runner-0.1.0/.github/workflows/publish.yml +58 -0
- tha_csv_runner-0.1.0/.gitignore +10 -0
- tha_csv_runner-0.1.0/LICENSE +21 -0
- tha_csv_runner-0.1.0/PKG-INFO +115 -0
- tha_csv_runner-0.1.0/README.md +89 -0
- tha_csv_runner-0.1.0/pyproject.toml +56 -0
- tha_csv_runner-0.1.0/src/tha_csv_runner/__init__.py +7 -0
- tha_csv_runner-0.1.0/src/tha_csv_runner/__main__.py +4 -0
- tha_csv_runner-0.1.0/src/tha_csv_runner/cli.py +77 -0
- tha_csv_runner-0.1.0/src/tha_csv_runner/errors.py +2 -0
- tha_csv_runner-0.1.0/src/tha_csv_runner/runner.py +128 -0
- tha_csv_runner-0.1.0/tests/__init__.py +0 -0
- tha_csv_runner-0.1.0/tests/conftest.py +15 -0
- tha_csv_runner-0.1.0/tests/fixtures/__init__.py +0 -0
- tha_csv_runner-0.1.0/tests/fixtures/processors.py +7 -0
- tha_csv_runner-0.1.0/tests/fixtures/simple.csv +4 -0
- tha_csv_runner-0.1.0/tests/fixtures/with_errors.csv +4 -0
- tha_csv_runner-0.1.0/tests/test_cli.py +68 -0
- tha_csv_runner-0.1.0/tests/test_runner.py +196 -0
- tha_csv_runner-0.1.0/uv.lock +420 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["main"]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Install uv
|
|
19
|
+
uses: astral-sh/setup-uv@v4
|
|
20
|
+
with:
|
|
21
|
+
version: "latest"
|
|
22
|
+
|
|
23
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
24
|
+
run: uv python install ${{ matrix.python-version }}
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: uv sync --extra dev --python ${{ matrix.python-version }}
|
|
28
|
+
|
|
29
|
+
- name: Lint
|
|
30
|
+
run: uv run ruff check src/ tests/
|
|
31
|
+
|
|
32
|
+
- name: Test
|
|
33
|
+
run: uv run pytest
|
|
34
|
+
|
|
35
|
+
- name: Type check
|
|
36
|
+
run: uv run mypy src/
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v4
|
|
16
|
+
|
|
17
|
+
- name: Build
|
|
18
|
+
run: uv build
|
|
19
|
+
|
|
20
|
+
- name: Upload dist
|
|
21
|
+
uses: actions/upload-artifact@v4
|
|
22
|
+
with:
|
|
23
|
+
name: dist
|
|
24
|
+
path: dist/
|
|
25
|
+
|
|
26
|
+
publish-testpypi:
|
|
27
|
+
needs: build
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
environment: testpypi
|
|
30
|
+
permissions:
|
|
31
|
+
id-token: write
|
|
32
|
+
steps:
|
|
33
|
+
- name: Download dist
|
|
34
|
+
uses: actions/download-artifact@v4
|
|
35
|
+
with:
|
|
36
|
+
name: dist
|
|
37
|
+
path: dist/
|
|
38
|
+
|
|
39
|
+
- name: Publish to TestPyPI
|
|
40
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
41
|
+
with:
|
|
42
|
+
repository-url: https://test.pypi.org/legacy/
|
|
43
|
+
|
|
44
|
+
publish-pypi:
|
|
45
|
+
needs: publish-testpypi
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
environment: pypi
|
|
48
|
+
permissions:
|
|
49
|
+
id-token: write
|
|
50
|
+
steps:
|
|
51
|
+
- name: Download dist
|
|
52
|
+
uses: actions/download-artifact@v4
|
|
53
|
+
with:
|
|
54
|
+
name: dist
|
|
55
|
+
path: dist/
|
|
56
|
+
|
|
57
|
+
- name: Publish to PyPI
|
|
58
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nate Wright
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tha-csv-runner
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Run a function over every row of a CSV — with progress, header validation, and structured per-row errors.
|
|
5
|
+
Project-URL: Homepage, https://github.com/tha-guy-nate/tha-csv-runner
|
|
6
|
+
Project-URL: Issues, https://github.com/tha-guy-nate/tha-csv-runner/issues
|
|
7
|
+
Author: Nate Wright
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Utilities
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: click>=8.1
|
|
20
|
+
Requires-Dist: tqdm>=4.66
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
23
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
24
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# tha-csv-runner
|
|
28
|
+
|
|
29
|
+
[![CI](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml/badge.svg)](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml)
|
|
30
|
+
|
|
31
|
+
A small Python library that runs a function against every row of a CSV — with a progress bar, required header validation, and structured error capture per row.
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install tha-csv-runner
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from tha_csv_runner import Runner
|
|
43
|
+
|
|
44
|
+
def process(row: dict) -> None:
|
|
45
|
+
"""Raise any exception to mark the row as an error. Return value is ignored."""
|
|
46
|
+
if not row["email"].endswith("@example.com"):
|
|
47
|
+
raise ValueError("invalid email domain")
|
|
48
|
+
|
|
49
|
+
runner = Runner(
|
|
50
|
+
input_path="data.csv",
|
|
51
|
+
required_headers=["name", "email"],
|
|
52
|
+
processor=process,
|
|
53
|
+
)
|
|
54
|
+
runner.run()
|
|
55
|
+
runner.write("output.csv")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## How it works
|
|
59
|
+
|
|
60
|
+
1. Opens the CSV and validates that all `required_headers` are present — raises immediately if any are missing
|
|
61
|
+
2. Iterates every row with a `tqdm` progress bar
|
|
62
|
+
3. Calls your `processor(row)` function — if it raises, that row is marked as an error and processing continues
|
|
63
|
+
4. Appends three columns to every row: `row_number`, `row_status`, and `message`
|
|
64
|
+
- On success: `row_status` and `message` are blank
|
|
65
|
+
- On error: `row_status = "error"`, `message = str(exception)`
|
|
66
|
+
5. `write()` writes all rows (success and error) to a CSV
|
|
67
|
+
|
|
68
|
+
## API
|
|
69
|
+
|
|
70
|
+
### `Runner`
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
Runner(
|
|
74
|
+
input_path="data.csv", # path to input CSV
|
|
75
|
+
required_headers=["a", "b"], # columns that must exist — raises ConfigError if missing
|
|
76
|
+
processor=my_func, # optional: callable(row: dict) -> None
|
|
77
|
+
sample=100, # optional: process only the first N rows
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### `runner.run()`
|
|
82
|
+
|
|
83
|
+
Reads and processes all rows. Results are stored in `runner.rows` as a list of dicts.
|
|
84
|
+
|
|
85
|
+
### `runner.write()`
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
runner.write(
|
|
89
|
+
output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
|
|
90
|
+
sort_by="name", # optional — column name, or list of column names
|
|
91
|
+
ascending=True, # optional — bool or list of bools matching sort_by
|
|
92
|
+
column_order=["name", "email"], # optional — listed columns come first, rest follow
|
|
93
|
+
keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
|
|
94
|
+
drop=["row_number"], # optional — remove these columns (mutually exclusive with keep)
|
|
95
|
+
)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Returns the `Path` that was written.
|
|
99
|
+
|
|
100
|
+
## CLI
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
tha-csv-runner run \
|
|
104
|
+
--input data.csv \
|
|
105
|
+
--processor my_module:process_row \
|
|
106
|
+
--header name \
|
|
107
|
+
--header email \
|
|
108
|
+
--sample 100
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
`--processor` uses the `module:function` convention. `--header` is repeatable. All flags are optional except `--input`.
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# tha-csv-runner
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml/badge.svg)](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml)
|
|
4
|
+
|
|
5
|
+
A small Python library that runs a function against every row of a CSV — with a progress bar, required header validation, and structured error capture per row.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install tha-csv-runner
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Quick start
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from tha_csv_runner import Runner
|
|
17
|
+
|
|
18
|
+
def process(row: dict) -> None:
|
|
19
|
+
"""Raise any exception to mark the row as an error. Return value is ignored."""
|
|
20
|
+
if not row["email"].endswith("@example.com"):
|
|
21
|
+
raise ValueError("invalid email domain")
|
|
22
|
+
|
|
23
|
+
runner = Runner(
|
|
24
|
+
input_path="data.csv",
|
|
25
|
+
required_headers=["name", "email"],
|
|
26
|
+
processor=process,
|
|
27
|
+
)
|
|
28
|
+
runner.run()
|
|
29
|
+
runner.write("output.csv")
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## How it works
|
|
33
|
+
|
|
34
|
+
1. Opens the CSV and validates that all `required_headers` are present — raises immediately if any are missing
|
|
35
|
+
2. Iterates every row with a `tqdm` progress bar
|
|
36
|
+
3. Calls your `processor(row)` function — if it raises, that row is marked as an error and processing continues
|
|
37
|
+
4. Appends three columns to every row: `row_number`, `row_status`, and `message`
|
|
38
|
+
- On success: `row_status` and `message` are blank
|
|
39
|
+
- On error: `row_status = "error"`, `message = str(exception)`
|
|
40
|
+
5. `write()` writes all rows (success and error) to a CSV
|
|
41
|
+
|
|
42
|
+
## API
|
|
43
|
+
|
|
44
|
+
### `Runner`
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
Runner(
|
|
48
|
+
input_path="data.csv", # path to input CSV
|
|
49
|
+
required_headers=["a", "b"], # columns that must exist — raises ConfigError if missing
|
|
50
|
+
processor=my_func, # optional: callable(row: dict) -> None
|
|
51
|
+
sample=100, # optional: process only the first N rows
|
|
52
|
+
)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### `runner.run()`
|
|
56
|
+
|
|
57
|
+
Reads and processes all rows. Results are stored in `runner.rows` as a list of dicts.
|
|
58
|
+
|
|
59
|
+
### `runner.write()`
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
runner.write(
|
|
63
|
+
output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
|
|
64
|
+
sort_by="name", # optional — column name, or list of column names
|
|
65
|
+
ascending=True, # optional — bool or list of bools matching sort_by
|
|
66
|
+
column_order=["name", "email"], # optional — listed columns come first, rest follow
|
|
67
|
+
keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
|
|
68
|
+
drop=["row_number"], # optional — remove these columns (mutually exclusive with keep)
|
|
69
|
+
)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Returns the `Path` that was written.
|
|
73
|
+
|
|
74
|
+
## CLI
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
tha-csv-runner run \
|
|
78
|
+
--input data.csv \
|
|
79
|
+
--processor my_module:process_row \
|
|
80
|
+
--header name \
|
|
81
|
+
--header email \
|
|
82
|
+
--sample 100
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
`--processor` uses the `module:function` convention. `--header` is repeatable. All flags are optional except `--input`.
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
MIT
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tha-csv-runner"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Run a function over every row of a CSV — with progress, header validation, and structured per-row errors."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [{ name = "Nate Wright" }]
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Topic :: Utilities",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"click>=8.1",
|
|
25
|
+
"tqdm>=4.66",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
dev = [
|
|
30
|
+
"pytest>=8",
|
|
31
|
+
"ruff>=0.5",
|
|
32
|
+
"mypy>=1.10",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
tha-csv-runner = "tha_csv_runner.cli:main"
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/tha-guy-nate/tha-csv-runner"
|
|
40
|
+
Issues = "https://github.com/tha-guy-nate/tha-csv-runner/issues"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/tha_csv_runner"]
|
|
44
|
+
|
|
45
|
+
[tool.ruff]
|
|
46
|
+
line-length = 100
|
|
47
|
+
target-version = "py310"
|
|
48
|
+
|
|
49
|
+
[tool.ruff.lint]
|
|
50
|
+
select = ["E", "F", "I", "B", "UP", "RUF"]
|
|
51
|
+
|
|
52
|
+
[tool.mypy]
|
|
53
|
+
ignore_missing_imports = true
|
|
54
|
+
|
|
55
|
+
[tool.pytest.ini_options]
|
|
56
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import sys
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from .errors import ConfigError
|
|
9
|
+
from .runner import Runner
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _load_processor(spec: str) -> Callable[[dict], None]:
|
|
13
|
+
try:
|
|
14
|
+
module_name, func_name = spec.rsplit(":", 1)
|
|
15
|
+
except ValueError:
|
|
16
|
+
raise click.BadParameter(f"Expected format 'module:function', got: {spec!r}") from None
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
module = importlib.import_module(module_name)
|
|
20
|
+
except ModuleNotFoundError:
|
|
21
|
+
raise click.BadParameter(f"Cannot import module {module_name!r}") from None
|
|
22
|
+
|
|
23
|
+
if not hasattr(module, func_name):
|
|
24
|
+
raise click.BadParameter(f"Module {module_name!r} has no attribute {func_name!r}")
|
|
25
|
+
|
|
26
|
+
return getattr(module, func_name)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Root click command group. It has no behaviour of its own; subcommands are
# attached to it with ``@main.command()``. The docstring doubles as the help
# text click displays, so it is user-facing and should be edited with care.
@click.group()
def main() -> None:
    """Run a Python function over every row of a CSV."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ``run`` subcommand: load the optional processor, run it over the input CSV and
# write the annotated result. Every handled failure prints "Error: ..." to
# stderr and exits with status 1. The function docstring is shown by click as
# the command's help text, so it is kept short and user-facing.
@main.command()
@click.option("--input", "input_path", required=True, help="Path to the input CSV file")
@click.option(
    "--header", "required_headers", multiple=True, help="Required CSV header (repeatable)",
)
@click.option(
    "--processor", "processor_spec", default=None,
    help="Processor in 'module:function' format (optional)",
)
# IntRange(min=0): a negative count would be used as a slice bound and silently
# drop rows from the end instead of limiting to the first N.
@click.option(
    "--sample", default=None, type=click.IntRange(min=0), help="Process only the first N rows",
)
@click.option(
    "--output", "output_path", default=None, help="Output CSV path (auto-named if omitted)",
)
def run(
    input_path: str,
    processor_spec: str | None,
    required_headers: tuple[str, ...],
    sample: int | None,
    output_path: str | None,
) -> None:
    """Run a processor function over every row of a CSV."""
    # is_file() rather than exists(): a directory exists but cannot be opened as
    # a CSV, and would otherwise surface as an unhandled OSError traceback.
    if not Path(input_path).is_file():
        click.echo(f"Error: input file not found: {input_path}", err=True)
        sys.exit(1)

    processor = None
    if processor_spec is not None:
        try:
            processor = _load_processor(processor_spec)
        except click.BadParameter as exc:
            click.echo(f"Error: {exc}", err=True)
            sys.exit(1)

    try:
        runner = Runner(
            input_path=input_path,
            required_headers=list(required_headers),
            processor=processor,
            sample=sample,
        )
        runner.run()
        out = runner.write(output_path=output_path)
    except ConfigError as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)

    click.echo(f"Wrote {out}")
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import functools
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
|
|
9
|
+
from .errors import ConfigError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _sort_key(val: object) -> tuple:
|
|
13
|
+
try:
|
|
14
|
+
return (0, float(val)) # type: ignore[arg-type]
|
|
15
|
+
except (TypeError, ValueError):
|
|
16
|
+
return (1, str(val))
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Runner:
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
input_path: str | Path,
|
|
23
|
+
required_headers: list[str],
|
|
24
|
+
processor: Callable[[dict], None] | None = None,
|
|
25
|
+
sample: int | None = None,
|
|
26
|
+
) -> None:
|
|
27
|
+
self.input_path = Path(input_path)
|
|
28
|
+
self.processor = processor
|
|
29
|
+
self.required_headers = required_headers
|
|
30
|
+
self.sample = sample
|
|
31
|
+
self.rows: list[dict] = []
|
|
32
|
+
self._ran: bool = False
|
|
33
|
+
|
|
34
|
+
def _load(self) -> list[dict]:
|
|
35
|
+
with open(self.input_path, newline="", encoding="utf-8") as f:
|
|
36
|
+
reader = csv.DictReader(f)
|
|
37
|
+
if reader.fieldnames is None:
|
|
38
|
+
raise ConfigError(f"{self.input_path} appears to be empty")
|
|
39
|
+
missing = [h for h in self.required_headers if h not in reader.fieldnames]
|
|
40
|
+
if missing:
|
|
41
|
+
raise ConfigError(f"Missing required headers: {missing}")
|
|
42
|
+
rows = list(reader)
|
|
43
|
+
|
|
44
|
+
if self.sample is not None:
|
|
45
|
+
rows = rows[: self.sample]
|
|
46
|
+
|
|
47
|
+
return rows
|
|
48
|
+
|
|
49
|
+
def run(self) -> None:
|
|
50
|
+
raw_rows = self._load()
|
|
51
|
+
self.rows = []
|
|
52
|
+
self._ran = True
|
|
53
|
+
|
|
54
|
+
for i, row in enumerate(tqdm(raw_rows, desc=f"Reading {self.input_path.name}"), start=1):
|
|
55
|
+
enriched = {**row, "row_number": i, "row_status": "", "message": ""}
|
|
56
|
+
try:
|
|
57
|
+
if self.processor is not None:
|
|
58
|
+
self.processor(enriched)
|
|
59
|
+
except Exception as exc:
|
|
60
|
+
enriched["row_status"] = "error"
|
|
61
|
+
enriched["message"] = str(exc)
|
|
62
|
+
self.rows.append(enriched)
|
|
63
|
+
|
|
64
|
+
def write(
|
|
65
|
+
self,
|
|
66
|
+
output_path: str | Path | None = None,
|
|
67
|
+
sort_by: str | list[str] | None = None,
|
|
68
|
+
ascending: bool | list[bool] = True,
|
|
69
|
+
column_order: list[str] | None = None,
|
|
70
|
+
keep: list[str] | None = None,
|
|
71
|
+
drop: list[str] | None = None,
|
|
72
|
+
) -> Path:
|
|
73
|
+
if not self._ran:
|
|
74
|
+
raise RuntimeError("No data to write — call run() first")
|
|
75
|
+
if keep and drop:
|
|
76
|
+
raise ValueError("Cannot specify both keep and drop")
|
|
77
|
+
|
|
78
|
+
rows = list(self.rows)
|
|
79
|
+
|
|
80
|
+
# --- column filtering ---
|
|
81
|
+
all_cols = list(rows[0].keys()) if rows else []
|
|
82
|
+
|
|
83
|
+
if keep:
|
|
84
|
+
cols = [c for c in keep if c in all_cols]
|
|
85
|
+
elif drop:
|
|
86
|
+
cols = [c for c in all_cols if c not in drop]
|
|
87
|
+
else:
|
|
88
|
+
cols = all_cols
|
|
89
|
+
|
|
90
|
+
# --- column ordering: listed cols first, unlisted follow in original order ---
|
|
91
|
+
if column_order:
|
|
92
|
+
front = [c for c in column_order if c in cols]
|
|
93
|
+
rest = [c for c in cols if c not in column_order]
|
|
94
|
+
cols = front + rest
|
|
95
|
+
|
|
96
|
+
# --- sorting ---
|
|
97
|
+
if sort_by is not None:
|
|
98
|
+
sort_cols = [sort_by] if isinstance(sort_by, str) else list(sort_by)
|
|
99
|
+
asc_list = (
|
|
100
|
+
[ascending] * len(sort_cols) if isinstance(ascending, bool) else list(ascending)
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
def compare(a: dict, b: dict) -> int:
|
|
104
|
+
for col, asc in zip(sort_cols, asc_list, strict=True):
|
|
105
|
+
ka, kb = _sort_key(a.get(col, "")), _sort_key(b.get(col, ""))
|
|
106
|
+
if ka < kb:
|
|
107
|
+
return -1 if asc else 1
|
|
108
|
+
if ka > kb:
|
|
109
|
+
return 1 if asc else -1
|
|
110
|
+
return 0
|
|
111
|
+
|
|
112
|
+
rows.sort(key=functools.cmp_to_key(compare))
|
|
113
|
+
|
|
114
|
+
# --- output path ---
|
|
115
|
+
if output_path is None:
|
|
116
|
+
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
117
|
+
output_path = Path(f"{self.input_path.stem}_processed_{ts}.csv")
|
|
118
|
+
|
|
119
|
+
out = Path(output_path)
|
|
120
|
+
out.parent.mkdir(parents=True, exist_ok=True)
|
|
121
|
+
|
|
122
|
+
with open(out, "w", newline="", encoding="utf-8") as f:
|
|
123
|
+
if rows:
|
|
124
|
+
writer = csv.DictWriter(f, fieldnames=cols)
|
|
125
|
+
writer.writeheader()
|
|
126
|
+
writer.writerows({c: row[c] for c in cols if c in row} for row in rows)
|
|
127
|
+
|
|
128
|
+
return out
|
|
File without changes
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
FIXTURES = Path(__file__).parent / "fixtures"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.fixture
def simple_csv() -> Path:
    """Return the path of the ``simple.csv`` file inside the fixtures directory."""
    return FIXTURES.joinpath("simple.csv")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.fixture
def error_csv() -> Path:
    """Return the path of the ``with_errors.csv`` file inside the fixtures directory."""
    return FIXTURES.joinpath("with_errors.csv")
|
|
File without changes
|