tha-csv-runner 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: ["main"]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python-version: ["3.10", "3.11", "3.12"]
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v4
20
+ with:
21
+ version: "latest"
22
+
23
+ - name: Set up Python ${{ matrix.python-version }}
24
+ run: uv python install ${{ matrix.python-version }}
25
+
26
+ - name: Install dependencies
27
+ run: uv sync --extra dev --python ${{ matrix.python-version }}
28
+
29
+ - name: Lint
30
+ run: uv run ruff check src/ tests/
31
+
32
+ - name: Test
33
+ run: uv run pytest
34
+
35
+ - name: Type check
36
+ run: uv run mypy src/
@@ -0,0 +1,58 @@
1
+ name: Publish
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v4
16
+
17
+ - name: Build
18
+ run: uv build
19
+
20
+ - name: Upload dist
21
+ uses: actions/upload-artifact@v4
22
+ with:
23
+ name: dist
24
+ path: dist/
25
+
26
+ publish-testpypi:
27
+ needs: build
28
+ runs-on: ubuntu-latest
29
+ environment: testpypi
30
+ permissions:
31
+ id-token: write
32
+ steps:
33
+ - name: Download dist
34
+ uses: actions/download-artifact@v4
35
+ with:
36
+ name: dist
37
+ path: dist/
38
+
39
+ - name: Publish to TestPyPI
40
+ uses: pypa/gh-action-pypi-publish@release/v1
41
+ with:
42
+ repository-url: https://test.pypi.org/legacy/
43
+
44
+ publish-pypi:
45
+ needs: publish-testpypi
46
+ runs-on: ubuntu-latest
47
+ environment: pypi
48
+ permissions:
49
+ id-token: write
50
+ steps:
51
+ - name: Download dist
52
+ uses: actions/download-artifact@v4
53
+ with:
54
+ name: dist
55
+ path: dist/
56
+
57
+ - name: Publish to PyPI
58
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ venv/
8
+ *_processed_*.csv
9
+ *_errors_*.csv
10
+ output.csv
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nate Wright
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: tha-csv-runner
3
+ Version: 0.1.0
4
+ Summary: Run a function over every row of a CSV — with progress, header validation, and structured per-row errors.
5
+ Project-URL: Homepage, https://github.com/tha-guy-nate/tha-csv-runner
6
+ Project-URL: Issues, https://github.com/tha-guy-nate/tha-csv-runner/issues
7
+ Author: Nate Wright
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Utilities
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: click>=8.1
20
+ Requires-Dist: tqdm>=4.66
21
+ Provides-Extra: dev
22
+ Requires-Dist: mypy>=1.10; extra == 'dev'
23
+ Requires-Dist: pytest>=8; extra == 'dev'
24
+ Requires-Dist: ruff>=0.5; extra == 'dev'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # tha-csv-runner
28
+
29
+ [![CI](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml/badge.svg)](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml)
30
+
31
+ A small Python library that runs a function against every row of a CSV — with a progress bar, required header validation, and structured error capture per row.
32
+
33
+ ## Install
34
+
35
+ ```bash
36
+ pip install tha-csv-runner
37
+ ```
38
+
39
+ ## Quick start
40
+
41
+ ```python
42
+ from tha_csv_runner import Runner
43
+
44
+ def process(row: dict) -> None:
45
+ """Raise any exception to mark the row as an error. Return value is ignored."""
46
+ if not row["email"].endswith("@example.com"):
47
+ raise ValueError("invalid email domain")
48
+
49
+ runner = Runner(
50
+ input_path="data.csv",
51
+ required_headers=["name", "email"],
52
+ processor=process,
53
+ )
54
+ runner.run()
55
+ runner.write("output.csv")
56
+ ```
57
+
58
+ ## How it works
59
+
60
+ 1. Opens the CSV and validates that all `required_headers` are present — raises immediately if any are missing
61
+ 2. Iterates every row with a `tqdm` progress bar
62
+ 3. Calls your `processor(row)` function — if it raises, that row is marked as an error and processing continues
63
+ 4. Appends three columns to every row: `row_number`, `row_status`, and `message`
64
+ - On success: `row_status` and `message` are blank
65
+ - On error: `row_status = "error"`, `message = str(exception)`
66
+ 5. `write()` writes all rows (success and error) to a CSV
67
+
68
+ ## API
69
+
70
+ ### `Runner`
71
+
72
+ ```python
73
+ Runner(
74
+ input_path="data.csv", # path to input CSV
75
+ required_headers=["a", "b"], # columns that must exist — raises ConfigError if missing
76
+ processor=my_func, # optional: callable(row: dict) -> None
77
+ sample=100, # optional: process only the first N rows
78
+ )
79
+ ```
80
+
81
+ ### `runner.run()`
82
+
83
+ Reads and processes all rows. Results are stored in `runner.rows` as a list of dicts.
84
+
85
+ ### `runner.write()`
86
+
87
+ ```python
88
+ runner.write(
89
+ output_path="output.csv", # optional — auto-named <input stem>_processed_<YYYYMMDD_HHMMSS>.csv if omitted
90
+ sort_by="name", # optional — column name, or list of column names
91
+ ascending=True, # optional — bool or list of bools matching sort_by
92
+ column_order=["name", "email"], # optional — listed columns come first, rest follow
93
+ keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
94
+ drop=["row_number"], # optional — remove these columns (mutually exclusive with keep)
95
+ )
96
+ ```
97
+
98
+ Returns the `Path` that was written.
99
+
100
+ ## CLI
101
+
102
+ ```bash
103
+ tha-csv-runner run \
104
+ --input data.csv \
105
+ --processor my_module:process_row \
106
+ --header name \
107
+ --header email \
108
+ --sample 100
109
+ ```
110
+
111
+ `--processor` uses the `module:function` convention. `--header` is repeatable. `--output` sets the output CSV path (auto-named if omitted). All flags are optional except `--input`.
112
+
113
+ ## License
114
+
115
+ MIT
@@ -0,0 +1,89 @@
1
+ # tha-csv-runner
2
+
3
+ [![CI](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml/badge.svg)](https://github.com/tha-guy-nate/tha-csv-runner/actions/workflows/ci.yml)
4
+
5
+ A small Python library that runs a function against every row of a CSV — with a progress bar, required header validation, and structured error capture per row.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install tha-csv-runner
11
+ ```
12
+
13
+ ## Quick start
14
+
15
+ ```python
16
+ from tha_csv_runner import Runner
17
+
18
+ def process(row: dict) -> None:
19
+ """Raise any exception to mark the row as an error. Return value is ignored."""
20
+ if not row["email"].endswith("@example.com"):
21
+ raise ValueError("invalid email domain")
22
+
23
+ runner = Runner(
24
+ input_path="data.csv",
25
+ required_headers=["name", "email"],
26
+ processor=process,
27
+ )
28
+ runner.run()
29
+ runner.write("output.csv")
30
+ ```
31
+
32
+ ## How it works
33
+
34
+ 1. Opens the CSV and validates that all `required_headers` are present — raises immediately if any are missing
35
+ 2. Iterates every row with a `tqdm` progress bar
36
+ 3. Calls your `processor(row)` function — if it raises, that row is marked as an error and processing continues
37
+ 4. Appends three columns to every row: `row_number`, `row_status`, and `message`
38
+ - On success: `row_status` and `message` are blank
39
+ - On error: `row_status = "error"`, `message = str(exception)`
40
+ 5. `write()` writes all rows (success and error) to a CSV
41
+
42
+ ## API
43
+
44
+ ### `Runner`
45
+
46
+ ```python
47
+ Runner(
48
+ input_path="data.csv", # path to input CSV
49
+ required_headers=["a", "b"], # columns that must exist — raises ConfigError if missing
50
+ processor=my_func, # optional: callable(row: dict) -> None
51
+ sample=100, # optional: process only the first N rows
52
+ )
53
+ ```
54
+
55
+ ### `runner.run()`
56
+
57
+ Reads and processes all rows. Results are stored in `runner.rows` as a list of dicts.
58
+
59
+ ### `runner.write()`
60
+
61
+ ```python
62
+ runner.write(
63
+ output_path="output.csv", # optional — auto-named <input stem>_processed_<YYYYMMDD_HHMMSS>.csv if omitted
64
+ sort_by="name", # optional — column name, or list of column names
65
+ ascending=True, # optional — bool or list of bools matching sort_by
66
+ column_order=["name", "email"], # optional — listed columns come first, rest follow
67
+ keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
68
+ drop=["row_number"], # optional — remove these columns (mutually exclusive with keep)
69
+ )
70
+ ```
71
+
72
+ Returns the `Path` that was written.
73
+
74
+ ## CLI
75
+
76
+ ```bash
77
+ tha-csv-runner run \
78
+ --input data.csv \
79
+ --processor my_module:process_row \
80
+ --header name \
81
+ --header email \
82
+ --sample 100
83
+ ```
84
+
85
+ `--processor` uses the `module:function` convention. `--header` is repeatable. `--output` sets the output CSV path (auto-named if omitted). All flags are optional except `--input`.
86
+
87
+ ## License
88
+
89
+ MIT
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "tha-csv-runner"
7
+ version = "0.1.0"
8
+ description = "Run a function over every row of a CSV — with progress, header validation, and structured per-row errors."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "Nate Wright" }]
12
+ requires-python = ">=3.10"
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.10",
16
+ "Programming Language :: Python :: 3.11",
17
+ "Programming Language :: Python :: 3.12",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Operating System :: OS Independent",
20
+ "Intended Audience :: Developers",
21
+ "Topic :: Utilities",
22
+ ]
23
+ dependencies = [
24
+ "click>=8.1",
25
+ "tqdm>=4.66",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest>=8",
31
+ "ruff>=0.5",
32
+ "mypy>=1.10",
33
+ ]
34
+
35
+ [project.scripts]
36
+ tha-csv-runner = "tha_csv_runner.cli:main"
37
+
38
+ [project.urls]
39
+ Homepage = "https://github.com/tha-guy-nate/tha-csv-runner"
40
+ Issues = "https://github.com/tha-guy-nate/tha-csv-runner/issues"
41
+
42
+ [tool.hatch.build.targets.wheel]
43
+ packages = ["src/tha_csv_runner"]
44
+
45
+ [tool.ruff]
46
+ line-length = 100
47
+ target-version = "py310"
48
+
49
+ [tool.ruff.lint]
50
+ select = ["E", "F", "I", "B", "UP", "RUF"]
51
+
52
+ [tool.mypy]
53
+ ignore_missing_imports = true
54
+
55
+ [tool.pytest.ini_options]
56
+ testpaths = ["tests"]
@@ -0,0 +1,7 @@
1
+ """tha-csv-runner: run a function over every row of a CSV."""
2
+
3
+ from .errors import ConfigError
4
+ from .runner import Runner
5
+
6
# Package version string; pyproject.toml declares the same "0.1.0".
__version__ = "0.1.0"
# Public names of this package: the configuration error type and the runner class.
__all__ = ["ConfigError", "Runner"]
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
# Invoke the CLI only when this module is executed as a script, not when imported.
# NOTE(review): presumably this file is the package's __main__.py (enabling
# ``python -m tha_csv_runner``) — confirm against the file layout.
if __name__ == "__main__":
    main()
@@ -0,0 +1,77 @@
1
+ import importlib
2
+ import sys
3
+ from collections.abc import Callable
4
+ from pathlib import Path
5
+
6
+ import click
7
+
8
+ from .errors import ConfigError
9
+ from .runner import Runner
10
+
11
+
12
def _load_processor(spec: str) -> Callable[[dict], None]:
    """Resolve a ``module:function`` spec to the callable it names.

    The text before the last ``:`` is imported as an absolute module name and
    the text after it is looked up as an attribute of that module.

    As a side effect, the current working directory is put at the front of
    ``sys.path`` when it is not already importable, so that a processor module
    sitting next to the CSV can be found.

    Args:
        spec: Processor location in ``module:function`` form.

    Returns:
        The callable found on the imported module.

    Raises:
        click.BadParameter: If ``spec`` is malformed, the module cannot be
            imported, the attribute does not exist, or it is not callable.
    """
    module_name, sep, func_name = spec.rpartition(":")
    # Empty parts and relative names would otherwise escape import_module()
    # as ValueError / TypeError instead of a clean CLI error.
    if not sep or not module_name or not func_name or module_name.startswith("."):
        raise click.BadParameter(f"Expected format 'module:function', got: {spec!r}")

    # A console-script entry point puts the script's directory on sys.path,
    # not the working directory, so local processor modules would not import.
    # An "" entry already means "current directory", so leave that case alone.
    cwd = str(Path.cwd())
    if cwd not in sys.path and "" not in sys.path:
        sys.path.insert(0, cwd)

    try:
        module = importlib.import_module(module_name)
    except ImportError as exc:
        # When something other than the requested module failed to import
        # (e.g. one of its own dependencies), include the underlying reason.
        detail = "" if exc.name == module_name else f": {exc}"
        raise click.BadParameter(f"Cannot import module {module_name!r}{detail}") from None

    if not hasattr(module, func_name):
        raise click.BadParameter(f"Module {module_name!r} has no attribute {func_name!r}")

    processor = getattr(module, func_name)
    # Reject non-callables up front; otherwise every row would be marked as an
    # error with a confusing "object is not callable" message.
    if not callable(processor):
        raise click.BadParameter(f"{spec!r} does not refer to a callable")

    return processor
27
+
28
+
29
# Root command group of the CLI; subcommands register on it via
# ``@main.command()``. The body is intentionally just the docstring.
@click.group()
def main() -> None:
    """Run a Python function over every row of a CSV."""
32
+
33
+
34
@main.command()
@click.option("--input", "input_path", required=True, help="Path to the input CSV file")
@click.option("--header", "required_headers", multiple=True, help="Required CSV header (repeatable)")  # noqa: E501
@click.option(
    "--processor", "processor_spec", default=None,
    help="Processor in 'module:function' format (optional)",
)
@click.option("--sample", default=None, type=int, help="Process only the first N rows")
@click.option(
    "--output", "output_path", default=None, help="Output CSV path (auto-named if omitted)",
)
def run(
    input_path: str,
    processor_spec: str | None,
    required_headers: tuple[str, ...],
    sample: int | None,
    output_path: str | None,
) -> None:
    """Run a processor function over every row of a CSV."""
    # Flow: validate the input path, resolve the optional processor, then run
    # and write. Every expected failure prints "Error: ..." to stderr and exits
    # with status 1; on success the written path is echoed to stdout.

    # is_file() rather than exists(): a directory "exists" but cannot be opened
    # as a CSV and would otherwise crash later inside open().
    if not Path(input_path).is_file():
        click.echo(f"Error: input file not found: {input_path}", err=True)
        sys.exit(1)

    processor = None
    if processor_spec is not None:
        try:
            processor = _load_processor(processor_spec)
        except click.BadParameter as exc:
            click.echo(f"Error: {exc}", err=True)
            sys.exit(1)

    try:
        runner = Runner(
            input_path=input_path,
            required_headers=list(required_headers),
            processor=processor,
            sample=sample,
        )
        runner.run()
        out = runner.write(output_path=output_path)
        click.echo(f"Wrote {out}")
    except (ConfigError, OSError, UnicodeDecodeError) as exc:
        # OSError: unreadable input / unwritable output. UnicodeDecodeError:
        # input is not valid UTF-8. Report both like a configuration error
        # instead of dumping a traceback.
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)
@@ -0,0 +1,2 @@
1
class ConfigError(Exception):
    """Error signalling that a Runner was given an invalid configuration.

    The message passed to the constructor is what ``str(exc)`` returns.
    """
@@ -0,0 +1,128 @@
1
+ import csv
2
+ import functools
3
+ from collections.abc import Callable
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+
7
+ from tqdm import tqdm
8
+
9
+ from .errors import ConfigError
10
+
11
+
12
def _sort_key(val: object) -> tuple:
    """Return a sort key that places numeric values before non-numeric ones.

    Values that ``float()`` can parse become ``(0, number)`` and compare
    numerically; everything else becomes ``(1, str(val))`` and compares as
    text. The leading tag keeps numbers and strings from ever being compared
    with each other.

    Args:
        val: A cell value, typically a string read from the CSV.

    Returns:
        A ``(tag, value)`` tuple suitable for ordering comparisons.
    """
    try:
        number = float(val)  # type: ignore[arg-type]
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large for a float; order it as text.
        return (1, str(val))
    # NaN is neither less than nor greater than anything, which would make the
    # ordering inconsistent; treat "nan" cells as text (NaN is the only value
    # that is not equal to itself).
    if number != number:
        return (1, str(val))
    return (0, number)
17
+
18
+
19
+ class Runner:
20
+ def __init__(
21
+ self,
22
+ input_path: str | Path,
23
+ required_headers: list[str],
24
+ processor: Callable[[dict], None] | None = None,
25
+ sample: int | None = None,
26
+ ) -> None:
27
+ self.input_path = Path(input_path)
28
+ self.processor = processor
29
+ self.required_headers = required_headers
30
+ self.sample = sample
31
+ self.rows: list[dict] = []
32
+ self._ran: bool = False
33
+
34
+ def _load(self) -> list[dict]:
35
+ with open(self.input_path, newline="", encoding="utf-8") as f:
36
+ reader = csv.DictReader(f)
37
+ if reader.fieldnames is None:
38
+ raise ConfigError(f"{self.input_path} appears to be empty")
39
+ missing = [h for h in self.required_headers if h not in reader.fieldnames]
40
+ if missing:
41
+ raise ConfigError(f"Missing required headers: {missing}")
42
+ rows = list(reader)
43
+
44
+ if self.sample is not None:
45
+ rows = rows[: self.sample]
46
+
47
+ return rows
48
+
49
+ def run(self) -> None:
50
+ raw_rows = self._load()
51
+ self.rows = []
52
+ self._ran = True
53
+
54
+ for i, row in enumerate(tqdm(raw_rows, desc=f"Reading {self.input_path.name}"), start=1):
55
+ enriched = {**row, "row_number": i, "row_status": "", "message": ""}
56
+ try:
57
+ if self.processor is not None:
58
+ self.processor(enriched)
59
+ except Exception as exc:
60
+ enriched["row_status"] = "error"
61
+ enriched["message"] = str(exc)
62
+ self.rows.append(enriched)
63
+
64
+ def write(
65
+ self,
66
+ output_path: str | Path | None = None,
67
+ sort_by: str | list[str] | None = None,
68
+ ascending: bool | list[bool] = True,
69
+ column_order: list[str] | None = None,
70
+ keep: list[str] | None = None,
71
+ drop: list[str] | None = None,
72
+ ) -> Path:
73
+ if not self._ran:
74
+ raise RuntimeError("No data to write — call run() first")
75
+ if keep and drop:
76
+ raise ValueError("Cannot specify both keep and drop")
77
+
78
+ rows = list(self.rows)
79
+
80
+ # --- column filtering ---
81
+ all_cols = list(rows[0].keys()) if rows else []
82
+
83
+ if keep:
84
+ cols = [c for c in keep if c in all_cols]
85
+ elif drop:
86
+ cols = [c for c in all_cols if c not in drop]
87
+ else:
88
+ cols = all_cols
89
+
90
+ # --- column ordering: listed cols first, unlisted follow in original order ---
91
+ if column_order:
92
+ front = [c for c in column_order if c in cols]
93
+ rest = [c for c in cols if c not in column_order]
94
+ cols = front + rest
95
+
96
+ # --- sorting ---
97
+ if sort_by is not None:
98
+ sort_cols = [sort_by] if isinstance(sort_by, str) else list(sort_by)
99
+ asc_list = (
100
+ [ascending] * len(sort_cols) if isinstance(ascending, bool) else list(ascending)
101
+ )
102
+
103
+ def compare(a: dict, b: dict) -> int:
104
+ for col, asc in zip(sort_cols, asc_list, strict=True):
105
+ ka, kb = _sort_key(a.get(col, "")), _sort_key(b.get(col, ""))
106
+ if ka < kb:
107
+ return -1 if asc else 1
108
+ if ka > kb:
109
+ return 1 if asc else -1
110
+ return 0
111
+
112
+ rows.sort(key=functools.cmp_to_key(compare))
113
+
114
+ # --- output path ---
115
+ if output_path is None:
116
+ ts = datetime.now().strftime("%Y%m%d_%H%M%S")
117
+ output_path = Path(f"{self.input_path.stem}_processed_{ts}.csv")
118
+
119
+ out = Path(output_path)
120
+ out.parent.mkdir(parents=True, exist_ok=True)
121
+
122
+ with open(out, "w", newline="", encoding="utf-8") as f:
123
+ if rows:
124
+ writer = csv.DictWriter(f, fieldnames=cols)
125
+ writer.writeheader()
126
+ writer.writerows({c: row[c] for c in cols if c in row} for row in rows)
127
+
128
+ return out
File without changes
@@ -0,0 +1,15 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+ FIXTURES = Path(__file__).parent / "fixtures"
6
+
7
+
8
@pytest.fixture
def simple_csv() -> Path:
    """Provide the path of ``simple.csv`` inside the fixtures directory."""
    return FIXTURES.joinpath("simple.csv")
11
+
12
+
13
@pytest.fixture
def error_csv() -> Path:
    """Provide the path of ``with_errors.csv`` inside the fixtures directory."""
    return FIXTURES.joinpath("with_errors.csv")
File without changes
@@ -0,0 +1,7 @@
1
def noop(row: dict) -> None:
    """Accept a row and do nothing with it."""
    return None
3
+
4
+
5
def fail_on_bob(row: dict) -> None:
    """Raise ``ValueError`` when the row's ``name`` is exactly ``"Bob"``."""
    if row["name"] != "Bob":
        return
    raise ValueError("Bob is not allowed")
@@ -0,0 +1,4 @@
1
+ id,name,email
2
+ 1,Alice,alice@example.com
3
+ 2,Bob,bob@example.com
4
+ 3,Carol,carol@example.com
@@ -0,0 +1,4 @@
1
+ id,name,value
2
+ 1,Alice,10
3
+ 2,Bob,bad_value
4
+ 3,Carol,30