PyPI - tha-csv-runner - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

tha-csv-runner 0.2.2 (tar.gz) → 0.2.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

{tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tha-csv-runner
-Version: 0.2.2
+Version: 0.2.3
 Summary: Run a function over every row of a CSV — with progress, header validation, and structured per-row errors.
 Project-URL: Homepage, https://github.com/tha-guy-nate/tha-csv-runner
 Project-URL: Issues, https://github.com/tha-guy-nate/tha-csv-runner/issues
@@ -47,8 +47,8 @@ def process(row: dict) -> None:
 runner = ThaCSV()
-rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
-runner.write("Step 1 of 1", "output.csv")
+rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
+runner.write("Step 2 of 2", "output.csv")
 ```
 ## How it works
@@ -57,6 +57,7 @@ runner.write("Step 1 of 1", "output.csv")
 2. Iterates every row with a `tqdm` progress bar labelled with `desc`
 3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
 4. Appends three columns to every row: `row number`, `row status`, and `message`
+   - `row number` starts at 2 (row 1 is the header)
    - On success: `row status` and `message` are blank
    - On error: `row status = "error"`, `message = str(exception)`
 5. `write()` writes all rows (success and error) to a CSV
@@ -73,7 +74,7 @@ ThaCSV()
 ```python
 runner.read(
-    "Step 2 of 10",          # progress bar label — pass None to use the filename
+    "Step 1 of 2",           # progress bar label — pass None to use the filename
     "data.csv",              # path to input CSV
     ["a", "b"],              # columns that must exist — raises ConfigError if missing
     validator=my_func,       # optional: callable(row: dict) -> None
@@ -89,17 +90,27 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
 ```python
 runner.write(
-    "Step 10 of 10",                   # progress bar label — pass None to use the output filename
+    "Step 2 of 2",                     # progress bar label — pass None to use the output filename
     output_path="output.csv",          # optional — auto-named input_processed_TIMESTAMP.csv if omitted
     sort_by="name",                    # optional — column name, or list of column names
     ascending=True,                    # optional — bool or list of bools matching sort_by
     column_order=["name", "email"],    # optional — listed columns come first, rest follow
     keep=["name", "email"],            # optional — keep only these columns (mutually exclusive with drop)
     drop=["row number"],               # optional — remove these columns (mutually exclusive with keep)
+    chunk_size=1000,                   # optional — split output into files of this many rows
 )
 ```
-Returns the `Path` that was written.
+Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
+#### `chunk_size`
+When provided, `write()` splits the output into multiple files named `output_001.csv`, `output_002.csv`, etc. and returns a `list[Path]`.
+```python
+paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
+# ["output_001.csv", "output_002.csv", ...]
+```
 ## License

{tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/README.md RENAMED Viewed

@@ -22,8 +22,8 @@ def process(row: dict) -> None:
 runner = ThaCSV()
-rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
-runner.write("Step 1 of 1", "output.csv")
+rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
+runner.write("Step 2 of 2", "output.csv")
 ```
 ## How it works
@@ -32,6 +32,7 @@ runner.write("Step 1 of 1", "output.csv")
 2. Iterates every row with a `tqdm` progress bar labelled with `desc`
 3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
 4. Appends three columns to every row: `row number`, `row status`, and `message`
+   - `row number` starts at 2 (row 1 is the header)
    - On success: `row status` and `message` are blank
    - On error: `row status = "error"`, `message = str(exception)`
 5. `write()` writes all rows (success and error) to a CSV
@@ -48,7 +49,7 @@ ThaCSV()
 ```python
 runner.read(
-    "Step 2 of 10",          # progress bar label — pass None to use the filename
+    "Step 1 of 2",           # progress bar label — pass None to use the filename
     "data.csv",              # path to input CSV
     ["a", "b"],              # columns that must exist — raises ConfigError if missing
     validator=my_func,       # optional: callable(row: dict) -> None
@@ -64,17 +65,27 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
 ```python
 runner.write(
-    "Step 10 of 10",                   # progress bar label — pass None to use the output filename
+    "Step 2 of 2",                     # progress bar label — pass None to use the output filename
     output_path="output.csv",          # optional — auto-named input_processed_TIMESTAMP.csv if omitted
     sort_by="name",                    # optional — column name, or list of column names
     ascending=True,                    # optional — bool or list of bools matching sort_by
     column_order=["name", "email"],    # optional — listed columns come first, rest follow
     keep=["name", "email"],            # optional — keep only these columns (mutually exclusive with drop)
     drop=["row number"],               # optional — remove these columns (mutually exclusive with keep)
+    chunk_size=1000,                   # optional — split output into files of this many rows
 )
 ```
-Returns the `Path` that was written.
+Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
+#### `chunk_size`
+When provided, `write()` splits the output into multiple files named `output_001.csv`, `output_002.csv`, etc. and returns a `list[Path]`.
+```python
+paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
+# ["output_001.csv", "output_002.csv", ...]
+```
 ## License

{tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "tha-csv-runner"
-version = "0.2.2"
+version = "0.2.3"
 description = "Run a function over every row of a CSV — with progress, header validation, and structured per-row errors."
 readme = "README.md"
 license = { text = "MIT" }

{tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/src/tha_csv_runner/__init__.py RENAMED Viewed

@@ -3,5 +3,5 @@
 from .errors import ConfigError
 from .runner import ThaCSV
-__version__ = "0.2.2"
+__version__ = "0.2.3"
 __all__ = ["ConfigError", "ThaCSV"]

{tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/src/tha_csv_runner/runner.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import csv
 import functools
+import shutil
 from collections.abc import Callable
 from datetime import datetime
 from pathlib import Path
@@ -9,6 +10,10 @@ from tqdm import tqdm
 from .errors import ConfigError
+def tqdm_ncols(max_cols: int = 85) -> int:
+    return min(shutil.get_terminal_size(fallback=(max_cols, 24)).columns, max_cols)
 def _sort_key(val: object) -> tuple:
     try:
         return (0, float(val))  # type: ignore[arg-type]
@@ -16,6 +21,18 @@ def _sort_key(val: object) -> tuple:
         return (1, str(val))
+def _write_chunk(path: Path, rows: list[dict], cols: list[str], label: str) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        if rows:
+            writer = csv.DictWriter(f, fieldnames=cols)
+            writer.writeheader()
+            writer.writerows(
+                {c: row[c] for c in cols if c in row}
+                for row in tqdm(rows, desc=label, ncols=tqdm_ncols())
+            )
 class ThaCSV:
     def __init__(self) -> None:
         self.rows: list[dict] = []
@@ -45,7 +62,7 @@ class ThaCSV:
         self._read = True
         label = desc if desc is not None else self._input_path.name
-        for i, row in enumerate(tqdm(raw_rows, desc=label), start=1):
+        for i, row in enumerate(tqdm(raw_rows, desc=label, ncols=tqdm_ncols()), start=2):
             if enrich:
                 enriched = {**row, "row number": i, "row status": "", "message": ""}
             else:
@@ -72,11 +89,14 @@ class ThaCSV:
         column_order: list[str] | None = None,
         keep: list[str] | None = None,
         drop: list[str] | None = None,
-    ) -> Path:
+        chunk_size: int | None = None,
+    ) -> Path | list[Path]:
         if not self._read:
             raise RuntimeError("No data to write — call read() first")
         if keep and drop:
             raise ValueError("Cannot specify both keep and drop")
+        if chunk_size is not None and chunk_size < 1:
+            raise ValueError("chunk_size must be >= 1")
         rows = list(self.rows)
@@ -121,16 +141,18 @@ class ThaCSV:
             output_path = Path(f"{stem}_processed_{ts}.csv")
         out = Path(output_path)
-        out.parent.mkdir(parents=True, exist_ok=True)
-        write_label = desc if desc is not None else out.name
-        with open(out, "w", newline="", encoding="utf-8") as f:
-            if rows:
-                writer = csv.DictWriter(f, fieldnames=cols)
-                writer.writeheader()
-                writer.writerows(
-                    {c: row[c] for c in cols if c in row}
-                    for row in tqdm(rows, desc=write_label)
-                )
+        # --- chunked write ---
+        if chunk_size is not None:
+            chunks = [rows[i:i + chunk_size] for i in range(0, max(len(rows), 1), chunk_size)]
+            paths = []
+            for idx, chunk in enumerate(chunks, start=1):
+                chunk_path = out.parent / f"{out.stem}_{idx:03d}{out.suffix}"
+                label = f"{desc} ({idx}/{len(chunks)})" if desc else chunk_path.name
+                _write_chunk(chunk_path, chunk, cols, label)
+                paths.append(chunk_path)
+            return paths
+        write_label = desc if desc is not None else out.name
+        _write_chunk(out, rows, cols, write_label)
         return out

{tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/test_runner.py RENAMED Viewed

@@ -33,8 +33,8 @@ def test_read_returns_rows(simple_csv: Path) -> None:
 def test_row_number_injected(simple_csv: Path) -> None:
     runner = ThaCSV()
     runner.read(None, simple_csv, ["name"])
-    assert runner.rows[0]["row number"] == 1
-    assert runner.rows[2]["row number"] == 3
+    assert runner.rows[0]["row number"] == 2
+    assert runner.rows[2]["row number"] == 4
 def test_message_and_status_columns_present(simple_csv: Path) -> None:
@@ -223,3 +223,68 @@ def test_enrich_false_validator_error_still_raises(simple_csv: Path) -> None:
     runner = ThaCSV()
     with pytest.raises(ValueError, match="Bob is not allowed"):
         runner.read(None, simple_csv, ["name"], fail_on_bob, enrich=False)
+# --- chunk_size ---
+def test_chunk_size_returns_list(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    result = runner.write(None, tmp_path / "out.csv", chunk_size=2)
+    assert isinstance(result, list)
+def test_chunk_size_correct_file_count(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
+    assert isinstance(paths, list)
+    assert len(paths) == 2  # 3 rows → chunks of 2, 1
+def test_chunk_size_files_exist(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
+    assert isinstance(paths, list)
+    assert all(p.exists() for p in paths)
+def test_chunk_size_naming(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
+    assert isinstance(paths, list)
+    assert paths[0].name == "out_001.csv"
+    assert paths[1].name == "out_002.csv"
+def test_chunk_size_total_rows(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
+    assert isinstance(paths, list)
+    total = sum(len(list(csv.DictReader(p.open()))) for p in paths)
+    assert total == 3
+def test_chunk_size_larger_than_rows(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    paths = runner.write(None, tmp_path / "out.csv", chunk_size=100)
+    assert isinstance(paths, list)
+    assert len(paths) == 1
+def test_chunk_size_zero_raises(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    with pytest.raises(ValueError, match="chunk_size"):
+        runner.write(None, tmp_path / "out.csv", chunk_size=0)
+def test_no_chunk_size_returns_path(simple_csv: Path, tmp_path: Path) -> None:
+    runner = ThaCSV()
+    runner.read(None, simple_csv, ["name"])
+    result = runner.write(None, tmp_path / "out.csv")
+    assert isinstance(result, Path)

{tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/uv.lock RENAMED Viewed

@@ -308,7 +308,7 @@ wheels = [
 [[package]]
 name = "tha-csv-runner"
-version = "0.2.2"
+version = "0.2.3"
 source = { editable = "." }
 dependencies = [
     { name = "tqdm" },