tha-csv-runner 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/PKG-INFO +17 -6
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/README.md +16 -5
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/pyproject.toml +1 -1
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/src/tha_csv_runner/__init__.py +1 -1
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/src/tha_csv_runner/runner.py +34 -12
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/test_runner.py +67 -2
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/uv.lock +1 -1
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/.github/workflows/ci.yml +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/.github/workflows/publish.yml +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/.gitignore +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/LICENSE +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/src/tha_csv_runner/__main__.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/src/tha_csv_runner/errors.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/__init__.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/conftest.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/fixtures/__init__.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/fixtures/processors.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/fixtures/simple.csv +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.3}/tests/fixtures/with_errors.csv +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tha-csv-runner
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Run a function over every row of a CSV — with progress, header validation, and structured per-row errors.
|
|
5
5
|
Project-URL: Homepage, https://github.com/tha-guy-nate/tha-csv-runner
|
|
6
6
|
Project-URL: Issues, https://github.com/tha-guy-nate/tha-csv-runner/issues
|
|
@@ -47,8 +47,8 @@ def process(row: dict) -> None:
|
|
|
47
47
|
|
|
48
48
|
runner = ThaCSV()
|
|
49
49
|
|
|
50
|
-
rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
|
|
51
|
-
runner.write("Step 1 of 1", "output.csv")
|
|
50
|
+
rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
|
|
51
|
+
runner.write("Step 2 of 2", "output.csv")
|
|
52
52
|
```
|
|
53
53
|
|
|
54
54
|
## How it works
|
|
@@ -57,6 +57,7 @@ runner.write("Step 1 of 1", "output.csv")
|
|
|
57
57
|
2. Iterates every row with a `tqdm` progress bar labelled with `desc`
|
|
58
58
|
3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
|
|
59
59
|
4. Appends three columns to every row: `row number`, `row status`, and `message`
|
|
60
|
+
- `row number` starts at 2 (row 1 is the header)
|
|
60
61
|
- On success: `row status` and `message` are blank
|
|
61
62
|
- On error: `row status = "error"`, `message = str(exception)`
|
|
62
63
|
5. `write()` writes all rows (success and error) to a CSV
|
|
@@ -73,7 +74,7 @@ ThaCSV()
|
|
|
73
74
|
|
|
74
75
|
```python
|
|
75
76
|
runner.read(
|
|
76
|
-
"Step
|
|
77
|
+
"Step 1 of 2", # progress bar label — pass None to use the filename
|
|
77
78
|
"data.csv", # path to input CSV
|
|
78
79
|
["a", "b"], # columns that must exist — raises ConfigError if missing
|
|
79
80
|
validator=my_func, # optional: callable(row: dict) -> None
|
|
@@ -89,17 +90,27 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
|
|
|
89
90
|
|
|
90
91
|
```python
|
|
91
92
|
runner.write(
|
|
92
|
-
"Step
|
|
93
|
+
"Step 2 of 2", # progress bar label — pass None to use the output filename
|
|
93
94
|
output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
|
|
94
95
|
sort_by="name", # optional — column name, or list of column names
|
|
95
96
|
ascending=True, # optional — bool or list of bools matching sort_by
|
|
96
97
|
column_order=["name", "email"], # optional — listed columns come first, rest follow
|
|
97
98
|
keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
|
|
98
99
|
drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
|
|
100
|
+
chunk_size=1000, # optional — split output into files of this many rows
|
|
99
101
|
)
|
|
100
102
|
```
|
|
101
103
|
|
|
102
|
-
Returns the `Path` that was written.
|
|
104
|
+
Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
|
|
105
|
+
|
|
106
|
+
#### `chunk_size`
|
|
107
|
+
|
|
108
|
+
When provided, `write()` splits the output into multiple files named `output_001.csv`, `output_002.csv`, etc. and returns a `list[Path]`.
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
|
|
112
|
+
# ["output_001.csv", "output_002.csv", ...]
|
|
113
|
+
```
|
|
103
114
|
|
|
104
115
|
## License
|
|
105
116
|
|
|
@@ -22,8 +22,8 @@ def process(row: dict) -> None:
|
|
|
22
22
|
|
|
23
23
|
runner = ThaCSV()
|
|
24
24
|
|
|
25
|
-
rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
|
|
26
|
-
runner.write("Step 1 of 1", "output.csv")
|
|
25
|
+
rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
|
|
26
|
+
runner.write("Step 2 of 2", "output.csv")
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
## How it works
|
|
@@ -32,6 +32,7 @@ runner.write("Step 1 of 1", "output.csv")
|
|
|
32
32
|
2. Iterates every row with a `tqdm` progress bar labelled with `desc`
|
|
33
33
|
3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
|
|
34
34
|
4. Appends three columns to every row: `row number`, `row status`, and `message`
|
|
35
|
+
- `row number` starts at 2 (row 1 is the header)
|
|
35
36
|
- On success: `row status` and `message` are blank
|
|
36
37
|
- On error: `row status = "error"`, `message = str(exception)`
|
|
37
38
|
5. `write()` writes all rows (success and error) to a CSV
|
|
@@ -48,7 +49,7 @@ ThaCSV()
|
|
|
48
49
|
|
|
49
50
|
```python
|
|
50
51
|
runner.read(
|
|
51
|
-
"Step
|
|
52
|
+
"Step 1 of 2", # progress bar label — pass None to use the filename
|
|
52
53
|
"data.csv", # path to input CSV
|
|
53
54
|
["a", "b"], # columns that must exist — raises ConfigError if missing
|
|
54
55
|
validator=my_func, # optional: callable(row: dict) -> None
|
|
@@ -64,17 +65,27 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
|
|
|
64
65
|
|
|
65
66
|
```python
|
|
66
67
|
runner.write(
|
|
67
|
-
"Step
|
|
68
|
+
"Step 2 of 2", # progress bar label — pass None to use the output filename
|
|
68
69
|
output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
|
|
69
70
|
sort_by="name", # optional — column name, or list of column names
|
|
70
71
|
ascending=True, # optional — bool or list of bools matching sort_by
|
|
71
72
|
column_order=["name", "email"], # optional — listed columns come first, rest follow
|
|
72
73
|
keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
|
|
73
74
|
drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
|
|
75
|
+
chunk_size=1000, # optional — split output into files of this many rows
|
|
74
76
|
)
|
|
75
77
|
```
|
|
76
78
|
|
|
77
|
-
Returns the `Path` that was written.
|
|
79
|
+
Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
|
|
80
|
+
|
|
81
|
+
#### `chunk_size`
|
|
82
|
+
|
|
83
|
+
When provided, `write()` splits the output into multiple files named `output_001.csv`, `output_002.csv`, etc. and returns a `list[Path]`.
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
|
|
87
|
+
# ["output_001.csv", "output_002.csv", ...]
|
|
88
|
+
```
|
|
78
89
|
|
|
79
90
|
## License
|
|
80
91
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tha-csv-runner"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "Run a function over every row of a CSV — with progress, header validation, and structured per-row errors."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import functools
|
|
3
|
+
import shutil
|
|
3
4
|
from collections.abc import Callable
|
|
4
5
|
from datetime import datetime
|
|
5
6
|
from pathlib import Path
|
|
@@ -9,6 +10,10 @@ from tqdm import tqdm
|
|
|
9
10
|
from .errors import ConfigError
|
|
10
11
|
|
|
11
12
|
|
|
13
|
+
def tqdm_ncols(max_cols: int = 85) -> int:
|
|
14
|
+
return min(shutil.get_terminal_size(fallback=(max_cols, 24)).columns, max_cols)
|
|
15
|
+
|
|
16
|
+
|
|
12
17
|
def _sort_key(val: object) -> tuple:
|
|
13
18
|
try:
|
|
14
19
|
return (0, float(val)) # type: ignore[arg-type]
|
|
@@ -16,6 +21,18 @@ def _sort_key(val: object) -> tuple:
|
|
|
16
21
|
return (1, str(val))
|
|
17
22
|
|
|
18
23
|
|
|
24
|
+
def _write_chunk(path: Path, rows: list[dict], cols: list[str], label: str) -> None:
|
|
25
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
26
|
+
with open(path, "w", newline="", encoding="utf-8") as f:
|
|
27
|
+
if rows:
|
|
28
|
+
writer = csv.DictWriter(f, fieldnames=cols)
|
|
29
|
+
writer.writeheader()
|
|
30
|
+
writer.writerows(
|
|
31
|
+
{c: row[c] for c in cols if c in row}
|
|
32
|
+
for row in tqdm(rows, desc=label, ncols=tqdm_ncols())
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
19
36
|
class ThaCSV:
|
|
20
37
|
def __init__(self) -> None:
|
|
21
38
|
self.rows: list[dict] = []
|
|
@@ -45,7 +62,7 @@ class ThaCSV:
|
|
|
45
62
|
self._read = True
|
|
46
63
|
|
|
47
64
|
label = desc if desc is not None else self._input_path.name
|
|
48
|
-
for i, row in enumerate(tqdm(raw_rows, desc=label), start=
|
|
65
|
+
for i, row in enumerate(tqdm(raw_rows, desc=label, ncols=tqdm_ncols()), start=2):
|
|
49
66
|
if enrich:
|
|
50
67
|
enriched = {**row, "row number": i, "row status": "", "message": ""}
|
|
51
68
|
else:
|
|
@@ -72,11 +89,14 @@ class ThaCSV:
|
|
|
72
89
|
column_order: list[str] | None = None,
|
|
73
90
|
keep: list[str] | None = None,
|
|
74
91
|
drop: list[str] | None = None,
|
|
75
|
-
|
|
92
|
+
chunk_size: int | None = None,
|
|
93
|
+
) -> Path | list[Path]:
|
|
76
94
|
if not self._read:
|
|
77
95
|
raise RuntimeError("No data to write — call read() first")
|
|
78
96
|
if keep and drop:
|
|
79
97
|
raise ValueError("Cannot specify both keep and drop")
|
|
98
|
+
if chunk_size is not None and chunk_size < 1:
|
|
99
|
+
raise ValueError("chunk_size must be >= 1")
|
|
80
100
|
|
|
81
101
|
rows = list(self.rows)
|
|
82
102
|
|
|
@@ -121,16 +141,18 @@ class ThaCSV:
|
|
|
121
141
|
output_path = Path(f"{stem}_processed_{ts}.csv")
|
|
122
142
|
|
|
123
143
|
out = Path(output_path)
|
|
124
|
-
out.parent.mkdir(parents=True, exist_ok=True)
|
|
125
144
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
)
|
|
145
|
+
# --- chunked write ---
|
|
146
|
+
if chunk_size is not None:
|
|
147
|
+
chunks = [rows[i:i + chunk_size] for i in range(0, max(len(rows), 1), chunk_size)]
|
|
148
|
+
paths = []
|
|
149
|
+
for idx, chunk in enumerate(chunks, start=1):
|
|
150
|
+
chunk_path = out.parent / f"{out.stem}_{idx:03d}{out.suffix}"
|
|
151
|
+
label = f"{desc} ({idx}/{len(chunks)})" if desc else chunk_path.name
|
|
152
|
+
_write_chunk(chunk_path, chunk, cols, label)
|
|
153
|
+
paths.append(chunk_path)
|
|
154
|
+
return paths
|
|
135
155
|
|
|
156
|
+
write_label = desc if desc is not None else out.name
|
|
157
|
+
_write_chunk(out, rows, cols, write_label)
|
|
136
158
|
return out
|
|
@@ -33,8 +33,8 @@ def test_read_returns_rows(simple_csv: Path) -> None:
|
|
|
33
33
|
def test_row_number_injected(simple_csv: Path) -> None:
|
|
34
34
|
runner = ThaCSV()
|
|
35
35
|
runner.read(None, simple_csv, ["name"])
|
|
36
|
-
assert runner.rows[0]["row number"] ==
|
|
37
|
-
assert runner.rows[2]["row number"] ==
|
|
36
|
+
assert runner.rows[0]["row number"] == 2
|
|
37
|
+
assert runner.rows[2]["row number"] == 4
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def test_message_and_status_columns_present(simple_csv: Path) -> None:
|
|
@@ -223,3 +223,68 @@ def test_enrich_false_validator_error_still_raises(simple_csv: Path) -> None:
|
|
|
223
223
|
runner = ThaCSV()
|
|
224
224
|
with pytest.raises(ValueError, match="Bob is not allowed"):
|
|
225
225
|
runner.read(None, simple_csv, ["name"], fail_on_bob, enrich=False)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# --- chunk_size ---
|
|
229
|
+
|
|
230
|
+
def test_chunk_size_returns_list(simple_csv: Path, tmp_path: Path) -> None:
|
|
231
|
+
runner = ThaCSV()
|
|
232
|
+
runner.read(None, simple_csv, ["name"])
|
|
233
|
+
result = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
234
|
+
assert isinstance(result, list)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def test_chunk_size_correct_file_count(simple_csv: Path, tmp_path: Path) -> None:
|
|
238
|
+
runner = ThaCSV()
|
|
239
|
+
runner.read(None, simple_csv, ["name"])
|
|
240
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
241
|
+
assert isinstance(paths, list)
|
|
242
|
+
assert len(paths) == 2 # 3 rows → chunks of 2, 1
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def test_chunk_size_files_exist(simple_csv: Path, tmp_path: Path) -> None:
|
|
246
|
+
runner = ThaCSV()
|
|
247
|
+
runner.read(None, simple_csv, ["name"])
|
|
248
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
249
|
+
assert isinstance(paths, list)
|
|
250
|
+
assert all(p.exists() for p in paths)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def test_chunk_size_naming(simple_csv: Path, tmp_path: Path) -> None:
|
|
254
|
+
runner = ThaCSV()
|
|
255
|
+
runner.read(None, simple_csv, ["name"])
|
|
256
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
257
|
+
assert isinstance(paths, list)
|
|
258
|
+
assert paths[0].name == "out_001.csv"
|
|
259
|
+
assert paths[1].name == "out_002.csv"
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def test_chunk_size_total_rows(simple_csv: Path, tmp_path: Path) -> None:
|
|
263
|
+
runner = ThaCSV()
|
|
264
|
+
runner.read(None, simple_csv, ["name"])
|
|
265
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
266
|
+
assert isinstance(paths, list)
|
|
267
|
+
total = sum(len(list(csv.DictReader(p.open()))) for p in paths)
|
|
268
|
+
assert total == 3
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def test_chunk_size_larger_than_rows(simple_csv: Path, tmp_path: Path) -> None:
|
|
272
|
+
runner = ThaCSV()
|
|
273
|
+
runner.read(None, simple_csv, ["name"])
|
|
274
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=100)
|
|
275
|
+
assert isinstance(paths, list)
|
|
276
|
+
assert len(paths) == 1
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def test_chunk_size_zero_raises(simple_csv: Path, tmp_path: Path) -> None:
|
|
280
|
+
runner = ThaCSV()
|
|
281
|
+
runner.read(None, simple_csv, ["name"])
|
|
282
|
+
with pytest.raises(ValueError, match="chunk_size"):
|
|
283
|
+
runner.write(None, tmp_path / "out.csv", chunk_size=0)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def test_no_chunk_size_returns_path(simple_csv: Path, tmp_path: Path) -> None:
|
|
287
|
+
runner = ThaCSV()
|
|
288
|
+
runner.read(None, simple_csv, ["name"])
|
|
289
|
+
result = runner.write(None, tmp_path / "out.csv")
|
|
290
|
+
assert isinstance(result, Path)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|