tha-csv-runner 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/PKG-INFO +20 -6
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/README.md +19 -5
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/pyproject.toml +1 -1
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/src/tha_csv_runner/__init__.py +1 -1
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/src/tha_csv_runner/runner.py +49 -19
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/tests/test_runner.py +78 -2
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/uv.lock +1 -1
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/.github/workflows/ci.yml +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/.github/workflows/publish.yml +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/.gitignore +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/LICENSE +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/src/tha_csv_runner/__main__.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/src/tha_csv_runner/errors.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/tests/__init__.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/tests/conftest.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/tests/fixtures/__init__.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/tests/fixtures/processors.py +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/tests/fixtures/simple.csv +0 -0
- {tha_csv_runner-0.2.2 → tha_csv_runner-0.2.4}/tests/fixtures/with_errors.csv +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tha-csv-runner
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Run a function over every row of a CSV — with progress, header validation, and structured per-row errors.
|
|
5
5
|
Project-URL: Homepage, https://github.com/tha-guy-nate/tha-csv-runner
|
|
6
6
|
Project-URL: Issues, https://github.com/tha-guy-nate/tha-csv-runner/issues
|
|
@@ -47,8 +47,8 @@ def process(row: dict) -> None:
|
|
|
47
47
|
|
|
48
48
|
runner = ThaCSV()
|
|
49
49
|
|
|
50
|
-
rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
|
|
51
|
-
runner.write("Step 1 of 1", "output.csv")
|
|
50
|
+
rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
|
|
51
|
+
runner.write("Step 2 of 2", "output.csv")
|
|
52
52
|
```
|
|
53
53
|
|
|
54
54
|
## How it works
|
|
@@ -57,6 +57,7 @@ runner.write("Step 1 of 1", "output.csv")
|
|
|
57
57
|
2. Iterates every row with a `tqdm` progress bar labelled with `desc`
|
|
58
58
|
3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
|
|
59
59
|
4. Appends three columns to every row: `row number`, `row status`, and `message`
|
|
60
|
+
- `row number` starts at 2 (row 1 is the header)
|
|
60
61
|
- On success: `row status` and `message` are blank
|
|
61
62
|
- On error: `row status = "error"`, `message = str(exception)`
|
|
62
63
|
5. `write()` writes all rows (success and error) to a CSV
|
|
@@ -73,7 +74,7 @@ ThaCSV()
|
|
|
73
74
|
|
|
74
75
|
```python
|
|
75
76
|
runner.read(
|
|
76
|
-
"Step
|
|
77
|
+
"Step 1 of 2", # progress bar label — pass None to use the filename
|
|
77
78
|
"data.csv", # path to input CSV
|
|
78
79
|
["a", "b"], # columns that must exist — raises ConfigError if missing
|
|
79
80
|
validator=my_func, # optional: callable(row: dict) -> None
|
|
@@ -89,17 +90,30 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
|
|
|
89
90
|
|
|
90
91
|
```python
|
|
91
92
|
runner.write(
|
|
92
|
-
"Step
|
|
93
|
+
"Step 2 of 2", # progress bar label — pass None for "Writing {stem} CSV"
|
|
93
94
|
output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
|
|
95
|
+
rows=my_rows, # optional — use these rows instead of runner.rows
|
|
94
96
|
sort_by="name", # optional — column name, or list of column names
|
|
95
97
|
ascending=True, # optional — bool or list of bools matching sort_by
|
|
96
98
|
column_order=["name", "email"], # optional — listed columns come first, rest follow
|
|
97
99
|
keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
|
|
98
100
|
drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
|
|
101
|
+
chunk_size=1000, # optional — split output into files of this many rows
|
|
99
102
|
)
|
|
100
103
|
```
|
|
101
104
|
|
|
102
|
-
|
|
105
|
+
Prints `:white_check_mark: Done! CSV was written to: {path}` on completion. Override by setting `runner.status_cb = my_fn`.
|
|
106
|
+
|
|
107
|
+
Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
|
|
108
|
+
|
|
109
|
+
#### `chunk_size`
|
|
110
|
+
|
|
111
|
+
When provided, `write()` splits the output into multiple files named `output_001.csv`, `output_002.csv`, etc. and returns a `list[Path]`.
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
|
|
115
|
+
# ["output_001.csv", "output_002.csv", ...]
|
|
116
|
+
```
|
|
103
117
|
|
|
104
118
|
## License
|
|
105
119
|
|
|
@@ -22,8 +22,8 @@ def process(row: dict) -> None:
|
|
|
22
22
|
|
|
23
23
|
runner = ThaCSV()
|
|
24
24
|
|
|
25
|
-
rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
|
|
26
|
-
runner.write("Step 1 of 1", "output.csv")
|
|
25
|
+
rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
|
|
26
|
+
runner.write("Step 2 of 2", "output.csv")
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
## How it works
|
|
@@ -32,6 +32,7 @@ runner.write("Step 1 of 1", "output.csv")
|
|
|
32
32
|
2. Iterates every row with a `tqdm` progress bar labelled with `desc`
|
|
33
33
|
3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
|
|
34
34
|
4. Appends three columns to every row: `row number`, `row status`, and `message`
|
|
35
|
+
- `row number` starts at 2 (row 1 is the header)
|
|
35
36
|
- On success: `row status` and `message` are blank
|
|
36
37
|
- On error: `row status = "error"`, `message = str(exception)`
|
|
37
38
|
5. `write()` writes all rows (success and error) to a CSV
|
|
@@ -48,7 +49,7 @@ ThaCSV()
|
|
|
48
49
|
|
|
49
50
|
```python
|
|
50
51
|
runner.read(
|
|
51
|
-
"Step
|
|
52
|
+
"Step 1 of 2", # progress bar label — pass None to use the filename
|
|
52
53
|
"data.csv", # path to input CSV
|
|
53
54
|
["a", "b"], # columns that must exist — raises ConfigError if missing
|
|
54
55
|
validator=my_func, # optional: callable(row: dict) -> None
|
|
@@ -64,17 +65,30 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
|
|
|
64
65
|
|
|
65
66
|
```python
|
|
66
67
|
runner.write(
|
|
67
|
-
"Step
|
|
68
|
+
"Step 2 of 2", # progress bar label — pass None for "Writing {stem} CSV"
|
|
68
69
|
output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
|
|
70
|
+
rows=my_rows, # optional — use these rows instead of runner.rows
|
|
69
71
|
sort_by="name", # optional — column name, or list of column names
|
|
70
72
|
ascending=True, # optional — bool or list of bools matching sort_by
|
|
71
73
|
column_order=["name", "email"], # optional — listed columns come first, rest follow
|
|
72
74
|
keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
|
|
73
75
|
drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
|
|
76
|
+
chunk_size=1000, # optional — split output into files of this many rows
|
|
74
77
|
)
|
|
75
78
|
```
|
|
76
79
|
|
|
77
|
-
|
|
80
|
+
Prints `:white_check_mark: Done! CSV was written to: {path}` on completion. Override by setting `runner.status_cb = my_fn`.
|
|
81
|
+
|
|
82
|
+
Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
|
|
83
|
+
|
|
84
|
+
#### `chunk_size`
|
|
85
|
+
|
|
86
|
+
When provided, `write()` splits the output into multiple files named `output_001.csv`, `output_002.csv`, etc. and returns a `list[Path]`.
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
|
|
90
|
+
# ["output_001.csv", "output_002.csv", ...]
|
|
91
|
+
```
|
|
78
92
|
|
|
79
93
|
## License
|
|
80
94
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "tha-csv-runner"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.4"
|
|
8
8
|
description = "Run a function over every row of a CSV — with progress, header validation, and structured per-row errors."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import functools
|
|
3
|
+
import shutil
|
|
3
4
|
from collections.abc import Callable
|
|
4
5
|
from datetime import datetime
|
|
5
6
|
from pathlib import Path
|
|
@@ -9,6 +10,10 @@ from tqdm import tqdm
|
|
|
9
10
|
from .errors import ConfigError
|
|
10
11
|
|
|
11
12
|
|
|
13
|
+
def tqdm_ncols(max_cols: int = 85) -> int:
|
|
14
|
+
return min(shutil.get_terminal_size(fallback=(max_cols, 24)).columns, max_cols)
|
|
15
|
+
|
|
16
|
+
|
|
12
17
|
def _sort_key(val: object) -> tuple:
|
|
13
18
|
try:
|
|
14
19
|
return (0, float(val)) # type: ignore[arg-type]
|
|
@@ -16,11 +21,24 @@ def _sort_key(val: object) -> tuple:
|
|
|
16
21
|
return (1, str(val))
|
|
17
22
|
|
|
18
23
|
|
|
24
|
+
def _write_chunk(path: Path, rows: list[dict], cols: list[str], label: str) -> None:
|
|
25
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
26
|
+
with open(path, "w", newline="", encoding="utf-8") as f:
|
|
27
|
+
if rows:
|
|
28
|
+
writer = csv.DictWriter(f, fieldnames=cols)
|
|
29
|
+
writer.writeheader()
|
|
30
|
+
writer.writerows(
|
|
31
|
+
{c: row[c] for c in cols if c in row}
|
|
32
|
+
for row in tqdm(rows, desc=label, ncols=tqdm_ncols())
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
19
36
|
class ThaCSV:
|
|
20
37
|
def __init__(self) -> None:
|
|
21
38
|
self.rows: list[dict] = []
|
|
22
39
|
self._read: bool = False
|
|
23
40
|
self._input_path: Path | None = None
|
|
41
|
+
self.status_cb = print
|
|
24
42
|
|
|
25
43
|
def read(
|
|
26
44
|
self,
|
|
@@ -44,8 +62,8 @@ class ThaCSV:
|
|
|
44
62
|
self.rows = []
|
|
45
63
|
self._read = True
|
|
46
64
|
|
|
47
|
-
label = desc if desc is not None else self._input_path.
|
|
48
|
-
for i, row in enumerate(tqdm(raw_rows, desc=label), start=
|
|
65
|
+
label = desc if desc is not None else f"Reading {self._input_path.stem} CSV"
|
|
66
|
+
for i, row in enumerate(tqdm(raw_rows, desc=label, ncols=tqdm_ncols()), start=2):
|
|
49
67
|
if enrich:
|
|
50
68
|
enriched = {**row, "row number": i, "row status": "", "message": ""}
|
|
51
69
|
else:
|
|
@@ -67,18 +85,22 @@ class ThaCSV:
|
|
|
67
85
|
self,
|
|
68
86
|
desc: str | None,
|
|
69
87
|
output_path: str | Path | None = None,
|
|
88
|
+
rows: list[dict] | None = None,
|
|
70
89
|
sort_by: str | list[str] | None = None,
|
|
71
90
|
ascending: bool | list[bool] = True,
|
|
72
91
|
column_order: list[str] | None = None,
|
|
73
92
|
keep: list[str] | None = None,
|
|
74
93
|
drop: list[str] | None = None,
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
94
|
+
chunk_size: int | None = None,
|
|
95
|
+
) -> Path | list[Path]:
|
|
96
|
+
if rows is None and not self._read:
|
|
97
|
+
raise RuntimeError("No data to write — call read() first or pass rows=")
|
|
78
98
|
if keep and drop:
|
|
79
99
|
raise ValueError("Cannot specify both keep and drop")
|
|
100
|
+
if chunk_size is not None and chunk_size < 1:
|
|
101
|
+
raise ValueError("chunk_size must be >= 1")
|
|
80
102
|
|
|
81
|
-
rows = list(self.rows)
|
|
103
|
+
rows = list(rows) if rows is not None else list(self.rows)
|
|
82
104
|
|
|
83
105
|
# --- column filtering ---
|
|
84
106
|
all_cols = list(rows[0].keys()) if rows else []
|
|
@@ -120,17 +142,25 @@ class ThaCSV:
|
|
|
120
142
|
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
121
143
|
output_path = Path(f"{stem}_processed_{ts}.csv")
|
|
122
144
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
{
|
|
133
|
-
|
|
145
|
+
output_file = Path(output_path)
|
|
146
|
+
|
|
147
|
+
# --- chunked write ---
|
|
148
|
+
if chunk_size is not None:
|
|
149
|
+
chunks = [rows[i:i + chunk_size] for i in range(0, max(len(rows), 1), chunk_size)]
|
|
150
|
+
paths = []
|
|
151
|
+
for idx, chunk in enumerate(chunks, start=1):
|
|
152
|
+
chunk_path = output_file.parent / f"{output_file.stem}_{idx:03d}{output_file.suffix}"
|
|
153
|
+
label = (
|
|
154
|
+
f"{desc} ({idx}/{len(chunks)})"
|
|
155
|
+
if desc
|
|
156
|
+
else f"Writing {output_file.stem} CSV ({idx}/{len(chunks)})"
|
|
134
157
|
)
|
|
135
|
-
|
|
136
|
-
|
|
158
|
+
_write_chunk(chunk_path, chunk, cols, label)
|
|
159
|
+
paths.append(chunk_path)
|
|
160
|
+
self.status_cb(f":white_check_mark: Done! CSV was written to: {paths}")
|
|
161
|
+
return paths
|
|
162
|
+
|
|
163
|
+
write_label = desc if desc is not None else f"Writing {output_file.stem} CSV"
|
|
164
|
+
_write_chunk(output_file, rows, cols, write_label)
|
|
165
|
+
self.status_cb(f":white_check_mark: Done! CSV was written to: {output_file}")
|
|
166
|
+
return output_file
|
|
@@ -33,8 +33,8 @@ def test_read_returns_rows(simple_csv: Path) -> None:
|
|
|
33
33
|
def test_row_number_injected(simple_csv: Path) -> None:
|
|
34
34
|
runner = ThaCSV()
|
|
35
35
|
runner.read(None, simple_csv, ["name"])
|
|
36
|
-
assert runner.rows[0]["row number"] ==
|
|
37
|
-
assert runner.rows[2]["row number"] ==
|
|
36
|
+
assert runner.rows[0]["row number"] == 2
|
|
37
|
+
assert runner.rows[2]["row number"] == 4
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def test_message_and_status_columns_present(simple_csv: Path) -> None:
|
|
@@ -118,6 +118,17 @@ def test_write_before_read_raises(simple_csv: Path) -> None:
|
|
|
118
118
|
runner.write(None)
|
|
119
119
|
|
|
120
120
|
|
|
121
|
+
def test_write_rows_param_bypasses_read_guard(tmp_path: Path) -> None:
|
|
122
|
+
out = tmp_path / "out.csv"
|
|
123
|
+
rows = [{"name": "Alice", "val": "1"}, {"name": "Bob", "val": "2"}]
|
|
124
|
+
runner = ThaCSV()
|
|
125
|
+
result = runner.write(None, out, rows=rows)
|
|
126
|
+
assert isinstance(result, Path)
|
|
127
|
+
written = list(csv.DictReader(out.open()))
|
|
128
|
+
assert len(written) == 2
|
|
129
|
+
assert written[0]["name"] == "Alice"
|
|
130
|
+
|
|
131
|
+
|
|
121
132
|
def test_write_sort_by_single(simple_csv: Path, tmp_path: Path) -> None:
|
|
122
133
|
out = tmp_path / "out.csv"
|
|
123
134
|
runner = ThaCSV()
|
|
@@ -223,3 +234,68 @@ def test_enrich_false_validator_error_still_raises(simple_csv: Path) -> None:
|
|
|
223
234
|
runner = ThaCSV()
|
|
224
235
|
with pytest.raises(ValueError, match="Bob is not allowed"):
|
|
225
236
|
runner.read(None, simple_csv, ["name"], fail_on_bob, enrich=False)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# --- chunk_size ---
|
|
240
|
+
|
|
241
|
+
def test_chunk_size_returns_list(simple_csv: Path, tmp_path: Path) -> None:
|
|
242
|
+
runner = ThaCSV()
|
|
243
|
+
runner.read(None, simple_csv, ["name"])
|
|
244
|
+
result = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
245
|
+
assert isinstance(result, list)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def test_chunk_size_correct_file_count(simple_csv: Path, tmp_path: Path) -> None:
|
|
249
|
+
runner = ThaCSV()
|
|
250
|
+
runner.read(None, simple_csv, ["name"])
|
|
251
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
252
|
+
assert isinstance(paths, list)
|
|
253
|
+
assert len(paths) == 2 # 3 rows → chunks of 2, 1
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def test_chunk_size_files_exist(simple_csv: Path, tmp_path: Path) -> None:
|
|
257
|
+
runner = ThaCSV()
|
|
258
|
+
runner.read(None, simple_csv, ["name"])
|
|
259
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
260
|
+
assert isinstance(paths, list)
|
|
261
|
+
assert all(p.exists() for p in paths)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def test_chunk_size_naming(simple_csv: Path, tmp_path: Path) -> None:
|
|
265
|
+
runner = ThaCSV()
|
|
266
|
+
runner.read(None, simple_csv, ["name"])
|
|
267
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
268
|
+
assert isinstance(paths, list)
|
|
269
|
+
assert paths[0].name == "out_001.csv"
|
|
270
|
+
assert paths[1].name == "out_002.csv"
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def test_chunk_size_total_rows(simple_csv: Path, tmp_path: Path) -> None:
|
|
274
|
+
runner = ThaCSV()
|
|
275
|
+
runner.read(None, simple_csv, ["name"])
|
|
276
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
|
|
277
|
+
assert isinstance(paths, list)
|
|
278
|
+
total = sum(len(list(csv.DictReader(p.open()))) for p in paths)
|
|
279
|
+
assert total == 3
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def test_chunk_size_larger_than_rows(simple_csv: Path, tmp_path: Path) -> None:
|
|
283
|
+
runner = ThaCSV()
|
|
284
|
+
runner.read(None, simple_csv, ["name"])
|
|
285
|
+
paths = runner.write(None, tmp_path / "out.csv", chunk_size=100)
|
|
286
|
+
assert isinstance(paths, list)
|
|
287
|
+
assert len(paths) == 1
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def test_chunk_size_zero_raises(simple_csv: Path, tmp_path: Path) -> None:
|
|
291
|
+
runner = ThaCSV()
|
|
292
|
+
runner.read(None, simple_csv, ["name"])
|
|
293
|
+
with pytest.raises(ValueError, match="chunk_size"):
|
|
294
|
+
runner.write(None, tmp_path / "out.csv", chunk_size=0)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def test_no_chunk_size_returns_path(simple_csv: Path, tmp_path: Path) -> None:
|
|
298
|
+
runner = ThaCSV()
|
|
299
|
+
runner.read(None, simple_csv, ["name"])
|
|
300
|
+
result = runner.write(None, tmp_path / "out.csv")
|
|
301
|
+
assert isinstance(result, Path)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|