tha-csv-runner 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tha-csv-runner
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Run a function over every row of a CSV — with progress, header validation, and structured per-row errors.
5
5
  Project-URL: Homepage, https://github.com/tha-guy-nate/tha-csv-runner
6
6
  Project-URL: Issues, https://github.com/tha-guy-nate/tha-csv-runner/issues
@@ -47,8 +47,8 @@ def process(row: dict) -> None:
47
47
 
48
48
  runner = ThaCSV()
49
49
 
50
- rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
51
- runner.write("Step 1 of 1", "output.csv")
50
+ rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
51
+ runner.write("Step 2 of 2", "output.csv")
52
52
  ```
53
53
 
54
54
  ## How it works
@@ -57,6 +57,7 @@ runner.write("Step 1 of 1", "output.csv")
57
57
  2. Iterates every row with a `tqdm` progress bar labelled with `desc`
58
58
  3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
59
59
  4. Appends three columns to every row: `row number`, `row status`, and `message`
60
+ - `row number` starts at 2 (row 1 is the header)
60
61
  - On success: `row status` and `message` are blank
61
62
  - On error: `row status = "error"`, `message = str(exception)`
62
63
  5. `write()` writes all rows (success and error) to a CSV
@@ -73,7 +74,7 @@ ThaCSV()
73
74
 
74
75
  ```python
75
76
  runner.read(
76
- "Step 2 of 10", # progress bar label — pass None to use the filename
77
+ "Step 1 of 2", # progress bar label — pass None to use the filename
77
78
  "data.csv", # path to input CSV
78
79
  ["a", "b"], # columns that must exist — raises ConfigError if missing
79
80
  validator=my_func, # optional: callable(row: dict) -> None
@@ -89,17 +90,27 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
89
90
 
90
91
  ```python
91
92
  runner.write(
92
- "Step 10 of 10", # progress bar label — pass None to use the output filename
93
+ "Step 2 of 2", # progress bar label — pass None to use the output filename
93
94
  output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
94
95
  sort_by="name", # optional — column name, or list of column names
95
96
  ascending=True, # optional — bool or list of bools matching sort_by
96
97
  column_order=["name", "email"], # optional — listed columns come first, rest follow
97
98
  keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
98
99
  drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
100
+ chunk_size=1000, # optional — split output into files of this many rows
99
101
  )
100
102
  ```
101
103
 
102
- Returns the `Path` that was written.
104
+ Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
105
+
106
+ #### `chunk_size`
107
+
108
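+ When provided, `write()` splits the output into multiple files of at most `chunk_size` rows each, named after the output path with a zero-padded index (`output_001.csv`, `output_002.csv`, etc.), and returns a `list[Path]`. `chunk_size` must be at least 1; otherwise a `ValueError` is raised.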
109
+
110
+ ```python
111
+ paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
112
+ # [Path("output_001.csv"), Path("output_002.csv"), ...]
113
+ ```
103
114
 
104
115
  ## License
105
116
 
@@ -22,8 +22,8 @@ def process(row: dict) -> None:
22
22
 
23
23
  runner = ThaCSV()
24
24
 
25
- rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
26
- runner.write("Step 1 of 1", "output.csv")
25
+ rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
26
+ runner.write("Step 2 of 2", "output.csv")
27
27
  ```
28
28
 
29
29
  ## How it works
@@ -32,6 +32,7 @@ runner.write("Step 1 of 1", "output.csv")
32
32
  2. Iterates every row with a `tqdm` progress bar labelled with `desc`
33
33
  3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
34
34
  4. Appends three columns to every row: `row number`, `row status`, and `message`
35
+ - `row number` starts at 2 (row 1 is the header)
35
36
  - On success: `row status` and `message` are blank
36
37
  - On error: `row status = "error"`, `message = str(exception)`
37
38
  5. `write()` writes all rows (success and error) to a CSV
@@ -48,7 +49,7 @@ ThaCSV()
48
49
 
49
50
  ```python
50
51
  runner.read(
51
- "Step 2 of 10", # progress bar label — pass None to use the filename
52
+ "Step 1 of 2", # progress bar label — pass None to use the filename
52
53
  "data.csv", # path to input CSV
53
54
  ["a", "b"], # columns that must exist — raises ConfigError if missing
54
55
  validator=my_func, # optional: callable(row: dict) -> None
@@ -64,17 +65,27 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
64
65
 
65
66
  ```python
66
67
  runner.write(
67
- "Step 10 of 10", # progress bar label — pass None to use the output filename
68
+ "Step 2 of 2", # progress bar label — pass None to use the output filename
68
69
  output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
69
70
  sort_by="name", # optional — column name, or list of column names
70
71
  ascending=True, # optional — bool or list of bools matching sort_by
71
72
  column_order=["name", "email"], # optional — listed columns come first, rest follow
72
73
  keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
73
74
  drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
75
+ chunk_size=1000, # optional — split output into files of this many rows
74
76
  )
75
77
  ```
76
78
 
77
- Returns the `Path` that was written.
79
+ Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
80
+
81
+ #### `chunk_size`
82
+
83
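+ When provided, `write()` splits the output into multiple files of at most `chunk_size` rows each, named after the output path with a zero-padded index (`output_001.csv`, `output_002.csv`, etc.), and returns a `list[Path]`. `chunk_size` must be at least 1; otherwise a `ValueError` is raised.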
84
+
85
+ ```python
86
+ paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
87
+ # [Path("output_001.csv"), Path("output_002.csv"), ...]
88
+ ```
78
89
 
79
90
  ## License
80
91
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tha-csv-runner"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "Run a function over every row of a CSV — with progress, header validation, and structured per-row errors."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -3,5 +3,5 @@
3
3
  from .errors import ConfigError
4
4
  from .runner import ThaCSV
5
5
 
6
- __version__ = "0.2.2"
6
+ __version__ = "0.2.3"
7
7
  __all__ = ["ConfigError", "ThaCSV"]
@@ -1,5 +1,6 @@
1
1
  import csv
2
2
  import functools
3
+ import shutil
3
4
  from collections.abc import Callable
4
5
  from datetime import datetime
5
6
  from pathlib import Path
@@ -9,6 +10,10 @@ from tqdm import tqdm
9
10
  from .errors import ConfigError
10
11
 
11
12
 
13
+ def tqdm_ncols(max_cols: int = 85) -> int:
14
+ return min(shutil.get_terminal_size(fallback=(max_cols, 24)).columns, max_cols)
15
+
16
+
12
17
  def _sort_key(val: object) -> tuple:
13
18
  try:
14
19
  return (0, float(val)) # type: ignore[arg-type]
@@ -16,6 +21,18 @@ def _sort_key(val: object) -> tuple:
16
21
  return (1, str(val))
17
22
 
18
23
 
24
+ def _write_chunk(path: Path, rows: list[dict], cols: list[str], label: str) -> None:
25
+ path.parent.mkdir(parents=True, exist_ok=True)
26
+ with open(path, "w", newline="", encoding="utf-8") as f:
27
+ if rows:
28
+ writer = csv.DictWriter(f, fieldnames=cols)
29
+ writer.writeheader()
30
+ writer.writerows(
31
+ {c: row[c] for c in cols if c in row}
32
+ for row in tqdm(rows, desc=label, ncols=tqdm_ncols())
33
+ )
34
+
35
+
19
36
  class ThaCSV:
20
37
  def __init__(self) -> None:
21
38
  self.rows: list[dict] = []
@@ -45,7 +62,7 @@ class ThaCSV:
45
62
  self._read = True
46
63
 
47
64
  label = desc if desc is not None else self._input_path.name
48
- for i, row in enumerate(tqdm(raw_rows, desc=label), start=1):
65
+ for i, row in enumerate(tqdm(raw_rows, desc=label, ncols=tqdm_ncols()), start=2):
49
66
  if enrich:
50
67
  enriched = {**row, "row number": i, "row status": "", "message": ""}
51
68
  else:
@@ -72,11 +89,14 @@ class ThaCSV:
72
89
  column_order: list[str] | None = None,
73
90
  keep: list[str] | None = None,
74
91
  drop: list[str] | None = None,
75
- ) -> Path:
92
+ chunk_size: int | None = None,
93
+ ) -> Path | list[Path]:
76
94
  if not self._read:
77
95
  raise RuntimeError("No data to write — call read() first")
78
96
  if keep and drop:
79
97
  raise ValueError("Cannot specify both keep and drop")
98
+ if chunk_size is not None and chunk_size < 1:
99
+ raise ValueError("chunk_size must be >= 1")
80
100
 
81
101
  rows = list(self.rows)
82
102
 
@@ -121,16 +141,18 @@ class ThaCSV:
121
141
  output_path = Path(f"{stem}_processed_{ts}.csv")
122
142
 
123
143
  out = Path(output_path)
124
- out.parent.mkdir(parents=True, exist_ok=True)
125
144
 
126
- write_label = desc if desc is not None else out.name
127
- with open(out, "w", newline="", encoding="utf-8") as f:
128
- if rows:
129
- writer = csv.DictWriter(f, fieldnames=cols)
130
- writer.writeheader()
131
- writer.writerows(
132
- {c: row[c] for c in cols if c in row}
133
- for row in tqdm(rows, desc=write_label)
134
- )
145
+ # --- chunked write ---
146
+ if chunk_size is not None:
147
+ chunks = [rows[i:i + chunk_size] for i in range(0, max(len(rows), 1), chunk_size)]
148
+ paths = []
149
+ for idx, chunk in enumerate(chunks, start=1):
150
+ chunk_path = out.parent / f"{out.stem}_{idx:03d}{out.suffix}"
151
+ label = f"{desc} ({idx}/{len(chunks)})" if desc else chunk_path.name
152
+ _write_chunk(chunk_path, chunk, cols, label)
153
+ paths.append(chunk_path)
154
+ return paths
135
155
 
156
+ write_label = desc if desc is not None else out.name
157
+ _write_chunk(out, rows, cols, write_label)
136
158
  return out
@@ -33,8 +33,8 @@ def test_read_returns_rows(simple_csv: Path) -> None:
33
33
  def test_row_number_injected(simple_csv: Path) -> None:
34
34
  runner = ThaCSV()
35
35
  runner.read(None, simple_csv, ["name"])
36
- assert runner.rows[0]["row number"] == 1
37
- assert runner.rows[2]["row number"] == 3
36
+ assert runner.rows[0]["row number"] == 2
37
+ assert runner.rows[2]["row number"] == 4
38
38
 
39
39
 
40
40
  def test_message_and_status_columns_present(simple_csv: Path) -> None:
@@ -223,3 +223,68 @@ def test_enrich_false_validator_error_still_raises(simple_csv: Path) -> None:
223
223
  runner = ThaCSV()
224
224
  with pytest.raises(ValueError, match="Bob is not allowed"):
225
225
  runner.read(None, simple_csv, ["name"], fail_on_bob, enrich=False)
226
+
227
+
228
+ # --- chunk_size ---
229
+
230
+ def test_chunk_size_returns_list(simple_csv: Path, tmp_path: Path) -> None:
231
+ runner = ThaCSV()
232
+ runner.read(None, simple_csv, ["name"])
233
+ result = runner.write(None, tmp_path / "out.csv", chunk_size=2)
234
+ assert isinstance(result, list)
235
+
236
+
237
+ def test_chunk_size_correct_file_count(simple_csv: Path, tmp_path: Path) -> None:
238
+ runner = ThaCSV()
239
+ runner.read(None, simple_csv, ["name"])
240
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
241
+ assert isinstance(paths, list)
242
+ assert len(paths) == 2 # 3 rows → chunks of 2, 1
243
+
244
+
245
+ def test_chunk_size_files_exist(simple_csv: Path, tmp_path: Path) -> None:
246
+ runner = ThaCSV()
247
+ runner.read(None, simple_csv, ["name"])
248
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
249
+ assert isinstance(paths, list)
250
+ assert all(p.exists() for p in paths)
251
+
252
+
253
+ def test_chunk_size_naming(simple_csv: Path, tmp_path: Path) -> None:
254
+ runner = ThaCSV()
255
+ runner.read(None, simple_csv, ["name"])
256
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
257
+ assert isinstance(paths, list)
258
+ assert paths[0].name == "out_001.csv"
259
+ assert paths[1].name == "out_002.csv"
260
+
261
+
262
+ def test_chunk_size_total_rows(simple_csv: Path, tmp_path: Path) -> None:
263
+ runner = ThaCSV()
264
+ runner.read(None, simple_csv, ["name"])
265
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
266
+ assert isinstance(paths, list)
267
+ total = sum(len(list(csv.DictReader(p.open()))) for p in paths)
268
+ assert total == 3
269
+
270
+
271
+ def test_chunk_size_larger_than_rows(simple_csv: Path, tmp_path: Path) -> None:
272
+ runner = ThaCSV()
273
+ runner.read(None, simple_csv, ["name"])
274
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=100)
275
+ assert isinstance(paths, list)
276
+ assert len(paths) == 1
277
+
278
+
279
+ def test_chunk_size_zero_raises(simple_csv: Path, tmp_path: Path) -> None:
280
+ runner = ThaCSV()
281
+ runner.read(None, simple_csv, ["name"])
282
+ with pytest.raises(ValueError, match="chunk_size"):
283
+ runner.write(None, tmp_path / "out.csv", chunk_size=0)
284
+
285
+
286
+ def test_no_chunk_size_returns_path(simple_csv: Path, tmp_path: Path) -> None:
287
+ runner = ThaCSV()
288
+ runner.read(None, simple_csv, ["name"])
289
+ result = runner.write(None, tmp_path / "out.csv")
290
+ assert isinstance(result, Path)
@@ -308,7 +308,7 @@ wheels = [
308
308
 
309
309
  [[package]]
310
310
  name = "tha-csv-runner"
311
- version = "0.2.2"
311
+ version = "0.2.3"
312
312
  source = { editable = "." }
313
313
  dependencies = [
314
314
  { name = "tqdm" },
File without changes