tha-csv-runner 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tha-csv-runner
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Run a function over every row of a CSV — with progress, header validation, and structured per-row errors.
5
5
  Project-URL: Homepage, https://github.com/tha-guy-nate/tha-csv-runner
6
6
  Project-URL: Issues, https://github.com/tha-guy-nate/tha-csv-runner/issues
@@ -47,8 +47,8 @@ def process(row: dict) -> None:
47
47
 
48
48
  runner = ThaCSV()
49
49
 
50
- rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
51
- runner.write("Step 1 of 1", "output.csv")
50
+ rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
51
+ runner.write("Step 2 of 2", "output.csv")
52
52
  ```
53
53
 
54
54
  ## How it works
@@ -57,6 +57,7 @@ runner.write("Step 1 of 1", "output.csv")
57
57
  2. Iterates every row with a `tqdm` progress bar labelled with `desc`
58
58
  3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
59
59
  4. Appends three columns to every row: `row number`, `row status`, and `message`
60
+ - `row number` starts at 2 (row 1 is the header)
60
61
  - On success: `row status` and `message` are blank
61
62
  - On error: `row status = "error"`, `message = str(exception)`
62
63
  5. `write()` writes all rows (success and error) to a CSV
@@ -73,7 +74,7 @@ ThaCSV()
73
74
 
74
75
  ```python
75
76
  runner.read(
76
- "Step 2 of 10", # progress bar label — pass None to use the filename
77
+ "Step 1 of 2", # progress bar label — pass None to use the filename
77
78
  "data.csv", # path to input CSV
78
79
  ["a", "b"], # columns that must exist — raises ConfigError if missing
79
80
  validator=my_func, # optional: callable(row: dict) -> None
@@ -89,17 +90,30 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
89
90
 
90
91
  ```python
91
92
  runner.write(
92
- "Step 10 of 10", # progress bar label — pass None to use the output filename
93
+ "Step 2 of 2", # progress bar label — pass None for "Writing {stem} CSV"
93
94
  output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
95
+ rows=my_rows, # optional — use these rows instead of runner.rows
94
96
  sort_by="name", # optional — column name, or list of column names
95
97
  ascending=True, # optional — bool or list of bools matching sort_by
96
98
  column_order=["name", "email"], # optional — listed columns come first, rest follow
97
99
  keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
98
100
  drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
101
+ chunk_size=1000, # optional — split output into files of this many rows
99
102
  )
100
103
  ```
101
104
 
102
- Returns the `Path` that was written.
105
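+ On completion, `write()` passes the message `:white_check_mark: Done! CSV was written to: {path}` to `runner.status_cb`, which defaults to `print`. To redirect or silence the message, assign your own callable that accepts a single string: `runner.status_cb = my_fn`.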
106
+
107
+ Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
108
+
109
+ #### `chunk_size`
110
+
111
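+ When provided, `write()` splits the output into files of at most `chunk_size` rows each, named after `output_path` with a zero-padded three-digit index inserted before the extension (`output_001.csv`, `output_002.csv`, etc.), and returns a `list[Path]`. `chunk_size` must be at least 1; otherwise a `ValueError` is raised.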
112
+
113
+ ```python
114
+ paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
115
+ # [Path("output_001.csv"), Path("output_002.csv"), ...]
116
+ ```
103
117
 
104
118
  ## License
105
119
 
@@ -22,8 +22,8 @@ def process(row: dict) -> None:
22
22
 
23
23
  runner = ThaCSV()
24
24
 
25
- rows = runner.read("Step 1 of 1", "data.csv", ["name", "email"], process)
26
- runner.write("Step 1 of 1", "output.csv")
25
+ rows = runner.read("Step 1 of 2", "data.csv", ["name", "email"], process)
26
+ runner.write("Step 2 of 2", "output.csv")
27
27
  ```
28
28
 
29
29
  ## How it works
@@ -32,6 +32,7 @@ runner.write("Step 1 of 1", "output.csv")
32
32
  2. Iterates every row with a `tqdm` progress bar labelled with `desc`
33
33
  3. Calls your `validator(row)` function — if it raises, that row is marked as an error and processing continues
34
34
  4. Appends three columns to every row: `row number`, `row status`, and `message`
35
+ - `row number` starts at 2 (row 1 is the header)
35
36
  - On success: `row status` and `message` are blank
36
37
  - On error: `row status = "error"`, `message = str(exception)`
37
38
  5. `write()` writes all rows (success and error) to a CSV
@@ -48,7 +49,7 @@ ThaCSV()
48
49
 
49
50
  ```python
50
51
  runner.read(
51
- "Step 2 of 10", # progress bar label — pass None to use the filename
52
+ "Step 1 of 2", # progress bar label — pass None to use the filename
52
53
  "data.csv", # path to input CSV
53
54
  ["a", "b"], # columns that must exist — raises ConfigError if missing
54
55
  validator=my_func, # optional: callable(row: dict) -> None
@@ -64,17 +65,30 @@ When `enrich=False`, validator exceptions are re-raised instead of captured.
64
65
 
65
66
  ```python
66
67
  runner.write(
67
- "Step 10 of 10", # progress bar label — pass None to use the output filename
68
+ "Step 2 of 2", # progress bar label — pass None for "Writing {stem} CSV"
68
69
  output_path="output.csv", # optional — auto-named input_processed_TIMESTAMP.csv if omitted
70
+ rows=my_rows, # optional — use these rows instead of runner.rows
69
71
  sort_by="name", # optional — column name, or list of column names
70
72
  ascending=True, # optional — bool or list of bools matching sort_by
71
73
  column_order=["name", "email"], # optional — listed columns come first, rest follow
72
74
  keep=["name", "email"], # optional — keep only these columns (mutually exclusive with drop)
73
75
  drop=["row number"], # optional — remove these columns (mutually exclusive with keep)
76
+ chunk_size=1000, # optional — split output into files of this many rows
74
77
  )
75
78
  ```
76
79
 
77
- Returns the `Path` that was written.
80
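+ On completion, `write()` passes the message `:white_check_mark: Done! CSV was written to: {path}` to `runner.status_cb`, which defaults to `print`. To redirect or silence the message, assign your own callable that accepts a single string: `runner.status_cb = my_fn`.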
81
+
82
+ Returns the `Path` that was written, or a `list[Path]` when `chunk_size` is set.
83
+
84
+ #### `chunk_size`
85
+
86
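+ When provided, `write()` splits the output into files of at most `chunk_size` rows each, named after `output_path` with a zero-padded three-digit index inserted before the extension (`output_001.csv`, `output_002.csv`, etc.), and returns a `list[Path]`. `chunk_size` must be at least 1; otherwise a `ValueError` is raised.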
87
+
88
+ ```python
89
+ paths = runner.write("Step 2 of 2", "output.csv", chunk_size=1000)
90
+ # [Path("output_001.csv"), Path("output_002.csv"), ...]
91
+ ```
78
92
 
79
93
  ## License
80
94
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "tha-csv-runner"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  description = "Run a function over every row of a CSV — with progress, header validation, and structured per-row errors."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -3,5 +3,5 @@
3
3
  from .errors import ConfigError
4
4
  from .runner import ThaCSV
5
5
 
6
- __version__ = "0.2.2"
6
+ __version__ = "0.2.4"
7
7
  __all__ = ["ConfigError", "ThaCSV"]
@@ -1,5 +1,6 @@
1
1
  import csv
2
2
  import functools
3
+ import shutil
3
4
  from collections.abc import Callable
4
5
  from datetime import datetime
5
6
  from pathlib import Path
@@ -9,6 +10,10 @@ from tqdm import tqdm
9
10
  from .errors import ConfigError
10
11
 
11
12
 
13
+ def tqdm_ncols(max_cols: int = 85) -> int:
14
+ return min(shutil.get_terminal_size(fallback=(max_cols, 24)).columns, max_cols)
15
+
16
+
12
17
  def _sort_key(val: object) -> tuple:
13
18
  try:
14
19
  return (0, float(val)) # type: ignore[arg-type]
@@ -16,11 +21,24 @@ def _sort_key(val: object) -> tuple:
16
21
  return (1, str(val))
17
22
 
18
23
 
24
+ def _write_chunk(path: Path, rows: list[dict], cols: list[str], label: str) -> None:
25
+ path.parent.mkdir(parents=True, exist_ok=True)
26
+ with open(path, "w", newline="", encoding="utf-8") as f:
27
+ if rows:
28
+ writer = csv.DictWriter(f, fieldnames=cols)
29
+ writer.writeheader()
30
+ writer.writerows(
31
+ {c: row[c] for c in cols if c in row}
32
+ for row in tqdm(rows, desc=label, ncols=tqdm_ncols())
33
+ )
34
+
35
+
19
36
  class ThaCSV:
20
37
  def __init__(self) -> None:
21
38
  self.rows: list[dict] = []
22
39
  self._read: bool = False
23
40
  self._input_path: Path | None = None
41
+ self.status_cb = print
24
42
 
25
43
  def read(
26
44
  self,
@@ -44,8 +62,8 @@ class ThaCSV:
44
62
  self.rows = []
45
63
  self._read = True
46
64
 
47
- label = desc if desc is not None else self._input_path.name
48
- for i, row in enumerate(tqdm(raw_rows, desc=label), start=1):
65
+ label = desc if desc is not None else f"Reading {self._input_path.stem} CSV"
66
+ for i, row in enumerate(tqdm(raw_rows, desc=label, ncols=tqdm_ncols()), start=2):
49
67
  if enrich:
50
68
  enriched = {**row, "row number": i, "row status": "", "message": ""}
51
69
  else:
@@ -67,18 +85,22 @@ class ThaCSV:
67
85
  self,
68
86
  desc: str | None,
69
87
  output_path: str | Path | None = None,
88
+ rows: list[dict] | None = None,
70
89
  sort_by: str | list[str] | None = None,
71
90
  ascending: bool | list[bool] = True,
72
91
  column_order: list[str] | None = None,
73
92
  keep: list[str] | None = None,
74
93
  drop: list[str] | None = None,
75
- ) -> Path:
76
- if not self._read:
77
- raise RuntimeError("No data to write call read() first")
94
+ chunk_size: int | None = None,
95
+ ) -> Path | list[Path]:
96
+ if rows is None and not self._read:
97
+ raise RuntimeError("No data to write — call read() first or pass rows=")
78
98
  if keep and drop:
79
99
  raise ValueError("Cannot specify both keep and drop")
100
+ if chunk_size is not None and chunk_size < 1:
101
+ raise ValueError("chunk_size must be >= 1")
80
102
 
81
- rows = list(self.rows)
103
+ rows = list(rows) if rows is not None else list(self.rows)
82
104
 
83
105
  # --- column filtering ---
84
106
  all_cols = list(rows[0].keys()) if rows else []
@@ -120,17 +142,25 @@ class ThaCSV:
120
142
  ts = datetime.now().strftime("%Y%m%d_%H%M%S")
121
143
  output_path = Path(f"{stem}_processed_{ts}.csv")
122
144
 
123
- out = Path(output_path)
124
- out.parent.mkdir(parents=True, exist_ok=True)
125
-
126
- write_label = desc if desc is not None else out.name
127
- with open(out, "w", newline="", encoding="utf-8") as f:
128
- if rows:
129
- writer = csv.DictWriter(f, fieldnames=cols)
130
- writer.writeheader()
131
- writer.writerows(
132
- {c: row[c] for c in cols if c in row}
133
- for row in tqdm(rows, desc=write_label)
145
+ output_file = Path(output_path)
146
+
147
+ # --- chunked write ---
148
+ if chunk_size is not None:
149
+ chunks = [rows[i:i + chunk_size] for i in range(0, max(len(rows), 1), chunk_size)]
150
+ paths = []
151
+ for idx, chunk in enumerate(chunks, start=1):
152
+ chunk_path = output_file.parent / f"{output_file.stem}_{idx:03d}{output_file.suffix}"
153
+ label = (
154
+ f"{desc} ({idx}/{len(chunks)})"
155
+ if desc
156
+ else f"Writing {output_file.stem} CSV ({idx}/{len(chunks)})"
134
157
  )
135
-
136
- return out
158
+ _write_chunk(chunk_path, chunk, cols, label)
159
+ paths.append(chunk_path)
160
+ self.status_cb(f":white_check_mark: Done! CSV was written to: {paths}")
161
+ return paths
162
+
163
+ write_label = desc if desc is not None else f"Writing {output_file.stem} CSV"
164
+ _write_chunk(output_file, rows, cols, write_label)
165
+ self.status_cb(f":white_check_mark: Done! CSV was written to: {output_file}")
166
+ return output_file
@@ -33,8 +33,8 @@ def test_read_returns_rows(simple_csv: Path) -> None:
33
33
  def test_row_number_injected(simple_csv: Path) -> None:
34
34
  runner = ThaCSV()
35
35
  runner.read(None, simple_csv, ["name"])
36
- assert runner.rows[0]["row number"] == 1
37
- assert runner.rows[2]["row number"] == 3
36
+ assert runner.rows[0]["row number"] == 2
37
+ assert runner.rows[2]["row number"] == 4
38
38
 
39
39
 
40
40
  def test_message_and_status_columns_present(simple_csv: Path) -> None:
@@ -118,6 +118,17 @@ def test_write_before_read_raises(simple_csv: Path) -> None:
118
118
  runner.write(None)
119
119
 
120
120
 
121
+ def test_write_rows_param_bypasses_read_guard(tmp_path: Path) -> None:
122
+ out = tmp_path / "out.csv"
123
+ rows = [{"name": "Alice", "val": "1"}, {"name": "Bob", "val": "2"}]
124
+ runner = ThaCSV()
125
+ result = runner.write(None, out, rows=rows)
126
+ assert isinstance(result, Path)
127
+ written = list(csv.DictReader(out.open()))
128
+ assert len(written) == 2
129
+ assert written[0]["name"] == "Alice"
130
+
131
+
121
132
  def test_write_sort_by_single(simple_csv: Path, tmp_path: Path) -> None:
122
133
  out = tmp_path / "out.csv"
123
134
  runner = ThaCSV()
@@ -223,3 +234,68 @@ def test_enrich_false_validator_error_still_raises(simple_csv: Path) -> None:
223
234
  runner = ThaCSV()
224
235
  with pytest.raises(ValueError, match="Bob is not allowed"):
225
236
  runner.read(None, simple_csv, ["name"], fail_on_bob, enrich=False)
237
+
238
+
239
+ # --- chunk_size ---
240
+
241
+ def test_chunk_size_returns_list(simple_csv: Path, tmp_path: Path) -> None:
242
+ runner = ThaCSV()
243
+ runner.read(None, simple_csv, ["name"])
244
+ result = runner.write(None, tmp_path / "out.csv", chunk_size=2)
245
+ assert isinstance(result, list)
246
+
247
+
248
+ def test_chunk_size_correct_file_count(simple_csv: Path, tmp_path: Path) -> None:
249
+ runner = ThaCSV()
250
+ runner.read(None, simple_csv, ["name"])
251
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
252
+ assert isinstance(paths, list)
253
+ assert len(paths) == 2 # 3 rows → chunks of 2, 1
254
+
255
+
256
+ def test_chunk_size_files_exist(simple_csv: Path, tmp_path: Path) -> None:
257
+ runner = ThaCSV()
258
+ runner.read(None, simple_csv, ["name"])
259
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
260
+ assert isinstance(paths, list)
261
+ assert all(p.exists() for p in paths)
262
+
263
+
264
+ def test_chunk_size_naming(simple_csv: Path, tmp_path: Path) -> None:
265
+ runner = ThaCSV()
266
+ runner.read(None, simple_csv, ["name"])
267
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
268
+ assert isinstance(paths, list)
269
+ assert paths[0].name == "out_001.csv"
270
+ assert paths[1].name == "out_002.csv"
271
+
272
+
273
+ def test_chunk_size_total_rows(simple_csv: Path, tmp_path: Path) -> None:
274
+ runner = ThaCSV()
275
+ runner.read(None, simple_csv, ["name"])
276
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=2)
277
+ assert isinstance(paths, list)
278
+ total = sum(len(list(csv.DictReader(p.open()))) for p in paths)
279
+ assert total == 3
280
+
281
+
282
+ def test_chunk_size_larger_than_rows(simple_csv: Path, tmp_path: Path) -> None:
283
+ runner = ThaCSV()
284
+ runner.read(None, simple_csv, ["name"])
285
+ paths = runner.write(None, tmp_path / "out.csv", chunk_size=100)
286
+ assert isinstance(paths, list)
287
+ assert len(paths) == 1
288
+
289
+
290
+ def test_chunk_size_zero_raises(simple_csv: Path, tmp_path: Path) -> None:
291
+ runner = ThaCSV()
292
+ runner.read(None, simple_csv, ["name"])
293
+ with pytest.raises(ValueError, match="chunk_size"):
294
+ runner.write(None, tmp_path / "out.csv", chunk_size=0)
295
+
296
+
297
+ def test_no_chunk_size_returns_path(simple_csv: Path, tmp_path: Path) -> None:
298
+ runner = ThaCSV()
299
+ runner.read(None, simple_csv, ["name"])
300
+ result = runner.write(None, tmp_path / "out.csv")
301
+ assert isinstance(result, Path)
@@ -308,7 +308,7 @@ wheels = [
308
308
 
309
309
  [[package]]
310
310
  name = "tha-csv-runner"
311
- version = "0.2.2"
311
+ version = "0.2.3"
312
312
  source = { editable = "." }
313
313
  dependencies = [
314
314
  { name = "tqdm" },
File without changes