dsvmonkey 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. dsvmonkey-0.1.0/LICENSE +21 -0
  2. dsvmonkey-0.1.0/PKG-INFO +166 -0
  3. dsvmonkey-0.1.0/README.md +114 -0
  4. dsvmonkey-0.1.0/pyproject.toml +57 -0
  5. dsvmonkey-0.1.0/setup.cfg +4 -0
  6. dsvmonkey-0.1.0/src/dsvmonkey/__init__.py +46 -0
  7. dsvmonkey-0.1.0/src/dsvmonkey/_internal.py +222 -0
  8. dsvmonkey-0.1.0/src/dsvmonkey/cli.py +784 -0
  9. dsvmonkey-0.1.0/src/dsvmonkey/columns.py +351 -0
  10. dsvmonkey-0.1.0/src/dsvmonkey/convert.py +152 -0
  11. dsvmonkey-0.1.0/src/dsvmonkey/detectors/__init__.py +6 -0
  12. dsvmonkey-0.1.0/src/dsvmonkey/detectors/delimiter.py +362 -0
  13. dsvmonkey-0.1.0/src/dsvmonkey/detectors/encoding.py +433 -0
  14. dsvmonkey-0.1.0/src/dsvmonkey/detectors/header.py +333 -0
  15. dsvmonkey-0.1.0/src/dsvmonkey/detectors/line_ending.py +172 -0
  16. dsvmonkey-0.1.0/src/dsvmonkey/detectors/quote.py +240 -0
  17. dsvmonkey-0.1.0/src/dsvmonkey/orchestrate.py +290 -0
  18. dsvmonkey-0.1.0/src/dsvmonkey/profile.py +307 -0
  19. dsvmonkey-0.1.0/src/dsvmonkey/reader.py +661 -0
  20. dsvmonkey-0.1.0/src/dsvmonkey/repair.py +496 -0
  21. dsvmonkey-0.1.0/src/dsvmonkey.egg-info/PKG-INFO +166 -0
  22. dsvmonkey-0.1.0/src/dsvmonkey.egg-info/SOURCES.txt +39 -0
  23. dsvmonkey-0.1.0/src/dsvmonkey.egg-info/dependency_links.txt +1 -0
  24. dsvmonkey-0.1.0/src/dsvmonkey.egg-info/entry_points.txt +2 -0
  25. dsvmonkey-0.1.0/src/dsvmonkey.egg-info/requires.txt +7 -0
  26. dsvmonkey-0.1.0/src/dsvmonkey.egg-info/top_level.txt +1 -0
  27. dsvmonkey-0.1.0/tests/test_cli.py +1113 -0
  28. dsvmonkey-0.1.0/tests/test_cli_golden.py +229 -0
  29. dsvmonkey-0.1.0/tests/test_columns.py +773 -0
  30. dsvmonkey-0.1.0/tests/test_convert.py +270 -0
  31. dsvmonkey-0.1.0/tests/test_cross_api_consistency.py +306 -0
  32. dsvmonkey-0.1.0/tests/test_delimiter.py +539 -0
  33. dsvmonkey-0.1.0/tests/test_encoding.py +582 -0
  34. dsvmonkey-0.1.0/tests/test_header.py +320 -0
  35. dsvmonkey-0.1.0/tests/test_line_ending.py +224 -0
  36. dsvmonkey-0.1.0/tests/test_orchestrate.py +747 -0
  37. dsvmonkey-0.1.0/tests/test_profile.py +367 -0
  38. dsvmonkey-0.1.0/tests/test_quote.py +392 -0
  39. dsvmonkey-0.1.0/tests/test_reader.py +905 -0
  40. dsvmonkey-0.1.0/tests/test_repair.py +1196 -0
  41. dsvmonkey-0.1.0/tests/test_roundtrip_properties.py +246 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 RexBytes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,166 @@
1
+ Metadata-Version: 2.4
2
+ Name: dsvmonkey
3
+ Version: 0.1.0
4
+ Summary: Detect, profile, normalize and repair delimiter-separated values files (CSV, TSV, pipe, semicolon).
5
+ Author-email: rexbytes <pythonic@rexbytes.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 RexBytes
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/rexbytes/dsvmonkey
29
+ Project-URL: Issues, https://github.com/rexbytes/dsvmonkey/issues
30
+ Keywords: csv,tsv,dsv,etl,encoding,delimiter,cleaning
31
+ Classifier: Development Status :: 3 - Alpha
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Operating System :: OS Independent
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
40
+ Classifier: Topic :: Text Processing
41
+ Classifier: Topic :: Utilities
42
+ Requires-Python: >=3.10
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: cleanmonkey<1.0,>=0.1
46
+ Requires-Dist: datemonkey<1.0,>=0.1
47
+ Provides-Extra: dev
48
+ Requires-Dist: pytest>=7.0; extra == "dev"
49
+ Requires-Dist: pytest-cov; extra == "dev"
50
+ Requires-Dist: hypothesis>=6.0; extra == "dev"
51
+ Dynamic: license-file
52
+
53
+ # dsvmonkey
54
+
55
+ Detect, profile, normalize and repair delimiter-separated-values files.
56
+
57
+ CSV is a polite lie. Real files are tab-separated, pipe-separated, or
58
+ semicolon-separated; start with decorative title rows; carry BOMs and
59
+ mixed encodings; include ragged rows and quoted newlines. `dsvmonkey`
60
+ reads them anyway, tells you what it found, and hands you a clean
61
+ stream of rows.
62
+
63
+ ## Status
64
+
65
+ Alpha. API is not yet stable.
66
+
67
+ ## Install
68
+
69
+ ```bash
70
+ pip install dsvmonkey
71
+ ```
72
+
73
+ For development (editable install with test tooling):
74
+
75
+ ```bash
76
+ pip install -e ".[dev]"
77
+ # or equivalently:
78
+ pip install -r requirements-dev.txt
79
+ ```
80
+
81
+ Both `requirements.txt` and `requirements-dev.txt` are thin pointers
82
+ to `pyproject.toml` — the single source of truth for dependency
83
+ lists. Edit dependencies in `pyproject.toml`; the requirements files
84
+ need no maintenance.
85
+
86
+ ## What it does
87
+
88
+ - **Detect** encoding, delimiter, quote char, header row and line
89
+ endings — each with a confidence score, runner-up alternatives and
90
+ the reasoning behind the choice.
91
+ - **Normalize** cells on read using [`cleanmonkey`](https://pypi.org/project/cleanmonkey/)
92
+ (BOMs, NBSPs, zero-width spaces, smart quotes, stray control chars).
93
+ - **Profile** date columns via [`datemonkey`](https://pypi.org/project/datemonkey/).
94
+ - **Repair** ragged rows, stray BOMs and inconsistent line endings.
95
+ - **Stream** row-by-row; large files are fine.
96
+ - **Chain** cleanly into `pgmonkey` (DB import), `xlfilldown` (Excel
97
+ output) and `typemonkey` (type inference).
98
+
99
+ ## CLI
100
+
101
+ ```bash
102
+ dsvmonkey inspect file.csv # human-readable detection report
103
+ dsvmonkey normalize file.csv -o clean.csv # strip BOM, fix ragged rows, normalize endings
104
+ dsvmonkey convert file.csv -o out.jsonl --to jsonl
105
+ ```
106
+
107
+ Run `dsvmonkey --help` or `dsvmonkey <command> --help` for the full
108
+ list. Flags are command-specific:
109
+
110
+ - `inspect`: `-v/--verbose`, `--no-columns`, `--sample-rows`,
111
+ `--excel-serial-min`, `--no-deep-scan`, `--clean-sample`,
112
+ `--strict` (exit 3 instead of 0 when the profile recommends
113
+ human review — the unattended-pipeline gate).
114
+ - `normalize`: `--encoding`, `--line-ending lf|crlf|cr`,
115
+ `--delimiter`, `--field-count`, `--no-clean`, `--no-deep-scan`,
116
+ `--keep-empty-rows`, `--sanitize-formulas`, `--strict` (same
117
+ gate semantics as `inspect --strict`: profile first, exit 3
118
+ with no output written when detection isn't confident enough).
119
+ - `convert`: `--to {csv,tsv,jsonl}`, `--no-clean`, `--no-deep-scan`,
120
+ `--keep-empty-rows`, `--sanitize-formulas` (applies on every output
121
+ format, including `jsonl` — JSONL output is commonly transformed
122
+ back to CSV/Excel later, where formula payloads surviving as JSON
123
+ string values become live formulas), `--strict` (gate as above).
124
+
125
+ ## Python API
126
+
127
+ ```python
128
+ import dsvmonkey
129
+
130
+ # Profile a file — encoding, delimiter, headers, etc.
131
+ profile = dsvmonkey.profile_file("file.csv")
132
+
133
+ # Stream cleaned rows as dicts
134
+ for row in dsvmonkey.read("file.csv"):
135
+ ...
136
+
137
+ # Write a cleaned version
138
+ report = dsvmonkey.repair("messy.csv", "clean.csv")
139
+
140
+ # Convert to JSON Lines
141
+ dsvmonkey.to_jsonl("file.csv", "file.jsonl")
142
+
143
+ # Per-column profiling (date-format detection via datemonkey)
144
+ columns = dsvmonkey.profile_columns("file.csv")
145
+ ```
146
+
147
+ ## Limitations
148
+
149
+ Some behaviours are deliberate design tradeoffs rather than bugs (e.g.
150
+ mixed-encoding detection requires UTF-8 multi-byte evidence to avoid
151
+ false-positives on cp1252 files; duplicate header names in dict mode
152
+ warn-and-collapse rather than raise). See `LIMITATIONS.md` for the
153
+ full list with rationale and escape hatches.
154
+
155
+ ## Using with AI assistants
156
+
157
+ `SKILL.md` at the repo root is a drop-in Claude Code / agent skill that
158
+ teaches LLMs how to call `dsvmonkey` correctly — decision tree, failure
159
+ modes it already handles, worked examples, and a "don't" list so agents
160
+ stop reinventing broken CSV parsing. Copy it to `~/.claude/skills/` or
161
+ include it in a project's `AGENTS.md` / `CLAUDE.md` for automatic
162
+ discovery.
163
+
164
+ ## License
165
+
166
+ MIT. See `LICENSE`.
@@ -0,0 +1,114 @@
1
+ # dsvmonkey
2
+
3
+ Detect, profile, normalize and repair delimiter-separated-values files.
4
+
5
+ CSV is a polite lie. Real files are tab-separated, pipe-separated, or
6
+ semicolon-separated; start with decorative title rows; carry BOMs and
7
+ mixed encodings; include ragged rows and quoted newlines. `dsvmonkey`
8
+ reads them anyway, tells you what it found, and hands you a clean
9
+ stream of rows.
10
+
11
+ ## Status
12
+
13
+ Alpha. API is not yet stable.
14
+
15
+ ## Install
16
+
17
+ ```bash
18
+ pip install dsvmonkey
19
+ ```
20
+
21
+ For development (editable install with test tooling):
22
+
23
+ ```bash
24
+ pip install -e ".[dev]"
25
+ # or equivalently:
26
+ pip install -r requirements-dev.txt
27
+ ```
28
+
29
+ Both `requirements.txt` and `requirements-dev.txt` are thin pointers
30
+ to `pyproject.toml` — the single source of truth for dependency
31
+ lists. Edit dependencies in `pyproject.toml`; the requirements files
32
+ need no maintenance.
33
+
34
+ ## What it does
35
+
36
+ - **Detect** encoding, delimiter, quote char, header row and line
37
+ endings — each with a confidence score, runner-up alternatives and
38
+ the reasoning behind the choice.
39
+ - **Normalize** cells on read using [`cleanmonkey`](https://pypi.org/project/cleanmonkey/)
40
+ (BOMs, NBSPs, zero-width spaces, smart quotes, stray control chars).
41
+ - **Profile** date columns via [`datemonkey`](https://pypi.org/project/datemonkey/).
42
+ - **Repair** ragged rows, stray BOMs and inconsistent line endings.
43
+ - **Stream** row-by-row; large files are fine.
44
+ - **Chain** cleanly into `pgmonkey` (DB import), `xlfilldown` (Excel
45
+ output) and `typemonkey` (type inference).
46
+
47
+ ## CLI
48
+
49
+ ```bash
50
+ dsvmonkey inspect file.csv # human-readable detection report
51
+ dsvmonkey normalize file.csv -o clean.csv # strip BOM, fix ragged rows, normalize endings
52
+ dsvmonkey convert file.csv -o out.jsonl --to jsonl
53
+ ```
54
+
55
+ Run `dsvmonkey --help` or `dsvmonkey <command> --help` for the full
56
+ list. Flags are command-specific:
57
+
58
+ - `inspect`: `-v/--verbose`, `--no-columns`, `--sample-rows`,
59
+ `--excel-serial-min`, `--no-deep-scan`, `--clean-sample`,
60
+ `--strict` (exit 3 instead of 0 when the profile recommends
61
+ human review — the unattended-pipeline gate).
62
+ - `normalize`: `--encoding`, `--line-ending lf|crlf|cr`,
63
+ `--delimiter`, `--field-count`, `--no-clean`, `--no-deep-scan`,
64
+ `--keep-empty-rows`, `--sanitize-formulas`, `--strict` (same
65
+ gate semantics as `inspect --strict`: profile first, exit 3
66
+ with no output written when detection isn't confident enough).
67
+ - `convert`: `--to {csv,tsv,jsonl}`, `--no-clean`, `--no-deep-scan`,
68
+ `--keep-empty-rows`, `--sanitize-formulas` (applies on every output
69
+ format, including `jsonl` — JSONL output is commonly transformed
70
+ back to CSV/Excel later, where formula payloads surviving as JSON
71
+ string values become live formulas), `--strict` (gate as above).
72
+
73
+ ## Python API
74
+
75
+ ```python
76
+ import dsvmonkey
77
+
78
+ # Profile a file — encoding, delimiter, headers, etc.
79
+ profile = dsvmonkey.profile_file("file.csv")
80
+
81
+ # Stream cleaned rows as dicts
82
+ for row in dsvmonkey.read("file.csv"):
83
+ ...
84
+
85
+ # Write a cleaned version
86
+ report = dsvmonkey.repair("messy.csv", "clean.csv")
87
+
88
+ # Convert to JSON Lines
89
+ dsvmonkey.to_jsonl("file.csv", "file.jsonl")
90
+
91
+ # Per-column profiling (date-format detection via datemonkey)
92
+ columns = dsvmonkey.profile_columns("file.csv")
93
+ ```
94
+
95
+ ## Limitations
96
+
97
+ Some behaviours are deliberate design tradeoffs rather than bugs (e.g.
98
+ mixed-encoding detection requires UTF-8 multi-byte evidence to avoid
99
+ false-positives on cp1252 files; duplicate header names in dict mode
100
+ warn-and-collapse rather than raise). See `LIMITATIONS.md` for the
101
+ full list with rationale and escape hatches.
102
+
103
+ ## Using with AI assistants
104
+
105
+ `SKILL.md` at the repo root is a drop-in Claude Code / agent skill that
106
+ teaches LLMs how to call `dsvmonkey` correctly — decision tree, failure
107
+ modes it already handles, worked examples, and a "don't" list so agents
108
+ stop reinventing broken CSV parsing. Copy it to `~/.claude/skills/` or
109
+ include it in a project's `AGENTS.md` / `CLAUDE.md` for automatic
110
+ discovery.
111
+
112
+ ## License
113
+
114
+ MIT. See `LICENSE`.
@@ -0,0 +1,57 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dsvmonkey"
7
+ version = "0.1.0"
8
+ description = "Detect, profile, normalize and repair delimiter-separated values files (CSV, TSV, pipe, semicolon)."
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ authors = [{ name = "rexbytes", email = "pythonic@rexbytes.com" }]
12
+ requires-python = ">=3.10"
13
+ keywords = ["csv", "tsv", "dsv", "etl", "encoding", "delimiter", "cleaning"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Software Development :: Libraries :: Python Modules",
24
+ "Topic :: Text Processing",
25
+ "Topic :: Utilities",
26
+ ]
27
+ # Both rexbytes-family helpers are pinned to a 0.x major. Without
28
+ # upper bounds an upstream 1.0 release (or any future major bump
29
+ # with breaking parsing/format changes) would silently alter
30
+ # detection or cleaning behaviour in installed environments —
31
+ # unacceptable for a data-quality library where reproducibility
32
+ # matters. Bumping the bound is an explicit decision per release.
33
+ dependencies = [
34
+ "cleanmonkey>=0.1,<1.0",
35
+ "datemonkey>=0.1,<1.0",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ dev = [
40
+ "pytest>=7.0",
41
+ "pytest-cov",
42
+ "hypothesis>=6.0",
43
+ ]
44
+
45
+ [project.scripts]
46
+ dsvmonkey = "dsvmonkey.cli:main"
47
+
48
+ [project.urls]
49
+ Homepage = "https://github.com/rexbytes/dsvmonkey"
50
+ Issues = "https://github.com/rexbytes/dsvmonkey/issues"
51
+
52
+ [tool.setuptools.packages.find]
53
+ where = ["src"]
54
+
55
+ [tool.pytest.ini_options]
56
+ testpaths = ["tests"]
57
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,46 @@
1
+ """dsvmonkey — detect, profile, normalize and repair DSV files."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version as _pkg_version
4
+
5
+ from dsvmonkey.columns import ColumnProfile, profile_columns
6
+ from dsvmonkey.convert import to_jsonl
7
+ from dsvmonkey.orchestrate import profile_bytes, profile_file
8
+ from dsvmonkey.profile import (
9
+ DetectionAlternative,
10
+ DetectionResult,
11
+ DictOverflowError,
12
+ DSVProfile,
13
+ ReviewRecommendedError,
14
+ assert_clean,
15
+ )
16
+ from dsvmonkey.reader import read
17
+ from dsvmonkey.repair import RepairReport, repair
18
+
19
# The version string is owned by pyproject.toml. It is looked up from
# the installed distribution's metadata so that no second, hardcoded
# copy lives in this file and drifts out of sync on a release. The
# placeholder assigned first is what remains when the lookup reports
# that no "dsvmonkey" distribution is installed (e.g. the package is
# imported straight from a source tree during development).
__version__ = "0.0.0+unknown"
try:
    __version__ = _pkg_version("dsvmonkey")
except PackageNotFoundError:
    pass
29
+
30
+ __all__ = [
31
+ "ColumnProfile",
32
+ "DetectionAlternative",
33
+ "DetectionResult",
34
+ "DictOverflowError",
35
+ "DSVProfile",
36
+ "RepairReport",
37
+ "ReviewRecommendedError",
38
+ "assert_clean",
39
+ "profile_bytes",
40
+ "profile_columns",
41
+ "profile_file",
42
+ "read",
43
+ "repair",
44
+ "to_jsonl",
45
+ "__version__",
46
+ ]
@@ -0,0 +1,222 @@
1
+ """Shared internal helpers for the output-writing code paths.
2
+
3
+ Lives here rather than under one feature module (`repair.py`) so the
4
+ cross-module dependency is explicit. Both `repair.py` and `convert.py`
5
+ need atomic writes, same-path safety checks, and formula-injection
6
+ neutralisation; importing private helpers across modules works but
7
+ makes future renames risky and obscures the dependency. Pulling
8
+ them into a single private module makes the shared surface
9
+ discoverable and refactorable in one place.
10
+
11
+ Still underscore-prefixed (the module name and the helpers) — these
12
+ are not part of the public API. External callers should not import
13
+ from `dsvmonkey._internal`.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import tempfile
20
+ import unicodedata
21
+ from contextlib import contextmanager
22
+ from pathlib import Path
23
+ from typing import IO, Iterator
24
+
25
+
26
+ # ---------- Atomic + durable text writer ----------
27
+
28
+
29
@contextmanager
def _atomic_text_writer(path: Path, *, encoding: str) -> Iterator[IO[str]]:
    """Yield a text-mode file handle whose writes land atomically
    and durably at `path` once the `with` block exits cleanly.

    Writes go to a sibling temp file created by `tempfile.mkstemp`
    (named `.<path.name>.<random token>.partial`). On success the
    handle is flushed and `fsync`ed, the temp file is given the
    permission bits of the file it is about to replace (if one
    exists), and `os.replace()` swaps it into place. The parent
    directory is then `fsync`ed as well so the rename itself is
    flushed. If the caller raises, the temp file is removed and
    whatever was at `path` before is left untouched.

    The temp file lives in the same directory as the target so the
    replace never crosses filesystems.

    Permissions: `mkstemp` creates its file readable and writable
    by the owner only. Without the mode copy, overwriting an
    existing output would silently reset its permission bits to
    that owner-only mode. A brand-new output (nothing at `path`
    yet) still keeps the `mkstemp` mode.

    Args:
        path: Final destination. Missing parent directories are
            created.
        encoding: Text encoding used for the handle. Newline
            translation is disabled (`newline=""`), so the caller
            controls line endings.

    Yields:
        A writable text handle backed by the temp file.

    Raises:
        Whatever the caller's block raises, and any `OSError` from
        creating the temp file or from `os.replace()`. Failures of
        `fsync` and of the permission copy are deliberately
        ignored (best effort).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_fd, tmp_name = tempfile.mkstemp(
        prefix=f".{path.name}.",
        suffix=".partial",
        dir=str(path.parent),
    )
    tmp_path = Path(tmp_name)
    try:
        fh = os.fdopen(tmp_fd, "w", encoding=encoding, newline="")
        try:
            yield fh
            fh.flush()
            try:
                os.fsync(fh.fileno())
            except OSError:
                # Some targets (special devices, some network
                # mounts) don't support fsync. The rename still
                # happens; only the durability guarantee degrades.
                pass
        finally:
            fh.close()
        # Carry over the permission bits of the file being replaced
        # so an overwrite doesn't change who can read the output.
        # Done before the swap so the file never appears at `path`
        # with the wrong mode. Best effort: no existing target
        # (the common case) or a refused chmod is not an error.
        try:
            os.chmod(tmp_path, path.stat().st_mode & 0o7777)
        except OSError:
            pass
        os.replace(tmp_path, path)
        # Also fsync the containing directory so the directory
        # entry created by the rename is flushed, not just the
        # file's own data.
        try:
            dir_fd = os.open(str(path.parent), os.O_RDONLY)
            try:
                os.fsync(dir_fd)
            finally:
                os.close(dir_fd)
        except OSError:
            # Not every platform / filesystem allows opening or
            # fsyncing a directory; skip quietly where it fails.
            pass
    except BaseException:
        # Any failure (including KeyboardInterrupt): remove the
        # temp file so no `.partial` leftovers accumulate next to
        # the user's data. Cleanup errors are suppressed so they
        # never mask the exception that actually caused the
        # failure, which is re-raised unchanged below.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
106
+
107
+
108
+ # ---------- Same-file protection ----------
109
+
110
+
111
def _reject_same_path(
    input_path: Path, output_path: Path | None, op: str
) -> None:
    """Raise if input and output point at the same underlying file.

    Reading a file while an output handle truncates that same file
    loses data: the reader and the writer share one file, so the
    source can be emptied before the read completes. Failing fast
    with a clear message is cheaper than diagnosing a silently
    truncated CSV afterwards.

    Identity is checked via `(st_dev, st_ino)` when both paths can
    be stat'ed, which also catches different paths that lead to
    the same file (e.g. hardlinks). When either stat fails — most
    commonly because the output does not exist yet — the check
    falls back to comparing the resolved paths. If the paths
    cannot be resolved either, the check is skipped: same-path
    protection degrades gracefully rather than blocking a
    legitimate write.

    Args:
        input_path: File that will be read.
        output_path: File that will be written; `None` means there
            is no file output and nothing to check.
        op: Name of the calling operation, used as the prefix of
            the error message.

    Raises:
        ValueError: If both paths refer to the same file or
            resolve to the same path.
    """
    if output_path is None:
        return
    # Primary: inode identity.
    try:
        src_stat = input_path.stat()
        dst_stat = output_path.stat()
    except (OSError, ValueError):
        # Output file doesn't exist yet (the common case) or a
        # path can't be stat'ed — fall through to path equality.
        pass
    else:
        if (src_stat.st_dev, src_stat.st_ino) == (
            dst_stat.st_dev,
            dst_stat.st_ino,
        ):
            raise ValueError(
                f"{op}(): input and output resolve to the same file "
                f"(inode {src_stat.st_ino} on device "
                f"{src_stat.st_dev}). Writing would truncate the "
                f"source before the read completes. Pick a different "
                f"output path, or write to a temp file and "
                f"os.replace() it."
            )

    # Fallback: resolved-path equality, for the "dst doesn't exist
    # yet" case and for paths where stat is unavailable.
    try:
        src_resolved = input_path.resolve(strict=False)
        dst_resolved = output_path.resolve(strict=False)
    except (OSError, RuntimeError, ValueError):
        # RuntimeError: symlink loop (raised by resolve() on
        # Python < 3.13). ValueError: unrepresentable path such as
        # an embedded NUL. Neither means "same file", so skip the
        # check like any other resolution failure and let the
        # later open() report the real problem.
        return
    if src_resolved == dst_resolved:
        raise ValueError(
            f"{op}(): input and output resolve to the same path "
            f"({src_resolved}). Writing would truncate the source "
            f"before the read completes. Pick a different output "
            f"path, or write to a temp file and os.replace() it."
        )
172
+
173
+
174
+ # ---------- Formula-injection neutralisation ----------
175
+
176
+
177
# Leading characters that spreadsheet applications (Excel, Google
# Sheets, LibreOffice Calc) interpret as the start of a formula. A
# cell value such as `=SUM(A:A)*cmd|' /C calc.exe'!A1` is evaluated,
# side effects included, once the CSV is opened in a spreadsheet.
# Prefixing the value with `'` makes the spreadsheet treat the cell
# as literal text instead.
_FORMULA_INJECTION_PREFIXES: tuple[str, ...] = ("=", "+", "-", "@")


def _neutralize_formula(cell: str) -> str:
    """Return `cell` with a leading `'` when it would be read as a
    spreadsheet formula, and `cell` itself otherwise.

    The decision is made on the first *significant* character:
    leading whitespace and characters in the Unicode categories
    "Cf" (format: BOM U+FEFF, zero-width space U+200B, ZWJ/ZWNJ,
    direction marks, ...) and "Mn" (nonspacing combining marks)
    are skipped first. A plain `lstrip()` would stop at a leading
    BOM, so `"\\ufeff=SUM(A:A)"` would slip through unprefixed.

    Only the prefix is added; the skipped characters and the rest
    of the value are kept exactly as given. An empty value, or one
    made up entirely of skipped characters, is returned unchanged.

    This is an opt-in transformation: it alters visible content,
    which is only wanted when the output is headed for a
    spreadsheet.
    """
    if not cell:
        return cell
    # First character that is neither whitespace nor an invisible
    # format / combining character; None if there is no such one.
    first_significant = next(
        (
            ch
            for ch in cell
            if not (ch.isspace() or unicodedata.category(ch) in ("Cf", "Mn"))
        ),
        None,
    )
    if first_significant is None:
        return cell
    if first_significant in _FORMULA_INJECTION_PREFIXES:
        return "'" + cell
    return cell