dsvmonkey 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dsvmonkey/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ """dsvmonkey — detect, profile, normalize and repair DSV files."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version as _pkg_version
4
+
5
+ from dsvmonkey.columns import ColumnProfile, profile_columns
6
+ from dsvmonkey.convert import to_jsonl
7
+ from dsvmonkey.orchestrate import profile_bytes, profile_file
8
+ from dsvmonkey.profile import (
9
+ DetectionAlternative,
10
+ DetectionResult,
11
+ DictOverflowError,
12
+ DSVProfile,
13
+ ReviewRecommendedError,
14
+ assert_clean,
15
+ )
16
+ from dsvmonkey.reader import read
17
+ from dsvmonkey.repair import RepairReport, repair
18
+
19
# pyproject.toml is the single source of truth for the version number.
# Reading it back through importlib.metadata avoids hand-syncing a
# duplicate string here — previously a foot-gun, because a hardcoded
# "0.1.0" in __init__.py could drift from the packaged version on a
# release. The fallback covers running directly from a source tree
# where the package isn't installed (rare but legal during dev).
try:
    __version__ = _pkg_version("dsvmonkey")
except PackageNotFoundError:
    # No installed distribution metadata to read: expose a placeholder
    # so `dsvmonkey.__version__` is always defined.
    __version__ = "0.0.0+unknown"

# Names bound by `from dsvmonkey import *`; every entry is imported or
# assigned above in this module.
__all__ = [
    "ColumnProfile",
    "DetectionAlternative",
    "DetectionResult",
    "DictOverflowError",
    "DSVProfile",
    "RepairReport",
    "ReviewRecommendedError",
    "assert_clean",
    "profile_bytes",
    "profile_columns",
    "profile_file",
    "read",
    "repair",
    "to_jsonl",
    "__version__",
]
dsvmonkey/_internal.py ADDED
@@ -0,0 +1,222 @@
1
+ """Shared internal helpers for output paths.
2
+
3
+ Lives here rather than under one feature module (`repair.py`) so the
4
+ cross-module dependency is explicit. Both `repair.py` and `convert.py`
5
+ need atomic writes, same-path safety checks, and formula-injection
6
+ neutralisation; importing private helpers across modules works but
7
+ makes future renames risky and obscures the dependency. Pulling
8
+ them into a single private module makes the shared surface
9
+ discoverable and refactorable in one place.
10
+
11
+ Still underscore-prefixed (the module name and the helpers) — these
12
+ are not part of the public API. External callers should not import
13
+ from `dsvmonkey._internal`.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import tempfile
20
+ import unicodedata
21
+ from contextlib import contextmanager
22
+ from pathlib import Path
23
+ from typing import IO, Iterator
24
+
25
+
26
+ # ---------- Atomic + durable text writer ----------
27
+
28
+
29
@contextmanager
def _atomic_text_writer(path: Path, *, encoding: str) -> Iterator[IO[str]]:
    """Yield a text handle whose contents replace `path` atomically.

    The caller writes into a staging file created by
    `tempfile.mkstemp` next to the target
    (`.<path.name>.<random token>.partial`). When the `with` body
    finishes without raising, the staging file is flushed, `fsync`ed,
    closed, and moved over `path` with `os.replace()`; the parent
    directory is then `fsync`ed as well so the rename itself is
    durable, not just the file contents.

    When anything raises — in the caller's body or in any step here —
    the staging file is deleted and the exception is re-raised, so an
    existing file at `path` is never left half-written.

    The staging file lives in the target's own directory so that the
    final `os.replace()` never has to cross a filesystem boundary.

    Args:
        path: Final destination. Missing parent directories are
            created.
        encoding: Text encoding used for the handle.

    Yields:
        A writable text-mode file object opened with `newline=""`, so
        line endings are written exactly as the caller supplies them.

    NOTE(review): `mkstemp` creates the staging file readable and
    writable only by the owning user, and `os.replace()` carries that
    mode over to `path` — confirm this is what callers expect.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    staging_fd, staging_name = tempfile.mkstemp(
        dir=str(target_dir),
        prefix=f".{path.name}.",
        suffix=".partial",
    )
    staging = Path(staging_name)
    try:
        # The `with` closes the handle on every exit route, including
        # an exception thrown in at the `yield`.
        with os.fdopen(
            staging_fd, "w", encoding=encoding, newline=""
        ) as handle:
            yield handle
            handle.flush()
            try:
                os.fsync(handle.fileno())
            except OSError:
                # Not every filesystem supports fsync (special
                # devices, some network mounts). The rename below is
                # still atomic; only the durability guarantee is
                # weakened.
                pass
        os.replace(staging, path)
        # Flush the directory entry too; otherwise the renamed file
        # can be on disk while the name pointing at it is not.
        try:
            parent_fd = os.open(str(target_dir), os.O_RDONLY)
            try:
                os.fsync(parent_fd)
            finally:
                os.close(parent_fd)
        except OSError:
            # Directory fsync is unavailable on Windows and on some
            # other filesystems; skip it quietly there.
            pass
    except BaseException:
        # Best-effort removal of the staging file. Cleanup errors are
        # swallowed so they cannot mask the exception that brought us
        # here, which is the one worth diagnosing.
        try:
            staging.unlink()
        except OSError:
            pass
        raise
106
+
107
+
108
+ # ---------- Same-file protection ----------
109
+
110
+
111
def _reject_same_path(
    input_path: Path, output_path: Path | None, op: str
) -> None:
    """Raise if src and dst point at the same underlying file.

    Reading a file while the same file is being opened for writing
    would truncate the source before the read completes, so the
    operation is refused up front.

    Identity is checked via (st_dev, st_ino) when both files can be
    stat'ed, which catches different paths to the same inode (for
    example hardlinks). When either stat fails — typically because
    the output does not exist yet — the check falls back to comparing
    the resolved paths. If the paths cannot be resolved either, the
    check is skipped rather than blocking the write: protection
    degrades gracefully, and the subsequent open reports any real
    problem with the path.

    Args:
        input_path: File that will be read.
        output_path: File that will be written, or None when the
            caller produces no output file (nothing to check).
        op: Name of the calling operation, used as the prefix of the
            error message.

    Raises:
        ValueError: If both paths refer to the same file, by inode
            identity or by resolved-path equality.
    """
    if output_path is None:
        return
    # Primary: inode identity. Handles paths-to-same-file cases that
    # resolved-path equality alone misses.
    try:
        src_stat = input_path.stat()
        dst_stat = output_path.stat()
    except (OSError, ValueError):
        # Output file doesn't exist yet (the common case) or a path
        # can't be stat'ed — fall through to path-equality.
        pass
    else:
        if (src_stat.st_dev, src_stat.st_ino) == (
            dst_stat.st_dev,
            dst_stat.st_ino,
        ):
            raise ValueError(
                f"{op}(): input and output resolve to the same file "
                f"(inode {src_stat.st_ino} on device "
                f"{src_stat.st_dev}). Writing would truncate the "
                f"source before the read completes. Pick a different "
                f"output path, or write to a temp file and "
                f"os.replace() it."
            )

    # Fallback: resolved-path equality, for the "dst doesn't exist
    # yet" case and for paths where stat is unavailable.
    try:
        src_resolved = input_path.resolve(strict=False)
        dst_resolved = output_path.resolve(strict=False)
    except (OSError, ValueError, RuntimeError):
        # ValueError: embedded NUL byte in a path (mirrors the stat
        # branch above). RuntimeError: symlink loop on Python < 3.13.
        # Neither means "same file", so don't let them escape looking
        # like a same-path rejection.
        return
    if src_resolved == dst_resolved:
        raise ValueError(
            f"{op}(): input and output resolve to the same path "
            f"({src_resolved}). Writing would truncate the source "
            f"before the read completes. Pick a different output "
            f"path, or write to a temp file and os.replace() it."
        )
172
+
173
+
174
+ # ---------- Formula-injection neutralisation ----------
175
+
176
+
177
# Leading characters that spreadsheet applications (Excel, Google
# Sheets, LibreOffice Calc) interpret as the start of a formula. A
# cell value such as `=SUM(A:A)*cmd|' /C calc.exe'!A1` is evaluated,
# side effects included, when the CSV is opened in a spreadsheet.
# Putting `'` in front defuses it: spreadsheets read the apostrophe as
# a "treat this cell as literal text" hint.
_FORMULA_INJECTION_PREFIXES: tuple[str, ...] = ("=", "+", "-", "@")


def _neutralize_formula(cell: str) -> str:
    """Return `cell` with a leading `'` if it would start a formula.

    The decision is made on the first character that is neither
    whitespace nor in Unicode category "Cf" (Format: BOM U+FEFF,
    ZWSP U+200B, ZWNJ/ZWJ, LRM/RLM, ...) or "Mn" (Mark, Nonspacing:
    combining diacritics). Those characters are skipped because a
    plain `lstrip()` only removes whitespace and would let
    `"\\ufeff=SUM(A:A)"` through un-neutralised.

    If that first significant character is one of
    `_FORMULA_INJECTION_PREFIXES`, the apostrophe is prepended to the
    whole, otherwise untouched, cell. In every other case — empty
    string, only skippable characters, or an ordinary leading
    character — `cell` is returned unchanged.

    Intended to run only when the caller opted into formula
    sanitization (`repair(sanitize_formulas=True)` /
    `to_jsonl(sanitize_formulas=True)`), since the prefix alters
    visible content and is unwanted when the consumer isn't a
    spreadsheet.
    """
    # First character a spreadsheet would actually act on; "" when the
    # cell is empty or consists solely of skippable characters.
    leading = next(
        (
            ch
            for ch in cell
            if not (ch.isspace() or unicodedata.category(ch) in ("Cf", "Mn"))
        ),
        "",
    )
    if leading in _FORMULA_INJECTION_PREFIXES:
        return "'" + cell
    return cell