dsvmonkey 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsvmonkey/__init__.py +46 -0
- dsvmonkey/_internal.py +222 -0
- dsvmonkey/cli.py +784 -0
- dsvmonkey/columns.py +351 -0
- dsvmonkey/convert.py +152 -0
- dsvmonkey/detectors/__init__.py +6 -0
- dsvmonkey/detectors/delimiter.py +362 -0
- dsvmonkey/detectors/encoding.py +433 -0
- dsvmonkey/detectors/header.py +333 -0
- dsvmonkey/detectors/line_ending.py +172 -0
- dsvmonkey/detectors/quote.py +240 -0
- dsvmonkey/orchestrate.py +290 -0
- dsvmonkey/profile.py +307 -0
- dsvmonkey/reader.py +661 -0
- dsvmonkey/repair.py +496 -0
- dsvmonkey-0.1.0.dist-info/METADATA +166 -0
- dsvmonkey-0.1.0.dist-info/RECORD +21 -0
- dsvmonkey-0.1.0.dist-info/WHEEL +5 -0
- dsvmonkey-0.1.0.dist-info/entry_points.txt +2 -0
- dsvmonkey-0.1.0.dist-info/licenses/LICENSE +21 -0
- dsvmonkey-0.1.0.dist-info/top_level.txt +1 -0
dsvmonkey/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""dsvmonkey — detect, profile, normalize and repair DSV files."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version as _pkg_version
|
|
4
|
+
|
|
5
|
+
from dsvmonkey.columns import ColumnProfile, profile_columns
|
|
6
|
+
from dsvmonkey.convert import to_jsonl
|
|
7
|
+
from dsvmonkey.orchestrate import profile_bytes, profile_file
|
|
8
|
+
from dsvmonkey.profile import (
|
|
9
|
+
DetectionAlternative,
|
|
10
|
+
DetectionResult,
|
|
11
|
+
DictOverflowError,
|
|
12
|
+
DSVProfile,
|
|
13
|
+
ReviewRecommendedError,
|
|
14
|
+
assert_clean,
|
|
15
|
+
)
|
|
16
|
+
from dsvmonkey.reader import read
|
|
17
|
+
from dsvmonkey.repair import RepairReport, repair
|
|
18
|
+
|
|
19
|
+
# pyproject.toml is the single source of truth for the version number.
# The value is read back from the installed distribution's metadata via
# importlib.metadata instead of being hand-synced into a duplicate string
# here — a hardcoded "0.1.0" in __init__.py previously risked drifting
# from the packaged version on release.
#
# PackageNotFoundError means no installed distribution named "dsvmonkey"
# was found, e.g. when running straight from a source tree without
# installing the package (rare but legal during development). In that
# case a placeholder version is used so that importing the package still
# succeeds.
try:
    __version__ = _pkg_version("dsvmonkey")
except PackageNotFoundError:
    __version__ = "0.0.0+unknown"
|
|
29
|
+
|
|
30
|
+
# Explicit public API: the names re-exported from the submodules imported
# above, plus __version__. `from dsvmonkey import *` exposes exactly these.
__all__ = [
    "ColumnProfile",
    "DetectionAlternative",
    "DetectionResult",
    "DictOverflowError",
    "DSVProfile",
    "RepairReport",
    "ReviewRecommendedError",
    "assert_clean",
    "profile_bytes",
    "profile_columns",
    "profile_file",
    "read",
    "repair",
    "to_jsonl",
    "__version__",
]
|
dsvmonkey/_internal.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""Shared internal helpers for output paths.
|
|
2
|
+
|
|
3
|
+
Lives here rather than under one feature module (`repair.py`) so the
|
|
4
|
+
cross-module dependency is explicit. Both `repair.py` and `convert.py`
|
|
5
|
+
need atomic writes, same-path safety checks, and formula-injection
|
|
6
|
+
neutralisation; importing private helpers across modules works but
|
|
7
|
+
makes future renames risky and obscures the dependency. Pulling
|
|
8
|
+
them into a single private module makes the shared surface
|
|
9
|
+
discoverable and refactorable in one place.
|
|
10
|
+
|
|
11
|
+
Still underscore-prefixed (the module name and the helpers) — these
|
|
12
|
+
are not part of the public API. External callers should not import
|
|
13
|
+
from `dsvmonkey._internal`.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import tempfile
|
|
20
|
+
import unicodedata
|
|
21
|
+
from contextlib import contextmanager
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import IO, Iterator
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ---------- Atomic + durable text writer ----------
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@contextmanager
|
|
30
|
+
def _atomic_text_writer(path: Path, *, encoding: str) -> Iterator[IO[str]]:
|
|
31
|
+
"""Yield a text-mode file handle whose writes land atomically
|
|
32
|
+
and durably at `path` on successful close.
|
|
33
|
+
|
|
34
|
+
Writes go to a sibling temp file (`.<path.name>.<pid>.partial`),
|
|
35
|
+
the file's pages are flushed to disk (`fsync`), then
|
|
36
|
+
`os.replace()` swaps it into place — a single filesystem
|
|
37
|
+
operation that's atomic on POSIX and Windows. The parent
|
|
38
|
+
directory is then also `fsync`ed so the rename itself survives
|
|
39
|
+
power loss. If the caller raises, the temp file is removed
|
|
40
|
+
rather than left at the final path, so a killed-mid-write run
|
|
41
|
+
doesn't corrupt existing output.
|
|
42
|
+
|
|
43
|
+
Previously we used `os.replace` alone, which is atomic against
|
|
44
|
+
process crashes but not power loss — a sudden reboot between
|
|
45
|
+
the write and the write hitting the platter could still lose
|
|
46
|
+
the new contents despite "atomic" semantics. The fsync before
|
|
47
|
+
the rename, plus the dir fsync after, closes that window for
|
|
48
|
+
durability-critical pipelines (regulated ETL, audit trails).
|
|
49
|
+
|
|
50
|
+
Uses the same directory as the target so the replace doesn't
|
|
51
|
+
cross filesystems (cross-fs rename falls back to copy+delete
|
|
52
|
+
and isn't atomic).
|
|
53
|
+
"""
|
|
54
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
55
|
+
tmp_fd, tmp_name = tempfile.mkstemp(
|
|
56
|
+
prefix=f".{path.name}.",
|
|
57
|
+
suffix=".partial",
|
|
58
|
+
dir=str(path.parent),
|
|
59
|
+
)
|
|
60
|
+
tmp_path = Path(tmp_name)
|
|
61
|
+
try:
|
|
62
|
+
fh = os.fdopen(tmp_fd, "w", encoding=encoding, newline="")
|
|
63
|
+
try:
|
|
64
|
+
yield fh
|
|
65
|
+
fh.flush()
|
|
66
|
+
try:
|
|
67
|
+
os.fsync(fh.fileno())
|
|
68
|
+
except OSError:
|
|
69
|
+
# Some filesystems (/dev/null, special devices,
|
|
70
|
+
# some network mounts) don't support fsync. The
|
|
71
|
+
# atomic rename still works; durability degrades
|
|
72
|
+
# gracefully to "regular POSIX" rather than raising
|
|
73
|
+
# on every /tmp write.
|
|
74
|
+
pass
|
|
75
|
+
finally:
|
|
76
|
+
fh.close()
|
|
77
|
+
os.replace(tmp_path, path)
|
|
78
|
+
# Also fsync the containing directory so the rename itself
|
|
79
|
+
# survives power loss. Without this, the new inode exists
|
|
80
|
+
# on disk but the directory entry pointing at it might not
|
|
81
|
+
# be flushed yet — crash recovery could "lose" the rename.
|
|
82
|
+
try:
|
|
83
|
+
dir_fd = os.open(str(path.parent), os.O_RDONLY)
|
|
84
|
+
try:
|
|
85
|
+
os.fsync(dir_fd)
|
|
86
|
+
finally:
|
|
87
|
+
os.close(dir_fd)
|
|
88
|
+
except OSError:
|
|
89
|
+
# Windows doesn't support directory fsync; on other
|
|
90
|
+
# filesystems where it fails, gracefully skip.
|
|
91
|
+
pass
|
|
92
|
+
except BaseException:
|
|
93
|
+
# Any failure: remove the temp so we don't leave
|
|
94
|
+
# `.something.123.partial` files scattered in user dirs.
|
|
95
|
+
# Suppress every OSError class (FileNotFoundError,
|
|
96
|
+
# PermissionError, race conditions on Windows, network
|
|
97
|
+
# mount weirdness) so a cleanup hiccup doesn't replace
|
|
98
|
+
# the real exception's traceback context. Diagnosis of
|
|
99
|
+
# the original failure matters more than perfect
|
|
100
|
+
# housekeeping when something already went wrong.
|
|
101
|
+
try:
|
|
102
|
+
tmp_path.unlink()
|
|
103
|
+
except OSError:
|
|
104
|
+
pass
|
|
105
|
+
raise
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ---------- Same-file protection ----------
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _reject_same_path(
|
|
112
|
+
input_path: Path, output_path: Path | None, op: str
|
|
113
|
+
) -> None:
|
|
114
|
+
"""Raise if src and dst point at the same underlying file.
|
|
115
|
+
|
|
116
|
+
Streamed read + truncate-on-open would race the filesystem:
|
|
117
|
+
`open(p, "w")` truncates the inode on disk, and while POSIX
|
|
118
|
+
semantics keep the read fd alive (the writer gets an independent
|
|
119
|
+
inode on most systems but NOT all filesystems), Windows raises
|
|
120
|
+
immediately and network filesystems can produce silently
|
|
121
|
+
truncated output. Failing fast is cheaper than debugging a lost
|
|
122
|
+
CSV at 2am.
|
|
123
|
+
|
|
124
|
+
Identity is checked via (st_dev, st_ino) when both files exist,
|
|
125
|
+
which catches hardlinks and bind-mounts that resolve to the
|
|
126
|
+
same inode via different paths. Falls back to resolved-path
|
|
127
|
+
equality when either file doesn't exist yet (a fresh output
|
|
128
|
+
path has no inode to compare) or when the stat call fails
|
|
129
|
+
(permission errors etc.) — same-path protection degrades
|
|
130
|
+
gracefully rather than blocking legitimate writes.
|
|
131
|
+
"""
|
|
132
|
+
if output_path is None:
|
|
133
|
+
return
|
|
134
|
+
# Primary: inode identity. Handles hardlinks, bind mounts, and
|
|
135
|
+
# other paths-to-same-file cases that resolved-path equality
|
|
136
|
+
# alone misses.
|
|
137
|
+
try:
|
|
138
|
+
src_stat = input_path.stat()
|
|
139
|
+
dst_stat = output_path.stat()
|
|
140
|
+
except (OSError, ValueError):
|
|
141
|
+
# Output file doesn't exist yet (the common case) or can't
|
|
142
|
+
# be stat'ed — fall through to path-equality.
|
|
143
|
+
pass
|
|
144
|
+
else:
|
|
145
|
+
if (src_stat.st_dev, src_stat.st_ino) == (
|
|
146
|
+
dst_stat.st_dev,
|
|
147
|
+
dst_stat.st_ino,
|
|
148
|
+
):
|
|
149
|
+
raise ValueError(
|
|
150
|
+
f"{op}(): input and output resolve to the same file "
|
|
151
|
+
f"(inode {src_stat.st_ino} on device "
|
|
152
|
+
f"{src_stat.st_dev}). Writing would truncate the "
|
|
153
|
+
f"source before the read completes. Pick a different "
|
|
154
|
+
f"output path, or write to a temp file and "
|
|
155
|
+
f"os.replace() it."
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Fallback: resolved-path equality, for the "dst doesn't exist
|
|
159
|
+
# yet" case and for filesystems where stat is unavailable.
|
|
160
|
+
try:
|
|
161
|
+
src_resolved = input_path.resolve(strict=False)
|
|
162
|
+
dst_resolved = output_path.resolve(strict=False)
|
|
163
|
+
except OSError:
|
|
164
|
+
return
|
|
165
|
+
if src_resolved == dst_resolved:
|
|
166
|
+
raise ValueError(
|
|
167
|
+
f"{op}(): input and output resolve to the same path "
|
|
168
|
+
f"({src_resolved}). Writing would truncate the source "
|
|
169
|
+
f"before the read completes. Pick a different output "
|
|
170
|
+
f"path, or write to a temp file and os.replace() it."
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# ---------- Formula-injection neutralisation ----------
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# Characters that Excel / Google Sheets / LibreOffice Calc treat as
|
|
178
|
+
# the start of a formula. An attacker who can influence cell values
|
|
179
|
+
# crafts something like `=SUM(A:A)*cmd|' /C calc.exe'!A1` — when the
|
|
180
|
+
# CSV is opened in a spreadsheet, the cell evaluates as a formula
|
|
181
|
+
# with side effects. Prepending `'` neutralises the formula marker
|
|
182
|
+
# (the apostrophe is treated as a formatting hint by spreadsheets,
|
|
183
|
+
# indicating "display this cell as literal text").
|
|
184
|
+
_FORMULA_INJECTION_PREFIXES: tuple[str, ...] = ("=", "+", "-", "@")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _neutralize_formula(cell: str) -> str:
|
|
188
|
+
"""Prepend `'` when `cell`'s first non-whitespace, non-invisible
|
|
189
|
+
character is a spreadsheet-formula trigger; return `cell`
|
|
190
|
+
unchanged otherwise.
|
|
191
|
+
|
|
192
|
+
Called only when the caller opts into formula sanitization
|
|
193
|
+
(`repair(sanitize_formulas=True)` /
|
|
194
|
+
`to_jsonl(sanitize_formulas=True)`). Off by default because the
|
|
195
|
+
prefix mutates visible content and is unwanted when the
|
|
196
|
+
consumer isn't a spreadsheet; turn on at the pipeline boundary
|
|
197
|
+
where output crosses into untrusted spreadsheet territory.
|
|
198
|
+
|
|
199
|
+
The skip set covers BOTH whitespace AND Unicode "Format" /
|
|
200
|
+
"Mark, Nonspacing" categories (BOM U+FEFF, ZWSP U+200B,
|
|
201
|
+
ZWNJ/ZWJ, combining marks, etc.). Excel and Sheets ignore
|
|
202
|
+
those when evaluating the cell, so a naive lstrip() — which
|
|
203
|
+
only removes whitespace — leaves `"\\ufeff=SUM(A:A)"`
|
|
204
|
+
un-neutralised. Under `clean_cells=False` the BOM survives
|
|
205
|
+
to output and the formula evaluates in the spreadsheet,
|
|
206
|
+
bypassing the defence the caller asked for.
|
|
207
|
+
"""
|
|
208
|
+
if not cell:
|
|
209
|
+
return cell
|
|
210
|
+
for ch in cell:
|
|
211
|
+
if ch.isspace():
|
|
212
|
+
continue
|
|
213
|
+
# Cf = Format (BOM, ZWSP, ZWNJ, ZWJ, RLM, LRM…).
|
|
214
|
+
# Mn = Mark, Nonspacing (combining diacritics).
|
|
215
|
+
# Both are visually invisible / non-rendered as a leading
|
|
216
|
+
# cell character in spreadsheets.
|
|
217
|
+
if unicodedata.category(ch) in ("Cf", "Mn"):
|
|
218
|
+
continue
|
|
219
|
+
if ch in _FORMULA_INJECTION_PREFIXES:
|
|
220
|
+
return "'" + cell
|
|
221
|
+
break
|
|
222
|
+
return cell
|