dsvmonkey 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dsvmonkey-0.1.0/LICENSE +21 -0
- dsvmonkey-0.1.0/PKG-INFO +166 -0
- dsvmonkey-0.1.0/README.md +114 -0
- dsvmonkey-0.1.0/pyproject.toml +57 -0
- dsvmonkey-0.1.0/setup.cfg +4 -0
- dsvmonkey-0.1.0/src/dsvmonkey/__init__.py +46 -0
- dsvmonkey-0.1.0/src/dsvmonkey/_internal.py +222 -0
- dsvmonkey-0.1.0/src/dsvmonkey/cli.py +784 -0
- dsvmonkey-0.1.0/src/dsvmonkey/columns.py +351 -0
- dsvmonkey-0.1.0/src/dsvmonkey/convert.py +152 -0
- dsvmonkey-0.1.0/src/dsvmonkey/detectors/__init__.py +6 -0
- dsvmonkey-0.1.0/src/dsvmonkey/detectors/delimiter.py +362 -0
- dsvmonkey-0.1.0/src/dsvmonkey/detectors/encoding.py +433 -0
- dsvmonkey-0.1.0/src/dsvmonkey/detectors/header.py +333 -0
- dsvmonkey-0.1.0/src/dsvmonkey/detectors/line_ending.py +172 -0
- dsvmonkey-0.1.0/src/dsvmonkey/detectors/quote.py +240 -0
- dsvmonkey-0.1.0/src/dsvmonkey/orchestrate.py +290 -0
- dsvmonkey-0.1.0/src/dsvmonkey/profile.py +307 -0
- dsvmonkey-0.1.0/src/dsvmonkey/reader.py +661 -0
- dsvmonkey-0.1.0/src/dsvmonkey/repair.py +496 -0
- dsvmonkey-0.1.0/src/dsvmonkey.egg-info/PKG-INFO +166 -0
- dsvmonkey-0.1.0/src/dsvmonkey.egg-info/SOURCES.txt +39 -0
- dsvmonkey-0.1.0/src/dsvmonkey.egg-info/dependency_links.txt +1 -0
- dsvmonkey-0.1.0/src/dsvmonkey.egg-info/entry_points.txt +2 -0
- dsvmonkey-0.1.0/src/dsvmonkey.egg-info/requires.txt +7 -0
- dsvmonkey-0.1.0/src/dsvmonkey.egg-info/top_level.txt +1 -0
- dsvmonkey-0.1.0/tests/test_cli.py +1113 -0
- dsvmonkey-0.1.0/tests/test_cli_golden.py +229 -0
- dsvmonkey-0.1.0/tests/test_columns.py +773 -0
- dsvmonkey-0.1.0/tests/test_convert.py +270 -0
- dsvmonkey-0.1.0/tests/test_cross_api_consistency.py +306 -0
- dsvmonkey-0.1.0/tests/test_delimiter.py +539 -0
- dsvmonkey-0.1.0/tests/test_encoding.py +582 -0
- dsvmonkey-0.1.0/tests/test_header.py +320 -0
- dsvmonkey-0.1.0/tests/test_line_ending.py +224 -0
- dsvmonkey-0.1.0/tests/test_orchestrate.py +747 -0
- dsvmonkey-0.1.0/tests/test_profile.py +367 -0
- dsvmonkey-0.1.0/tests/test_quote.py +392 -0
- dsvmonkey-0.1.0/tests/test_reader.py +905 -0
- dsvmonkey-0.1.0/tests/test_repair.py +1196 -0
- dsvmonkey-0.1.0/tests/test_roundtrip_properties.py +246 -0
dsvmonkey-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RexBytes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
dsvmonkey-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dsvmonkey
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Detect, profile, normalize and repair delimiter-separated values files (CSV, TSV, pipe, semicolon).
|
|
5
|
+
Author-email: rexbytes <pythonic@rexbytes.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 RexBytes
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/rexbytes/dsvmonkey
|
|
29
|
+
Project-URL: Issues, https://github.com/rexbytes/dsvmonkey/issues
|
|
30
|
+
Keywords: csv,tsv,dsv,etl,encoding,delimiter,cleaning
|
|
31
|
+
Classifier: Development Status :: 3 - Alpha
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Operating System :: OS Independent
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
39
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
40
|
+
Classifier: Topic :: Text Processing
|
|
41
|
+
Classifier: Topic :: Utilities
|
|
42
|
+
Requires-Python: >=3.10
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
License-File: LICENSE
|
|
45
|
+
Requires-Dist: cleanmonkey<1.0,>=0.1
|
|
46
|
+
Requires-Dist: datemonkey<1.0,>=0.1
|
|
47
|
+
Provides-Extra: dev
|
|
48
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
50
|
+
Requires-Dist: hypothesis>=6.0; extra == "dev"
|
|
51
|
+
Dynamic: license-file
|
|
52
|
+
|
|
53
|
+
# dsvmonkey
|
|
54
|
+
|
|
55
|
+
Detect, profile, normalize and repair delimiter-separated-values files.
|
|
56
|
+
|
|
57
|
+
CSV is a polite lie. Real files are tab-separated, pipe-separated, or
|
|
58
|
+
semicolon-separated; start with decorative title rows; carry BOMs and
|
|
59
|
+
mixed encodings; include ragged rows and quoted newlines. `dsvmonkey`
|
|
60
|
+
reads them anyway, tells you what it found, and hands you a clean
|
|
61
|
+
stream of rows.
|
|
62
|
+
|
|
63
|
+
## Status
|
|
64
|
+
|
|
65
|
+
Alpha. API is not yet stable.
|
|
66
|
+
|
|
67
|
+
## Install
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install dsvmonkey
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
For development (editable install with test tooling):
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e ".[dev]"
|
|
77
|
+
# or equivalently:
|
|
78
|
+
pip install -r requirements-dev.txt
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Both `requirements.txt` and `requirements-dev.txt` are thin pointers
|
|
82
|
+
to `pyproject.toml` — the single source of truth for dependency
|
|
83
|
+
lists. Edit dependencies in `pyproject.toml`; the requirements files
|
|
84
|
+
need no maintenance.
|
|
85
|
+
|
|
86
|
+
## What it does
|
|
87
|
+
|
|
88
|
+
- **Detect** encoding, delimiter, quote char, header row and line
|
|
89
|
+
endings — each with a confidence score, runner-up alternatives and
|
|
90
|
+
the reasoning behind the choice.
|
|
91
|
+
- **Normalize** cells on read using [`cleanmonkey`](https://pypi.org/project/cleanmonkey/)
|
|
92
|
+
(BOMs, NBSPs, zero-width spaces, smart quotes, stray control chars).
|
|
93
|
+
- **Profile** date columns via [`datemonkey`](https://pypi.org/project/datemonkey/).
|
|
94
|
+
- **Repair** ragged rows, stray BOMs and inconsistent line endings.
|
|
95
|
+
- **Stream** row-by-row; large files are fine.
|
|
96
|
+
- **Chain** cleanly into `pgmonkey` (DB import), `xlfilldown` (Excel
|
|
97
|
+
output) and `typemonkey` (type inference).
|
|
98
|
+
|
|
99
|
+
## CLI
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
dsvmonkey inspect file.csv # human-readable detection report
|
|
103
|
+
dsvmonkey normalize file.csv -o clean.csv # strip BOM, fix ragged rows, normalize endings
|
|
104
|
+
dsvmonkey convert file.csv -o out.jsonl --to jsonl
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Run `dsvmonkey --help` or `dsvmonkey <command> --help` for the full
|
|
108
|
+
list. Flags are command-specific:
|
|
109
|
+
|
|
110
|
+
- `inspect`: `-v/--verbose`, `--no-columns`, `--sample-rows`,
|
|
111
|
+
`--excel-serial-min`, `--no-deep-scan`, `--clean-sample`,
|
|
112
|
+
`--strict` (exit 3 instead of 0 when the profile recommends
|
|
113
|
+
human review — the unattended-pipeline gate).
|
|
114
|
+
- `normalize`: `--encoding`, `--line-ending lf|crlf|cr`,
|
|
115
|
+
`--delimiter`, `--field-count`, `--no-clean`, `--no-deep-scan`,
|
|
116
|
+
`--keep-empty-rows`, `--sanitize-formulas`, `--strict` (same
|
|
117
|
+
gate semantics as `inspect --strict`: profile first, exit 3
|
|
118
|
+
with no output written when detection isn't confident enough).
|
|
119
|
+
- `convert`: `--to {csv,tsv,jsonl}`, `--no-clean`, `--no-deep-scan`,
|
|
120
|
+
`--keep-empty-rows`, `--sanitize-formulas` (applies on every output
|
|
121
|
+
format, including `jsonl` — JSONL output is commonly transformed
|
|
122
|
+
back to CSV/Excel later, where formula payloads surviving as JSON
|
|
123
|
+
string values become live formulas), `--strict` (gate as above).
|
|
124
|
+
|
|
125
|
+
## Python API
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
import dsvmonkey
|
|
129
|
+
|
|
130
|
+
# Profile a file — encoding, delimiter, headers, etc.
|
|
131
|
+
profile = dsvmonkey.profile_file("file.csv")
|
|
132
|
+
|
|
133
|
+
# Stream cleaned rows as dicts
|
|
134
|
+
for row in dsvmonkey.read("file.csv"):
|
|
135
|
+
...
|
|
136
|
+
|
|
137
|
+
# Write a cleaned version
|
|
138
|
+
report = dsvmonkey.repair("messy.csv", "clean.csv")
|
|
139
|
+
|
|
140
|
+
# Convert to JSON Lines
|
|
141
|
+
dsvmonkey.to_jsonl("file.csv", "file.jsonl")
|
|
142
|
+
|
|
143
|
+
# Per-column profiling (date-format detection via datemonkey)
|
|
144
|
+
columns = dsvmonkey.profile_columns("file.csv")
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Limitations
|
|
148
|
+
|
|
149
|
+
Some behaviours are deliberate design tradeoffs rather than bugs (e.g.
|
|
150
|
+
mixed-encoding detection requires UTF-8 multi-byte evidence to avoid
|
|
151
|
+
false-positives on cp1252 files; duplicate header names in dict mode
|
|
152
|
+
warn-and-collapse rather than raise). See `LIMITATIONS.md` for the
|
|
153
|
+
full list with rationale and escape hatches.
|
|
154
|
+
|
|
155
|
+
## Using with AI assistants
|
|
156
|
+
|
|
157
|
+
`SKILL.md` at the repo root is a drop-in Claude Code / agent skill that
|
|
158
|
+
teaches LLMs how to call `dsvmonkey` correctly — decision tree, failure
|
|
159
|
+
modes it already handles, worked examples, and a "don't" list so agents
|
|
160
|
+
stop reinventing broken CSV parsing. Copy it to `~/.claude/skills/` or
|
|
161
|
+
include it in a project's `AGENTS.md` / `CLAUDE.md` for automatic
|
|
162
|
+
discovery.
|
|
163
|
+
|
|
164
|
+
## License
|
|
165
|
+
|
|
166
|
+
MIT. See `LICENSE`.
|
dsvmonkey-0.1.0/README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# dsvmonkey
|
|
2
|
+
|
|
3
|
+
Detect, profile, normalize and repair delimiter-separated-values files.
|
|
4
|
+
|
|
5
|
+
CSV is a polite lie. Real files are tab-separated, pipe-separated, or
|
|
6
|
+
semicolon-separated; start with decorative title rows; carry BOMs and
|
|
7
|
+
mixed encodings; include ragged rows and quoted newlines. `dsvmonkey`
|
|
8
|
+
reads them anyway, tells you what it found, and hands you a clean
|
|
9
|
+
stream of rows.
|
|
10
|
+
|
|
11
|
+
## Status
|
|
12
|
+
|
|
13
|
+
Alpha. API is not yet stable.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install dsvmonkey
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
For development (editable install with test tooling):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e ".[dev]"
|
|
25
|
+
# or equivalently:
|
|
26
|
+
pip install -r requirements-dev.txt
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Both `requirements.txt` and `requirements-dev.txt` are thin pointers
|
|
30
|
+
to `pyproject.toml` — the single source of truth for dependency
|
|
31
|
+
lists. Edit dependencies in `pyproject.toml`; the requirements files
|
|
32
|
+
need no maintenance.
|
|
33
|
+
|
|
34
|
+
## What it does
|
|
35
|
+
|
|
36
|
+
- **Detect** encoding, delimiter, quote char, header row and line
|
|
37
|
+
endings — each with a confidence score, runner-up alternatives and
|
|
38
|
+
the reasoning behind the choice.
|
|
39
|
+
- **Normalize** cells on read using [`cleanmonkey`](https://pypi.org/project/cleanmonkey/)
|
|
40
|
+
(BOMs, NBSPs, zero-width spaces, smart quotes, stray control chars).
|
|
41
|
+
- **Profile** date columns via [`datemonkey`](https://pypi.org/project/datemonkey/).
|
|
42
|
+
- **Repair** ragged rows, stray BOMs and inconsistent line endings.
|
|
43
|
+
- **Stream** row-by-row; large files are fine.
|
|
44
|
+
- **Chain** cleanly into `pgmonkey` (DB import), `xlfilldown` (Excel
|
|
45
|
+
output) and `typemonkey` (type inference).
|
|
46
|
+
|
|
47
|
+
## CLI
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
dsvmonkey inspect file.csv # human-readable detection report
|
|
51
|
+
dsvmonkey normalize file.csv -o clean.csv # strip BOM, fix ragged rows, normalize endings
|
|
52
|
+
dsvmonkey convert file.csv -o out.jsonl --to jsonl
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Run `dsvmonkey --help` or `dsvmonkey <command> --help` for the full
|
|
56
|
+
list. Flags are command-specific:
|
|
57
|
+
|
|
58
|
+
- `inspect`: `-v/--verbose`, `--no-columns`, `--sample-rows`,
|
|
59
|
+
`--excel-serial-min`, `--no-deep-scan`, `--clean-sample`,
|
|
60
|
+
`--strict` (exit 3 instead of 0 when the profile recommends
|
|
61
|
+
human review — the unattended-pipeline gate).
|
|
62
|
+
- `normalize`: `--encoding`, `--line-ending lf|crlf|cr`,
|
|
63
|
+
`--delimiter`, `--field-count`, `--no-clean`, `--no-deep-scan`,
|
|
64
|
+
`--keep-empty-rows`, `--sanitize-formulas`, `--strict` (same
|
|
65
|
+
gate semantics as `inspect --strict`: profile first, exit 3
|
|
66
|
+
with no output written when detection isn't confident enough).
|
|
67
|
+
- `convert`: `--to {csv,tsv,jsonl}`, `--no-clean`, `--no-deep-scan`,
|
|
68
|
+
`--keep-empty-rows`, `--sanitize-formulas` (applies on every output
|
|
69
|
+
format, including `jsonl` — JSONL output is commonly transformed
|
|
70
|
+
back to CSV/Excel later, where formula payloads surviving as JSON
|
|
71
|
+
string values become live formulas), `--strict` (gate as above).
|
|
72
|
+
|
|
73
|
+
## Python API
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import dsvmonkey
|
|
77
|
+
|
|
78
|
+
# Profile a file — encoding, delimiter, headers, etc.
|
|
79
|
+
profile = dsvmonkey.profile_file("file.csv")
|
|
80
|
+
|
|
81
|
+
# Stream cleaned rows as dicts
|
|
82
|
+
for row in dsvmonkey.read("file.csv"):
|
|
83
|
+
...
|
|
84
|
+
|
|
85
|
+
# Write a cleaned version
|
|
86
|
+
report = dsvmonkey.repair("messy.csv", "clean.csv")
|
|
87
|
+
|
|
88
|
+
# Convert to JSON Lines
|
|
89
|
+
dsvmonkey.to_jsonl("file.csv", "file.jsonl")
|
|
90
|
+
|
|
91
|
+
# Per-column profiling (date-format detection via datemonkey)
|
|
92
|
+
columns = dsvmonkey.profile_columns("file.csv")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Limitations
|
|
96
|
+
|
|
97
|
+
Some behaviours are deliberate design tradeoffs rather than bugs (e.g.
|
|
98
|
+
mixed-encoding detection requires UTF-8 multi-byte evidence to avoid
|
|
99
|
+
false-positives on cp1252 files; duplicate header names in dict mode
|
|
100
|
+
warn-and-collapse rather than raise). See `LIMITATIONS.md` for the
|
|
101
|
+
full list with rationale and escape hatches.
|
|
102
|
+
|
|
103
|
+
## Using with AI assistants
|
|
104
|
+
|
|
105
|
+
`SKILL.md` at the repo root is a drop-in Claude Code / agent skill that
|
|
106
|
+
teaches LLMs how to call `dsvmonkey` correctly — decision tree, failure
|
|
107
|
+
modes it already handles, worked examples, and a "don't" list so agents
|
|
108
|
+
stop reinventing broken CSV parsing. Copy it to `~/.claude/skills/` or
|
|
109
|
+
include it in a project's `AGENTS.md` / `CLAUDE.md` for automatic
|
|
110
|
+
discovery.
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
MIT. See `LICENSE`.
|
dsvmonkey-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dsvmonkey"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Detect, profile, normalize and repair delimiter-separated values files (CSV, TSV, pipe, semicolon)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
authors = [{ name = "rexbytes", email = "pythonic@rexbytes.com" }]
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
keywords = ["csv", "tsv", "dsv", "etl", "encoding", "delimiter", "cleaning"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
24
|
+
"Topic :: Text Processing",
|
|
25
|
+
"Topic :: Utilities",
|
|
26
|
+
]
|
|
27
|
+
# Both rexbytes-family helpers are pinned to a 0.x major. Without
|
|
28
|
+
# upper bounds an upstream 1.0 release (or any future major bump
|
|
29
|
+
# with breaking parsing/format changes) would silently alter
|
|
30
|
+
# detection or cleaning behaviour in installed environments —
|
|
31
|
+
# unacceptable for a data-quality library where reproducibility
|
|
32
|
+
# matters. Bumping the bound is an explicit decision per release.
|
|
33
|
+
dependencies = [
|
|
34
|
+
"cleanmonkey>=0.1,<1.0",
|
|
35
|
+
"datemonkey>=0.1,<1.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=7.0",
|
|
41
|
+
"pytest-cov",
|
|
42
|
+
"hypothesis>=6.0",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
dsvmonkey = "dsvmonkey.cli:main"
|
|
47
|
+
|
|
48
|
+
[project.urls]
|
|
49
|
+
Homepage = "https://github.com/rexbytes/dsvmonkey"
|
|
50
|
+
Issues = "https://github.com/rexbytes/dsvmonkey/issues"
|
|
51
|
+
|
|
52
|
+
[tool.setuptools.packages.find]
|
|
53
|
+
where = ["src"]
|
|
54
|
+
|
|
55
|
+
[tool.pytest.ini_options]
|
|
56
|
+
testpaths = ["tests"]
|
|
57
|
+
pythonpath = ["src"]
|
dsvmonkey-0.1.0/src/dsvmonkey/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""dsvmonkey — detect, profile, normalize and repair DSV files."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version as _pkg_version
|
|
4
|
+
|
|
5
|
+
from dsvmonkey.columns import ColumnProfile, profile_columns
|
|
6
|
+
from dsvmonkey.convert import to_jsonl
|
|
7
|
+
from dsvmonkey.orchestrate import profile_bytes, profile_file
|
|
8
|
+
from dsvmonkey.profile import (
|
|
9
|
+
DetectionAlternative,
|
|
10
|
+
DetectionResult,
|
|
11
|
+
DictOverflowError,
|
|
12
|
+
DSVProfile,
|
|
13
|
+
ReviewRecommendedError,
|
|
14
|
+
assert_clean,
|
|
15
|
+
)
|
|
16
|
+
from dsvmonkey.reader import read
|
|
17
|
+
from dsvmonkey.repair import RepairReport, repair
|
|
18
|
+
|
|
19
|
+
# The version number is written down in exactly one place:
# pyproject.toml. It is looked up through importlib.metadata instead
# of being repeated here as a literal, because a hardcoded "0.1.0" in
# this module could drift from the packaged version on a release.
# When the distribution is not installed (running straight from a
# source tree — rare but legal during dev) the lookup raises and a
# placeholder version is used instead.
try:
    _dist_version = _pkg_version("dsvmonkey")
except PackageNotFoundError:
    _dist_version = "0.0.0+unknown"
__version__ = _dist_version
# Drop the scratch name so the module namespace is unchanged.
del _dist_version
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"ColumnProfile",
|
|
32
|
+
"DetectionAlternative",
|
|
33
|
+
"DetectionResult",
|
|
34
|
+
"DictOverflowError",
|
|
35
|
+
"DSVProfile",
|
|
36
|
+
"RepairReport",
|
|
37
|
+
"ReviewRecommendedError",
|
|
38
|
+
"assert_clean",
|
|
39
|
+
"profile_bytes",
|
|
40
|
+
"profile_columns",
|
|
41
|
+
"profile_file",
|
|
42
|
+
"read",
|
|
43
|
+
"repair",
|
|
44
|
+
"to_jsonl",
|
|
45
|
+
"__version__",
|
|
46
|
+
]
|
dsvmonkey-0.1.0/src/dsvmonkey/_internal.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""Shared internal helpers for output paths.
|
|
2
|
+
|
|
3
|
+
Lives here rather than under one feature module (`repair.py`) so the
|
|
4
|
+
cross-module dependency is explicit. Both `repair.py` and `convert.py`
|
|
5
|
+
need atomic writes, same-path safety checks, and formula-injection
|
|
6
|
+
neutralisation; importing private helpers across modules works but
|
|
7
|
+
makes future renames risky and obscures the dependency. Pulling
|
|
8
|
+
them into a single private module makes the shared surface
|
|
9
|
+
discoverable and refactorable in one place.
|
|
10
|
+
|
|
11
|
+
Still underscore-prefixed (the module name and the helpers) — these
|
|
12
|
+
are not part of the public API. External callers should not import
|
|
13
|
+
from `dsvmonkey._internal`.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import tempfile
|
|
20
|
+
import unicodedata
|
|
21
|
+
from contextlib import contextmanager
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import IO, Iterator
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ---------- Atomic + durable text writer ----------
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@contextmanager
def _atomic_text_writer(path: Path, *, encoding: str) -> Iterator[IO[str]]:
    """Yield a text-mode file handle whose writes land atomically
    and durably at `path` on successful close.

    Writes go to a sibling temp file created by `tempfile.mkstemp`
    (`.<path.name>.<random>.partial`). On success the file's pages
    are flushed to disk (`fsync`), `os.replace()` swaps the temp
    file into place, and the parent directory is `fsync`ed as well
    so the rename itself survives power loss. If the caller's
    `with` body raises, the temp file is removed and whatever was
    at `path` is left untouched, so a killed-mid-write run doesn't
    corrupt existing output.

    The temp file lives in the same directory as the target so the
    replace never crosses filesystems; `os.replace()` cannot move a
    file atomically between filesystems.

    Permissions: `mkstemp` always creates the temp file owner-only
    (0600), and the rename would carry that mode onto the target.
    When `path` already exists, its permission bits are therefore
    copied onto the temp file before the swap (POSIX only), so
    rewriting an output does not silently revoke group/other access
    that downstream readers rely on. A newly created output keeps
    the mode `mkstemp` gave it.

    Args:
        path: Final destination. Missing parent directories are
            created.
        encoding: Text encoding of the yielded handle.

    Yields:
        A writable text handle opened with `newline=""`, so line
        endings are written exactly as the caller supplies them.

    Raises:
        OSError: If the temp file cannot be created or swapped into
            place. Exceptions raised inside the `with` body
            propagate unchanged after the temp file is cleaned up.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Remember the existing target's rwx bits (setuid/setgid/sticky
    # are deliberately not carried over). None means there is no
    # existing file, or it cannot be stat'ed.
    try:
        prior_mode = path.stat().st_mode & 0o777
    except OSError:
        prior_mode = None
    tmp_fd, tmp_name = tempfile.mkstemp(
        prefix=f".{path.name}.",
        suffix=".partial",
        dir=str(path.parent),
    )
    tmp_path = Path(tmp_name)
    try:
        fh = os.fdopen(tmp_fd, "w", encoding=encoding, newline="")
        try:
            yield fh
            fh.flush()
            # Restore the previous permissions before the fsync so
            # the mode change is flushed together with the data.
            # Skipped on Windows, where mode bits only map to the
            # read-only attribute and a read-only temp file could
            # neither be swapped in nor cleaned up on failure.
            if prior_mode is not None and os.name == "posix":
                try:
                    os.chmod(tmp_path, prior_mode)
                except OSError:
                    # Filesystems without chmod support: keep the
                    # mkstemp mode rather than failing the write.
                    pass
            try:
                os.fsync(fh.fileno())
            except OSError:
                # Some filesystems (/dev/null, special devices,
                # some network mounts) don't support fsync. The
                # atomic rename still works; durability degrades
                # gracefully instead of raising on every write.
                pass
        finally:
            fh.close()
        os.replace(tmp_path, path)
        # Also fsync the containing directory so the rename itself
        # survives power loss. Without this, the new inode exists
        # on disk but the directory entry pointing at it might not
        # be flushed yet — crash recovery could "lose" the rename.
        try:
            dir_fd = os.open(str(path.parent), os.O_RDONLY)
            try:
                os.fsync(dir_fd)
            finally:
                os.close(dir_fd)
        except OSError:
            # Windows doesn't support directory fsync; on other
            # filesystems where it fails, gracefully skip.
            pass
    except BaseException:
        # Any failure: remove the temp so no `.name.<random>.partial`
        # files are left scattered in user directories. Every
        # OSError from the cleanup is suppressed so a housekeeping
        # hiccup never replaces the real exception — diagnosing the
        # original failure matters more than perfect cleanup.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ---------- Same-file protection ----------
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _reject_same_path(
|
|
112
|
+
input_path: Path, output_path: Path | None, op: str
|
|
113
|
+
) -> None:
|
|
114
|
+
"""Raise if src and dst point at the same underlying file.
|
|
115
|
+
|
|
116
|
+
Streamed read + truncate-on-open would race the filesystem:
|
|
117
|
+
`open(p, "w")` truncates the inode on disk, and while POSIX
|
|
118
|
+
semantics keep the read fd alive (the writer gets an independent
|
|
119
|
+
inode on most systems but NOT all filesystems), Windows raises
|
|
120
|
+
immediately and network filesystems can produce silently
|
|
121
|
+
truncated output. Failing fast is cheaper than debugging a lost
|
|
122
|
+
CSV at 2am.
|
|
123
|
+
|
|
124
|
+
Identity is checked via (st_dev, st_ino) when both files exist,
|
|
125
|
+
which catches hardlinks and bind-mounts that resolve to the
|
|
126
|
+
same inode via different paths. Falls back to resolved-path
|
|
127
|
+
equality when either file doesn't exist yet (a fresh output
|
|
128
|
+
path has no inode to compare) or when the stat call fails
|
|
129
|
+
(permission errors etc.) — same-path protection degrades
|
|
130
|
+
gracefully rather than blocking legitimate writes.
|
|
131
|
+
"""
|
|
132
|
+
if output_path is None:
|
|
133
|
+
return
|
|
134
|
+
# Primary: inode identity. Handles hardlinks, bind mounts, and
|
|
135
|
+
# other paths-to-same-file cases that resolved-path equality
|
|
136
|
+
# alone misses.
|
|
137
|
+
try:
|
|
138
|
+
src_stat = input_path.stat()
|
|
139
|
+
dst_stat = output_path.stat()
|
|
140
|
+
except (OSError, ValueError):
|
|
141
|
+
# Output file doesn't exist yet (the common case) or can't
|
|
142
|
+
# be stat'ed — fall through to path-equality.
|
|
143
|
+
pass
|
|
144
|
+
else:
|
|
145
|
+
if (src_stat.st_dev, src_stat.st_ino) == (
|
|
146
|
+
dst_stat.st_dev,
|
|
147
|
+
dst_stat.st_ino,
|
|
148
|
+
):
|
|
149
|
+
raise ValueError(
|
|
150
|
+
f"{op}(): input and output resolve to the same file "
|
|
151
|
+
f"(inode {src_stat.st_ino} on device "
|
|
152
|
+
f"{src_stat.st_dev}). Writing would truncate the "
|
|
153
|
+
f"source before the read completes. Pick a different "
|
|
154
|
+
f"output path, or write to a temp file and "
|
|
155
|
+
f"os.replace() it."
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Fallback: resolved-path equality, for the "dst doesn't exist
|
|
159
|
+
# yet" case and for filesystems where stat is unavailable.
|
|
160
|
+
try:
|
|
161
|
+
src_resolved = input_path.resolve(strict=False)
|
|
162
|
+
dst_resolved = output_path.resolve(strict=False)
|
|
163
|
+
except OSError:
|
|
164
|
+
return
|
|
165
|
+
if src_resolved == dst_resolved:
|
|
166
|
+
raise ValueError(
|
|
167
|
+
f"{op}(): input and output resolve to the same path "
|
|
168
|
+
f"({src_resolved}). Writing would truncate the source "
|
|
169
|
+
f"before the read completes. Pick a different output "
|
|
170
|
+
f"path, or write to a temp file and os.replace() it."
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# ---------- Formula-injection neutralisation ----------
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# Leading characters that Excel / Google Sheets / LibreOffice Calc
# take as "this cell is a formula". Anyone able to influence cell
# values can plant something like `=SUM(A:A)*cmd|' /C calc.exe'!A1`;
# once the CSV is opened in a spreadsheet the cell is evaluated as a
# formula, side effects included. A leading `'` defuses the marker:
# spreadsheets read the apostrophe as a formatting hint meaning
# "display this cell as literal text".
_FORMULA_INJECTION_PREFIXES: tuple[str, ...] = ("=", "+", "-", "@")


def _neutralize_formula(cell: str) -> str:
    """Return `cell` guarded against spreadsheet formula evaluation.

    The first character of `cell` that is neither whitespace nor an
    invisible code point decides the outcome: if it is one of the
    formula triggers in `_FORMULA_INJECTION_PREFIXES`, the whole
    cell is returned with a `'` prepended; otherwise (including for
    empty or all-invisible cells) `cell` is returned unchanged.

    "Invisible" means Unicode category Cf (Format: BOM U+FEFF,
    ZWSP U+200B, ZWNJ/ZWJ, LRM/RLM, ...) or Mn (Mark, Nonspacing:
    combining marks). Skipping only whitespace, as a plain
    `lstrip()` would, is not enough: a cell such as a BOM followed
    by `=SUM(A:A)` would stay un-neutralised, and when cell
    cleaning is disabled the BOM reaches the output and the formula
    still evaluates.

    Only called when the caller opts into formula sanitization
    (`repair(sanitize_formulas=True)` /
    `to_jsonl(sanitize_formulas=True)`). It is off by default
    because the prefix changes visible content, which is unwanted
    unless the consumer is a spreadsheet.
    """
    # First character that a spreadsheet would actually "see";
    # None when the cell is empty or consists only of skipped
    # characters.
    first_visible = next(
        (
            ch
            for ch in cell
            if not ch.isspace()
            and unicodedata.category(ch) not in ("Cf", "Mn")
        ),
        None,
    )
    if first_visible in _FORMULA_INJECTION_PREFIXES:
        return "'" + cell
    return cell
|