fix-text 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ # Changes here will be overwritten by Copier. Do not edit manually.
2
+ _commit: v0.2.20
3
+ _src_path: gh:jlevy/simple-modern-uv
4
+ package_author_name: Dean Li
5
+ package_description: Detect and normalize suspicious Unicode characters in text files.
6
+ package_github_org: deantvv
7
+ package_module: fix_text
8
+ package_name: fix-text
@@ -0,0 +1,10 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
@@ -0,0 +1 @@
1
+ 3.12
fix_text-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Dean
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: fix-text
3
+ Version: 0.1.0
4
+ Summary: Detect and normalize suspicious Unicode characters in text files.
5
+ Project-URL: Homepage, https://github.com/deantvv/fix-text
6
+ Project-URL: Repository, https://github.com/deantvv/fix-text
7
+ Project-URL: Issues, https://github.com/deantvv/fix-text/issues
8
+ Project-URL: Changelog, https://github.com/deantvv/fix-text/releases
9
+ Author: Dean Li
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: cleanup,cli,normalization,text,unicode
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Text Processing
20
+ Classifier: Topic :: Utilities
21
+ Requires-Python: <4,>=3.12
22
+ Requires-Dist: orjson>=3.10.0
23
+ Requires-Dist: pyyaml>=6.0.0
24
+ Description-Content-Type: text/markdown
25
+
26
+ # fix-text
27
+
28
+ `fix-text` is a small Python CLI for detecting and cleaning suspicious Unicode
29
+ characters in text files, including fullwidth spaces, non-breaking spaces, zero-width
30
+ characters, BOM markers, and optional control characters.
31
+
32
+ ## What it fixes
33
+
34
+ - Replaces fullwidth and non-standard spacing characters with a normal ASCII space
35
+ - Removes zero-width characters and byte order marks
36
+ - Optionally removes unsupported control characters with `--include-controls`
37
+ - Validates cleaned `.json` files before rewriting them
38
+ - Validates cleaned `.yaml` and `.yml` files before rewriting them
39
+ - Scans individual files or entire directory trees
40
+
41
+ ## Install for development
42
+
43
+ This project is managed with `uv`.
44
+
45
+ ```bash
46
+ uv sync
47
+ ```
48
+
49
+ Install the test dependency group:
50
+
51
+ ```bash
52
+ uv sync --group dev
53
+ ```
54
+
55
+ Run the CLI without installing globally:
56
+
57
+ ```bash
58
+ uv run fix-text --help
59
+ ```
60
+
61
+ Run the test suite:
62
+
63
+ ```bash
64
+ uv run --group dev pytest
65
+ ```
66
+
67
+ ## Usage
68
+
69
+ Scan a file or directory:
70
+
71
+ ```bash
72
+ uv run fix-text path/to/file.txt path/to/dir
73
+ ```
74
+
75
+ Rewrite files in place:
76
+
77
+ ```bash
78
+ uv run fix-text --apply path/to/file.txt
79
+ ```
80
+
81
+ Include control-character cleanup:
82
+
83
+ ```bash
84
+ uv run fix-text --apply --include-controls logs/
85
+ ```
86
+
87
+ Treat additional suffixes as text:
88
+
89
+ ```bash
90
+ uv run fix-text --ext .log --ext .cfg sample/
91
+ ```
92
+
93
+ For `.json` files, `fix-text` validates the cleaned output with `orjson` before writing.
94
+ For `.yaml` and `.yml`, it validates the cleaned output with `PyYAML`.
95
+ If the cleaned content would still be invalid JSON or YAML, the file is left unchanged.
96
+
97
+ ## Example output
98
+
99
+ ```text
100
+ notes.txt
101
+ 3:14 U+3000 IDEOGRAPHIC SPACE '\u3000' -> replace with space
102
+ 8:2 U+200B ZERO WIDTH SPACE '\u200b' -> delete
103
+
104
+ Found 2 issue(s). Re-run with --apply to rewrite files.
105
+ ```
106
+
107
+ ## PyPI release plan
108
+
109
+ 1. Confirm the GitHub URLs in `pyproject.toml` point at the correct repository.
110
+ 2. Create a PyPI account and API token.
111
+ 3. Build distributions with `uv build`.
112
+ 4. Publish with `uv publish`.
113
+ 5. Tag the release in git so the source and package versions stay aligned.
114
+
115
+ Recommended pre-release checks:
116
+
117
+ - `uv run fix-text --help`
118
+ - `uv run fix-text README.md`
119
+ - `uv run --group dev pytest`
120
+ - `uv build --out-dir dist`
121
+
122
+ Build into the repo-local `dist/` directory:
123
+
124
+ ```bash
125
+ ./scripts/release.sh build
126
+ ```
127
+
128
+ Publish the repo-local artifacts:
129
+
130
+ ```bash
131
+ ./scripts/release.sh publish
132
+ ```
133
+
134
+ ## License
135
+
136
+ MIT
@@ -0,0 +1,111 @@
1
+ # fix-text
2
+
3
+ `fix-text` is a small Python CLI for detecting and cleaning suspicious Unicode
4
+ characters in text files, including fullwidth spaces, non-breaking spaces, zero-width
5
+ characters, BOM markers, and optional control characters.
6
+
7
+ ## What it fixes
8
+
9
+ - Replaces fullwidth and non-standard spacing characters with a normal ASCII space
10
+ - Removes zero-width characters and byte order marks
11
+ - Optionally removes unsupported control characters with `--include-controls`
12
+ - Validates cleaned `.json` files before rewriting them
13
+ - Validates cleaned `.yaml` and `.yml` files before rewriting them
14
+ - Scans individual files or entire directory trees
15
+
16
+ ## Install for development
17
+
18
+ This project is managed with `uv`.
19
+
20
+ ```bash
21
+ uv sync
22
+ ```
23
+
24
+ Install the test dependency group:
25
+
26
+ ```bash
27
+ uv sync --group dev
28
+ ```
29
+
30
+ Run the CLI without installing globally:
31
+
32
+ ```bash
33
+ uv run fix-text --help
34
+ ```
35
+
36
+ Run the test suite:
37
+
38
+ ```bash
39
+ uv run --group dev pytest
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ Scan a file or directory:
45
+
46
+ ```bash
47
+ uv run fix-text path/to/file.txt path/to/dir
48
+ ```
49
+
50
+ Rewrite files in place:
51
+
52
+ ```bash
53
+ uv run fix-text --apply path/to/file.txt
54
+ ```
55
+
56
+ Include control-character cleanup:
57
+
58
+ ```bash
59
+ uv run fix-text --apply --include-controls logs/
60
+ ```
61
+
62
+ Treat additional suffixes as text:
63
+
64
+ ```bash
65
+ uv run fix-text --ext .log --ext .cfg sample/
66
+ ```
67
+
68
+ For `.json` files, `fix-text` validates the cleaned output with `orjson` before writing.
69
+ For `.yaml` and `.yml`, it validates the cleaned output with `PyYAML`.
70
+ If the cleaned content would still be invalid JSON or YAML, the file is left unchanged.
71
+
72
+ ## Example output
73
+
74
+ ```text
75
+ notes.txt
76
+ 3:14 U+3000 IDEOGRAPHIC SPACE '\u3000' -> replace with space
77
+ 8:2 U+200B ZERO WIDTH SPACE '\u200b' -> delete
78
+
79
+ Found 2 issue(s). Re-run with --apply to rewrite files.
80
+ ```
81
+
82
+ ## PyPI release plan
83
+
84
+ 1. Confirm the GitHub URLs in `pyproject.toml` point at the correct repository.
85
+ 2. Create a PyPI account and API token.
86
+ 3. Build distributions with `uv build`.
87
+ 4. Publish with `uv publish`.
88
+ 5. Tag the release in git so the source and package versions stay aligned.
89
+
90
+ Recommended pre-release checks:
91
+
92
+ - `uv run fix-text --help`
93
+ - `uv run fix-text README.md`
94
+ - `uv run --group dev pytest`
95
+ - `uv build --out-dir dist`
96
+
97
+ Build into the repo-local `dist/` directory:
98
+
99
+ ```bash
100
+ ./scripts/release.sh build
101
+ ```
102
+
103
+ Publish the repo-local artifacts:
104
+
105
+ ```bash
106
+ ./scripts/release.sh publish
107
+ ```
108
+
109
+ ## License
110
+
111
+ MIT
@@ -0,0 +1,46 @@
1
+ [project]
2
+ name = "fix-text"
3
+ version = "0.1.0"
4
+ description = "Detect and normalize suspicious Unicode characters in text files."
5
+ readme = "README.md"
6
+ requires-python = ">=3.12,<4"
7
+ dependencies = [
8
+ "orjson>=3.10.0",
9
+ "PyYAML>=6.0.0",
10
+ ]
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Dean Li" },
14
+ ]
15
+ keywords = ["cli", "text", "unicode", "normalization", "cleanup"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Environment :: Console",
19
+ "Intended Audience :: Developers",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Text Processing",
24
+ "Topic :: Utilities",
25
+ ]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/deantvv/fix-text"
29
+ Repository = "https://github.com/deantvv/fix-text"
30
+ Issues = "https://github.com/deantvv/fix-text/issues"
31
+ Changelog = "https://github.com/deantvv/fix-text/releases"
32
+
33
+ [project.scripts]
34
+ fix-text = "fix_text.cli:main"
35
+
36
+ [build-system]
37
+ requires = ["hatchling>=1.27.0"]
38
+ build-backend = "hatchling.build"
39
+
40
+ [dependency-groups]
41
+ dev = [
42
+ "pytest>=8.0.0",
43
+ ]
44
+
45
+ [tool.uv]
46
+ package = true
@@ -0,0 +1,3 @@
1
# Public package API: only the version string is exported.
__all__ = ["__version__"]

# Keep in sync with the version declared in pyproject.toml.
__version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
# Entry point for `python -m fix_text`; delegates to the CLI main().
from fix_text.cli import main


if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit status.
    raise SystemExit(main())
@@ -0,0 +1,272 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ import unicodedata
6
+ from collections.abc import Iterable
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+
10
+ import orjson
11
+ import yaml
12
+
13
+ from fix_text import __version__
14
+
15
+
16
# Suspicious characters and their fixes. Keys mapped to " " are nonstandard
# space variants normalized to a plain ASCII space; keys mapped to "" are
# invisible characters that are deleted outright.
REPLACEMENTS: dict[str, str] = {
    "\u00a0": " ",  # no-break space
    "\u1680": " ",  # ogham space mark
    "\u2000": " ",  # en quad
    "\u2001": " ",  # em quad
    "\u2002": " ",  # en space
    "\u2003": " ",  # em space
    "\u2004": " ",  # three-per-em space
    "\u2005": " ",  # four-per-em space
    "\u2006": " ",  # six-per-em space
    "\u2007": " ",  # figure space
    "\u2008": " ",  # punctuation space
    "\u2009": " ",  # thin space
    "\u200a": " ",  # hair space
    "\u202f": " ",  # narrow no-break space
    "\u205f": " ",  # medium mathematical space
    "\u3000": " ",  # fullwidth space
    "\ufeff": "",  # BOM / zero-width no-break space
    "\u200b": "",  # zero-width space
    "\u200c": "",  # zero-width non-joiner
    "\u200d": "",  # zero-width joiner
    "\u2060": "",  # word joiner
    "\u00ad": "",  # soft hyphen
}

# Control characters that are legitimate in text files and must be kept
# even when --include-controls is active.
CONTROL_EXCEPTIONS = {"\n", "\r", "\t"}
# File suffixes scanned by default; users can extend this set via --ext.
TEXT_SUFFIXES = {
    ".csv",
    ".html",
    ".ini",
    ".js",
    ".json",
    ".md",
    ".py",
    ".rst",
    ".sql",
    ".svg",
    ".toml",
    ".ts",
    ".tsx",
    ".txt",
    ".xml",
    ".yaml",
    ".yml",
}
61
+
62
+
63
@dataclass(slots=True)
class Issue:
    """A single suspicious character found in a scanned file."""

    # 1-based line number where the character occurs.
    line: int
    # 1-based column number within that line.
    column: int
    # The offending character itself.
    char: str
    # Human-readable description of the planned fix, e.g. "delete".
    action: str
69
+
70
+
71
def build_parser() -> argparse.ArgumentParser:
    """Construct the fix-text command-line parser.

    Returns:
        An ArgumentParser exposing the positional paths plus the
        --apply, --include-controls, --ext, --encoding and --version options.
    """
    parser = argparse.ArgumentParser(
        prog="fix-text",
        description=(
            "Detect and normalize suspicious Unicode characters in text files. "
            "Use --apply to rewrite files in place."
        ),
    )
    # Positional inputs: any mix of files and directories.
    parser.add_argument("paths", nargs="+", help="File or directory paths to scan.")
    # Boolean switches share the same store_true shape.
    for flag, help_text in (
        (
            "--apply",
            "Rewrite files in place. Without this flag the tool only reports issues.",
        ),
        (
            "--include-controls",
            "Remove unsupported control characters as well as mapped Unicode spaces.",
        ),
    ):
        parser.add_argument(flag, action="store_true", help=help_text)
    parser.add_argument(
        "--ext",
        action="append",
        default=[],
        metavar="SUFFIX",
        help="Extra file suffix to treat as text, for example --ext .log",
    )
    parser.add_argument(
        "--encoding",
        default="utf-8",
        help="Text encoding to use when reading and writing files. Default: utf-8",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    return parser
108
+
109
+
110
def iter_target_files(paths: Iterable[str], extra_suffixes: Iterable[str]) -> Iterable[Path]:
    """Yield unique text files reachable from *paths*.

    Directories are walked recursively in sorted order. Files are kept only
    when their lowercased suffix is in TEXT_SUFFIXES or *extra_suffixes*.
    Nonexistent paths produce a warning on stderr and are skipped.
    """
    allowed = TEXT_SUFFIXES | {normalize_suffix(entry) for entry in extra_suffixes}
    visited: set[Path] = set()

    for raw in paths:
        root = Path(raw)
        if not root.exists():
            print(f"warning: path does not exist: {root}", file=sys.stderr)
            continue

        if root.is_file():
            candidates = [root]
        else:
            candidates = sorted(p for p in root.rglob("*") if p.is_file())

        for candidate in candidates:
            # Deduplicate by resolved path so overlapping arguments do not
            # yield the same file twice.
            key = candidate.resolve()
            if key in visited:
                continue
            visited.add(key)
            if candidate.suffix.lower() in allowed:
                yield candidate
128
+
129
+
130
def normalize_suffix(value: str) -> str:
    """Return *value* with a leading dot, matching ``Path.suffix`` output."""
    if value.startswith("."):
        return value
    return "." + value
132
+
133
+
134
def find_issues(text: str, include_controls: bool) -> list[Issue]:
    """Scan *text* and return one Issue per suspicious character.

    Line and column numbers are 1-based; a newline advances the line
    counter and resets the column to 1.
    """
    found: list[Issue] = []
    row = 1
    col = 1

    for ch in text:
        fix = replacement_for(ch, include_controls)
        if fix is not None:
            found.append(Issue(line=row, column=col, char=ch, action=describe_action(ch, fix)))

        if ch == "\n":
            row += 1
            col = 1
        else:
            col += 1

    return found
151
+
152
+
153
def replacement_for(char: str, include_controls: bool) -> str | None:
    """Return the fix for *char*, or None when the character is acceptable.

    An empty string means "delete". Unmapped control characters (Unicode
    category Cc) are only flagged when *include_controls* is set, and
    newline, carriage return and tab are always kept.
    """
    mapped = REPLACEMENTS.get(char)
    if mapped is not None:
        return mapped

    if not include_controls:
        return None
    if char in CONTROL_EXCEPTIONS:
        return None
    return "" if unicodedata.category(char) == "Cc" else None
162
+
163
+
164
def describe_action(char: str, replacement: str) -> str:
    """Describe how *char* will be fixed, for the issue report."""
    if not replacement:
        return "delete"
    return "replace with space" if replacement == " " else f"replace with {replacement!r}"
170
+
171
+
172
def sanitize_text(text: str, include_controls: bool) -> str:
    """Return *text* with every suspicious character replaced or removed."""

    def fixed(ch: str) -> str:
        # Keep the character as-is unless a replacement is defined for it.
        replacement = replacement_for(ch, include_controls)
        return ch if replacement is None else replacement

    return "".join(fixed(ch) for ch in text)
178
+
179
+
180
def format_char(char: str) -> str:
    """Render *char* as ``U+XXXX NAME <repr>`` for report lines."""
    # unicodedata.name raises for unnamed codepoints, so supply a fallback.
    return f"U+{ord(char):04X} {unicodedata.name(char, 'UNKNOWN')} {char!r}"
185
+
186
+
187
def validate_json_text(text: str) -> bool:
    """Return True when *text* parses as JSON according to orjson."""
    try:
        orjson.loads(text)
    except orjson.JSONDecodeError:
        return False
    else:
        return True
193
+
194
+
195
def validate_yaml_text(text: str) -> bool:
    """Return True when *text* parses as YAML via PyYAML's safe loader."""
    try:
        yaml.safe_load(text)
    except yaml.YAMLError:
        return False
    else:
        return True
201
+
202
+
203
+ def validate_cleaned_text(path: Path, text: str) -> tuple[bool, str | None]:
204
+ suffix = path.suffix.lower()
205
+ if suffix == ".json":
206
+ return validate_json_text(text), "JSON"
207
+ if suffix in {".yaml", ".yml"}:
208
+ return validate_yaml_text(text), "YAML"
209
+ return True, None
210
+
211
+
212
def process_file(path: Path, encoding: str, apply: bool, include_controls: bool) -> tuple[int, bool]:
    """Report (and optionally fix) suspicious characters in one file.

    Returns ``(issue_count, changed)``. Files that fail to decode are
    skipped with a stderr notice. JSON/YAML files are only rewritten when
    the cleaned content still parses; otherwise the file is left untouched.
    """
    try:
        original = path.read_text(encoding=encoding)
    except UnicodeDecodeError:
        print(f"skipped binary or non-{encoding} file: {path}", file=sys.stderr)
        return 0, False

    issues = find_issues(original, include_controls=include_controls)
    if not issues:
        return 0, False

    # Always report the findings, even in scan-only mode.
    print(path)
    for issue in issues:
        print(f"  {issue.line}:{issue.column} {format_char(issue.char)} -> {issue.action}")

    if not apply:
        return len(issues), False

    sanitized = sanitize_text(original, include_controls=include_controls)
    if sanitized == original:
        return len(issues), False

    is_valid, file_type = validate_cleaned_text(path, sanitized)
    if file_type is not None and not is_valid:
        print(f"  skipped rewrite: cleaned content is not valid {file_type}", file=sys.stderr)
        return len(issues), False

    path.write_text(sanitized, encoding=encoding)
    print("  rewritten")
    return len(issues), True
240
+
241
+
242
def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Returns 0 when nothing suspicious was found or --apply ran, and 1 when
    issues were reported in scan-only mode.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    total_issues = 0
    fixed_issues = 0
    changed_files = 0

    for path in iter_target_files(args.paths, args.ext):
        issue_count, changed = process_file(
            path,
            encoding=args.encoding,
            apply=args.apply,
            include_controls=args.include_controls,
        )
        total_issues += issue_count
        if changed:
            # Bug fix: only count issues in files that were actually
            # rewritten as "fixed". Previously the --apply summary counted
            # issues from JSON/YAML files whose rewrite was skipped because
            # the cleaned content failed validation, over-reporting fixes.
            fixed_issues += issue_count
            changed_files += 1

    if total_issues == 0:
        print("No suspicious characters found.")
        return 0

    if args.apply:
        print(f"\nFixed {fixed_issues} issue(s) across {changed_files} file(s).")
    else:
        print(f"\nFound {total_issues} issue(s). Re-run with --apply to rewrite files.")

    return 1 if not args.apply else 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,97 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from fix_text.cli import (
6
+ main,
7
+ process_file,
8
+ sanitize_text,
9
+ validate_json_text,
10
+ validate_yaml_text,
11
+ )
12
+
13
+
14
def test_sanitize_text_replaces_and_deletes_mapped_characters() -> None:
    """Fullwidth space becomes an ASCII space; zero-width space is removed."""
    assert sanitize_text("A\u3000B\u200bC", include_controls=False) == "A BC"


def test_validate_json_text_accepts_valid_json() -> None:
    """A well-formed JSON object passes validation."""
    assert validate_json_text('{"name":"value"}')


def test_validate_json_text_rejects_invalid_json() -> None:
    """A trailing comma makes the document invalid JSON."""
    assert not validate_json_text('{"name":"value",}')


def test_validate_yaml_text_accepts_valid_yaml() -> None:
    """A simple mapping with a nested list parses as YAML."""
    assert validate_yaml_text("name: value\nitems:\n  - one\n")


def test_validate_yaml_text_rejects_invalid_yaml() -> None:
    """An unterminated flow sequence fails validation."""
    assert not validate_yaml_text("items: [one\n")


def test_process_file_rewrites_valid_json(tmp_path: Path, capsys) -> None:
    """JSON files whose cleaned content still parses are rewritten in place."""
    path = tmp_path / "sample.json"
    path.write_text('{"name":"A\u3000B","note":"x\u200by"}\n', encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 2
    assert changed is True
    assert path.read_text(encoding="utf-8") == '{"name":"A B","note":"xy"}\n'
    captured = capsys.readouterr()
    assert "rewritten" in captured.out


def test_process_file_skips_invalid_json_rewrite(tmp_path: Path, capsys) -> None:
    """JSON files that would remain invalid after cleanup are left untouched."""
    path = tmp_path / "invalid.json"
    original = '{"name":"A\u3000B",}\n'
    path.write_text(original, encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 1
    assert changed is False
    assert path.read_text(encoding="utf-8") == original
    captured = capsys.readouterr()
    assert "skipped rewrite: cleaned content is not valid JSON" in captured.err


def test_process_file_rewrites_valid_yaml(tmp_path: Path, capsys) -> None:
    """YAML files whose cleaned content still parses are rewritten in place."""
    path = tmp_path / "sample.yaml"
    path.write_text("title: A\u3000B\nnote: x\u200by\n", encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 2
    assert changed is True
    assert path.read_text(encoding="utf-8") == "title: A B\nnote: xy\n"
    captured = capsys.readouterr()
    assert "rewritten" in captured.out


def test_process_file_skips_invalid_yml_rewrite(tmp_path: Path, capsys) -> None:
    """.yml files that would remain invalid after cleanup are left untouched."""
    path = tmp_path / "invalid.yml"
    original = "items: [one\u200b\n"
    path.write_text(original, encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 1
    assert changed is False
    assert path.read_text(encoding="utf-8") == original
    captured = capsys.readouterr()
    assert "skipped rewrite: cleaned content is not valid YAML" in captured.err


def test_main_reports_findings_without_apply(tmp_path: Path, capsys) -> None:
    """Scan-only mode reports the finding and exits with status 1."""
    path = tmp_path / "notes.txt"
    path.write_text("hello\u3000world\n", encoding="utf-8")

    exit_code = main([str(path)])

    assert exit_code == 1
    captured = capsys.readouterr()
    assert "U+3000 IDEOGRAPHIC SPACE" in captured.out
    assert "Re-run with --apply" in captured.out