fix-text 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fix_text-0.1.0/.copier-answers.yml +8 -0
- fix_text-0.1.0/.gitignore +10 -0
- fix_text-0.1.0/.python-version +1 -0
- fix_text-0.1.0/LICENSE +21 -0
- fix_text-0.1.0/PKG-INFO +136 -0
- fix_text-0.1.0/README.md +111 -0
- fix_text-0.1.0/pyproject.toml +46 -0
- fix_text-0.1.0/src/fix_text/__init__.py +3 -0
- fix_text-0.1.0/src/fix_text/__main__.py +5 -0
- fix_text-0.1.0/src/fix_text/cli.py +272 -0
- fix_text-0.1.0/tests/test_cli.py +97 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# Changes here will be overwritten by Copier. Do not edit manually.
|
|
2
|
+
_commit: v0.2.20
|
|
3
|
+
_src_path: gh:jlevy/simple-modern-uv
|
|
4
|
+
package_author_name: Dean Li
|
|
5
|
+
package_description: Detect and normalize suspicious Unicode characters in text files.
|
|
6
|
+
package_github_org: deantvv
|
|
7
|
+
package_module: fix_text
|
|
8
|
+
package_name: fix-text
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
fix_text-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Dean
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
fix_text-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fix-text
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Detect and normalize suspicious Unicode characters in text files.
|
|
5
|
+
Project-URL: Homepage, https://github.com/deantvv/fix-text
|
|
6
|
+
Project-URL: Repository, https://github.com/deantvv/fix-text
|
|
7
|
+
Project-URL: Issues, https://github.com/deantvv/fix-text/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/deantvv/fix-text/releases
|
|
9
|
+
Author: Dean Li
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: cleanup,cli,normalization,text,unicode
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Text Processing
|
|
20
|
+
Classifier: Topic :: Utilities
|
|
21
|
+
Requires-Python: <4,>=3.12
|
|
22
|
+
Requires-Dist: orjson>=3.10.0
|
|
23
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# fix-text
|
|
27
|
+
|
|
28
|
+
`fix-text` is a small Python CLI for detecting and cleaning suspicious Unicode
|
|
29
|
+
characters in text files, including fullwidth spaces, non-breaking spaces, zero-width
|
|
30
|
+
characters, BOM markers, and optional control characters.
|
|
31
|
+
|
|
32
|
+
## What it fixes
|
|
33
|
+
|
|
34
|
+
- Replaces fullwidth and non-standard spacing characters with a normal ASCII space
|
|
35
|
+
- Removes zero-width characters and byte order marks
|
|
36
|
+
- Optionally removes unsupported control characters with `--include-controls`
|
|
37
|
+
- Validates cleaned `.json` files before rewriting them
|
|
38
|
+
- Validates cleaned `.yaml` and `.yml` files before rewriting them
|
|
39
|
+
- Scans individual files or entire directory trees
|
|
40
|
+
|
|
41
|
+
## Install for development
|
|
42
|
+
|
|
43
|
+
This project is managed with `uv`.
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
uv sync
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Install the test dependency group:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv sync --group dev
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Run the CLI without installing globally:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
uv run fix-text --help
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Run the test suite:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
uv run --group dev pytest
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Usage
|
|
68
|
+
|
|
69
|
+
Scan a file or directory:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
uv run fix-text path/to/file.txt path/to/dir
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Rewrite files in place:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
uv run fix-text --apply path/to/file.txt
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Include control-character cleanup:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
uv run fix-text --apply --include-controls logs/
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Treat additional suffixes as text:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
uv run fix-text --ext .log --ext .cfg sample/
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
For `.json` files, `fix-text` validates the cleaned output with `orjson` before writing.
|
|
94
|
+
For `.yaml` and `.yml`, it validates the cleaned output with `PyYAML`.
|
|
95
|
+
If the cleaned content would still be invalid JSON or YAML, the file is left unchanged.
|
|
96
|
+
|
|
97
|
+
## Example output
|
|
98
|
+
|
|
99
|
+
```text
|
|
100
|
+
notes.txt
|
|
101
|
+
3:14 U+3000 IDEOGRAPHIC SPACE '\u3000' -> replace with space
|
|
102
|
+
8:2 U+200B ZERO WIDTH SPACE '\u200b' -> delete
|
|
103
|
+
|
|
104
|
+
Found 2 issue(s). Re-run with --apply to rewrite files.
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## PyPI release plan
|
|
108
|
+
|
|
109
|
+
1. Replace the placeholder GitHub URLs in `pyproject.toml`.
|
|
110
|
+
2. Create a PyPI account and API token.
|
|
111
|
+
3. Build distributions with `uv build`.
|
|
112
|
+
4. Publish with `uv publish`.
|
|
113
|
+
5. Tag the release in git so the source and package versions stay aligned.
|
|
114
|
+
|
|
115
|
+
Recommended pre-release checks:
|
|
116
|
+
|
|
117
|
+
- `uv run fix-text --help`
|
|
118
|
+
- `uv run fix-text README.md`
|
|
119
|
+
- `uv run --group dev pytest`
|
|
120
|
+
- `uv build --out-dir dist`
|
|
121
|
+
|
|
122
|
+
Build into the repo-local `dist/` directory:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
./scripts/release.sh build
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Publish the repo-local artifacts:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
./scripts/release.sh publish
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
MIT
|
fix_text-0.1.0/README.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# fix-text
|
|
2
|
+
|
|
3
|
+
`fix-text` is a small Python CLI for detecting and cleaning suspicious Unicode
|
|
4
|
+
characters in text files, including fullwidth spaces, non-breaking spaces, zero-width
|
|
5
|
+
characters, BOM markers, and optional control characters.
|
|
6
|
+
|
|
7
|
+
## What it fixes
|
|
8
|
+
|
|
9
|
+
- Replaces fullwidth and non-standard spacing characters with a normal ASCII space
|
|
10
|
+
- Removes zero-width characters and byte order marks
|
|
11
|
+
- Optionally removes unsupported control characters with `--include-controls`
|
|
12
|
+
- Validates cleaned `.json` files before rewriting them
|
|
13
|
+
- Validates cleaned `.yaml` and `.yml` files before rewriting them
|
|
14
|
+
- Scans individual files or entire directory trees
|
|
15
|
+
|
|
16
|
+
## Install for development
|
|
17
|
+
|
|
18
|
+
This project is managed with `uv`.
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv sync
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Install the test dependency group:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
uv sync --group dev
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Run the CLI without installing globally:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
uv run fix-text --help
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Run the test suite:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
uv run --group dev pytest
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
Scan a file or directory:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
uv run fix-text path/to/file.txt path/to/dir
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Rewrite files in place:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uv run fix-text --apply path/to/file.txt
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Include control-character cleanup:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
uv run fix-text --apply --include-controls logs/
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Treat additional suffixes as text:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
uv run fix-text --ext .log --ext .cfg sample/
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
For `.json` files, `fix-text` validates the cleaned output with `orjson` before writing.
|
|
69
|
+
For `.yaml` and `.yml`, it validates the cleaned output with `PyYAML`.
|
|
70
|
+
If the cleaned content would still be invalid JSON or YAML, the file is left unchanged.
|
|
71
|
+
|
|
72
|
+
## Example output
|
|
73
|
+
|
|
74
|
+
```text
|
|
75
|
+
notes.txt
|
|
76
|
+
3:14 U+3000 IDEOGRAPHIC SPACE '\u3000' -> replace with space
|
|
77
|
+
8:2 U+200B ZERO WIDTH SPACE '\u200b' -> delete
|
|
78
|
+
|
|
79
|
+
Found 2 issue(s). Re-run with --apply to rewrite files.
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## PyPI release plan
|
|
83
|
+
|
|
84
|
+
1. Replace the placeholder GitHub URLs in `pyproject.toml`.
|
|
85
|
+
2. Create a PyPI account and API token.
|
|
86
|
+
3. Build distributions with `uv build`.
|
|
87
|
+
4. Publish with `uv publish`.
|
|
88
|
+
5. Tag the release in git so the source and package versions stay aligned.
|
|
89
|
+
|
|
90
|
+
Recommended pre-release checks:
|
|
91
|
+
|
|
92
|
+
- `uv run fix-text --help`
|
|
93
|
+
- `uv run fix-text README.md`
|
|
94
|
+
- `uv run --group dev pytest`
|
|
95
|
+
- `uv build --out-dir dist`
|
|
96
|
+
|
|
97
|
+
Build into the repo-local `dist/` directory:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
./scripts/release.sh build
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Publish the repo-local artifacts:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
./scripts/release.sh publish
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## License
|
|
110
|
+
|
|
111
|
+
MIT
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fix-text"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Detect and normalize suspicious Unicode characters in text files."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12,<4"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"orjson>=3.10.0",
|
|
9
|
+
"PyYAML>=6.0.0",
|
|
10
|
+
]
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Dean Li" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["cli", "text", "unicode", "normalization", "cleanup"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Environment :: Console",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Text Processing",
|
|
24
|
+
"Topic :: Utilities",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/deantvv/fix-text"
|
|
29
|
+
Repository = "https://github.com/deantvv/fix-text"
|
|
30
|
+
Issues = "https://github.com/deantvv/fix-text/issues"
|
|
31
|
+
Changelog = "https://github.com/deantvv/fix-text/releases"
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
fix-text = "fix_text.cli:main"
|
|
35
|
+
|
|
36
|
+
[build-system]
|
|
37
|
+
requires = ["hatchling>=1.27.0"]
|
|
38
|
+
build-backend = "hatchling.build"
|
|
39
|
+
|
|
40
|
+
[dependency-groups]
|
|
41
|
+
dev = [
|
|
42
|
+
"pytest>=8.0.0",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[tool.uv]
|
|
46
|
+
package = true
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
import unicodedata
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import orjson
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
from fix_text import __version__
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Mapping of suspicious Unicode characters to their normalized replacement.
# A value of " " rewrites exotic spacing characters to a plain ASCII space;
# a value of "" deletes invisible characters outright.
REPLACEMENTS: dict[str, str] = {
    "\u00a0": " ",  # no-break space
    "\u1680": " ",  # ogham space mark
    "\u2000": " ",  # en quad
    "\u2001": " ",  # em quad
    "\u2002": " ",  # en space
    "\u2003": " ",  # em space
    "\u2004": " ",  # three-per-em space
    "\u2005": " ",  # four-per-em space
    "\u2006": " ",  # six-per-em space
    "\u2007": " ",  # figure space
    "\u2008": " ",  # punctuation space
    "\u2009": " ",  # thin space
    "\u200a": " ",  # hair space
    "\u202f": " ",  # narrow no-break space
    "\u205f": " ",  # medium mathematical space
    "\u3000": " ",  # fullwidth space
    "\ufeff": "",  # BOM / zero-width no-break space
    "\u200b": "",  # zero-width space
    "\u200c": "",  # zero-width non-joiner
    "\u200d": "",  # zero-width joiner
    "\u2060": "",  # word joiner
    "\u00ad": "",  # soft hyphen
}

# Control characters (category "Cc") that are legitimate in text files and
# must survive --include-controls cleanup.
CONTROL_EXCEPTIONS = {"\n", "\r", "\t"}

# File suffixes treated as text by default; extend at runtime via --ext.
TEXT_SUFFIXES = {
    ".csv",
    ".html",
    ".ini",
    ".js",
    ".json",
    ".md",
    ".py",
    ".rst",
    ".sql",
    ".svg",
    ".toml",
    ".ts",
    ".tsx",
    ".txt",
    ".xml",
    ".yaml",
    ".yml",
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass(slots=True)
class Issue:
    """One suspicious character found while scanning a file."""

    # 1-based line number where the character occurs.
    line: int
    # 1-based column number within that line.
    column: int
    # The offending character itself.
    char: str
    # Human-readable description of the planned fix (e.g. "delete").
    action: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build and return the argument parser for the fix-text CLI."""
    description = (
        "Detect and normalize suspicious Unicode characters in text files. "
        "Use --apply to rewrite files in place."
    )
    arg_parser = argparse.ArgumentParser(prog="fix-text", description=description)
    arg_parser.add_argument("paths", nargs="+", help="File or directory paths to scan.")
    arg_parser.add_argument(
        "--apply",
        action="store_true",
        help="Rewrite files in place. Without this flag the tool only reports issues.",
    )
    arg_parser.add_argument(
        "--include-controls",
        action="store_true",
        help="Remove unsupported control characters as well as mapped Unicode spaces.",
    )
    arg_parser.add_argument(
        "--ext",
        action="append",
        default=[],
        metavar="SUFFIX",
        help="Extra file suffix to treat as text, for example --ext .log",
    )
    arg_parser.add_argument(
        "--encoding",
        default="utf-8",
        help="Text encoding to use when reading and writing files. Default: utf-8",
    )
    arg_parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    return arg_parser
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def iter_target_files(paths: Iterable[str], extra_suffixes: Iterable[str]) -> Iterable[Path]:
    """Yield each unique text file reachable from *paths*.

    Directories are walked recursively in sorted order; files are deduplicated
    by resolved path so overlapping arguments are reported only once.  A file
    qualifies when its lowercased suffix is in TEXT_SUFFIXES or was supplied
    via *extra_suffixes*.  Missing paths produce a stderr warning.
    """
    # Lowercase the normalized extra suffixes: candidates are compared with
    # candidate.suffix.lower(), so without this `--ext .LOG` never matched.
    suffixes = TEXT_SUFFIXES | {normalize_suffix(value).lower() for value in extra_suffixes}
    seen: set[Path] = set()

    for raw_path in paths:
        path = Path(raw_path)
        if not path.exists():
            print(f"warning: path does not exist: {path}", file=sys.stderr)
            continue

        candidates = [path] if path.is_file() else sorted(p for p in path.rglob("*") if p.is_file())
        for candidate in candidates:
            resolved = candidate.resolve()
            if resolved in seen:
                continue
            seen.add(resolved)
            if candidate.suffix.lower() in suffixes:
                yield candidate
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def normalize_suffix(value: str) -> str:
    """Ensure *value* starts with a dot so it can match Path.suffix."""
    if value.startswith("."):
        return value
    return f".{value}"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def find_issues(text: str, include_controls: bool) -> list[Issue]:
    """Scan *text* and report every character that would be changed."""
    found: list[Issue] = []
    row = 1
    col = 1

    for ch in text:
        fix = replacement_for(ch, include_controls)
        if fix is not None:
            found.append(Issue(line=row, column=col, char=ch, action=describe_action(ch, fix)))

        # Track the 1-based position for the next character.
        if ch == "\n":
            row += 1
            col = 1
        else:
            col += 1

    return found
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def replacement_for(char: str, include_controls: bool) -> str | None:
    """Return the replacement for *char*, or None if it needs no change."""
    if char in REPLACEMENTS:
        return REPLACEMENTS[char]

    # Optionally strip category-"Cc" control characters, keeping the
    # whitelisted newline/carriage-return/tab.
    is_control = unicodedata.category(char) == "Cc"
    if include_controls and is_control and char not in CONTROL_EXCEPTIONS:
        return ""

    return None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def describe_action(char: str, replacement: str) -> str:
|
|
165
|
+
if replacement == "":
|
|
166
|
+
return "delete"
|
|
167
|
+
if replacement == " ":
|
|
168
|
+
return "replace with space"
|
|
169
|
+
return f"replace with {replacement!r}"
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def sanitize_text(text: str, include_controls: bool) -> str:
    """Return *text* with every suspicious character replaced or removed."""

    def fix(ch: str) -> str:
        replacement = replacement_for(ch, include_controls)
        return ch if replacement is None else replacement

    return "".join(fix(ch) for ch in text)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def format_char(char: str) -> str:
    """Format *char* as ``U+XXXX NAME 'repr'`` for diagnostics."""
    name = unicodedata.name(char, "UNKNOWN")
    return f"U+{ord(char):04X} {name} {char!r}"
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def validate_json_text(text: str) -> bool:
    """Return True when *text* parses as JSON under orjson."""
    try:
        orjson.loads(text)
        return True
    except orjson.JSONDecodeError:
        return False
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def validate_yaml_text(text: str) -> bool:
    """Return True when *text* parses as YAML under PyYAML's safe loader."""
    try:
        yaml.safe_load(text)
        return True
    except yaml.YAMLError:
        return False
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def validate_cleaned_text(path: Path, text: str) -> tuple[bool, str | None]:
|
|
204
|
+
suffix = path.suffix.lower()
|
|
205
|
+
if suffix == ".json":
|
|
206
|
+
return validate_json_text(text), "JSON"
|
|
207
|
+
if suffix in {".yaml", ".yml"}:
|
|
208
|
+
return validate_yaml_text(text), "YAML"
|
|
209
|
+
return True, None
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def process_file(path: Path, encoding: str, apply: bool, include_controls: bool) -> tuple[int, bool]:
    """Scan one file and optionally rewrite it in place.

    Returns ``(issue_count, changed)`` where *changed* is True only when the
    file was actually rewritten.  Findings are printed to stdout, warnings to
    stderr.
    """
    try:
        original = path.read_text(encoding=encoding)
    except UnicodeDecodeError:
        # Likely a binary file (or wrong --encoding); skip rather than crash.
        print(f"skipped binary or non-{encoding} file: {path}", file=sys.stderr)
        return 0, False

    issues = find_issues(original, include_controls=include_controls)
    if not issues:
        return 0, False

    # Report every finding before deciding whether to rewrite.
    print(path)
    for issue in issues:
        print(f"  {issue.line}:{issue.column} {format_char(issue.char)} -> {issue.action}")

    changed = False
    if apply:
        sanitized = sanitize_text(original, include_controls=include_controls)
        if sanitized != original:
            # For JSON/YAML, refuse to write output that would no longer parse.
            is_valid, file_type = validate_cleaned_text(path, sanitized)
            if not is_valid and file_type is not None:
                print(f"  skipped rewrite: cleaned content is not valid {file_type}", file=sys.stderr)
                return len(issues), False
            path.write_text(sanitized, encoding=encoding)
            changed = True
            print("  rewritten")

    return len(issues), changed
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Returns 0 when nothing was found or everything was fixed, 1 when issues
    were reported without --apply.
    """
    args = build_parser().parse_args(argv)

    total_issues = 0
    changed_files = 0

    for target in iter_target_files(args.paths, args.ext):
        count, did_change = process_file(
            target,
            encoding=args.encoding,
            apply=args.apply,
            include_controls=args.include_controls,
        )
        total_issues += count
        if did_change:
            changed_files += 1

    if not total_issues:
        print("No suspicious characters found.")
        return 0

    if args.apply:
        print(f"\nFixed {total_issues} issue(s) across {changed_files} file(s).")
        return 0

    print(f"\nFound {total_issues} issue(s). Re-run with --apply to rewrite files.")
    return 1
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
if __name__ == "__main__":
    # Allow direct execution (e.g. `python -m fix_text.cli`); SystemExit
    # propagates main()'s return value as the process exit code.
    raise SystemExit(main())
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from fix_text.cli import (
|
|
6
|
+
main,
|
|
7
|
+
process_file,
|
|
8
|
+
sanitize_text,
|
|
9
|
+
validate_json_text,
|
|
10
|
+
validate_yaml_text,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_sanitize_text_replaces_and_deletes_mapped_characters() -> None:
    """Fullwidth space becomes ASCII space; zero-width space is deleted."""
    assert sanitize_text("A\u3000B\u200bC", include_controls=False) == "A BC"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_validate_json_text_accepts_valid_json() -> None:
    """A well-formed JSON object passes validation."""
    assert validate_json_text('{"name":"value"}')
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_validate_json_text_rejects_invalid_json() -> None:
    """A trailing comma is invalid JSON and must be rejected."""
    assert not validate_json_text('{"name":"value",}')
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_validate_yaml_text_accepts_valid_yaml() -> None:
    """A well-formed YAML mapping with a list passes validation."""
    assert validate_yaml_text("name: value\nitems:\n  - one\n")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_validate_yaml_text_rejects_invalid_yaml() -> None:
    """An unterminated flow sequence is invalid YAML and must be rejected."""
    assert not validate_yaml_text("items: [one\n")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_process_file_rewrites_valid_json(tmp_path: Path, capsys) -> None:
    """JSON files whose cleaned content still parses are rewritten in place."""
    path = tmp_path / "sample.json"
    path.write_text('{"name":"A\u3000B","note":"x\u200by"}\n', encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 2
    assert changed is True
    assert path.read_text(encoding="utf-8") == '{"name":"A B","note":"xy"}\n'
    captured = capsys.readouterr()
    assert "rewritten" in captured.out
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_process_file_skips_invalid_json_rewrite(tmp_path: Path, capsys) -> None:
    """JSON files that would still be invalid after cleanup are left untouched."""
    path = tmp_path / "invalid.json"
    original = '{"name":"A\u3000B",}\n'
    path.write_text(original, encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 1
    assert changed is False
    assert path.read_text(encoding="utf-8") == original
    captured = capsys.readouterr()
    assert "skipped rewrite: cleaned content is not valid JSON" in captured.err
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_process_file_rewrites_valid_yaml(tmp_path: Path, capsys) -> None:
    """YAML files whose cleaned content still parses are rewritten in place."""
    path = tmp_path / "sample.yaml"
    path.write_text("title: A\u3000B\nnote: x\u200by\n", encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 2
    assert changed is True
    assert path.read_text(encoding="utf-8") == "title: A B\nnote: xy\n"
    captured = capsys.readouterr()
    assert "rewritten" in captured.out
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_process_file_skips_invalid_yml_rewrite(tmp_path: Path, capsys) -> None:
    """.yml files that stay invalid after cleanup are left untouched."""
    path = tmp_path / "invalid.yml"
    original = "items: [one\u200b\n"
    path.write_text(original, encoding="utf-8")

    issues, changed = process_file(path, encoding="utf-8", apply=True, include_controls=False)

    assert issues == 1
    assert changed is False
    assert path.read_text(encoding="utf-8") == original
    captured = capsys.readouterr()
    assert "skipped rewrite: cleaned content is not valid YAML" in captured.err
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_main_reports_findings_without_apply(tmp_path: Path, capsys) -> None:
    """Without --apply, main() reports findings and exits with code 1."""
    path = tmp_path / "notes.txt"
    path.write_text("hello\u3000world\n", encoding="utf-8")

    exit_code = main([str(path)])

    assert exit_code == 1
    captured = capsys.readouterr()
    assert "U+3000 IDEOGRAPHIC SPACE" in captured.out
    assert "Re-run with --apply" in captured.out
|