cleanmonkey 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cleanmonkey/__init__.py +7 -0
- cleanmonkey/__main__.py +5 -0
- cleanmonkey/cli.py +400 -0
- cleanmonkey/core.py +449 -0
- cleanmonkey/maps.py +106 -0
- cleanmonkey/profiles.py +57 -0
- cleanmonkey/py.typed +0 -0
- cleanmonkey-0.1.0.dist-info/METADATA +152 -0
- cleanmonkey-0.1.0.dist-info/RECORD +13 -0
- cleanmonkey-0.1.0.dist-info/WHEEL +5 -0
- cleanmonkey-0.1.0.dist-info/entry_points.txt +2 -0
- cleanmonkey-0.1.0.dist-info/licenses/LICENSE +21 -0
- cleanmonkey-0.1.0.dist-info/top_level.txt +1 -0
cleanmonkey/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""cleanmonkey — one-call text cleanup for invisible characters, smart quotes, and whitespace."""
|
|
2
|
+
|
|
3
|
+
from cleanmonkey.core import MAX_DEPTH, clean, clean_column, clean_dict, inspect
|
|
4
|
+
from cleanmonkey.profiles import PROFILES, Profile
|
|
5
|
+
|
|
6
|
+
# Package version string; cleanmonkey.cli._get_version() imports this value
# for the ``--version`` flag.
__version__ = "0.1.0"

# Public names exported by ``from cleanmonkey import *``.
__all__ = ["MAX_DEPTH", "clean", "clean_column", "clean_dict", "inspect", "Profile", "PROFILES"]
|
cleanmonkey/__main__.py
ADDED
cleanmonkey/cli.py
ADDED
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
"""CLI entry point for cleanmonkey."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import contextlib
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
import shutil
|
|
10
|
+
import stat
|
|
11
|
+
import sys
|
|
12
|
+
import tempfile
|
|
13
|
+
from typing import Any, TextIO
|
|
14
|
+
|
|
15
|
+
from cleanmonkey.core import clean, inspect
|
|
16
|
+
from cleanmonkey.profiles import PROFILES
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _fsync_directory(dir_path: str) -> None:
    """Best-effort fsync of a directory for rename durability on POSIX.

    Parameters
    ----------
    dir_path : str
        Directory whose entry table should be flushed to stable storage.

    Notes
    -----
    This function never raises. On Windows it returns immediately, because
    ``os.open()`` cannot open a directory there and fails with
    ``PermissionError`` (EACCES). EACCES is deliberately *not* in the
    "unsupported" set below, so without the early return every write on
    Windows would emit a spurious warning.

    On other platforms, errors that mean "directory fsync is not supported
    here" are ignored silently. Any other ``OSError`` is reported as a
    warning on stderr because it may indicate a real durability problem.
    """
    import errno

    if os.name == "nt":
        # Directory fsync is not available on Windows; nothing to do.
        return

    # Errors that indicate "not supported here" rather than a real failure.
    # EACCES/EBADF are NOT included: they may indicate real permission issues
    # on the directory that should surface as warnings.
    unsupported_errnos = {
        errno.ENOTSUP, errno.EOPNOTSUPP, errno.ENOSYS, errno.EINVAL,
    }
    try:
        fd = os.open(dir_path, os.O_RDONLY)
        try:
            os.fsync(fd)
        finally:
            # Always release the descriptor, even when fsync fails.
            os.close(fd)
    except OSError as exc:
        if exc.errno not in unsupported_errnos:
            print(
                f"cleanmonkey: warning: directory fsync failed for "
                f"{dir_path!r}: {exc}",
                file=sys.stderr,
            )
        # Never fatal — durability is best-effort.
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _open_streams(
    file_path: str | None,
    output_path: str | None,
) -> contextlib.AbstractContextManager[tuple[TextIO, TextIO, str | None]]:
    """Open input/output streams with explicit ownership.

    When a path is provided (and is not ``"-"``), the file is opened and will
    be closed when the context manager exits. ``stdin``/``stdout`` are
    *borrowed* – never closed by this function.

    When *output_path* refers to a real file, writes go to a temporary file in
    the same directory. The yielded tuple is ``(in_stream, out_stream,
    tmp_path)``; ``tmp_path`` is ``None`` when output goes to stdout. The
    temporary file is promoted to the (symlink-resolved) destination with
    ``os.replace()`` when the ``with`` body finishes without raising. On
    failure, the temp file is removed and the original destination is
    untouched.

    Parameters
    ----------
    file_path : str or None
        Input path; ``None`` or ``"-"`` selects ``sys.stdin``.
    output_path : str or None
        Output path; ``None`` or ``"-"`` selects ``sys.stdout``.

    Raises
    ------
    OSError
        On exit, if flushing/fsyncing the temp file fails or if promoting it
        to the destination fails. Errors from opening the files propagate
        when the context is entered.
    """

    @contextlib.contextmanager
    def _ctx():
        """Generator-based context manager implementing the contract above."""
        in_stream: TextIO | None = None
        out_stream: TextIO | None = None
        tmp_path: str | None = None
        # Resolve symlinks so os.replace() writes through to the real
        # destination instead of replacing the symlink itself.
        resolved_output = (
            os.path.realpath(output_path)
            if output_path is not None and output_path != "-"
            else output_path
        )
        # Flipped to True only if the ``with`` body returns normally.
        success = False
        try:
            if file_path is None or file_path == "-":
                in_stream = sys.stdin
            else:
                # newline="" disables newline translation on read.
                in_stream = open(file_path, "r", encoding="utf-8", newline="")

            if resolved_output is None or resolved_output == "-":
                out_stream = sys.stdout
            else:
                # Write to a temp file in the same directory so os.replace()
                # is atomic on the same filesystem.
                out_dir = os.path.dirname(os.path.abspath(resolved_output))
                fd, tmp_path = tempfile.mkstemp(
                    dir=out_dir, prefix=".cleanmonkey_", suffix=".tmp",
                )
                # Wrap the descriptor returned by mkstemp; closing the stream
                # closes the descriptor.
                out_stream = open(fd, "w", encoding="utf-8", newline="")

            yield in_stream, out_stream, tmp_path
            success = True
        finally:
            # Only close streams we opened (not stdin/stdout).
            if in_stream is not None and in_stream is not sys.stdin:
                in_stream.close()
            # Holds a flush/fsync failure so it can be re-raised after cleanup.
            finalize_error: OSError | None = None
            if out_stream is not None and out_stream is not sys.stdout:
                try:
                    # Flush to OS and fsync for durability before atomic rename.
                    if tmp_path is not None and success:
                        out_stream.flush()
                        os.fsync(out_stream.fileno())
                except OSError as exc:
                    # Flush/fsync failed — treat as unsuccessful so we clean up.
                    success = False
                    finalize_error = exc
                finally:
                    out_stream.close()
            # Clean up temp file on failure; promote on success.
            if tmp_path is not None:
                if success and resolved_output is not None:
                    try:
                        # Preserve original file metadata if destination exists.
                        dest_existed = os.path.exists(resolved_output)
                        if dest_existed:
                            try:
                                shutil.copystat(resolved_output, tmp_path)
                            except OSError:
                                # copystat failed (e.g. unsupported metadata).
                                # Fall back to preserving at least the file mode
                                # so os.replace() doesn't leave mkstemp's 0600.
                                try:
                                    orig_mode = os.stat(resolved_output).st_mode
                                    os.chmod(tmp_path, stat.S_IMODE(orig_mode))
                                except OSError as perm_exc:
                                    # Neither copystat nor chmod worked: warn and
                                    # continue with the temp file's own mode.
                                    print(
                                        f"cleanmonkey: warning: could not preserve "
                                        f"permissions for {resolved_output!r}: {perm_exc}",
                                        file=sys.stderr,
                                    )
                                else:
                                    # The chmod fallback succeeded.
                                    print(
                                        f"cleanmonkey: warning: metadata preservation "
                                        f"partially failed for {resolved_output!r}; "
                                        f"file mode preserved",
                                        file=sys.stderr,
                                    )
                        else:
                            # mkstemp creates files with mode 0600; apply
                            # umask-derived default so new files behave like
                            # a normal open() would (typically 0644).
                            # Apply to temp file BEFORE replace so that a
                            # chmod failure cannot leave a replaced destination.
                            # os.umask() can only be read by setting it, so set
                            # it to 0 and immediately restore the old value.
                            umask = os.umask(0)
                            os.umask(umask)
                            os.chmod(tmp_path, 0o666 & ~umask)
                        os.replace(tmp_path, resolved_output)
                        _fsync_directory(os.path.dirname(os.path.abspath(resolved_output)))
                    except OSError:
                        # Promotion failed — remove the temp file (best effort;
                        # it no longer exists if os.replace() already ran) and
                        # re-raise the original error.
                        try:
                            os.unlink(tmp_path)
                        except OSError:
                            pass
                        raise
                else:
                    # Body raised or flush/fsync failed: discard the temp file
                    # and leave the destination untouched.
                    try:
                        os.unlink(tmp_path)
                    except OSError:
                        pass
            # Re-raise flush/fsync error after cleanup so callers know it failed.
            if finalize_error is not None:
                raise finalize_error

    return _ctx()
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _non_negative_int(value: str) -> int:
    """Argparse ``type=`` callable that accepts only integers >= 0.

    Parameters
    ----------
    value : str
        Raw command-line token.

    Returns
    -------
    int
        The parsed, non-negative integer.

    Raises
    ------
    argparse.ArgumentTypeError
        If *value* is not an integer literal or is negative. For a parse
        failure the original ``ValueError`` is attached as ``__cause__``.
    """
    try:
        number = int(value)
    except ValueError as exc:
        # Chain explicitly so the failure is not reported as an unrelated
        # error raised "during handling" of the ValueError.
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from exc
    if number < 0:
        raise argparse.ArgumentTypeError(f"must be non-negative, got {number}")
    return number
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def main(argv: list[str] | None = None) -> None:
    """Parse command-line arguments and run the cleanmonkey CLI.

    Parameters
    ----------
    argv : list of str or None
        Argument vector to parse; ``None`` lets argparse read ``sys.argv``.

    Raises
    ------
    SystemExit
        Raised by ``parser.error()`` for invalid arguments, for input and
        output paths that refer to the same file, and for any ``OSError`` or
        ``ValueError`` raised while processing.
    BrokenPipeError
        Propagated unchanged when the downstream consumer closes the pipe, so
        that ``_main_with_broken_pipe_handling`` can exit quietly.
    """
    parser = argparse.ArgumentParser(
        prog="cleanmonkey",
        description="Clean invisible characters, smart quotes, and whitespace from text.",
    )
    parser.add_argument(
        "file",
        nargs="?",
        default=None,
        help="Input file (default: stdin, use '-' for stdin)",
    )
    parser.add_argument(
        "-o", "--output",
        default=None,
        help="Output file (default: stdout, use '-' for stdout)",
    )
    parser.add_argument(
        "-p", "--profile",
        choices=sorted(PROFILES),
        default="default",
        help="Cleaning profile (default: default)",
    )
    parser.add_argument(
        "--inspect",
        action="store_true",
        dest="inspect_mode",
        help="Inspect mode: report problematic characters instead of cleaning. "
             "Reports character-map replacements only; structural changes like "
             "space collapsing and per-line stripping are not reported.",
    )
    parser.add_argument(
        "--no-smart-quotes", action="store_true", help="Disable smart quote normalization",
    )
    parser.add_argument(
        "--no-dashes", action="store_true", help="Disable dash normalization",
    )
    parser.add_argument(
        "--fullwidth", action="store_true",
        help="Enable fullwidth ASCII letter and digit normalization "
             "(e.g. \uff21\u2192A, \uff10\u21920; fullwidth punctuation is not covered)",
    )
    parser.add_argument(
        "--no-line-endings", action="store_true",
        help="Disable line-ending normalization (also disables CR reporting in inspect mode)",
    )
    parser.add_argument(
        "--no-strip", action="store_true",
        help="Disable stripping of leading/trailing whitespace per line",
    )
    parser.add_argument(
        "--no-collapse-spaces", action="store_true",
        help="Disable collapsing of multiple spaces into one",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Output inspect results as JSON (implies --inspect)",
    )
    parser.add_argument(
        "--max-positions", type=_non_negative_int, default=None, metavar="N",
        help="Limit position lists in inspect output to at most N entries "
             "(count is always accurate). Useful for large files.",
    )
    parser.add_argument(
        "--stream", action="store_true",
        help="Process input line-by-line instead of loading it all into memory. "
             "Suitable for very large files. Ignored in inspect mode.",
    )
    parser.add_argument(
        "--version", action="version", version=f"%(prog)s {_get_version()}",
    )

    args = parser.parse_args(argv)

    # --json implies --inspect.
    if args.json:
        args.inspect_mode = True

    # Warn if --stream is used with --inspect (inspect needs full text).
    if args.stream and args.inspect_mode:
        print(
            "cleanmonkey: warning: --stream is ignored in inspect mode",
            file=sys.stderr,
        )

    # Refuse to run when input and output are the same file.
    if (
        args.file is not None
        and args.file != "-"
        and args.output is not None
        and args.output != "-"
    ):
        try:
            if os.path.samefile(args.file, args.output):
                parser.error("input and output paths refer to the same file; this would cause data loss")
        except FileNotFoundError:
            # One of the paths doesn't exist yet – no collision possible.
            pass
        except OSError as exc:
            parser.error(
                f"cannot compare input {args.file!r} and output {args.output!r}: {exc}"
            )

    try:
        _run(parser, args)
    except BrokenPipeError:
        # BrokenPipeError is a subclass of OSError. Without this clause a
        # closed downstream pipe (e.g. ``cleanmonkey f | head -n1``) would be
        # reported as an argparse usage error with exit status 2, and the
        # dedicated handler in _main_with_broken_pipe_handling() would never
        # see it.
        raise
    except (OSError, ValueError) as exc:
        parser.error(str(exc))
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _run(parser: argparse.ArgumentParser, args: argparse.Namespace) -> None:
    """Carry out the work selected by the parsed arguments.

    Opens the streams via ``_open_streams`` and then follows one of three
    paths: line-by-line cleaning (``--stream`` outside inspect mode),
    inspection (text or JSON report), or whole-input cleaning. Undecodable
    input is reported through ``parser.error``; ``OSError`` is left to
    propagate to the caller.
    """
    with _open_streams(args.file, args.output) as (in_stream, out_stream, _tmp):
        # Translate CLI switches into clean() keyword overrides. Only switches
        # that were actually given produce an entry.
        switch_table = (
            ("no_smart_quotes", "smart_quotes", False),
            ("no_dashes", "dashes", False),
            ("fullwidth", "fullwidth", True),
            ("no_line_endings", "line_endings", False),
            ("no_strip", "strip", False),
            ("no_collapse_spaces", "collapse_spaces", False),
        )
        overrides: dict[str, bool] = {
            option: setting
            for switch, option, setting in switch_table
            if getattr(args, switch)
        }

        decode_failure = (
            f"cannot decode {(args.file or '<stdin>')!r}: file is not valid UTF-8"
        )

        # Streaming clean: one clean() call per input line.
        if args.stream and not args.inspect_mode:
            try:
                for chunk in in_stream:
                    out_stream.write(clean(chunk, profile=args.profile, **overrides))
            except UnicodeDecodeError:
                parser.error(decode_failure)
            return

        # Everything else works on the complete input.
        try:
            text = in_stream.read()
        except UnicodeDecodeError:
            parser.error(decode_failure)

        if not args.inspect_mode:
            out_stream.write(clean(text, profile=args.profile, **overrides))
            return

        # Inspect mode: forward only the options that were given.
        inspect_kw: dict[str, Any] = {"profile": args.profile}
        if args.fullwidth:
            inspect_kw["fullwidth"] = True
        if args.no_line_endings:
            inspect_kw["line_endings"] = False
        if args.max_positions is not None:
            inspect_kw["max_positions"] = args.max_positions
        findings = inspect(text, **inspect_kw)

        if args.json:
            records = []
            for info in findings:
                records.append({
                    "char": info.char,
                    "codepoint": info.codepoint,
                    "name": info.name,
                    "category": info.category,
                    "count": info.count,
                    "positions": info.positions,
                })
            json.dump(records, out_stream, ensure_ascii=False)
            out_stream.write("\n")
            return

        if not findings:
            print("No problematic characters found.", file=out_stream)
            return

        # With an explicit --max-positions the list is shown as returned by
        # inspect(); otherwise the text report shows at most 10 positions.
        user_capped = args.max_positions is not None
        for info in findings:
            shown = info.positions if user_capped else info.positions[:10]
            marker = "..." if len(shown) < info.count else ""
            print(
                f"{info.codepoint} {info.name} (count: {info.count}, "
                f"positions: {shown}{marker})",
                file=out_stream,
            )
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _get_version() -> str:
    """Return ``cleanmonkey.__version__``, imported at call time."""
    from cleanmonkey import __version__ as package_version

    return package_version
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _main_with_broken_pipe_handling() -> None:
    """Run ``main()`` and exit quietly if the output pipe is closed early.

    When ``main()`` raises ``BrokenPipeError`` (for example under
    ``cleanmonkey file.txt | head -n1``), both standard streams are closed,
    ignoring a further ``BrokenPipeError`` from either close, and the process
    exits with status 141 — the conventional 128 + 13 code for SIGPIPE.
    """
    try:
        main()
    except BrokenPipeError:
        # Close stdout first, then stderr; closing can raise the same error
        # again, and that is not worth a traceback.
        for stream in (sys.stdout, sys.stderr):
            with contextlib.suppress(BrokenPipeError):
                stream.close()
        sys.exit(141)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
# Script entry point: run the CLI through the wrapper that handles
# BrokenPipeError when this module is executed directly.
if __name__ == "__main__":
    _main_with_broken_pipe_handling()
|
cleanmonkey/core.py
ADDED
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
"""Core cleaning functions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass, replace
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
# Names of the boolean override keywords accepted by clean(); clean_column()
# and clean_dict() validate their **kwargs against the same tuple.
_BOOL_OVERRIDE_NAMES_CLEAN = (
    "smart_quotes", "dashes", "ellipsis", "invisible", "whitespace",
    "control", "fullwidth", "line_endings", "collapse_spaces", "strip",
)
# Boolean override names for inspect(), matching the two bool keywords in its
# signature. NOTE(review): presumably passed to _validate_bool_overrides()
# inside inspect() — confirm against the full function body.
_BOOL_OVERRIDE_NAMES_INSPECT = ("fullwidth", "line_endings")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _validate_bool_overrides(overrides: dict[str, Any], allowed: tuple[str, ...], func_name: str) -> None:
    """Check that *overrides* holds only known names mapped to bool or None.

    Parameters
    ----------
    overrides : dict
        Candidate keyword overrides, name -> value.
    allowed : tuple of str
        The names that may appear in *overrides*.
    func_name : str
        Name of the public function, used in error messages.

    Raises
    ------
    TypeError
        If *overrides* contains a name outside *allowed*, or if a value for
        an allowed name is neither ``None`` nor a ``bool``.
    """
    unexpected = set(overrides) - set(allowed)
    if unexpected:
        listing = ", ".join(sorted(unexpected))
        raise TypeError(f"{func_name}() got unexpected keyword argument(s): {listing}")

    # Walk the allowed names in their declared order so the first offending
    # name reported is deterministic.
    for name in allowed:
        value = overrides.get(name)
        if value is None or isinstance(value, bool):
            continue
        raise TypeError(
            f"{func_name}() override {name!r} must be bool or None, got {type(value).__name__}"
        )
|
|
29
|
+
|
|
30
|
+
from cleanmonkey.maps import (
|
|
31
|
+
CONTROL,
|
|
32
|
+
DASHES,
|
|
33
|
+
ELLIPSIS,
|
|
34
|
+
FULLWIDTH,
|
|
35
|
+
INVISIBLE,
|
|
36
|
+
SMART_QUOTES,
|
|
37
|
+
WHITESPACE,
|
|
38
|
+
)
|
|
39
|
+
from cleanmonkey.profiles import PROFILES, Profile
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _validate_profile_kwarg(kwargs: dict[str, Any], func_name: str) -> None:
    """Validate an optional ``profile`` entry in *kwargs*.

    Does nothing when *kwargs* has no ``"profile"`` key. Otherwise the value
    must be either the name of a registered profile or a ``Profile`` instance.

    Raises
    ------
    ValueError
        If the value is a string that is not a key of ``PROFILES``.
    TypeError
        If the value is neither a ``str`` nor a ``Profile``.
    """
    if "profile" not in kwargs:
        return

    candidate = kwargs["profile"]
    if isinstance(candidate, str):
        if candidate in PROFILES:
            return
        raise ValueError(
            f"Unknown profile {candidate!r}. Available: {', '.join(sorted(PROFILES))}"
        )
    if not isinstance(candidate, Profile):
        raise TypeError(
            f"{func_name}() profile must be str or Profile, got {type(candidate).__name__}"
        )
|
|
55
|
+
|
|
56
|
+
# Matches a run of two or more consecutive space characters; clean() replaces
# each run with a single space when collapse_spaces is enabled.
_MULTI_SPACE = re.compile(r" {2,}")

#: Maximum nesting depth for recursive cleaners (clean_dict, clean_column).
#: Kept well below half of Python's default recursion limit (1000) since each
#: nesting level consumes multiple Python frames.
MAX_DEPTH: int = 200
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _build_table(profile: Profile) -> dict[int, str | int | None]:
    """Build a ``str.translate`` table from a profile.

    Each character map whose flag is enabled on *profile* is merged into one
    mapping. Maps are merged in a fixed order (invisible, whitespace, control,
    smart quotes, dashes, ellipsis, fullwidth), so if two maps share a key the
    later one wins.

    Parameters
    ----------
    profile : Profile
        Supplies the boolean category flags read below.

    Returns
    -------
    dict
        Translation table as produced by ``str.maketrans``.
    """
    merged: dict[str, str] = {}
    if profile.invisible:
        merged.update(INVISIBLE)
    if profile.whitespace:
        merged.update(WHITESPACE)
    if profile.control:
        merged.update(CONTROL)
    if profile.smart_quotes:
        merged.update(SMART_QUOTES)
    if profile.dashes:
        merged.update(DASHES)
    if profile.ellipsis:
        merged.update(ELLIPSIS)
    if profile.fullwidth:
        merged.update(FULLWIDTH)
    # ``merged`` is already a fresh dict, so it can be handed to maketrans
    # directly; no identity comprehension is needed to copy it first.
    return str.maketrans(merged)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Cache of translation tables keyed by profile name. _get_table() fills it
# lazily and only when it is given a profile name.
_TABLE_CACHE: dict[str, dict[int, str | int | None]] = {}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _get_table(profile: Profile, profile_name: str | None = None) -> dict[int, str | int | None]:
    """Return the translation table for *profile*, cached by name if given.

    With a non-empty *profile_name* the table is built once and then served
    from ``_TABLE_CACHE``. Without a name a fresh table is built on every call
    and nothing is cached.
    """
    if not profile_name:
        return _build_table(profile)
    if profile_name not in _TABLE_CACHE:
        _TABLE_CACHE[profile_name] = _build_table(profile)
    return _TABLE_CACHE[profile_name]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def clean(
    text: str,
    *,
    profile: str | Profile = "default",
    smart_quotes: bool | None = None,
    dashes: bool | None = None,
    ellipsis: bool | None = None,
    invisible: bool | None = None,
    whitespace: bool | None = None,
    control: bool | None = None,
    fullwidth: bool | None = None,
    line_endings: bool | None = None,
    collapse_spaces: bool | None = None,
    strip: bool | None = None,
) -> str:
    """Return a cleaned copy of *text*.

    Parameters
    ----------
    text : str
        The text to clean.
    profile : str or Profile
        Name of a registered profile, or a Profile instance. Defaults to
        ``"default"``.
    smart_quotes, dashes, ellipsis, invisible, whitespace, control,
    fullwidth, line_endings, collapse_spaces, strip :
        Per-call overrides of the corresponding profile setting. ``None``
        keeps the profile's value.

    Returns
    -------
    str
        Cleaned text. An empty *text* is returned unchanged, but only after
        the arguments have been validated.

    Raises
    ------
    TypeError
        If *text* is not a str, an override is neither bool nor None, or
        *profile* is neither str nor Profile.
    ValueError
        If *profile* is a string that names no registered profile.

    Notes
    -----
    Processing order: character translation, line-ending normalization
    (``\\r\\n`` and ``\\r`` become ``\\n``), collapsing of repeated spaces,
    then stripping.

    Stripping removes leading and trailing spaces and tabs from **every
    line**, not only from the ends of the whole string. This destroys
    meaningful indentation (Python, YAML, Markdown code blocks); pass
    ``strip=False`` for indentation-sensitive content.
    """
    if not isinstance(text, str):
        raise TypeError(f"clean() expects str, got {type(text).__name__}")

    # Gather the per-call overrides once; the same mapping is used for
    # validation here and for the None-filtering further down.
    requested = {
        "smart_quotes": smart_quotes,
        "dashes": dashes,
        "ellipsis": ellipsis,
        "invisible": invisible,
        "whitespace": whitespace,
        "control": control,
        "fullwidth": fullwidth,
        "line_endings": line_endings,
        "collapse_spaces": collapse_spaces,
        "strip": strip,
    }
    _validate_bool_overrides(requested, _BOOL_OVERRIDE_NAMES_CLEAN, "clean")

    # Resolve the profile before the empty-text shortcut so that an invalid
    # profile always raises.
    cache_key: str | None
    if isinstance(profile, str):
        if profile not in PROFILES:
            raise ValueError(f"Unknown profile {profile!r}. Available: {', '.join(sorted(PROFILES))}")
        base = PROFILES[profile]
        cache_key = profile
    elif isinstance(profile, Profile):
        base = profile
        cache_key = None
    else:
        raise TypeError(f"profile must be str or Profile, got {type(profile).__name__}")

    if not text:
        return text

    explicit = {name: flag for name, flag in requested.items() if flag is not None}
    if explicit:
        effective = replace(base, **explicit)
        # A customised profile must not be cached under the base profile's name.
        cache_key = None
    else:
        effective = base

    # 1. Character-level replacements.
    cleaned = text.translate(_get_table(effective, cache_key))

    # 2. Line endings: CRLF first so a lone CR pass does not double them.
    if effective.line_endings:
        cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")

    # 3. Runs of spaces become one space.
    if effective.collapse_spaces:
        cleaned = _MULTI_SPACE.sub(" ", cleaned)

    # 4. Trim spaces and tabs at both ends of every line.
    if effective.strip:
        trimmed_lines = [segment.strip(" \t") for segment in cleaned.split("\n")]
        cleaned = "\n".join(trimmed_lines)

    return cleaned
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _clean_value(
    v: Any, *, keys: bool = False, _seen: set[int] | None = None, _depth: int = 0, **kwargs: Any,
) -> Any:
    """Recursively clean a value of any type.

    Strings go through ``clean()``. Dicts and lists are delegated to
    ``clean_dict()`` and ``clean_column()``. Tuples, sets and frozensets are
    rebuilt as the same type from their cleaned members. Any other value is
    returned unchanged.

    Parameters
    ----------
    v : Any
        The value to clean.
    keys : bool
        Forwarded to the container cleaners (whether dict keys are cleaned).
    _seen : set of int or None
        ``id()`` values of containers currently being traversed, used for
        cycle detection. Internal.
    _depth : int
        Current nesting depth. Internal.
    **kwargs :
        Passed through to ``clean()``.

    Raises
    ------
    ValueError
        If nesting reaches ``MAX_DEPTH``, a container is reached again while
        it is still being traversed, or cleaning makes two set members equal.
    """
    if isinstance(v, str):
        return clean(v, **kwargs)
    if isinstance(v, dict):
        return clean_dict(v, keys=keys, _seen=_seen, _depth=_depth, **kwargs)
    if isinstance(v, list):
        return clean_column(v, keys=keys, _seen=_seen, _depth=_depth, **kwargs)
    if isinstance(v, (tuple, set, frozenset)):
        if _depth >= MAX_DEPTH:
            raise ValueError(f"Maximum nesting depth ({MAX_DEPTH}) exceeded")
        obj_id = id(v)
        if _seen is None:
            _seen = set()
        if obj_id in _seen:
            raise ValueError("Circular reference detected in input structure")
        _seen.add(obj_id)
        try:
            cleaned_items = [
                _clean_value(item, keys=keys, _seen=_seen, _depth=_depth + 1, **kwargs)
                for item in v
            ]
            container_type = type(v)
            if isinstance(v, tuple) and hasattr(container_type, "_make"):
                # namedtuple classes take one positional argument per field,
                # so calling them with a single list raises TypeError. Their
                # ``_make`` classmethod builds an instance from an iterable.
                result = container_type._make(cleaned_items)
            else:
                result = container_type(cleaned_items)
            if isinstance(v, (set, frozenset)) and len(result) != len(v):
                raise ValueError(
                    "Set member collision: cleaning produced duplicate members"
                )
            return result
        finally:
            # Drop the id once this container is finished so the same object
            # may legitimately appear again elsewhere in the structure.
            _seen.discard(obj_id)
    return v
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def clean_column(
    values: list[Any], *, keys: bool = False, _seen: set[int] | None = None,
    _depth: int = 0, **kwargs: Any,
) -> list[Any]:
    """Return a new list with every element cleaned.

    Elements are cleaned through ``_clean_value``, which recurses into nested
    containers.

    Parameters
    ----------
    values : list
        The list to clean.
    keys : bool
        If True, string keys of nested dicts are cleaned as well.
    **kwargs :
        Passed to clean(). Validated once, at the top-level call.

    Raises
    ------
    TypeError
        At the top-level call, if *values* is not a list, *keys* is not a
        bool, or an option in *kwargs* is invalid.
    ValueError
        If nesting reaches ``MAX_DEPTH`` or *values* is reached again while it
        is still being traversed.
    """
    if _depth == 0:
        # Argument validation happens only on the outermost call, so invalid
        # options raise regardless of the shape of the data.
        if not isinstance(values, list):
            raise TypeError(f"clean_column() expects list, got {type(values).__name__}")
        if not isinstance(keys, bool):
            raise TypeError(f"clean_column() keys must be bool, got {type(keys).__name__}")
        _validate_profile_kwarg(kwargs, "clean_column")
        # 'profile' is not a bool override; keep it out of the bool check.
        flag_options = {name: val for name, val in kwargs.items() if name != "profile"}
        _validate_bool_overrides(flag_options, _BOOL_OVERRIDE_NAMES_CLEAN, "clean_column")

    if _depth >= MAX_DEPTH:
        raise ValueError(f"Maximum nesting depth ({MAX_DEPTH}) exceeded")

    if _seen is None:
        _seen = set()
    marker = id(values)
    if marker in _seen:
        raise ValueError("Circular reference detected in input structure")

    _seen.add(marker)
    try:
        cleaned: list[Any] = []
        for element in values:
            cleaned.append(
                _clean_value(element, keys=keys, _seen=_seen, _depth=_depth + 1, **kwargs)
            )
        return cleaned
    finally:
        _seen.discard(marker)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def clean_dict(
    d: dict[Any, Any], *, keys: bool = False, _seen: set[int] | None = None,
    _depth: int = 0, **kwargs: Any,
) -> dict[Any, Any]:
    """Return a new dict with its values (and optionally its keys) cleaned.

    Parameters
    ----------
    d : dict
        Dictionary to clean. Values are cleaned through ``_clean_value``,
        which recurses into nested containers.
    keys : bool
        If True, string keys are cleaned too; keys of other types are kept
        as they are.
    **kwargs :
        Passed to clean(). Validated once, at the top-level call.

    Raises
    ------
    TypeError
        At the top-level call, if *d* is not a dict, *keys* is not a bool, or
        an option in *kwargs* is invalid.
    ValueError
        If nesting reaches ``MAX_DEPTH``, *d* is reached again while it is
        still being traversed, or a resulting key is already present in the
        output.
    """
    if _depth == 0:
        # Argument validation happens only on the outermost call, so invalid
        # options raise regardless of the shape of the data.
        if not isinstance(d, dict):
            raise TypeError(f"clean_dict() expects dict, got {type(d).__name__}")
        if not isinstance(keys, bool):
            raise TypeError(f"clean_dict() keys must be bool, got {type(keys).__name__}")
        _validate_profile_kwarg(kwargs, "clean_dict")
        # 'profile' is not a bool override; keep it out of the bool check.
        flag_options = {name: val for name, val in kwargs.items() if name != "profile"}
        _validate_bool_overrides(flag_options, _BOOL_OVERRIDE_NAMES_CLEAN, "clean_dict")

    if _depth >= MAX_DEPTH:
        raise ValueError(f"Maximum nesting depth ({MAX_DEPTH}) exceeded")

    if _seen is None:
        _seen = set()
    marker = id(d)
    if marker in _seen:
        raise ValueError("Circular reference detected in input structure")

    _seen.add(marker)
    try:
        cleaned: dict[Any, Any] = {}
        for key, value in d.items():
            if keys and isinstance(key, str):
                target_key = clean(key, **kwargs)
            else:
                target_key = key
            if target_key in cleaned:
                raise ValueError(
                    f"Key collision: {key!r} normalizes to {target_key!r} which already exists"
                )
            cleaned[target_key] = _clean_value(
                value, keys=keys, _seen=_seen, _depth=_depth + 1, **kwargs
            )
        return cleaned
    finally:
        _seen.discard(marker)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
@dataclass(slots=True)
class CharInfo:
    """Information about a character found during inspection.

    ``inspect()`` returns a list of these records. The CLI reads every field
    when it formats the text and JSON reports.
    """
    # The character this record describes.
    char: str
    # Code point as a string. NOTE(review): presumably "U+XXXX" style —
    # confirm against inspect().
    codepoint: str
    # Human-readable name. NOTE(review): presumably the Unicode character
    # name — confirm against inspect().
    name: str
    # Category label. NOTE(review): presumably the character map that flagged
    # it — confirm against inspect().
    category: str
    # Character indices (not byte offsets) of occurrences; may hold fewer
    # entries than ``count`` when inspect() is given max_positions.
    positions: list[int]
    # Total number of occurrences; per the inspect() documentation this is
    # always the true total, even when ``positions`` is truncated.
    count: int
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def inspect(
    text: str,
    *,
    profile: str | Profile = "default",
    fullwidth: bool | None = None,
    line_endings: bool | None = None,
    max_positions: int | None = None,
) -> list[CharInfo]:
    """Report the characters in *text* that the resolved profile targets.

    Parameters
    ----------
    text : str
        The text to scan.
    profile : str or Profile
        Name of an entry in ``PROFILES`` or a ``Profile`` instance. The
        profile's ``invisible``, ``whitespace``, ``control``,
        ``smart_quotes``, ``dashes`` and ``ellipsis`` switches select which
        character maps are flagged. Default is ``"default"``.
    fullwidth : bool or None
        When not None, replaces the profile's ``fullwidth`` setting.
    line_endings : bool or None
        When not None, replaces the profile's ``line_endings`` setting.
        When enabled, carriage returns (``"\\r"``) are flagged.
    max_positions : int or None
        Upper bound on the number of indices stored in each
        :class:`CharInfo` ``positions`` list. ``count`` is never capped, so
        it always reports the true number of occurrences.

    Returns
    -------
    list of CharInfo
        One entry per distinct flagged character, ordered by the index of
        its first occurrence.

    Raises
    ------
    TypeError
        If ``text`` is not a str, if ``max_positions`` is neither None nor a
        non-bool int, or if ``profile`` is neither a str nor a Profile.
    ValueError
        If ``max_positions`` is negative or ``profile`` is an unknown name.

    Notes
    -----
    The ``fullwidth`` and ``line_endings`` overrides are checked by
    ``_validate_bool_overrides`` before the profile is resolved.

    Positions are **character indices** into the Python string, not byte
    offsets. Once the text contains characters that occupy more than one
    byte in UTF-8, the two diverge; convert with
    ``len(text[:pos].encode('utf-8'))`` when a byte offset is needed.
    """
    if not isinstance(text, str):
        raise TypeError(f"inspect() expects str, got {type(text).__name__}")

    if max_positions is not None:
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(max_positions, bool) or not isinstance(max_positions, int):
            raise TypeError(
                f"inspect() max_positions must be int or None, got {type(max_positions).__name__}"
            )
        if max_positions < 0:
            raise ValueError(
                f"inspect() max_positions must be non-negative, got {max_positions}"
            )

    _validate_bool_overrides(
        {"fullwidth": fullwidth, "line_endings": line_endings},
        _BOOL_OVERRIDE_NAMES_INSPECT, "inspect",
    )

    # Turn the profile argument into a Profile instance.
    if isinstance(profile, str):
        if profile not in PROFILES:
            raise ValueError(f"Unknown profile {profile!r}. Available: {', '.join(sorted(PROFILES))}")
        resolved = PROFILES[profile]
    elif isinstance(profile, Profile):
        resolved = profile
    else:
        raise TypeError(f"profile must be str or Profile, got {type(profile).__name__}")

    # Explicit keyword overrides win over the profile's own settings.
    want_fullwidth = resolved.fullwidth if fullwidth is None else fullwidth
    want_line_endings = resolved.line_endings if line_endings is None else line_endings

    import unicodedata

    # Pair every switch with the map whose keys it enables.
    switch_table = (
        (resolved.invisible, INVISIBLE),
        (resolved.whitespace, WHITESPACE),
        (resolved.control, CONTROL),
        (resolved.smart_quotes, SMART_QUOTES),
        (resolved.dashes, DASHES),
        (resolved.ellipsis, ELLIPSIS),
        (want_fullwidth, FULLWIDTH),
    )
    flagged: set[str] = set()
    for enabled, char_map in switch_table:
        if enabled:
            flagged.update(char_map)
    if want_line_endings:
        flagged.add("\r")

    # Both dicts receive a key the first time a character is seen, so their
    # insertion order is already the order of first occurrence.
    hits: dict[str, list[int]] = {}
    totals: dict[str, int] = {}
    for index, ch in enumerate(text):
        if ch not in flagged:
            continue
        if ch not in hits:
            hits[ch] = []
            totals[ch] = 0
        totals[ch] += 1
        stored = hits[ch]
        # Stop recording indices once the cap is reached; keep counting.
        if max_positions is None or len(stored) < max_positions:
            stored.append(index)

    return [
        CharInfo(
            char=ch,
            codepoint=f"U+{ord(ch):04X}",
            name=unicodedata.name(ch, f"UNKNOWN-{ord(ch):04X}"),
            category=unicodedata.category(ch),
            positions=stored,
            count=totals[ch],
        )
        for ch, stored in hits.items()
    ]
|
cleanmonkey/maps.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Character replacement maps used by cleanmonkey."""
|
|
2
|
+
|
|
3
|
+
# Typographic quotation marks, grouped by the ASCII quote that replaces them.
# The groups are listed so that the resulting key order is: curly singles,
# curly doubles, single angle quotes, double angle quotes.
SMART_QUOTES: dict[str, str] = {
    fancy: plain
    for fancy_group, plain in (
        (
            (
                "\u2018",  # left single
                "\u2019",  # right single
                "\u201a",  # single low-9
                "\u201b",  # single high-reversed-9
            ),
            "'",
        ),
        (
            (
                "\u201c",  # left double
                "\u201d",  # right double
                "\u201e",  # double low-9
                "\u201f",  # double high-reversed-9
            ),
            '"',
        ),
        (
            (
                "\u2039",  # single left-pointing angle
                "\u203a",  # single right-pointing angle
            ),
            "'",
        ),
        (
            (
                "\u00ab",  # left-pointing double angle
                "\u00bb",  # right-pointing double angle
            ),
            '"',
        ),
    )
    for fancy in fancy_group
}
|
|
18
|
+
|
|
19
|
+
# Every dash-like character below is replaced by the ASCII hyphen-minus.
DASHES: dict[str, str] = dict.fromkeys(
    (
        "\u2010",  # hyphen
        "\u2011",  # non-breaking hyphen
        "\u2012",  # figure dash
        "\u2013",  # en dash
        "\u2014",  # em dash
        "\u2015",  # horizontal bar
        "\u2212",  # minus sign
        "\ufe58",  # small em dash
        "\ufe63",  # small hyphen-minus
        "\uff0d",  # fullwidth hyphen-minus
    ),
    "-",
)
|
|
32
|
+
|
|
33
|
+
# The one-character horizontal ellipsis expands to three ASCII full stops.
ELLIPSIS: dict[str, str] = {"\u2026": "..."}
|
|
37
|
+
|
|
38
|
+
# Zero-width and otherwise invisible characters; each maps to the empty
# string, i.e. it is removed.
INVISIBLE: dict[str, str] = dict.fromkeys(
    (
        "\u200b",  # zero-width space
        "\u200c",  # zero-width non-joiner
        "\u200d",  # zero-width joiner
        "\u200e",  # left-to-right mark
        "\u200f",  # right-to-left mark
        "\u2060",  # word joiner
        "\u2061",  # function application
        "\u2062",  # invisible times
        "\u2063",  # invisible separator
        "\u2064",  # invisible plus
        "\ufeff",  # BOM / zero-width no-break space
        "\u00ad",  # soft hyphen
        "\u034f",  # combining grapheme joiner
        "\u061c",  # Arabic letter mark
        "\u180e",  # Mongolian vowel separator
    ),
    "",
)
|
|
56
|
+
|
|
57
|
+
# Space-like characters; each one is replaced by a single ASCII space.
WHITESPACE: dict[str, str] = dict.fromkeys(
    (
        "\u00a0",  # non-breaking space
        "\u1680",  # ogham space mark
        "\u2000",  # en quad
        "\u2001",  # em quad
        "\u2002",  # en space
        "\u2003",  # em space
        "\u2004",  # three-per-em space
        "\u2005",  # four-per-em space
        "\u2006",  # six-per-em space
        "\u2007",  # figure space
        "\u2008",  # punctuation space
        "\u2009",  # thin space
        "\u200a",  # hair space
        "\u202f",  # narrow no-break space
        "\u205f",  # medium mathematical space
        "\u3000",  # ideographic space
    ),
    " ",
)
|
|
76
|
+
|
|
77
|
+
# C0/C1 control characters that are removed. Tab (0x09), line feed (0x0A) and
# carriage return (0x0D) are deliberately absent from the ranges below because
# they are handled separately.
CONTROL: dict[str, str] = {
    chr(code): ""
    for code in (
        *range(0x00, 0x09),  # null through 0x08
        0x0B,  # vertical tab
        0x0C,  # form feed
        *range(0x0E, 0x20),  # 0x0E through 0x1F
        0x7F,  # DEL
        *range(0x80, 0xA0),  # C1 control characters (U+0080–U+009F)
    )
}
|
|
93
|
+
|
|
94
|
+
# Each fullwidth form used here sits exactly 0xFEE0 code points above its
# ASCII counterpart (U+FF10 -> "0", U+FF21 -> "A", U+FF41 -> "a").

# Fullwidth digits U+FF10..U+FF19 -> "0".."9"
FULLWIDTH_DIGITS: dict[str, str] = {
    chr(code): chr(code - 0xFEE0) for code in range(0xFF10, 0xFF1A)
}

# Fullwidth letters: U+FF21..U+FF3A -> "A".."Z", then U+FF41..U+FF5A -> "a".."z"
FULLWIDTH_LETTERS: dict[str, str] = {
    chr(code): chr(code - 0xFEE0)
    for code in (*range(0xFF21, 0xFF3B), *range(0xFF41, 0xFF5B))
}

# Digits first, then letters, merged into one lookup table.
FULLWIDTH: dict[str, str] = FULLWIDTH_DIGITS | FULLWIDTH_LETTERS
|
cleanmonkey/profiles.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Preset cleaning profiles for common use cases."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True, slots=True)
|
|
9
|
+
class Profile:
|
|
10
|
+
"""Configuration for which normalizations to apply."""
|
|
11
|
+
|
|
12
|
+
smart_quotes: bool = True
|
|
13
|
+
dashes: bool = True
|
|
14
|
+
ellipsis: bool = True
|
|
15
|
+
invisible: bool = True
|
|
16
|
+
whitespace: bool = True
|
|
17
|
+
control: bool = True
|
|
18
|
+
fullwidth: bool = False
|
|
19
|
+
line_endings: bool = True # normalize \r\n and \r to \n
|
|
20
|
+
collapse_spaces: bool = True # multiple spaces → single space
|
|
21
|
+
strip: bool = True # strip leading/trailing whitespace per line
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Built-in profiles, looked up by name.
PROFILES: dict[str, Profile] = {
    # Every switch left at its Profile default.
    "default": Profile(),
    # The defaults plus fullwidth normalization.
    "csv": Profile(fullwidth=True),
    "sql": Profile(fullwidth=True),
    # Leaves smart quotes, dashes and ellipsis alone; the switches not named
    # here keep their Profile defaults.
    "display": Profile(
        fullwidth=False,
        ellipsis=False,
        dashes=False,
        smart_quotes=False,
    ),
    # Only the invisible-character switch is on; every other switch is
    # spelled out as off.
    "minimal": Profile(
        invisible=True,
        smart_quotes=False,
        dashes=False,
        ellipsis=False,
        whitespace=False,
        control=False,
        fullwidth=False,
        line_endings=False,
        collapse_spaces=False,
        strip=False,
    ),
    # "aggressive" turns on every available normalization, fullwidth included.
    # It is deliberately equal to the default profile with fullwidth=True and
    # exists as a semantic alias for pipelines that want maximum cleaning.
    "aggressive": Profile(fullwidth=True),
}
|
cleanmonkey/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cleanmonkey
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: One-call text cleanup: invisible characters, smart quotes, whitespace normalization.
|
|
5
|
+
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/RexBytes/cleanmonkey
|
|
8
|
+
Project-URL: Repository, https://github.com/RexBytes/cleanmonkey
|
|
9
|
+
Project-URL: Issues, https://github.com/RexBytes/cleanmonkey/issues
|
|
10
|
+
Keywords: text,cleanup,whitespace,unicode,normalize
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Text Processing
|
|
20
|
+
Classifier: Topic :: Text Processing :: Filters
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# cleanmonkey
|
|
28
|
+
|
|
29
|
+
One-call text cleanup for invisible characters, smart quotes, and whitespace normalization.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install cleanmonkey
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from cleanmonkey import clean
|
|
41
|
+
|
|
42
|
+
# Sensible defaults handle the common garbage
|
|
43
|
+
clean("hello\u00a0world\u2019s \u2014 test")
|
|
44
|
+
# → "hello world's - test"
|
|
45
|
+
|
|
46
|
+
# Idempotent — safe to call twice
|
|
47
|
+
clean(clean(text)) == clean(text)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## What It Cleans (by default)
|
|
51
|
+
|
|
52
|
+
| Category | Examples | Result |
|
|
53
|
+
|---|---|---|
|
|
54
|
+
| Non-breaking spaces | `\u00a0`, `\u2007`, `\u202f` | Regular space |
|
|
55
|
+
| Zero-width chars | `\u200b`, `\u200c`, `\u200d`, `\ufeff` | Removed |
|
|
56
|
+
| Smart quotes | `\u2018` `\u2019` `\u201c` `\u201d` | `'` and `"` |
|
|
57
|
+
| Dashes | `\u2013` (en), `\u2014` (em) | `-` |
|
|
58
|
+
| Ellipsis | `\u2026` | `...` |
|
|
59
|
+
| Control chars | null, form feed, vertical tab | Removed |
|
|
60
|
+
| Line endings | `\r\n`, `\r` | `\n` |
|
|
61
|
+
| Multiple spaces | `"hello   world"` | `"hello world"` |
|
|
62
|
+
| Leading/trailing | `" hello "` | `"hello"` |
|
|
63
|
+
|
|
64
|
+
## Granular Control
|
|
65
|
+
|
|
66
|
+
Override any default:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
clean(text, smart_quotes=False) # keep curly quotes
|
|
70
|
+
clean(text, dashes=False) # keep em/en dashes
|
|
71
|
+
clean(text, fullwidth=True) # also normalize fullwidth digits/letters
|
|
72
|
+
clean(text, collapse_spaces=False) # keep multiple spaces
|
|
73
|
+
clean(text, strip=False) # keep leading/trailing whitespace
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Profiles
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
clean(text, profile="default") # all normalizations except fullwidth (the default)
|
|
80
|
+
clean(text, profile="csv") # default + fullwidth normalization
|
|
81
|
+
clean(text, profile="sql") # default + fullwidth normalization
|
|
82
|
+
clean(text, profile="display") # keep smart quotes & dashes; still clean invisible, control, whitespace, line endings
|
|
83
|
+
clean(text, profile="minimal") # invisible chars only, no collapsing or stripping
|
|
84
|
+
clean(text, profile="aggressive") # everything including fullwidth
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Batch Helpers
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from cleanmonkey import clean_column, clean_dict
|
|
91
|
+
|
|
92
|
+
# Clean a list (non-strings pass through)
|
|
93
|
+
clean_column(["hello\u00a0world", 42, None])
|
|
94
|
+
# → ["hello world", 42, None]
|
|
95
|
+
|
|
96
|
+
# Recursively clean dict values
|
|
97
|
+
clean_dict({"name": "John\u00a0Doe", "nested": {"val": "test\u200b"}})
|
|
98
|
+
# → {"name": "John Doe", "nested": {"val": "test"}}
|
|
99
|
+
|
|
100
|
+
# Also clean keys
|
|
101
|
+
clean_dict({"key\u00a0name": "val"}, keys=True)
|
|
102
|
+
# → {"key name": "val"}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Inspect
|
|
106
|
+
|
|
107
|
+
Find out what's lurking in your text:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from cleanmonkey import inspect
|
|
111
|
+
|
|
112
|
+
for info in inspect("hello\u00a0world\u200b"):
|
|
113
|
+
print(f"{info.codepoint} {info.name} count={info.count} at {info.positions}")
|
|
114
|
+
# U+00A0 NO-BREAK SPACE count=1 at [5]
|
|
115
|
+
# U+200B ZERO WIDTH SPACE count=1 at [11]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## CLI
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
# Clean a file
|
|
122
|
+
cleanmonkey input.txt -o output.txt
|
|
123
|
+
|
|
124
|
+
# Pipe through stdin
|
|
125
|
+
cat dirty.csv | cleanmonkey > clean.csv
|
|
126
|
+
|
|
127
|
+
# Use a profile
|
|
128
|
+
cleanmonkey --profile csv input.txt
|
|
129
|
+
|
|
130
|
+
# Inspect mode — report what's in a file
|
|
131
|
+
cleanmonkey --inspect input.txt
|
|
132
|
+
|
|
133
|
+
# Machine-readable JSON inspect output
|
|
134
|
+
cleanmonkey --json input.txt
|
|
135
|
+
|
|
136
|
+
# Selective overrides
|
|
137
|
+
cleanmonkey --no-smart-quotes --fullwidth input.txt
|
|
138
|
+
|
|
139
|
+
# Preserve whitespace structure
|
|
140
|
+
cleanmonkey --no-strip --no-collapse-spaces input.txt
|
|
141
|
+
|
|
142
|
+
# Preserve line endings (CR/CRLF)
|
|
143
|
+
cleanmonkey --no-line-endings input.txt
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Built for LLMs
|
|
147
|
+
|
|
148
|
+
cleanmonkey is designed to work well as a tool for large language models. Invisible character cleanup is a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
|
|
149
|
+
|
|
150
|
+
## License
|
|
151
|
+
|
|
152
|
+
MIT
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
cleanmonkey/__init__.py,sha256=gHz9hgjVMCryQXmiycnVzdWzWdDvOCKoSnMSSRlbNN4,353
|
|
2
|
+
cleanmonkey/__main__.py,sha256=jFCbwZHhmw8k8JnQsSE14dtyiZpBDkb8T7mjWBRaAg4,158
|
|
3
|
+
cleanmonkey/cli.py,sha256=wPXbCwCrT1yZx8kMCvauBFT2ioZTczTItLuwr35-w7Y,15788
|
|
4
|
+
cleanmonkey/core.py,sha256=vO03OpJqz7TzjJoEgOYJLQcwlT3gMLM_kHgASw36KYY,15624
|
|
5
|
+
cleanmonkey/maps.py,sha256=AS5kxrtZtcxYLOFES38jG34gFqB96V5fXRNwHtlkU98,3708
|
|
6
|
+
cleanmonkey/profiles.py,sha256=hG6zWAwAFhP0Pxm_vpc4HRKKfk_ipRsDv2AFuJeZ1YU,1534
|
|
7
|
+
cleanmonkey/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
cleanmonkey-0.1.0.dist-info/licenses/LICENSE,sha256=srNahN_Cxejm5SlFsCghF2Mml1gXgqlnuqWlDt7F1ck,1065
|
|
9
|
+
cleanmonkey-0.1.0.dist-info/METADATA,sha256=tHxlu1XJxo_kQLYJal9F2iTwyRCBSHgeYk6VRQD3kYQ,4914
|
|
10
|
+
cleanmonkey-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
11
|
+
cleanmonkey-0.1.0.dist-info/entry_points.txt,sha256=ePX3uiSQ0P3GDiO4yAci3iILpk-FqD6QCvNkXFjJmyg,80
|
|
12
|
+
cleanmonkey-0.1.0.dist-info/top_level.txt,sha256=q7GGSdV6NFD8-QSz1Vowde7NLAO5MPoaMvpnpmhrQWI,12
|
|
13
|
+
cleanmonkey-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RexBytes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
cleanmonkey
|