cleanmonkey 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cleanmonkey-0.1.0/LICENSE +21 -0
- cleanmonkey-0.1.0/PKG-INFO +152 -0
- cleanmonkey-0.1.0/README.md +126 -0
- cleanmonkey-0.1.0/pyproject.toml +43 -0
- cleanmonkey-0.1.0/setup.cfg +4 -0
- cleanmonkey-0.1.0/src/cleanmonkey/__init__.py +7 -0
- cleanmonkey-0.1.0/src/cleanmonkey/__main__.py +5 -0
- cleanmonkey-0.1.0/src/cleanmonkey/cli.py +400 -0
- cleanmonkey-0.1.0/src/cleanmonkey/core.py +449 -0
- cleanmonkey-0.1.0/src/cleanmonkey/maps.py +106 -0
- cleanmonkey-0.1.0/src/cleanmonkey/profiles.py +57 -0
- cleanmonkey-0.1.0/src/cleanmonkey/py.typed +0 -0
- cleanmonkey-0.1.0/src/cleanmonkey.egg-info/PKG-INFO +152 -0
- cleanmonkey-0.1.0/src/cleanmonkey.egg-info/SOURCES.txt +17 -0
- cleanmonkey-0.1.0/src/cleanmonkey.egg-info/dependency_links.txt +1 -0
- cleanmonkey-0.1.0/src/cleanmonkey.egg-info/entry_points.txt +2 -0
- cleanmonkey-0.1.0/src/cleanmonkey.egg-info/top_level.txt +1 -0
- cleanmonkey-0.1.0/tests/test_cli.py +1113 -0
- cleanmonkey-0.1.0/tests/test_core.py +1049 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RexBytes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cleanmonkey
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: One-call text cleanup: invisible characters, smart quotes, whitespace normalization.
|
|
5
|
+
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/RexBytes/cleanmonkey
|
|
8
|
+
Project-URL: Repository, https://github.com/RexBytes/cleanmonkey
|
|
9
|
+
Project-URL: Issues, https://github.com/RexBytes/cleanmonkey/issues
|
|
10
|
+
Keywords: text,cleanup,whitespace,unicode,normalize
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Text Processing
|
|
20
|
+
Classifier: Topic :: Text Processing :: Filters
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# cleanmonkey
|
|
28
|
+
|
|
29
|
+
One-call text cleanup for invisible characters, smart quotes, and whitespace normalization.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install cleanmonkey
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from cleanmonkey import clean
|
|
41
|
+
|
|
42
|
+
# Sensible defaults handle the common garbage
|
|
43
|
+
clean("hello\u00a0world\u2019s \u2014 test")
|
|
44
|
+
# → "hello world's - test"
|
|
45
|
+
|
|
46
|
+
# Idempotent — safe to call twice
|
|
47
|
+
clean(clean(text)) == clean(text)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## What It Cleans (by default)
|
|
51
|
+
|
|
52
|
+
| Category | Examples | Result |
|
|
53
|
+
|---|---|---|
|
|
54
|
+
| Non-breaking spaces | `\u00a0`, `\u2007`, `\u202f` | Regular space |
|
|
55
|
+
| Zero-width chars | `\u200b`, `\u200c`, `\u200d`, `\ufeff` | Removed |
|
|
56
|
+
| Smart quotes | `\u2018` `\u2019` `\u201c` `\u201d` | `'` and `"` |
|
|
57
|
+
| Dashes | `\u2013` (en), `\u2014` (em) | `-` |
|
|
58
|
+
| Ellipsis | `\u2026` | `...` |
|
|
59
|
+
| Control chars | null, form feed, vertical tab | Removed |
|
|
60
|
+
| Line endings | `\r\n`, `\r` | `\n` |
|
|
61
|
+
| Multiple spaces | `"hello   world"` | `"hello world"` |
|
|
62
|
+
| Leading/trailing | `" hello "` | `"hello"` |
|
|
63
|
+
|
|
64
|
+
## Granular Control
|
|
65
|
+
|
|
66
|
+
Override any default:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
clean(text, smart_quotes=False) # keep curly quotes
|
|
70
|
+
clean(text, dashes=False) # keep em/en dashes
|
|
71
|
+
clean(text, fullwidth=True) # also normalize fullwidth digits/letters
|
|
72
|
+
clean(text, collapse_spaces=False) # keep multiple spaces
|
|
73
|
+
clean(text, strip=False) # keep leading/trailing whitespace
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Profiles
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
clean(text, profile="default") # standard normalizations (the default; fullwidth stays off)
|
|
80
|
+
clean(text, profile="csv") # default + fullwidth normalization
|
|
81
|
+
clean(text, profile="sql") # default + fullwidth normalization
|
|
82
|
+
clean(text, profile="display") # keep smart quotes & dashes; still clean invisible, control, whitespace, line endings
|
|
83
|
+
clean(text, profile="minimal") # invisible chars only, no collapsing or stripping
|
|
84
|
+
clean(text, profile="aggressive") # everything including fullwidth
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Batch Helpers
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from cleanmonkey import clean_column, clean_dict
|
|
91
|
+
|
|
92
|
+
# Clean a list (non-strings pass through)
|
|
93
|
+
clean_column(["hello\u00a0world", 42, None])
|
|
94
|
+
# → ["hello world", 42, None]
|
|
95
|
+
|
|
96
|
+
# Recursively clean dict values
|
|
97
|
+
clean_dict({"name": "John\u00a0Doe", "nested": {"val": "test\u200b"}})
|
|
98
|
+
# → {"name": "John Doe", "nested": {"val": "test"}}
|
|
99
|
+
|
|
100
|
+
# Also clean keys
|
|
101
|
+
clean_dict({"key\u00a0name": "val"}, keys=True)
|
|
102
|
+
# → {"key name": "val"}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Inspect
|
|
106
|
+
|
|
107
|
+
Find out what's lurking in your text:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from cleanmonkey import inspect
|
|
111
|
+
|
|
112
|
+
for info in inspect("hello\u00a0world\u200b"):
|
|
113
|
+
print(f"{info.codepoint} {info.name} count={info.count} at {info.positions}")
|
|
114
|
+
# U+00A0 NO-BREAK SPACE count=1 at [5]
|
|
115
|
+
# U+200B ZERO WIDTH SPACE count=1 at [11]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## CLI
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
# Clean a file
|
|
122
|
+
cleanmonkey input.txt -o output.txt
|
|
123
|
+
|
|
124
|
+
# Pipe through stdin
|
|
125
|
+
cat dirty.csv | cleanmonkey > clean.csv
|
|
126
|
+
|
|
127
|
+
# Use a profile
|
|
128
|
+
cleanmonkey --profile csv input.txt
|
|
129
|
+
|
|
130
|
+
# Inspect mode — report what's in a file
|
|
131
|
+
cleanmonkey --inspect input.txt
|
|
132
|
+
|
|
133
|
+
# Machine-readable JSON inspect output
|
|
134
|
+
cleanmonkey --json input.txt
|
|
135
|
+
|
|
136
|
+
# Selective overrides
|
|
137
|
+
cleanmonkey --no-smart-quotes --fullwidth input.txt
|
|
138
|
+
|
|
139
|
+
# Preserve whitespace structure
|
|
140
|
+
cleanmonkey --no-strip --no-collapse-spaces input.txt
|
|
141
|
+
|
|
142
|
+
# Preserve line endings (CR/CRLF)
|
|
143
|
+
cleanmonkey --no-line-endings input.txt
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## Built for LLMs
|
|
147
|
+
|
|
148
|
+
cleanmonkey is designed to work well as a tool for large language models. Invisible characters are a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
|
|
149
|
+
|
|
150
|
+
## License
|
|
151
|
+
|
|
152
|
+
MIT
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# cleanmonkey
|
|
2
|
+
|
|
3
|
+
One-call text cleanup for invisible characters, smart quotes, and whitespace normalization.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install cleanmonkey
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from cleanmonkey import clean
|
|
15
|
+
|
|
16
|
+
# Sensible defaults handle the common garbage
|
|
17
|
+
clean("hello\u00a0world\u2019s \u2014 test")
|
|
18
|
+
# → "hello world's - test"
|
|
19
|
+
|
|
20
|
+
# Idempotent — safe to call twice
|
|
21
|
+
clean(clean(text)) == clean(text)
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## What It Cleans (by default)
|
|
25
|
+
|
|
26
|
+
| Category | Examples | Result |
|
|
27
|
+
|---|---|---|
|
|
28
|
+
| Non-breaking spaces | `\u00a0`, `\u2007`, `\u202f` | Regular space |
|
|
29
|
+
| Zero-width chars | `\u200b`, `\u200c`, `\u200d`, `\ufeff` | Removed |
|
|
30
|
+
| Smart quotes | `\u2018` `\u2019` `\u201c` `\u201d` | `'` and `"` |
|
|
31
|
+
| Dashes | `\u2013` (en), `\u2014` (em) | `-` |
|
|
32
|
+
| Ellipsis | `\u2026` | `...` |
|
|
33
|
+
| Control chars | null, form feed, vertical tab | Removed |
|
|
34
|
+
| Line endings | `\r\n`, `\r` | `\n` |
|
|
35
|
+
| Multiple spaces | `"hello   world"` | `"hello world"` |
|
|
36
|
+
| Leading/trailing | `" hello "` | `"hello"` |
|
|
37
|
+
|
|
38
|
+
## Granular Control
|
|
39
|
+
|
|
40
|
+
Override any default:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
clean(text, smart_quotes=False) # keep curly quotes
|
|
44
|
+
clean(text, dashes=False) # keep em/en dashes
|
|
45
|
+
clean(text, fullwidth=True) # also normalize fullwidth digits/letters
|
|
46
|
+
clean(text, collapse_spaces=False) # keep multiple spaces
|
|
47
|
+
clean(text, strip=False) # keep leading/trailing whitespace
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Profiles
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
clean(text, profile="default") # standard normalizations (the default; fullwidth stays off)
|
|
54
|
+
clean(text, profile="csv") # default + fullwidth normalization
|
|
55
|
+
clean(text, profile="sql") # default + fullwidth normalization
|
|
56
|
+
clean(text, profile="display") # keep smart quotes & dashes; still clean invisible, control, whitespace, line endings
|
|
57
|
+
clean(text, profile="minimal") # invisible chars only, no collapsing or stripping
|
|
58
|
+
clean(text, profile="aggressive") # everything including fullwidth
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Batch Helpers
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from cleanmonkey import clean_column, clean_dict
|
|
65
|
+
|
|
66
|
+
# Clean a list (non-strings pass through)
|
|
67
|
+
clean_column(["hello\u00a0world", 42, None])
|
|
68
|
+
# → ["hello world", 42, None]
|
|
69
|
+
|
|
70
|
+
# Recursively clean dict values
|
|
71
|
+
clean_dict({"name": "John\u00a0Doe", "nested": {"val": "test\u200b"}})
|
|
72
|
+
# → {"name": "John Doe", "nested": {"val": "test"}}
|
|
73
|
+
|
|
74
|
+
# Also clean keys
|
|
75
|
+
clean_dict({"key\u00a0name": "val"}, keys=True)
|
|
76
|
+
# → {"key name": "val"}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Inspect
|
|
80
|
+
|
|
81
|
+
Find out what's lurking in your text:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from cleanmonkey import inspect
|
|
85
|
+
|
|
86
|
+
for info in inspect("hello\u00a0world\u200b"):
|
|
87
|
+
print(f"{info.codepoint} {info.name} count={info.count} at {info.positions}")
|
|
88
|
+
# U+00A0 NO-BREAK SPACE count=1 at [5]
|
|
89
|
+
# U+200B ZERO WIDTH SPACE count=1 at [11]
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## CLI
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Clean a file
|
|
96
|
+
cleanmonkey input.txt -o output.txt
|
|
97
|
+
|
|
98
|
+
# Pipe through stdin
|
|
99
|
+
cat dirty.csv | cleanmonkey > clean.csv
|
|
100
|
+
|
|
101
|
+
# Use a profile
|
|
102
|
+
cleanmonkey --profile csv input.txt
|
|
103
|
+
|
|
104
|
+
# Inspect mode — report what's in a file
|
|
105
|
+
cleanmonkey --inspect input.txt
|
|
106
|
+
|
|
107
|
+
# Machine-readable JSON inspect output
|
|
108
|
+
cleanmonkey --json input.txt
|
|
109
|
+
|
|
110
|
+
# Selective overrides
|
|
111
|
+
cleanmonkey --no-smart-quotes --fullwidth input.txt
|
|
112
|
+
|
|
113
|
+
# Preserve whitespace structure
|
|
114
|
+
cleanmonkey --no-strip --no-collapse-spaces input.txt
|
|
115
|
+
|
|
116
|
+
# Preserve line endings (CR/CRLF)
|
|
117
|
+
cleanmonkey --no-line-endings input.txt
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Built for LLMs
|
|
121
|
+
|
|
122
|
+
cleanmonkey is designed to work well as a tool for large language models. Invisible characters are a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
MIT
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "cleanmonkey"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "One-call text cleanup: invisible characters, smart quotes, whitespace normalization."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "RexBytes", email = "pythonic@rexbytes.com"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["text", "cleanup", "whitespace", "unicode", "normalize"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Topic :: Text Processing",
|
|
26
|
+
"Topic :: Text Processing :: Filters",
|
|
27
|
+
"Typing :: Typed",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.scripts]
|
|
31
|
+
cleanmonkey = "cleanmonkey.cli:_main_with_broken_pipe_handling"
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/RexBytes/cleanmonkey"
|
|
35
|
+
Repository = "https://github.com/RexBytes/cleanmonkey"
|
|
36
|
+
Issues = "https://github.com/RexBytes/cleanmonkey/issues"
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
testpaths = ["tests"]
|
|
43
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""cleanmonkey — one-call text cleanup for invisible characters, smart quotes, and whitespace."""
|
|
2
|
+
|
|
3
|
+
from cleanmonkey.core import MAX_DEPTH, clean, clean_column, clean_dict, inspect
|
|
4
|
+
from cleanmonkey.profiles import PROFILES, Profile
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
__all__ = ["MAX_DEPTH", "clean", "clean_column", "clean_dict", "inspect", "Profile", "PROFILES"]
|