cleanmonkey 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 RexBytes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,152 @@
1
+ Metadata-Version: 2.4
2
+ Name: cleanmonkey
3
+ Version: 0.1.0
4
+ Summary: One-call text cleanup: invisible characters, smart quotes, whitespace normalization.
5
+ Author-email: RexBytes <pythonic@rexbytes.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/RexBytes/cleanmonkey
8
+ Project-URL: Repository, https://github.com/RexBytes/cleanmonkey
9
+ Project-URL: Issues, https://github.com/RexBytes/cleanmonkey/issues
10
+ Keywords: text,cleanup,whitespace,unicode,normalize
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Text Processing
20
+ Classifier: Topic :: Text Processing :: Filters
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Dynamic: license-file
26
+
27
+ # cleanmonkey
28
+
29
+ One-call text cleanup for invisible characters, smart quotes, and whitespace normalization.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install cleanmonkey
35
+ ```
36
+
37
+ ## Quick Start
38
+
39
+ ```python
40
+ from cleanmonkey import clean
41
+
42
+ # Sensible defaults handle the common garbage
43
+ clean("hello\u00a0world\u2019s \u2014 test")
44
+ # → "hello world's - test"
45
+
46
+ # Idempotent — safe to call twice
47
+ clean(clean(text)) == clean(text)
48
+ ```
49
+
50
+ ## What It Cleans (by default)
51
+
52
+ | Category | Examples | Result |
53
+ |---|---|---|
54
+ | Non-breaking spaces | `\u00a0`, `\u2007`, `\u202f` | Regular space |
55
+ | Zero-width chars | `\u200b`, `\u200c`, `\u200d`, `\ufeff` | Removed |
56
+ | Smart quotes | `\u2018` `\u2019` `\u201c` `\u201d` | `'` and `"` |
57
+ | Dashes | `\u2013` (en), `\u2014` (em) | `-` |
58
+ | Ellipsis | `\u2026` | `...` |
59
+ | Control chars | null, form feed, vertical tab | Removed |
60
+ | Line endings | `\r\n`, `\r` | `\n` |
61
+ | Multiple spaces | `"hello   world"` | `"hello world"` |
62
+ | Leading/trailing | `" hello "` | `"hello"` |
63
+
64
+ ## Granular Control
65
+
66
+ Override any default:
67
+
68
+ ```python
69
+ clean(text, smart_quotes=False) # keep curly quotes
70
+ clean(text, dashes=False) # keep em/en dashes
71
+ clean(text, fullwidth=True) # also normalize fullwidth digits/letters
72
+ clean(text, collapse_spaces=False) # keep multiple spaces
73
+ clean(text, strip=False) # keep leading/trailing whitespace
74
+ ```
75
+
76
+ ## Profiles
77
+
78
+ ```python
79
+ clean(text, profile="default") # every normalization except fullwidth (the default profile)
80
+ clean(text, profile="csv") # default + fullwidth normalization
81
+ clean(text, profile="sql") # default + fullwidth normalization
82
+ clean(text, profile="display") # keep smart quotes & dashes; still clean invisible, control, whitespace, line endings
83
+ clean(text, profile="minimal") # invisible chars only, no collapsing or stripping
84
+ clean(text, profile="aggressive") # everything including fullwidth
85
+ ```
86
+
87
+ ## Batch Helpers
88
+
89
+ ```python
90
+ from cleanmonkey import clean_column, clean_dict
91
+
92
+ # Clean a list (non-strings pass through)
93
+ clean_column(["hello\u00a0world", 42, None])
94
+ # → ["hello world", 42, None]
95
+
96
+ # Recursively clean dict values
97
+ clean_dict({"name": "John\u00a0Doe", "nested": {"val": "test\u200b"}})
98
+ # → {"name": "John Doe", "nested": {"val": "test"}}
99
+
100
+ # Also clean keys
101
+ clean_dict({"key\u00a0name": "val"}, keys=True)
102
+ # → {"key name": "val"}
103
+ ```
104
+
105
+ ## Inspect
106
+
107
+ Find out what's lurking in your text:
108
+
109
+ ```python
110
+ from cleanmonkey import inspect
111
+
112
+ for info in inspect("hello\u00a0world\u200b"):
113
+ print(f"{info.codepoint} {info.name} count={info.count} at {info.positions}")
114
+ # U+00A0 NO-BREAK SPACE count=1 at [5]
115
+ # U+200B ZERO WIDTH SPACE count=1 at [11]
116
+ ```
117
+
118
+ ## CLI
119
+
120
+ ```bash
121
+ # Clean a file
122
+ cleanmonkey input.txt -o output.txt
123
+
124
+ # Pipe through stdin
125
+ cat dirty.csv | cleanmonkey > clean.csv
126
+
127
+ # Use a profile
128
+ cleanmonkey --profile csv input.txt
129
+
130
+ # Inspect mode — report what's in a file
131
+ cleanmonkey --inspect input.txt
132
+
133
+ # Machine-readable JSON inspect output
134
+ cleanmonkey --json input.txt
135
+
136
+ # Selective overrides
137
+ cleanmonkey --no-smart-quotes --fullwidth input.txt
138
+
139
+ # Preserve whitespace structure
140
+ cleanmonkey --no-strip --no-collapse-spaces input.txt
141
+
142
+ # Preserve line endings (CR/CRLF)
143
+ cleanmonkey --no-line-endings input.txt
144
+ ```
145
+
146
+ ## Built for LLMs
147
+
148
+ cleanmonkey is designed to work well as a tool for large language models. Invisible character cleanup is a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
149
+
150
+ ## License
151
+
152
+ MIT
@@ -0,0 +1,126 @@
1
+ # cleanmonkey
2
+
3
+ One-call text cleanup for invisible characters, smart quotes, and whitespace normalization.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install cleanmonkey
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from cleanmonkey import clean
15
+
16
+ # Sensible defaults handle the common garbage
17
+ clean("hello\u00a0world\u2019s \u2014 test")
18
+ # → "hello world's - test"
19
+
20
+ # Idempotent — safe to call twice
21
+ clean(clean(text)) == clean(text)
22
+ ```
23
+
24
+ ## What It Cleans (by default)
25
+
26
+ | Category | Examples | Result |
27
+ |---|---|---|
28
+ | Non-breaking spaces | `\u00a0`, `\u2007`, `\u202f` | Regular space |
29
+ | Zero-width chars | `\u200b`, `\u200c`, `\u200d`, `\ufeff` | Removed |
30
+ | Smart quotes | `\u2018` `\u2019` `\u201c` `\u201d` | `'` and `"` |
31
+ | Dashes | `\u2013` (en), `\u2014` (em) | `-` |
32
+ | Ellipsis | `\u2026` | `...` |
33
+ | Control chars | null, form feed, vertical tab | Removed |
34
+ | Line endings | `\r\n`, `\r` | `\n` |
35
+ | Multiple spaces | `"hello   world"` | `"hello world"` |
36
+ | Leading/trailing | `" hello "` | `"hello"` |
37
+
38
+ ## Granular Control
39
+
40
+ Override any default:
41
+
42
+ ```python
43
+ clean(text, smart_quotes=False) # keep curly quotes
44
+ clean(text, dashes=False) # keep em/en dashes
45
+ clean(text, fullwidth=True) # also normalize fullwidth digits/letters
46
+ clean(text, collapse_spaces=False) # keep multiple spaces
47
+ clean(text, strip=False) # keep leading/trailing whitespace
48
+ ```
49
+
50
+ ## Profiles
51
+
52
+ ```python
53
+ clean(text, profile="default") # every normalization except fullwidth (the default profile)
54
+ clean(text, profile="csv") # default + fullwidth normalization
55
+ clean(text, profile="sql") # default + fullwidth normalization
56
+ clean(text, profile="display") # keep smart quotes & dashes; still clean invisible, control, whitespace, line endings
57
+ clean(text, profile="minimal") # invisible chars only, no collapsing or stripping
58
+ clean(text, profile="aggressive") # everything including fullwidth
59
+ ```
60
+
61
+ ## Batch Helpers
62
+
63
+ ```python
64
+ from cleanmonkey import clean_column, clean_dict
65
+
66
+ # Clean a list (non-strings pass through)
67
+ clean_column(["hello\u00a0world", 42, None])
68
+ # → ["hello world", 42, None]
69
+
70
+ # Recursively clean dict values
71
+ clean_dict({"name": "John\u00a0Doe", "nested": {"val": "test\u200b"}})
72
+ # → {"name": "John Doe", "nested": {"val": "test"}}
73
+
74
+ # Also clean keys
75
+ clean_dict({"key\u00a0name": "val"}, keys=True)
76
+ # → {"key name": "val"}
77
+ ```
78
+
79
+ ## Inspect
80
+
81
+ Find out what's lurking in your text:
82
+
83
+ ```python
84
+ from cleanmonkey import inspect
85
+
86
+ for info in inspect("hello\u00a0world\u200b"):
87
+ print(f"{info.codepoint} {info.name} count={info.count} at {info.positions}")
88
+ # U+00A0 NO-BREAK SPACE count=1 at [5]
89
+ # U+200B ZERO WIDTH SPACE count=1 at [11]
90
+ ```
91
+
92
+ ## CLI
93
+
94
+ ```bash
95
+ # Clean a file
96
+ cleanmonkey input.txt -o output.txt
97
+
98
+ # Pipe through stdin
99
+ cat dirty.csv | cleanmonkey > clean.csv
100
+
101
+ # Use a profile
102
+ cleanmonkey --profile csv input.txt
103
+
104
+ # Inspect mode — report what's in a file
105
+ cleanmonkey --inspect input.txt
106
+
107
+ # Machine-readable JSON inspect output
108
+ cleanmonkey --json input.txt
109
+
110
+ # Selective overrides
111
+ cleanmonkey --no-smart-quotes --fullwidth input.txt
112
+
113
+ # Preserve whitespace structure
114
+ cleanmonkey --no-strip --no-collapse-spaces input.txt
115
+
116
+ # Preserve line endings (CR/CRLF)
117
+ cleanmonkey --no-line-endings input.txt
118
+ ```
119
+
120
+ ## Built for LLMs
121
+
122
+ cleanmonkey is designed to work well as a tool for large language models. Invisible character cleanup is a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
123
+
124
+ ## License
125
+
126
+ MIT
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "cleanmonkey"
7
+ version = "0.1.0"
8
+ description = "One-call text cleanup: invisible characters, smart quotes, whitespace normalization."
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "RexBytes", email = "pythonic@rexbytes.com"},
14
+ ]
15
+ keywords = ["text", "cleanup", "whitespace", "unicode", "normalize"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Text Processing",
26
+ "Topic :: Text Processing :: Filters",
27
+ "Typing :: Typed",
28
+ ]
29
+
30
+ [project.scripts]
31
+ cleanmonkey = "cleanmonkey.cli:_main_with_broken_pipe_handling"
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/RexBytes/cleanmonkey"
35
+ Repository = "https://github.com/RexBytes/cleanmonkey"
36
+ Issues = "https://github.com/RexBytes/cleanmonkey/issues"
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+
41
+ [tool.pytest.ini_options]
42
+ testpaths = ["tests"]
43
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,7 @@
1
+ """cleanmonkey — one-call text cleanup for invisible characters, smart quotes, and whitespace."""
2
+
3
+ from cleanmonkey.core import MAX_DEPTH, clean, clean_column, clean_dict, inspect
4
+ from cleanmonkey.profiles import PROFILES, Profile
5
+
6
+ __version__ = "0.1.0"
7
+ __all__ = ["MAX_DEPTH", "clean", "clean_column", "clean_dict", "inspect", "Profile", "PROFILES"]
@@ -0,0 +1,5 @@
1
+ """Allow running cleanmonkey as ``python -m cleanmonkey``."""
2
+
3
+ from cleanmonkey.cli import _main_with_broken_pipe_handling
4
+
5
+ _main_with_broken_pipe_handling()