cleanmonkey 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cleanmonkey-0.1.0/src/cleanmonkey.egg-info → cleanmonkey-0.2.0}/PKG-INFO +29 -1
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/README.md +22 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/pyproject.toml +12 -1
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey/__init__.py +1 -1
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey/core.py +21 -16
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0/src/cleanmonkey.egg-info}/PKG-INFO +29 -1
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey.egg-info/SOURCES.txt +1 -0
- cleanmonkey-0.2.0/src/cleanmonkey.egg-info/requires.txt +7 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/tests/test_cli.py +0 -1
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/tests/test_core.py +40 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/LICENSE +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/setup.cfg +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey/__main__.py +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey/cli.py +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey/maps.py +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey/profiles.py +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey/py.typed +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey.egg-info/dependency_links.txt +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey.egg-info/entry_points.txt +0 -0
- {cleanmonkey-0.1.0 → cleanmonkey-0.2.0}/src/cleanmonkey.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cleanmonkey
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: One-call text cleanup: invisible characters, smart quotes, whitespace normalization.
|
|
5
5
|
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
6
|
License: MIT
|
|
@@ -22,6 +22,12 @@ Classifier: Typing :: Typed
|
|
|
22
22
|
Requires-Python: >=3.10
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
28
|
+
Requires-Dist: hypothesis; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy; extra == "dev"
|
|
25
31
|
Dynamic: license-file
|
|
26
32
|
|
|
27
33
|
# cleanmonkey
|
|
@@ -147,6 +153,28 @@ cleanmonkey --no-line-endings input.txt
|
|
|
147
153
|
|
|
148
154
|
cleanmonkey is designed to work well as a tool for large language models. Invisible character cleanup is a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
|
|
149
155
|
|
|
156
|
+
## A note on whitespace
|
|
157
|
+
|
|
158
|
+
By default `clean()` strips each line and collapses runs of spaces, which is
|
|
159
|
+
correct for prose but **destroys indentation** in structured text (YAML, Python,
|
|
160
|
+
Markdown). Use `clean(text, strip=False, collapse_spaces=False)` or
|
|
161
|
+
`profile="minimal"` for those. See [`LIMITATIONS.md`](LIMITATIONS.md) for this
|
|
162
|
+
and other deliberate design tradeoffs.
|
|
163
|
+
|
|
164
|
+
## Quality & review
|
|
165
|
+
|
|
166
|
+
cleanmonkey is hardened with a competitive multi-model review methodology and a
|
|
167
|
+
measurable release gate:
|
|
168
|
+
|
|
169
|
+
- [`CONTRIBUTING.md`](CONTRIBUTING.md) — testing philosophy and the review-panel
|
|
170
|
+
process.
|
|
171
|
+
- [`LIMITATIONS.md`](LIMITATIONS.md) — intentional tradeoffs reviewers should not
|
|
172
|
+
re-litigate.
|
|
173
|
+
- [`RELEASE_READINESS.md`](RELEASE_READINESS.md) + `release_readiness.json` +
|
|
174
|
+
`scripts/readiness.py` — the release rubric and convergence metric
|
|
175
|
+
(`python scripts/readiness.py`).
|
|
176
|
+
- [`REVIEW_HISTORY.md`](REVIEW_HISTORY.md) — how the library was hardened.
|
|
177
|
+
|
|
150
178
|
## License
|
|
151
179
|
|
|
152
180
|
MIT
|
|
@@ -121,6 +121,28 @@ cleanmonkey --no-line-endings input.txt
|
|
|
121
121
|
|
|
122
122
|
cleanmonkey is designed to work well as a tool for large language models. Invisible character cleanup is a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
|
|
123
123
|
|
|
124
|
+
## A note on whitespace
|
|
125
|
+
|
|
126
|
+
By default `clean()` strips each line and collapses runs of spaces, which is
|
|
127
|
+
correct for prose but **destroys indentation** in structured text (YAML, Python,
|
|
128
|
+
Markdown). Use `clean(text, strip=False, collapse_spaces=False)` or
|
|
129
|
+
`profile="minimal"` for those. See [`LIMITATIONS.md`](LIMITATIONS.md) for this
|
|
130
|
+
and other deliberate design tradeoffs.
|
|
131
|
+
|
|
132
|
+
## Quality & review
|
|
133
|
+
|
|
134
|
+
cleanmonkey is hardened with a competitive multi-model review methodology and a
|
|
135
|
+
measurable release gate:
|
|
136
|
+
|
|
137
|
+
- [`CONTRIBUTING.md`](CONTRIBUTING.md) — testing philosophy and the review-panel
|
|
138
|
+
process.
|
|
139
|
+
- [`LIMITATIONS.md`](LIMITATIONS.md) — intentional tradeoffs reviewers should not
|
|
140
|
+
re-litigate.
|
|
141
|
+
- [`RELEASE_READINESS.md`](RELEASE_READINESS.md) + `release_readiness.json` +
|
|
142
|
+
`scripts/readiness.py` — the release rubric and convergence metric
|
|
143
|
+
(`python scripts/readiness.py`).
|
|
144
|
+
- [`REVIEW_HISTORY.md`](REVIEW_HISTORY.md) — how the library was hardened.
|
|
145
|
+
|
|
124
146
|
## License
|
|
125
147
|
|
|
126
148
|
MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "cleanmonkey"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "One-call text cleanup: invisible characters, smart quotes, whitespace normalization."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -27,6 +27,9 @@ classifiers = [
|
|
|
27
27
|
"Typing :: Typed",
|
|
28
28
|
]
|
|
29
29
|
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = ["pytest", "pytest-cov", "hypothesis", "ruff", "mypy"]
|
|
32
|
+
|
|
30
33
|
[project.scripts]
|
|
31
34
|
cleanmonkey = "cleanmonkey.cli:_main_with_broken_pipe_handling"
|
|
32
35
|
|
|
@@ -41,3 +44,11 @@ where = ["src"]
|
|
|
41
44
|
[tool.pytest.ini_options]
|
|
42
45
|
testpaths = ["tests"]
|
|
43
46
|
pythonpath = ["src"]
|
|
47
|
+
|
|
48
|
+
[tool.ruff]
|
|
49
|
+
target-version = "py310"
|
|
50
|
+
src = ["src", "tests"]
|
|
51
|
+
|
|
52
|
+
[tool.mypy]
|
|
53
|
+
files = ["src"]
|
|
54
|
+
python_version = "3.10"
|
|
@@ -3,5 +3,5 @@
|
|
|
3
3
|
from cleanmonkey.core import MAX_DEPTH, clean, clean_column, clean_dict, inspect
|
|
4
4
|
from cleanmonkey.profiles import PROFILES, Profile
|
|
5
5
|
|
|
6
|
-
__version__ = "0.1.0"
|
|
6
|
+
__version__ = "0.2.0"
|
|
7
7
|
__all__ = ["MAX_DEPTH", "clean", "clean_column", "clean_dict", "inspect", "Profile", "PROFILES"]
|
|
@@ -4,7 +4,18 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import re
|
|
6
6
|
from dataclasses import dataclass, replace
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, cast
|
|
8
|
+
|
|
9
|
+
from cleanmonkey.maps import (
|
|
10
|
+
CONTROL,
|
|
11
|
+
DASHES,
|
|
12
|
+
ELLIPSIS,
|
|
13
|
+
FULLWIDTH,
|
|
14
|
+
INVISIBLE,
|
|
15
|
+
SMART_QUOTES,
|
|
16
|
+
WHITESPACE,
|
|
17
|
+
)
|
|
18
|
+
from cleanmonkey.profiles import PROFILES, Profile
|
|
8
19
|
|
|
9
20
|
_BOOL_OVERRIDE_NAMES_CLEAN = (
|
|
10
21
|
"smart_quotes", "dashes", "ellipsis", "invisible", "whitespace",
|
|
@@ -27,18 +38,6 @@ def _validate_bool_overrides(overrides: dict[str, Any], allowed: tuple[str, ...]
|
|
|
27
38
|
f"{func_name}() override {name!r} must be bool or None, got {type(val).__name__}"
|
|
28
39
|
)
|
|
29
40
|
|
|
30
|
-
from cleanmonkey.maps import (
|
|
31
|
-
CONTROL,
|
|
32
|
-
DASHES,
|
|
33
|
-
ELLIPSIS,
|
|
34
|
-
FULLWIDTH,
|
|
35
|
-
INVISIBLE,
|
|
36
|
-
SMART_QUOTES,
|
|
37
|
-
WHITESPACE,
|
|
38
|
-
)
|
|
39
|
-
from cleanmonkey.profiles import PROFILES, Profile
|
|
40
|
-
|
|
41
|
-
|
|
42
41
|
def _validate_profile_kwarg(kwargs: dict[str, Any], func_name: str) -> None:
|
|
43
42
|
"""Validate the 'profile' kwarg type and name if present, matching clean()'s contract."""
|
|
44
43
|
if "profile" in kwargs:
|
|
@@ -63,7 +62,7 @@ MAX_DEPTH: int = 200
|
|
|
63
62
|
|
|
64
63
|
def _build_table(profile: Profile) -> dict[int, str | int | None]:
|
|
65
64
|
"""Build a str.translate table from a profile."""
|
|
66
|
-
merged: dict[str, str] = {}
|
|
65
|
+
merged: dict[str, str | int | None] = {}
|
|
67
66
|
if profile.invisible:
|
|
68
67
|
merged.update(INVISIBLE)
|
|
69
68
|
if profile.whitespace:
|
|
@@ -78,7 +77,7 @@ def _build_table(profile: Profile) -> dict[int, str | int | None]:
|
|
|
78
77
|
merged.update(ELLIPSIS)
|
|
79
78
|
if profile.fullwidth:
|
|
80
79
|
merged.update(FULLWIDTH)
|
|
81
|
-
return str.maketrans(
|
|
80
|
+
return str.maketrans(merged)
|
|
82
81
|
|
|
83
82
|
|
|
84
83
|
# Cache tables for default profiles
|
|
@@ -228,7 +227,13 @@ def _clean_value(
|
|
|
228
227
|
_clean_value(item, keys=keys, _seen=_seen, _depth=_depth + 1, **kwargs)
|
|
229
228
|
for item in v
|
|
230
229
|
]
|
|
231
|
-
|
|
230
|
+
if isinstance(v, tuple) and hasattr(v, "_fields"):
|
|
231
|
+
# namedtuple: its constructor takes positional fields, not a
|
|
232
|
+
# single iterable, so type(v)(cleaned_items) would fail. Rebuild
|
|
233
|
+
# via the _make classmethod, which constructs from an iterable.
|
|
234
|
+
result = cast(Any, type(v))._make(cleaned_items)
|
|
235
|
+
else:
|
|
236
|
+
result = type(v)(cleaned_items)
|
|
232
237
|
if isinstance(v, (set, frozenset)) and len(result) != len(v):
|
|
233
238
|
raise ValueError(
|
|
234
239
|
"Set member collision: cleaning produced duplicate members"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cleanmonkey
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: One-call text cleanup: invisible characters, smart quotes, whitespace normalization.
|
|
5
5
|
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
6
|
License: MIT
|
|
@@ -22,6 +22,12 @@ Classifier: Typing :: Typed
|
|
|
22
22
|
Requires-Python: >=3.10
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
28
|
+
Requires-Dist: hypothesis; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy; extra == "dev"
|
|
25
31
|
Dynamic: license-file
|
|
26
32
|
|
|
27
33
|
# cleanmonkey
|
|
@@ -147,6 +153,28 @@ cleanmonkey --no-line-endings input.txt
|
|
|
147
153
|
|
|
148
154
|
cleanmonkey is designed to work well as a tool for large language models. Invisible character cleanup is a constant source of silent bugs in LLM-driven data pipelines — non-breaking spaces break splits, zero-width characters corrupt comparisons, and smart quotes fail exact matches. Without cleanmonkey, LLMs end up generating repetitive `.replace()` chains that miss edge cases and waste tokens. A single `clean()` call handles all of it with a structured, idempotent result — no multi-step prompting or character-by-character debugging required. Fewer tokens in, clean data out.
|
|
149
155
|
|
|
156
|
+
## A note on whitespace
|
|
157
|
+
|
|
158
|
+
By default `clean()` strips each line and collapses runs of spaces, which is
|
|
159
|
+
correct for prose but **destroys indentation** in structured text (YAML, Python,
|
|
160
|
+
Markdown). Use `clean(text, strip=False, collapse_spaces=False)` or
|
|
161
|
+
`profile="minimal"` for those. See [`LIMITATIONS.md`](LIMITATIONS.md) for this
|
|
162
|
+
and other deliberate design tradeoffs.
|
|
163
|
+
|
|
164
|
+
## Quality & review
|
|
165
|
+
|
|
166
|
+
cleanmonkey is hardened with a competitive multi-model review methodology and a
|
|
167
|
+
measurable release gate:
|
|
168
|
+
|
|
169
|
+
- [`CONTRIBUTING.md`](CONTRIBUTING.md) — testing philosophy and the review-panel
|
|
170
|
+
process.
|
|
171
|
+
- [`LIMITATIONS.md`](LIMITATIONS.md) — intentional tradeoffs reviewers should not
|
|
172
|
+
re-litigate.
|
|
173
|
+
- [`RELEASE_READINESS.md`](RELEASE_READINESS.md) + `release_readiness.json` +
|
|
174
|
+
`scripts/readiness.py` — the release rubric and convergence metric
|
|
175
|
+
(`python scripts/readiness.py`).
|
|
176
|
+
- [`REVIEW_HISTORY.md`](REVIEW_HISTORY.md) — how the library was hardened.
|
|
177
|
+
|
|
150
178
|
## License
|
|
151
179
|
|
|
152
180
|
MIT
|
|
@@ -12,6 +12,7 @@ src/cleanmonkey.egg-info/PKG-INFO
|
|
|
12
12
|
src/cleanmonkey.egg-info/SOURCES.txt
|
|
13
13
|
src/cleanmonkey.egg-info/dependency_links.txt
|
|
14
14
|
src/cleanmonkey.egg-info/entry_points.txt
|
|
15
|
+
src/cleanmonkey.egg-info/requires.txt
|
|
15
16
|
src/cleanmonkey.egg-info/top_level.txt
|
|
16
17
|
tests/test_cli.py
|
|
17
18
|
tests/test_core.py
|
|
@@ -279,6 +279,46 @@ class TestTupleRecursion:
|
|
|
279
279
|
assert isinstance(result["data"], tuple)
|
|
280
280
|
assert result["data"] == (1, "ab", None)
|
|
281
281
|
|
|
282
|
+
def test_namedtuple_in_list_preserves_type(self):
|
|
283
|
+
"""namedtuples (tuple subclass, positional ctor) must clean and keep type."""
|
|
284
|
+
from collections import namedtuple
|
|
285
|
+
|
|
286
|
+
Row = namedtuple("Row", ["name", "city"])
|
|
287
|
+
result = clean_column([Row("Alice\u200b", "NYC"), Row("Bob", "LA")])
|
|
288
|
+
assert result == [Row("Alice", "NYC"), Row("Bob", "LA")]
|
|
289
|
+
assert isinstance(result[0], Row)
|
|
290
|
+
# cleaned values are accessible by field name (not collapsed to a plain tuple)
|
|
291
|
+
assert result[0].name == "Alice"
|
|
292
|
+
|
|
293
|
+
def test_namedtuple_as_dict_value_preserves_type(self):
|
|
294
|
+
"""namedtuple dict values must clean and remain the same namedtuple type."""
|
|
295
|
+
from collections import namedtuple
|
|
296
|
+
|
|
297
|
+
Row = namedtuple("Row", ["name", "city"])
|
|
298
|
+
result = clean_dict({"r": Row("x\u200b", "y\u00a0z")})
|
|
299
|
+
assert isinstance(result["r"], Row)
|
|
300
|
+
assert result["r"] == Row("x", "y z")
|
|
301
|
+
|
|
302
|
+
def test_dict_list_subclasses_normalize_to_base_type(self):
|
|
303
|
+
"""Documented contract (LIMITATIONS.md): dict/list subclasses clean their
|
|
304
|
+
values but are returned as the base type, not the subclass."""
|
|
305
|
+
from collections import Counter, OrderedDict, defaultdict
|
|
306
|
+
|
|
307
|
+
od = clean_dict(OrderedDict([("a\u200b", "v\u200b")]), keys=True)
|
|
308
|
+
assert type(od) is dict and od == {"a": "v"}
|
|
309
|
+
|
|
310
|
+
dd = clean_dict(defaultdict(int, {"a\u200b": "v\u200b"}), keys=True)
|
|
311
|
+
assert type(dd) is dict and dd == {"a": "v"}
|
|
312
|
+
|
|
313
|
+
class MyList(list):
|
|
314
|
+
pass
|
|
315
|
+
|
|
316
|
+
ml = clean_column(MyList(["a\u200b", "b"]))
|
|
317
|
+
assert type(ml) is list and ml == ["a", "b"]
|
|
318
|
+
|
|
319
|
+
cnt = clean_column([Counter({"a\u200b": 2})], keys=True)
|
|
320
|
+
assert type(cnt[0]) is dict and cnt[0] == {"a": 2}
|
|
321
|
+
|
|
282
322
|
|
|
283
323
|
class TestCycleDetection:
|
|
284
324
|
"""Cyclic structures must raise ValueError, not RecursionError."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|