typemonkey 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. typemonkey-1.0.0/LICENSE +21 -0
  2. typemonkey-1.0.0/PKG-INFO +134 -0
  3. typemonkey-1.0.0/README.md +87 -0
  4. typemonkey-1.0.0/pyproject.toml +45 -0
  5. typemonkey-1.0.0/setup.cfg +4 -0
  6. typemonkey-1.0.0/src/typemonkey/__init__.py +49 -0
  7. typemonkey-1.0.0/src/typemonkey/booleans.py +124 -0
  8. typemonkey-1.0.0/src/typemonkey/clean.py +162 -0
  9. typemonkey-1.0.0/src/typemonkey/cli.py +100 -0
  10. typemonkey-1.0.0/src/typemonkey/detectors/__init__.py +16 -0
  11. typemonkey-1.0.0/src/typemonkey/detectors/base.py +28 -0
  12. typemonkey-1.0.0/src/typemonkey/detectors/boolean.py +39 -0
  13. typemonkey-1.0.0/src/typemonkey/detectors/date.py +53 -0
  14. typemonkey-1.0.0/src/typemonkey/detectors/numeric.py +46 -0
  15. typemonkey-1.0.0/src/typemonkey/infer.py +291 -0
  16. typemonkey-1.0.0/src/typemonkey/locale.py +87 -0
  17. typemonkey-1.0.0/src/typemonkey/models.py +147 -0
  18. typemonkey-1.0.0/src/typemonkey/nulls.py +107 -0
  19. typemonkey-1.0.0/src/typemonkey/numbers.py +295 -0
  20. typemonkey-1.0.0/src/typemonkey/preserve.py +82 -0
  21. typemonkey-1.0.0/src/typemonkey.egg-info/PKG-INFO +134 -0
  22. typemonkey-1.0.0/src/typemonkey.egg-info/SOURCES.txt +37 -0
  23. typemonkey-1.0.0/src/typemonkey.egg-info/dependency_links.txt +1 -0
  24. typemonkey-1.0.0/src/typemonkey.egg-info/entry_points.txt +2 -0
  25. typemonkey-1.0.0/src/typemonkey.egg-info/requires.txt +7 -0
  26. typemonkey-1.0.0/src/typemonkey.egg-info/top_level.txt +1 -0
  27. typemonkey-1.0.0/tests/test_booleans.py +313 -0
  28. typemonkey-1.0.0/tests/test_clean.py +225 -0
  29. typemonkey-1.0.0/tests/test_cli.py +187 -0
  30. typemonkey-1.0.0/tests/test_consistency.py +166 -0
  31. typemonkey-1.0.0/tests/test_detectors.py +244 -0
  32. typemonkey-1.0.0/tests/test_infer.py +312 -0
  33. typemonkey-1.0.0/tests/test_locale.py +127 -0
  34. typemonkey-1.0.0/tests/test_metadata.py +33 -0
  35. typemonkey-1.0.0/tests/test_models.py +265 -0
  36. typemonkey-1.0.0/tests/test_nulls.py +153 -0
  37. typemonkey-1.0.0/tests/test_numbers.py +554 -0
  38. typemonkey-1.0.0/tests/test_preserve.py +155 -0
  39. typemonkey-1.0.0/tests/test_properties.py +328 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 RexBytes
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: typemonkey
3
+ Version: 1.0.0
4
+ Summary: Column type inference and type-aware cleaning: numbers, currency, percentages, booleans, nulls, dates.
5
+ Author-email: RexBytes <pythonic@rexbytes.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 RexBytes
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/RexBytes/typemonkey
29
+ Project-URL: Issues, https://github.com/RexBytes/typemonkey/issues
30
+ Classifier: Development Status :: 5 - Production/Stable
31
+ Classifier: Intended Audience :: Developers
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Operating System :: OS Independent
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Requires-Python: >=3.11
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Requires-Dist: datemonkey~=0.1.0
41
+ Requires-Dist: cleanmonkey~=0.1.0
42
+ Provides-Extra: dev
43
+ Requires-Dist: pytest>=7.0; extra == "dev"
44
+ Requires-Dist: pytest-cov; extra == "dev"
45
+ Requires-Dist: hypothesis>=6.0; extra == "dev"
46
+ Dynamic: license-file
47
+
48
+ # typemonkey
49
+
50
+ Column type inference and type-aware cleaning for messy tabular data.
51
+ Infer whether a column is an integer, float, currency, percentage, boolean,
52
+ date, or free-text string — then clean it to that type. Numbers buried in
53
+ currency symbols, thousands separators, European decimal commas, accounting
54
+ parentheses, and percent signs come out as plain Python numbers; a column's
55
+ worth of `yes`/`Y`/`1`/`true` come out as `bool`; twenty-plus spellings of
56
+ "null" collapse to `None`.
57
+
58
+ Part of the monkey toolkit. Delegates date detection to
59
+ [`datemonkey`](https://pypi.org/project/datemonkey/) and value normalisation
60
+ to [`cleanmonkey`](https://pypi.org/project/cleanmonkey/) — it does not
61
+ reinvent either.
62
+
63
+ ## Install
64
+
65
+ ```bash
66
+ pip install typemonkey
67
+ ```
68
+
69
+ ## Quick start
70
+
71
+ ```python
72
+ from typemonkey import infer_type, clean_numeric, clean_boolean, clean_column
73
+
74
+ profile = infer_type(["$1,234.56", "$2,000.00", "$3.50"])
75
+ profile.type # TypeName.CURRENCY
76
+ profile.confidence # 1.0
77
+ profile.locale # "us"
78
+
79
+ clean_numeric(["$1,234.56", "(50)", "12%", "N/A"]).values
80
+ # [1234.56, -50, 0.12, None] # parens = negative, 12% = 0.12, N/A = null
81
+
82
+ clean_numeric(["1.234,56", "3,50"], locale="eu").values
83
+ # [1234.56, 3.5] # European decimal comma
84
+
85
+ clean_boolean(["yes", "NO", "1", "0", "maybe"]).values
86
+ # [True, False, True, False, None] # "maybe" recorded in .failures
87
+
88
+ clean_column(["01234", "07090", "02139"]).values
89
+ # ['01234', '07090', '02139'] # zero-padded IDs preserved as strings
90
+ ```
91
+
92
+ Every entry point returns a typed dataclass (`ColumnProfile`, `CleanResult`),
93
+ not a dict. `CleanResult.failures` lists `(index, original)` for non-null
94
+ values that didn't parse, so a null value is never confused with a parse failure.
95
+
96
+ ## What it recognises
97
+
98
+ - **Numbers** — `int`, `float`, with thousands separators, apostrophe/space
99
+ grouping, leading `+`/`-`, accounting `(parentheses)` negatives.
100
+ - **Currency** — `$ € £ ¥ ₹ ...` symbols and ISO codes (`USD`, `EUR`, ...).
101
+ - **Percentages** — `"12%"`, `"8 %"` → `0.12`, `0.08` (or keep as `12`, `8`).
102
+ - **Booleans** — `true/false`, `t/f`, `yes/no`, `y/n`, `on/off`, `1/0`.
103
+ - **Dates** — via datemonkey (ISO, US/EU slash and dash, ambiguity reporting).
104
+ - **Nulls** — 20+ spellings (`N/A`, `#N/A`, `null`, `none`, `-`, `unknown`, …).
105
+ - **Preserve-as-string** — zero-padded IDs, Zip+4, phone numbers.
106
+ - **Locale** — US `1,234.56` vs European `1.234,56`, auto-detected per column.
107
+
108
+ ## CLI
109
+
110
+ ```bash
111
+ printf '$1,234.56\n$2,000.00\n$3.50\n' | typemonkey profile # JSON report
112
+ printf '12%%\n8 %%\nN/A\n' | typemonkey clean # cleaned values
113
+ typemonkey clean --type integer column.txt
114
+ ```
115
+
116
+ `typemonkey profile` prints a JSON `ColumnProfile`; `typemonkey clean` prints
117
+ one cleaned value per line (blank for nulls) and exits non-zero if any non-null
118
+ value failed to parse.
119
+
120
+ ## Using with AI assistants
121
+
122
+ See [`SKILL.md`](SKILL.md) for an LLM-oriented quick reference (decision table,
123
+ worked examples, anti-patterns).
124
+
125
+ ## Deliberate tradeoffs
126
+
127
+ Some behaviour is intentional and might look like a bug — bare 5-digit numbers
128
+ aren't treated as zips, all-`0`/`1` columns are integers not booleans, Excel
129
+ serials report numeric. See [`LIMITATIONS.md`](LIMITATIONS.md) for the
130
+ rationale and escape hatch on each.
131
+
132
+ ## License
133
+
134
+ MIT
@@ -0,0 +1,87 @@
1
+ # typemonkey
2
+
3
+ Column type inference and type-aware cleaning for messy tabular data.
4
+ Infer whether a column is an integer, float, currency, percentage, boolean,
5
+ date, or free-text string — then clean it to that type. Numbers buried in
6
+ currency symbols, thousands separators, European decimal commas, accounting
7
+ parentheses, and percent signs come out as plain Python numbers; a column's
8
+ worth of `yes`/`Y`/`1`/`true` come out as `bool`; twenty-plus spellings of
9
+ "null" collapse to `None`.
10
+
11
+ Part of the monkey toolkit. Delegates date detection to
12
+ [`datemonkey`](https://pypi.org/project/datemonkey/) and value normalisation
13
+ to [`cleanmonkey`](https://pypi.org/project/cleanmonkey/) — it does not
14
+ reinvent either.
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install typemonkey
20
+ ```
21
+
22
+ ## Quick start
23
+
24
+ ```python
25
+ from typemonkey import infer_type, clean_numeric, clean_boolean, clean_column
26
+
27
+ profile = infer_type(["$1,234.56", "$2,000.00", "$3.50"])
28
+ profile.type # TypeName.CURRENCY
29
+ profile.confidence # 1.0
30
+ profile.locale # "us"
31
+
32
+ clean_numeric(["$1,234.56", "(50)", "12%", "N/A"]).values
33
+ # [1234.56, -50, 0.12, None] # parens = negative, 12% = 0.12, N/A = null
34
+
35
+ clean_numeric(["1.234,56", "3,50"], locale="eu").values
36
+ # [1234.56, 3.5] # European decimal comma
37
+
38
+ clean_boolean(["yes", "NO", "1", "0", "maybe"]).values
39
+ # [True, False, True, False, None] # "maybe" recorded in .failures
40
+
41
+ clean_column(["01234", "07090", "02139"]).values
42
+ # ['01234', '07090', '02139'] # zero-padded IDs preserved as strings
43
+ ```
44
+
45
+ Every entry point returns a typed dataclass (`ColumnProfile`, `CleanResult`),
46
+ not a dict. `CleanResult.failures` lists `(index, original)` for non-null
47
+ values that didn't parse, so a null value is never confused with a parse failure.
48
+
49
+ ## What it recognises
50
+
51
+ - **Numbers** — `int`, `float`, with thousands separators, apostrophe/space
52
+ grouping, leading `+`/`-`, accounting `(parentheses)` negatives.
53
+ - **Currency** — `$ € £ ¥ ₹ ...` symbols and ISO codes (`USD`, `EUR`, ...).
54
+ - **Percentages** — `"12%"`, `"8 %"` → `0.12`, `0.08` (or keep as `12`, `8`).
55
+ - **Booleans** — `true/false`, `t/f`, `yes/no`, `y/n`, `on/off`, `1/0`.
56
+ - **Dates** — via datemonkey (ISO, US/EU slash and dash, ambiguity reporting).
57
+ - **Nulls** — 20+ spellings (`N/A`, `#N/A`, `null`, `none`, `-`, `unknown`, …).
58
+ - **Preserve-as-string** — zero-padded IDs, Zip+4, phone numbers.
59
+ - **Locale** — US `1,234.56` vs European `1.234,56`, auto-detected per column.
60
+
61
+ ## CLI
62
+
63
+ ```bash
64
+ printf '$1,234.56\n$2,000.00\n$3.50\n' | typemonkey profile # JSON report
65
+ printf '12%%\n8 %%\nN/A\n' | typemonkey clean # cleaned values
66
+ typemonkey clean --type integer column.txt
67
+ ```
68
+
69
+ `typemonkey profile` prints a JSON `ColumnProfile`; `typemonkey clean` prints
70
+ one cleaned value per line (blank for nulls) and exits non-zero if any non-null
71
+ value failed to parse.
72
+
73
+ ## Using with AI assistants
74
+
75
+ See [`SKILL.md`](SKILL.md) for an LLM-oriented quick reference (decision table,
76
+ worked examples, anti-patterns).
77
+
78
+ ## Deliberate tradeoffs
79
+
80
+ Some behaviour is intentional and might look like a bug — bare 5-digit numbers
81
+ aren't treated as zips, all-`0`/`1` columns are integers not booleans, Excel
82
+ serials report numeric. See [`LIMITATIONS.md`](LIMITATIONS.md) for the
83
+ rationale and escape hatch on each.
84
+
85
+ ## License
86
+
87
+ MIT
@@ -0,0 +1,45 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "typemonkey"
7
+ version = "1.0.0"
8
+ description = "Column type inference and type-aware cleaning: numbers, currency, percentages, booleans, nulls, dates."
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ authors = [{ name = "RexBytes", email = "pythonic@rexbytes.com" }]
12
+ requires-python = ">=3.11"
13
+ classifiers = [
14
+ "Development Status :: 5 - Production/Stable",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ ]
22
+ dependencies = [
23
+ # Pinned to the 0.1.x line: typemonkey reads detailed attributes off these
24
+ # pre-1.0 result objects (datemonkey's .format.pattern / .confidence /
25
+ # .results[].row_index, etc.), so a 0.2 could break us. Bump deliberately.
26
+ "datemonkey~=0.1.0",
27
+ "cleanmonkey~=0.1.0",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0"]
32
+
33
+ [project.scripts]
34
+ typemonkey = "typemonkey.cli:main"
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/RexBytes/typemonkey"
38
+ Issues = "https://github.com/RexBytes/typemonkey/issues"
39
+
40
+ [tool.setuptools.packages.find]
41
+ where = ["src"]
42
+
43
+ [tool.pytest.ini_options]
44
+ testpaths = ["tests"]
45
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,49 @@
1
"""typemonkey — column type inference and type-aware cleaning.

Public API (import ``from typemonkey import X``):

* :func:`infer_type` — profile a column, returning a :class:`ColumnProfile`.
* :func:`clean_column` — clean a column to an inferred or given type.
* :func:`clean_numeric` / :func:`clean_boolean` — type-specific cleaners.
* :func:`parse_number` / :func:`parse_boolean` — also exported; defined in
  :mod:`typemonkey.numbers` and :mod:`typemonkey.booleans`.
* :func:`normalize_nulls` / :func:`is_null` — null handling.
* :func:`detect_number_locale` — US vs European number format detection.
* :func:`looks_like_preserve_string` — zip/phone/zero-padded-ID detection.
* :class:`ColumnProfile`, :class:`CleanResult`, :class:`TypeName` — result types.
* :data:`DEFAULT_NULLS`, :data:`TRUE_VALUES`, :data:`FALSE_VALUES` — vocabularies.

See LIMITATIONS.md for deliberate design tradeoffs and SKILL.md for an
LLM-oriented quick reference.
"""

from __future__ import annotations

from .booleans import FALSE_VALUES, TRUE_VALUES, clean_boolean, parse_boolean
from .clean import clean_column
from .infer import infer_type
from .locale import detect_number_locale
from .models import CleanResult, ColumnProfile, TypeName
from .nulls import DEFAULT_NULLS, is_null, normalize_nulls
from .numbers import clean_numeric, parse_number
from .preserve import looks_like_preserve_string

__version__ = "1.0.0"

# Everything re-exported above is part of the public API; keep this list in
# step with the imports and with the module docstring.
__all__ = [
    "infer_type",
    "clean_column",
    "clean_numeric",
    "clean_boolean",
    "parse_number",
    "parse_boolean",
    "normalize_nulls",
    "is_null",
    "detect_number_locale",
    "looks_like_preserve_string",
    "ColumnProfile",
    "CleanResult",
    "TypeName",
    "DEFAULT_NULLS",
    "TRUE_VALUES",
    "FALSE_VALUES",
    "__version__",
]
@@ -0,0 +1,124 @@
1
+ """Boolean vocabulary and normalisation.
2
+
3
+ This module exists because "true" arrives as ``yes``, ``Y``, ``1``, ``on``,
4
+ ``T``, ``true``, ``TRUE`` and a dozen other spellings, and inference needs one
5
+ authoritative mapping from token to ``bool``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+ from .models import CleanResult, TypeName
13
+ from .nulls import is_null
14
+
15
# Canonical truthy / falsy spellings. Lookups trim whitespace and ignore case,
# which is why each spelling appears here exactly once, in lower case.
TRUE_VALUES: frozenset[str] = frozenset(
    (
        "true",
        "t",
        "yes",
        "y",
        "1",
        "on",
        "enabled",
        "active",
    )
)
FALSE_VALUES: frozenset[str] = frozenset(
    (
        "false",
        "f",
        "no",
        "n",
        "0",
        "off",
        "disabled",
        "inactive",
    )
)
# "0" and "1" count as booleans, yet they are equally plausible integer data.
# Inference consults this set so a 0/1 integer column is not labelled boolean.
NUMERIC_BOOLEANS: frozenset[str] = frozenset(("0", "1"))
26
+
27
+
28
@dataclass
class ParsedBoolean:
    """Outcome of parsing a single boolean token.

    Carries the ``bool`` itself plus a flag marking the ``"0"`` / ``"1"``
    spellings, which are just as valid as integer data and are therefore the
    ambiguous case.
    """

    # The resulting ``bool``.
    value: bool
    # ``True`` when the source token was ``"0"`` or ``"1"``.
    numeric: bool
40
+
41
+
42
def _normalise_boolean_token(value: object) -> str:
    """Return ``value`` stringified, whitespace-trimmed and lower-cased."""
    return str(value).strip().lower()


def parse_boolean(
    token: object,
    *,
    true_values=None,
    false_values=None,
) -> ParsedBoolean:
    """Parse one boolean token, raising :class:`ValueError` if unrecognised.

    The token and any override vocabulary entries go through the same
    normalisation (stringify, trim, lower-case), so an override such as
    ``[" Yes ", 1]`` matches exactly the tokens a caller would expect it to.

    Args:
        token: The value to parse. Non-strings are stringified; the result is
            trimmed and lower-cased before lookup.
        true_values: Override truthy vocabulary. Entries are stringified,
            trimmed and lower-cased. Defaults to :data:`TRUE_VALUES`.
        false_values: Override falsy vocabulary, normalised the same way.
            Defaults to :data:`FALSE_VALUES`.

    Returns:
        A :class:`ParsedBoolean`. The truthy vocabulary is consulted first, so
        a token present in both vocabularies parses as ``True``.

    Raises:
        ValueError: If the token is in neither vocabulary.
    """
    if true_values is None:
        trues = TRUE_VALUES
    else:
        trues = frozenset(_normalise_boolean_token(v) for v in true_values)
    if false_values is None:
        falses = FALSE_VALUES
    else:
        falses = frozenset(_normalise_boolean_token(v) for v in false_values)

    key = _normalise_boolean_token(token)
    if key in trues:
        return ParsedBoolean(value=True, numeric=key in NUMERIC_BOOLEANS)
    if key in falses:
        return ParsedBoolean(value=False, numeric=key in NUMERIC_BOOLEANS)
    raise ValueError(f"{token!r} is not a recognised boolean")
72
+
73
+
74
def is_boolean(token: object, **kwargs) -> bool:
    """Report whether ``token`` is accepted by :func:`parse_boolean`.

    Keyword arguments are forwarded to :func:`parse_boolean` unchanged; a
    :class:`ValueError` from it is translated into ``False``.
    """
    try:
        parse_boolean(token, **kwargs)
    except ValueError:
        return False
    return True
81
+
82
+
83
def clean_boolean(
    values,
    *,
    true_values=None,
    false_values=None,
    null_values=None,
) -> CleanResult:
    """Clean a column of messy boolean strings into Python ``bool`` values.

    Args:
        values: The column to clean. Iterated exactly once.
        true_values: Override truthy vocabulary (see :func:`parse_boolean`).
            Any iterable is accepted, including a one-shot generator.
        false_values: Override falsy vocabulary; any iterable is accepted.
        null_values: Null spellings (see :func:`typemonkey.nulls.is_null`);
            recognised nulls become ``None`` and are not counted as failures.

    Returns:
        A :class:`CleanResult` whose ``values`` align 1:1 with ``values`` and
        whose ``failures`` records ``(index, original)`` for non-null tokens
        that matched neither vocabulary.
    """
    # parse_boolean walks the override vocabularies again for every row, so a
    # one-shot iterable would be exhausted after the first row. Materialise
    # each override once up front so every row sees the same vocabulary.
    if true_values is not None:
        true_values = tuple(true_values)
    if false_values is not None:
        false_values = tuple(false_values)

    out: list[object] = []
    failures: list[tuple[int, str]] = []
    null_count = 0
    for i, raw in enumerate(values):
        if is_null(raw, null_values):
            out.append(None)
            null_count += 1
            continue
        try:
            parsed = parse_boolean(raw, true_values=true_values, false_values=false_values)
        except ValueError:
            # Keep the slot so output stays aligned with the input, and record
            # the original text so the caller can tell it apart from a null.
            out.append(None)
            failures.append((i, str(raw)))
            continue
        out.append(parsed.value)
    return CleanResult(
        values=out,
        target_type=TypeName.BOOLEAN,
        null_count=null_count,
        failures=failures,
    )
@@ -0,0 +1,162 @@
1
+ """``clean_column`` — infer (or accept) a target type, then clean to it.
2
+
3
+ This module exists as the one-call convenience entry point: hand it a column
4
+ and it figures out the type and returns cleaned values, or pass ``target_type``
5
+ to force the conversion. It dispatches to the type-specific cleaners in
6
+ :mod:`typemonkey.numbers` and :mod:`typemonkey.booleans`, and to datemonkey for
7
+ dates.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import cleanmonkey
13
+ from datemonkey import parse_dates
14
+
15
+ from .booleans import clean_boolean
16
+ from .infer import infer_type
17
+ from .locale import detect_number_locale
18
+ from .models import CleanResult, TypeName
19
+ from .nulls import is_null
20
+ from .numbers import clean_numeric
21
+
22
# Target types that clean_column hands to clean_numeric.
_NUMERIC = {TypeName.INTEGER, TypeName.FLOAT, TypeName.CURRENCY, TypeName.PERCENTAGE}
23
+
24
+
25
def clean_column(
    values,
    *,
    target_type: TypeName | str | None = None,
    null_values=None,
    locale: str | None = None,
    locale_preference: str | None = None,
    true_values=None,
    false_values=None,
    percent_as_fraction: bool = True,
    integers: bool = True,
) -> CleanResult:
    """Clean a column to ``target_type``, inferring the type when not given.

    The input is copied into a list before anything else happens, so a
    one-shot iterable (generator) is safe: inference and cleaning both work
    from the same snapshot and their counts agree.

    Args:
        values: The column to clean. Consumed exactly once.
        target_type: A :class:`TypeName` (or its string value) to force. When
            ``None`` the type comes from :func:`typemonkey.infer.infer_type`,
            and that profile — including its detected ``locale`` — drives the
            cleaning.
        null_values: Null spellings (see :func:`typemonkey.nulls.is_null`).
        locale: Number locale for numeric targets. When ``None`` the locale
            found by inference is used, or it is detected directly when a
            numeric ``target_type`` was forced.
        locale_preference: ``"us"``/``"eu"`` hint for date parsing.
        true_values: Override truthy boolean vocabulary.
        false_values: Override falsy boolean vocabulary.
        percent_as_fraction: Scale percents by 1/100 (see
            :func:`typemonkey.numbers.clean_numeric`).
        integers: Return whole numbers as ``int`` (see
            :func:`typemonkey.numbers.clean_numeric`).

    Returns:
        A :class:`CleanResult` whose ``values`` align 1:1 with ``values`` and
        whose ``target_type`` echoes the inferred or forced type. For ``STRING``
        targets every non-null value is cleanmonkey-normalised and never a
        failure; for ``NULL`` every value is ``None`` and ``null_count`` equals
        the input length.
    """
    column = list(values)  # single snapshot shared by inference and cleaning

    detected_locale: str | None = None
    if target_type is not None:
        target = TypeName(target_type)
    else:
        profile = infer_type(
            column,
            null_values=null_values,
            locale=locale,
            locale_preference=locale_preference,
            true_values=true_values,
            false_values=false_values,
        )
        target, detected_locale = profile.type, profile.locale

    if target is TypeName.NULL:
        size = len(column)
        return CleanResult(
            values=[None] * size,
            target_type=TypeName.NULL,
            null_count=size,
        )
    if target is TypeName.DATE:
        return _clean_dates(column, null_values, locale_preference)
    if target is TypeName.BOOLEAN:
        return clean_boolean(
            column,
            true_values=true_values,
            false_values=false_values,
            null_values=null_values,
        )
    if target not in _NUMERIC:
        # Anything that is not numeric, boolean, date or null is cleaned as text.
        return _clean_strings(column, null_values)

    # Locale precedence: the caller's explicit choice, then whatever inference
    # detected, then direct detection (forced numeric target, no inference).
    number_locale = locale or detected_locale or detect_number_locale(column)
    return clean_numeric(
        column,
        locale=number_locale,
        null_values=null_values,
        percent_as_fraction=percent_as_fraction,
        integers=integers,
        target_type=target,
    )
113
+
114
+
115
def _clean_dates(values, null_values, locale_preference) -> CleanResult:
    """Parse a date column via datemonkey, keeping nulls and failures apart.

    Nulls are replaced by ``None`` before the batch is handed to
    ``parse_dates`` so that row indices line up. Results are then looked up by
    their ``row_index`` while walking our own indices; a non-null row with no
    result, or with a result whose ``parsed`` is ``None``, is recorded as a
    failure.
    """
    rows = list(values)
    is_missing = [is_null(item, null_values) for item in rows]
    # Our nulls go in as None, which datemonkey treats as a null/failure.
    payload = [None if missing else item for item, missing in zip(rows, is_missing)]
    batch = parse_dates(payload, locale_preference=locale_preference)

    # ``results`` is empty when nothing in the batch parsed, and otherwise has
    # one entry per row tagged with ``row_index``. Indexing by ``row_index``
    # keeps the output 1:1 with the input in both situations.
    result_for_row = {}
    for entry in batch.results:
        result_for_row[entry.row_index] = entry

    cleaned: list[object] = []
    rejected: list[tuple[int, str]] = []
    for idx, (item, missing) in enumerate(zip(rows, is_missing)):
        if missing:
            cleaned.append(None)
            continue
        hit = result_for_row.get(idx)
        parsed = None if hit is None else hit.parsed
        cleaned.append(parsed)
        if parsed is None:
            rejected.append((idx, str(item)))

    return CleanResult(
        values=cleaned,
        target_type=TypeName.DATE,
        null_count=sum(is_missing),
        failures=rejected,
    )
146
+
147
+
148
def _clean_strings(values, null_values) -> CleanResult:
    """Run every non-null value through cleanmonkey; nulls turn into ``None``.

    Non-null values are stringified before being passed to
    ``cleanmonkey.clean``. No failures are recorded for a string column.
    """
    cleaned: list[object] = []
    nulls = 0
    for item in values:
        if not is_null(item, null_values):
            cleaned.append(cleanmonkey.clean(str(item)))
            continue
        cleaned.append(None)
        nulls += 1
    return CleanResult(
        values=cleaned,
        target_type=TypeName.STRING,
        null_count=nulls,
    )