typemonkey 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- typemonkey-1.0.0/LICENSE +21 -0
- typemonkey-1.0.0/PKG-INFO +134 -0
- typemonkey-1.0.0/README.md +87 -0
- typemonkey-1.0.0/pyproject.toml +45 -0
- typemonkey-1.0.0/setup.cfg +4 -0
- typemonkey-1.0.0/src/typemonkey/__init__.py +49 -0
- typemonkey-1.0.0/src/typemonkey/booleans.py +124 -0
- typemonkey-1.0.0/src/typemonkey/clean.py +162 -0
- typemonkey-1.0.0/src/typemonkey/cli.py +100 -0
- typemonkey-1.0.0/src/typemonkey/detectors/__init__.py +16 -0
- typemonkey-1.0.0/src/typemonkey/detectors/base.py +28 -0
- typemonkey-1.0.0/src/typemonkey/detectors/boolean.py +39 -0
- typemonkey-1.0.0/src/typemonkey/detectors/date.py +53 -0
- typemonkey-1.0.0/src/typemonkey/detectors/numeric.py +46 -0
- typemonkey-1.0.0/src/typemonkey/infer.py +291 -0
- typemonkey-1.0.0/src/typemonkey/locale.py +87 -0
- typemonkey-1.0.0/src/typemonkey/models.py +147 -0
- typemonkey-1.0.0/src/typemonkey/nulls.py +107 -0
- typemonkey-1.0.0/src/typemonkey/numbers.py +295 -0
- typemonkey-1.0.0/src/typemonkey/preserve.py +82 -0
- typemonkey-1.0.0/src/typemonkey.egg-info/PKG-INFO +134 -0
- typemonkey-1.0.0/src/typemonkey.egg-info/SOURCES.txt +37 -0
- typemonkey-1.0.0/src/typemonkey.egg-info/dependency_links.txt +1 -0
- typemonkey-1.0.0/src/typemonkey.egg-info/entry_points.txt +2 -0
- typemonkey-1.0.0/src/typemonkey.egg-info/requires.txt +7 -0
- typemonkey-1.0.0/src/typemonkey.egg-info/top_level.txt +1 -0
- typemonkey-1.0.0/tests/test_booleans.py +313 -0
- typemonkey-1.0.0/tests/test_clean.py +225 -0
- typemonkey-1.0.0/tests/test_cli.py +187 -0
- typemonkey-1.0.0/tests/test_consistency.py +166 -0
- typemonkey-1.0.0/tests/test_detectors.py +244 -0
- typemonkey-1.0.0/tests/test_infer.py +312 -0
- typemonkey-1.0.0/tests/test_locale.py +127 -0
- typemonkey-1.0.0/tests/test_metadata.py +33 -0
- typemonkey-1.0.0/tests/test_models.py +265 -0
- typemonkey-1.0.0/tests/test_nulls.py +153 -0
- typemonkey-1.0.0/tests/test_numbers.py +554 -0
- typemonkey-1.0.0/tests/test_preserve.py +155 -0
- typemonkey-1.0.0/tests/test_properties.py +328 -0
typemonkey-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RexBytes
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: typemonkey
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Column type inference and type-aware cleaning: numbers, currency, percentages, booleans, nulls, dates.
|
|
5
|
+
Author-email: RexBytes <pythonic@rexbytes.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 RexBytes
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/RexBytes/typemonkey
|
|
29
|
+
Project-URL: Issues, https://github.com/RexBytes/typemonkey/issues
|
|
30
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
31
|
+
Classifier: Intended Audience :: Developers
|
|
32
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
33
|
+
Classifier: Operating System :: OS Independent
|
|
34
|
+
Classifier: Programming Language :: Python :: 3
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
37
|
+
Requires-Python: >=3.11
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
License-File: LICENSE
|
|
40
|
+
Requires-Dist: datemonkey~=0.1.0
|
|
41
|
+
Requires-Dist: cleanmonkey~=0.1.0
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
45
|
+
Requires-Dist: hypothesis>=6.0; extra == "dev"
|
|
46
|
+
Dynamic: license-file
|
|
47
|
+
|
|
48
|
+
# typemonkey
|
|
49
|
+
|
|
50
|
+
Column type inference and type-aware cleaning for messy tabular data.
|
|
51
|
+
Infer whether a column is an integer, float, currency, percentage, boolean,
|
|
52
|
+
date, or free-text string — then clean it to that type. Numbers buried in
|
|
53
|
+
currency symbols, thousands separators, European decimal commas, accounting
|
|
54
|
+
parentheses, and percent signs come out as plain Python numbers; a column's
|
|
55
|
+
worth of `yes`/`Y`/`1`/`true` come out as `bool`; twenty-plus spellings of
|
|
56
|
+
"null" collapse to `None`.
|
|
57
|
+
|
|
58
|
+
Part of the monkey toolkit. Delegates date detection to
|
|
59
|
+
[`datemonkey`](https://pypi.org/project/datemonkey/) and value normalisation
|
|
60
|
+
to [`cleanmonkey`](https://pypi.org/project/cleanmonkey/) — it does not
|
|
61
|
+
reinvent either.
|
|
62
|
+
|
|
63
|
+
## Install
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install typemonkey
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick start
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from typemonkey import infer_type, clean_numeric, clean_boolean, clean_column
|
|
73
|
+
|
|
74
|
+
profile = infer_type(["$1,234.56", "$2,000.00", "$3.50"])
|
|
75
|
+
profile.type # TypeName.CURRENCY
|
|
76
|
+
profile.confidence # 1.0
|
|
77
|
+
profile.locale # "us"
|
|
78
|
+
|
|
79
|
+
clean_numeric(["$1,234.56", "(50)", "12%", "N/A"]).values
|
|
80
|
+
# [1234.56, -50, 0.12, None] # parens = negative, 12% = 0.12, N/A = null
|
|
81
|
+
|
|
82
|
+
clean_numeric(["1.234,56", "3,50"], locale="eu").values
|
|
83
|
+
# [1234.56, 3.5] # European decimal comma
|
|
84
|
+
|
|
85
|
+
clean_boolean(["yes", "NO", "1", "0", "maybe"]).values
|
|
86
|
+
# [True, False, True, False, None] # "maybe" recorded in .failures
|
|
87
|
+
|
|
88
|
+
clean_column(["01234", "07090", "02139"]).values
|
|
89
|
+
# ['01234', '07090', '02139'] # zero-padded IDs preserved as strings
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Every entry point returns a typed dataclass (`ColumnProfile`, `CleanResult`),
|
|
93
|
+
not a dict. `CleanResult.failures` lists `(index, original)` for non-null
|
|
94
|
+
values that didn't parse, so a genuinely missing value (a null) is never confused with one that was present but unparseable.
|
|
95
|
+
|
|
96
|
+
## What it recognises
|
|
97
|
+
|
|
98
|
+
- **Numbers** — `int`, `float`, with thousands separators, apostrophe/space
|
|
99
|
+
grouping, leading `+`/`-`, accounting `(parentheses)` negatives.
|
|
100
|
+
- **Currency** — `$ € £ ¥ ₹ ...` symbols and ISO codes (`USD`, `EUR`, ...).
|
|
101
|
+
- **Percentages** — `"12%"`, `"8 %"` → `0.12`, `0.08` (or keep as `12`, `8`).
|
|
102
|
+
- **Booleans** — `true/false`, `t/f`, `yes/no`, `y/n`, `on/off`, `1/0`.
|
|
103
|
+
- **Dates** — via datemonkey (ISO, US/EU slash and dash, ambiguity reporting).
|
|
104
|
+
- **Nulls** — 20+ spellings (`N/A`, `#N/A`, `null`, `none`, `-`, `unknown`, …).
|
|
105
|
+
- **Preserve-as-string** — zero-padded IDs, Zip+4, phone numbers.
|
|
106
|
+
- **Locale** — US `1,234.56` vs European `1.234,56`, auto-detected per column.
|
|
107
|
+
|
|
108
|
+
## CLI
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
printf '$1,234.56\n$2,000.00\n$3.50\n' | typemonkey profile # JSON report
|
|
112
|
+
printf '12%%\n8 %%\nN/A\n' | typemonkey clean # cleaned values
|
|
113
|
+
typemonkey clean --type integer column.txt
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
`typemonkey profile` prints a JSON `ColumnProfile`; `typemonkey clean` prints
|
|
117
|
+
one cleaned value per line (blank for nulls) and exits non-zero if any non-null
|
|
118
|
+
value failed to parse.
|
|
119
|
+
|
|
120
|
+
## Using with AI assistants
|
|
121
|
+
|
|
122
|
+
See [`SKILL.md`](SKILL.md) for an LLM-oriented quick reference (decision table,
|
|
123
|
+
worked examples, anti-patterns).
|
|
124
|
+
|
|
125
|
+
## Deliberate tradeoffs
|
|
126
|
+
|
|
127
|
+
Some behaviour is intentional and might look like a bug — bare 5-digit numbers
|
|
128
|
+
aren't treated as zips, all-`0`/`1` columns are integers not booleans, Excel
|
|
129
|
+
serials report numeric. See [`LIMITATIONS.md`](LIMITATIONS.md) for the
|
|
130
|
+
rationale and escape hatch on each.
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
MIT
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# typemonkey
|
|
2
|
+
|
|
3
|
+
Column type inference and type-aware cleaning for messy tabular data.
|
|
4
|
+
Infer whether a column is an integer, float, currency, percentage, boolean,
|
|
5
|
+
date, or free-text string — then clean it to that type. Numbers buried in
|
|
6
|
+
currency symbols, thousands separators, European decimal commas, accounting
|
|
7
|
+
parentheses, and percent signs come out as plain Python numbers; a column's
|
|
8
|
+
worth of `yes`/`Y`/`1`/`true` come out as `bool`; twenty-plus spellings of
|
|
9
|
+
"null" collapse to `None`.
|
|
10
|
+
|
|
11
|
+
Part of the monkey toolkit. Delegates date detection to
|
|
12
|
+
[`datemonkey`](https://pypi.org/project/datemonkey/) and value normalisation
|
|
13
|
+
to [`cleanmonkey`](https://pypi.org/project/cleanmonkey/) — it does not
|
|
14
|
+
reinvent either.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install typemonkey
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Quick start
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from typemonkey import infer_type, clean_numeric, clean_boolean, clean_column
|
|
26
|
+
|
|
27
|
+
profile = infer_type(["$1,234.56", "$2,000.00", "$3.50"])
|
|
28
|
+
profile.type # TypeName.CURRENCY
|
|
29
|
+
profile.confidence # 1.0
|
|
30
|
+
profile.locale # "us"
|
|
31
|
+
|
|
32
|
+
clean_numeric(["$1,234.56", "(50)", "12%", "N/A"]).values
|
|
33
|
+
# [1234.56, -50, 0.12, None] # parens = negative, 12% = 0.12, N/A = null
|
|
34
|
+
|
|
35
|
+
clean_numeric(["1.234,56", "3,50"], locale="eu").values
|
|
36
|
+
# [1234.56, 3.5] # European decimal comma
|
|
37
|
+
|
|
38
|
+
clean_boolean(["yes", "NO", "1", "0", "maybe"]).values
|
|
39
|
+
# [True, False, True, False, None] # "maybe" recorded in .failures
|
|
40
|
+
|
|
41
|
+
clean_column(["01234", "07090", "02139"]).values
|
|
42
|
+
# ['01234', '07090', '02139'] # zero-padded IDs preserved as strings
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Every entry point returns a typed dataclass (`ColumnProfile`, `CleanResult`),
|
|
46
|
+
not a dict. `CleanResult.failures` lists `(index, original)` for non-null
|
|
47
|
+
values that didn't parse, so a genuinely missing value (a null) is never confused with one that was present but unparseable.
|
|
48
|
+
|
|
49
|
+
## What it recognises
|
|
50
|
+
|
|
51
|
+
- **Numbers** — `int`, `float`, with thousands separators, apostrophe/space
|
|
52
|
+
grouping, leading `+`/`-`, accounting `(parentheses)` negatives.
|
|
53
|
+
- **Currency** — `$ € £ ¥ ₹ ...` symbols and ISO codes (`USD`, `EUR`, ...).
|
|
54
|
+
- **Percentages** — `"12%"`, `"8 %"` → `0.12`, `0.08` (or keep as `12`, `8`).
|
|
55
|
+
- **Booleans** — `true/false`, `t/f`, `yes/no`, `y/n`, `on/off`, `1/0`.
|
|
56
|
+
- **Dates** — via datemonkey (ISO, US/EU slash and dash, ambiguity reporting).
|
|
57
|
+
- **Nulls** — 20+ spellings (`N/A`, `#N/A`, `null`, `none`, `-`, `unknown`, …).
|
|
58
|
+
- **Preserve-as-string** — zero-padded IDs, Zip+4, phone numbers.
|
|
59
|
+
- **Locale** — US `1,234.56` vs European `1.234,56`, auto-detected per column.
|
|
60
|
+
|
|
61
|
+
## CLI
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
printf '$1,234.56\n$2,000.00\n$3.50\n' | typemonkey profile # JSON report
|
|
65
|
+
printf '12%%\n8 %%\nN/A\n' | typemonkey clean # cleaned values
|
|
66
|
+
typemonkey clean --type integer column.txt
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
`typemonkey profile` prints a JSON `ColumnProfile`; `typemonkey clean` prints
|
|
70
|
+
one cleaned value per line (blank for nulls) and exits non-zero if any non-null
|
|
71
|
+
value failed to parse.
|
|
72
|
+
|
|
73
|
+
## Using with AI assistants
|
|
74
|
+
|
|
75
|
+
See [`SKILL.md`](SKILL.md) for an LLM-oriented quick reference (decision table,
|
|
76
|
+
worked examples, anti-patterns).
|
|
77
|
+
|
|
78
|
+
## Deliberate tradeoffs
|
|
79
|
+
|
|
80
|
+
Some behaviour is intentional and might look like a bug — bare 5-digit numbers
|
|
81
|
+
aren't treated as zips, all-`0`/`1` columns are integers not booleans, Excel
|
|
82
|
+
serials report numeric. See [`LIMITATIONS.md`](LIMITATIONS.md) for the
|
|
83
|
+
rationale and escape hatch on each.
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
MIT
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "typemonkey"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Column type inference and type-aware cleaning: numbers, currency, percentages, booleans, nulls, dates."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
authors = [{ name = "RexBytes", email = "pythonic@rexbytes.com" }]
|
|
12
|
+
requires-python = ">=3.11"
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 5 - Production/Stable",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
# Pinned to the 0.1.x line: typemonkey reads detailed attributes off these
|
|
24
|
+
# pre-1.0 result objects (datemonkey's .format.pattern / .confidence /
|
|
25
|
+
# .results[].row_index, etc.), so a 0.2 could break us. Bump deliberately.
|
|
26
|
+
"datemonkey~=0.1.0",
|
|
27
|
+
"cleanmonkey~=0.1.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = ["pytest>=7.0", "pytest-cov", "hypothesis>=6.0"]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
typemonkey = "typemonkey.cli:main"
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/RexBytes/typemonkey"
|
|
38
|
+
Issues = "https://github.com/RexBytes/typemonkey/issues"
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
where = ["src"]
|
|
42
|
+
|
|
43
|
+
[tool.pytest.ini_options]
|
|
44
|
+
testpaths = ["tests"]
|
|
45
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""typemonkey — column type inference and type-aware cleaning.
|
|
2
|
+
|
|
3
|
+
Public API (import ``from typemonkey import X``):
|
|
4
|
+
|
|
5
|
+
* :func:`infer_type` — profile a column, returning a :class:`ColumnProfile`.
|
|
6
|
+
* :func:`clean_column` — clean a column to an inferred or given type.
|
|
7
|
+
* :func:`clean_numeric` / :func:`clean_boolean` — type-specific cleaners.
|
|
8
|
+
* :func:`normalize_nulls` / :func:`is_null` — null handling.
|
|
9
|
+
* :func:`detect_number_locale` — US vs European number format detection.
|
|
10
|
+
* :func:`looks_like_preserve_string` — zip/phone/zero-padded-ID detection.
|
|
11
|
+
* :class:`ColumnProfile`, :class:`CleanResult`, :class:`TypeName` — result types.
|
|
12
|
+
* :data:`DEFAULT_NULLS`, :data:`TRUE_VALUES`, :data:`FALSE_VALUES` — vocabularies.
|
|
13
|
+
|
|
14
|
+
See LIMITATIONS.md for deliberate design tradeoffs and SKILL.md for an
|
|
15
|
+
LLM-oriented quick reference.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from .booleans import FALSE_VALUES, TRUE_VALUES, clean_boolean, parse_boolean
|
|
21
|
+
from .clean import clean_column
|
|
22
|
+
from .infer import infer_type
|
|
23
|
+
from .locale import detect_number_locale
|
|
24
|
+
from .models import CleanResult, ColumnProfile, TypeName
|
|
25
|
+
from .nulls import DEFAULT_NULLS, is_null, normalize_nulls
|
|
26
|
+
from .numbers import clean_numeric, parse_number
|
|
27
|
+
from .preserve import looks_like_preserve_string
|
|
28
|
+
|
|
29
|
+
__version__ = "1.0.0"
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"infer_type",
|
|
33
|
+
"clean_column",
|
|
34
|
+
"clean_numeric",
|
|
35
|
+
"clean_boolean",
|
|
36
|
+
"parse_number",
|
|
37
|
+
"parse_boolean",
|
|
38
|
+
"normalize_nulls",
|
|
39
|
+
"is_null",
|
|
40
|
+
"detect_number_locale",
|
|
41
|
+
"looks_like_preserve_string",
|
|
42
|
+
"ColumnProfile",
|
|
43
|
+
"CleanResult",
|
|
44
|
+
"TypeName",
|
|
45
|
+
"DEFAULT_NULLS",
|
|
46
|
+
"TRUE_VALUES",
|
|
47
|
+
"FALSE_VALUES",
|
|
48
|
+
"__version__",
|
|
49
|
+
]
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Boolean vocabulary and normalisation.
|
|
2
|
+
|
|
3
|
+
This module exists because "true" arrives as ``yes``, ``Y``, ``1``, ``on``,
|
|
4
|
+
``T``, ``true``, ``TRUE`` and a dozen other spellings, and inference needs one
|
|
5
|
+
authoritative mapping from token to ``bool``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from .models import CleanResult, TypeName
|
|
13
|
+
from .nulls import is_null
|
|
14
|
+
|
|
15
|
+
# Truthy / falsy spellings. Lookups trim whitespace and fold case before
# consulting these sets, which is why each spelling appears exactly once and
# in lower case.
TRUE_VALUES: frozenset[str] = frozenset(
    ("true", "t", "yes", "y", "1", "on", "enabled", "active")
)
FALSE_VALUES: frozenset[str] = frozenset(
    ("false", "f", "no", "n", "0", "off", "disabled", "inactive")
)
# "0" and "1" are accepted as booleans yet are just as valid as integers.
# Inference consults this set so that a 0/1 integer column is not mislabelled
# as boolean.
NUMERIC_BOOLEANS: frozenset[str] = frozenset(("0", "1"))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class ParsedBoolean:
    """Outcome of parsing a single boolean token.

    Attributes:
        value: The ``bool`` the token maps to.
        numeric: ``True`` when the token was the digit ``"0"`` or ``"1"``,
            i.e. the ambiguous spelling that could equally be integer data.
    """

    value: bool
    numeric: bool
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_boolean(
|
|
43
|
+
token: object,
|
|
44
|
+
*,
|
|
45
|
+
true_values=None,
|
|
46
|
+
false_values=None,
|
|
47
|
+
) -> ParsedBoolean:
|
|
48
|
+
"""Parse one boolean token, raising :class:`ValueError` if unrecognised.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
token: The value to parse. Non-strings are stringified; the result is
|
|
52
|
+
trimmed and lower-cased before lookup.
|
|
53
|
+
true_values: Override truthy vocabulary (case-insensitive). Defaults
|
|
54
|
+
to :data:`TRUE_VALUES`.
|
|
55
|
+
false_values: Override falsy vocabulary. Defaults to
|
|
56
|
+
:data:`FALSE_VALUES`.
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
A :class:`ParsedBoolean`.
|
|
60
|
+
|
|
61
|
+
Raises:
|
|
62
|
+
ValueError: If the token is in neither vocabulary.
|
|
63
|
+
"""
|
|
64
|
+
trues = TRUE_VALUES if true_values is None else frozenset(v.lower() for v in true_values)
|
|
65
|
+
falses = FALSE_VALUES if false_values is None else frozenset(v.lower() for v in false_values)
|
|
66
|
+
key = str(token).strip().lower()
|
|
67
|
+
if key in trues:
|
|
68
|
+
return ParsedBoolean(value=True, numeric=key in NUMERIC_BOOLEANS)
|
|
69
|
+
if key in falses:
|
|
70
|
+
return ParsedBoolean(value=False, numeric=key in NUMERIC_BOOLEANS)
|
|
71
|
+
raise ValueError(f"{token!r} is not a recognised boolean")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def is_boolean(token: object, **kwargs) -> bool:
    """Report whether ``token`` is a recognised boolean spelling.

    ``kwargs`` are forwarded unchanged to :func:`parse_boolean`; the result is
    ``True`` exactly when that call does not raise :class:`ValueError`.
    """
    try:
        parse_boolean(token, **kwargs)
    except ValueError:
        return False
    return True
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def clean_boolean(
    values,
    *,
    true_values=None,
    false_values=None,
    null_values=None,
) -> CleanResult:
    """Clean a column of messy boolean strings into Python ``bool`` values.

    Args:
        values: The column to clean.
        true_values: Override truthy vocabulary (see :func:`parse_boolean`).
            May be any iterable, including a one-shot generator.
        false_values: Override falsy vocabulary. May be any iterable.
        null_values: Null spellings (see :func:`typemonkey.nulls.is_null`);
            recognised nulls become ``None`` and are not counted as failures.
            Passed through to ``is_null`` untouched.

    Returns:
        A :class:`CleanResult` whose ``values`` align 1:1 with ``values`` and
        whose ``failures`` records ``(index, original)`` for non-null tokens
        that matched neither vocabulary.
    """
    # Snapshot the override vocabularies once. parse_boolean iterates them on
    # every call, so a one-shot iterable would be exhausted after the first
    # row and every later row would be reported as a failure.
    if true_values is not None:
        true_values = tuple(true_values)
    if false_values is not None:
        false_values = tuple(false_values)

    out: list[object] = []
    failures: list[tuple[int, str]] = []
    null_count = 0
    for i, raw in enumerate(values):
        # Nulls are checked before parsing so they never land in ``failures``.
        if is_null(raw, null_values):
            out.append(None)
            null_count += 1
            continue
        try:
            parsed = parse_boolean(raw, true_values=true_values, false_values=false_values)
        except ValueError:
            # Keep the slot (as None) so output stays aligned with the input,
            # and remember the original text for the caller.
            out.append(None)
            failures.append((i, str(raw)))
            continue
        out.append(parsed.value)
    return CleanResult(
        values=out,
        target_type=TypeName.BOOLEAN,
        null_count=null_count,
        failures=failures,
    )
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""``clean_column`` — infer (or accept) a target type, then clean to it.
|
|
2
|
+
|
|
3
|
+
This module exists as the one-call convenience entry point: hand it a column
|
|
4
|
+
and it figures out the type and returns cleaned values, or pass ``target_type``
|
|
5
|
+
to force the conversion. It dispatches to the type-specific cleaners in
|
|
6
|
+
:mod:`typemonkey.numbers` and :mod:`typemonkey.booleans`, and to datemonkey for
|
|
7
|
+
dates.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import cleanmonkey
|
|
13
|
+
from datemonkey import parse_dates
|
|
14
|
+
|
|
15
|
+
from .booleans import clean_boolean
|
|
16
|
+
from .infer import infer_type
|
|
17
|
+
from .locale import detect_number_locale
|
|
18
|
+
from .models import CleanResult, TypeName
|
|
19
|
+
from .nulls import is_null
|
|
20
|
+
from .numbers import clean_numeric
|
|
21
|
+
|
|
22
|
+
# Target types that are all handled by the numeric cleaner.
_NUMERIC = {
    TypeName.INTEGER,
    TypeName.FLOAT,
    TypeName.CURRENCY,
    TypeName.PERCENTAGE,
}


def clean_column(
    values,
    *,
    target_type: TypeName | str | None = None,
    null_values=None,
    locale: str | None = None,
    locale_preference: str | None = None,
    true_values=None,
    false_values=None,
    percent_as_fraction: bool = True,
    integers: bool = True,
) -> CleanResult:
    """Clean a column to a forced or inferred type in a single call.

    The input is copied into a list before anything else happens, so a
    generator can be passed safely: type inference and the cleaning pass both
    work from that same list.

    Args:
        values: The column to clean. Iterated exactly once.
        target_type: A :class:`TypeName` member or its string value. When
            omitted, :func:`typemonkey.infer.infer_type` chooses the type and
            the locale it reports is reused for numeric cleaning.
        null_values: Null spellings (see :func:`typemonkey.nulls.is_null`).
        locale: Number locale for numeric targets. If not given, the locale
            from inference is used; if there was no inference step, it is
            detected from the data with ``detect_number_locale``.
        locale_preference: ``"us"``/``"eu"`` hint for date parsing.
        true_values: Override truthy boolean vocabulary.
        false_values: Override falsy boolean vocabulary.
        percent_as_fraction: Scale percents by 1/100 (see
            :func:`typemonkey.numbers.clean_numeric`).
        integers: Return whole numbers as ``int`` (see
            :func:`typemonkey.numbers.clean_numeric`).

    Returns:
        A :class:`CleanResult` aligned 1:1 with the input whose
        ``target_type`` is the forced or inferred type. ``STRING`` targets
        are cleanmonkey-normalised and produce no failures; a ``NULL`` target
        yields ``None`` for every row with ``null_count`` equal to the input
        length.
    """
    column = list(values)  # single materialisation keeps generators safe

    if target_type is not None:
        resolved = TypeName(target_type)
        detected_locale: str | None = None
    else:
        profile = infer_type(
            column,
            null_values=null_values,
            locale=locale,
            locale_preference=locale_preference,
            true_values=true_values,
            false_values=false_values,
        )
        resolved = profile.type
        detected_locale = profile.locale

    if resolved in _NUMERIC:
        # Precedence: explicit argument, then inference, then direct
        # detection. ``or`` keeps detection lazy, so it only runs when both
        # earlier sources are empty.
        number_locale = locale or detected_locale or detect_number_locale(column)
        return clean_numeric(
            column,
            locale=number_locale,
            null_values=null_values,
            percent_as_fraction=percent_as_fraction,
            integers=integers,
            target_type=resolved,
        )

    if resolved is TypeName.BOOLEAN:
        return clean_boolean(
            column,
            true_values=true_values,
            false_values=false_values,
            null_values=null_values,
        )

    if resolved is TypeName.DATE:
        return _clean_dates(column, null_values, locale_preference)

    if resolved is TypeName.NULL:
        row_count = len(column)
        return CleanResult(
            values=[None] * row_count,
            target_type=TypeName.NULL,
            null_count=row_count,
        )

    # Anything else is treated as a string column.
    return _clean_strings(column, null_values)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _clean_dates(values, null_values, locale_preference) -> CleanResult:
    """Parse a date column with datemonkey, keeping nulls apart from failures.

    Rows recognised as null by ``is_null`` become ``None`` and are counted in
    ``null_count``. Every other row that has no parsed result becomes ``None``
    and is recorded in ``failures`` as ``(index, original)``.
    """
    rows = list(values)
    missing_flags = [is_null(row, null_values) for row in rows]
    # Our nulls are handed to datemonkey as None so that its row indices stay
    # aligned with ours.
    payload = [
        None if missing else row
        for row, missing in zip(rows, missing_flags)
    ]
    batch = parse_dates(payload, locale_preference=locale_preference)

    # ``results`` is empty when nothing in the batch parsed; otherwise each
    # entry carries its ``row_index``. Keying on that index and walking our
    # own positions keeps the output 1:1 with the input in both situations.
    results_by_row = {}
    for entry in batch.results:
        results_by_row[entry.row_index] = entry

    cleaned: list[object] = []
    failures: list[tuple[int, str]] = []
    for position, (row, missing) in enumerate(zip(rows, missing_flags)):
        if missing:
            cleaned.append(None)
            continue
        entry = results_by_row.get(position)
        if entry is None or entry.parsed is None:
            cleaned.append(None)
            failures.append((position, str(row)))
        else:
            cleaned.append(entry.parsed)

    return CleanResult(
        values=cleaned,
        target_type=TypeName.DATE,
        null_count=sum(missing_flags),
        failures=failures,
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _clean_strings(values, null_values) -> CleanResult:
    """Run every non-null value through ``cleanmonkey.clean``.

    Values recognised as null by ``is_null`` are emitted as ``None`` and
    counted; all other values are stringified and normalised. No failures are
    ever recorded for a string column.
    """
    cleaned: list[object] = []
    missing = 0
    for item in values:
        if not is_null(item, null_values):
            cleaned.append(cleanmonkey.clean(str(item)))
            continue
        cleaned.append(None)
        missing += 1
    return CleanResult(
        values=cleaned,
        target_type=TypeName.STRING,
        null_count=missing,
    )
|