typemonkey 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- typemonkey/__init__.py +49 -0
- typemonkey/booleans.py +124 -0
- typemonkey/clean.py +162 -0
- typemonkey/cli.py +100 -0
- typemonkey/detectors/__init__.py +16 -0
- typemonkey/detectors/base.py +28 -0
- typemonkey/detectors/boolean.py +39 -0
- typemonkey/detectors/date.py +53 -0
- typemonkey/detectors/numeric.py +46 -0
- typemonkey/infer.py +291 -0
- typemonkey/locale.py +87 -0
- typemonkey/models.py +147 -0
- typemonkey/nulls.py +107 -0
- typemonkey/numbers.py +295 -0
- typemonkey/preserve.py +82 -0
- typemonkey-1.0.0.dist-info/METADATA +134 -0
- typemonkey-1.0.0.dist-info/RECORD +21 -0
- typemonkey-1.0.0.dist-info/WHEEL +5 -0
- typemonkey-1.0.0.dist-info/entry_points.txt +2 -0
- typemonkey-1.0.0.dist-info/licenses/LICENSE +21 -0
- typemonkey-1.0.0.dist-info/top_level.txt +1 -0
typemonkey/__init__.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""typemonkey — column type inference and type-aware cleaning.
|
|
2
|
+
|
|
3
|
+
Public API (import ``from typemonkey import X``):
|
|
4
|
+
|
|
5
|
+
* :func:`infer_type` — profile a column, returning a :class:`ColumnProfile`.
|
|
6
|
+
* :func:`clean_column` — clean a column to an inferred or given type.
|
|
7
|
+
* :func:`clean_numeric` / :func:`clean_boolean` — type-specific cleaners.
|
|
8
|
+
* :func:`normalize_nulls` / :func:`is_null` — null handling.
|
|
9
|
+
* :func:`detect_number_locale` — US vs European number format detection.
|
|
10
|
+
* :func:`looks_like_preserve_string` — zip/phone/zero-padded-ID detection.
|
|
11
|
+
* :class:`ColumnProfile`, :class:`CleanResult`, :class:`TypeName` — result types.
|
|
12
|
+
* :data:`DEFAULT_NULLS`, :data:`TRUE_VALUES`, :data:`FALSE_VALUES` — vocabularies.
|
|
13
|
+
|
|
14
|
+
See LIMITATIONS.md for deliberate design tradeoffs and SKILL.md for an
|
|
15
|
+
LLM-oriented quick reference.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from .booleans import FALSE_VALUES, TRUE_VALUES, clean_boolean, parse_boolean
|
|
21
|
+
from .clean import clean_column
|
|
22
|
+
from .infer import infer_type
|
|
23
|
+
from .locale import detect_number_locale
|
|
24
|
+
from .models import CleanResult, ColumnProfile, TypeName
|
|
25
|
+
from .nulls import DEFAULT_NULLS, is_null, normalize_nulls
|
|
26
|
+
from .numbers import clean_numeric, parse_number
|
|
27
|
+
from .preserve import looks_like_preserve_string
|
|
28
|
+
|
|
29
|
+
__version__ = "1.0.0"
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"infer_type",
|
|
33
|
+
"clean_column",
|
|
34
|
+
"clean_numeric",
|
|
35
|
+
"clean_boolean",
|
|
36
|
+
"parse_number",
|
|
37
|
+
"parse_boolean",
|
|
38
|
+
"normalize_nulls",
|
|
39
|
+
"is_null",
|
|
40
|
+
"detect_number_locale",
|
|
41
|
+
"looks_like_preserve_string",
|
|
42
|
+
"ColumnProfile",
|
|
43
|
+
"CleanResult",
|
|
44
|
+
"TypeName",
|
|
45
|
+
"DEFAULT_NULLS",
|
|
46
|
+
"TRUE_VALUES",
|
|
47
|
+
"FALSE_VALUES",
|
|
48
|
+
"__version__",
|
|
49
|
+
]
|
typemonkey/booleans.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Boolean vocabulary and normalisation.
|
|
2
|
+
|
|
3
|
+
This module exists because "true" arrives as ``yes``, ``Y``, ``1``, ``on``,
|
|
4
|
+
``T``, ``true``, ``TRUE`` and a dozen other spellings, and inference needs one
|
|
5
|
+
authoritative mapping from token to ``bool``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from .models import CleanResult, TypeName
|
|
13
|
+
from .nulls import is_null
|
|
14
|
+
|
|
15
|
+
# Canonical truthy / falsy spellings. Lookups trim and lower-case the incoming
# token before testing membership, which is why every spelling appears here in
# exactly one casing.
TRUE_VALUES: frozenset[str] = frozenset(
    ("true", "t", "yes", "y", "1", "on", "enabled", "active")
)
FALSE_VALUES: frozenset[str] = frozenset(
    ("false", "f", "no", "n", "0", "off", "disabled", "inactive")
)
# "0" and "1" are boolean tokens that are just as plausibly integer data. This
# set lets callers single out that ambiguous pair so a 0/1 integer column is
# not mislabelled as boolean.
NUMERIC_BOOLEANS: frozenset[str] = frozenset(("0", "1"))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class ParsedBoolean:
    """Result of interpreting a single token as a boolean.

    Attributes:
        value: The ``bool`` the token maps to.
        numeric: Whether the source token was the digit ``"0"`` or ``"1"``.
            Those tokens are ambiguous because they are also valid integer
            data.
    """

    value: bool
    numeric: bool
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_boolean(
    token: object,
    *,
    true_values=None,
    false_values=None,
) -> ParsedBoolean:
    """Parse one boolean token, raising :class:`ValueError` if unrecognised.

    The token and every entry of an override vocabulary go through the same
    normalisation (stringify, trim, lower-case). An override such as
    ``[" Yes "]`` or ``[1]`` therefore matches the tokens it describes instead
    of silently never matching (or raising on a non-string entry).

    Args:
        token: The value to parse. Non-strings are stringified; the result is
            trimmed and lower-cased before lookup.
        true_values: Override truthy vocabulary (case-insensitive, surrounding
            whitespace ignored). Defaults to :data:`TRUE_VALUES`.
        false_values: Override falsy vocabulary, normalised the same way.
            Defaults to :data:`FALSE_VALUES`.

    Returns:
        A :class:`ParsedBoolean` whose ``numeric`` flag is set when the
        normalised token is ``"0"`` or ``"1"``.

    Raises:
        ValueError: If the token is in neither vocabulary.
    """
    # The built-in vocabularies are already normalised; only overrides need it.
    if true_values is None:
        trues = TRUE_VALUES
    else:
        trues = frozenset(str(v).strip().lower() for v in true_values)
    if false_values is None:
        falses = FALSE_VALUES
    else:
        falses = frozenset(str(v).strip().lower() for v in false_values)

    key = str(token).strip().lower()
    # Truthy vocabulary is consulted first, so a spelling present in both
    # overrides resolves to True.
    if key in trues:
        return ParsedBoolean(value=True, numeric=key in NUMERIC_BOOLEANS)
    if key in falses:
        return ParsedBoolean(value=False, numeric=key in NUMERIC_BOOLEANS)
    raise ValueError(f"{token!r} is not a recognised boolean")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def is_boolean(token: object, **kwargs) -> bool:
    """Report whether ``token`` is a recognised boolean spelling.

    Keyword arguments are forwarded unchanged to :func:`parse_boolean`. A
    ``ValueError`` from that call means "not a boolean"; a normal return means
    the token parsed.
    """
    try:
        parse_boolean(token, **kwargs)
    except ValueError:
        return False
    else:
        return True
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def clean_boolean(
    values,
    *,
    true_values=None,
    false_values=None,
    null_values=None,
) -> CleanResult:
    """Convert a column of boolean spellings into ``bool`` / ``None`` values.

    Args:
        values: The column to clean.
        true_values: Override truthy vocabulary (see :func:`parse_boolean`).
        false_values: Override falsy vocabulary.
        null_values: Null spellings (see :func:`typemonkey.nulls.is_null`).
            Recognised nulls become ``None`` and are counted in
            ``null_count``, not in ``failures``.

    Returns:
        A :class:`CleanResult` with one output per input, in order. Non-null
        tokens that match neither vocabulary are emitted as ``None`` and
        recorded in ``failures`` as ``(index, original)``.
    """
    cleaned: list[object] = []
    rejected: list[tuple[int, str]] = []
    nulls_seen = 0

    for position, item in enumerate(values):
        converted = None  # stays None for both nulls and unparseable tokens
        if is_null(item, null_values):
            nulls_seen += 1
        else:
            try:
                converted = parse_boolean(
                    item, true_values=true_values, false_values=false_values
                ).value
            except ValueError:
                rejected.append((position, str(item)))
        cleaned.append(converted)

    return CleanResult(
        values=cleaned,
        target_type=TypeName.BOOLEAN,
        null_count=nulls_seen,
        failures=rejected,
    )
|
typemonkey/clean.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""``clean_column`` — infer (or accept) a target type, then clean to it.
|
|
2
|
+
|
|
3
|
+
This module exists as the one-call convenience entry point: hand it a column
|
|
4
|
+
and it figures out the type and returns cleaned values, or pass ``target_type``
|
|
5
|
+
to force the conversion. It dispatches to the type-specific cleaners in
|
|
6
|
+
:mod:`typemonkey.numbers` and :mod:`typemonkey.booleans`, and to datemonkey for
|
|
7
|
+
dates.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import cleanmonkey
|
|
13
|
+
from datemonkey import parse_dates
|
|
14
|
+
|
|
15
|
+
from .booleans import clean_boolean
|
|
16
|
+
from .infer import infer_type
|
|
17
|
+
from .locale import detect_number_locale
|
|
18
|
+
from .models import CleanResult, TypeName
|
|
19
|
+
from .nulls import is_null
|
|
20
|
+
from .numbers import clean_numeric
|
|
21
|
+
|
|
22
|
+
_NUMERIC = {TypeName.INTEGER, TypeName.FLOAT, TypeName.CURRENCY, TypeName.PERCENTAGE}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def clean_column(
    values,
    *,
    target_type: TypeName | str | None = None,
    null_values=None,
    locale: str | None = None,
    locale_preference: str | None = None,
    true_values=None,
    false_values=None,
    percent_as_fraction: bool = True,
    integers: bool = True,
) -> CleanResult:
    """Clean a column to a forced or inferred type.

    The input is copied into a list before anything else happens, so a
    one-shot iterable is consumed exactly once and inference and cleaning
    operate on the same data.

    Args:
        values: The column to clean.
        target_type: A :class:`TypeName` (or its string value) to force. When
            ``None``, :func:`typemonkey.infer.infer_type` chooses the type and
            its reported ``locale`` is reused for numeric cleaning.
        null_values: Null spellings (see :func:`typemonkey.nulls.is_null`).
        locale: Number locale for numeric targets. A falsy value falls back to
            the locale inference reported, then to
            :func:`detect_number_locale` on the data.
        locale_preference: ``"us"``/``"eu"`` hint passed to inference and to
            date parsing.
        true_values: Override truthy boolean vocabulary.
        false_values: Override falsy boolean vocabulary.
        percent_as_fraction: Forwarded to
            :func:`typemonkey.numbers.clean_numeric`.
        integers: Forwarded to :func:`typemonkey.numbers.clean_numeric`.

    Returns:
        A :class:`CleanResult` from the cleaner matching the resolved type.
        A ``NULL`` target yields all ``None`` values with ``null_count`` equal
        to the input length; any type not handled explicitly is cleaned as a
        string column.
    """
    column = list(values)  # single pass over the caller's iterable

    if target_type is not None:
        resolved = TypeName(target_type)
        detected_locale = None
    else:
        report = infer_type(
            column,
            null_values=null_values,
            locale=locale,
            locale_preference=locale_preference,
            true_values=true_values,
            false_values=false_values,
        )
        resolved = report.type
        detected_locale = report.locale

    if resolved in _NUMERIC:
        # Precedence: caller's locale, then inference's, then direct detection
        # (the latter only runs when the first two are falsy).
        number_locale = locale or detected_locale or detect_number_locale(column)
        return clean_numeric(
            column,
            locale=number_locale,
            null_values=null_values,
            percent_as_fraction=percent_as_fraction,
            integers=integers,
            target_type=resolved,
        )

    if resolved is TypeName.BOOLEAN:
        return clean_boolean(
            column,
            true_values=true_values,
            false_values=false_values,
            null_values=null_values,
        )

    if resolved is TypeName.DATE:
        return _clean_dates(column, null_values, locale_preference)

    if resolved is TypeName.NULL:
        size = len(column)
        return CleanResult(
            values=[None] * size,
            target_type=TypeName.NULL,
            null_count=size,
        )

    return _clean_strings(column, null_values)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _clean_dates(values, null_values, locale_preference) -> CleanResult:
    """Parse a date column with datemonkey, keeping nulls apart from failures.

    Args:
        values: The column to parse.
        null_values: Null spellings passed to :func:`is_null`.
        locale_preference: Hint forwarded to ``parse_dates``.

    Returns:
        A :class:`CleanResult` with one output per input. Nulls become
        ``None`` and count towards ``null_count``; non-null rows without a
        parsed value become ``None`` and are listed in ``failures``.
    """
    rows = list(values)
    is_missing = [is_null(item, null_values) for item in rows]

    # Our nulls are handed to datemonkey as None so positions stay aligned
    # with the input rows.
    payload = [None if missing else item for item, missing in zip(rows, is_missing)]
    parsed_batch = parse_dates(payload, locale_preference=locale_preference)

    # Results are looked up by their ``row_index`` rather than by position, so
    # a short (or empty) ``results`` list still yields one output per input.
    # Any non-null row without a result is reported as a failure.
    lookup = {entry.row_index: entry for entry in parsed_batch.results}

    cleaned: list[object] = []
    rejected: list[tuple[int, str]] = []
    for idx, (item, missing) in enumerate(zip(rows, is_missing)):
        if missing:
            cleaned.append(None)
            continue
        entry = lookup.get(idx)
        moment = entry.parsed if entry is not None else None
        if moment is None:
            cleaned.append(None)
            rejected.append((idx, str(item)))
        else:
            cleaned.append(moment)

    return CleanResult(
        values=cleaned,
        target_type=TypeName.DATE,
        null_count=sum(is_missing),
        failures=rejected,
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _clean_strings(values, null_values) -> CleanResult:
    """Run every non-null value through ``cleanmonkey.clean``.

    Nulls (per :func:`is_null`) are emitted as ``None`` and counted. All other
    values are stringified and cleaned. No value is ever recorded as a failure.
    """
    normalised: list[object] = []
    missing_total = 0
    for item in values:
        if not is_null(item, null_values):
            normalised.append(cleanmonkey.clean(str(item)))
            continue
        missing_total += 1
        normalised.append(None)
    return CleanResult(
        values=normalised,
        target_type=TypeName.STRING,
        null_count=missing_total,
    )
|
typemonkey/cli.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Thin CLI wrapper around the typemonkey library.
|
|
2
|
+
|
|
3
|
+
This module exists only to parse arguments, read input columns, and format
|
|
4
|
+
output; all logic lives in the library. Commands:
|
|
5
|
+
|
|
6
|
+
* ``typemonkey profile`` — infer a column's type and print a JSON report.
|
|
7
|
+
* ``typemonkey clean`` — clean a column to an inferred or given type.
|
|
8
|
+
|
|
9
|
+
Input is read one value per line from a file argument or stdin.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
from typing import Sequence
|
|
18
|
+
|
|
19
|
+
from .clean import clean_column
|
|
20
|
+
from .infer import infer_type
|
|
21
|
+
from .models import TypeName, _jsonable
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _read_values(path: str | None) -> list[str]:
|
|
25
|
+
"""Read one value per line from ``path`` (or stdin when ``None``/``"-"``)."""
|
|
26
|
+
if path in (None, "-"):
|
|
27
|
+
text = sys.stdin.read()
|
|
28
|
+
else:
|
|
29
|
+
with open(path, "r", encoding="utf-8") as fh:
|
|
30
|
+
text = fh.read()
|
|
31
|
+
# Strip only the trailing newline of the file, then split on line breaks so
|
|
32
|
+
# genuinely empty lines survive as empty-string values.
|
|
33
|
+
text = text.rstrip("\n")
|
|
34
|
+
return text.split("\n") if text != "" else []
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``typemonkey`` argument parser.

    Two sub-commands are registered, and one of them is required:

    * ``profile`` takes ``file``, ``--threshold`` and ``--locale``.
    * ``clean`` takes ``file``, ``--type``, ``--locale`` and
      ``--keep-percent``.

    The chosen sub-command is stored on the namespace as ``command``.
    """
    # Options that both sub-commands declare identically.
    file_arg = dict(nargs="?", default="-", help="Input file, or - for stdin.")
    locale_arg = dict(choices=["us", "eu"], default=None, help="Force number locale.")

    root = argparse.ArgumentParser(
        prog="typemonkey",
        description="Infer column types and clean values to those types.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    profile_cmd = commands.add_parser(
        "profile", help="Infer a column's type (JSON report)."
    )
    profile_cmd.add_argument("file", **file_arg)
    profile_cmd.add_argument(
        "--threshold", type=float, default=None, help="Conformance threshold (0-1)."
    )
    profile_cmd.add_argument("--locale", **locale_arg)

    clean_cmd = commands.add_parser(
        "clean", help="Clean a column to a type (one value per line)."
    )
    clean_cmd.add_argument("file", **file_arg)
    clean_cmd.add_argument(
        "--type",
        choices=[member.value for member in TypeName],
        default=None,
        help="Force target type instead of inferring.",
    )
    clean_cmd.add_argument("--locale", **locale_arg)
    clean_cmd.add_argument(
        "--keep-percent",
        action="store_true",
        help="Keep percents as whole numbers (12%% -> 12) instead of fractions.",
    )
    return root
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the CLI and return a process exit code.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use its default.

    Returns:
        ``0`` on success. For ``clean``, ``1`` when the result reports any
        failures. ``2`` is a fallback for an unrecognised command.
    """
    args = _build_parser().parse_args(argv)
    values = _read_values(args.file)

    if args.command == "profile":
        # Forward only the options the user actually supplied so the library
        # defaults apply otherwise.
        overrides = {
            name: getattr(args, name)
            for name in ("threshold", "locale")
            if getattr(args, name) is not None
        }
        report = infer_type(values, **overrides)
        json.dump(report.to_dict(), sys.stdout, indent=2, sort_keys=True)
        sys.stdout.write("\n")
        return 0

    if args.command == "clean":
        outcome = clean_column(
            values,
            target_type=args.type,
            locale=args.locale,
            percent_as_fraction=not args.keep_percent,
        )
        for item in outcome.values:
            rendered = _jsonable(item)
            # A ``None`` result is printed as an empty line so output rows
            # stay aligned with input rows.
            sys.stdout.write(("" if rendered is None else str(rendered)) + "\n")
        return int(bool(outcome.failures))

    return 2  # pragma: no cover - argparse enforces a valid command
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
if __name__ == "__main__": # pragma: no cover
|
|
100
|
+
raise SystemExit(main())
|
|
typemonkey/detectors/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Pluggable per-column type detectors.
|
|
2
|
+
|
|
3
|
+
Each detector is a single exported function taking a list of non-null values
|
|
4
|
+
and returning a :class:`Detection`. They are intentionally independent so a
|
|
5
|
+
caller (or :mod:`typemonkey.infer`) can run a subset, and so new strategies
|
|
6
|
+
can be added without touching the orchestrator.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .base import Detection
|
|
12
|
+
from .boolean import detect_boolean
|
|
13
|
+
from .date import detect_date
|
|
14
|
+
from .numeric import detect_numeric
|
|
15
|
+
|
|
16
|
+
__all__ = ["Detection", "detect_boolean", "detect_date", "detect_numeric"]
|
|
typemonkey/detectors/base.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Shared result type for detectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class Detection:
    """What a single detector found in a column's non-null values.

    Attributes:
        match_count: How many of the examined values the detector recognised.
        sample_size: How many non-null values were examined.
        detail: Free-form, detector-specific extras; defaults to a fresh empty
            dict per instance.

    The derived :attr:`confidence` is ``match_count / sample_size``, or
    ``0.0`` when ``sample_size`` is zero.
    """

    match_count: int
    sample_size: int
    detail: dict = field(default_factory=dict)

    @property
    def confidence(self) -> float:
        """Share of the sample that matched; ``0.0`` for an empty sample."""
        if not self.sample_size:
            return 0.0
        return self.match_count / self.sample_size
|
|
typemonkey/detectors/boolean.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Boolean detector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..booleans import NUMERIC_BOOLEANS, is_boolean
|
|
6
|
+
from .base import Detection
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def detect_boolean(values, *, true_values=None, false_values=None) -> Detection:
    """Measure how much of a non-null column reads as boolean.

    Args:
        values: Non-null values to test; must support ``len()``.
        true_values: Override truthy vocabulary (see
            :func:`typemonkey.booleans.parse_boolean`).
        false_values: Override falsy vocabulary.

    Returns:
        A :class:`Detection` counting the values :func:`is_boolean` accepted.
        ``detail["distinct"]`` is the number of distinct trimmed, lower-cased
        tokens among them. ``detail["numeric_only"]`` is ``True`` when all of
        those tokens are ``"0"``/``"1"``, which is also the case when nothing
        matched at all.
    """
    recognised = [
        str(item).strip().lower()
        for item in values
        if is_boolean(item, true_values=true_values, false_values=false_values)
    ]
    unique_tokens = set(recognised)
    return Detection(
        match_count=len(recognised),
        sample_size=len(values),
        detail={
            # Subset test: every matched token is one of the ambiguous digits.
            "numeric_only": unique_tokens <= NUMERIC_BOOLEANS,
            "distinct": len(unique_tokens),
        },
    )
|
|
typemonkey/detectors/date.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Date detector — delegates to datemonkey.
|
|
2
|
+
|
|
3
|
+
typemonkey does not reimplement date parsing. This detector is a thin adapter
|
|
4
|
+
around :func:`datemonkey.detect_format` that reports its batch confidence in
|
|
5
|
+
typemonkey's :class:`Detection` shape.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from datemonkey import Confidence, detect_format
|
|
11
|
+
|
|
12
|
+
from .base import Detection
|
|
13
|
+
|
|
14
|
+
# Map datemonkey's categorical confidence onto a fraction of the sample we are
|
|
15
|
+
# willing to treat as matched. FAILED means no date format fit at all.
|
|
16
|
+
_CONFIDENCE_WEIGHT = {
|
|
17
|
+
Confidence.HIGH: 1.0,
|
|
18
|
+
Confidence.MEDIUM: 0.75,
|
|
19
|
+
Confidence.LOW: 0.4,
|
|
20
|
+
Confidence.FAILED: 0.0,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def detect_date(values, *, locale_preference: str | None = None) -> Detection:
    """Ask datemonkey whether a non-null column looks like dates.

    Args:
        values: Non-null values to test.
        locale_preference: ``"us"`` or ``"eu"`` hint forwarded to
            ``detect_format``.

    Returns:
        A :class:`Detection` carrying datemonkey's ``match_count`` and
        ``sample_size``. ``detail`` holds the detected ``format`` pattern (or
        ``None``), the categorical ``confidence`` value, its numeric ``weight``
        from ``_CONFIDENCE_WEIGHT`` (``0.0`` if unmapped), and the reported
        ``ambiguities``. Empty input short-circuits to a zero-sized "failed"
        detection without calling datemonkey.
    """
    if not values:
        return Detection(
            0, 0, {"format": None, "confidence": "failed", "weight": 0.0, "ambiguities": []}
        )

    outcome = detect_format(values, locale_preference=locale_preference)
    pattern = None if outcome.format is None else outcome.format.pattern
    extras = {
        "format": pattern,
        "confidence": outcome.confidence.value,
        "weight": _CONFIDENCE_WEIGHT.get(outcome.confidence, 0.0),
        "ambiguities": [flag.value for flag in outcome.ambiguities],
    }
    return Detection(
        match_count=outcome.match_count,
        sample_size=outcome.sample_size,
        detail=extras,
    )
|
|
typemonkey/detectors/numeric.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Numeric detector — classifies each value into a fine numeric kind."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import Counter
|
|
6
|
+
|
|
7
|
+
from ..numbers import parse_number
|
|
8
|
+
from .base import Detection
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def detect_numeric(values, *, locale: str = "us") -> Detection:
    """Count the values that parse as numbers, grouped by fine kind.

    Each value is passed to :func:`typemonkey.numbers.parse_number`; values
    that raise ``ValueError`` are skipped. Every parsed value is filed under
    exactly one label, checked in this order: ``currency``, ``percentage``,
    ``integer``, otherwise ``float``.

    Args:
        values: Non-null values to test; must support ``len()``.
        locale: Number locale handed to ``parse_number``.

    Returns:
        A :class:`Detection` whose ``match_count`` is the number of parsed
        values, with ``detail["kinds"]`` holding the per-label
        :class:`collections.Counter` and ``detail["locale"]`` echoing
        ``locale``.
    """
    tally: Counter[str] = Counter()
    for item in values:
        try:
            number = parse_number(item, locale)
        except ValueError:
            continue
        # Marker-based kinds win over the plain integer/float split.
        if number.had_currency:
            label = "currency"
        elif number.had_percent:
            label = "percentage"
        elif number.is_integer:
            label = "integer"
        else:
            label = "float"
        tally[label] += 1
    return Detection(
        match_count=sum(tally.values()),
        sample_size=len(values),
        detail={"kinds": tally, "locale": locale},
    )
|