scales-python 1.4.0.9000__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scales/__init__.py +295 -0
- scales/_colors.py +272 -0
- scales/_palettes_data.py +595 -0
- scales/_utils.py +579 -0
- scales/bounds.py +512 -0
- scales/breaks.py +627 -0
- scales/breaks_log.py +268 -0
- scales/colour_manip.py +681 -0
- scales/colour_mapping.py +593 -0
- scales/colour_ramp.py +126 -0
- scales/labels.py +2144 -0
- scales/minor_breaks.py +197 -0
- scales/palettes.py +1328 -0
- scales/py.typed +0 -0
- scales/range.py +223 -0
- scales/scale_continuous.py +146 -0
- scales/scale_discrete.py +196 -0
- scales/transforms.py +1338 -0
- scales_python-1.4.0.9000.dist-info/METADATA +73 -0
- scales_python-1.4.0.9000.dist-info/RECORD +22 -0
- scales_python-1.4.0.9000.dist-info/WHEEL +4 -0
- scales_python-1.4.0.9000.dist-info/licenses/LICENSE +3 -0
scales/py.typed
ADDED
|
File without changes
|
scales/range.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Mutable range classes for accumulating scale domains.
|
|
3
|
+
|
|
4
|
+
Python port of ``R/range.R`` from the R *scales* package
|
|
5
|
+
(https://github.com/r-lib/scales). The R source uses R6 classes;
|
|
6
|
+
here we use plain Python classes with the same ``train`` / ``reset``
|
|
7
|
+
interface.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Optional, Sequence, Union
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
from numpy.typing import ArrayLike
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"Range",
|
|
19
|
+
"ContinuousRange",
|
|
20
|
+
"DiscreteRange",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Range:
    """Common base for the mutable range accumulators.

    Attributes
    ----------
    range : object or None
        Whatever has been accumulated so far.  It is ``None`` on
        construction and again after :meth:`reset`.
    """

    def __init__(self) -> None:
        # Start out empty; subclasses narrow the attribute's type.
        self.range: object = None

    def train(self, x: ArrayLike, **kwargs) -> None:  # pragma: no cover
        """Fold new data into the range; concrete subclasses override this."""
        raise NotImplementedError()

    def reset(self) -> None:
        """Discard the accumulated range, returning to the empty state."""
        self.range = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ContinuousRange(Range):
    """Accumulator for a numeric ``(min, max)`` range.

    Each :meth:`train` call widens the stored range so that it also
    covers the finite values of the new batch.

    Examples
    --------
    >>> rng = ContinuousRange()
    >>> rng.train([1, 5, 3])
    >>> rng.range
    (1.0, 5.0)
    >>> rng.train([0, 4])
    >>> rng.range
    (0.0, 5.0)
    """

    def __init__(self) -> None:
        super().__init__()
        self.range: Optional[tuple[float, float]] = None

    def train(self, x: ArrayLike) -> None:
        """Widen the range so that it covers the finite values of *x*.

        Parameters
        ----------
        x : array_like
            Numeric values, converted to a float array.  Entries that
            are not finite (``NaN``, ``Inf``) are discarded; if nothing
            remains the stored range is left unchanged.
        """
        values = np.asarray(x, dtype=float)
        finite = values[np.isfinite(values)]
        if finite.size == 0:
            # Nothing usable in this batch.
            return

        lo = float(finite.min())
        hi = float(finite.max())
        current = self.range
        if current is not None:
            lo = min(current[0], lo)
            hi = max(current[1], hi)
        self.range = (lo, hi)

    def reset(self) -> None:
        """Drop the accumulated range."""
        self.range = None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class DiscreteRange(Range):
    """Mutable discrete range (ordered list of unique levels).

    Mirrors R ``scales::discrete_range`` / ``clevels``
    (scales/R/scale-discrete.R:55-116):

    * If the input exposes a ``categories`` attribute (a pandas
      Categorical, the analogue of an R factor), the order of its
      categories is preserved.
    * Otherwise the unique values are **sorted** (R ``sort(unique(x))``),
      with ``None`` / ``NaN`` placed last.  If the values cannot be
      compared with each other, first-appearance order is kept.
    * When combined with an existing range, new levels are appended to
      the existing ones.  If either side was a factor that order is kept;
      otherwise the union is re-sorted.

    Examples
    --------
    >>> rng = DiscreteRange()
    >>> rng.train(["b", "a", "c"])
    >>> rng.range
    ['a', 'b', 'c']
    >>> rng.train(["d", "a"])
    >>> rng.range
    ['a', 'b', 'c', 'd']
    """

    def __init__(self) -> None:
        super().__init__()
        self.range: Optional[list] = None
        # True once any trained batch was factor-like; disables re-sorting.
        self._is_factor: bool = False

    @staticmethod
    def _is_missing(value: object) -> bool:
        """Return ``True`` for ``None`` and for floating-point NaN.

        ``np.floating`` is checked in addition to ``float`` because only
        ``np.float64`` subclasses the builtin ``float``; NaNs coming from
        e.g. a ``float32`` array would otherwise go undetected.
        """
        if value is None:
            return True
        return isinstance(value, (float, np.floating)) and bool(np.isnan(value))

    @classmethod
    def _append_unseen(cls, base: list, candidates) -> list:
        """Return a copy of *base* extended by candidates not yet present.

        All missing values share one key, so at most one of them is kept.
        """
        seen = {None if cls._is_missing(v) else v for v in base}
        merged = list(base)
        for v in candidates:
            key = None if cls._is_missing(v) else v
            if key not in seen:
                seen.add(key)
                merged.append(v)
        return merged

    @classmethod
    def _sort_na_last(cls, levels: list) -> list:
        """Sort the non-missing levels and put the missing ones at the end."""
        present = [v for v in levels if not cls._is_missing(v)]
        missing = [v for v in levels if cls._is_missing(v)]
        try:
            present = sorted(present)
        except TypeError:
            # Mixed incomparable types: keep insertion order.
            pass
        return present + missing

    def train(
        self,
        x: Union[ArrayLike, Sequence, "pd.Categorical"],
        drop: bool = False,
        na_rm: bool = False,
    ) -> None:
        """Update the range with new discrete data.

        Parameters
        ----------
        x : array_like or pandas.Categorical
            Discrete values.  If *x* has a ``categories`` attribute its
            categories are used (respecting their order).  Otherwise the
            unique values are sorted, matching R's ``sort(unique(x))``
            behaviour in ``clevels()``.
        drop : bool, optional
            If ``True`` and *x* is categorical, unused categories are
            dropped before training (default ``False``).
        na_rm : bool, optional
            If ``True``, ``None`` / ``NaN`` values are removed from the
            new levels before training (default ``False``).
        """
        new_is_factor = hasattr(x, "categories")
        if new_is_factor:
            # Factor-style input: take the declared levels as they are.
            if drop:
                x = x.remove_unused_categories()
            levels = list(x.categories)
        else:
            # R's clevels for non-factor input: sort(unique(x)).
            unique = self._append_unseen([], np.asarray(x).flat)
            levels = self._sort_na_last(unique)

        if na_rm:
            levels = [v for v in levels if not self._is_missing(v)]

        if self.range is None:
            # First batch: remember whether it was a factor.
            self.range = levels
            self._is_factor = new_is_factor
            return

        # Union of the existing range and the new levels, existing first.
        combined = self._append_unseen(self.range, levels)
        if self._is_factor or new_is_factor:
            self.range = combined
            self._is_factor = True
        else:
            self.range = self._sort_na_last(combined)

    def reset(self) -> None:
        """Reset to an empty range and forget the factor flag."""
        self.range = None
        self._is_factor = False
|
|
scales/scale_continuous.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Continuous-scale helpers: apply and train continuous scales.
|
|
3
|
+
|
|
4
|
+
Python port of ``R/scale-continuous.R`` from the R *scales* package
|
|
5
|
+
(https://github.com/r-lib/scales).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Callable, Optional, Tuple, Union
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from numpy.typing import ArrayLike
|
|
14
|
+
|
|
15
|
+
from .bounds import censor, rescale
|
|
16
|
+
from .transforms import Transform, as_transform
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"cscale",
|
|
20
|
+
"train_continuous",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def cscale(
    x: ArrayLike,
    palette: Callable[[np.ndarray], np.ndarray],
    na_value: Any = np.nan,
    trans: Optional[Union[Transform, str]] = None,
    oob: Callable[[np.ndarray], np.ndarray] = censor,
) -> np.ndarray:
    """Map numeric data through a continuous palette.

    Mirrors R's ``cscale`` + ``map_continuous``.  The pipeline is:
    optional transform of *x*, ``rescale(..., to=(0.0, 1.0))``, the *oob*
    handler, then *palette*.  Every position that was non-finite after
    the transform, or is non-finite after *oob*, receives *na_value*.

    Parameters
    ----------
    x : array_like
        Numeric values in data coordinates (converted to a float array).
    palette : callable
        Continuous palette function, called once with the rescaled
        array; its result is converted with :func:`numpy.asarray`.
    na_value : any, optional
        Replacement for missing positions (default ``np.nan``).
    trans : Transform or str, optional
        Transformation applied to *x* before rescaling.  A string is
        resolved with :func:`~scales.transforms.as_transform`; any other
        object is used through its ``transform`` method.
    oob : callable, optional
        Out-of-bounds handler applied to the rescaled values before the
        palette.  The default is :func:`~scales.bounds.censor`, which
        replaces values outside ``[0, 1]`` with ``NaN`` — matching R's
        ``map_continuous(oob = censor)``.  Use
        :func:`~scales.bounds.squish` to clamp instead.

    Returns
    -------
    numpy.ndarray
        Palette-mapped values.  String, bytes and object results are
        converted to an object array before *na_value* is inserted.

    Examples
    --------
    >>> from scales.palettes import pal_seq_gradient
    >>> cscale([1, 5, 10], pal_seq_gradient("white", "blue"))
    """
    values = np.asarray(x, dtype=float)

    # Transform first, so missingness is judged on the transformed data.
    if trans is not None:
        transformer = as_transform(trans) if isinstance(trans, str) else trans
        values = transformer.transform(values)

    missing = ~np.isfinite(values)

    # Rescale, then let the out-of-bounds handler act on the rescaled
    # values; whatever it turns non-finite counts as missing too.
    unit = np.asarray(oob(rescale(values, to=(0.0, 1.0))), dtype=float)
    missing = missing | ~np.isfinite(unit)

    mapped = np.asarray(palette(unit))
    if not np.any(missing):
        return mapped

    if mapped.dtype.kind in ("U", "S", "O"):
        # Go through object dtype for string-like results before the
        # NA value is written in.
        mapped = mapped.astype(object)
    mapped[missing] = na_value
    return mapped
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def train_continuous(
    new: ArrayLike,
    existing: Optional[Tuple[float, float]] = None,
) -> Tuple[float, float]:
    """Extend a continuous ``(min, max)`` range with new observations.

    Parameters
    ----------
    new : array_like
        New numeric observations, converted to a float array.
        Non-finite entries are ignored.
    existing : tuple of float or None, optional
        Previously computed ``(min, max)`` range, or ``None`` when there
        is none yet.

    Returns
    -------
    tuple of float
        The range spanning both *existing* and the finite values of
        *new*.  When *new* has no finite values, *existing* is returned
        as given.

    Raises
    ------
    ValueError
        If *new* has no finite values and *existing* is ``None``.

    Examples
    --------
    >>> train_continuous([1, 5, 3])
    (1.0, 5.0)
    >>> train_continuous([0, 4], existing=(1.0, 5.0))
    (0.0, 5.0)
    """
    values = np.asarray(new, dtype=float)
    finite = values[np.isfinite(values)]

    if finite.size == 0:
        # No usable data: fall back on the prior range if there is one.
        if existing is not None:
            return existing
        raise ValueError("Cannot train on empty data with no existing range.")

    lo, hi = float(finite.min()), float(finite.max())
    if existing is None:
        return (lo, hi)
    return (min(existing[0], lo), max(existing[1], hi))
|
scales/scale_discrete.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Discrete-scale helpers: apply and train discrete scales.
|
|
3
|
+
|
|
4
|
+
Python port of ``R/scale-discrete.R`` from the R *scales* package
|
|
5
|
+
(https://github.com/r-lib/scales).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any, Callable, List, Optional, Sequence, Union
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from numpy.typing import ArrayLike
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"dscale",
|
|
17
|
+
"train_discrete",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def dscale(
    x: ArrayLike,
    palette: Callable[[int], Any],
    na_value: Any = None,
) -> np.ndarray:
    """Apply a discrete scale to categorical data.

    Maps each level of *x* to a palette output, then broadcasts back to
    the shape of *x*.

    Parameters
    ----------
    x : array_like
        Discrete (categorical) values.  If *x* has a ``categories``
        attribute (e.g. a :class:`pandas.Categorical`) those categories
        are the levels; otherwise the levels are the distinct non-missing
        values in order of first appearance.
    palette : callable
        A discrete palette function that takes an integer *n* (number
        of levels) and returns a sequence of output values.  Levels
        beyond the number of returned values map to *na_value*.
    na_value : any, optional
        Value used for ``None`` / ``NaN`` entries in *x* and for values
        without a palette entry (default ``None``).

    Returns
    -------
    numpy.ndarray
        Palette-mapped values with the shape of *x*.  The array has the
        palette's dtype unless *na_value* had to be inserted into a
        string/bytes/object result, or could not be stored in the
        palette's dtype; in those cases an object array is returned so
        that *na_value* is kept exactly as given.

    Examples
    --------
    >>> from scales.palettes import pal_brewer
    >>> dscale(["a", "b", "a", "c"], pal_brewer())
    """
    # Determine levels (ordered unique values).
    if hasattr(x, "categories"):
        levels = list(x.categories)
        x_arr = np.asarray(x)
    else:
        x_arr = np.asarray(x)
        seen: set = set()
        levels = []
        for val in x_arr.flat:
            key = _na_key(val)
            if key not in seen:
                seen.add(key)
                if not _is_na(val):
                    levels.append(val)

    n = len(levels)
    if n == 0:
        return np.full(x_arr.shape, na_value, dtype=object)

    pal_values = np.asarray(palette(n))
    n_pal = len(pal_values)

    # Levels the palette has no value for are left out of the lookup and
    # therefore fall through to na_value below.
    lookup: dict = {lev: pal_values[i] for i, lev in enumerate(levels[:n_pal])}

    result = np.empty(x_arr.shape, dtype=pal_values.dtype if n_pal > 0 else object)
    na_positions: list = []
    for idx in np.ndindex(x_arr.shape):
        val = x_arr[idx]
        if not _is_na(val) and val in lookup:
            result[idx] = lookup[val]
        else:
            na_positions.append(idx)

    if na_positions:
        # Writing na_value into a fixed-width string array would stringify
        # and possibly truncate it (None -> 'None'), so switch to object
        # dtype first, as cscale() does.
        if result.dtype.kind in ("U", "S", "O"):
            result = result.astype(object)
        try:
            for idx in na_positions:
                result[idx] = na_value
        except (TypeError, ValueError):
            # na_value is not representable in the palette dtype (e.g.
            # None in an integer array): fall back to an object array.
            result = result.astype(object)
            for idx in na_positions:
                result[idx] = na_value

    return result
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def train_discrete(
    new: Union[ArrayLike, Sequence],
    existing: Optional[List] = None,
    drop: bool = False,
    na_rm: bool = False,
) -> list:
    """Train (update) a discrete range with new data.

    Combines the levels of *new* with an *existing* level list.  New
    levels are appended after the existing ones.  Following R's
    ``clevels()`` (``sort(unique(x))``), the result is sorted unless
    *new* or *existing* is factor-like (has a ``categories`` attribute),
    in which case the factor order is kept.

    Parameters
    ----------
    new : array_like or sequence
        New discrete observations.  For an object with a ``categories``
        attribute (e.g. :class:`pandas.Categorical`) the categories are
        used as its levels.
    existing : list or None, optional
        Previously computed list of levels.  ``None`` indicates no
        prior levels.  A factor-like object contributes its categories.
    drop : bool, optional
        If ``True`` and *new* is a :class:`pandas.Categorical`,
        unused categories are dropped (default ``False``).
    na_rm : bool, optional
        If ``True``, ``None`` / ``NaN`` values are removed from the
        result (default ``False``).

    Returns
    -------
    list
        Updated list of unique levels.  When sorted, missing values come
        last; levels that cannot be compared with each other are ordered
        by their string form.

    Examples
    --------
    >>> train_discrete(["a", "b", "c"])
    ['a', 'b', 'c']
    >>> train_discrete(["b", "d"], existing=["a", "b", "c"])
    ['a', 'b', 'c', 'd']
    """
    existing_is_factor = hasattr(existing, "categories")
    new_is_factor = hasattr(new, "categories")

    if new_is_factor:
        if drop:
            new = new.remove_unused_categories()
        new_levels = list(new.categories)
    else:
        # Distinct values in order of first appearance; all missing
        # values share one key, so at most one of them is kept.
        seen: set = set()
        new_levels = []
        for val in np.asarray(new).flat:
            key = _na_key(val)
            if key not in seen:
                seen.add(key)
                new_levels.append(val)

    if na_rm:
        new_levels = [v for v in new_levels if not _is_na(v)]

    if existing is None:
        if new_is_factor:
            return new_levels
        return _sort_discrete_levels(new_levels)

    # A factor-like prior range is represented by its categories, not by
    # its (possibly repeated) values.
    merged = list(existing.categories) if existing_is_factor else list(existing)
    if na_rm:
        merged = [v for v in merged if not _is_na(v)]

    known = {_na_key(v) for v in merged}
    for v in new_levels:
        key = _na_key(v)
        if key not in known:
            known.add(key)
            merged.append(v)

    # When neither side is a factor, R re-sorts the union.
    if not (existing_is_factor or new_is_factor):
        merged = _sort_discrete_levels(merged)

    return merged


def _sort_discrete_levels(levels: list) -> list:
    """Sort *levels* in their natural order with missing values last.

    Natural ordering keeps numeric levels numeric (2 before 10).  Levels
    of mixed, mutually incomparable types are ordered by ``str(v)``.
    """
    present = [v for v in levels if not _is_na(v)]
    missing = [v for v in levels if _is_na(v)]
    try:
        present = sorted(present)
    except TypeError:
        present = sorted(present, key=str)
    return present + missing
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
# Internal helpers
|
|
180
|
+
# ---------------------------------------------------------------------------
|
|
181
|
+
|
|
182
|
+
def _is_na(val: Any) -> bool:
|
|
183
|
+
"""Check if a value is NA-like (None or NaN)."""
|
|
184
|
+
if val is None:
|
|
185
|
+
return True
|
|
186
|
+
try:
|
|
187
|
+
return np.isnan(val)
|
|
188
|
+
except (TypeError, ValueError):
|
|
189
|
+
return False
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _na_key(val: Any) -> Any:
    """Return *val* as a lookup key, folding every NA-like value to ``None``."""
    return None if _is_na(val) else val
|