glm-factor-optimizer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glm_factor_optimizer/__init__.py +83 -0
- glm_factor_optimizer/aggregation.py +136 -0
- glm_factor_optimizer/bins.py +257 -0
- glm_factor_optimizer/core.py +330 -0
- glm_factor_optimizer/diagnostics.py +236 -0
- glm_factor_optimizer/factor.py +354 -0
- glm_factor_optimizer/metrics.py +334 -0
- glm_factor_optimizer/model.py +269 -0
- glm_factor_optimizer/optimize.py +391 -0
- glm_factor_optimizer/penalties.py +167 -0
- glm_factor_optimizer/runs.py +192 -0
- glm_factor_optimizer/sampling.py +173 -0
- glm_factor_optimizer/screening.py +341 -0
- glm_factor_optimizer/spark/__init__.py +30 -0
- glm_factor_optimizer/spark/_deps.py +72 -0
- glm_factor_optimizer/spark/aggregation.py +143 -0
- glm_factor_optimizer/spark/bins.py +247 -0
- glm_factor_optimizer/spark/core.py +240 -0
- glm_factor_optimizer/spark/metrics.py +165 -0
- glm_factor_optimizer/spark/model.py +185 -0
- glm_factor_optimizer/spark/optimize.py +382 -0
- glm_factor_optimizer/spark/split.py +67 -0
- glm_factor_optimizer/spark/workflow.py +203 -0
- glm_factor_optimizer/split.py +58 -0
- glm_factor_optimizer/study.py +1096 -0
- glm_factor_optimizer/validation.py +387 -0
- glm_factor_optimizer/workflow.py +455 -0
- glm_factor_optimizer-0.1.0.dist-info/METADATA +267 -0
- glm_factor_optimizer-0.1.0.dist-info/RECORD +31 -0
- glm_factor_optimizer-0.1.0.dist-info/WHEEL +4 -0
- glm_factor_optimizer-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Simple GLM factor-binning and modeling tools."""
|
|
2
|
+
|
|
3
|
+
from .aggregation import aggregate_rate_table, aggregate_table
|
|
4
|
+
from .bins import apply_spec, category_target_order, make_numeric_bins
|
|
5
|
+
from .core import GLM, RateGLM
|
|
6
|
+
from .diagnostics import find_interactions, pair_diagnostics
|
|
7
|
+
from .factor import FactorBlock
|
|
8
|
+
from .metrics import (
|
|
9
|
+
calibration,
|
|
10
|
+
gaussian_deviance,
|
|
11
|
+
gamma_deviance,
|
|
12
|
+
lift_table,
|
|
13
|
+
model_deviance,
|
|
14
|
+
poisson_deviance,
|
|
15
|
+
summary,
|
|
16
|
+
weighted_mae,
|
|
17
|
+
weighted_rmse,
|
|
18
|
+
)
|
|
19
|
+
from .model import FittedGLM, FittedRateGLM, fit_glm, fit_rate_glm
|
|
20
|
+
from .optimize import (
|
|
21
|
+
OptimizationResult,
|
|
22
|
+
optimize_factor,
|
|
23
|
+
small_bin_size_penalty,
|
|
24
|
+
small_count_penalty,
|
|
25
|
+
train_validation_gap_penalty,
|
|
26
|
+
)
|
|
27
|
+
from .penalties import bin_count_penalty, small_target_penalty, unstable_relativity_penalty
|
|
28
|
+
from .runs import RunLogger
|
|
29
|
+
from .sampling import missing_strata, stratified_sample
|
|
30
|
+
from .screening import rank_factors
|
|
31
|
+
from .split import split
|
|
32
|
+
from .study import GLMStudy
|
|
33
|
+
from .validation import by_factor_report, train_validation_comparison, validation_report
|
|
34
|
+
from .workflow import GLMWorkflow, WorkflowResult, run_workflow
|
|
35
|
+
|
|
36
|
+
# Backwards-compatible alias: optimize_bins was the original public name.
optimize_bins = optimize_factor

# Public API. Kept fully alphabetized (uppercase names first, ASCII order)
# so additions and omissions are easy to spot in review; membership is
# unchanged from the original list.
__all__ = [
    "FactorBlock",
    "FittedGLM",
    "FittedRateGLM",
    "GLM",
    "GLMStudy",
    "GLMWorkflow",
    "OptimizationResult",
    "RateGLM",
    "RunLogger",
    "WorkflowResult",
    "aggregate_rate_table",
    "aggregate_table",
    "apply_spec",
    "bin_count_penalty",
    "by_factor_report",
    "calibration",
    "category_target_order",
    "find_interactions",
    "fit_glm",
    "fit_rate_glm",
    "gamma_deviance",
    "gaussian_deviance",
    "lift_table",
    "make_numeric_bins",
    "missing_strata",
    "model_deviance",
    "optimize_bins",
    "optimize_factor",
    "pair_diagnostics",
    "poisson_deviance",
    "rank_factors",
    "run_workflow",
    "small_bin_size_penalty",
    "small_count_penalty",
    "small_target_penalty",
    "split",
    "stratified_sample",
    "summary",
    "train_validation_comparison",
    "train_validation_gap_penalty",
    "unstable_relativity_penalty",
    "validation_report",
    "weighted_mae",
    "weighted_rmse",
]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Generic aggregation helpers for model development tables."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
AggregationSpec = Mapping[str, tuple[str, str]]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize_function(function: str) -> str:
    """Return the canonical pandas name for an aggregation-function alias."""
    normalized = function.lower().strip()
    # "avg" is accepted by _aggregate_series but not by pandas' string
    # dispatch in DataFrameGroupBy.agg; map it so both paths agree.
    return "mean" if normalized == "avg" else normalized


def aggregate_table(
    df: pd.DataFrame,
    group_by: str | Sequence[str] | None = None,
    aggregations: AggregationSpec | None = None,
) -> pd.DataFrame:
    """Return grouped rows plus optional named aggregations.

    Parameters
    ----------
    df:
        Data to aggregate.
    group_by:
        Optional column or columns to group by. When omitted, a single overall
        row is returned.
    aggregations:
        Optional mapping from output column name to ``(source_column,
        function)``. Supported functions include ``sum``, ``mean`` (alias
        ``avg``), ``min``, ``max``, ``count``, ``size``, and ``std``. Names
        are case-insensitive and surrounding whitespace is ignored.

    Returns
    -------
    pandas.DataFrame
        Grouped table with a ``rows`` column and requested aggregations.
    """

    group_cols = _as_list(group_by)
    # Normalize function names up front so the grouped (pandas ``agg``) and
    # ungrouped (_aggregate_series) paths accept the same spellings. The
    # original code passed raw strings to ``agg``, so e.g. "avg" worked only
    # in the ungrouped path and raised AttributeError when grouping.
    named = {
        name: (column, _normalize_function(function))
        for name, (column, function) in (aggregations or {}).items()
    }
    if group_cols:
        return (
            df.groupby(group_cols, dropna=False)
            .agg(rows=(group_cols[0], "size"), **named)
            .reset_index()
        )
    row: dict[str, Any] = {"rows": int(len(df))}
    for name, (column, function) in named.items():
        row[name] = _aggregate_series(df[column], function)
    return pd.DataFrame([row])
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def aggregate_rate_table(
    df: pd.DataFrame,
    group_by: str | Sequence[str] | None,
    *,
    target: str,
    exposure: str | None = None,
    weight: str | None = None,
    prediction: str | None = None,
    extras: AggregationSpec | None = None,
) -> pd.DataFrame:
    """Return grouped observed/predicted totals and level diagnostics.

    Parameters
    ----------
    df:
        Data to aggregate.
    group_by:
        Optional column or columns to group by.
    target:
        Observed outcome column; its total appears as ``actual``.
    exposure:
        Optional exposure column. When given, ``actual_rate`` (and
        ``predicted_rate`` if a prediction is supplied) are computed per unit
        of exposure; otherwise per-row means are reported instead.
    weight:
        Optional weight column to total.
    prediction:
        Optional predicted outcome column; enables ``actual_to_predicted``.
    extras:
        Optional extra named aggregations in :func:`aggregate_table` format.

    Returns
    -------
    pandas.DataFrame
        Grouped observed and predicted totals with mean or exposure-adjusted
        diagnostics.
    """

    # Build the sum specs in a fixed order so output columns are stable:
    # actual, exposure, weight, predicted, then any extras.
    specs: dict[str, tuple[str, str]] = {"actual": (target, "sum")}
    for name, source in (("exposure", exposure), ("weight", weight), ("predicted", prediction)):
        if source is not None:
            specs[name] = (source, "sum")
    specs.update(extras or {})

    table = aggregate_table(df, group_by, specs)

    has_prediction = prediction is not None
    if exposure is not None:
        # Exposure-adjusted levels; clip guards against zero exposure.
        denominator = table["exposure"].clip(lower=1e-9)
        table["actual_rate"] = table["actual"] / denominator
        if has_prediction:
            table["predicted_rate"] = table["predicted"] / denominator
    else:
        # Per-row means; clip guards against empty groups.
        denominator = table["rows"].clip(lower=1)
        table["actual_mean"] = table["actual"] / denominator
        if has_prediction:
            table["predicted_mean"] = table["predicted"] / denominator
    if has_prediction:
        table["actual_to_predicted"] = table["actual"] / table["predicted"].clip(lower=1e-9)
    return table
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _aggregate_series(series: pd.Series, function: str) -> Any:
|
|
113
|
+
normalized = function.lower().strip()
|
|
114
|
+
if normalized == "sum":
|
|
115
|
+
return series.sum()
|
|
116
|
+
if normalized in {"mean", "avg"}:
|
|
117
|
+
return series.mean()
|
|
118
|
+
if normalized == "min":
|
|
119
|
+
return series.min()
|
|
120
|
+
if normalized == "max":
|
|
121
|
+
return series.max()
|
|
122
|
+
if normalized == "count":
|
|
123
|
+
return int(series.count())
|
|
124
|
+
if normalized == "size":
|
|
125
|
+
return int(len(series))
|
|
126
|
+
if normalized == "std":
|
|
127
|
+
return series.std()
|
|
128
|
+
raise ValueError(f"Unsupported aggregation function {function!r}.")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _as_list(value: str | Sequence[str] | None) -> list[str]:
|
|
132
|
+
if value is None:
|
|
133
|
+
return []
|
|
134
|
+
if isinstance(value, str):
|
|
135
|
+
return [value]
|
|
136
|
+
return list(value)
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""Binning and grouping specs for model factors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
JsonDict = dict[str, Any]
|
|
11
|
+
|
|
12
|
+
_MISSING = "__missing__"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _final_edges(raw_edges: list[float], series: pd.Series) -> list[float]:
|
|
16
|
+
clean = pd.to_numeric(series, errors="coerce").dropna()
|
|
17
|
+
if clean.empty:
|
|
18
|
+
raise ValueError("Cannot create bins from an empty numeric column.")
|
|
19
|
+
edges = sorted({float(value) for value in raw_edges})
|
|
20
|
+
lower = float(clean.min()) - 1e-9
|
|
21
|
+
upper = float(clean.max()) + 1e-9
|
|
22
|
+
if len(edges) < 2:
|
|
23
|
+
midpoint = float(clean.median())
|
|
24
|
+
edges = [lower, midpoint, upper]
|
|
25
|
+
else:
|
|
26
|
+
edges[0] = min(edges[0], lower)
|
|
27
|
+
edges[-1] = max(edges[-1], upper)
|
|
28
|
+
final = [edges[0]]
|
|
29
|
+
for edge in edges[1:]:
|
|
30
|
+
if edge > final[-1]:
|
|
31
|
+
final.append(edge)
|
|
32
|
+
return final if len(final) >= 2 else [lower, upper]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def make_numeric_bins(
    series: pd.Series,
    bins: int = 10,
    column: str | None = None,
    method: str = "quantile",
) -> JsonDict:
    """Create a JSON-serializable numeric binning spec.

    Parameters
    ----------
    series:
        Numeric values used to derive bin edges.
    bins:
        Number of bins to request (must be at least 1).
    column:
        Source column name. When omitted, ``series.name`` is used if present.
    method:
        Binning method: ``"quantile"`` (equal-population) or ``"uniform"``
        (equal-width).

    Returns
    -------
    dict
        Numeric binning spec with edges, labels, source column, and output
        column.
    """

    if bins < 1:
        raise ValueError("bins must be at least 1.")
    values = pd.to_numeric(series, errors="coerce").dropna()

    if method == "quantile":
        candidate_edges = values.quantile(np.linspace(0.0, 1.0, bins + 1)).tolist()
    elif method == "uniform":
        candidate_edges = np.linspace(float(values.min()), float(values.max()), bins + 1).tolist()
    else:
        raise ValueError("method must be 'quantile' or 'uniform'.")

    # _final_edges dedups and stretches candidates, so the label count is
    # derived from the final edges rather than the requested bin count.
    edges = _final_edges(candidate_edges, values)
    name = column or series.name or "value"
    return {
        "type": "numeric",
        "column": name,
        "output": f"{name}_bin",
        "method": method,
        "edges": edges,
        "labels": [f"bin_{position + 1}" for position in range(len(edges) - 1)],
    }
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _category_key(value: object) -> str:
|
|
85
|
+
if pd.isna(value):
|
|
86
|
+
return _MISSING
|
|
87
|
+
if isinstance(value, np.generic):
|
|
88
|
+
value = value.item()
|
|
89
|
+
return str(value)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def category_target_order(
    df: pd.DataFrame,
    factor: str,
    target: str,
    exposure: str | None = None,
    weight: str | None = None,
) -> pd.DataFrame:
    """Order categories by observed training target level.

    Parameters
    ----------
    df:
        Training data.
    factor:
        Categorical factor column to order.
    target:
        Observed outcome column.
    exposure:
        Optional exposure column used as the denominator for the level.
    weight:
        Optional row-weight column used as the denominator for weighted
        levels. Ignored when ``exposure`` is given.

    Returns
    -------
    pandas.DataFrame
        Category table sorted by observed level, including measure, actual,
        level, and credibility columns.
    """

    columns = [factor, target]
    if exposure is not None:
        columns.append(exposure)
    if weight is not None and weight not in columns:
        columns.append(weight)
    work = df[columns].copy()
    work[factor] = work[factor].map(_category_key)

    # The three modes differ only in which (measure, actual) pair is summed;
    # the groupby, level, and credibility computations are shared. (The
    # original duplicated the groupby + level code once per branch.)
    if exposure is not None:
        # Exposure mode: level = total target per unit of exposure.
        spec = {"measure": (exposure, "sum"), "actual": (target, "sum")}
    elif weight is not None:
        # Weighted mode: level = weighted target total per unit of weight.
        work["_weighted_target"] = work[target] * work[weight]
        spec = {"measure": (weight, "sum"), "actual": ("_weighted_target", "sum")}
    else:
        # Unweighted mode: level = plain mean of the target.
        spec = {"measure": (target, "size"), "actual": (target, "sum")}

    table = work.groupby(factor, dropna=False).agg(**spec).reset_index()
    # Clip guards against a zero denominator (empty exposure/weight).
    table["level"] = table["actual"] / table["measure"].clip(lower=1e-9)
    # Simple sqrt(volume) credibility; +1 keeps zero-count categories finite.
    table["credibility"] = np.sqrt(table["actual"].clip(lower=0.0) + 1.0)
    return table.sort_values(["level", "measure"]).reset_index(drop=True)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def make_categorical_groups(
    df: pd.DataFrame,
    factor: str,
    target: str,
    exposure: str | None = None,
    weight: str | None = None,
    cutpoints: list[int] | None = None,
) -> JsonDict:
    """Create a target-ordered categorical grouping spec from training data.

    Parameters
    ----------
    df:
        Training data.
    factor:
        Categorical factor column to group.
    target:
        Observed outcome column used for ordering categories.
    exposure:
        Optional exposure column used as the denominator for category levels.
    weight:
        Optional row-weight column used for weighted category levels.
    cutpoints:
        Ordered category positions where new groups should start; positions
        outside ``(0, n_categories)`` are dropped.

    Returns
    -------
    dict
        JSON-serializable categorical grouping spec with order, cutpoints,
        mapping, labels, and training statistics.
    """

    stats = category_target_order(df, factor, target, exposure=exposure, weight=weight)
    ordered = [str(category) for category in stats[factor].tolist()]

    # Keep only interior cutpoints, deduplicated and sorted.
    valid_cutpoints = sorted(
        {int(point) for point in (cutpoints or []) if 0 < int(point) < len(ordered)}
    )
    boundaries = [0, *valid_cutpoints, len(ordered)]

    mapping: dict[str, str] = {}
    labels: list[str] = []
    for group_number, (start, stop) in enumerate(zip(boundaries, boundaries[1:]), start=1):
        label = f"group_{group_number:02d}"
        labels.append(label)
        mapping.update({category: label for category in ordered[start:stop]})

    return {
        "type": "categorical",
        "column": factor,
        "output": f"{factor}_group",
        "order": ordered,
        "cutpoints": valid_cutpoints,
        "mapping": mapping,
        "labels": labels,
        "default": "other",
        "missing": _MISSING,
        "stats": stats.rename(columns={factor: "category"}).to_dict("records"),
    }
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def apply_spec(
    df: pd.DataFrame,
    spec: JsonDict,
    output: str | None = None,
) -> pd.DataFrame:
    """Apply a saved numeric or categorical spec to a dataframe.

    Parameters
    ----------
    df:
        Data containing the raw column referenced by ``spec``.
    spec:
        JSON-serializable numeric or categorical spec.
    output:
        Optional output column override. When omitted, the spec output is
        used (falling back to ``"<column>_<type>"``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the transformed output column added.
    """

    kind = str(spec["type"])
    source = str(spec["column"])
    target_column = output if output else str(spec.get("output") or f"{source}_{kind}")
    result = df.copy()

    if kind == "categorical":
        keys = result[source].map(_category_key)
        fallback = str(spec.get("default", "other"))
        # Categories unseen at training time fall through to the default group.
        result[target_column] = keys.map(spec["mapping"]).fillna(fallback)
        return result

    if kind == "numeric":
        numeric = pd.to_numeric(result[source], errors="coerce")
        binned = pd.cut(
            numeric,
            bins=spec["edges"],
            labels=spec["labels"],
            include_lowest=True,
        )
        # NaN and out-of-range values land outside every bin; label them.
        result[target_column] = binned.astype(object).fillna("missing")
        return result

    raise ValueError("spec type must be 'numeric' or 'categorical'.")
|