glm-factor-optimizer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ """Simple GLM factor-binning and modeling tools."""
2
+
3
+ from .aggregation import aggregate_rate_table, aggregate_table
4
+ from .bins import apply_spec, category_target_order, make_numeric_bins
5
+ from .core import GLM, RateGLM
6
+ from .diagnostics import find_interactions, pair_diagnostics
7
+ from .factor import FactorBlock
8
+ from .metrics import (
9
+ calibration,
10
+ gaussian_deviance,
11
+ gamma_deviance,
12
+ lift_table,
13
+ model_deviance,
14
+ poisson_deviance,
15
+ summary,
16
+ weighted_mae,
17
+ weighted_rmse,
18
+ )
19
+ from .model import FittedGLM, FittedRateGLM, fit_glm, fit_rate_glm
20
+ from .optimize import (
21
+ OptimizationResult,
22
+ optimize_factor,
23
+ small_bin_size_penalty,
24
+ small_count_penalty,
25
+ train_validation_gap_penalty,
26
+ )
27
+ from .penalties import bin_count_penalty, small_target_penalty, unstable_relativity_penalty
28
+ from .runs import RunLogger
29
+ from .sampling import missing_strata, stratified_sample
30
+ from .screening import rank_factors
31
+ from .split import split
32
+ from .study import GLMStudy
33
+ from .validation import by_factor_report, train_validation_comparison, validation_report
34
+ from .workflow import GLMWorkflow, WorkflowResult, run_workflow
35
+
36
# Backward-compatible alias: keep the historical public name ``optimize_bins``
# importable alongside the current ``optimize_factor``.
optimize_bins = optimize_factor

# Explicit public API of the package (names re-exported from submodules above).
__all__ = [
    "FittedRateGLM",
    "FittedGLM",
    "FactorBlock",
    "GLM",
    "GLMStudy",
    "GLMWorkflow",
    "OptimizationResult",
    "RateGLM",
    "RunLogger",
    "aggregate_rate_table",
    "aggregate_table",
    "apply_spec",
    "bin_count_penalty",
    "calibration",
    "category_target_order",
    "by_factor_report",
    "fit_rate_glm",
    "fit_glm",
    "find_interactions",
    "gaussian_deviance",
    "gamma_deviance",
    "lift_table",
    "make_numeric_bins",
    "missing_strata",
    "model_deviance",
    "optimize_bins",
    "optimize_factor",
    "pair_diagnostics",
    "poisson_deviance",
    "rank_factors",
    "split",
    "small_bin_size_penalty",
    "small_count_penalty",
    "small_target_penalty",
    "summary",
    "train_validation_gap_penalty",
    "train_validation_comparison",
    "unstable_relativity_penalty",
    "stratified_sample",
    "validation_report",
    "weighted_mae",
    "weighted_rmse",
    "WorkflowResult",
    "run_workflow",
]
@@ -0,0 +1,136 @@
1
+ """Generic aggregation helpers for model development tables."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Mapping, Sequence
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+
10
+ AggregationSpec = Mapping[str, tuple[str, str]]
11
+
12
+
13
def aggregate_table(
    df: pd.DataFrame,
    group_by: str | Sequence[str] | None = None,
    aggregations: AggregationSpec | None = None,
) -> pd.DataFrame:
    """Return grouped rows plus optional named aggregations.

    Parameters
    ----------
    df:
        Data to aggregate.
    group_by:
        Optional column or columns to group by. When omitted, a single overall
        row is returned.
    aggregations:
        Optional mapping from output column name to ``(source_column,
        function)``. Supported functions include ``sum``, ``mean`` (alias
        ``avg``), ``min``, ``max``, ``count``, ``size``, and ``std``.
        Function names are case-insensitive in both the grouped and the
        ungrouped path.

    Returns
    -------
    pandas.DataFrame
        Grouped table with a ``rows`` column and requested aggregations.

    Raises
    ------
    ValueError
        If an aggregation is named ``rows`` — that name is reserved for the
        per-group row count.
    """

    if isinstance(group_by, str):
        group_cols = [group_by]
    else:
        group_cols = list(group_by) if group_by is not None else []

    # Normalize function spellings once so grouped (pandas ``agg``) and
    # ungrouped (``_aggregate_series``) paths accept the same names.
    named: dict[str, tuple[str, str]] = {}
    for name, (source, function) in (aggregations or {}).items():
        if name == "rows":
            # Previously this crashed with an opaque TypeError when grouped
            # and silently overwrote the row count when ungrouped.
            raise ValueError("'rows' is a reserved output column name.")
        normalized = function.lower().strip()
        if normalized == "avg":
            normalized = "mean"
        named[name] = (source, normalized)

    if group_cols:
        return (
            df.groupby(group_cols, dropna=False)
            .agg(rows=(group_cols[0], "size"), **named)
            .reset_index()
        )
    row: dict[str, Any] = {"rows": int(len(df))}
    for name, (source, function) in named.items():
        row[name] = _aggregate_series(df[source], function)
    return pd.DataFrame([row])
50
+
51
+
52
def aggregate_rate_table(
    df: pd.DataFrame,
    group_by: str | Sequence[str] | None,
    *,
    target: str,
    exposure: str | None = None,
    weight: str | None = None,
    prediction: str | None = None,
    extras: AggregationSpec | None = None,
) -> pd.DataFrame:
    """Return grouped observed/predicted totals and level diagnostics.

    Parameters
    ----------
    df:
        Data to aggregate.
    group_by:
        Optional column or columns to group by.
    target:
        Observed outcome column; its per-group sum becomes ``actual``.
    exposure:
        Optional exposure column. When present, ``actual_rate`` (and
        ``predicted_rate`` if predictions exist) are exposure-adjusted.
    weight:
        Optional weight column whose per-group sum is reported as ``weight``.
    prediction:
        Optional predicted outcome column; its sum becomes ``predicted``.
    extras:
        Optional extra named aggregations in :func:`aggregate_table` format.
        They are merged last, so they can override the default columns.

    Returns
    -------
    pandas.DataFrame
        Grouped observed and predicted totals with mean or exposure-adjusted
        diagnostics, plus ``actual_to_predicted`` when predictions exist.
    """

    spec: dict[str, tuple[str, str]] = {"actual": (target, "sum")}
    # Optional total columns keep the same order as the original keyword list.
    for name, source in (("exposure", exposure), ("weight", weight), ("predicted", prediction)):
        if source is not None:
            spec[name] = (source, "sum")
    spec.update(extras or {})

    table = aggregate_table(df, group_by, spec)

    if exposure is not None:
        # Guard against zero exposure with a tiny floor on the denominator.
        denominator = table["exposure"].clip(lower=1e-9)
        table["actual_rate"] = table["actual"] / denominator
        if prediction is not None:
            table["predicted_rate"] = table["predicted"] / denominator
    else:
        # No exposure: fall back to per-row means (floor avoids division by 0).
        rows = table["rows"].clip(lower=1)
        table["actual_mean"] = table["actual"] / rows
        if prediction is not None:
            table["predicted_mean"] = table["predicted"] / rows

    if prediction is not None:
        table["actual_to_predicted"] = table["actual"] / table["predicted"].clip(lower=1e-9)
    return table
110
+
111
+
112
+ def _aggregate_series(series: pd.Series, function: str) -> Any:
113
+ normalized = function.lower().strip()
114
+ if normalized == "sum":
115
+ return series.sum()
116
+ if normalized in {"mean", "avg"}:
117
+ return series.mean()
118
+ if normalized == "min":
119
+ return series.min()
120
+ if normalized == "max":
121
+ return series.max()
122
+ if normalized == "count":
123
+ return int(series.count())
124
+ if normalized == "size":
125
+ return int(len(series))
126
+ if normalized == "std":
127
+ return series.std()
128
+ raise ValueError(f"Unsupported aggregation function {function!r}.")
129
+
130
+
131
+ def _as_list(value: str | Sequence[str] | None) -> list[str]:
132
+ if value is None:
133
+ return []
134
+ if isinstance(value, str):
135
+ return [value]
136
+ return list(value)
@@ -0,0 +1,257 @@
1
+ """Binning and grouping specs for model factors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ JsonDict = dict[str, Any]
11
+
12
+ _MISSING = "__missing__"
13
+
14
+
15
+ def _final_edges(raw_edges: list[float], series: pd.Series) -> list[float]:
16
+ clean = pd.to_numeric(series, errors="coerce").dropna()
17
+ if clean.empty:
18
+ raise ValueError("Cannot create bins from an empty numeric column.")
19
+ edges = sorted({float(value) for value in raw_edges})
20
+ lower = float(clean.min()) - 1e-9
21
+ upper = float(clean.max()) + 1e-9
22
+ if len(edges) < 2:
23
+ midpoint = float(clean.median())
24
+ edges = [lower, midpoint, upper]
25
+ else:
26
+ edges[0] = min(edges[0], lower)
27
+ edges[-1] = max(edges[-1], upper)
28
+ final = [edges[0]]
29
+ for edge in edges[1:]:
30
+ if edge > final[-1]:
31
+ final.append(edge)
32
+ return final if len(final) >= 2 else [lower, upper]
33
+
34
+
35
def make_numeric_bins(
    series: pd.Series,
    bins: int = 10,
    column: str | None = None,
    method: str = "quantile",
) -> JsonDict:
    """Create a JSON-serializable numeric binning spec.

    Parameters
    ----------
    series:
        Numeric values used to derive bin edges.
    bins:
        Number of bins to request (must be at least 1).
    column:
        Source column name. When omitted, ``series.name`` is used if present.
    method:
        Binning method, currently ``"quantile"`` or ``"uniform"``.

    Returns
    -------
    dict
        Numeric binning spec with edges, labels, source column, and output
        column.
    """

    if bins < 1:
        raise ValueError("bins must be at least 1.")
    values = pd.to_numeric(series, errors="coerce").dropna()

    if method == "uniform":
        raw = np.linspace(float(values.min()), float(values.max()), bins + 1).tolist()
    elif method == "quantile":
        raw = values.quantile(np.linspace(0.0, 1.0, bins + 1)).tolist()
    else:
        raise ValueError("method must be 'quantile' or 'uniform'.")

    # _final_edges dedupes, widens, and validates the candidate edges.
    edges = _final_edges(raw, values)
    name = column or series.name or "value"
    return {
        "type": "numeric",
        "column": name,
        "output": f"{name}_bin",
        "method": method,
        "edges": edges,
        "labels": [f"bin_{index + 1}" for index in range(len(edges) - 1)],
    }
82
+
83
+
84
+ def _category_key(value: object) -> str:
85
+ if pd.isna(value):
86
+ return _MISSING
87
+ if isinstance(value, np.generic):
88
+ value = value.item()
89
+ return str(value)
90
+
91
+
92
def category_target_order(
    df: pd.DataFrame,
    factor: str,
    target: str,
    exposure: str | None = None,
    weight: str | None = None,
) -> pd.DataFrame:
    """Order categories by observed training target level.

    Parameters
    ----------
    df:
        Training data.
    factor:
        Categorical factor column to order.
    target:
        Observed outcome column.
    exposure:
        Optional exposure column used as the denominator for the level.
        Takes precedence over ``weight`` when both are supplied.
    weight:
        Optional row-weight column used as the denominator for weighted levels.

    Returns
    -------
    pandas.DataFrame
        Category table sorted by observed level, including measure, actual,
        level, and credibility columns.
    """

    needed = [factor, target]
    if exposure is not None:
        needed.append(exposure)
    if weight is not None and weight not in needed:
        needed.append(weight)

    work = df[needed].copy()
    # Stringify categories so NaN and NumPy scalars get stable keys.
    work[factor] = work[factor].map(_category_key)

    # Pick the numerator/denominator pair for the level statistic.
    if exposure is not None:
        spec = {"measure": (exposure, "sum"), "actual": (target, "sum")}
    elif weight is not None:
        work["_weighted_target"] = work[target] * work[weight]
        spec = {"measure": (weight, "sum"), "actual": ("_weighted_target", "sum")}
    else:
        spec = {"measure": (target, "size"), "actual": (target, "sum")}

    table = work.groupby(factor, dropna=False).agg(**spec).reset_index()
    # Tiny floor prevents division by zero for empty/zero denominators.
    table["level"] = table["actual"] / table["measure"].clip(lower=1e-9)
    table["credibility"] = np.sqrt(table["actual"].clip(lower=0.0) + 1.0)
    return table.sort_values(["level", "measure"]).reset_index(drop=True)
152
+
153
+
154
def make_categorical_groups(
    df: pd.DataFrame,
    factor: str,
    target: str,
    exposure: str | None = None,
    weight: str | None = None,
    cutpoints: list[int] | None = None,
) -> JsonDict:
    """Create a target-ordered categorical grouping spec from training data.

    Parameters
    ----------
    df:
        Training data.
    factor:
        Categorical factor column to group.
    target:
        Observed outcome column used for ordering categories.
    exposure:
        Optional exposure column used as the denominator for category levels.
    weight:
        Optional row-weight column used for weighted category levels.
    cutpoints:
        Ordered category positions where new groups should start; out-of-range
        positions are dropped.

    Returns
    -------
    dict
        JSON-serializable categorical grouping spec with order, cutpoints,
        mapping, labels, and training statistics.
    """

    stats = category_target_order(df, factor, target, exposure=exposure, weight=weight)
    ordered = [str(category) for category in stats[factor].tolist()]

    # Keep only valid interior cutpoints, deduplicated and sorted.
    valid = sorted({int(point) for point in (cutpoints or []) if 0 < int(point) < len(ordered)})
    boundaries = [0, *valid, len(ordered)]

    labels = [f"group_{index + 1:02d}" for index in range(len(boundaries) - 1)]
    # Assign each ordered category to its containing group slice.
    mapping = {
        category: labels[index]
        for index in range(len(labels))
        for category in ordered[boundaries[index] : boundaries[index + 1]]
    }

    return {
        "type": "categorical",
        "column": factor,
        "output": f"{factor}_group",
        "order": ordered,
        "cutpoints": valid,
        "mapping": mapping,
        "labels": labels,
        "default": "other",
        "missing": _MISSING,
        "stats": stats.rename(columns={factor: "category"}).to_dict("records"),
    }
212
+
213
+
214
def apply_spec(
    df: pd.DataFrame,
    spec: JsonDict,
    output: str | None = None,
) -> pd.DataFrame:
    """Apply a saved numeric or categorical spec to a dataframe.

    Parameters
    ----------
    df:
        Data containing the raw column referenced by ``spec``.
    spec:
        JSON-serializable numeric or categorical spec.
    output:
        Optional output column override. When omitted, the spec output is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the transformed output column added. Numeric
        values outside the spec edges (or non-numeric) become ``"missing"``;
        unmapped categories fall back to the spec default.
    """

    kind = str(spec["type"])
    source = str(spec["column"])
    target_column = output or str(spec.get("output") or f"{source}_{kind}")
    frame = df.copy()

    if kind == "categorical":
        keys = frame[source].map(_category_key)
        fallback = str(spec.get("default", "other"))
        frame[target_column] = keys.map(spec["mapping"]).fillna(fallback)
        return frame

    if kind == "numeric":
        numeric = pd.to_numeric(frame[source], errors="coerce")
        cut = pd.cut(
            numeric,
            bins=spec["edges"],
            labels=spec["labels"],
            include_lowest=True,
        ).astype(object)
        # Out-of-range or unparseable values end up NaN after the cut.
        frame[target_column] = cut.where(cut.notna(), other="missing")
        return frame

    raise ValueError("spec type must be 'numeric' or 'categorical'.")