p2predict 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- p2predict/__init__.py +88 -0
- p2predict/__main__.py +11 -0
- p2predict/cli/__init__.py +9 -0
- p2predict/cli/predict.py +706 -0
- p2predict/cli/train.py +659 -0
- p2predict/cmdline_io.py +64 -0
- p2predict/explain.py +464 -0
- p2predict/feature_selection.py +139 -0
- p2predict/hpo_training.py +44 -0
- p2predict/input_checks.py +59 -0
- p2predict/intervals.py +317 -0
- p2predict/json_output.py +225 -0
- p2predict/mcp/__init__.py +1 -0
- p2predict/mcp/__main__.py +3 -0
- p2predict/mcp/conversions.py +44 -0
- p2predict/mcp/registry.py +149 -0
- p2predict/mcp/server.py +1258 -0
- p2predict/model_evals.py +36 -0
- p2predict/model_utils.py +235 -0
- p2predict/outliers.py +234 -0
- p2predict/plotting.py +499 -0
- p2predict/prepare_data.py +48 -0
- p2predict/preprocessing.py +130 -0
- p2predict/quality.py +457 -0
- p2predict/trained_model_io.py +64 -0
- p2predict/training.py +270 -0
- p2predict/ui_console.py +36 -0
- p2predict/whatif.py +269 -0
- p2predict-0.9.0.dist-info/METADATA +216 -0
- p2predict-0.9.0.dist-info/RECORD +34 -0
- p2predict-0.9.0.dist-info/WHEEL +5 -0
- p2predict-0.9.0.dist-info/entry_points.txt +4 -0
- p2predict-0.9.0.dist-info/licenses/LICENSE +121 -0
- p2predict-0.9.0.dist-info/top_level.txt +1 -0
p2predict/model_evals.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from sklearn.inspection import permutation_importance
|
|
3
|
+
from sklearn.metrics import mean_absolute_error, r2_score
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def evaluate_model(X_test, y_test, model):
    """Score ``model`` on held-out data and test its residuals for bias.

    Parameters
    ----------
    X_test : array-like
        Features handed straight to ``model.predict``.
    y_test : array-like
        True target values for ``X_test``.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    tuple
        ``(mae, r2, p_value, rmse)``. ``p_value`` is a plain float from a
        one-sample t-test of the residuals against a mean of zero. It is
        ``1.0`` when the test is not run: fewer than two residuals, or a
        residual spread that is not strictly positive.
    """
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Residual-mean test (this replaced an earlier, incorrect two-sample
    # t-test): under a well-calibrated model the residuals should be centred
    # on zero. Reject => systematic bias.
    #
    # Flatten both sides before subtracting. A column-shaped target (n, 1)
    # minus a 1-D prediction (n,) would otherwise broadcast to an (n, n)
    # matrix, corrupting the RMSE and turning the p-value into an array.
    residuals = np.asarray(y_test).ravel() - np.asarray(predictions).ravel()
    rmse = float(np.sqrt(np.mean(residuals ** 2)))

    p_value = 1.0
    if len(residuals) > 1 and residuals.std(ddof=1) > 0:
        # Imported lazily, as in the original, so scipy is only loaded when
        # the test actually runs.
        from scipy import stats

        _, p_value = stats.ttest_1samp(residuals, 0.0)
        p_value = float(p_value)

    return mae, r2, p_value, rmse
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_column_statistics(data, feature_columns):
    """Return skewness and kurtosis for each requested column.

    The result maps every name in ``feature_columns`` to a dict with the keys
    ``"skewness"`` and ``"kurtosis"``, obtained from the column's ``skew()``
    and ``kurt()`` methods.
    """
    statistics = {}
    for name in feature_columns:
        column = data[name]
        statistics[name] = {
            "skewness": column.skew(),
            "kurtosis": column.kurt(),
        }
    return statistics
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def calculate_feature_importance(X, y, model):
    """Return permutation importances scaled by their sum.

    Runs ``permutation_importance`` with 10 repeats, a fixed random state of
    0 and ``n_jobs=-1``, then divides the mean importances by their total.
    A total of zero is replaced by 1.0 so the division stays defined.
    """
    outcome = permutation_importance(
        model, X, y, n_repeats=10, random_state=0, n_jobs=-1
    )
    mean_importances = outcome.importances_mean
    denominator = sum(mean_importances)
    if not denominator:
        denominator = 1.0
    return mean_importances / denominator
|
p2predict/model_utils.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from sklearn.compose import TransformedTargetRegressor
|
|
6
|
+
from sklearn.pipeline import Pipeline
|
|
7
|
+
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, TargetEncoder
|
|
8
|
+
|
|
9
|
+
from p2predict.explain import Explanation
|
|
10
|
+
from p2predict.whatif import WhatIfResult, interaction_is_material
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def inner_pipeline(model):
    """Return the fitted inner regressor of a TransformedTargetRegressor.

    Any other object is handed back unchanged.
    """
    if isinstance(model, TransformedTargetRegressor):
        return model.regressor_
    return model
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def extract_feature_info(pipeline):
    """Return ``(feature_types, all_categories)`` from a fitted preprocessor.

    Works with OneHotEncoder, OrdinalEncoder, and TargetEncoder pipelines.

    ``pipeline`` must expose ``named_steps["preprocessor"]`` with a fitted
    ``transformers_`` list. Columns of the transformer named ``"num"`` are
    reported as ``"Numerical"`` and those of ``"cat"`` as ``"Categorical"``;
    transformers with any other name are ignored.

    ``all_categories`` maps each categorical column to the list of categories
    its encoder learned. It stays empty when no supported encoder with a
    ``categories_`` attribute is found.
    """
    encoder_types = (OneHotEncoder, OrdinalEncoder, TargetEncoder)
    preprocessor = pipeline.named_steps["preprocessor"]
    feature_types: dict[str, str] = {}
    all_categories: dict[str, list] = {}

    for name, transformer, columns in preprocessor.transformers_:
        if name == "num":
            feature_types.update({col: "Numerical" for col in columns})
        elif name == "cat":
            feature_types.update({col: "Categorical" for col in columns})

            encoder = transformer
            if isinstance(transformer, Pipeline):
                steps = transformer.named_steps
                if "onehot" in steps:
                    encoder = steps["onehot"]
                elif "target" in steps:
                    encoder = steps["target"]
                else:
                    # Neither conventional step name is present (e.g. an
                    # OrdinalEncoder step): fall back to the first step that
                    # is a supported encoder, so its categories are not
                    # silently dropped.
                    encoder = next(
                        (
                            step
                            for _, step in transformer.steps
                            if isinstance(step, encoder_types)
                        ),
                        transformer,
                    )

            if isinstance(encoder, encoder_types) and hasattr(encoder, "categories_"):
                # categories_ is positionally aligned with the columns the
                # encoder was fitted on.
                all_categories.update(
                    {
                        col: cat.tolist()
                        for col, cat in zip(columns, encoder.categories_)
                    }
                )

    return feature_types, all_categories
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def coerce_features(features_df, feature_types):
    """Convert every column typed ``"Numerical"`` to a numeric dtype.

    Values that cannot be parsed become NaN. Columns named in
    ``feature_types`` but absent from ``features_df`` are skipped. The frame
    is modified in place and also returned.
    """
    numeric_columns = [
        name
        for name, kind in feature_types.items()
        if kind == "Numerical" and name in features_df.columns
    ]
    for name in numeric_columns:
        features_df[name] = pd.to_numeric(features_df[name], errors="coerce")
    return features_df
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def interval_to_dicts(intervals) -> list[dict]:
    """Turn a list of IntervalResult objects into JSON-ready dicts.

    Every dict holds the raw ``low`` / ``prediction`` / ``high`` bounds and
    the ``band``, plus two derived fields computed from those three numbers:
    ``reliability`` (from ``interval_reliability``) and ``say_to_user`` (from
    ``interval_say_to_user``). Lead with ``say_to_user`` so the user does not
    have to infer trust from the bare low/high numbers.
    """
    from p2predict.quality import interval_reliability, interval_say_to_user

    def _serialize(interval) -> dict:
        low = float(interval.low)
        pred = float(interval.prediction)
        high = float(interval.high)
        return {
            "low": low,
            "prediction": pred,
            "high": high,
            "band": interval.band,
            "reliability": interval_reliability(low, pred, high),
            "say_to_user": interval_say_to_user(low, pred, high),
        }

    return [_serialize(interval) for interval in intervals]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def explanation_to_dict(explanation: Explanation) -> dict:
    """Turn an Explanation into a JSON-ready dict.

    Two views of the same attribution are included:

    - Business view, quotable to a category manager: ``starting_point`` (the
      baseline price every part starts from) and ``price_drivers`` (each
      spec/supplier's effect in dollars and percent, biggest mover first).
      Lead with this one.
    - Technical view (``baseline``, ``contributions``,
      ``multiplicative_factors``, ``dollar_attribution``, ``residual``) for
      callers that need the raw SHAP numbers. Do NOT surface these key names
      to a procurement user.

    ``multiplicative_factors`` and ``dollar_attribution`` are ``None`` unless
    the explanation is log-target and carries multiplicative factors.
    """

    def _by_magnitude(item):
        return abs(item[1])

    def _by_log_magnitude(item):
        # Rank factors by distance from 1.0 on a log scale; a non-positive
        # factor has no logarithm and is ranked last.
        factor = item[1]
        return abs(np.log(factor)) if factor > 0 else 0.0

    def _ranked(mapping, value_key, sort_key):
        ordered = sorted(mapping.items(), key=sort_key, reverse=True)
        return [
            {"feature": name, value_key: float(amount)} for name, amount in ordered
        ]

    out = {
        "baseline": float(explanation.baseline),
        "prediction": float(explanation.prediction),
        "log_target": bool(explanation.log_target),
        "contributions": _ranked(explanation.contributions, "value", _by_magnitude),
        "residual": float(explanation.residual),
    }

    factors = None
    dollars = None
    if explanation.log_target and explanation.multiplicative_factors is not None:
        factors = _ranked(
            explanation.multiplicative_factors, "factor", _by_log_magnitude
        )
        if explanation.dollar_attribution is not None:
            dollars = _ranked(explanation.dollar_attribution, "value", _by_magnitude)
    out["multiplicative_factors"] = factors
    out["dollar_attribution"] = dollars

    out["starting_point"] = _business_starting_point(explanation)
    out["price_drivers"] = _business_price_drivers(explanation)
    return out
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _business_starting_point(explanation: Explanation) -> float:
    """Return the baseline price a part 'starts from' before its specs apply.

    For a log-target explanation with a ``baseline_price`` set, that
    price-space value is used; otherwise the plain ``baseline`` is returned.
    """
    use_price_space = (
        explanation.log_target and explanation.baseline_price is not None
    )
    source = explanation.baseline_price if use_price_space else explanation.baseline
    return float(source)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _business_price_drivers(explanation: Explanation) -> list[dict]:
    """A single plain-language attribution list the agent can quote directly.

    Each entry is one spec/supplier and its effect on the price, expressed in
    BOTH dollars and percent, biggest absolute mover first:
        {"driver": "Supplier ADI", "effect_dollars": 0.72, "effect_pct": 18.0}
    Works for additive and log-target models alike, so the caller never has to
    branch on the model's internal scale.

    For log-target explanations ``effect_pct`` is derived from the
    multiplicative factor and ``effect_dollars`` is ``None`` when no dollar
    attribution exists for that feature. For additive explanations the
    contribution is the dollar effect and ``effect_pct`` is its share of the
    baseline (a zero baseline is treated as 1.0 to avoid dividing by zero).

    Entries are ordered by absolute dollar effect, then by absolute percent
    effect. The percent tiebreak keeps the "biggest mover first" promise when
    dollar figures are unavailable, where every dollar key would be 0.
    """
    drivers: list[dict] = []
    if explanation.log_target and explanation.multiplicative_factors is not None:
        dollars = explanation.dollar_attribution or {}
        for feature, factor in explanation.multiplicative_factors.items():
            amount = dollars.get(feature)
            drivers.append({
                "driver": feature,
                "effect_dollars": (
                    round(float(amount), 4) if amount is not None else None
                ),
                "effect_pct": round((float(factor) - 1.0) * 100.0, 1),
            })
    else:
        # Additive model: contributions are already dollars; percent is the
        # share of the baseline each driver moves the price by.
        base = float(explanation.baseline) or 1.0
        for feature, value in explanation.contributions.items():
            drivers.append({
                "driver": feature,
                "effect_dollars": round(float(value), 4),
                "effect_pct": round(float(value) / base * 100.0, 1),
            })

    drivers.sort(
        key=lambda d: (abs(d["effect_dollars"] or 0.0), abs(d["effect_pct"])),
        reverse=True,
    )
    return drivers
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def whatif_to_dict(result: WhatIfResult) -> dict:
    """Turn a WhatIfResult into a JSON-ready dict.

    ``summary`` is the plain-language headline the agent can quote to a
    category manager ("Switching to Microchip saves $0.41 per part, -12%").
    Its ``direction`` is ``"adds"`` for a positive delta, ``"saves"`` for a
    negative one and ``"no change"`` otherwise, and its effect figures are
    absolute values. The remaining keys are the technical detail behind it.
    Lead with ``summary``.
    """

    def _bounds(interval):
        # Intervals are optional; keep an absent one as None.
        if interval is None:
            return None
        return {"low": float(interval.low), "high": float(interval.high)}

    delta = float(result.delta)
    if delta > 0:
        direction = "adds"
    elif delta < 0:
        direction = "saves"
    else:
        direction = "no change"

    summary = {
        "direction": direction,  # "adds" | "saves" | "no change"
        "effect_dollars": round(abs(delta), 4),
        "effect_pct": round(abs(float(result.delta_pct)), 1),
        "new_price": round(float(result.counterfactual_prediction), 4),
        "old_price": round(float(result.base_prediction), 4),
    }

    changes = {}
    for col, (base_val, cf_val) in result.changes.items():
        changes[col] = {"from": base_val, "to": cf_val}

    factor = result.multiplicative_factor
    if factor is not None:
        factor = float(factor)

    ranked = sorted(
        result.changed_contributions.items(),
        key=lambda item: abs(item[1]),
        reverse=True,
    )
    changed_contributions = [
        {"feature": name, "value": float(amount)} for name, amount in ranked
    ]

    return {
        "summary": summary,
        "changes": changes,
        "base_prediction": float(result.base_prediction),
        "counterfactual_prediction": float(result.counterfactual_prediction),
        "delta": delta,
        "delta_pct": float(result.delta_pct),
        "log_target": bool(result.log_target),
        "multiplicative_factor": factor,
        "changed_contributions": changed_contributions,
        "interaction_contribution": float(result.interaction_contribution),
        "interaction_is_material": bool(interaction_is_material(result)),
        "base_interval": _bounds(result.base_interval),
        "cf_interval": _bounds(result.cf_interval),
    }
|
p2predict/outliers.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""Outlier detection and handling for both the target and the feature columns.
|
|
2
|
+
|
|
3
|
+
Procurement data routinely contains rush orders, one-off spot buys, and
|
|
4
|
+
data-entry errors (a `Weight` of `100000` when someone meant `100`, a unit
|
|
5
|
+
mix-up between kg and g). Both kinds quietly distort a learned cost model:
|
|
6
|
+
target outliers warp R² and inflate intervals, feature outliers pull the
|
|
7
|
+
model's response surface around the bad rows.
|
|
8
|
+
|
|
9
|
+
This module flags both via the same Tukey IQR rule and applies one of four
|
|
10
|
+
policies.
|
|
11
|
+
|
|
12
|
+
Policies
|
|
13
|
+
--------
|
|
14
|
+
keep Flag only, change nothing.
|
|
15
|
+
warn Flag and warn (default). Same as keep, but with a console
|
|
16
|
+
message so the user actually finds out.
|
|
17
|
+
drop Remove flagged rows before training.
|
|
18
|
+
winsorize Cap flagged values at the IQR bounds (preserves row count).
|
|
19
|
+
|
|
20
|
+
Two surfaces
|
|
21
|
+
------------
|
|
22
|
+
1. ``apply_outlier_policy(data, target_column, policy, multiplier)``
|
|
23
|
+
Target-side. Inspects ``data[target_column]`` only. Has been around
|
|
24
|
+
since v0.3.
|
|
25
|
+
|
|
26
|
+
2. ``apply_feature_outlier_policy(data, feature_columns, policy, multiplier)``
|
|
27
|
+
Feature-side. Inspects every *numerical* column in ``feature_columns``
|
|
28
|
+
independently. ``drop`` removes any row that has an outlier in any
|
|
29
|
+
feature column; ``winsorize`` caps each column at its own IQR bounds
|
|
30
|
+
independently. Categorical columns are silently ignored — "outlier"
|
|
31
|
+
doesn't have a clean meaning there (a rare category isn't necessarily
|
|
32
|
+
wrong, just rare). Added in v0.7.
|
|
33
|
+
|
|
34
|
+
drop semantics for the feature-side path
|
|
35
|
+
----------------------------------------
|
|
36
|
+
We chose to drop the *whole row* when any one feature column flags an
|
|
37
|
+
outlier, rather than null out the offending cell. Reasoning: in
|
|
38
|
+
procurement data an outlier in one feature almost always signals
|
|
39
|
+
data-entry corruption that correlates with quality issues elsewhere in
|
|
40
|
+
the row (a transcription mistake, a unit confusion). Throwing the row
|
|
41
|
+
is the conservative move. Users who want per-cell handling should
|
|
42
|
+
pre-clean the CSV before feeding it in.
|
|
43
|
+
|
|
44
|
+
Detection rule (both surfaces)
|
|
45
|
+
------------------------------
|
|
46
|
+
Standard Tukey IQR: a value is an outlier when it is below Q1 − 1.5·IQR
|
|
47
|
+
or above Q3 + 1.5·IQR, where IQR = Q3 − Q1. The multiplier is exposed
|
|
48
|
+
on the API but defaults to 1.5; raising it (e.g. 3.0) catches only
|
|
49
|
+
extreme outliers, lowering it (e.g. 1.0) is more aggressive.
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
import numpy as np
|
|
53
|
+
import pandas as pd
|
|
54
|
+
|
|
55
|
+
POLICIES = ("keep", "warn", "drop", "winsorize")
IQR_MULTIPLIER = 1.5


def detect_outliers(values, multiplier=IQR_MULTIPLIER):
    """Return a ``(mask, lower, upper)`` tuple flagging Tukey-IQR outliers.

    ``values`` may be any 1-D iterable. Non-numeric / NaN entries are treated
    as non-outliers (the mask is False for them) so callers don't need to
    clean inputs first.

    Parameters
    ----------
    values : iterable
        Values to inspect. Entries that cannot be parsed as numbers are
        handled as missing.
    multiplier : float
        Tukey IQR multiplier applied on both sides of the quartiles.

    Returns
    -------
    (mask, lower, upper)
        ``mask`` is a boolean Series indexed like ``values``. ``lower`` and
        ``upper`` are the bounds outside which a value is flagged; both are
        NaN when there are no usable values, and both equal the central
        point when the IQR is zero.
    """
    # errors="coerce" turns unparseable entries into NaN instead of raising,
    # which is what the docstring promises for non-numeric input.
    series = pd.to_numeric(pd.Series(values), errors="coerce").astype(float)
    present = series.dropna()
    if present.empty:
        return pd.Series([False] * len(series), index=series.index), float("nan"), float("nan")

    q1, q3 = present.quantile([0.25, 0.75])
    iqr = q3 - q1
    if iqr == 0:
        # Central 50% collapses to a single point. The Tukey rule
        # degenerates and the old behaviour (mask all False) silently
        # missed obvious outliers in near-constant columns — e.g. a
        # Weight column of [10]*20 + [10_000] would slip through. Anything
        # not equal to the central point is, by definition, outside the
        # central 50%, so we flag it; bounds collapse to that point.
        point = float(q1)
        # NaN != point evaluates to True, so missing entries must be masked
        # out explicitly; fillna() cannot do it on an already-boolean mask.
        mask = series.ne(point) & series.notna()
        return mask, point, point

    lower = float(q1 - multiplier * iqr)
    upper = float(q3 + multiplier * iqr)
    # Comparisons against NaN are False, so missing entries are never flagged.
    mask = (series < lower) | (series > upper)
    mask = mask.fillna(False)
    return mask, lower, upper
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def apply_outlier_policy(data, target_column, policy="warn", multiplier=IQR_MULTIPLIER):
    """Handle outliers in ``data[target_column]`` according to ``policy``.

    Returns ``(df, summary)``. The summary dict is meant for caller-side
    logging and has the keys
    ``{n_outliers, n_total, lower, upper, policy, applied}``.

    ``applied`` names the action that actually changed the data: ``drop``,
    ``winsorize``, or ``none``. It stays ``none`` for ``keep`` / ``warn`` and
    whenever no outlier was flagged.

    Raises ``ValueError`` when ``policy`` is not one of ``POLICIES``.
    """
    if policy not in POLICIES:
        raise ValueError(f"Unknown outlier policy: {policy}. Choose from {POLICIES}.")

    flagged, lower, upper = detect_outliers(data[target_column], multiplier=multiplier)
    n_flagged = int(flagged.sum())
    summary = {
        "n_outliers": n_flagged,
        "n_total": len(data),
        "lower": lower,
        "upper": upper,
        "policy": policy,
        "applied": "none",
    }

    if n_flagged and policy == "drop":
        summary["applied"] = "drop"
        kept = data.loc[~flagged].reset_index(drop=True)
        return kept, summary

    if n_flagged and policy == "winsorize":
        summary["applied"] = "winsorize"
        # Work on a copy so the caller's frame is left untouched.
        capped = data.copy()
        capped[target_column] = capped[target_column].clip(lower=lower, upper=upper)
        return capped, summary

    # Nothing flagged, or keep / warn: hand the data back unchanged.
    return data, summary
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _is_numeric_series(series: pd.Series) -> bool:
    """Report whether a column has a numeric, non-boolean dtype.

    The feature-outlier path skips every column for which this is False.
    Categorical columns are left out on purpose: "outlier" has no clean
    meaning for a discrete code, since a rare category isn't necessarily
    wrong, just rare. Use ``find_high_variation_features`` for that case.
    """
    if pd.api.types.is_bool_dtype(series):
        return False
    return pd.api.types.is_numeric_dtype(series)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def apply_feature_outlier_policy(
    data, feature_columns, policy="warn", multiplier=IQR_MULTIPLIER
):
    """Handle outliers across one or more *feature* columns.

    Parameters
    ----------
    data : pd.DataFrame
        Training data.
    feature_columns : Iterable[str]
        Columns to inspect. Names missing from ``data`` and non-numeric
        columns are silently skipped. Pass the model's feature list, with
        the target and any time column already excluded upstream.
    policy : str
        One of POLICIES. ``drop`` removes every row with an outlier in *any*
        inspected column; ``winsorize`` caps each column at its own bounds
        independently; ``keep`` / ``warn`` change nothing (surfacing the
        warning is left to the caller).
    multiplier : float
        Tukey IQR multiplier. 1.5 is the textbook default.

    Returns
    -------
    (df, summary) where summary has the shape::

        {
            "policy": policy,
            "applied": "drop" | "winsorize" | "none",
            "n_total": int,
            "n_outliers_total": int,   # rows touched at all
            "per_column": {
                column_name: {
                    "n_outliers": int,
                    "lower": float,
                    "upper": float,
                },
                ...
            },
        }

    ``n_outliers_total`` counts rows with at least one outlier (the figure
    that matters for ``drop``); the per-column counts can add up to more
    when a single row is flagged in several columns.

    Raises
    ------
    ValueError
        If ``policy`` is not one of POLICIES.
    """
    if policy not in POLICIES:
        raise ValueError(f"Unknown outlier policy: {policy}. Choose from {POLICIES}.")

    per_column = {}
    row_flags = pd.Series(False, index=data.index)

    for col in feature_columns:
        if col not in data.columns or not _is_numeric_series(data[col]):
            continue
        col_flags, lower, upper = detect_outliers(data[col], multiplier=multiplier)
        count = int(col_flags.sum())
        per_column[col] = {"n_outliers": count, "lower": lower, "upper": upper}
        if count > 0:
            # Align the column-specific mask onto the full-frame mask.
            row_flags = row_flags | col_flags.reindex(data.index, fill_value=False)

    touched = int(row_flags.sum())
    summary = {
        "policy": policy,
        "applied": "none",
        "n_total": len(data),
        "n_outliers_total": touched,
        "per_column": per_column,
    }

    # Nothing flagged, or keep / warn: hand the data back unchanged.
    if touched == 0 or policy in ("keep", "warn"):
        return data, summary

    if policy == "drop":
        summary["applied"] = "drop"
        return data.loc[~row_flags].reset_index(drop=True), summary

    # Only "winsorize" remains: cap each flagged column at its own bounds,
    # independently of the others. Row count is preserved.
    summary["applied"] = "winsorize"
    capped = data.copy()
    for col, bounds in per_column.items():
        if bounds["n_outliers"]:
            capped[col] = capped[col].clip(
                lower=bounds["lower"], upper=bounds["upper"]
            )
    return capped, summary
|