p2predict 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- p2predict/__init__.py +88 -0
- p2predict/__main__.py +11 -0
- p2predict/cli/__init__.py +9 -0
- p2predict/cli/predict.py +706 -0
- p2predict/cli/train.py +659 -0
- p2predict/cmdline_io.py +64 -0
- p2predict/explain.py +464 -0
- p2predict/feature_selection.py +139 -0
- p2predict/hpo_training.py +44 -0
- p2predict/input_checks.py +59 -0
- p2predict/intervals.py +317 -0
- p2predict/json_output.py +225 -0
- p2predict/mcp/__init__.py +1 -0
- p2predict/mcp/__main__.py +3 -0
- p2predict/mcp/conversions.py +44 -0
- p2predict/mcp/registry.py +149 -0
- p2predict/mcp/server.py +1258 -0
- p2predict/model_evals.py +36 -0
- p2predict/model_utils.py +235 -0
- p2predict/outliers.py +234 -0
- p2predict/plotting.py +499 -0
- p2predict/prepare_data.py +48 -0
- p2predict/preprocessing.py +130 -0
- p2predict/quality.py +457 -0
- p2predict/trained_model_io.py +64 -0
- p2predict/training.py +270 -0
- p2predict/ui_console.py +36 -0
- p2predict/whatif.py +269 -0
- p2predict-0.9.0.dist-info/METADATA +216 -0
- p2predict-0.9.0.dist-info/RECORD +34 -0
- p2predict-0.9.0.dist-info/WHEEL +5 -0
- p2predict-0.9.0.dist-info/entry_points.txt +4 -0
- p2predict-0.9.0.dist-info/licenses/LICENSE +121 -0
- p2predict-0.9.0.dist-info/top_level.txt +1 -0
p2predict/cmdline_io.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
from rich.table import Table
|
|
3
|
+
console = Console()
|
|
4
|
+
|
|
5
|
+
def print_logo():
    """Print the P2Predict ASCII-art banner on the shared console.

    Each banner row is emitted with its own ``console.print`` call, styled
    bold blue.
    """
    banner_rows = (
        " ____ ____ ____ _ _ _ ",
        "| _ \\ |___ \\ | _ \\ _ __ ___ __| |(_) ___ | |_ ",
        "| |_) | __) || |_) || '__| / _ \\ / _` || | / __|| __|",
        "| __/ / __/ | __/ | | | __/| (_| || || (__ | |_ ",
        "|_| |_____||_| |_| \\___| \\__,_||_| \\___| \\__|",
    )
    for row in banner_rows:
        console.print(row, style='bold blue')
|
|
11
|
+
|
|
12
|
+
def plot_importances(feature_importances, feature_names):
    """Print a two-column table of feature names and importance percentages.

    ``feature_importances`` and ``feature_names`` are indexed in parallel by
    position. Each importance is multiplied by 100, rounded to two decimals
    and suffixed with ``%`` for display.
    """
    table = Table(show_header=True, header_style="bold blue", highlight=True)
    table.add_column("Feature", overflow="fold", width=50)  # Adjust the width as necessary
    table.add_column("Importance (%)", justify="right")

    for position in range(len(feature_importances)):
        percent = round(feature_importances[position] * 100, 2)
        table.add_row(feature_names[position], f"{percent!s}%")

    console.print(table)
|
|
21
|
+
|
|
22
|
+
def print_feature_weights(sorted_feature_importances):
    """Print one console line per ``(feature, weight)`` pair.

    The weight is rounded to four decimal places before printing. Pairs are
    printed in the order the iterable yields them.
    """
    for name, weight in sorted_feature_importances:
        rounded_weight = round(weight, ndigits=4)
        console.print(f"Feature: {name}, Model Weight: {rounded_weight}")
|
|
25
|
+
|
|
26
|
+
def output_features(data):
    """Print a table listing each column of ``data`` with a readable type label.

    Parameters
    ----------
    data
        Object exposing ``dtypes.items()`` yielding ``(column, dtype)`` pairs
        (presumably a pandas DataFrame — confirm against callers).

    The three common dtypes get friendly labels (``object`` -> ``text``,
    ``int64`` -> ``numerical: integer``, ``float64`` -> ``numerical: float``).
    Every other dtype is shown by its own name.
    """
    friendly_labels = {
        "object": "text",
        "int64": "numerical: integer",
        "float64": "numerical: float",
    }

    table = Table(show_header=True, header_style="bold blue", highlight=True)
    table.add_column("Feature")
    table.add_column("Type")

    for col, dtype in data.dtypes.items():
        # rich only accepts strings (or renderables) as cells. A raw dtype
        # object (bool, datetime64, category, ...) or a non-string column
        # label would otherwise make add_row raise, so stringify both.
        dtype_name = str(dtype)
        table.add_row(str(col), friendly_labels.get(dtype_name, dtype_name))
    console.print(table)
|
|
40
|
+
|
|
41
|
+
def print_feature_stats(data):
    """Print per-column descriptive statistics as a table on the shared console.

    For every column of ``data`` the table shows min, max, mean, median,
    standard deviation, skewness and kurtosis; all but min/max are rounded to
    four decimals.

    Parameters
    ----------
    data
        Object exposing ``columns`` and per-column ``min``/``max``/``mean``/
        ``median``/``std``/``skew``/``kurt`` (presumably a pandas DataFrame).
        NOTE(review): the moment statistics presumably require numeric
        columns — confirm that callers filter out text columns first.
    """
    table = Table(show_header=True, header_style="bold blue", highlight=True)
    for heading in (
        "Feature",
        "Min",
        "Max",
        "Mean",
        "Median",
        "Standard Deviation",
        "Skewness",
        "Kurtosis",
    ):
        table.add_column(heading)

    for col in data.columns:
        series = data[col]
        stats = (
            series.min(),
            series.max(),
            round(series.mean(), ndigits=4),
            round(series.median(), ndigits=4),
            round(series.std(), ndigits=4),
            round(series.skew(), ndigits=4),
            round(series.kurt(), ndigits=4),
        )
        # Cells must be strings for rich, including the column label itself.
        table.add_row(str(col), *(str(value) for value in stats))

    # Use the module-level console like the other printers in this file,
    # rather than building a second Console that shadows it.
    console.print(table)
|
p2predict/explain.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
"""SHAP-based per-prediction explanations for P2Predict models.
|
|
2
|
+
|
|
3
|
+
What this module computes
|
|
4
|
+
-------------------------
|
|
5
|
+
For a fitted P2Predict model and a single input row x, return the additive
|
|
6
|
+
decomposition
|
|
7
|
+
|
|
8
|
+
f(x) = phi_0 + sum_i phi_i
|
|
9
|
+
|
|
10
|
+
where phi_0 is the model's baseline (its expected value over a background
|
|
11
|
+
population) and phi_i is feature i's Shapley value. The Shapley value is the
|
|
12
|
+
unique attribution satisfying efficiency (local accuracy), missingness,
|
|
13
|
+
symmetry, and consistency — that uniqueness is what makes the per-feature
|
|
14
|
+
numbers defensible in a design-review meeting rather than yet another
|
|
15
|
+
heuristic importance score.
|
|
16
|
+
|
|
17
|
+
Which algorithm we use, and why
|
|
18
|
+
-------------------------------
|
|
19
|
+
We pick the explainer that is *exact* for the model family and runs in
|
|
20
|
+
polynomial time. We do not fall back to KernelExplainer — it is slow and
|
|
21
|
+
Monte-Carlo approximate, and we never need it for the three model families
|
|
22
|
+
this project supports.
|
|
23
|
+
|
|
24
|
+
Linear (Ridge, Lasso) -> shap.LinearExplainer
|
|
25
|
+
Closed form: phi_i = beta_i * (x_i - E[x_i]). Requires a background
|
|
26
|
+
sample only to estimate E[x_i]; cost is O(F).
|
|
27
|
+
Trees (RandomForest, -> shap.TreeExplainer with feature_perturbation=
|
|
28
|
+
XGBoost) "tree_path_dependent" (Lundberg 2018).
|
|
29
|
+
Exact Shapley values in O(T L D^2), no background sample required —
|
|
30
|
+
the conditional expectations are estimated from the trees' own node
|
|
31
|
+
counts.
|
|
32
|
+
|
|
33
|
+
Log-target wrap (TransformedTargetRegressor with log / exp)
|
|
34
|
+
---------------------------------------------------------------
|
|
35
|
+
The inner model predicts log(price). SHAP values on the inner model live in
|
|
36
|
+
log space and satisfy local accuracy *in log space*:
|
|
37
|
+
|
|
38
|
+
log(pred) - log(base) = sum_i phi_i_log
|
|
39
|
+
|
|
40
|
+
Exponentiating turns the sum into a product:
|
|
41
|
+
|
|
42
|
+
pred / base = prod_i exp(phi_i_log)
|
|
43
|
+
|
|
44
|
+
So in price space each feature becomes a *multiplicative factor*
|
|
45
|
+
exp(phi_i_log) -- e.g. "Region=EU multiplies the predicted price by 1.18
|
|
46
|
+
(+18%)". This is the axiomatically clean reading.
|
|
47
|
+
|
|
48
|
+
For procurement readability we additionally surface an "approximate dollar
|
|
49
|
+
attribution" obtained by proportionally rescaling the log-space contributions
|
|
50
|
+
to the price-space delta (pred - base). This *forces* additivity in dollars
|
|
51
|
+
at the cost of breaking the SHAP axioms — it is not strict SHAP, and we label
|
|
52
|
+
it that way in the report and in the CLI.
|
|
53
|
+
|
|
54
|
+
Source-feature roll-up
|
|
55
|
+
----------------------
|
|
56
|
+
SHAP gives one value per *transformed* feature. We sum across the columns
|
|
57
|
+
that came from the same source column (one-hot dummies for linear models;
|
|
58
|
+
ordinal-encoded categoricals for tree models, where this is a no-op).
|
|
59
|
+
Summing one-hot dummies' Shapley values to attribute to the source column is
|
|
60
|
+
standard practice and is sound under SHAP's additivity property when the
|
|
61
|
+
dummies are mutually exclusive (exactly one is 1 at a time).
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
from __future__ import annotations
|
|
65
|
+
|
|
66
|
+
from dataclasses import dataclass, field
|
|
67
|
+
from typing import Optional
|
|
68
|
+
|
|
69
|
+
import numpy as np
|
|
70
|
+
import pandas as pd
|
|
71
|
+
from sklearn.compose import TransformedTargetRegressor
|
|
72
|
+
|
|
73
|
+
# Local-accuracy sanity-check tolerance. Floating-point + SHAP internals can
|
|
74
|
+
# leave a tiny residual; anything bigger is a sign something is wrong with
|
|
75
|
+
# the explainer choice or the transformed-matrix shape.
|
|
76
|
+
_LOCAL_ACCURACY_TOL = 1e-4
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _to_dense_2d(X) -> np.ndarray:
    """Return ``X`` as a dense ndarray.

    Objects exposing ``toarray`` (e.g. the scipy sparse matrix a
    ColumnTransformer with OneHotEncoder produces on the linear-model path)
    are densified through that method, because ``np.asarray`` on a sparse
    matrix yields a 0-d object array that breaks ``len()`` and indexing
    inside SHAP. Anything else goes through ``np.asarray``.
    """
    if not hasattr(X, "toarray"):
        return np.asarray(X)
    return X.toarray()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass
class Explanation:
    """Per-row attribution result.

    The contract:
      contributions[col] are in the *inner model's output space* (price for
      a non-log model; log(price) for a log-target model). They satisfy
      local accuracy: baseline + sum(contributions.values()) ~= prediction
      to within _LOCAL_ACCURACY_TOL.

    For log-target models the price-space fields are populated and the
    multiplicative_factors are the only attribution form that strictly
    satisfies the SHAP axioms in price space. dollar_attribution is a
    proportional rescaling — additive but not strict SHAP.

    For non-log models the price-space fields keep their defaults
    (``None`` / ``False``).
    """

    # Explainer expected value, in the inner model's output space.
    baseline: float
    # Inner model's prediction for the row, in the same space as `baseline`.
    prediction: float
    # Source column -> summed SHAP value of its transformed features.
    contributions: dict[str, float]
    # True when the model was wrapped in a TransformedTargetRegressor.
    log_target: bool = False
    # inverse_func(baseline); only set for log-target models.
    baseline_price: Optional[float] = None
    # inverse_func(prediction); only set for log-target models.
    predicted_price: Optional[float] = None
    # Source column -> exp(contribution); only set for log-target models.
    multiplicative_factors: Optional[dict[str, float]] = None
    # Source column -> share of (predicted_price - baseline_price), split in
    # proportion to the contributions; only set for log-target models.
    dollar_attribution: Optional[dict[str, float]] = None
    residual: float = 0.0  # local-accuracy residual, for diagnostics
    # True iff product(multiplicative_factors) == predicted_price / baseline_price
    # holds strictly. Holds for the v0.4 log/exp wrap; not for an older
    # log1p/expm1 wrap, where the factors apply to (1 + price) instead.
    strict_multiplicative: bool = False
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _unwrap(model):
    """Split a model into ``(inner_pipeline, is_log_target, inverse_func)``.

    A ``TransformedTargetRegressor`` is treated as a log-target wrap: its
    fitted ``regressor_`` is returned together with its ``inverse_func``
    (``np.exp`` when that attribute is missing or falsy). Reading the
    inverse off the wrapper, rather than hard-coding one, keeps the
    explanation correct for whichever forward/inverse pair was used at
    training time (v0.4+ uses log/exp; older models may have used
    log1p/expm1).

    Any other model is returned unchanged with ``False`` and ``None``.
    """
    if not isinstance(model, TransformedTargetRegressor):
        return model, False, None

    inverse = getattr(model, "inverse_func", None)
    if not inverse:
        inverse = np.exp
    return model.regressor_, True, inverse
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _detect_family(estimator) -> str:
    """Classify an estimator as ``"linear"``, ``"tree"`` or ``"unknown"``.

    The decision is a substring match on the lower-cased class name. The
    linear tokens are checked first, so a name containing tokens from both
    families is reported as linear.
    """
    lowered = type(estimator).__name__.lower()
    families = (
        ("linear", ("ridge", "lasso", "linear", "elasticnet")),
        ("tree", ("forest", "xgb", "gradientboost", "boost", "tree")),
    )
    for family, tokens in families:
        for token in tokens:
            if token in lowered:
                return family
    return "unknown"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _source_column_groups(
    preprocessor, source_cols: list[str], n_values: int
) -> dict[str, list[int]]:
    """Group transformed-feature indices by the source column that produced them.

    Each name from ``preprocessor.get_feature_names_out()`` has its
    ``"<transformer>__"`` prefix removed, then is assigned to the *longest*
    source column that either equals it or is followed by ``"_"`` in it.
    Longest-match keeps columns that share a prefix (e.g. 'weight' and
    'weight_extra') separate. A name matching no source column becomes its
    own group.

    Every source column appears in the result, with an empty list when it
    produced no transformed feature.

    Raises
    ------
    ValueError
        If the number of transformed feature names differs from ``n_values``.
    """
    raw_names = list(preprocessor.get_feature_names_out())
    if len(raw_names) != n_values:
        raise ValueError(
            f"Transformed-feature/SHAP-value length mismatch: "
            f"{len(raw_names)} names vs {n_values} values."
        )

    groups: dict[str, list[int]] = {}
    for col in source_cols:
        groups[col] = []

    for position, raw_name in enumerate(raw_names):
        _, separator, tail = raw_name.partition("__")
        rest = tail if separator else raw_name

        owner = None
        for col in source_cols:
            if rest != col and not rest.startswith(f"{col}_"):
                continue
            if owner is None or len(col) > len(owner):
                owner = col
        if owner is None:
            owner = rest

        groups.setdefault(owner, []).append(position)
    return groups
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _scalar_expected_value(explainer) -> float:
    """Return ``explainer.expected_value`` as a plain Python float.

    SHAP exposes the value either as a scalar or as a one-element sequence,
    depending on the model and version. A sequence holding anything other
    than exactly one element raises ``ValueError`` (only single-output
    regression is supported).
    """
    ev = explainer.expected_value
    if not isinstance(ev, (list, tuple, np.ndarray)):
        return float(ev)

    flat = np.atleast_1d(ev)
    if flat.size == 1:
        return float(flat[0])

    # Multi-output models are not in our scope (regression only).
    raise ValueError(
        "SHAP expected_value has multiple outputs; only single-output "
        "regression is supported."
    )
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _shap_values(explainer, X_t):
    """Return SHAP values for ``X_t`` as a 2-D ``(n_samples, n_features)`` array.

    Normalises the return shapes of ``explainer.shap_values``: a one-element
    list is unwrapped, and a 1-D row is reshaped to ``(1, n_features)``. A
    list with any other length is rejected with ``ValueError``.
    """
    raw = explainer.shap_values(X_t)
    if isinstance(raw, list):
        # Classification returns a list of per-class arrays; regression
        # returns either a 2-D array or a 1-D row. We only do regression.
        if len(raw) != 1:
            raise ValueError("Unexpected multi-output SHAP result.")
        raw = raw[0]

    matrix = np.asarray(raw)
    return matrix.reshape(1, -1) if matrix.ndim == 1 else matrix
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _patch_shap_xgboost_base_score(shap_module) -> None:
    """Coerce XGBoost >= 3.0's stringified-list ``base_score`` to a scalar
    before SHAP's XGBTreeModelLoader tries to ``float()`` it.

    XGBoost 3.x serialises ``base_score`` as a stringified one-element list
    (e.g. ``'[9.567467E0]'``); SHAP 0.49.x's ``XGBTreeModelLoader`` calls
    ``float(learner_model_param["base_score"])`` and raises ``ValueError:
    could not convert string to float`` (shap/shap#4184, #4202, #4288). The
    upstream fix (shap/shap#4187) is merged but not yet released, so we
    patch the field inside the decoded UBJ payload before the loader sees
    it. The patch is idempotent.

    Parameters
    ----------
    shap_module
        The imported ``shap`` package. Its private
        ``explainers._tree.XGBTreeModelLoader`` and
        ``explainers._tree.decode_ubjson_buffer`` are the hooks patched.

    The patch is best-effort. If either private hook is missing (a SHAP
    build with a different internal layout) the function returns without
    patching, so explanations for non-XGBoost tree models are not broken by
    this shim.
    """
    import ast  # function-scope import: only this compatibility shim needs it

    explainers_pkg = getattr(shap_module, "explainers", None)
    tree_mod = getattr(explainers_pkg, "_tree", None)
    if tree_mod is None or getattr(tree_mod, "_p2predict_base_score_patched", False):
        return

    loader_cls = getattr(tree_mod, "XGBTreeModelLoader", None)
    original_decode = getattr(tree_mod, "decode_ubjson_buffer", None)
    if loader_cls is None or original_decode is None:
        # Private internals are not where we expect them; nothing to patch.
        return

    original_init = loader_cls.__init__

    def coerce_base_score(jmodel):
        """Rewrite a stringified-list base_score in place; return the payload."""
        try:
            lmp = jmodel["learner"]["learner_model_param"]
            bs = lmp.get("base_score")
            if isinstance(bs, str) and bs.startswith("["):
                val = ast.literal_eval(bs)
                if isinstance(val, (list, tuple)) and val:
                    lmp["base_score"] = str(float(val[0]))
        except (KeyError, ValueError, SyntaxError, TypeError, AttributeError):
            # Deliberately best-effort: an unexpected payload shape is left
            # untouched so the loader can deal with it (or fail) on its own.
            pass
        return jmodel

    def patched_init(self, xgb_model):
        def coercing_decode(fp):
            return coerce_base_score(original_decode(fp))

        # The loader looks the decoder up on the module at call time, so
        # swap it only for the duration of this constructor call.
        tree_mod.decode_ubjson_buffer = coercing_decode
        try:
            original_init(self, xgb_model)
        finally:
            tree_mod.decode_ubjson_buffer = original_decode

    loader_cls.__init__ = patched_init
    tree_mod._p2predict_base_score_patched = True
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _build_explainer(estimator, family: str, background_X_t):
    """Build the SHAP explainer matching ``family``.

    ``"linear"`` returns a ``shap.LinearExplainer`` and requires a non-empty
    transformed background sample, which it uses to estimate E[x_i].
    ``"tree"`` returns a ``shap.TreeExplainer`` using the
    tree-path-dependent algorithm, which needs no background; the XGBoost
    base_score shim is applied first.

    Raises
    ------
    ValueError
        For a linear model without background data, or for any other family.
    """
    import shap  # imported lazily so the rest of P2Predict has no hard
    # dependency on shap unless --explain is actually used.

    if family == "linear":
        missing_background = background_X_t is None or len(background_X_t) == 0
        if missing_background:
            raise ValueError(
                "Linear models require a background sample for SHAP. "
                "Re-train with v0.4 (which persists one) or pass background_X."
            )
        return shap.LinearExplainer(estimator, background_X_t)

    if family != "tree":
        raise ValueError(
            f"No SHAP explainer wired for estimator '{type(estimator).__name__}'."
        )

    _patch_shap_xgboost_base_score(shap)
    return shap.TreeExplainer(
        estimator, feature_perturbation="tree_path_dependent"
    )
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _finalize_explanation(
    baseline: float,
    inner_pred: float,
    contributions: dict[str, float],
    is_log_target: bool,
    inverse_func,
) -> Explanation:
    """Build the Explanation for one row from its per-source contributions.

    ``baseline``, ``inner_pred`` and ``contributions`` are all in the inner
    model's output space. For log-target models the price-space fields are
    derived through ``inverse_func``.
    """
    contribution_total = sum(contributions.values())

    # Local-accuracy residual in *inner-model* output space. A large value
    # points at a mis-extracted preprocessor or a wrong-family explainer.
    # It is never raised here: it is stored on the Explanation so the
    # caller can surface it.
    residual = float(inner_pred - (baseline + contribution_total))

    if is_log_target:
        # Contributions live in the inner model's (log) space. With the
        # log/exp wrap, exp(contribution) is the per-feature multiplicative
        # factor in price space and the product of factors reproduces
        # predicted_price / baseline_price exactly. With another wrap (e.g.
        # log1p/expm1) the factors apply on the pre-shift scale instead,
        # which is flagged through strict_multiplicative.
        baseline_price = float(inverse_func(baseline))
        predicted_price = float(inverse_func(inner_pred))

        multiplicative_factors = {}
        for col, value in contributions.items():
            multiplicative_factors[col] = float(np.exp(value))

        # Approximate dollar attribution: split the price-space delta in
        # proportion to the log-space contributions. Additive in dollars,
        # but *not* strict SHAP.
        delta_price = predicted_price - baseline_price
        if abs(contribution_total) > 1e-12:
            dollar_attribution = {
                col: float(delta_price * value / contribution_total)
                for col, value in contributions.items()
            }
        else:
            dollar_attribution = dict.fromkeys(contributions, 0.0)

        return Explanation(
            baseline=baseline,
            prediction=inner_pred,
            contributions=contributions,
            log_target=True,
            baseline_price=baseline_price,
            predicted_price=predicted_price,
            multiplicative_factors=multiplicative_factors,
            dollar_attribution=dollar_attribution,
            residual=residual,
            strict_multiplicative=inverse_func is np.exp,
        )

    return Explanation(
        baseline=baseline,
        prediction=inner_pred,
        contributions=contributions,
        residual=residual,
    )
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def explain_batch(
    model,
    X: pd.DataFrame,
    background_X: Optional[pd.DataFrame] = None,
) -> list[Explanation]:
    """Compute SHAP explanations for every row of ``X``.

    Builds the explainer *once* and computes all rows' SHAP values in a
    single call. Explainer construction is the expensive part — for tree
    ensembles SHAP parses the entire fitted forest — so this is the path
    to use for more than one row.

    Parameters
    ----------
    model
        A fitted P2Predict pipeline — either a sklearn ``Pipeline`` or a
        ``TransformedTargetRegressor`` wrapping one. The pipeline must have
        ``"preprocessor"`` and ``"model"`` steps.
    X
        DataFrame with the same source columns the pipeline was trained on.
        One Explanation is returned per row.
    background_X
        Optional background sample of raw (pre-preprocessor) feature rows.
        Required for linear models, ignored for tree models.

    Returns
    -------
    list[Explanation]
        One entry per row of ``X``, in row order; empty when ``X`` is empty.

    Raises
    ------
    ValueError
        Propagated from the helpers: no explainer for the estimator family,
        a linear model without background data, or a mismatch between the
        transformed feature names and the SHAP value width.
    """
    n_rows = len(X)
    if n_rows == 0:
        return []

    inner, is_log_target, inverse_func = _unwrap(model)
    preprocessor = inner.named_steps["preprocessor"]
    estimator = inner.named_steps["model"]
    family = _detect_family(estimator)

    X_t = _to_dense_2d(preprocessor.transform(X))

    # Only the linear explainer consumes a background sample. Skipping the
    # transform for other families makes "ignored for tree models" literally
    # true: no wasted work, and an unusable background cannot raise.
    bg_t = None
    if family == "linear" and background_X is not None:
        bg_t = _to_dense_2d(preprocessor.transform(background_X))

    explainer = _build_explainer(estimator, family, bg_t)
    sv = _shap_values(explainer, X_t)

    baseline = _scalar_expected_value(explainer)
    inner_preds = np.asarray(estimator.predict(X_t), dtype=float).ravel()

    groups = _source_column_groups(preprocessor, list(X.columns), sv.shape[1])
    # One column-sum per source feature, vectorised across all rows.
    rolled = {src: sv[:, idxs].sum(axis=1) for src, idxs in groups.items()}

    return [
        _finalize_explanation(
            baseline,
            float(inner_preds[row]),
            {src: float(vals[row]) for src, vals in rolled.items()},
            is_log_target,
            inverse_func,
        )
        for row in range(n_rows)
    ]
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def explain_row(
    model,
    x: pd.DataFrame,
    background_X: Optional[pd.DataFrame] = None,
) -> Explanation:
    """Explain a single prediction.

    Thin wrapper over :func:`explain_batch` for exactly one row. For many
    rows call :func:`explain_batch` directly, which builds the (expensive)
    explainer once instead of once per row.

    Parameters
    ----------
    model
        A fitted P2Predict pipeline — either a sklearn ``Pipeline`` or a
        ``TransformedTargetRegressor`` wrapping one.
    x
        Single-row DataFrame with the same source columns the pipeline was
        trained on.
    background_X
        Optional background sample of raw (pre-preprocessor) feature rows.
        Required for linear models, ignored for tree models.

    Raises
    ------
    ValueError
        If ``x`` does not contain exactly one row.
    """
    if len(x) == 1:
        explanations = explain_batch(model, x, background_X=background_X)
        return explanations[0]
    raise ValueError("explain_row expects a single-row DataFrame.")
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def top_drivers(
    explanation: Explanation, n: int = 3, signed: bool = True
) -> list[tuple[str, float]]:
    """Return the ``n`` source features with the largest absolute contribution.

    Features are always ranked by ``|contribution|``. What is reported next
    to each name depends on the explanation:

    * log-target with multiplicative factors available: the feature's
      multiplicative factor (``signed`` has no effect here);
    * otherwise: the contribution itself, or its absolute value when
      ``signed`` is false.
    """
    ranked = sorted(
        explanation.contributions.items(),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    leaders = ranked[:n]

    if explanation.log_target and explanation.multiplicative_factors is not None:
        return [
            (name, explanation.multiplicative_factors[name]) for name, _ in leaders
        ]
    if signed:
        return leaders
    return [(name, abs(value)) for name, value in leaders]
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sklearn.ensemble import RandomForestRegressor
|
|
4
|
+
from sklearn.pipeline import Pipeline
|
|
5
|
+
|
|
6
|
+
from p2predict.preprocessing import build_preprocessor
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def find_high_variation_features(df):
    """List the columns of ``df`` whose values vary unusually widely.

    * Numeric columns (int64/float64/int32/float32) are flagged when their
      coefficient of variation, ``std / |mean|``, exceeds 1. Columns whose
      ``|mean|`` is not above 1e-9 are skipped to keep the ratio well-defined.
    * object/bool/category columns are flagged when more than 90% of their
      rows hold distinct values.

    Flagged numeric columns come first, then flagged categorical ones.
    """
    flagged = []

    numeric = df.select_dtypes(include=["int64", "float64", "int32", "float32"])
    if not numeric.empty:
        abs_means = numeric.mean().abs()
        # Use |mean| and guard against ~0 to keep CV well-defined.
        usable_means = abs_means.where(abs_means > 1e-9)
        cv = (numeric.std() / usable_means).dropna()
        flagged += [name for name, value in cv.items() if value > 1]

    categorical = df.select_dtypes(include=["object", "bool", "category"])
    if not categorical.empty:
        unique_ratio = categorical.nunique() / max(len(categorical), 1)
        flagged += [name for name, ratio in unique_ratio.items() if ratio > 0.9]

    return flagged
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def find_no_variation_features(df):
    """List the columns of ``df`` that hold at most one distinct value.

    Missing values count as a value (``dropna=False``), so an all-missing
    column is reported as well.
    """
    distinct_counts = df.nunique(dropna=False)
    return [name for name, count in distinct_counts.items() if count <= 1]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def find_leaky_features(data, target_column, threshold=0.97):
    """Flag features that look like an alternate form of the target (leakage).

    Every non-target column is coerced to numeric (unparseable entries
    become missing) and correlated with the target over the rows where both
    are present. A column is flagged when the absolute Pearson correlation
    is at least ``threshold``. Such a feature is almost certainly **target
    leakage** — a near-duplicate of the answer (e.g. the same price at a
    different quantity break) rather than a genuine spec.

    Columns with fewer than three usable rows, or where either side is
    constant over those rows, are skipped. The target column itself is
    never returned.

    Returns a list of ``{"feature", "correlation", "reason"}`` dicts sorted
    by absolute correlation, descending. Empty when nothing looks leaky or
    when ``target_column`` is not a column of ``data``.
    """
    if target_column not in data.columns:
        return []

    target = pd.to_numeric(data[target_column], errors="coerce")
    findings = []

    for col in data.columns:
        if col == target_column:
            continue

        candidate = pd.to_numeric(data[col], errors="coerce")
        paired = pd.concat([candidate, target], axis=1).dropna()
        xs = paired.iloc[:, 0]
        ys = paired.iloc[:, 1]
        if len(paired) < 3 or xs.nunique() < 2 or ys.nunique() < 2:
            continue

        corr = xs.corr(ys)
        if corr is None or pd.isna(corr) or not abs(corr) >= threshold:
            continue

        reason = (
            f"correlates {corr:.2f} with the target '{target_column}' — "
            "almost certainly an alternate form of the value being "
            "predicted (e.g. a different quantity break), not a spec. "
            "Training on it makes the model look near-perfect but useless "
            "on real parts."
        )
        findings.append({
            "feature": col,
            "correlation": round(float(corr), 4),
            "reason": reason,
        })

    return sorted(findings, key=lambda item: abs(item["correlation"]), reverse=True)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _column_types(X):
    """Split the columns of ``X`` into numeric and categorical groups by dtype.

    Returns ``(numerical_cols, categorical_cols)``: the column labels whose
    dtype is int64/float64/int32/float32, and those whose dtype is
    object/bool/category, respectively.
    """
    numeric_kinds = ["int64", "float64", "int32", "float32"]
    categorical_kinds = ["object", "bool", "category"]
    return (
        X.select_dtypes(include=numeric_kinds).columns,
        X.select_dtypes(include=categorical_kinds).columns,
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def get_most_predictable_features(data, target_column, output_only_headers=False):
    """Rank the source features of ``data`` by random-forest importance.

    Fits a preprocessing + ``RandomForestRegressor`` pipeline on every
    column except ``target_column``, then sums the forest's feature
    importances back onto the source column each transformed feature came
    from (longest matching source-column name wins).

    Returns the ``"Feature"`` column only (a Series, most important first)
    when ``output_only_headers`` is true. Otherwise returns a DataFrame with
    ``"Feature"`` and ``"Importance (%)"`` columns, where the importances
    are rescaled to sum to 100 and rounded to two decimals whenever their
    total is positive.
    """
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    numerical_cols, categorical_cols = _column_types(X)

    forest = RandomForestRegressor(random_state=0, n_jobs=-1)
    pipeline = Pipeline(
        steps=[
            (
                "preprocessor",
                build_preprocessor(numerical_cols, categorical_cols, model_family="tree"),
            ),
            ("model", forest),
        ]
    )
    pipeline.fit(X, y)

    # With OrdinalEncoder, each source column maps to a single transformed
    # column, so the roll-up below normally sums one value per source.
    raw_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
    importances = np.asarray(forest.feature_importances_, dtype=float)

    candidates = list(X.columns)
    totals = {}
    for name, weight in zip(raw_names, importances):
        rest = name.split("__", 1)[1] if "__" in name else name

        owner = None
        for col in candidates:
            if rest != col and not rest.startswith(f"{col}_"):
                continue
            if owner is None or len(col) > len(owner):
                owner = col
        if owner is None:
            owner = rest

        totals[owner] = totals.get(owner, 0.0) + float(weight)

    ranked = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
    feature_importances = pd.DataFrame(ranked, columns=["Feature", "Importance"])

    if output_only_headers:
        return feature_importances["Feature"]

    total = feature_importances["Importance"].sum()
    if total > 0:
        feature_importances["Importance"] = (
            feature_importances["Importance"] / total * 100
        ).round(2)
    return feature_importances.rename(columns={"Importance": "Importance (%)"})
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# Kept as a thin alias for backwards compatibility with any external callers.
|
|
136
|
+
def get_most_predictable_features_RFE(data, target_column, n_features_to_select=10):
    """Return the names of the top ``n_features_to_select`` features as a list.

    Despite the name, no recursive feature elimination runs here: the
    ranking comes from :func:`get_most_predictable_features` and is simply
    truncated.
    """
    ranked_names = get_most_predictable_features(
        data, target_column, output_only_headers=True
    )
    return ranked_names.head(n_features_to_select).tolist()
|