p2predict 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ from rich.console import Console
2
+ from rich.table import Table
3
+ console = Console()
4
+
5
def print_logo():
    """Print the P2Predict ASCII-art banner, one row at a time, in bold blue.

    Output goes to the module-level ``console``.
    """
    banner_rows = (
        " ____ ____ ____ _ _ _ ",
        "| _ \\ |___ \\ | _ \\ _ __ ___ __| |(_) ___ | |_ ",
        "| |_) | __) || |_) || '__| / _ \\ / _` || | / __|| __|",
        "| __/ / __/ | __/ | | | __/| (_| || || (__ | |_ ",
        "|_| |_____||_| |_| \\___| \\__,_||_| \\___| \\__|",
    )
    for row in banner_rows:
        console.print(row, style='bold blue')
11
+
12
def plot_importances(feature_importances, feature_names):
    """Render feature importances as a two-column table on the shared console.

    Parameters
    ----------
    feature_importances
        Indexable sequence of fractional importances; each value is
        multiplied by 100 and rounded to two decimals for display.
    feature_names
        Indexable sequence of names, read at the same positions as
        ``feature_importances``.
    """
    importance_table = Table(show_header=True, header_style="bold blue", highlight=True)
    # Long feature names are folded inside a fixed 50-character column.
    importance_table.add_column("Feature", overflow="fold", width=50)
    importance_table.add_column("Importance (%)", justify="right")

    for position in range(len(feature_importances)):
        percent = round(feature_importances[position] * 100, 2)
        importance_table.add_row(feature_names[position], str(percent) + "%")

    console.print(importance_table)
21
+
22
def print_feature_weights(sorted_feature_importances):
    """Print one line per ``(feature, weight)`` pair, weight rounded to 4 digits.

    ``sorted_feature_importances`` is iterated as-is; any ordering is the
    caller's responsibility.
    """
    for name, weight in sorted_feature_importances:
        rounded_weight = round(weight, ndigits=4)
        console.print(f"Feature: {name}, Model Weight: {rounded_weight}")
25
+
26
def output_features(data):
    """Print a table listing every column of ``data`` with a readable type label.

    ``object`` columns are labelled ``text``, ``int64`` columns
    ``numerical: integer`` and ``float64`` columns ``numerical: float``.
    Any other dtype (bool, category, datetime, int32, ...) is shown under its
    own dtype name.

    Parameters
    ----------
    data
        Object exposing ``dtypes.items()`` yielding ``(column, dtype)`` pairs
        (a pandas DataFrame in practice).
    """
    friendly_labels = {
        'object': 'text',
        'int64': 'numerical: integer',
        'float64': 'numerical: float',
    }

    table = Table(show_header=True, header_style="bold blue", highlight=True)
    table.add_column("Feature")
    table.add_column("Type")

    for col, dtype in data.dtypes.items():
        # Table cells must be strings/renderables. Previously an unmapped
        # dtype object was handed to add_row unchanged, which rich rejects.
        dtype_name = str(dtype)
        table.add_row(str(col), friendly_labels.get(dtype_name, dtype_name))
    console.print(table)
40
+
41
def print_feature_stats(data):
    """Print min/max/mean/median/std/skewness/kurtosis for each column of ``data``.

    Mean, median, standard deviation, skewness and kurtosis are rounded to
    four digits. Columns for which these statistics cannot be computed
    (for example free-text columns) are skipped instead of aborting the whole
    table.

    Parameters
    ----------
    data
        DataFrame-like object; each ``data[col]`` must provide ``min``,
        ``max``, ``mean``, ``median``, ``std``, ``skew`` and ``kurt``.
    """
    table = Table(show_header=True, header_style="bold blue", highlight=True)
    for heading in (
        "Feature",
        "Min",
        "Max",
        "Mean",
        "Median",
        "Standard Deviation",
        "Skewness",
        "Kurtosis",
    ):
        table.add_column(heading)

    for col in data.columns:
        series = data[col]
        try:
            stats = (
                series.min(),
                series.max(),
                round(series.mean(), ndigits=4),
                round(series.median(), ndigits=4),
                round(series.std(), ndigits=4),
                round(series.skew(), ndigits=4),
                round(series.kurt(), ndigits=4),
            )
        except (TypeError, ValueError):
            # Summary statistics are undefined for this column (non-numeric
            # data); leave it out rather than crash the report.
            continue

        table.add_row(str(col), *(str(value) for value in stats))

    # Use the shared module-level console, like the other printers here.
    console.print(table)
p2predict/explain.py ADDED
@@ -0,0 +1,464 @@
1
+ """SHAP-based per-prediction explanations for P2Predict models.
2
+
3
+ What this module computes
4
+ -------------------------
5
+ For a fitted P2Predict model and a single input row x, return the additive
6
+ decomposition
7
+
8
+ f(x) = phi_0 + sum_i phi_i
9
+
10
+ where phi_0 is the model's baseline (its expected value over a background
11
+ population) and phi_i is feature i's Shapley value. The Shapley value is the
12
+ unique attribution satisfying efficiency (local accuracy), missingness,
13
+ symmetry, and consistency — that uniqueness is what makes the per-feature
14
+ numbers defensible in a design-review meeting rather than yet another
15
+ heuristic importance score.
16
+
17
+ Which algorithm we use, and why
18
+ -------------------------------
19
+ We pick the explainer that is *exact* for the model family and runs in
20
+ polynomial time. We do not fall back to KernelExplainer — it is slow and
21
+ Monte-Carlo approximate, and we never need it for the three model families
22
+ this project supports.
23
+
24
+ Linear (Ridge, Lasso) -> shap.LinearExplainer
25
+ Closed form: phi_i = beta_i * (x_i - E[x_i]). Requires a background
26
+ sample only to estimate E[x_i]; cost is O(F).
27
+ Trees (RandomForest, -> shap.TreeExplainer with feature_perturbation=
28
+ XGBoost) "tree_path_dependent" (Lundberg 2018).
29
+ Exact Shapley values in O(T L D^2), no background sample required —
30
+ the conditional expectations are estimated from the trees' own node
31
+ counts.
32
+
33
+ Log-target wrap (TransformedTargetRegressor with log / exp; log1p / expm1 in older models)
34
+ ------------------------------------------------------------------------------------------
35
+ The inner model predicts log(price). SHAP values on the inner model live in
36
+ log space and satisfy local accuracy *in log space*:
37
+
38
+ log(pred) - log(base) = sum_i phi_i_log
39
+
40
+ Exponentiating turns the sum into a product:
41
+
42
+ pred / base = prod_i exp(phi_i_log)
43
+
44
+ So in price space each feature becomes a *multiplicative factor*
45
+ exp(phi_i_log) -- e.g. "Region=EU multiplies the predicted price by 1.18
46
+ (+18%)". This is the axiomatically clean reading.
47
+
48
+ For procurement readability we additionally surface an "approximate dollar
49
+ attribution" obtained by proportionally rescaling the log-space contributions
50
+ to the price-space delta (pred - base). This *forces* additivity in dollars
51
+ at the cost of breaking the SHAP axioms — it is not strict SHAP, and we label
52
+ it that way in the report and in the CLI.
53
+
54
+ Source-feature roll-up
55
+ ----------------------
56
+ SHAP gives one value per *transformed* feature. We sum across the columns
57
+ that came from the same source column (one-hot dummies for linear models;
58
+ ordinal-encoded categoricals for tree models, where this is a no-op).
59
+ Summing one-hot dummies' Shapley values to attribute to the source column is
60
+ standard practice and is sound under SHAP's additivity property when the
61
+ dummies are mutually exclusive (exactly one is 1 at a time).
62
+ """
63
+
64
+ from __future__ import annotations
65
+
66
+ from dataclasses import dataclass, field
67
+ from typing import Optional
68
+
69
+ import numpy as np
70
+ import pandas as pd
71
+ from sklearn.compose import TransformedTargetRegressor
72
+
73
+ # Local-accuracy sanity-check tolerance. Floating-point + SHAP internals can
74
+ # leave a tiny residual; anything bigger is a sign something is wrong with
75
+ # the explainer choice or the transformed-matrix shape.
76
+ _LOCAL_ACCURACY_TOL = 1e-4
77
+
78
+
79
+ def _to_dense_2d(X) -> np.ndarray:
80
+ """Coerce a sklearn ColumnTransformer output into a dense 2-d ndarray.
81
+
82
+ ColumnTransformer with OneHotEncoder (the linear-model path) returns a
83
+ scipy sparse matrix. ``np.asarray`` on a sparse matrix wraps it in a
84
+ 0-d object array, which then breaks every downstream ``len()`` and
85
+ indexing call inside SHAP. We densify here so both LinearExplainer and
86
+ the local-accuracy ``estimator.predict(x_t)`` get an actual 2-d array.
87
+ """
88
+ if hasattr(X, "toarray"):
89
+ return X.toarray()
90
+ return np.asarray(X)
91
+
92
+
93
@dataclass
class Explanation:
    """Attribution result for one explained row.

    ``baseline``, ``prediction`` and ``contributions`` live in the inner
    model's output space: price for a plain model, log(price) for a
    log-target model. They are meant to satisfy local accuracy,
    ``baseline + sum(contributions.values()) ~= prediction``, up to
    ``_LOCAL_ACCURACY_TOL``; the actual gap is stored in ``residual``.

    The price-space fields are filled only for log-target models.
    ``multiplicative_factors`` is the attribution form that satisfies the
    SHAP axioms in price space; ``dollar_attribution`` is a proportional
    rescaling that is additive in price but is not strict SHAP.
    """

    # Inner-model output space.
    baseline: float
    prediction: float
    contributions: dict[str, float]
    # Price-space view (log-target models only).
    log_target: bool = False
    baseline_price: Optional[float] = None
    predicted_price: Optional[float] = None
    multiplicative_factors: Optional[dict[str, float]] = None
    dollar_attribution: Optional[dict[str, float]] = None
    # Local-accuracy residual, kept for diagnostics.
    residual: float = 0.0
    # True only when product(multiplicative_factors) equals
    # predicted_price / baseline_price strictly. That holds for the log/exp
    # wrap; for an older log1p/expm1 wrap the factors apply to (1 + price).
    strict_multiplicative: bool = False
122
+
123
+
124
def _unwrap(model):
    """Split a model into ``(inner_pipeline, is_log_target, inverse_func)``.

    A ``TransformedTargetRegressor`` yields its fitted ``regressor_``,
    ``True``, and its ``inverse_func`` attribute (falling back to ``np.exp``
    when that attribute is missing or falsy). Anything else is returned
    unchanged with ``False`` and ``None``.

    The inverse is read off the wrapper instead of being hard-coded so that
    models trained with a different forward/inverse pair (log/exp versus an
    older log1p/expm1) are inverted with the function they were trained with.
    """
    if not isinstance(model, TransformedTargetRegressor):
        return model, False, None

    inverse = getattr(model, "inverse_func", None) or np.exp
    return model.regressor_, True, inverse
139
+
140
+
141
+ def _detect_family(estimator) -> str:
142
+ name = type(estimator).__name__.lower()
143
+ if any(t in name for t in ("ridge", "lasso", "linear", "elasticnet")):
144
+ return "linear"
145
+ if any(t in name for t in ("forest", "xgb", "gradientboost", "boost", "tree")):
146
+ return "tree"
147
+ return "unknown"
148
+
149
+
150
+ def _source_column_groups(
151
+ preprocessor, source_cols: list[str], n_values: int
152
+ ) -> dict[str, list[int]]:
153
+ """Map each source column to the transformed-feature indices it produced.
154
+
155
+ Uses the longest-source-column-prefix match (the same logic used by
156
+ extract_feature_importances), so source columns whose names share a
157
+ prefix — e.g. 'weight' and 'weight_extra' — are kept separate rather
158
+ than collapsed. Computed once per explain call so the per-row rollup
159
+ is a plain column-sum.
160
+ """
161
+ raw_names = list(preprocessor.get_feature_names_out())
162
+ if len(raw_names) != n_values:
163
+ raise ValueError(
164
+ f"Transformed-feature/SHAP-value length mismatch: "
165
+ f"{len(raw_names)} names vs {n_values} values."
166
+ )
167
+
168
+ groups: dict[str, list[int]] = {col: [] for col in source_cols}
169
+ for i, raw_name in enumerate(raw_names):
170
+ rest = raw_name.split("__", 1)[1] if "__" in raw_name else raw_name
171
+ match = None
172
+ for col in source_cols:
173
+ if rest == col or rest.startswith(f"{col}_"):
174
+ if match is None or len(col) > len(match):
175
+ match = col
176
+ if match is None:
177
+ match = rest
178
+ groups.setdefault(match, [])
179
+ groups[match].append(i)
180
+ return groups
181
+
182
+
183
+ def _scalar_expected_value(explainer) -> float:
184
+ """SHAP returns expected_value as either a scalar or a 1-element array
185
+ depending on the model and version. Normalise to a Python float."""
186
+ ev = explainer.expected_value
187
+ if isinstance(ev, (list, tuple, np.ndarray)):
188
+ ev = np.atleast_1d(ev)
189
+ if ev.size != 1:
190
+ # Multi-output models are not in our scope (regression only).
191
+ raise ValueError(
192
+ "SHAP expected_value has multiple outputs; only single-output "
193
+ "regression is supported."
194
+ )
195
+ return float(ev[0])
196
+ return float(ev)
197
+
198
+
199
+ def _shap_values(explainer, X_t):
200
+ """Get a (n_samples, n_features) SHAP value matrix regardless of the
201
+ library's version-dependent return shape."""
202
+ sv = explainer.shap_values(X_t)
203
+ if isinstance(sv, list):
204
+ # Classification returns a list of per-class arrays; regression
205
+ # returns either a 2-D array or a 1-D row. We only do regression.
206
+ if len(sv) != 1:
207
+ raise ValueError("Unexpected multi-output SHAP result.")
208
+ sv = sv[0]
209
+ sv = np.asarray(sv)
210
+ if sv.ndim == 1:
211
+ sv = sv.reshape(1, -1)
212
+ return sv
213
+
214
+
215
+ def _patch_shap_xgboost_base_score(shap_module) -> None:
216
+ """Coerce XGBoost >= 3.0's stringified-list ``base_score`` to a scalar
217
+ before SHAP's XGBTreeModelLoader tries to ``float()`` it.
218
+
219
+ XGBoost 3.x serialises ``base_score`` as a stringified one-element list
220
+ (e.g. ``'[9.567467E0]'``); SHAP 0.49.x's ``XGBTreeModelLoader`` calls
221
+ ``float(learner_model_param["base_score"])`` and raises ``ValueError:
222
+ could not convert string to float`` (shap/shap#4184, #4202, #4288). The
223
+ upstream fix (shap/shap#4187) is merged but not yet released, so we
224
+ patch the field inside the decoded UBJ payload before the loader sees
225
+ it. The patch is idempotent.
226
+ """
227
+ tree_mod = shap_module.explainers._tree
228
+ if getattr(tree_mod, "_p2predict_base_score_patched", False):
229
+ return
230
+
231
+ original_init = tree_mod.XGBTreeModelLoader.__init__
232
+ original_decode = tree_mod.decode_ubjson_buffer
233
+
234
+ def patched_init(self, xgb_model):
235
+ def coercing_decode(fp):
236
+ jmodel = original_decode(fp)
237
+ try:
238
+ lmp = jmodel["learner"]["learner_model_param"]
239
+ bs = lmp.get("base_score")
240
+ if isinstance(bs, str) and bs.startswith("["):
241
+ import ast
242
+ val = ast.literal_eval(bs)
243
+ if isinstance(val, (list, tuple)) and val:
244
+ lmp["base_score"] = str(float(val[0]))
245
+ except (KeyError, ValueError, SyntaxError):
246
+ pass
247
+ return jmodel
248
+
249
+ tree_mod.decode_ubjson_buffer = coercing_decode
250
+ try:
251
+ original_init(self, xgb_model)
252
+ finally:
253
+ tree_mod.decode_ubjson_buffer = original_decode
254
+
255
+ tree_mod.XGBTreeModelLoader.__init__ = patched_init
256
+ tree_mod._p2predict_base_score_patched = True
257
+
258
+
259
def _build_explainer(estimator, family: str, background_X_t):
    """Create the SHAP explainer matching ``family``.

    ``"tree"`` builds a ``shap.TreeExplainer`` with
    ``feature_perturbation="tree_path_dependent"`` (no background sample is
    passed), after applying the XGBoost ``base_score`` compatibility patch.
    ``"linear"`` builds a ``shap.LinearExplainer`` and needs a non-empty
    background matrix.

    Raises
    ------
    ValueError
        For a linear model without background data, or for any other family.
    """
    # Imported lazily so the rest of P2Predict has no hard dependency on
    # shap unless an explanation is actually requested.
    import shap

    if family == "tree":
        _patch_shap_xgboost_base_score(shap)
        return shap.TreeExplainer(
            estimator, feature_perturbation="tree_path_dependent"
        )

    if family != "linear":
        raise ValueError(
            f"No SHAP explainer wired for estimator '{type(estimator).__name__}'."
        )

    if background_X_t is None or len(background_X_t) == 0:
        raise ValueError(
            "Linear models require a background sample for SHAP. "
            "Re-train with v0.4 (which persists one) or pass background_X."
        )
    return shap.LinearExplainer(estimator, background_X_t)
284
+
285
+
286
def _finalize_explanation(
    baseline: float,
    inner_pred: float,
    contributions: dict[str, float],
    is_log_target: bool,
    inverse_func,
) -> Explanation:
    """Assemble one row's Explanation from its rolled-up contributions.

    Parameters
    ----------
    baseline
        Explainer expected value, in inner-model output space.
    inner_pred
        Inner model prediction for the row, same space as ``baseline``.
    contributions
        Per-source-column SHAP contributions, same space again.
    is_log_target
        Whether the model was wrapped with a target transform; if so the
        price-space fields are filled in.
    inverse_func
        Inverse target transform; only used when ``is_log_target`` is true.

    Returns
    -------
    Explanation
        Always carries ``residual``, the local-accuracy gap
        ``inner_pred - (baseline + sum(contributions))``. Nothing is raised
        or logged when the gap is large: callers decide how to surface it.
    """
    # Local-accuracy residual in *inner-model* output space. A large value
    # points at e.g. a mis-extracted preprocessor or a wrong-family explainer.
    contribution_total = sum(contributions.values())
    residual = float(inner_pred - (baseline + contribution_total))

    if not is_log_target:
        return Explanation(
            baseline=baseline,
            prediction=inner_pred,
            contributions=contributions,
            residual=residual,
        )

    # Log-target post-processing.
    #
    # With a log/exp wrap, exp(contribution) is the per-feature
    # multiplicative factor in price space and the product of the factors
    # reproduces predicted_price / baseline_price exactly. For other wraps
    # (e.g. log1p/expm1) the factors apply on the shifted scale, (1 + price)
    # for log1p; the exp() factors are kept and the caller is told through
    # strict_multiplicative.
    baseline_price = float(inverse_func(baseline))
    predicted_price = float(inverse_func(inner_pred))
    multiplicative_factors = {
        col: float(np.exp(v)) for col, v in contributions.items()
    }
    strict_multiplicative = inverse_func is np.exp

    # Approximate dollar attribution: share the price-space delta out in
    # proportion to each log-space contribution. Additive in price, but not
    # strict SHAP.
    delta_price = predicted_price - baseline_price
    if abs(contribution_total) > 1e-12:
        dollar_attribution = {
            col: float(delta_price * v / contribution_total)
            for col, v in contributions.items()
        }
    else:
        # Contributions cancel out: there is no meaningful proportional split.
        dollar_attribution = {col: 0.0 for col in contributions}

    return Explanation(
        baseline=baseline,
        prediction=inner_pred,
        contributions=contributions,
        log_target=True,
        baseline_price=baseline_price,
        predicted_price=predicted_price,
        multiplicative_factors=multiplicative_factors,
        dollar_attribution=dollar_attribution,
        residual=residual,
        strict_multiplicative=strict_multiplicative,
    )
355
+
356
+
357
def explain_batch(
    model,
    X: pd.DataFrame,
    background_X: Optional[pd.DataFrame] = None,
) -> list[Explanation]:
    """Compute SHAP explanations for every row of ``X``.

    The explainer is built once and all rows' SHAP values are computed in a
    single call, so this is the path to use for more than one row.

    Parameters
    ----------
    model
        A fitted P2Predict pipeline: either a sklearn ``Pipeline`` with
        ``"preprocessor"`` and ``"model"`` steps, or a
        ``TransformedTargetRegressor`` wrapping one.
    X
        DataFrame with the same source columns the pipeline was trained on.
        One Explanation is returned per row; an empty frame yields ``[]``.
    background_X
        Optional background sample of raw (pre-preprocessor) feature rows.
        Required for linear models. For every other family it is ignored
        entirely and is not even transformed.

    Returns
    -------
    list[Explanation]
        One entry per row of ``X``, in row order.

    Raises
    ------
    ValueError
        Propagated from the helpers: unsupported estimator family, missing
        background for a linear model, multi-output SHAP results, or a
        mismatch between transformed feature names and SHAP values.
    """
    if len(X) == 0:
        return []

    inner, is_log_target, inverse_func = _unwrap(model)
    preprocessor = inner.named_steps["preprocessor"]
    estimator = inner.named_steps["model"]
    family = _detect_family(estimator)

    X_t = _to_dense_2d(preprocessor.transform(X))

    # Only the linear explainer consumes a background sample; skip the
    # transform + densify cost for every other family.
    bg_t = None
    if family == "linear" and background_X is not None:
        bg_t = _to_dense_2d(preprocessor.transform(background_X))

    explainer = _build_explainer(estimator, family, bg_t)
    sv = _shap_values(explainer, X_t)

    baseline = _scalar_expected_value(explainer)
    inner_preds = np.asarray(estimator.predict(X_t), dtype=float).ravel()

    source_cols = list(X.columns)
    groups = _source_column_groups(preprocessor, source_cols, sv.shape[1])
    # One column-sum per source feature, vectorised across all rows.
    rolled = {src: sv[:, idxs].sum(axis=1) for src, idxs in groups.items()}

    return [
        _finalize_explanation(
            baseline,
            float(inner_preds[i]),
            {src: float(vals[i]) for src, vals in rolled.items()},
            is_log_target,
            inverse_func,
        )
        for i in range(len(X))
    ]
418
+
419
+
420
def explain_row(
    model,
    x: pd.DataFrame,
    background_X: Optional[pd.DataFrame] = None,
) -> Explanation:
    """Explain exactly one row by delegating to :func:`explain_batch`.

    Parameters
    ----------
    model
        A fitted P2Predict pipeline: either a sklearn ``Pipeline`` or a
        ``TransformedTargetRegressor`` wrapping one.
    x
        Single-row DataFrame with the source columns the pipeline was
        trained on. For many rows call :func:`explain_batch` directly, which
        builds the explainer once for the whole frame.
    background_X
        Optional background sample of raw (pre-preprocessor) feature rows;
        forwarded unchanged to :func:`explain_batch`.

    Raises
    ------
    ValueError
        If ``x`` does not contain exactly one row.
    """
    row_count = len(x)
    if row_count != 1:
        raise ValueError("explain_row expects a single-row DataFrame.")

    explanations = explain_batch(model, x, background_X=background_X)
    return explanations[0]
443
+
444
+
445
def top_drivers(
    explanation: Explanation, n: int = 3, signed: bool = True
) -> list[tuple[str, float]]:
    """Return the ``n`` source features with the largest ``|contribution|``.

    Ranking always uses the absolute contribution in inner-model space.
    The reported value depends on the explanation:

    * log-target with multiplicative factors available: the feature's
      multiplicative factor (``signed`` has no effect here);
    * otherwise: the contribution itself, or its absolute value when
      ``signed`` is false.
    """
    ranked = sorted(
        explanation.contributions.items(),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    leaders = ranked[:n]

    factors = explanation.multiplicative_factors
    if explanation.log_target and factors is not None:
        return [(name, factors[name]) for name, _ in leaders]
    if signed:
        return leaders
    return [(name, abs(value)) for name, value in leaders]
@@ -0,0 +1,139 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.ensemble import RandomForestRegressor
4
+ from sklearn.pipeline import Pipeline
5
+
6
+ from p2predict.preprocessing import build_preprocessor
7
+
8
+
9
def find_high_variation_features(df):
    """List columns of ``df`` whose values vary unusually much.

    * Numeric columns (int64/float64/int32/float32) are flagged when their
      coefficient of variation ``std / |mean|`` exceeds 1. Columns whose
      ``|mean|`` is at most 1e-9 are left out, since the ratio is undefined.
    * object/bool/category columns are flagged when more than 90% of their
      rows hold distinct values.

    Numeric hits come first, then categorical hits, each in column order.
    """
    flagged = []

    numeric_part = df.select_dtypes(include=["int64", "float64", "int32", "float32"])
    if not numeric_part.empty:
        column_means = numeric_part.mean()
        column_stds = numeric_part.std()
        # |mean| with near-zero entries masked to NaN keeps the ratio defined.
        abs_means = column_means.abs()
        usable_means = abs_means.where(column_means.abs() > 1e-9)
        variation = (column_stds / usable_means).dropna()
        flagged.extend(variation[variation > 1].index.tolist())

    categorical_part = df.select_dtypes(include=["object", "bool", "category"])
    if not categorical_part.empty:
        distinct_share = categorical_part.apply(
            lambda column: column.nunique() / max(len(column), 1)
        )
        flagged.extend(distinct_share[distinct_share > 0.9].index.tolist())

    return flagged
27
+
28
+
29
def find_no_variation_features(df):
    """Return the columns of ``df`` holding at most one distinct value.

    Missing values count as a value of their own (``dropna=False``), so an
    all-NaN column is reported as constant.
    """
    distinct_counts = df.nunique(dropna=False)
    is_constant = distinct_counts <= 1
    return distinct_counts.loc[is_constant].index.tolist()
32
+
33
+
34
def find_leaky_features(data, target_column, threshold=0.97):
    """Flag columns that look like another form of the target (target leakage).

    Every column except the target is coerced to numeric (unparseable
    entries become NaN) and paired with the coerced target; rows with a NaN
    on either side are dropped. A column is reported when its absolute
    Pearson correlation with the target is at least ``threshold``. Pairs
    with fewer than three rows, or where either side has fewer than two
    distinct values, are not evaluated.

    A near-perfect correlation usually means the column is a near-duplicate
    of the answer (the same price at a different quantity break, a rounded
    copy, ...) and not a genuine spec, so training on it inflates metrics
    without producing a usable model.

    Returns
    -------
    list[dict]
        ``{"feature", "correlation", "reason"}`` dicts ordered by absolute
        (rounded) correlation, strongest first. Empty when nothing is
        flagged or when ``target_column`` is not a column of ``data``.
    """
    if target_column not in data.columns:
        return []

    target_values = pd.to_numeric(data[target_column], errors="coerce")
    flagged = []
    for name in data.columns:
        if name == target_column:
            continue

        candidate = pd.to_numeric(data[name], errors="coerce")
        paired = pd.concat([candidate, target_values], axis=1).dropna()
        feature_side = paired.iloc[:, 0]
        target_side = paired.iloc[:, 1]
        # Too little data, or a constant side: correlation is meaningless.
        if len(paired) < 3 or feature_side.nunique() < 2 or target_side.nunique() < 2:
            continue

        corr = feature_side.corr(target_side)
        if corr is None or pd.isna(corr) or not (abs(corr) >= threshold):
            continue

        explanation = (
            f"correlates {corr:.2f} with the target '{target_column}' — "
            "almost certainly an alternate form of the value being "
            "predicted (e.g. a different quantity break), not a spec. "
            "Training on it makes the model look near-perfect but useless "
            "on real parts."
        )
        flagged.append({
            "feature": name,
            "correlation": round(float(corr), 4),
            "reason": explanation,
        })

    return sorted(flagged, key=lambda entry: abs(entry["correlation"]), reverse=True)
80
+
81
+
82
+ def _column_types(X):
83
+ numerical_cols = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns
84
+ categorical_cols = X.select_dtypes(include=["object", "bool", "category"]).columns
85
+ return numerical_cols, categorical_cols
86
+
87
+
88
def get_most_predictable_features(data, target_column, output_only_headers=False):
    """Rank source columns by random-forest importance for ``target_column``.

    A ``RandomForestRegressor`` (``random_state=0``) is fitted behind the
    tree-family preprocessor, and each transformed feature's importance is
    credited to the longest source column whose name it equals or extends
    with ``_``.

    Parameters
    ----------
    data
        DataFrame holding the features and the target.
    target_column
        Name of the column to predict; excluded from the features.
    output_only_headers
        When true, return only the ranked ``"Feature"`` column.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The ranked feature names, or a frame with ``"Feature"`` and
        ``"Importance (%)"``. Importances are rescaled to percentages
        (two decimals) when their total is positive.
    """
    features = data.drop(target_column, axis=1)
    target = data[target_column]

    numerical_cols, categorical_cols = _column_types(features)

    preprocessor = build_preprocessor(numerical_cols, categorical_cols, model_family="tree")
    model = RandomForestRegressor(random_state=0, n_jobs=-1)
    pipeline = Pipeline(
        steps=[("preprocessor", preprocessor), ("model", model)]
    )
    pipeline.fit(features, target)

    transformed_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
    importances = np.asarray(model.feature_importances_, dtype=float)

    source_cols = list(features.columns)
    totals = {}
    for transformed_name, importance in zip(transformed_names, importances):
        # Strip the "<transformer>__" prefix added by the preprocessor.
        if "__" in transformed_name:
            stem = transformed_name.split("__", 1)[1]
        else:
            stem = transformed_name

        owner = None
        for candidate in source_cols:
            hit = stem == candidate or stem.startswith(f"{candidate}_")
            # Longest matching source column wins.
            if hit and (owner is None or len(candidate) > len(owner)):
                owner = candidate
        if owner is None:
            owner = stem

        totals[owner] = totals.get(owner, 0.0) + float(importance)

    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    feature_importances = pd.DataFrame(ranked, columns=["Feature", "Importance"])

    if output_only_headers:
        return feature_importances["Feature"]

    total = feature_importances["Importance"].sum()
    if total > 0:
        as_percent = feature_importances["Importance"] / total * 100
        feature_importances["Importance"] = as_percent.round(2)
    feature_importances.rename(columns={"Importance": "Importance (%)"}, inplace=True)
    return feature_importances
133
+
134
+
135
+ # Kept as a thin alias for backwards compatibility with any external callers.
136
def get_most_predictable_features_RFE(data, target_column, n_features_to_select=10):
    """Backwards-compatible alias returning the top feature names as a list.

    Delegates to :func:`get_most_predictable_features` with
    ``output_only_headers=True`` and keeps the first
    ``n_features_to_select`` names.
    """
    ranked_names = get_most_predictable_features(
        data, target_column, output_only_headers=True
    )
    return ranked_names.head(n_features_to_select).tolist()