p2predict 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ import numpy as np
2
+ from sklearn.inspection import permutation_importance
3
+ from sklearn.metrics import mean_absolute_error, r2_score
4
+
5
+
6
def evaluate_model(X_test, y_test, model):
    """Score a fitted regressor on held-out data.

    Parameters
    ----------
    X_test : array-like
        Features handed to ``model.predict``.
    y_test : array-like
        True target values for ``X_test``.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    tuple
        ``(mae, r2, p_value, rmse)``. ``p_value`` comes from a one-sample
        t-test of the residuals against zero; it is ``1.0`` when there are
        fewer than two residuals or the residuals have zero sample variance
        (the test is undefined in both cases).
    """
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Residual-mean test (replaces the earlier, incorrect two-sample t-test):
    # under a well-calibrated model the residuals should be centered on
    # zero, so rejecting the null indicates systematic bias.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(predictions)
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        # A single-column frame (n, 1) minus 1-D predictions (n,) would
        # broadcast to an (n, n) matrix and silently corrupt rmse and the
        # t-test. Same element count means the same samples: flatten both.
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()
    residuals = y_true - y_pred
    rmse = float(np.sqrt(np.mean(residuals ** 2)))
    n = len(residuals)
    if n > 1 and residuals.std(ddof=1) > 0:
        # Imported lazily so scipy is only needed when the test can run.
        from scipy import stats

        _, p_value = stats.ttest_1samp(residuals, 0.0)
    else:
        p_value = 1.0

    return mae, r2, p_value, rmse
24
+
25
+
26
def get_column_statistics(data, feature_columns):
    """Return per-column shape statistics.

    For every name in ``feature_columns`` the result maps that name to a
    dict with ``"skewness"`` (``data[name].skew()``) and ``"kurtosis"``
    (``data[name].kurt()``).
    """
    stats_by_column = {}
    for name in feature_columns:
        column = data[name]
        stats_by_column[name] = {
            "skewness": column.skew(),
            "kurtosis": column.kurt(),
        }
    return stats_by_column
31
+
32
+
33
def calculate_feature_importance(X, y, model):
    """Return permutation importances scaled by their sum.

    Runs ``permutation_importance`` with 10 repeats, a fixed random state of
    0 and ``n_jobs=-1``, then divides the mean importances by their total.
    When the total is zero the means are returned undivided (divisor 1.0)
    to avoid a division by zero.
    """
    perm = permutation_importance(
        model, X, y, n_repeats=10, random_state=0, n_jobs=-1
    )
    mean_importances = perm.importances_mean
    denominator = sum(mean_importances)
    if not denominator:
        denominator = 1.0
    return mean_importances / denominator
@@ -0,0 +1,235 @@
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.compose import TransformedTargetRegressor
6
+ from sklearn.pipeline import Pipeline
7
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, TargetEncoder
8
+
9
+ from p2predict.explain import Explanation
10
+ from p2predict.whatif import WhatIfResult, interaction_is_material
11
+
12
+
13
def inner_pipeline(model):
    """Return the fitted inner estimator of a TransformedTargetRegressor.

    Any other object is returned unchanged.
    """
    if isinstance(model, TransformedTargetRegressor):
        return model.regressor_
    return model
16
+
17
+
18
def extract_feature_info(pipeline):
    """Return ``(feature_types, all_categories)`` from a fitted preprocessor.

    Works with OneHotEncoder, OrdinalEncoder, and TargetEncoder, whether the
    encoder is the ``cat`` transformer itself or a step inside a Pipeline.

    Parameters
    ----------
    pipeline : Pipeline
        Fitted pipeline with a ``"preprocessor"`` step exposing
        ``transformers_`` as ``(name, transformer, columns)`` triples.

    Returns
    -------
    tuple[dict[str, str], dict[str, list]]
        ``feature_types`` maps each column of the ``"num"`` / ``"cat"``
        transformers to ``"Numerical"`` / ``"Categorical"``.
        ``all_categories`` maps each categorical column to the category list
        learned by its encoder; it stays empty when no fitted encoder is
        found.
    """
    encoder_types = (OneHotEncoder, OrdinalEncoder, TargetEncoder)
    preprocessor = pipeline.named_steps["preprocessor"]
    feature_types: dict[str, str] = {}
    all_categories: dict[str, list] = {}

    for name, transformer, columns in preprocessor.transformers_:
        if name == "num":
            feature_types.update({col: "Numerical" for col in columns})
        elif name == "cat":
            feature_types.update({col: "Categorical" for col in columns})

            encoder = transformer
            if isinstance(transformer, Pipeline):
                steps = transformer.named_steps
                if "onehot" in steps:
                    encoder = steps["onehot"]
                elif "target" in steps:
                    encoder = steps["target"]
                else:
                    # Fall back to locating the encoder by type, so an
                    # encoder under any other step name (e.g. an
                    # OrdinalEncoder step) is still unwrapped instead of
                    # silently yielding no categories.
                    encoder = next(
                        (
                            step
                            for _, step in transformer.steps
                            if isinstance(step, encoder_types)
                        ),
                        transformer,
                    )

            if isinstance(encoder, encoder_types) and hasattr(encoder, "categories_"):
                # Merge rather than rebind so earlier entries are never lost.
                all_categories.update(
                    {
                        col: cat.tolist()
                        for col, cat in zip(columns, encoder.categories_)
                    }
                )

    return feature_types, all_categories
49
+
50
+
51
def coerce_features(features_df, feature_types):
    """Convert every column typed ``"Numerical"`` to a numeric dtype.

    Columns listed in ``feature_types`` but absent from ``features_df`` are
    ignored. Values that cannot be parsed become NaN. The frame is modified
    in place and the same object is returned.
    """
    numeric_columns = [
        name
        for name, kind in feature_types.items()
        if kind == "Numerical" and name in features_df.columns
    ]
    for name in numeric_columns:
        features_df[name] = pd.to_numeric(features_df[name], errors="coerce")
    return features_df
57
+
58
+
59
def interval_to_dicts(intervals) -> list[dict]:
    """Serialize interval results into JSON-ready dicts.

    Each output dict holds the raw ``low`` / ``prediction`` / ``high`` bounds
    as floats, the interval's ``band``, and two trust fields computed by
    ``p2predict.quality`` from those three numbers: ``reliability`` and a
    plain-language ``say_to_user`` sentence. Consumers should lead with
    ``say_to_user`` rather than make the user infer trust from the bare
    bounds.
    """
    from p2predict.quality import interval_reliability, interval_say_to_user

    def _as_dict(interval) -> dict:
        lower = float(interval.low)
        center = float(interval.prediction)
        upper = float(interval.high)
        return {
            "low": lower,
            "prediction": center,
            "high": upper,
            "band": interval.band,
            "reliability": interval_reliability(lower, center, upper),
            "say_to_user": interval_say_to_user(lower, center, upper),
        }

    return [_as_dict(interval) for interval in intervals]
82
+
83
+
84
def explanation_to_dict(explanation: Explanation) -> dict:
    """Serialize an Explanation into a JSON-ready dict.

    Two views of the same attribution are emitted:

    - Business view, meant to be quoted to a category manager:
      ``starting_point`` (baseline price) and ``price_drivers`` (per-driver
      effect in dollars and percent). Lead with this one.
    - Technical view for callers needing the raw attribution numbers:
      ``baseline``, ``prediction``, ``log_target``, ``contributions``,
      ``residual``, ``multiplicative_factors`` and ``dollar_attribution``.
      These key names should not be surfaced to a procurement user.

    Every list is ordered largest magnitude first. The two log-target
    entries are ``None`` unless the explanation is log-target and carries
    multiplicative factors.
    """

    def _ranked(mapping, value_key, magnitude):
        # Largest magnitude first; ties keep the mapping's own order.
        ordered = sorted(
            mapping.items(), key=lambda item: magnitude(item[1]), reverse=True
        )
        return [{"feature": name, value_key: float(val)} for name, val in ordered]

    def _log_magnitude(factor):
        # Factors are ranked by |log(factor)|; non-positive factors rank last.
        return abs(np.log(factor)) if factor > 0 else 0.0

    payload = {
        "baseline": float(explanation.baseline),
        "prediction": float(explanation.prediction),
        "log_target": bool(explanation.log_target),
        "contributions": _ranked(explanation.contributions, "value", abs),
        "residual": float(explanation.residual),
    }

    factors = None
    dollars = None
    if explanation.log_target and explanation.multiplicative_factors is not None:
        factors = _ranked(
            explanation.multiplicative_factors, "factor", _log_magnitude
        )
        if explanation.dollar_attribution is not None:
            dollars = _ranked(explanation.dollar_attribution, "value", abs)
    payload["multiplicative_factors"] = factors
    payload["dollar_attribution"] = dollars

    payload["starting_point"] = _business_starting_point(explanation)
    payload["price_drivers"] = _business_price_drivers(explanation)
    return payload
138
+
139
+
140
+ def _business_starting_point(explanation: Explanation) -> float:
141
+ """The baseline price every part 'starts from' before its specs apply, in
142
+ dollars (price space for log-target models)."""
143
+ if explanation.log_target and explanation.baseline_price is not None:
144
+ return float(explanation.baseline_price)
145
+ return float(explanation.baseline)
146
+
147
+
148
+ def _business_price_drivers(explanation: Explanation) -> list[dict]:
149
+ """A single plain-language attribution list the agent can quote directly.
150
+
151
+ Each entry is one spec/supplier and its effect on the price, expressed in
152
+ BOTH dollars and percent, biggest absolute mover first:
153
+ {"driver": "Supplier ADI", "effect_dollars": 0.72, "effect_pct": 18.0}
154
+ Works for additive and log-target models alike, so the caller never has to
155
+ branch on the model's internal scale.
156
+ """
157
+ drivers: list[dict] = []
158
+ if explanation.log_target and explanation.multiplicative_factors is not None:
159
+ dollars = explanation.dollar_attribution or {}
160
+ for feature, factor in explanation.multiplicative_factors.items():
161
+ drivers.append({
162
+ "driver": feature,
163
+ "effect_dollars": (
164
+ round(float(dollars[feature]), 4) if feature in dollars else None
165
+ ),
166
+ "effect_pct": round((float(factor) - 1.0) * 100.0, 1),
167
+ })
168
+ else:
169
+ # Additive model: contributions are already dollars; percent is the
170
+ # share of the baseline each driver moves the price by.
171
+ base = float(explanation.baseline) or 1.0
172
+ for feature, value in explanation.contributions.items():
173
+ drivers.append({
174
+ "driver": feature,
175
+ "effect_dollars": round(float(value), 4),
176
+ "effect_pct": round(float(value) / base * 100.0, 1),
177
+ })
178
+ drivers.sort(key=lambda d: abs(d["effect_dollars"] or 0.0), reverse=True)
179
+ return drivers
180
+
181
+
182
def whatif_to_dict(result: WhatIfResult) -> dict:
    """Serialize a WhatIfResult into a JSON-ready dict.

    ``summary`` is the plain-language headline meant to be quoted to a
    category manager (direction, absolute dollar and percent effect, new and
    old price); lead with it. The remaining keys carry the technical detail
    behind the headline.
    """
    delta = float(result.delta)
    if delta > 0:
        direction = "adds"
    elif delta < 0:
        direction = "saves"
    else:
        direction = "no change"

    def _bounds(interval):
        # Intervals are optional; only low/high are serialized.
        if interval is None:
            return None
        return {"low": float(interval.low), "high": float(interval.high)}

    headline = {
        "direction": direction,  # "adds" | "saves" | "no change"
        "effect_dollars": round(abs(delta), 4),
        "effect_pct": round(abs(float(result.delta_pct)), 1),
        "new_price": round(float(result.counterfactual_prediction), 4),
        "old_price": round(float(result.base_prediction), 4),
    }

    changes = {}
    for column, (before, after) in result.changes.items():
        changes[column] = {"from": before, "to": after}

    factor = result.multiplicative_factor
    ranked_contributions = sorted(
        result.changed_contributions.items(),
        key=lambda item: abs(item[1]),
        reverse=True,
    )

    return {
        "summary": headline,
        "changes": changes,
        "base_prediction": float(result.base_prediction),
        "counterfactual_prediction": float(result.counterfactual_prediction),
        "delta": float(result.delta),
        "delta_pct": float(result.delta_pct),
        "log_target": bool(result.log_target),
        "multiplicative_factor": float(factor) if factor is not None else None,
        "changed_contributions": [
            {"feature": name, "value": float(val)}
            for name, val in ranked_contributions
        ],
        "interaction_contribution": float(result.interaction_contribution),
        "interaction_is_material": bool(interaction_is_material(result)),
        "base_interval": _bounds(result.base_interval),
        "cf_interval": _bounds(result.cf_interval),
    }
p2predict/outliers.py ADDED
@@ -0,0 +1,234 @@
1
+ """Outlier detection and handling for both the target and the feature columns.
2
+
3
+ Procurement data routinely contains rush orders, one-off spot buys, and
4
+ data-entry errors (a `Weight` of `100000` when someone meant `100`, a unit
5
+ mix-up between kg and g). Both kinds quietly distort a learned cost model:
6
+ target outliers warp R² and inflate intervals, feature outliers pull the
7
+ model's response surface around the bad rows.
8
+
9
+ This module flags both via the same Tukey IQR rule and applies one of four
10
+ policies.
11
+
12
+ Policies
13
+ --------
14
+ keep Flag only, change nothing.
15
+ warn Flag and warn (default). Same as keep, but with a console
16
+ message so the user actually finds out.
17
+ drop Remove flagged rows before training.
18
+ winsorize Cap flagged values at the IQR bounds (preserves row count).
19
+
20
+ Two surfaces
21
+ ------------
22
+ 1. ``apply_outlier_policy(data, target_column, policy, multiplier)``
23
+ Target-side. Inspects ``data[target_column]`` only. Has been around
24
+ since v0.3.
25
+
26
+ 2. ``apply_feature_outlier_policy(data, feature_columns, policy, multiplier)``
27
+ Feature-side. Inspects every *numerical* column in ``feature_columns``
28
+ independently. ``drop`` removes any row that has an outlier in any
29
+ feature column; ``winsorize`` caps each column at its own IQR bounds
30
+ independently. Categorical columns are silently ignored — "outlier"
31
+ doesn't have a clean meaning there (a rare category isn't necessarily
32
+ wrong, just rare). Added in v0.7.
33
+
34
+ drop semantics for the feature-side path
35
+ ----------------------------------------
36
+ We chose to drop the *whole row* when any one feature column flags an
37
+ outlier, rather than null out the offending cell. Reasoning: in
38
+ procurement data an outlier in one feature almost always signals
39
+ data-entry corruption that correlates with quality issues elsewhere in
40
+ the row (a transcription mistake, a unit confusion). Discarding the row
41
+ is the conservative move. Users who want per-cell handling should
42
+ pre-clean the CSV before feeding it in.
43
+
44
+ Detection rule (both surfaces)
45
+ ------------------------------
46
+ Standard Tukey IQR: a value is an outlier when it is below Q1 − 1.5·IQR
47
+ or above Q3 + 1.5·IQR, where IQR = Q3 − Q1. The multiplier is exposed
48
+ on the API but defaults to 1.5; raising it (e.g. 3.0) catches only
49
+ extreme outliers, lowering it (e.g. 1.0) is more aggressive.
50
+ """
51
+
52
+ import numpy as np
53
+ import pandas as pd
54
+
55
# Policy names accepted by the outlier helpers in this module.
POLICIES = ("keep", "warn", "drop", "winsorize")
# Default Tukey multiplier applied to the IQR when computing the bounds.
IQR_MULTIPLIER = 1.5


def detect_outliers(values, multiplier=IQR_MULTIPLIER):
    """Return a ``(mask, lower, upper)`` tuple flagging Tukey-IQR outliers.

    Parameters
    ----------
    values : iterable
        Any 1-D iterable. Entries that cannot be parsed as numbers and NaN
        entries are treated as non-outliers (the mask is False for them), so
        callers don't need to clean inputs first.
    multiplier : float
        IQR multiplier used for the bounds.

    Returns
    -------
    tuple
        ``mask`` is a boolean Series aligned to the input's index; ``lower``
        and ``upper`` are the float bounds. Both bounds are NaN when the
        input has no finite values. Infinite entries are left out of the
        quartile computation and are flagged whenever finite bounds exist.
    """
    # Coerce instead of casting: a plain ``astype(float)`` raises on entries
    # such as "n/a", which the contract says are simply not outliers.
    series = pd.to_numeric(pd.Series(values), errors="coerce").astype(float)
    # Quartiles come from finite values only; +/-inf in the sample can push
    # the quartiles (and therefore the bounds) to inf or NaN.
    finite = series[np.isfinite(series)]
    if finite.empty:
        no_flags = pd.Series(False, index=series.index, dtype=bool)
        return no_flags, float("nan"), float("nan")

    q1, q3 = finite.quantile([0.25, 0.75])
    iqr = q3 - q1
    if iqr == 0:
        # Central 50% collapses to a single point, so the Tukey rule
        # degenerates. Returning an all-False mask here would miss obvious
        # outliers in near-constant columns, e.g. a Weight column of
        # [10]*20 + [10_000]. Anything not equal to the central point is
        # outside the central 50%, so it is flagged; bounds collapse to
        # that point.
        point = float(q1)
        # ``NaN != point`` evaluates to True, so missing values must be
        # excluded explicitly or they would be reported as outliers.
        mask = series.notna() & (series != point)
        return mask, point, point

    lower = float(q1 - multiplier * iqr)
    upper = float(q3 + multiplier * iqr)
    # Comparisons against NaN are False, so missing values stay unflagged.
    mask = (series < lower) | (series > upper)
    return mask, lower, upper
89
+
90
+
91
def apply_outlier_policy(data, target_column, policy="warn", multiplier=IQR_MULTIPLIER):
    """Apply ``policy`` to the outliers found in ``data[target_column]``.

    Returns ``(df, summary)`` where ``summary`` has the keys
    ``n_outliers``, ``n_total``, ``lower``, ``upper``, ``policy`` and
    ``applied``. ``applied`` names the action that actually changed the
    data: ``"drop"``, ``"winsorize"``, or ``"none"``.

    Raises
    ------
    ValueError
        If ``policy`` is not one of ``POLICIES``.
    """
    if policy not in POLICIES:
        raise ValueError(f"Unknown outlier policy: {policy}. Choose from {POLICIES}.")

    flagged, lower, upper = detect_outliers(data[target_column], multiplier=multiplier)
    summary = {
        "n_outliers": int(flagged.sum()),
        "n_total": len(data),
        "lower": lower,
        "upper": upper,
        "policy": policy,
        "applied": "none",
    }

    # keep / warn never modify the frame, and with nothing flagged there is
    # nothing to do: the input object itself is handed back.
    if summary["n_outliers"] == 0 or policy not in ("drop", "winsorize"):
        return data, summary

    summary["applied"] = policy
    if policy == "drop":
        return data.loc[~flagged].reset_index(drop=True), summary

    # winsorize: cap the target at the bounds on a copy; row count preserved.
    capped = data.copy()
    capped[target_column] = capped[target_column].clip(lower=lower, upper=upper)
    return capped, summary
130
+
131
+
132
+ def _is_numeric_series(series: pd.Series) -> bool:
133
+ """True iff the column is numeric. Categorical columns are silently
134
+ skipped by the feature-outlier path because "outlier" doesn't have a
135
+ clean meaning for a discrete code — a rare category isn't necessarily
136
+ wrong, just rare. Use ``find_high_variation_features`` instead for
137
+ that case."""
138
+ return pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)
139
+
140
+
141
def apply_feature_outlier_policy(
    data, feature_columns, policy="warn", multiplier=IQR_MULTIPLIER
):
    """Apply ``policy`` to outliers across one or more *feature* columns.

    Parameters
    ----------
    data : pd.DataFrame
        Training data.
    feature_columns : Iterable[str]
        Columns to inspect. Names missing from ``data`` and non-numeric
        columns are skipped without error.
    policy : str
        One of POLICIES. ``drop`` removes every row with an outlier in *any*
        inspected column; ``winsorize`` caps each column at its own bounds;
        ``keep`` / ``warn`` leave the data untouched.
    multiplier : float
        IQR multiplier forwarded to ``detect_outliers``.

    Returns
    -------
    (df, summary) where summary has the shape::

        {
            "policy": policy,
            "applied": "drop" | "winsorize" | "none",
            "n_total": int,
            "n_outliers_total": int,
            "per_column": {
                column_name: {"n_outliers": int, "lower": float, "upper": float},
                ...
            },
        }

    ``n_outliers_total`` counts rows holding at least one outlier, so the
    per-column counts can add up to more than it.

    Raises
    ------
    ValueError
        If ``policy`` is not one of POLICIES.
    """
    if policy not in POLICIES:
        raise ValueError(f"Unknown outlier policy: {policy}. Choose from {POLICIES}.")

    per_column = {}
    row_flags = pd.Series(False, index=data.index)
    for name in feature_columns:
        if name not in data.columns or not _is_numeric_series(data[name]):
            continue
        col_flags, lower, upper = detect_outliers(data[name], multiplier=multiplier)
        hits = int(col_flags.sum())
        per_column[name] = {"n_outliers": hits, "lower": lower, "upper": upper}
        if hits > 0:
            # Fold this column's flags into the row-level mask.
            row_flags = row_flags | col_flags.reindex(data.index, fill_value=False)

    summary = {
        "policy": policy,
        "applied": "none",
        "n_total": len(data),
        "n_outliers_total": int(row_flags.sum()),
        "per_column": per_column,
    }

    # keep / warn never modify the frame, and with no flagged row there is
    # nothing to do: the input object itself is handed back.
    if summary["n_outliers_total"] == 0 or policy not in ("drop", "winsorize"):
        return data, summary

    summary["applied"] = policy
    if policy == "drop":
        return data.loc[~row_flags].reset_index(drop=True), summary

    # winsorize: cap each flagged column at its own bounds on a copy.
    capped = data.copy()
    flagged_columns = {
        name: info for name, info in per_column.items() if info["n_outliers"] != 0
    }
    for name, info in flagged_columns.items():
        capped[name] = capped[name].clip(lower=info["lower"], upper=info["upper"])
    return capped, summary