p2predict 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- p2predict/__init__.py +88 -0
- p2predict/__main__.py +11 -0
- p2predict/cli/__init__.py +9 -0
- p2predict/cli/predict.py +706 -0
- p2predict/cli/train.py +659 -0
- p2predict/cmdline_io.py +64 -0
- p2predict/explain.py +464 -0
- p2predict/feature_selection.py +139 -0
- p2predict/hpo_training.py +44 -0
- p2predict/input_checks.py +59 -0
- p2predict/intervals.py +317 -0
- p2predict/json_output.py +225 -0
- p2predict/mcp/__init__.py +1 -0
- p2predict/mcp/__main__.py +3 -0
- p2predict/mcp/conversions.py +44 -0
- p2predict/mcp/registry.py +149 -0
- p2predict/mcp/server.py +1258 -0
- p2predict/model_evals.py +36 -0
- p2predict/model_utils.py +235 -0
- p2predict/outliers.py +234 -0
- p2predict/plotting.py +499 -0
- p2predict/prepare_data.py +48 -0
- p2predict/preprocessing.py +130 -0
- p2predict/quality.py +457 -0
- p2predict/trained_model_io.py +64 -0
- p2predict/training.py +270 -0
- p2predict/ui_console.py +36 -0
- p2predict/whatif.py +269 -0
- p2predict-0.9.0.dist-info/METADATA +216 -0
- p2predict-0.9.0.dist-info/RECORD +34 -0
- p2predict-0.9.0.dist-info/WHEEL +5 -0
- p2predict-0.9.0.dist-info/entry_points.txt +4 -0
- p2predict-0.9.0.dist-info/licenses/LICENSE +121 -0
- p2predict-0.9.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
|
|
3
|
+
from p2predict.training import ALGORITHMS, _budget_params, _tune, build_pipeline, should_log_target
|
|
4
|
+
|
|
5
|
+
console = Console()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def hyper_parameter_tuning(
    X_train,
    y_train,
    numerical_cols,
    categorical_cols,
    algorithm,
    budget="fast",
    time_aware=False,
    log_target=None,
):
    """Tune a single algorithm and hand back the best pipeline found.

    When ``log_target`` is ``None`` the choice is delegated to
    ``should_log_target(y_train)``; an explicit value is used as given.
    The cross-validated score reported by ``_tune`` is printed to the
    module console.

    Returns
    -------
    tuple
        ``(best_model, best_score, log_target)`` where ``log_target`` is
        the value that was actually used to build the pipeline.
    """
    # Resolve the log-target decision once so the pipeline, the tuner and
    # the caller all see the same value.
    use_log = should_log_target(y_train) if log_target is None else log_target

    untuned = build_pipeline(
        algorithm, numerical_cols, categorical_cols, log_target=use_log
    )
    tuned, cv_score = _tune(
        untuned,
        X_train,
        y_train,
        algorithm,
        budget,
        use_log,
        time_aware=time_aware,
    )

    console.print(f"Tuned {algorithm} --> CV R²: {round(cv_score, 3)}")
    return tuned, cv_score, use_log
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def compare_all_algorithms(
    X_train, y_train, numerical_cols, categorical_cols, budget="fast"
):
    """Tune each entry of ``ALGORITHMS`` and keep the highest scorer.

    The log-target decision is made once via ``should_log_target(y_train)``
    and shared by every candidate. Each candidate's score is printed as it
    completes.

    Returns
    -------
    tuple
        ``(best, best_score, log_target)`` where ``best`` is an
        ``(algorithm, model)`` pair, or ``None`` if no candidate scored
        strictly above ``-inf``.
    """
    use_log = should_log_target(y_train)
    top_score = float("-inf")
    winner = None

    for name in ALGORITHMS:
        candidate = build_pipeline(
            name, numerical_cols, categorical_cols, log_target=use_log
        )
        tuned, cv_score = _tune(
            candidate, X_train, y_train, name, budget, use_log
        )
        console.print(f"Model: {name} --> CV R²: {round(cv_score, 3)}")

        # Strict comparison: ties keep the earlier algorithm, and a NaN
        # score can never displace the current winner.
        if not cv_score > top_score:
            continue
        top_score, winner = cv_score, (name, tuned)

    return winner, top_score, use_log
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
|
|
4
|
+
# Route warnings to stderr so that `--json` callers get a pure-JSON stdout.
|
|
5
|
+
# A module-level Console() on stdout used to print the NA warning *before*
|
|
6
|
+
# the JSON document, making `p2predict-train --json` output unparseable.
|
|
7
|
+
console = Console(stderr=True)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def check_csv_sanity(file):
    """Read ``file`` as CSV, validate it, and return the DataFrame untouched.

    The process is aborted (red message on the module console, then
    ``SystemExit(1)``) when the file is missing, cannot be parsed, is
    empty, or has a header cell that is blank after stripping whitespace.

    Missing values are only *reported* as a yellow warning; no rows are
    removed here. Per the original design notes, dropping or imputing is
    decided downstream once the target and feature columns are known, so
    NAs in columns that are never used do not cost any rows.
    """

    def abort(message):
        # Single exit path so every failure prints and terminates alike.
        console.print(message, style="red")
        raise SystemExit(1)

    try:
        frame = pd.read_csv(file)
    except FileNotFoundError:
        abort(f"Aborted: File '{file}' not found")
    except pd.errors.ParserError as e:
        abort(f"Aborted: Invalid CSV format in '{file}': {e}")
    except pd.errors.EmptyDataError:
        abort("Aborted: CSV file is empty")

    if frame.empty:
        abort("Aborted: CSV file is empty")

    # 1-based positions of string headers that are blank once stripped.
    blank_positions = [
        position
        for position, header in enumerate(frame.columns, start=1)
        if isinstance(header, str) and not header.strip()
    ]
    if blank_positions:
        abort(
            f"Aborted: CSV file contains empty column(s) at position(s): {blank_positions}"
        )

    # (column, count) pairs for every column with at least one NA.
    missing = [(col, n) for col, n in frame.isna().sum().items() if n > 0]
    if missing:
        details = ", ".join(f"{col} ({n})" for col, n in missing)
        console.print(
            f"Warning: CSV contains missing values in: {details}. "
            "Rows are not dropped at load time — NAs in the target column are "
            "dropped at training, and NAs in feature columns are handled by "
            "the model (XGBoost natively; imputed for random_forest/ridge).",
            style="yellow",
        )

    return frame
|
p2predict/intervals.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Conformal prediction intervals for P2Predict models.
|
|
2
|
+
|
|
3
|
+
What this module computes
|
|
4
|
+
-------------------------
|
|
5
|
+
For each prediction, a "likely range" [low, high] that is mathematically
|
|
6
|
+
guaranteed to contain the true value with probability >= 1 - alpha (the
|
|
7
|
+
target coverage rate), under the assumption that future inputs come from
|
|
8
|
+
the same distribution as the training data.
|
|
9
|
+
|
|
10
|
+
This guarantee is what split conformal prediction provides — the interval
|
|
11
|
+
isn't just a heuristic ±2σ, it has a finite-sample coverage proof. The
|
|
12
|
+
proof rests on exchangeability of (X_test, y_test) with (X_future,
|
|
13
|
+
y_future): a much weaker assumption than the parametric normality
|
|
14
|
+
assumptions that classical prediction intervals rely on.
|
|
15
|
+
|
|
16
|
+
Algorithm: split conformal with the test set as calibration set
|
|
17
|
+
---------------------------------------------------------------
|
|
18
|
+
1. Train the model on the training split (already done by the time we
|
|
19
|
+
get here).
|
|
20
|
+
2. Compute absolute residuals on the held-out test split:
|
|
21
|
+
r_i = |y_test_i - model.predict(x_test_i)| (or in log space for log-target)
|
|
22
|
+
3. For coverage 1 - alpha, the conformal threshold is the
|
|
23
|
+
k-th smallest residual, where k = ceil((n + 1) * (1 - alpha))
|
|
24
|
+
In numpy: ``np.quantile(residuals, q, method="higher")`` with q
|
|
25
|
+
chosen to match.
|
|
26
|
+
4. At predict time:
|
|
27
|
+
low, high = pred - q_hat, pred + q_hat (additive intervals)
|
|
28
|
+
For log-target models the calibration is done in log space, so the
|
|
29
|
+
bounds transform via exp() to multiplicative intervals in price space:
|
|
30
|
+
low, high = pred * exp(-q_hat), pred * exp(+q_hat)
|
|
31
|
+
|
|
32
|
+
Why use the test set for calibration instead of a separate split
|
|
33
|
+
----------------------------------------------------------------
|
|
34
|
+
The natural worry is double-dipping: "if we report R² on the test set
|
|
35
|
+
and then use the same test set residuals to calibrate intervals, are
|
|
36
|
+
the metrics still valid?" Yes. R² on the test set remains an unbiased
|
|
37
|
+
estimate of generalization on the underlying distribution; computing a
|
|
38
|
+
downstream statistic (the conformal quantile) from those same residuals
|
|
39
|
+
doesn't change that. The exchangeability assumption holds for (X_test,
|
|
40
|
+
y_test) ~ (X_future, y_future) regardless of what else we use the test
|
|
41
|
+
residuals for, as long as we don't select the model based on them.
|
|
42
|
+
|
|
43
|
+
The data-efficiency win is real: a separate calibration split would
|
|
44
|
+
shrink the training set by 16% in the standard 80/20 setup, slightly
|
|
45
|
+
degrading the model. Using test residuals avoids that.
|
|
46
|
+
|
|
47
|
+
Why we calibrate in log space when log-target is active
|
|
48
|
+
-------------------------------------------------------
|
|
49
|
+
Procurement prices vary by orders of magnitude. A $1 part and a $1,000
|
|
50
|
+
part should not get the same ± dollar interval — that would be useless
|
|
51
|
+
on the small end and reckless on the large. Calibrating absolute log-
|
|
52
|
+
residuals gives constant-width intervals in log space, which transform
|
|
53
|
+
to *multiplicative* intervals in price space:
|
|
54
|
+
|
|
55
|
+
[pred * exp(-q_hat), pred * exp(+q_hat)]
|
|
56
|
+
|
|
57
|
+
Same percentage-width regardless of prediction magnitude. Procurement-
|
|
58
|
+
natural.
|
|
59
|
+
|
|
60
|
+
For non-log-target models we use absolute residuals in the target's
|
|
61
|
+
native units, giving constant-width additive intervals. That's the
|
|
62
|
+
right behaviour when the target is something like profit margin
|
|
63
|
+
(which can be negative and isn't bounded multiplicatively).
|
|
64
|
+
|
|
65
|
+
Banded (Mondrian) calibration
|
|
66
|
+
-----------------------------
|
|
67
|
+
A single global q_hat gives every prediction the same width, which lets the
|
|
68
|
+
noisiest segment of the data set the width for everyone: on a catalog whose
|
|
69
|
+
sub-$5 parts are near-random, the $200 parts inherit that noise in their
|
|
70
|
+
likely range. When the calibration set is large enough we therefore
|
|
71
|
+
partition it into bands by *predicted* value (terciles of the calibration
|
|
72
|
+
predictions) and compute a separate conformal quantile per band — Mondrian
|
|
73
|
+
conformal prediction. The coverage guarantee then holds *within each band*,
|
|
74
|
+
not just on average, because the banding rule depends only on the model's
|
|
75
|
+
prediction (a function of X), never on the calibration labels.
|
|
76
|
+
|
|
77
|
+
Fallbacks keep the old behaviour bit-for-bit:
|
|
78
|
+
* calibration dicts saved by older versions (no "predictions" key),
|
|
79
|
+
* calibration sets smaller than MIN_CALIBRATION_FOR_BANDING,
|
|
80
|
+
both produce the single global quantile exactly as before.
|
|
81
|
+
|
|
82
|
+
User-facing language
|
|
83
|
+
--------------------
|
|
84
|
+
The CLI and README deliberately avoid "confidence interval" (technically
|
|
85
|
+
wrong for prediction intervals anyway), "alpha", "conformal", and
|
|
86
|
+
"coverage". We use "likely range" and natural-frequency framing
|
|
87
|
+
("9 in 10 similar parts fall in this range"). Bands surface to users as
|
|
88
|
+
"calibrated on similar-priced parts". This module's docstrings keep the
|
|
89
|
+
technical names because the audience here is developers.
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
from __future__ import annotations
|
|
93
|
+
|
|
94
|
+
from dataclasses import dataclass
|
|
95
|
+
from typing import Optional
|
|
96
|
+
|
|
97
|
+
import numpy as np
|
|
98
|
+
from sklearn.compose import TransformedTargetRegressor
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Banding thresholds. Three bands of >= 50 calibration points each keeps the
|
|
102
|
+
# per-band conformal quantile stable; below 150 total we stay global. Chosen
|
|
103
|
+
# so a standard 80/20 split bands from ~750 total rows (~150 held out) upward.
|
|
104
|
+
N_BANDS = 3
|
|
105
|
+
MIN_CALIBRATION_FOR_BANDING = 150
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class IntervalResult:
    """A single point prediction together with its likely range.

    ``low``, ``prediction`` and ``high`` all share the target's own units,
    whether or not the model was trained with a log-target transform.

    ``band`` describes which calibration band supplied the interval width
    (for example ``"predicted 5.20 to 155.00"``). It is ``None`` when the
    single global quantile was used instead, i.e. for calibration data
    saved without predictions or a calibration set too small to band.
    """

    # Lower bound of the likely range.
    low: float
    # The model's point prediction.
    prediction: float
    # Upper bound of the likely range.
    high: float
    # The coverage level that was requested, e.g. 0.90 for 90%.
    coverage: float
    # Human-readable band label, or None for the global quantile.
    band: Optional[str] = None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def compute_calibration_residuals(model, X_test, y_test) -> dict:
    """Build the calibration payload that is saved alongside a model.

    Raw absolute residuals are kept (rather than one precomputed
    quantile) so any coverage level can be chosen at predict time without
    retraining.

    A model that is a ``TransformedTargetRegressor`` is treated as
    log-target: its residuals are ``|log(y) - log(pred)|`` and rows where
    either value is not strictly positive are discarded. Otherwise the
    residuals are ``|y - pred|`` in the target's native units.

    Returns
    -------
    dict
        ``residuals``      list of absolute residuals,
        ``predictions``    target-space predictions aligned 1:1 with
                           ``residuals`` (used for banding),
        ``in_log_space``   whether the residuals are log-space,
        ``n_calibration``  number of residuals kept.
    """
    actual = np.asarray(y_test, dtype=float)
    # Predictions come back in target units even for log-target models.
    predicted = np.asarray(model.predict(X_test), dtype=float)
    log_space = isinstance(model, TransformedTargetRegressor)

    if log_space:
        # log() needs strictly positive operands; drop anything else
        # instead of letting NaN/inf leak into the stored residuals.
        with np.errstate(invalid="ignore", divide="ignore"):
            keep = (predicted > 0) & (actual > 0)
        predicted = predicted[keep]
        abs_err = np.abs(np.log(actual[keep]) - np.log(predicted))
    else:
        abs_err = np.abs(actual - predicted)

    return {
        "residuals": abs_err.tolist(),
        "predictions": predicted.tolist(),
        "in_log_space": log_space,
        "n_calibration": int(len(abs_err)),
    }
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _conformal_quantile(residuals: np.ndarray, alpha: float) -> float:
|
|
175
|
+
"""The k-th smallest residual that gives finite-sample coverage 1-alpha.
|
|
176
|
+
|
|
177
|
+
Standard split-conformal quantile rule. Uses ``method="higher"`` so we
|
|
178
|
+
err on the safe side (slightly wider interval) rather than the
|
|
179
|
+
optimistic side when the order statistic falls between samples.
|
|
180
|
+
"""
|
|
181
|
+
n = len(residuals)
|
|
182
|
+
if n == 0:
|
|
183
|
+
raise ValueError("Cannot calibrate with zero residuals.")
|
|
184
|
+
# k / n quantile, where k = ceil((n + 1) * (1 - alpha)).
|
|
185
|
+
# Clip to (0, 1] so np.quantile is well-defined for tiny n.
|
|
186
|
+
q_level = min(1.0, np.ceil((n + 1) * (1.0 - alpha)) / n)
|
|
187
|
+
return float(np.quantile(residuals, q_level, method="higher"))
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _build_bands(cal_preds: np.ndarray, residuals: np.ndarray, alpha: float):
    """Split the calibration set into ``N_BANDS`` bands by predicted value.

    Returns ``(edges, band_q_hats, band_labels)``: the interior band edges
    (quantiles of ``cal_preds``), one conformal quantile per band, and one
    human-readable label per band.

    Returns ``None`` when banding should not be used: fewer than
    ``MIN_CALIBRATION_FOR_BANDING`` points, ``residuals`` and
    ``cal_preds`` of different length, coinciding edges, or an empty band.

    Band membership depends only on the predictions, never on the
    residuals themselves.
    """
    total = len(cal_preds)
    if len(residuals) != total or total < MIN_CALIBRATION_FOR_BANDING:
        return None

    # Interior cut points only; the outer limits are +/- infinity.
    cut_points = np.quantile(cal_preds, np.linspace(0, 1, N_BANDS + 1)[1:-1])
    if len(np.unique(cut_points)) != len(cut_points):
        # Predictions are so concentrated that two edges coincide, which
        # would leave a band empty or degenerate.
        return None

    membership = np.searchsorted(cut_points, cal_preds, side="right")
    limits = np.concatenate(([-np.inf], cut_points, [np.inf]))

    thresholds = []
    descriptions = []
    for idx, (lower, upper) in enumerate(zip(limits[:-1], limits[1:])):
        in_band = residuals[membership == idx]
        if len(in_band) == 0:
            return None
        thresholds.append(_conformal_quantile(in_band, alpha))

        if np.isinf(lower):
            text = f"predicted under {upper:,.2f}"
        elif np.isinf(upper):
            text = f"predicted over {lower:,.2f}"
        else:
            text = f"predicted {lower:,.2f} to {upper:,.2f}"
        descriptions.append(text)

    return cut_points, thresholds, descriptions
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def predict_interval(
    model,
    x,
    calibration: dict,
    coverage: float = 0.90,
) -> list[IntervalResult]:
    """Compute a likely range around the prediction for every row of ``x``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    x
        Inputs handed to ``model.predict``; one interval per prediction.
    calibration
        Dict as produced by ``compute_calibration_residuals``. Must hold
        ``residuals``; ``in_log_space`` (default ``False``) and
        ``predictions`` are optional.
    coverage
        Target coverage, strictly between 0 and 1 (0.90 = "9 in 10").

    Returns
    -------
    list[IntervalResult]
        One entry per prediction, in order.

    Raises
    ------
    ValueError
        If ``coverage`` is not strictly inside (0, 1), or if the
        calibration residuals are empty.
    """
    if not (0.0 < coverage < 1.0):
        raise ValueError(f"Coverage must be strictly between 0 and 1, got {coverage}.")

    cal_residuals = np.asarray(calibration["residuals"], dtype=float)
    log_space = bool(calibration.get("in_log_space", False))
    miscoverage = 1.0 - coverage
    # Always computed: it is the fallback width, and it raises early on an
    # empty calibration set.
    global_width = _conformal_quantile(cal_residuals, miscoverage)

    # Per-band widths need the stored calibration predictions. Without
    # them, or when _build_bands declines, use the global width.
    stored_preds = calibration.get("predictions")
    if stored_preds is None:
        banding = None
    else:
        banding = _build_bands(
            np.asarray(stored_preds, dtype=float), cal_residuals, miscoverage
        )

    point = np.asarray(model.predict(x), dtype=float)
    if banding is not None:
        cut_points, per_band_width, per_band_label = banding
        which = np.searchsorted(cut_points, point, side="right")
        width = np.asarray(per_band_width, dtype=float)[which]
        tags = [per_band_label[i] for i in which]
    else:
        width = np.full(point.shape, global_width)
        tags = [None] * len(point)

    if log_space:
        # Log-space residuals become multiplicative bounds in target space.
        lower = point * np.exp(-width)
        upper = point * np.exp(+width)
    else:
        lower = point - width
        upper = point + width

    results = []
    for lo, mid, hi, tag in zip(lower, point, upper, tags):
        results.append(
            IntervalResult(
                low=float(lo),
                prediction=float(mid),
                high=float(hi),
                coverage=coverage,
                band=tag,
            )
        )
    return results
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def coverage_health(calibration: Optional[dict]) -> Optional[str]:
    """Return a short caveat when the calibration set is too small.

    ``None`` means the intervals need no caveat.

    The calibration size is read from ``n_calibration``. If that key is
    absent (or ``None``), the size falls back to the length of the stored
    ``residuals`` list, so a dict that carries residuals is never reported
    as empty just because the count was not saved with it.

    Below 20 calibration points the interval width is very sensitive to
    individual residuals, so a warning string is returned for the caller
    to surface.

    Parameters
    ----------
    calibration
        The calibration dict stored with the model, or ``None`` when the
        model has none.

    Returns
    -------
    Optional[str]
        A caveat message, or ``None`` when there are at least 20 points.
    """
    if calibration is None:
        return "no calibration data stored with this model — re-train on v0.5+ for likely-range support"

    n = calibration.get("n_calibration")
    if n is None:
        # Residuals are what interval computation actually uses, so their
        # length is the authoritative size when no count was stored.
        n = len(calibration.get("residuals") or ())
    n = int(n)

    if n == 0:
        return "calibration set is empty — likely range is undefined"
    if n < 20:
        return f"calibration set is small (n={n}) — likely range may be noisy"
    return None
|
p2predict/json_output.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""Stable JSON output schema for P2Predict's CLIs.
|
|
2
|
+
|
|
3
|
+
When either CLI is invoked with ``--json``, all human-facing output is
|
|
4
|
+
suppressed and a single JSON document is emitted to stdout instead.
|
|
5
|
+
Stderr stays clean (no spinner, no logo). Exit code is 0 on success,
|
|
6
|
+
1 on error. Even on errors, a JSON-shaped error document
|
|
7
|
+
is still emitted to stdout so an agent that piped the output can parse
|
|
8
|
+
it instead of seeing a Rich-formatted abort message.
|
|
9
|
+
|
|
10
|
+
Schema versioning
|
|
11
|
+
-----------------
|
|
12
|
+
Every JSON document includes a top-level ``schema_version`` field
|
|
13
|
+
(currently "1.0"). When fields are added we can leave the version
|
|
14
|
+
alone; when fields are renamed or removed we bump the major number.
|
|
15
|
+
Tests in tests/test_json_output.py lock in the field names so this
|
|
16
|
+
doesn't drift accidentally.
|
|
17
|
+
|
|
18
|
+
Predict (``p2predict ... --json``)
|
|
19
|
+
----------------------------------
|
|
20
|
+
::
|
|
21
|
+
|
|
22
|
+
{
|
|
23
|
+
"schema_version": "1.0",
|
|
24
|
+
"command": "predict",
|
|
25
|
+
"model": {
|
|
26
|
+
"path": str,
|
|
27
|
+
"algorithm": str,
|
|
28
|
+
"target": str,
|
|
29
|
+
"version": str, # the p2predict_version saved in the model
|
|
30
|
+
"log_target": bool,
|
|
31
|
+
"features": [str, ...]
|
|
32
|
+
},
|
|
33
|
+
"mode": "inline" | "batch" | "interactive",
|
|
34
|
+
"predictions": [ # one entry per input row
|
|
35
|
+
{
|
|
36
|
+
"input": {feature: value, ...},
|
|
37
|
+
"prediction": float
|
|
38
|
+
},
|
|
39
|
+
...
|
|
40
|
+
],
|
|
41
|
+
"interval": { # present when --interval N was passed
|
|
42
|
+
"coverage": float, # e.g. 0.90
|
|
43
|
+
"per_row": [
|
|
44
|
+
{"low": float, "high": float, "prediction": float},
|
|
45
|
+
...
|
|
46
|
+
],
|
|
47
|
+
"soft_warning": str | null # non-null when calibration is small
|
|
48
|
+
},
|
|
49
|
+
"explanation": [ # present when --explain was passed
|
|
50
|
+
{ # one entry per input row
|
|
51
|
+
"baseline": float,
|
|
52
|
+
"prediction": float,
|
|
53
|
+
"log_target": bool,
|
|
54
|
+
"contributions": [{"feature": str, "value": float}, ...],
|
|
55
|
+
"multiplicative_factors": # only for log-target models
|
|
56
|
+
[{"feature": str, "factor": float}, ...] | null,
|
|
57
|
+
"dollar_attribution": # only for log-target models, labelled
|
|
58
|
+
# approximate in the README
|
|
59
|
+
[{"feature": str, "value": float}, ...] | null,
|
|
60
|
+
"residual": float
|
|
61
|
+
},
|
|
62
|
+
...
|
|
63
|
+
],
|
|
64
|
+
"whatif": { # present when --whatif "Feature:NewVal,..." was passed
|
|
65
|
+
"changes": {feature: {"from": value, "to": value}, ...},
|
|
66
|
+
"base_prediction": float,
|
|
67
|
+
"counterfactual_prediction": float,
|
|
68
|
+
"delta": float,
|
|
69
|
+
"delta_pct": float,
|
|
70
|
+
"log_target": bool,
|
|
71
|
+
"multiplicative_factor": float | null,
|
|
72
|
+
"changed_contributions": [{"feature": str, "value": float}, ...],
|
|
73
|
+
"interaction_contribution": float,
|
|
74
|
+
"interaction_is_material": bool,
|
|
75
|
+
"base_interval": {"low": float, "high": float} | null,
|
|
76
|
+
"cf_interval": {"low": float, "high": float} | null
|
|
77
|
+
},
|
|
78
|
+
"batch": { # present in batch mode (-i)
|
|
79
|
+
"csv_path": str, # where predictions got written
|
|
80
|
+
"n_rows": int
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
Train (``p2predict-train ... --json``)
|
|
85
|
+
--------------------------------------
|
|
86
|
+
::
|
|
87
|
+
|
|
88
|
+
{
|
|
89
|
+
"schema_version": "1.0",
|
|
90
|
+
"command": "train",
|
|
91
|
+
"input": {
|
|
92
|
+
"csv_path": str,
|
|
93
|
+
"rows_loaded": int,
|
|
94
|
+
"rows_after_outlier_handling": int,
|
|
95
|
+
"target": str
|
|
96
|
+
},
|
|
97
|
+
"mode": "auto" | "expert",
|
|
98
|
+
"time_column": str | null,
|
|
99
|
+
"outliers": {
|
|
100
|
+
"target": {
|
|
101
|
+
"policy": str, # keep / warn / drop / winsorize
|
|
102
|
+
"applied": str, # the action that actually changed data
|
|
103
|
+
"n_outliers": int,
|
|
104
|
+
"n_total": int,
|
|
105
|
+
"lower": float | null,
|
|
106
|
+
"upper": float | null
|
|
107
|
+
},
|
|
108
|
+
"features": {
|
|
109
|
+
"policy": str,
|
|
110
|
+
"applied": str,
|
|
111
|
+
"n_outliers_total": int,
|
|
112
|
+
"per_column": {col: {"n_outliers": int,
|
|
113
|
+
"lower": float,
|
|
114
|
+
"upper": float}, ...}
|
|
115
|
+
}
|
|
116
|
+
},
|
|
117
|
+
"low_info_features": {
|
|
118
|
+
"no_information": [str, ...],
|
|
119
|
+
"high_variation": [str, ...]
|
|
120
|
+
},
|
|
121
|
+
"features_selected": [str, ...],
|
|
122
|
+
"algorithm_selected": str,
|
|
123
|
+
"log_target": bool,
|
|
124
|
+
"log_target_decision": str, # "auto:skew=<value>" | "manual:on" | "manual:off"
|
|
125
|
+
"cv_scores": {algo: float, ...}, # auto-mode only
|
|
126
|
+
"feature_importances": [
|
|
127
|
+
{"feature": str, "importance": float},
|
|
128
|
+
...
|
|
129
|
+
],
|
|
130
|
+
"evaluation": {
|
|
131
|
+
"r2": float,
|
|
132
|
+
"mae": float,
|
|
133
|
+
"rmse": float,
|
|
134
|
+
"residual_bias_p_value": float,
|
|
135
|
+
"quality_label": "Excellent" | "Good" | "Needs Improvement"
|
|
136
|
+
},
|
|
137
|
+
"model_path": str | null, # null if not saved (interactive declined)
|
|
138
|
+
"report_path": str | null # null unless --report PATH was passed
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
Errors (any command, when --json is set)
|
|
142
|
+
----------------------------------------
|
|
143
|
+
::
|
|
144
|
+
|
|
145
|
+
{
|
|
146
|
+
"schema_version": "1.0",
|
|
147
|
+
"command": "predict" | "train",
|
|
148
|
+
"error": {
|
|
149
|
+
"code": str, # short identifier, e.g. "missing_input"
|
|
150
|
+
"message": str # human-readable description
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
"""
|
|
154
|
+
|
|
155
|
+
from __future__ import annotations
|
|
156
|
+
|
|
157
|
+
import json
|
|
158
|
+
import sys
|
|
159
|
+
from typing import Any
|
|
160
|
+
|
|
161
|
+
JSON_SCHEMA_VERSION = "1.0"
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def emit(payload: dict[str, Any]) -> None:
    """Write ``payload`` to stdout as one strictly valid JSON document.

    Single source of truth so the serialisation options stay consistent
    across the two CLIs: two-space indent, non-ASCII characters written
    as-is, ``_json_default`` for non-native types, and a trailing newline
    followed by a flush.

    Non-finite floats (NaN, +/-Infinity) are emitted as ``null``. Python's
    encoder would otherwise write the bare tokens ``NaN`` / ``Infinity``,
    which are not JSON and make strict parsers reject the whole document.
    """
    # First pass resolves every non-native type through _json_default.
    encoded = json.dumps(payload, default=_json_default, ensure_ascii=False)
    # Round-trip: parse_constant fires only for the bare NaN / Infinity /
    # -Infinity tokens, so mapping them to None removes every non-finite
    # number, including ones that came out of numpy scalars or arrays.
    cleaned = json.loads(encoded, parse_constant=lambda _token: None)

    json.dump(
        cleaned,
        sys.stdout,
        indent=2,
        ensure_ascii=False,
        allow_nan=False,  # guard: nothing non-finite may slip through
    )
    sys.stdout.write("\n")
    sys.stdout.flush()
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def emit_error(command: str, code: str, message: str, exit_code: int = 1) -> None:
    """Print a JSON error document via ``emit`` and terminate the process.

    This is the ``--json`` counterpart of printing an "Aborted: ..."
    message and exiting: whoever consumes stdout still receives a
    parseable document when the command fails.

    Parameters
    ----------
    command
        Name of the command that failed, stored under ``"command"``.
    code
        Short error identifier, stored under ``error.code``.
    message
        Human-readable description, stored under ``error.message``.
    exit_code
        Status passed to ``SystemExit`` (default 1).

    Raises
    ------
    SystemExit
        Always, after the document has been emitted.
    """
    document = {
        "schema_version": JSON_SCHEMA_VERSION,
        "command": command,
        "error": {"code": code, "message": message},
    }
    emit(document)
    raise SystemExit(exit_code)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _json_default(obj: Any) -> Any:
|
|
194
|
+
"""Coerce non-JSON-native types we routinely return from the model
|
|
195
|
+
stack (numpy scalars, pandas/numpy timestamps, dataclasses) into
|
|
196
|
+
plain Python so json.dump doesn't choke.
|
|
197
|
+
|
|
198
|
+
Anything that survives this and still isn't serialisable will raise
|
|
199
|
+
TypeError, which is what we want — better an explicit failure than a
|
|
200
|
+
silent string-cast that lies about the field's type.
|
|
201
|
+
"""
|
|
202
|
+
# numpy scalars
|
|
203
|
+
try:
|
|
204
|
+
import numpy as np
|
|
205
|
+
if isinstance(obj, np.generic):
|
|
206
|
+
return obj.item()
|
|
207
|
+
if isinstance(obj, np.ndarray):
|
|
208
|
+
return obj.tolist()
|
|
209
|
+
except ImportError: # pragma: no cover — numpy is a hard dep
|
|
210
|
+
pass
|
|
211
|
+
# pandas timestamps
|
|
212
|
+
try:
|
|
213
|
+
import pandas as pd
|
|
214
|
+
if isinstance(obj, pd.Timestamp):
|
|
215
|
+
return obj.isoformat()
|
|
216
|
+
except ImportError: # pragma: no cover — pandas is a hard dep
|
|
217
|
+
pass
|
|
218
|
+
# dataclasses
|
|
219
|
+
if hasattr(obj, "__dataclass_fields__"):
|
|
220
|
+
from dataclasses import asdict
|
|
221
|
+
return asdict(obj)
|
|
222
|
+
raise TypeError(
|
|
223
|
+
f"Cannot serialise object of type {type(obj).__name__} to JSON. "
|
|
224
|
+
"Coerce it in the CLI before emitting."
|
|
225
|
+
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""P2Predict MCP server — typed tools for AI agents."""
|