p2predict 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ from rich.console import Console
2
+
3
+ from p2predict.training import ALGORITHMS, _budget_params, _tune, build_pipeline, should_log_target
4
+
5
+ console = Console()
6
+
7
+
8
def hyper_parameter_tuning(
    X_train, y_train, numerical_cols, categorical_cols, algorithm,
    budget="fast", time_aware=False, log_target=None,
):
    """Tune one algorithm and hand back its best pipeline.

    When ``log_target`` is ``None`` the choice is delegated to
    ``should_log_target(y_train)``; any other value is used as given.
    ``budget`` and ``time_aware`` are passed straight through to ``_tune``.

    Returns a ``(best_model, best_score, log_target)`` tuple, where
    ``log_target`` is the value that was actually used to build the
    pipeline and ``best_score`` is the score reported by ``_tune``
    (printed as the CV R²).
    """
    use_log = should_log_target(y_train) if log_target is None else log_target
    candidate = build_pipeline(
        algorithm, numerical_cols, categorical_cols, log_target=use_log
    )
    tuned = _tune(
        candidate, X_train, y_train, algorithm, budget, use_log,
        time_aware=time_aware,
    )
    best_model, best_score = tuned
    console.print(f"Tuned {algorithm} --> CV R²: {round(best_score, 3)}")
    return best_model, best_score, use_log
24
+
25
+
26
def compare_all_algorithms(
    X_train, y_train, numerical_cols, categorical_cols, budget="fast",
    time_aware=False,
):
    """Tune every supported algorithm and report the best.

    Each entry of ``ALGORITHMS`` gets its own pipeline (all sharing the
    log-target decision made once from ``y_train``) and is tuned with the
    same ``budget``. One line per algorithm is printed with its CV R².

    Parameters
    ----------
    time_aware
        Forwarded to ``_tune`` in the same way ``hyper_parameter_tuning``
        does, so the comparison can use the same tuning mode as a
        single-algorithm run. Defaults to ``False``, in which case the
        ``_tune`` call is exactly the one made before this parameter
        existed.

    Returns
    -------
    ``(best, best_score, log_target)`` where ``best`` is an
    ``(algorithm, model)`` tuple. ``best`` is ``None`` (and ``best_score``
    is ``-inf``) when no algorithm produced a score greater than ``-inf``,
    e.g. when ``ALGORITHMS`` is empty or every score is NaN.
    """
    log_target = should_log_target(y_train)
    # Only pass the keyword when it was asked for: the default call must stay
    # identical to the historical positional-only call into _tune.
    tune_kwargs = {"time_aware": time_aware} if time_aware else {}

    best_score = float("-inf")
    best = None
    for algorithm in ALGORITHMS:
        pipeline = build_pipeline(
            algorithm, numerical_cols, categorical_cols, log_target=log_target
        )
        model, score = _tune(
            pipeline, X_train, y_train, algorithm, budget, log_target,
            **tune_kwargs,
        )
        console.print(f"Model: {algorithm} --> CV R²: {round(score, 3)}")
        # Strict comparison: the first algorithm wins ties, NaN never wins.
        if score > best_score:
            best_score = score
            best = (algorithm, model)
    return best, best_score, log_target
@@ -0,0 +1,59 @@
1
+ import pandas as pd
2
+ from rich.console import Console
3
+
4
+ # Route warnings to stderr so that `--json` callers get a pure-JSON stdout.
5
+ # A module-level Console() on stdout used to print the NA warning *before*
6
+ # the JSON document, making `p2predict-train --json` output unparseable.
7
+ console = Console(stderr=True)
8
+
9
+
10
def check_csv_sanity(file):
    """Read a CSV, validate it, and return the DataFrame untouched.

    The process is aborted (red message on the module console, then
    ``SystemExit(1)``) when the file is missing, cannot be parsed, is
    empty, or has a header cell that is blank after stripping whitespace.

    Missing values never abort and are never dropped here: they are only
    summarised per column in a yellow warning. Deciding which NAs matter
    is left to the caller, which knows the target and feature columns.
    """

    def abort(message):
        # Single exit path so every fatal message is styled the same way.
        console.print(message, style="red")
        raise SystemExit(1)

    try:
        df = pd.read_csv(file)
    except FileNotFoundError:
        abort(f"Aborted: File '{file}' not found")
    except pd.errors.ParserError as e:
        abort(f"Aborted: Invalid CSV format in '{file}': {e}")
    except pd.errors.EmptyDataError:
        abort("Aborted: CSV file is empty")

    if df.empty:
        abort("Aborted: CSV file is empty")

    # 1-based positions of header cells that contain nothing but whitespace.
    empty_header_positions = [
        position
        for position, name in enumerate(df.columns, start=1)
        if isinstance(name, str) and not name.strip()
    ]
    if empty_header_positions:
        abort(
            f"Aborted: CSV file contains empty column(s) at position(s): {empty_header_positions}"
        )

    missing_per_column = df.isna().sum()
    affected = missing_per_column[missing_per_column > 0]
    if len(affected) > 0:
        details = ", ".join(
            f"{col} ({n})" for col, n in affected.items()
        )
        console.print(
            f"Warning: CSV contains missing values in: {details}. "
            "Rows are not dropped at load time — NAs in the target column are "
            "dropped at training, and NAs in feature columns are handled by "
            "the model (XGBoost natively; imputed for random_forest/ridge).",
            style="yellow",
        )

    return df
p2predict/intervals.py ADDED
@@ -0,0 +1,317 @@
1
+ """Conformal prediction intervals for P2Predict models.
2
+
3
+ What this module computes
4
+ -------------------------
5
+ For each prediction, a "likely range" [low, high] that is mathematically
6
+ guaranteed to contain the true value with probability >= 1 - alpha (the
7
+ target coverage rate), under the assumption that future inputs come from
8
+ the same distribution as the training data.
9
+
10
+ This guarantee is what split conformal prediction provides — the interval
11
+ isn't just a heuristic ±2σ, it has a finite-sample coverage proof. The
12
+ proof rests on exchangeability of (X_test, y_test) with (X_future,
13
+ y_future): a much weaker assumption than the parametric normality
14
+ assumptions that classical prediction intervals rely on.
15
+
16
+ Algorithm: split conformal with the test set as calibration set
17
+ ---------------------------------------------------------------
18
+ 1. Train the model on the training split (already done by the time we
19
+ get here).
20
+ 2. Compute absolute residuals on the held-out test split:
21
+ r_i = |y_test_i - model.predict(x_test_i)| (or in log space for log-target)
22
+ 3. For coverage 1 - alpha, the conformal threshold is the
23
+ k-th smallest residual, where k = ceil((n + 1) * (1 - alpha))
24
+ In numpy: ``np.quantile(residuals, q, method="higher")`` with q
25
+ chosen to match.
26
+ 4. At predict time:
27
+ low, high = pred - q_hat, pred + q_hat (additive intervals)
28
+ For log-target models the calibration is done in log space, so the
29
+ bounds transform via exp() to multiplicative intervals in price space:
30
+ low, high = pred * exp(-q_hat), pred * exp(+q_hat)
31
+
32
+ Why use the test set for calibration instead of a separate split
33
+ ----------------------------------------------------------------
34
+ The natural worry is double-dipping: "if we report R² on the test set
35
+ and then use the same test set residuals to calibrate intervals, are
36
+ the metrics still valid?" Yes. R² on the test set remains an unbiased
37
+ estimate of generalization on the underlying distribution; computing a
38
+ downstream statistic (the conformal quantile) from those same residuals
39
+ doesn't change that. The exchangeability assumption holds for (X_test,
40
+ y_test) ~ (X_future, y_future) regardless of what else we use the test
41
+ residuals for, as long as we don't select the model based on them.
42
+
43
+ The data-efficiency win is real: a separate calibration split would
44
+ shrink the training set by 16% in the standard 80/20 setup, slightly
45
+ degrading the model. Using test residuals avoids that.
46
+
47
+ Why we calibrate in log space when log-target is active
48
+ -------------------------------------------------------
49
+ Procurement prices vary by orders of magnitude. A $1 part and a $1,000
50
+ part should not get the same ± dollar interval — that would be useless
51
+ on the small end and reckless on the large. Calibrating absolute log-
52
+ residuals gives constant-width intervals in log space, which transform
53
+ to *multiplicative* intervals in price space:
54
+
55
+ [pred * exp(-q_hat), pred * exp(+q_hat)]
56
+
57
+ Same percentage-width regardless of prediction magnitude. Procurement-
58
+ natural.
59
+
60
+ For non-log-target models we use absolute residuals in the target's
61
+ native units, giving constant-width additive intervals. That's the
62
+ right behaviour when the target is something like profit margin
63
+ (which can be negative and isn't bounded multiplicatively).
64
+
65
+ Banded (Mondrian) calibration
66
+ -----------------------------
67
+ A single global q_hat gives every prediction the same width, which lets the
68
+ noisiest segment of the data set the width for everyone: on a catalog whose
69
+ sub-$5 parts are near-random, the $200 parts inherit that noise in their
70
+ likely range. When the calibration set is large enough we therefore
71
+ partition it into bands by *predicted* value (terciles of the calibration
72
+ predictions) and compute a separate conformal quantile per band — Mondrian
73
+ conformal prediction. The coverage guarantee then holds *within each band*,
74
+ not just on average, because the banding rule depends only on the model's
75
+ prediction (a function of X), never on the calibration labels.
76
+
77
+ Fallbacks keep the old behaviour bit-for-bit:
78
+ * calibration dicts saved by older versions (no "predictions" key),
79
+ * calibration sets smaller than MIN_CALIBRATION_FOR_BANDING,
80
+ both produce the single global quantile exactly as before.
81
+
82
+ User-facing language
83
+ --------------------
84
+ The CLI and README deliberately avoid "confidence interval" (technically
85
+ wrong for prediction intervals anyway), "alpha", "conformal", and
86
+ "coverage". We use "likely range" and natural-frequency framing
87
+ ("9 in 10 similar parts fall in this range"). Bands surface to users as
88
+ "calibrated on similar-priced parts". This module's docstrings keep the
89
+ technical names because the audience here is developers.
90
+ """
91
+
92
+ from __future__ import annotations
93
+
94
+ from dataclasses import dataclass
95
+ from typing import Optional
96
+
97
+ import numpy as np
98
+ from sklearn.compose import TransformedTargetRegressor
99
+
100
+
101
# Banding thresholds. Three bands of >= 50 calibration points each keeps the
# per-band conformal quantile stable; below 150 total we stay global. With a
# standard 80/20 split the calibration (test) set reaches 150 rows at ~750
# rows overall, i.e. roughly 600 training rows.
N_BANDS = 3
MIN_CALIBRATION_FOR_BANDING = 150
106
+
107
+
108
@dataclass
class IntervalResult:
    """A single point prediction together with its likely range.

    ``low``, ``prediction`` and ``high`` all share the same units, whether
    or not the model behind them was fitted with a log-target transform.

    ``band`` describes, in plain words, which calibration band supplied the
    interval width (for example ``"predicted 5.20 to 155.00"``). It stays
    ``None`` whenever the single global quantile was used instead, i.e. for
    calibration data saved without predictions or a calibration set too
    small to split into bands.
    """

    # Field order is part of the interface: instances may be built positionally.
    low: float
    prediction: float
    high: float
    coverage: float  # target coverage of the range, e.g. 0.90 for 90%
    band: Optional[str] = None
127
+
128
+
129
def compute_calibration_residuals(model, X_test, y_test) -> dict:
    """Build the calibration payload that is stored alongside a model.

    Raw absolute residuals are kept (rather than one precomputed quantile)
    so any coverage level can be chosen later without retraining.

    When ``model`` is a ``TransformedTargetRegressor`` the residuals are
    ``|log(y) - log(prediction)|`` and rows where either value is not
    strictly positive are left out; ``in_log_space`` is then ``True`` so
    the predict-time code knows to apply them multiplicatively. Otherwise
    the residuals are plain ``|y - prediction|`` over every row.

    The returned dict has the keys ``residuals``, ``predictions`` (the
    target-space predictions aligned 1:1 with ``residuals``),
    ``in_log_space`` and ``n_calibration``.
    """
    actual = np.asarray(y_test, dtype=float)
    # Predictions come back in the user's target units in both branches.
    predicted = np.asarray(model.predict(X_test), dtype=float)
    log_space = isinstance(model, TransformedTargetRegressor)

    if log_space:
        # Defend the log: keep only rows where both sides are positive.
        with np.errstate(invalid="ignore", divide="ignore"):
            keep = (predicted > 0) & (actual > 0)
            abs_errors = np.abs(np.log(actual[keep]) - np.log(predicted[keep]))
        kept_predictions = predicted[keep]
    else:
        abs_errors = np.abs(actual - predicted)
        kept_predictions = predicted

    payload = {}
    payload["residuals"] = abs_errors.tolist()
    # Stored so the calibration set can later be partitioned by predicted
    # value; calibration dicts without this key use the global quantile.
    payload["predictions"] = kept_predictions.tolist()
    payload["in_log_space"] = log_space
    payload["n_calibration"] = int(len(abs_errors))
    return payload
172
+
173
+
174
def _conformal_quantile(residuals: np.ndarray, alpha: float) -> float:
    """Return the split-conformal threshold for coverage ``1 - alpha``.

    The quantile level is ``ceil((n + 1) * (1 - alpha)) / n``, capped at 1
    so that very small calibration sets still map to a valid level (the
    largest residual). ``np.quantile`` is evaluated with
    ``method="higher"``, so the result is always one of the observed
    residuals and rounding goes towards the wider interval.

    Raises ``ValueError`` when ``residuals`` is empty.
    """
    sample_size = len(residuals)
    if sample_size == 0:
        raise ValueError("Cannot calibrate with zero residuals.")
    rank = np.ceil((sample_size + 1) * (1.0 - alpha))
    level = min(1.0, rank / sample_size)
    threshold = np.quantile(residuals, level, method="higher")
    return float(threshold)
188
+
189
+
190
def _build_bands(cal_preds: np.ndarray, residuals: np.ndarray, alpha: float):
    """Split the calibration set into ``N_BANDS`` bands by predicted value.

    Returns ``(edges, band_q_hats, band_labels)``: the interior band edges
    (equally spaced quantiles of ``cal_preds``), one conformal quantile per
    band, and one human-readable label per band.

    Returns ``None`` when banding is not justified: fewer than
    ``MIN_CALIBRATION_FOR_BANDING`` points, ``residuals`` not aligned with
    ``cal_preds``, coinciding edges, or a band with no residuals in it.

    Bands are assigned from the predictions only, never from the
    calibration labels.
    """
    total = len(cal_preds)
    if len(residuals) != total or total < MIN_CALIBRATION_FOR_BANDING:
        return None

    # Interior cut points only: drop the 0 and 1 quantile levels.
    inner_levels = np.linspace(0, 1, N_BANDS + 1)[1:-1]
    edges = np.quantile(cal_preds, inner_levels)
    if len(np.unique(edges)) != len(edges):
        # Coinciding edges would leave an empty or degenerate band.
        return None

    assignment = np.searchsorted(edges, cal_preds, side="right")
    limits = np.concatenate(([-np.inf], edges, [np.inf]))

    band_q_hats = []
    band_labels = []
    for band in range(N_BANDS):
        in_band = residuals[assignment == band]
        if len(in_band) == 0:
            return None
        band_q_hats.append(_conformal_quantile(in_band, alpha))

        lower, upper = limits[band], limits[band + 1]
        if np.isinf(lower):
            text = f"predicted under {upper:,.2f}"
        elif np.isinf(upper):
            text = f"predicted over {lower:,.2f}"
        else:
            text = f"predicted {lower:,.2f} to {upper:,.2f}"
        band_labels.append(text)

    return edges, band_q_hats, band_labels
227
+
228
+
229
def predict_interval(
    model,
    x,
    calibration: dict,
    coverage: float = 0.90,
) -> list[IntervalResult]:
    """Return one likely-range interval per row of ``x``.

    Parameters
    ----------
    model
        Fitted estimator; only its ``predict`` method is used here.
    x
        Inputs handed to ``model.predict``. Every prediction gets its own
        interval.
    calibration
        Dict in the shape produced by ``compute_calibration_residuals``.
        ``residuals`` is required; ``in_log_space`` and ``predictions``
        are optional.
    coverage
        Target coverage rate, strictly between 0 and 1 (0.90 gives a
        "9-in-10" interval).

    Returns
    -------
    A list of ``IntervalResult``, in the order of the predictions.

    Raises
    ------
    ValueError
        If ``coverage`` is outside (0, 1), or there are no residuals.
    """
    if not 0.0 < coverage < 1.0:
        raise ValueError(f"Coverage must be strictly between 0 and 1, got {coverage}.")

    alpha = 1.0 - coverage
    cal_residuals = np.asarray(calibration["residuals"], dtype=float)
    log_space = bool(calibration.get("in_log_space", False))
    # Computed up front: it is the fallback width, and it also rejects an
    # empty calibration set before the model is asked for predictions.
    global_width = _conformal_quantile(cal_residuals, alpha)

    # Per-band widths are only possible when the calibration predictions
    # were stored; otherwise every row shares the global width.
    stored_predictions = calibration.get("predictions")
    if stored_predictions is None:
        banding = None
    else:
        banding = _build_bands(
            np.asarray(stored_predictions, dtype=float), cal_residuals, alpha
        )

    point = np.asarray(model.predict(x), dtype=float)
    if banding is None:
        widths = np.full(point.shape, global_width)
        tags = [None] * len(point)
    else:
        edges, per_band_widths, names = banding
        band_index = np.searchsorted(edges, point, side="right")
        widths = np.asarray(per_band_widths, dtype=float)[band_index]
        tags = [names[i] for i in band_index]

    if log_space:
        # Log-space width becomes a multiplicative factor in target space.
        lower = point * np.exp(-widths)
        upper = point * np.exp(+widths)
    else:
        lower = point - widths
        upper = point + widths

    results = []
    for lo, centre, hi, tag in zip(lower, point, upper, tags):
        results.append(
            IntervalResult(
                low=float(lo), prediction=float(centre), high=float(hi),
                coverage=coverage, band=tag,
            )
        )
    return results
298
+
299
+
300
def coverage_health(calibration: Optional[dict]) -> Optional[str]:
    """Return a short caveat when the calibration set is missing or small.

    ``None`` means "intervals are fine". The interval width gets very
    sensitive to individual residuals when there are few of them, so a
    caveat is returned below 20 calibration points.

    The size is read from ``n_calibration``. When that key is absent (or
    ``None``) the number of stored ``residuals`` is used instead, because
    those residuals are what the interval itself is computed from. A dict
    that carries residuals but no count is therefore no longer reported
    as empty.

    Parameters
    ----------
    calibration
        The calibration dict stored with the model, or ``None`` when the
        model has none.

    Returns
    -------
    A human-readable caveat string, or ``None`` when no caveat applies.
    """
    if calibration is None:
        return "no calibration data stored with this model — re-train on v0.5+ for likely-range support"

    n = calibration.get("n_calibration")
    if n is None:
        # Fall back to the data the interval is actually built from.
        n = len(calibration.get("residuals") or ())
    n = int(n)

    if n == 0:
        return "calibration set is empty — likely range is undefined"
    if n < 20:
        return f"calibration set is small (n={n}) — likely range may be noisy"
    return None
@@ -0,0 +1,225 @@
1
+ """Stable JSON output schema for P2Predict's CLIs.
2
+
3
+ When either CLI is invoked with ``--json``, all human-facing output is
4
+ suppressed and a single JSON document is emitted to stdout instead.
5
+ Stderr stays clean (no spinner, no logo). Exit code is 0 on success,
6
+ 1 on error — including on errors, where a JSON-shaped error document
7
+ is still emitted to stdout so an agent that piped the output can parse
8
+ it instead of seeing a Rich-formatted abort message.
9
+
10
+ Schema versioning
11
+ -----------------
12
+ Every JSON document includes a top-level ``schema_version`` field
13
+ (currently "1.0"). When fields are added we can leave the version
14
+ alone; when fields are renamed or removed we bump the major number.
15
+ Tests in tests/test_json_output.py lock in the field names so this
16
+ doesn't drift accidentally.
17
+
18
+ Predict (``p2predict ... --json``)
19
+ ----------------------------------
20
+ ::
21
+
22
+ {
23
+ "schema_version": "1.0",
24
+ "command": "predict",
25
+ "model": {
26
+ "path": str,
27
+ "algorithm": str,
28
+ "target": str,
29
+ "version": str, # the p2predict_version saved in the model
30
+ "log_target": bool,
31
+ "features": [str, ...]
32
+ },
33
+ "mode": "inline" | "batch" | "interactive",
34
+ "predictions": [ # one entry per input row
35
+ {
36
+ "input": {feature: value, ...},
37
+ "prediction": float
38
+ },
39
+ ...
40
+ ],
41
+ "interval": { # present when --interval N was passed
42
+ "coverage": float, # e.g. 0.90
43
+ "per_row": [
44
+ {"low": float, "high": float, "prediction": float},
45
+ ...
46
+ ],
47
+ "soft_warning": str | null # non-null when calibration is small
48
+ },
49
+ "explanation": [ # present when --explain was passed
50
+ { # one entry per input row
51
+ "baseline": float,
52
+ "prediction": float,
53
+ "log_target": bool,
54
+ "contributions": [{"feature": str, "value": float}, ...],
55
+ "multiplicative_factors": # only for log-target models
56
+ [{"feature": str, "factor": float}, ...] | null,
57
+ "dollar_attribution": # only for log-target models, labelled
58
+ # approximate in the README
59
+ [{"feature": str, "value": float}, ...] | null,
60
+ "residual": float
61
+ },
62
+ ...
63
+ ],
64
+ "whatif": { # present when --whatif "Feature:NewVal,..." was passed
65
+ "changes": {feature: {"from": value, "to": value}, ...},
66
+ "base_prediction": float,
67
+ "counterfactual_prediction": float,
68
+ "delta": float,
69
+ "delta_pct": float,
70
+ "log_target": bool,
71
+ "multiplicative_factor": float | null,
72
+ "changed_contributions": [{"feature": str, "value": float}, ...],
73
+ "interaction_contribution": float,
74
+ "interaction_is_material": bool,
75
+ "base_interval": {"low": float, "high": float} | null,
76
+ "cf_interval": {"low": float, "high": float} | null
77
+ },
78
+ "batch": { # present in batch mode (-i)
79
+ "csv_path": str, # where predictions got written
80
+ "n_rows": int
81
+ }
82
+ }
83
+
84
+ Train (``p2predict-train ... --json``)
85
+ --------------------------------------
86
+ ::
87
+
88
+ {
89
+ "schema_version": "1.0",
90
+ "command": "train",
91
+ "input": {
92
+ "csv_path": str,
93
+ "rows_loaded": int,
94
+ "rows_after_outlier_handling": int,
95
+ "target": str
96
+ },
97
+ "mode": "auto" | "expert",
98
+ "time_column": str | null,
99
+ "outliers": {
100
+ "target": {
101
+ "policy": str, # keep / warn / drop / winsorize
102
+ "applied": str, # the action that actually changed data
103
+ "n_outliers": int,
104
+ "n_total": int,
105
+ "lower": float | null,
106
+ "upper": float | null
107
+ },
108
+ "features": {
109
+ "policy": str,
110
+ "applied": str,
111
+ "n_outliers_total": int,
112
+ "per_column": {col: {"n_outliers": int,
113
+ "lower": float,
114
+ "upper": float}, ...}
115
+ }
116
+ },
117
+ "low_info_features": {
118
+ "no_information": [str, ...],
119
+ "high_variation": [str, ...]
120
+ },
121
+ "features_selected": [str, ...],
122
+ "algorithm_selected": str,
123
+ "log_target": bool,
124
+ "log_target_decision": str, # "auto:skew=<value>" | "manual:on" | "manual:off"
125
+ "cv_scores": {algo: float, ...}, # auto-mode only
126
+ "feature_importances": [
127
+ {"feature": str, "importance": float},
128
+ ...
129
+ ],
130
+ "evaluation": {
131
+ "r2": float,
132
+ "mae": float,
133
+ "rmse": float,
134
+ "residual_bias_p_value": float,
135
+ "quality_label": "Excellent" | "Good" | "Needs Improvement"
136
+ },
137
+ "model_path": str | null, # null if not saved (interactive declined)
138
+ "report_path": str | null # null unless --report PATH was passed
139
+ }
140
+
141
+ Errors (any command, when --json is set)
142
+ ----------------------------------------
143
+ ::
144
+
145
+ {
146
+ "schema_version": "1.0",
147
+ "command": "predict" | "train",
148
+ "error": {
149
+ "code": str, # short identifier, e.g. "missing_input"
150
+ "message": str # human-readable description
151
+ }
152
+ }
153
+ """
154
+
155
+ from __future__ import annotations
156
+
157
+ import json
158
+ import sys
159
+ from typing import Any
160
+
161
+ JSON_SCHEMA_VERSION = "1.0"
162
+
163
+
164
def emit(payload: dict[str, Any]) -> None:
    """Write the JSON payload to stdout, followed by a newline.

    Single source of truth so the serialisation options (2-space indent,
    non-ASCII kept as-is, ``_json_default`` for non-native types) stay
    consistent across the two CLIs.

    Non-finite floats (NaN, +/-Infinity) are written as ``null``. Python's
    encoder would otherwise print the bare tokens ``NaN`` / ``Infinity``,
    which are not part of the JSON grammar and make strict parsers reject
    the whole document. Payloads without such values are serialised
    exactly as before.

    The document is built in memory first, so a serialisation error never
    leaves a half-written document on stdout.

    Raises whatever ``json.dumps`` raises for an unserialisable payload
    (``TypeError`` from ``_json_default``, ``ValueError`` for circular
    references).
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    try:
        text = json.dumps(
            payload, default=_json_default, allow_nan=False, **dump_options
        )
    except ValueError:
        # Either a non-finite float or a circular reference. Re-encode
        # leniently (a circular reference raises again here) and map the
        # non-standard constants to None on the way back in.
        lenient = json.dumps(payload, default=_json_default)
        cleaned = json.loads(lenient, parse_constant=lambda _token: None)
        text = json.dumps(cleaned, allow_nan=False, **dump_options)
    sys.stdout.write(text)
    sys.stdout.write("\n")
    sys.stdout.flush()
176
+
177
+
178
def emit_error(command: str, code: str, message: str, exit_code: int = 1) -> None:
    """Print a JSON error document via ``emit`` and terminate the process.

    Stands in for the Rich-formatted "Aborted: ..." message when ``--json``
    is active, so a caller piping stdout still receives a parseable
    document on failure.

    The document carries ``schema_version``, the ``command`` name and an
    ``error`` object holding ``code`` and ``message``. This function never
    returns: it always raises ``SystemExit(exit_code)``.
    """
    error_document = {
        "schema_version": JSON_SCHEMA_VERSION,
        "command": command,
        "error": {"code": code, "message": message},
    }
    emit(error_document)
    raise SystemExit(exit_code)
191
+
192
+
193
def _json_default(obj: Any) -> Any:
    """Coerce non-JSON-native values into plain Python for ``json.dump``.

    Handled, in this order:

    * ``np.datetime64`` -> ISO 8601 string (``None`` for NaT). Checked
      before the generic numpy branch because ``.item()`` on a datetime64
      yields a ``date``/``datetime`` or, at nanosecond resolution, a bare
      integer, neither of which is a faithful JSON value.
    * other numpy scalars -> ``.item()``; numpy arrays -> ``.tolist()``.
    * ``pd.NaT`` -> ``None``; ``pd.Timestamp`` -> ``.isoformat()``.
    * stdlib ``datetime`` / ``date`` -> ``.isoformat()``.
    * dataclass instances -> ``dataclasses.asdict``.

    Anything else raises ``TypeError``, which is what we want: better an
    explicit failure than a silent string-cast that lies about the field's
    type.
    """
    import datetime as _dt
    from dataclasses import asdict, is_dataclass

    # numpy scalars, arrays and datetimes
    try:
        import numpy as np
    except ImportError:  # pragma: no cover — numpy is a hard dep
        np = None
    if np is not None:
        if isinstance(obj, np.datetime64):
            if np.isnat(obj):
                return None
            return str(np.datetime_as_string(obj))
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()

    # pandas timestamps
    try:
        import pandas as pd
    except ImportError:  # pragma: no cover — pandas is a hard dep
        pd = None
    if pd is not None:
        if obj is pd.NaT:
            return None
        if isinstance(obj, pd.Timestamp):
            return obj.isoformat()

    # stdlib dates, including what numpy arrays of datetimes turn into
    if isinstance(obj, (_dt.datetime, _dt.date)):
        return obj.isoformat()

    # dataclass instances only: asdict() rejects dataclass *types*
    if is_dataclass(obj) and not isinstance(obj, type):
        return asdict(obj)

    raise TypeError(
        f"Cannot serialise object of type {type(obj).__name__} to JSON. "
        "Coerce it in the CLI before emitting."
    )
@@ -0,0 +1 @@
1
+ """P2Predict MCP server — typed tools for AI agents."""
@@ -0,0 +1,3 @@
1
# Script body: imports the MCP server's ``main`` and calls it immediately.
# There is no ``__name__`` guard, so merely importing this module starts the
# server. NOTE(review): presumably this is the package's ``__main__`` module
# (run via ``python -m p2predict.mcp``) — confirm against the wheel layout.
from p2predict.mcp.server import main

main()