p2predict 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1258 @@
1
+ """P2Predict MCP server — typed tools for AI agents.
2
+
3
+ Start with: p2predict-mcp --models-dir /path/to/models
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import asyncio
9
+ import datetime
10
+ import json
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from mcp.server.fastmcp import FastMCP
15
+
16
+ from p2predict.mcp.registry import ModelRegistry
17
+
18
# The FastMCP server object. The @mcp.tool() functions defined further down
# are registered on it, and `instructions` is the standing guidance text handed
# to FastMCP for the agent that connects.
#
# NOTE: the "INTERPRETING OUTPUT" rules below are a condensed, self-contained
# mirror of "The interpretation rules" in .claude/skills/p2predict/SKILL.md.
# The SKILL.md version is the canonical, fuller teaching (it also covers the
# CLI/Python surface and feature engineering); this copy exists because
# non-Claude clients never see the skill and the server must stand alone.
# If you change a rule here, update SKILL.md too (and vice versa).
#
# NOTE(review): "+/x" in rule 2 presumably means "added (plain target) or
# multiplied (log-target)" — confirm against SKILL.md before rewording.
mcp = FastMCP(
    "P2Predict",
    instructions=(
        "P2Predict is a parametric price/cost benchmarking toolkit for "
        "procurement. The person you are helping is almost always a "
        "category manager or buyer, NOT a data scientist — they do not know "
        "what a 'feature', 'target', 'leakage', or 'log-target' is. Your job "
        "is to do that thinking for them and explain results in plain "
        "procurement language ('this supplier adds $0.72', not 'feature "
        "importance 0.4').\n"
        "\n"
        "SPEAK BUSINESS, NOT STATISTICS. Never say these words to the user — "
        "translate every one:\n"
        " • 'SHAP' / 'attribution' / 'contribution' -> 'what's driving the "
        "price' (the JSON gives you `price_drivers` and `starting_point` "
        "already in dollars and percent — quote those).\n"
        " • 'log-target' -> 'I'm modelling on a percentage scale so the likely-"
        "range never goes negative on cheap parts'.\n"
        " • 'R²' / 'p-value' / 'residual bias' / 'feature importance' -> use the "
        "computed verdicts instead: every quality block carries a `headline` and "
        "per-band / per-feature `say_to_user` sentence written in plain words. "
        "Quote those; do not read raw metrics aloud.\n"
        " • 'feature' / 'target' -> 'spec' / 'the price'.\n"
        "The raw statistical keys stay in the JSON for your reasoning — just "
        "don't surface their names to a category manager.\n"
        "\n"
        "DISCOVER & ANALYSE: use list_models to find trained models, then "
        "predict, explain, predict_interval, or what_if on parts. Use "
        "get_model_quality to judge whether a model is trustworthy before "
        "quoting its numbers. Lead with its computed `verdict`: 'trustworthy' "
        "(benchmark against it), 'usable' (unbiased but modest — relative "
        "comparisons and benchmarks only, not a single-part appraisal), "
        "'unreliable' (biased residuals — relative comparisons only, never an "
        "absolute target), or 'insufficient_data'/'unknown' (too little data "
        "to judge — treat metrics as indicative). It also returns per-price-"
        "band reliability and per-feature signal strength as computed flags.\n"
        "\n"
        "BUILD A MODEL: do NOT call `train` directly on a user's CSV first. "
        "Call `propose_training_plan` first, relay its plain_summary and "
        "questions_for_the_user to the user, and only call `train` after they "
        "confirm. If you do call `train` directly, it applies safe defaults "
        "(it screens out target-leakage columns and recommends a log-target "
        "for price/cost targets) and reports them in `warnings` — surface "
        "those warnings to the user.\n"
        "\n"
        "INTERPRETING OUTPUT (this is where the value is — apply every time):\n"
        "1. log-target: prices/costs are multiplicative; a log-target keeps "
        "intervals positive and makes SHAP read as percentages. Recommend it "
        "for any price/cost target — don't trust 'auto' to catch it.\n"
        "2. SHAP has an axiom check (baseline +/x contributions = prediction); "
        "if it fails the explanation is unsound. Sign-check contributions: a "
        "counterintuitive sign on a LOW-importance feature means that feature "
        "is under-sampled, not that the world is upside-down — lean on the "
        "high-importance, correctly-signed drivers.\n"
        "3. The interval width is the per-part trust signal. A wide band — or "
        "a lower bound at/below $0 on an additive model — means 'I'm unsure "
        "here; get a quote, don't benchmark.' Always show the interval, not "
        "just the point estimate, when the user will act on the number.\n"
        "4. Judge a model by residual-bias (unbiasedness), not R2 alone: a "
        "modest-R2 but unbiased model is more trustworthy for procurement than "
        "a higher-R2 biased one.\n"
        "5. Before quoting a finding to a stakeholder, check its feature's "
        "importance — a finding resting on a 1-2% feature is a hypothesis, not "
        "a number to negotiate against.\n"
        "\n"
        "Never present a single high-value part's point estimate as a final "
        "appraisal — use the model to set the target and find the lever, then "
        "get a real quote for the decision."
    ),
)
94
+
95
def _compute_server_build() -> dict:
    """Describe the build this process imported: version, git SHA, source dir.

    The result is meant to be computed once, when the module is imported, and
    then reused. A long-lived server that was started before a code change
    therefore keeps reporting the build it really loaded rather than whatever
    HEAD is on disk now, which makes a stale process easy to spot.

    Returns:
        A dict with three keys, in this order:
        ``version``  - installed ``p2predict`` distribution version, or
                       ``"unknown"`` when the lookup fails for any reason;
        ``git_sha``  - output of ``git rev-parse --short HEAD`` run against the
                       package directory, or ``None`` when git printed nothing
                       or could not be run (e.g. a non-git install);
        ``source``   - parent of the directory containing this file.
    """
    import importlib.metadata
    import subprocess
    from pathlib import Path as _Path

    # Start from the fallback values and overwrite whatever can be resolved.
    build = {"version": "unknown", "git_sha": None, "source": ""}

    try:
        build["version"] = importlib.metadata.version("p2predict")
    except Exception:
        # Best effort: keep the "unknown" placeholder.
        pass

    here = _Path(__file__).resolve().parent  # .../p2predict/mcp
    git_cmd = ["git", "-C", str(here), "rev-parse", "--short", "HEAD"]
    try:
        completed = subprocess.run(
            git_cmd, capture_output=True, text=True, timeout=2
        )
    except Exception:
        # git missing, timed out, etc. - leave git_sha as None.
        completed = None

    if completed is not None:
        short_sha = completed.stdout.strip()
        if short_sha:
            build["git_sha"] = short_sha

    build["source"] = str(here.parent)  # where the running code is loaded from
    return build
130
+
131
+
132
# Captured at import = at process spawn. Do not recompute per call.
_SERVER_BUILD = _compute_server_build()

# Process-wide ModelRegistry read by _get_registry(); None until something
# assigns it (the assignment is outside this part of the file — presumably
# done at server start-up; confirm).
_registry: ModelRegistry | None = None
136
+
137
+
138
def _get_registry() -> ModelRegistry:
    """Return the module-level registry.

    Raises:
        RuntimeError: if the module-level ``_registry`` is still ``None``.
    """
    if _registry is not None:
        return _registry
    raise RuntimeError("ModelRegistry not initialized — server not started correctly.")
142
+
143
+
144
def _error(code: str, message: str) -> str:
    """Build the JSON text every tool returns on failure.

    Args:
        code: short machine-readable error identifier.
        message: human-readable explanation.

    Returns:
        JSON of the form ``{"error": {"code": ..., "message": ...}}``.
    """
    envelope = {"error": {"code": code, "message": message}}
    return json.dumps(envelope)
146
+
147
+
148
def _ok(data: dict) -> str:
    """Serialise a successful tool payload to JSON text.

    Values the standard encoder cannot handle are passed to ``_json_default``
    for conversion.
    """
    encoded = json.dumps(data, default=_json_default)
    return encoded
150
+
151
+
152
def _json_default(obj: Any) -> Any:
    """``json.dumps`` fallback for NumPy / pandas values.

    Called by the encoder only for objects it cannot serialise itself.

    Args:
        obj: the value the standard encoder rejected.

    Returns:
        A plain Python equivalent: ``bool`` for NumPy booleans, ``int`` /
        ``float`` for NumPy integer / floating scalars, a nested list for an
        ``ndarray``, and ``str(obj)`` for a pandas ``Timestamp``.

    Raises:
        TypeError: for any other type, mirroring the standard encoder's error.
    """
    import numpy as np
    import pandas as pd

    # numpy.bool_ is neither a Python bool nor a numpy.integer, so it needs its
    # own branch; without it a boolean value fails to serialise.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, pd.Timestamp):
        return str(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Tools
169
+ # ---------------------------------------------------------------------------
170
+
171
+
172
@mcp.tool()
async def list_models(include_internal: bool = False) -> str:
    """List all trained P2Predict models in the configured models directory.

    Call this first to discover which models are available. Each model carries
    a plain `say_to_user` line, its target, and its specs. Lead with
    `say_to_user`; do NOT read out raw fields like algorithm or R² (and never
    the words 'SHAP', 'log-target', 'R²') to a category manager. Pass
    include_internal=true only when you need the algorithm name / R² / log-target
    flag for your own reasoning.

    The response also carries a `server` block (version + git short-SHA + source
    path) identifying the build this server process loaded — useful to confirm a
    code change actually took effect (MCP servers are long-lived; a stale process
    serves old code until restarted).
    """
    # Maintainer note: the docstring above is written for the calling agent
    # (FastMCP exposes it as the tool description), so edit it with care.
    reg = _get_registry()

    # The registry scan runs in a worker thread, off the event loop.
    scanned = await asyncio.to_thread(reg.scan)

    payload = {
        "server": _SERVER_BUILD,
        "models_dir": str(reg.models_dir),
        "models": [
            entry.to_dict(include_internal=include_internal) for entry in scanned
        ],
    }
    return _ok(payload)
195
+
196
+
197
@mcp.tool()
async def get_model_info(model_id: str, include_internal: bool = False) -> str:
    """Get detailed information about a specific model.

    Returns a plain `say_to_user` line, the target, and the model's specs with
    their types (Numerical/Categorical) and allowed categories — everything you
    need to build a predict/explain call. Use it to understand what inputs a
    model expects. Lead with `say_to_user`; do NOT surface algorithm / R² /
    log-target (or the words 'SHAP', 'log-target', 'R²') to the user. Pass
    include_internal=true for those raw fields when you need them to reason.
    """
    # Maintainer note: the docstring above is agent-facing tool text.
    reg = _get_registry()
    try:
        details = await asyncio.to_thread(reg.get_info, model_id)
    except FileNotFoundError as exc:
        # An unknown model id is reported as a structured error, not raised.
        return _error("model_not_found", str(exc))
    else:
        return _ok(details.to_dict(include_internal=include_internal))
214
+
215
+
216
@mcp.tool()
async def predict(model_id: str, features: dict) -> str:
    """Predict the target value (e.g. price) for a single part.

    Pass the model_id from list_models and a dictionary of feature values
    matching the model's expected features. Example:
        {"Weight": 15, "Region": "EU", "Supplier": "A"}
    """
    # Maintainer note: the docstring above is agent-facing tool text.
    reg = _get_registry()
    try:
        bundle = await asyncio.to_thread(reg.load, model_id)
    except FileNotFoundError as exc:
        return _error("model_not_found", str(exc))

    # Imported lazily, only once a model has actually been loaded.
    from p2predict.mcp.conversions import features_to_dataframe
    from p2predict.model_utils import extract_feature_info, inner_pipeline

    model = bundle["model"]
    feature_types, _unused = extract_feature_info(inner_pipeline(model))

    # Turn the caller's dict into a one-row frame; a ValueError from the
    # conversion is reported to the caller as "missing_feature".
    try:
        frame = features_to_dataframe(features, bundle["features"], feature_types)
    except ValueError as exc:
        return _error("missing_feature", str(exc))

    predicted = await asyncio.to_thread(model.predict, frame)
    payload = {
        "model_id": model_id,
        "target": bundle.get("target_feature"),
        "prediction": float(predicted[0]),
        "input": features,
    }
    return _ok(payload)
247
+
248
+
249
@mcp.tool()
async def predict_batch(
    model_id: str,
    rows: list[dict],
    with_explanation: bool = False,
    coverage: int | None = None,
) -> str:
    """Predict the target value for multiple parts at once.

    More efficient than calling predict repeatedly. Pass a list of
    feature dictionaries, one per part. Returns one prediction per row.

    Optionally enriches every row with the same views the single-part tools
    give, so you don't have to fan out to explain / predict_interval:
    - coverage (1-99): adds a likely-price range per row (conformal interval)
      when the model carries calibration data; read `interval.reliability`
      and `interval.say_to_user` per row exactly as predict_interval does.
      Left None (default) for plain point predictions.
    - with_explanation: adds the per-row price drivers (same `explanation`
      shape as explain). Surface the high-importance, correctly-signed
      drivers in dollars/percent; never say 'SHAP' to a category manager.
    """
    # Maintainer notes (the docstring above is agent-facing tool text):
    #   * every failure is returned as an _error(...) JSON envelope with one of
    #     the codes model_not_found, bad_coverage, missing_feature,
    #     no_calibration, interval_error, explain_error;
    #   * all request validation happens before the model is asked to predict,
    #     so a request that cannot succeed does not pay for the predictions.
    registry = _get_registry()
    try:
        loaded = await asyncio.to_thread(registry.load, model_id)
    except FileNotFoundError as e:
        return _error("model_not_found", str(e))

    if coverage is not None and not (1 <= coverage <= 99):
        return _error("bad_coverage", "coverage must be between 1 and 99")

    from p2predict.mcp.conversions import rows_to_dataframe
    from p2predict.model_utils import (
        explanation_to_dict,
        extract_feature_info,
        inner_pipeline,
        interval_to_dicts,
    )

    model = loaded["model"]
    feature_types, _ = extract_feature_info(inner_pipeline(model))
    try:
        df = rows_to_dataframe(rows, loaded["features"], feature_types)
    except ValueError as e:
        return _error("missing_feature", str(e))

    # Intervals need stored calibration residuals; check before predicting.
    calibration = None
    if coverage is not None:
        calibration = loaded.get("calibration")
        if not calibration or not calibration.get("residuals"):
            return _error(
                "no_calibration",
                "This model has no calibration data, so no likely-range can be "
                "produced. Retrain with P2Predict v0.5+, or call predict_batch "
                "without coverage for point predictions.",
            )

    preds = await asyncio.to_thread(model.predict, df)
    rows_out = [
        {"input": row, "prediction": float(p)}
        for row, p in zip(rows, preds)
    ]

    result: dict[str, Any] = {
        "model_id": model_id,
        "target": loaded.get("target_feature"),
        "predictions": rows_out,
    }

    if coverage is not None:
        from p2predict import predict_interval as pi_fn

        # Same guard as the single-part predict_interval tool: a ValueError
        # becomes a structured error instead of escaping the tool.
        try:
            intervals = await asyncio.to_thread(
                pi_fn, model, df, calibration, coverage=coverage / 100.0
            )
        except ValueError as e:
            return _error("interval_error", str(e))
        for row_out, iv in zip(rows_out, interval_to_dicts(intervals)):
            row_out["interval"] = iv
        result["coverage_pct"] = coverage

    if with_explanation:
        from p2predict import explain_batch

        background = loaded.get("background_sample")
        try:
            explanations = await asyncio.to_thread(
                explain_batch, model, df, background_X=background
            )
        except ValueError as e:
            return _error("explain_error", str(e))
        for row_out, expl in zip(rows_out, explanations):
            row_out["explanation"] = explanation_to_dict(expl)

    return _ok(result)
339
+
340
+
341
@mcp.tool()
async def explain(model_id: str, features: dict, top_n: int = 3) -> str:
    """Explain what is driving a part's predicted price, spec by spec.

    Returns a business-ready view to quote directly — `starting_point` (the
    baseline price every part starts from) and `price_drivers` (each spec /
    supplier's effect in BOTH dollars and percent, biggest mover first) — plus
    the underlying technical attribution for your own reasoning. top_n controls
    how many top drivers are highlighted (default 3).

    Reading it for the user: the explanation carries an axiom check
    (baseline +/x contributions = prediction); if it fails the explanation is
    unsound. SIGN-CHECK the drivers against intuition — a counterintuitive
    sign (e.g. "more cells -> cheaper") on a LOW-importance driver means that
    spec is under-sampled, not that the world is upside-down. Quote the
    high-importance, correctly-signed drivers; flag the rest as noise. State
    effects in the user's terms ("this supplier adds $0.72" / "+18%") — never
    say "SHAP", "contribution", or "baseline" to a category manager.
    """
    # Maintainer note: the docstring above is agent-facing tool text.
    reg = _get_registry()
    try:
        bundle = await asyncio.to_thread(reg.load, model_id)
    except FileNotFoundError as exc:
        return _error("model_not_found", str(exc))

    from p2predict import explain as explain_fn, top_drivers
    from p2predict.mcp.conversions import features_to_dataframe
    from p2predict.model_utils import (
        explanation_to_dict,
        extract_feature_info,
        inner_pipeline,
    )

    model = bundle["model"]
    feature_types, _unused = extract_feature_info(inner_pipeline(model))
    try:
        frame = features_to_dataframe(features, bundle["features"], feature_types)
    except ValueError as exc:
        return _error("missing_feature", str(exc))

    try:
        attribution = await asyncio.to_thread(
            explain_fn,
            model,
            frame,
            background_X=bundle.get("background_sample"),
        )
    except ValueError as exc:
        return _error("explain_error", str(exc))

    ranked = top_drivers(attribution, n=top_n)

    # For log-target models attribution.prediction is the inner model's output
    # in LOG space (it satisfies baseline + sum(contributions) = prediction
    # there). The user-facing prediction must be in price space and match
    # predict() / predict_batch(), so surface predicted_price for those models.
    if attribution.log_target and attribution.predicted_price is not None:
        shown_prediction = attribution.predicted_price
    else:
        shown_prediction = attribution.prediction

    payload = {
        "model_id": model_id,
        "target": bundle.get("target_feature"),
        "prediction": float(shown_prediction),
        "explanation": explanation_to_dict(attribution),
        "top_drivers": [
            {"feature": name, "value": float(amount)} for name, amount in ranked
        ],
    }
    return _ok(payload)
406
+
407
+
408
@mcp.tool()
async def predict_interval(
    model_id: str, features: dict, coverage: int = 90
) -> str:
    """Predict with a likely range (conformal prediction interval).

    For a 90% interval, about 9 in 10 similar parts fall within the range.
    coverage is an integer 1-99 (default 90). Requires a model trained with
    P2Predict v0.5+ (which stores calibration data).

    Reading it for the user: the band WIDTH is the per-part trust signal, and
    the payload computes it for you — `interval.reliability`
    ('trust' | 'caution' | 'quote') and a plain `interval.say_to_user` sentence
    you can quote directly. A tight band = predict with confidence; a very wide
    band — or a lower bound at/below $0 on an additive (non-log) model — means
    "get a quote, don't benchmark." Always surface the range, not just the point
    estimate, when the user will act on the number.
    """
    # Maintainer note: the docstring above is agent-facing tool text. Checks run
    # in this order: model exists -> has calibration -> features convert ->
    # coverage in range.
    reg = _get_registry()
    try:
        bundle = await asyncio.to_thread(reg.load, model_id)
    except FileNotFoundError as exc:
        return _error("model_not_found", str(exc))

    calibration = bundle.get("calibration")
    has_residuals = bool(calibration) and bool(calibration.get("residuals"))
    if not has_residuals:
        return _error(
            "no_calibration",
            "This model has no calibration data. Retrain with P2Predict v0.5+ "
            "to enable prediction intervals.",
        )

    from p2predict import predict_interval as pi_fn
    from p2predict.mcp.conversions import features_to_dataframe
    from p2predict.model_utils import (
        extract_feature_info,
        inner_pipeline,
        interval_to_dicts,
    )

    model = bundle["model"]
    feature_types, _unused = extract_feature_info(inner_pipeline(model))
    try:
        frame = features_to_dataframe(features, bundle["features"], feature_types)
    except ValueError as exc:
        return _error("missing_feature", str(exc))

    if not (1 <= coverage <= 99):
        return _error("bad_coverage", "coverage must be between 1 and 99")

    # The library takes coverage as a fraction; the tool takes a percentage.
    fraction = coverage / 100.0
    try:
        intervals = await asyncio.to_thread(
            pi_fn, model, frame, calibration, coverage=fraction
        )
    except ValueError as exc:
        return _error("interval_error", str(exc))

    first = intervals[0]  # single-part call: only the first interval is used
    payload = {
        "model_id": model_id,
        "target": bundle.get("target_feature"),
        "prediction": float(first.prediction),
        "interval": interval_to_dicts(intervals)[0],
        "coverage_pct": coverage,
    }
    return _ok(payload)
473
+
474
+
475
@mcp.tool()
async def what_if(
    model_id: str,
    features: dict,
    changes: dict,
    coverage: int | None = 90,
) -> str:
    """Compare a base scenario with a counterfactual where features change.

    Returns a plain `summary` to quote directly (does the change add or save,
    how many dollars, what percent, old vs. new price), plus both predictions,
    the delta, and per-driver attribution of each change for your reasoning.
    Answers "what if we switch from supplier A to B?" Set coverage to null to
    skip intervals; otherwise coverage is an integer 1-99 (default 90).
    """
    # Maintainer notes (the docstring above is agent-facing tool text):
    #   * failures come back as _error(...) envelopes with one of the codes
    #     model_not_found, missing_feature, bad_whatif, bad_coverage,
    #     whatif_error;
    #   * a falsy coverage (None, and 0 for backward compatibility) means
    #     "no intervals"; any other value must lie in 1-99, matching the
    #     range check the other interval tools apply.
    registry = _get_registry()
    try:
        loaded = await asyncio.to_thread(registry.load, model_id)
    except FileNotFoundError as e:
        return _error("model_not_found", str(e))

    from p2predict import what_if as whatif_fn
    from p2predict.mcp.conversions import features_to_dataframe
    from p2predict.model_utils import (
        extract_feature_info,
        inner_pipeline,
        whatif_to_dict,
    )

    pipeline = inner_pipeline(loaded["model"])
    feature_types, _ = extract_feature_info(pipeline)
    try:
        df = features_to_dataframe(features, loaded["features"], feature_types)
    except ValueError as e:
        return _error("missing_feature", str(e))

    # Only features the model was trained on can be changed.
    for key in changes:
        if key not in feature_types:
            return _error(
                "bad_whatif",
                f"Cannot change '{key}': not a training feature. "
                f"Valid features: {list(feature_types.keys())}",
            )

    wants_interval = bool(coverage)
    if wants_interval and not (1 <= coverage <= 99):
        # Previously an out-of-range value was divided by 100 and passed on
        # as-is (e.g. 150 -> 1.5).
        return _error("bad_coverage", "coverage must be between 1 and 99")

    # Without intervals no calibration is passed; cov keeps the original
    # 0.90 placeholder in that case.
    calibration = loaded.get("calibration") if wants_interval else None
    background = loaded.get("background_sample")
    cov = (coverage / 100.0) if wants_interval else 0.90

    try:
        result = await asyncio.to_thread(
            whatif_fn,
            loaded["model"],
            df,
            changes,
            feature_types,
            background_X=background,
            calibration=calibration,
            coverage=cov,
        )
    except ValueError as e:
        return _error("whatif_error", str(e))

    return _ok({
        "model_id": model_id,
        "target": loaded.get("target_feature"),
        "whatif": whatif_to_dict(result),
    })
542
+
543
+
544
@mcp.tool()
async def predict_from_csv(
    model_id: str,
    csv_path: str,
    with_explanation: bool = False,
    coverage: int | None = None,
) -> str:
    """Batch-predict from a CSV file on the local filesystem.

    The file-based sibling of predict_batch — use it when the user drops a
    spreadsheet of parts. Reads csv_path, predicts every row, and returns one
    prediction per row (point estimates by default).

    The same opt-in enrichments as predict_batch apply per row:
    - coverage (1-99): adds a likely-price range per row (conformal interval)
      with its `interval.reliability` / `interval.say_to_user` read. Requires
      a model with calibration data; an explicit coverage on an uncalibrated
      model returns a no_calibration error. Left None (default) for plain
      point predictions.
    - with_explanation: adds the per-row price drivers (same `explanation`
      shape as explain). State drivers in dollars/percent; never say 'SHAP'
      to a category manager.
    """
    # Maintainer notes (the docstring above is agent-facing tool text):
    #   * failures come back as _error(...) envelopes with one of the codes
    #     model_not_found, bad_coverage, file_not_found, csv_read_error,
    #     missing_feature, no_calibration, interval_error, explain_error;
    #   * the coverage range is checked right after the model loads (as in
    #     predict_batch) and calibration is checked before predicting, so an
    #     impossible request does not pay for a whole-file prediction;
    #   * the CSV is read in a worker thread so a large file does not block
    #     the event loop.
    registry = _get_registry()
    try:
        loaded = await asyncio.to_thread(registry.load, model_id)
    except FileNotFoundError as e:
        return _error("model_not_found", str(e))

    if coverage is not None and not (1 <= coverage <= 99):
        return _error("bad_coverage", "coverage must be between 1 and 99")

    import pandas as pd

    from p2predict.model_utils import (
        coerce_features,
        explanation_to_dict,
        extract_feature_info,
        inner_pipeline,
        interval_to_dicts,
    )

    path = Path(csv_path)
    if not path.exists():
        return _error("file_not_found", f"CSV not found: {csv_path}")

    try:
        df = await asyncio.to_thread(pd.read_csv, csv_path)
    except Exception as e:
        # Boundary catch: any parser / IO failure is reported, not raised.
        return _error("csv_read_error", str(e))

    model_features = loaded["features"]
    missing = [f for f in model_features if f not in df.columns]
    if missing:
        return _error(
            "missing_feature",
            f"CSV is missing columns: {missing}. Expected: {model_features}",
        )

    calibration = None
    if coverage is not None:
        calibration = loaded.get("calibration")
        if not calibration or not calibration.get("residuals"):
            return _error(
                "no_calibration",
                "This model has no calibration data, so no likely-range can be "
                "produced. Retrain with P2Predict v0.5+, or call "
                "predict_from_csv without coverage for point predictions.",
            )

    model = loaded["model"]
    feature_types, _ = extract_feature_info(inner_pipeline(model))
    X = coerce_features(df[model_features].copy(), feature_types)

    preds = await asyncio.to_thread(model.predict, X)

    # "input" echoes the raw CSV values (before coercion) for each row.
    rows_out: list[dict] = [
        {
            "input": {f: df[f].iloc[i] for f in model_features},
            "prediction": float(p),
        }
        for i, p in enumerate(preds)
    ]

    result: dict[str, Any] = {
        "model_id": model_id,
        "target": loaded.get("target_feature"),
        "csv_path": csv_path,
        "n_rows": len(X),
        "predictions": rows_out,
    }

    if coverage is not None:
        from p2predict import predict_interval as pi_fn

        # Same guard as the single-part predict_interval tool.
        try:
            intervals = await asyncio.to_thread(
                pi_fn, model, X, calibration, coverage=coverage / 100.0
            )
        except ValueError as e:
            return _error("interval_error", str(e))
        for row_out, iv in zip(rows_out, interval_to_dicts(intervals)):
            row_out["interval"] = iv
        result["coverage_pct"] = coverage

    if with_explanation:
        from p2predict import explain_batch

        background = loaded.get("background_sample")
        try:
            explanations = await asyncio.to_thread(
                explain_batch, model, X, background_X=background
            )
        except ValueError as e:
            return _error("explain_error", str(e))
        for row_out, expl in zip(rows_out, explanations):
            row_out["explanation"] = explanation_to_dict(expl)

    return _ok(result)
656
+
657
+
658
@mcp.tool()
async def propose_training_plan(
    csv_path: str,
    target: str,
    max_features: int = 6,
) -> str:
    """Inspect a training CSV and return a plain-language plan BEFORE training.

    Call this first whenever a user wants to build a should-cost / pricing
    model. It reads the CSV, decides what it would predict, which columns it
    would use as specs, which it would leave out (and why — target leakage,
    ID-like columns), and whether the target should use a log-target. It
    trains nothing and writes nothing.

    Relay `plain_summary` and `questions_for_the_user` to the user in their
    own language, get confirmation, then call `train` (passing the agreed
    `features` and `log_target`).
    """
    # Maintainer notes (the docstring above is agent-facing tool text):
    #   * the analysis runs in a worker thread via asyncio.to_thread;
    #   * failures come back as _error(...) envelopes with one of the codes
    #     file_not_found, plan_error (ValueError), internal_error (anything
    #     else).
    _get_registry()  # called only to validate that the server is initialised

    def _do_plan() -> dict:
        """Build the plan dict; raises FileNotFoundError / ValueError on bad input."""
        import pandas as pd

        from p2predict.feature_selection import (
            find_high_variation_features,
            find_leaky_features,
            find_no_variation_features,
            get_most_predictable_features,
        )
        from p2predict.trained_model_io import load_csv_file
        from p2predict.training import resolve_log_target

        path = Path(csv_path)
        if not path.exists():
            raise FileNotFoundError(f"CSV not found: {csv_path}")

        data = load_csv_file(csv_path)
        rows_loaded = len(data)  # counted before rows with no target are dropped
        if target not in data.columns:
            raise ValueError(
                f"Target '{target}' not in CSV columns: {list(data.columns)}"
            )

        data = data[data[target].notna()]
        if data.empty:
            raise ValueError(f"All rows have missing values in target '{target}'.")

        y = pd.to_numeric(data[target], errors="coerce").dropna()

        # Columns to leave out, with reasons the user can understand.
        leaky = find_leaky_features(data, target)
        leaky_names = {d["feature"] for d in leaky}

        no_var = [c for c in find_no_variation_features(data) if c != target]
        high_var = find_high_variation_features(data)
        id_like = [
            c for c in high_var
            if c != target
            and c not in leaky_names
            and not pd.api.types.is_numeric_dtype(data[c])
        ]

        # Each column is listed once, under the first reason that applies, so
        # the count quoted in plain_summary matches the number of columns.
        excluded: list[dict] = []
        already_listed: set = set()

        def _leave_out(column, reason) -> None:
            if column in already_listed:
                return
            already_listed.add(column)
            excluded.append({"column": column, "reason": reason})

        for d in leaky:
            _leave_out(d["feature"], d["reason"])
        for c in id_like:
            _leave_out(
                c,
                "Looks like an ID / free-text column (almost every "
                "row is unique), not a spec the model can learn from.",
            )
        for c in no_var:
            _leave_out(c, "Same value in every row — carries no information.")

        drop_all = leaky_names | set(id_like) | set(no_var)
        ranked = get_most_predictable_features(data, target, output_only_headers=True)
        # Defensive: never offer the target itself as a spec, whatever the
        # ranking helper returns.
        candidate_specs = [
            c for c in ranked.tolist() if c != target and c not in drop_all
        ]
        # Keeps the original cap: max_features, but never fewer than 2.
        cap = max(2, min(len(candidate_specs), max_features))
        selected = candidate_specs[:cap]

        # Log-target recommendation: "on" only when every numeric target value
        # is strictly positive (and there is at least one).
        _, auto_decision = resolve_log_target(y, mode="auto")
        positive = len(y) > 0 and bool((y > 0).all())
        recommend_log_target = "on" if positive else "off"

        questions = [
            f"I'll predict '{target}'. Is that the price/cost you actually pay "
            "per part? If not, tell me which column is.",
        ]
        if leaky:
            cols = ", ".join(f"'{d['feature']}'" for d in leaky)
            questions.append(
                f"I'm leaving out {cols} because it's almost the same number as "
                f"'{target}' — it would make the model 'cheat'. OK to exclude?"
            )
        if positive:
            questions.append(
                "I'll model this on a percentage scale (log-target) so the "
                "likely-range never goes negative on cheap parts. Sound good?"
            )

        plain_summary = (
            f"I found {rows_loaded} rows. I can build a model that estimates "
            f"'{target}' from {len(selected)} spec column(s): "
            f"{', '.join(selected)}."
        )
        if excluded:
            plain_summary += (
                f" I'd leave out {len(excluded)} column(s) "
                f"({', '.join(e['column'] for e in excluded)}) — see "
                "i_am_leaving_out for why."
            )

        return {
            "status": "needs_confirmation",
            "plain_summary": plain_summary,
            "i_will_predict": target,
            "i_will_use_these_specs": selected,
            "i_am_leaving_out": excluded,
            "recommended_log_target": recommend_log_target,
            "log_target_auto_decision": auto_decision,
            "rows_available": rows_loaded,
            "questions_for_the_user": questions,
            "to_proceed": (
                "After the user confirms, call train(csv_path, target, "
                "features=i_will_use_these_specs, "
                "log_target=recommended_log_target)."
            ),
        }

    try:
        result = await asyncio.to_thread(_do_plan)
    except FileNotFoundError as e:
        return _error("file_not_found", str(e))
    except ValueError as e:
        return _error("plan_error", str(e))
    except Exception as e:
        # Tool boundary: report unexpected failures instead of raising.
        return _error("internal_error", str(e))

    return _ok(result)
801
+
802
+
803
@mcp.tool()
async def train(
    csv_path: str,
    target: str,
    features: list[str] | None = None,
    algorithm: str = "auto",
    budget: str = "fast",
    log_target: str = "auto",
    outlier_policy: str = "warn",
    feature_outlier_policy: str = "warn",
    max_features: int = 6,
    allow_leaky_features: bool = False,
) -> str:
    """Train a new P2Predict model from a local CSV file.

    Prefer calling `propose_training_plan` first and confirming with the user
    — this tool is the execution step. The CSV must have spec columns and a
    price/cost target column. Training runs locally; no data leaves the
    machine. The trained model is saved and immediately available.

    Safe defaults (always surfaced in the returned `warnings` list):
    - When features are auto-selected (features=None), columns that look
      like target leakage — a near-duplicate of the price being predicted —
      are excluded automatically.
    - For a strictly-positive (price/cost) target where the automatic skew
      test leaves the log-target off, the result recommends log_target="on".

    algorithm: "auto" (default), "ridge", "random_forest", or "xgboost".
    budget: "fast" (default) or "thorough".
    log_target: "auto" (default), "on", or "off". Use "on" for prices.
    allow_leaky_features: set True only to override the leakage guard and
        train on an explicitly-requested feature that looks like leakage.
    """
    # NOTE: the docstring above is what FastMCP advertises to clients as the
    # tool description, so treat it as runtime text and edit it deliberately.
    registry = _get_registry()

    def _do_train() -> dict:
        """Blocking training pipeline, executed in a worker thread.

        Returns either the training summary dict or a ``needs_confirmation``
        dict when explicitly requested features look like target leakage.
        Raises FileNotFoundError for a missing CSV and ValueError for input
        problems; the outer coroutine maps both to structured error payloads.
        """
        import re

        import pandas as pd

        from p2predict import auto_train, Serialize_Trained_Model, save_model
        from p2predict.feature_selection import (
            find_leaky_features,
            find_no_variation_features,
            get_most_predictable_features,
        )
        from p2predict.intervals import compute_calibration_residuals
        from p2predict.model_evals import evaluate_model
        from p2predict.outliers import (
            apply_feature_outlier_policy,
            apply_outlier_policy,
        )
        from p2predict.prepare_data import prepare_data
        from p2predict.trained_model_io import load_csv_file
        from p2predict.training import (
            extract_feature_importances,
            resolve_log_target,
            start_training,
        )

        path = Path(csv_path)
        if not path.exists():
            raise FileNotFoundError(f"CSV not found: {csv_path}")

        data = load_csv_file(csv_path)
        rows_loaded = len(data)

        if target not in data.columns:
            raise ValueError(
                f"Target '{target}' not in CSV columns: {list(data.columns)}"
            )

        # Rows without a target value cannot be learned from.
        data = data[data[target].notna()]
        if data.empty:
            raise ValueError(f"All rows have missing values in target '{target}'.")

        data, _ = apply_outlier_policy(data, target, policy=outlier_policy)

        num_candidates = [
            c for c in data.columns if c != target and pd.api.types.is_numeric_dtype(data[c])
        ]
        data, _ = apply_feature_outlier_policy(
            data, num_candidates, policy=feature_outlier_policy
        )

        low_vars = find_no_variation_features(data)
        # Remembered so a requested-but-dropped feature gets an accurate error.
        constant_cols = set(low_vars) if low_vars else set()
        if low_vars:
            # Defensive: if the helper ever flags the target itself, dropping it
            # would make every later step fail with an obscure KeyError.
            if target in constant_cols:
                raise ValueError(
                    f"Target '{target}' has no variation in the usable rows; "
                    "there is nothing to model."
                )
            data = data.drop(low_vars, axis=1)

        warnings: list[str] = []
        leaky = find_leaky_features(data, target)
        leaky_names = {d["feature"] for d in leaky}

        if features:
            missing = [f for f in features if f not in data.columns]
            if missing:
                absent = [f for f in missing if f not in constant_cols]
                constant = [f for f in missing if f in constant_cols]
                problems = []
                if absent:
                    problems.append(f"Requested features not in CSV: {absent}")
                if constant:
                    problems.append(
                        "Requested features have no variation and cannot be "
                        f"used: {constant}"
                    )
                raise ValueError("; ".join(problems))

            requested = set(features)
            explicit_leaky = [d for d in leaky if d["feature"] in requested]
            if explicit_leaky and not allow_leaky_features:
                # Stop and ask rather than train a confidently-wrong model.
                return {
                    "status": "needs_confirmation",
                    "reason": "target_leakage",
                    "message": (
                        "Some requested features look like target leakage — a "
                        "near-duplicate of the value you're predicting, not a "
                        "real spec. Training on them produces a model that looks "
                        "near-perfect but is useless on real parts."
                    ),
                    "leaky_features": explicit_leaky,
                    "to_proceed": (
                        "Re-call train without these features (recommended), or "
                        "pass allow_leaky_features=true to override deliberately."
                    ),
                }
            selected = list(features)
            if explicit_leaky:
                # The guard was overridden on purpose; keep that visible.
                warnings.append(
                    "Leakage guard overridden (allow_leaky_features=true): "
                    "training on column(s) that look like target leakage: "
                    f"{sorted(d['feature'] for d in explicit_leaky)}. "
                    "Reported accuracy is likely inflated."
                )
        else:
            ranked = get_most_predictable_features(data, target, output_only_headers=True)
            # Safe default: never auto-select a leakage column.
            ranked = [c for c in ranked.tolist() if c not in leaky_names]
            if not ranked:
                raise ValueError(
                    "No usable feature columns remain after removing constant and "
                    "leakage-like columns; pass `features` explicitly or provide "
                    "more spec columns."
                )
            n_ranked = len(ranked)
            cap = max(2, min(n_ranked, max_features))
            selected = ranked[:cap]
            if leaky_names:
                warnings.append(
                    "Auto-excluded likely target-leakage column(s) from feature "
                    f"selection: {sorted(leaky_names)}. "
                    + "; ".join(d["reason"] for d in leaky)
                )

        X_train, X_test, y_train, y_test, num_cols, cat_cols = prepare_data(
            data, selected, target
        )

        log_target_override, log_target_decision = resolve_log_target(
            y_train, mode=log_target
        )

        # Prices/costs are multiplicative: a log-target keeps intervals strictly
        # positive and makes SHAP read as percentages. The skew-based "auto"
        # test under-fires on samples that happen to look symmetric, so for a
        # strictly-positive target left additive by auto, recommend "on".
        if (
            log_target == "auto"
            and not log_target_override
            and bool((y_train > 0).all())
        ):
            warnings.append(
                "Target is strictly positive (price/cost-like) but the automatic "
                "skew test left the log-target OFF, so intervals are additive and "
                "can go negative on cheap parts. Consider re-training with "
                "log_target=\"on\" for percentage-based, always-positive intervals."
            )

        scores: dict = {}
        if algorithm == "auto":
            model, algo, scores, log_t = auto_train(
                X_train, y_train, num_cols, cat_cols,
                budget=budget, log_target=log_target_override,
            )
        else:
            model, _, log_t = start_training(
                X_train, y_train, num_cols, cat_cols, algorithm,
                budget=budget, tune=(budget == "thorough"),
                log_target=log_target_override,
            )
            algo = algorithm

        mae, r2, p_value, rmse = evaluate_model(X_test, y_test, model)

        background_n = min(100, len(X_train))
        background_sample = (
            X_train.sample(n=background_n, random_state=0).reset_index(drop=True)
            if background_n > 0
            else None
        )
        calibration = compute_calibration_residuals(model, X_test, y_test)

        y_pred_test = model.predict(X_test)

        model_metadata = Serialize_Trained_Model(
            algo, selected, target, model, r2,
            log_target=log_t,
            background_sample=background_sample,
            calibration=calibration,
        )
        # Holdout arrays are stored with the model; the report tools read them.
        model_metadata["holdout_y_test"] = y_test.tolist()
        model_metadata["holdout_y_pred"] = y_pred_test.tolist()

        try:
            importances = extract_feature_importances(model, X_train)
            importances_block = [
                {"feature": k, "importance": float(v)} for k, v in importances
            ]
        except Exception:
            # Best effort: importances are informational, never fail training.
            importances_block = []

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # Column names such as "Price ($/kg)" contain path separators or other
        # characters that are invalid in file names; neutralise them so the
        # model always lands directly inside models_dir.
        stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", f"{algo}_{target}_{timestamp}")
        model_filename = f"{stem}.model"
        model_path = registry.models_dir / model_filename
        registry.models_dir.mkdir(parents=True, exist_ok=True)
        save_model(model_metadata, str(model_path))

        model_id = model_path.stem
        registry.register(model_id, model_path, model_metadata)

        from p2predict.quality import r2_quality_label
        quality_label = r2_quality_label(r2)

        return {
            "model_id": model_id,
            "model_path": str(model_path),
            "algorithm": algo,
            "target": target,
            "features": selected,
            "log_target": bool(log_t),
            "log_target_decision": log_target_decision,
            "evaluation": {
                "r2": float(r2),
                "mae": float(mae),
                "rmse": float(rmse),
                "residual_bias_p_value": float(p_value),
                "quality_label": quality_label,
            },
            "cv_scores": {k: float(v) for k, v in scores.items()} if scores else {},
            "feature_importances": importances_block,
            "rows_loaded": rows_loaded,
            "rows_used": len(data),
            "calibration_size": calibration.get("n_calibration"),
            # Only columns that were really kept out of the model; a leaky
            # column trained on via the override is not "excluded".
            "excluded_leaky_features": [
                d for d in leaky if d["feature"] not in selected
            ],
            "warnings": warnings,
        }

    try:
        result = await asyncio.to_thread(_do_train)
    except FileNotFoundError as e:
        return _error("file_not_found", str(e))
    except ValueError as e:
        return _error("train_error", str(e))
    except Exception as e:
        # Tool boundary: report the failure as a payload instead of raising.
        return _error("internal_error", str(e))

    return _ok(result)
1046
+
1047
+
1048
def _quality_report_for(
    loaded: dict, include_holdout: bool = False, include_metrics: bool = False
) -> dict:
    """Assemble the structured quality report for an already-loaded model.

    Shared by get_model_quality and generate_report. Feature importances are
    extracted on a best-effort basis (``None`` is passed on if extraction
    fails) and handed to ``build_quality_report`` together with the two
    ``include_*`` flags. Raises ValueError('no_holdout_data').
    """
    from p2predict.quality import build_quality_report
    from p2predict.training import extract_feature_importances

    def _importances_or_none():
        # Any failure here (including a missing "model" key) degrades to None
        # rather than aborting the report.
        try:
            return extract_feature_importances(
                loaded["model"], loaded.get("background_sample")
            )
        except Exception:
            return None

    return build_quality_report(
        loaded,
        _importances_or_none(),
        include_holdout=include_holdout,
        include_metrics=include_metrics,
    )
1066
+
1067
+
1068
@mcp.tool()
async def get_model_quality(
    model_id: str, include_holdout: bool = False, include_metrics: bool = False
) -> str:
    """Structured, agent-readable model-quality report — the JSON form of the PDF.

    Use this (not just generate_report, which only writes a PDF) when you need
    to *reason about or relay* model quality. Every judgment is computed so you
    don't eyeball thresholds:

    - `assessment.verdict` — LEAD WITH THIS. One of: 'trustworthy' | 'usable'
      | 'unreliable' | 'unknown' | 'insufficient_data'. It folds bias and
      sample size into a plain `headline` you can quote verbatim — e.g. a
      modest model that is even-handed reads 'usable', not just 'Needs
      Improvement'. `assessment.confidence` is 'high' | 'limited' |
      'insufficient'.
    - `calibration_by_price_band[].reliability` — 'trust' | 'caution' |
      'quote' per price range (with `low_confidence` when a band is thin).
      Each band carries a `say_to_user` sentence in plain words — quote it to
      tell the user which prices to benchmark vs. get a quote on.
    - `feature_importance[].signal` — 'strong' | 'moderate' | 'weak', each
      with its own `say_to_user` sentence. Only quote findings resting on
      'strong' drivers to a stakeholder.

    The default payload is deliberately business-only — every string is safe to
    read to a category manager. NEVER say 'SHAP', 'R²', 'p-value', 'log-target',
    'residual' to the user. The raw statistics (R², p-value, algorithm,
    log-target) are NOT in the default response; pass include_metrics=true to
    add them under `metrics`/`provenance` for your own developer-level reasoning.

    Set include_holdout=true to also get the raw actual/predicted arrays, so an
    agent with a code/plotting tool can draw its own charts (predicted-vs-actual,
    residuals, error-by-band).

    Requires a model trained via the MCP train tool (which stores holdout data).
    """
    model_store = _get_registry()

    # Step 1: load the model off the event loop; an unknown id is a clean error.
    try:
        model_bundle = await asyncio.to_thread(model_store.load, model_id)
    except FileNotFoundError as exc:
        return _error("model_not_found", str(exc))

    # Step 2: build the report. A ValueError from the builder is reported as
    # missing holdout data; anything else is a generic quality error.
    try:
        report = await asyncio.to_thread(
            _quality_report_for, model_bundle, include_holdout, include_metrics
        )
    except ValueError:
        return _error(
            "no_holdout_data",
            "This model has no stored holdout data (trained before MCP support). "
            "Retrain via the MCP train tool to enable the quality report.",
        )
    except Exception as exc:
        return _error("quality_error", str(exc))
    else:
        payload = {"model_id": model_id}
        payload.update(report)
        return _ok(payload)
1124
+
1125
+
1126
@mcp.tool()
async def generate_report(
    model_id: str,
    output_path: str | None = None,
) -> str:
    """Generate a procurement-style model-quality PDF report (3 pages).

    Page 1: summary metrics + predicted vs actual scatter.
    Page 2: error distribution + median % error by price band.
    Page 3: top-N feature importance.

    The PDF is the human deliverable; the return value also echoes the same
    numbers as a structured `quality` block (identical to get_model_quality)
    so you can both hand the user the file AND reason over the metrics.

    Works best with models trained via the MCP train tool (which stores
    holdout data). For older models, the report may be unavailable.
    """
    registry = _get_registry()
    try:
        loaded = await asyncio.to_thread(registry.load, model_id)
    except FileNotFoundError as e:
        return _error("model_not_found", str(e))

    import numpy as np

    # The PDF is drawn purely from the holdout arrays stored at training time.
    y_test = loaded.get("holdout_y_test")
    y_pred = loaded.get("holdout_y_pred")
    if y_test is None or y_pred is None:
        return _error(
            "no_holdout_data",
            "This model doesn't have stored holdout data (trained before MCP "
            "support). Retrain via the MCP train tool to enable report generation.",
        )

    y_test_arr = np.array(y_test)
    y_pred_arr = np.array(y_pred)

    # Default location: next to the model files, named after the model id.
    if output_path is None:
        output_path = str(registry.models_dir / f"{model_id}_report.pdf")

    def _generate() -> str:
        """Render the PDF (blocking; runs in a worker thread) and return its path."""
        import matplotlib
        # Headless backend: the server has no display to draw on.
        matplotlib.use("agg")
        from p2predict import plotting
        from p2predict.training import extract_feature_importances

        try:
            importances = extract_feature_importances(
                loaded["model"], loaded.get("background_sample")
            )
        except Exception:
            # Best effort: the report is still produced without importances.
            importances = None

        # A caller-supplied path may point into a folder that does not exist
        # yet; create it instead of failing the whole report.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        plotting.plot_results_pdf(
            y_test_arr,
            y_pred_arr,
            output_path,
            target_name=loaded.get("target_feature", "Price"),
            model_name=loaded.get("model_name"),
            n_train=None,
            training_date=loaded.get("training_date"),
            feature_importances=importances,
        )
        return output_path

    try:
        path = await asyncio.to_thread(_generate)
    except Exception as e:
        return _error("report_error", str(e))

    # Echo the same numbers as structured data so the agent can reason over the
    # report, not just hand the user a PDF path.
    try:
        quality = await asyncio.to_thread(_quality_report_for, loaded)
    except Exception:
        # The PDF already exists; a failed echo must not turn success into error.
        quality = None

    return _ok({
        "model_id": model_id,
        "report_path": path,
        "quality": quality,
    })
1210
+
1211
+
1212
+ # ---------------------------------------------------------------------------
1213
+ # Resources
1214
+ # ---------------------------------------------------------------------------
1215
+
1216
+
1217
@mcp.resource("model://{model_id}")
async def model_resource(model_id: str) -> str:
    """Model metadata as a resource."""
    model_store = _get_registry()
    try:
        # Registry lookup is run in a worker thread, off the event loop.
        model_info = await asyncio.to_thread(model_store.get_info, model_id)
    except FileNotFoundError as exc:
        return _error("model_not_found", str(exc))
    else:
        return _ok(model_info.to_dict())
1226
+
1227
+
1228
+ # ---------------------------------------------------------------------------
1229
+ # Entry point
1230
+ # ---------------------------------------------------------------------------
1231
+
1232
+
1233
def main():
    """Command-line entry point for the MCP server.

    Parses ``--models-dir``, installs a ModelRegistry for the resolved
    directory as the module-level ``_registry``, prints a one-line build
    banner to stderr, and then serves MCP over the stdio transport.
    """
    import sys

    global _registry

    cli = argparse.ArgumentParser(
        description="P2Predict MCP server — parametric price benchmarking for AI agents"
    )
    cli.add_argument(
        "--models-dir",
        default="models",
        help="Directory containing .model files (default: models)",
    )
    options = cli.parse_args()

    models_root = Path(options.models_dir).resolve()
    _registry = ModelRegistry(models_root)

    # stderr is safe on stdio transport (stdout is the MCP protocol channel) and
    # shows up in the client's server logs — a quick way to confirm which build
    # is actually running.
    version = _SERVER_BUILD["version"]
    revision = _SERVER_BUILD["git_sha"] or "no-git"
    origin = _SERVER_BUILD["source"]
    banner = f"P2Predict MCP server v{version} ({revision}) loaded from {origin}"
    print(banner, file=sys.stderr)

    mcp.run(transport="stdio")