minima-cli 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. minima/__init__.py +5 -0
  2. minima/api/__init__.py +1 -0
  3. minima/api/auth.py +39 -0
  4. minima/api/errors.py +40 -0
  5. minima/api/routers/__init__.py +1 -0
  6. minima/api/routers/calibration.py +50 -0
  7. minima/api/routers/feedback.py +279 -0
  8. minima/api/routers/health.py +50 -0
  9. minima/api/routers/models.py +42 -0
  10. minima/api/routers/recommend.py +66 -0
  11. minima/api/routers/savings.py +55 -0
  12. minima/api/routers/strategies.py +33 -0
  13. minima/catalog/__init__.py +1 -0
  14. minima/catalog/data/capability_priors.json +210 -0
  15. minima/catalog/data/model_aliases.json +12 -0
  16. minima/catalog/merge.py +69 -0
  17. minima/catalog/refresh.py +54 -0
  18. minima/catalog/sources/__init__.py +1 -0
  19. minima/catalog/sources/litellm.py +19 -0
  20. minima/catalog/sources/openrouter.py +25 -0
  21. minima/catalog/store.py +86 -0
  22. minima/config.py +288 -0
  23. minima/deps.py +35 -0
  24. minima/llm/__init__.py +1 -0
  25. minima/llm/anthropic.py +106 -0
  26. minima/llm/base.py +196 -0
  27. minima/llm/gemini.py +124 -0
  28. minima/llm/registry.py +54 -0
  29. minima/logging.py +28 -0
  30. minima/main.py +109 -0
  31. minima/memory/__init__.py +1 -0
  32. minima/memory/adapter.py +572 -0
  33. minima/memory/keys.py +83 -0
  34. minima/memory/records.py +190 -0
  35. minima/memory/threadpool.py +41 -0
  36. minima/metrics/__init__.py +1 -0
  37. minima/metrics/calibration.py +415 -0
  38. minima/metrics/report.py +116 -0
  39. minima/metrics/savings.py +98 -0
  40. minima/recommender/__init__.py +1 -0
  41. minima/recommender/_pg_pool.py +38 -0
  42. minima/recommender/_redis_client.py +32 -0
  43. minima/recommender/aggregate.py +157 -0
  44. minima/recommender/classify.py +165 -0
  45. minima/recommender/decisionlog.py +505 -0
  46. minima/recommender/durablerefs.py +312 -0
  47. minima/recommender/engine.py +997 -0
  48. minima/recommender/escalation.py +83 -0
  49. minima/recommender/propensity.py +189 -0
  50. minima/recommender/recstore.py +368 -0
  51. minima/recommender/score.py +318 -0
  52. minima/recommender/types.py +166 -0
  53. minima/schemas/__init__.py +1 -0
  54. minima/schemas/common.py +73 -0
  55. minima/schemas/feedback.py +34 -0
  56. minima/schemas/models_catalog.py +36 -0
  57. minima/schemas/recommend.py +104 -0
  58. minima/schemas/savings.py +39 -0
  59. minima/schemas/strategies.py +57 -0
  60. minima/schemas/workflow.py +43 -0
  61. minima/seeding/__init__.py +1 -0
  62. minima/seeding/items.py +42 -0
  63. minima/seeding/llmrouterbench.py +232 -0
  64. minima/seeding/routerbench.py +141 -0
  65. minima/seeding/run_seed.py +56 -0
  66. minima/seeding/synthetic.py +70 -0
  67. minima/tenancy/__init__.py +8 -0
  68. minima/tenancy/context.py +37 -0
  69. minima/tenancy/passthrough.py +110 -0
  70. minima/version.py +3 -0
  71. minima_cli-0.4.9.dist-info/METADATA +275 -0
  72. minima_cli-0.4.9.dist-info/RECORD +161 -0
  73. minima_cli-0.4.9.dist-info/WHEEL +4 -0
  74. minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
  75. minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
  76. minima_client/__init__.py +19 -0
  77. minima_client/autocapture.py +101 -0
  78. minima_client/client.py +301 -0
  79. minima_client/errors.py +23 -0
  80. minima_harness/LICENSE_PI +32 -0
  81. minima_harness/__init__.py +16 -0
  82. minima_harness/agent/__init__.py +72 -0
  83. minima_harness/agent/agent.py +276 -0
  84. minima_harness/agent/events.py +124 -0
  85. minima_harness/agent/loop.py +311 -0
  86. minima_harness/agent/state.py +79 -0
  87. minima_harness/agent/tools.py +97 -0
  88. minima_harness/ai/__init__.py +66 -0
  89. minima_harness/ai/compat.py +71 -0
  90. minima_harness/ai/errors.py +96 -0
  91. minima_harness/ai/events.py +117 -0
  92. minima_harness/ai/openrouter_catalog.py +153 -0
  93. minima_harness/ai/provider_catalog.py +299 -0
  94. minima_harness/ai/provider_quirks.py +37 -0
  95. minima_harness/ai/providers/__init__.py +75 -0
  96. minima_harness/ai/providers/_common.py +48 -0
  97. minima_harness/ai/providers/anthropic.py +290 -0
  98. minima_harness/ai/providers/base.py +65 -0
  99. minima_harness/ai/providers/faux.py +173 -0
  100. minima_harness/ai/providers/google.py +221 -0
  101. minima_harness/ai/providers/openai_compat.py +278 -0
  102. minima_harness/ai/registry.py +184 -0
  103. minima_harness/ai/stream.py +82 -0
  104. minima_harness/ai/tools.py +51 -0
  105. minima_harness/ai/types.py +204 -0
  106. minima_harness/ai/usage.py +41 -0
  107. minima_harness/minima/__init__.py +40 -0
  108. minima_harness/minima/cache.py +102 -0
  109. minima_harness/minima/config.py +85 -0
  110. minima_harness/minima/goals.py +226 -0
  111. minima_harness/minima/judge.py +144 -0
  112. minima_harness/minima/mapping.py +147 -0
  113. minima_harness/minima/meter.py +143 -0
  114. minima_harness/minima/router.py +220 -0
  115. minima_harness/minima/runtime.py +544 -0
  116. minima_harness/minima/signals.py +195 -0
  117. minima_harness/session/__init__.py +14 -0
  118. minima_harness/session/format.py +35 -0
  119. minima_harness/session/store.py +236 -0
  120. minima_harness/tasks/__init__.py +17 -0
  121. minima_harness/tasks/task_set.py +78 -0
  122. minima_harness/tools/__init__.py +7 -0
  123. minima_harness/tools/_io.py +34 -0
  124. minima_harness/tools/bash.py +70 -0
  125. minima_harness/tools/builtin.py +23 -0
  126. minima_harness/tools/edit.py +50 -0
  127. minima_harness/tools/find.py +38 -0
  128. minima_harness/tools/grep.py +73 -0
  129. minima_harness/tools/ls.py +35 -0
  130. minima_harness/tools/read.py +38 -0
  131. minima_harness/tools/tasks.py +75 -0
  132. minima_harness/tools/write.py +36 -0
  133. minima_harness/tui/__init__.py +3 -0
  134. minima_harness/tui/analytics.py +111 -0
  135. minima_harness/tui/app.py +1927 -0
  136. minima_harness/tui/bridge.py +103 -0
  137. minima_harness/tui/cli.py +227 -0
  138. minima_harness/tui/clipboard.py +60 -0
  139. minima_harness/tui/commands.py +49 -0
  140. minima_harness/tui/compaction.py +17 -0
  141. minima_harness/tui/config_cli.py +141 -0
  142. minima_harness/tui/config_store.py +237 -0
  143. minima_harness/tui/context.py +93 -0
  144. minima_harness/tui/customize.py +95 -0
  145. minima_harness/tui/diff.py +53 -0
  146. minima_harness/tui/editor.py +43 -0
  147. minima_harness/tui/extensions.py +84 -0
  148. minima_harness/tui/extra_models.py +52 -0
  149. minima_harness/tui/history.py +71 -0
  150. minima_harness/tui/mubit.py +295 -0
  151. minima_harness/tui/overlays.py +593 -0
  152. minima_harness/tui/packages.py +59 -0
  153. minima_harness/tui/run_modes.py +66 -0
  154. minima_harness/tui/theme.py +77 -0
  155. minima_harness/tui/welcome.py +83 -0
  156. minima_harness/tui/widgets/__init__.py +3 -0
  157. minima_harness/tui/widgets/banner.py +38 -0
  158. minima_harness/tui/widgets/editor.py +83 -0
  159. minima_harness/tui/widgets/footer.py +73 -0
  160. minima_harness/tui/widgets/messages.py +151 -0
  161. minima_harness/tui/widgets/status.py +57 -0
@@ -0,0 +1,190 @@
1
+ """Mapping between Minima's internal outcome model and Mubit memory metadata.
2
+
3
+ All three intake paths (explicit feedback, auto-capture, offline seed) converge on
4
+ this one record shape, so the recommender is agnostic to where evidence came from.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from collections.abc import Mapping
11
+ from dataclasses import asdict, dataclass, field
12
+ from typing import Any
13
+
14
# Version stamped onto every newly built OutcomeRecord (its ``schema_version`` default).
SCHEMA_VERSION = 2  # v2 adds recorded_at (unix seconds); v1 records parse unchanged

# Fallback quality per outcome label, used by ``quality_from_outcome`` when the caller
# supplies no explicit score. Labels missing from this table fall back to 0.5 there.
_OUTCOME_DEFAULT_QUALITY = {"success": 0.9, "partial": 0.5, "failure": 0.1}

# Caller-supplied quality scores that flatly contradict the outcome label are clamped
# (never rejected — nuanced feedback like "succeeded but mediocre" must survive).
# ``reconcile_quality`` caps a "failure" at the first value and lifts a "success" to
# the second.
_FAILURE_QUALITY_CAP = 0.6
_SUCCESS_QUALITY_FLOOR = 0.4
22
+
23
+
24
+ def _clamp(x: float, lo: float, hi: float) -> float:
25
+ return max(lo, min(hi, x))
26
+
27
+
28
def clamp01(x: float) -> float:
    """Clip ``x`` into the unit interval ``[0.0, 1.0]``."""
    return max(0.0, min(1.0, x))
30
+
31
+
32
def quality_from_outcome(outcome: str, quality_score: float | None) -> float:
    """Resolve the quality value for one observation.

    An explicit ``quality_score`` takes precedence and is clipped into ``[0, 1]``.
    Without one, the outcome label's default applies (0.5 for an unknown label).
    """
    if quality_score is None:
        return _OUTCOME_DEFAULT_QUALITY.get(outcome, 0.5)
    return clamp01(float(quality_score))
37
+
38
+
39
def reconcile_quality(outcome: str, quality: float) -> tuple[float, str | None]:
    """Clamp a quality score that contradicts its outcome label.

    A "failure" scored above the failure cap, or a "success" scored below the success
    floor, is a label/score pair that cannot both be true and would distort the
    weighted-success aggregate. The score is pulled back to the nearest consistent
    bound and a warning code is returned so the caller can fix their scorer.

    Returns ``(quality, warning)``; ``warning`` is ``"quality_outcome_mismatch"`` when
    the score was clamped and ``None`` when it was left untouched.
    """
    if outcome == "failure":
        if quality > _FAILURE_QUALITY_CAP:
            return _FAILURE_QUALITY_CAP, "quality_outcome_mismatch"
    elif outcome == "success":
        if quality < _SUCCESS_QUALITY_FLOOR:
            return _SUCCESS_QUALITY_FLOOR, "quality_outcome_mismatch"
    # Consistent pair, or a label (e.g. "partial") that is never clamped.
    return quality, None
51
+
52
+
53
def signal_from_outcome(outcome: str, quality: float) -> float:
    """Translate an outcome label plus quality into a reinforcement signal in [-1, 1].

    * "success" always yields ``1.0``, regardless of quality.
    * "partial" maps quality linearly onto ``[-1, 1]`` (``2 * quality - 1``).
    * Any other label is treated as a failure: ``quality - 1`` limited to ``[-1, 0]``.
    """
    if outcome == "success":
        return 1.0
    if outcome == "partial":
        raw, floor, ceiling = 2.0 * quality - 1.0, -1.0, 1.0
    else:  # failure (and any unrecognised label)
        raw, floor, ceiling = quality - 1.0, -1.0, 0.0
    return max(floor, min(ceiling, raw))
60
+
61
+
62
@dataclass(slots=True)
class OutcomeRecord:
    """A single (task, model, outcome) observation.

    ``to_metadata`` flattens the record into a metadata dict and ``from_metadata``
    parses such a dict (or its JSON string form) back into a record. Parsing is
    lenient: missing or unparseable fields fall back to defaults instead of raising.
    """

    # The only required field; ``from_metadata`` rejects entries without it.
    model_id: str
    provider: str = ""
    task_type: str = "other"
    difficulty: str = "medium"
    task_fingerprint: str = ""
    task_cluster: str = ""
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    latency_ms: int | None = None
    quality_score: float = 0.0
    outcome: str = "success"
    recommendation_id: str | None = None
    verified_in_production: bool = False
    source_dataset: str | None = None
    # Agent loop turns to resolution (token-yield signal; a cheap model that takes many
    # turns to resolve can cost more than one frontier turn). Backward-compatible: None
    # on legacy records.
    iterations: int | None = None
    # Unix seconds when the outcome was observed. Powers evidence age decay; None on
    # legacy (schema v1) records, which fall back to the binary staleness penalty.
    recorded_at: float | None = None
    # Discriminator checked by ``from_metadata``; only "outcome" entries are parsed.
    kind: str = "outcome"
    schema_version: int = SCHEMA_VERSION
    # Free-form additional keys; merged into the top level by ``to_metadata``.
    extra: dict = field(default_factory=dict)

    def to_metadata(self) -> dict:
        """Flatten the record into a single metadata dict.

        ``extra`` keys are merged into the top level, fields whose value is ``None``
        are omitted, and a record field wins over an ``extra`` key of the same name.
        """
        data = asdict(self)
        extra = data.pop("extra", {}) or {}
        # Record fields come second in the merge so they override colliding extras.
        return {**extra, **{k: v for k, v in data.items() if v is not None}}

    @classmethod
    def from_metadata(cls, meta: Mapping | str | None) -> OutcomeRecord | None:
        """Parse a Mubit ``metadata_json`` (string or dict) into an OutcomeRecord.

        Returns ``None`` when the entry is not a Minima outcome record: the input is
        empty or unparseable, its ``kind`` is not ``"outcome"``, or ``model_id`` is
        missing/empty.

        ``kind``, ``schema_version`` and ``extra`` are not read back from the
        metadata; the parsed record carries the dataclass defaults for them.
        """
        parsed = _coerce_mapping(meta)
        if not parsed:
            return None
        if parsed.get("kind") != "outcome":
            return None
        model_id = parsed.get("model_id")
        if not model_id:
            return None
        return cls(
            model_id=str(model_id),
            provider=str(parsed.get("provider", "")),
            task_type=str(parsed.get("task_type", "other")),
            difficulty=str(parsed.get("difficulty", "medium")),
            task_fingerprint=str(parsed.get("task_fingerprint", "")),
            task_cluster=str(parsed.get("task_cluster", "")),
            input_tokens=_as_int(parsed.get("input_tokens")),
            output_tokens=_as_int(parsed.get("output_tokens")),
            cost_usd=_as_float(parsed.get("cost_usd")),
            # NOTE: the optional numeric fields (latency_ms, recorded_at, iterations)
            # use a truthiness test, so a stored 0 parses back as None.
            latency_ms=_as_int(parsed.get("latency_ms")) if parsed.get("latency_ms") else None,
            quality_score=clamp01(_as_float(parsed.get("quality_score"))),
            outcome=str(parsed.get("outcome", "success")),
            recommendation_id=parsed.get("recommendation_id"),
            verified_in_production=bool(parsed.get("verified_in_production", False)),
            source_dataset=parsed.get("source_dataset"),
            recorded_at=(
                _as_float(parsed.get("recorded_at")) if parsed.get("recorded_at") else None
            ),
            iterations=(_as_int(parsed.get("iterations")) if parsed.get("iterations") else None),
        )
132
+
133
+
134
@dataclass(slots=True)
class RecalledEvidence:
    """One recalled Mubit entry, with its parsed outcome record (if any)."""

    entry_id: str
    reference_id: str | None
    score: float
    knowledge_confidence: float
    is_stale: bool
    content: str
    # Parsed outcome record; None when the entry is not an outcome record
    # (``RecallResult.outcome_evidence`` filters on this).
    record: OutcomeRecord | None
    # Whether this entry can be re-read exactly via Dereference (durable fast path).
    referenceable: bool = False
    entry_type: str = ""
148
+
149
+
150
+ @dataclass(slots=True)
151
+ class RecallResult:
152
+ evidence: list[RecalledEvidence]
153
+ degraded: bool = False
154
+ raw_confidence: float = 0.0
155
+ timed_out: bool = False
156
+ error: str | None = None
157
+
158
+ @property
159
+ def outcome_evidence(self) -> list[RecalledEvidence]:
160
+ return [e for e in self.evidence if e.record is not None]
161
+
162
+
163
+ def _coerce_mapping(meta: Mapping | str | None) -> dict | None:
164
+ if meta is None:
165
+ return None
166
+ if isinstance(meta, str):
167
+ if not meta.strip():
168
+ return None
169
+ try:
170
+ loaded = json.loads(meta)
171
+ except (json.JSONDecodeError, ValueError):
172
+ return None
173
+ return loaded if isinstance(loaded, dict) else None
174
+ if isinstance(meta, Mapping):
175
+ return dict(meta)
176
+ return None
177
+
178
+
179
+ def _as_int(value: Any, default: int = 0) -> int:
180
+ try:
181
+ return int(value)
182
+ except (TypeError, ValueError):
183
+ return default
184
+
185
+
186
+ def _as_float(value: Any, default: float = 0.0) -> float:
187
+ try:
188
+ return float(value)
189
+ except (TypeError, ValueError):
190
+ return default
@@ -0,0 +1,41 @@
1
+ """Run the synchronous Mubit SDK off the event loop.
2
+
3
+ The Mubit Python SDK is blocking (``requests``/``grpc``). Every adapter call goes
4
+ through a worker thread so FastAPI's event loop stays responsive.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import functools
10
+ import inspect
11
+ from collections.abc import Callable
12
+ from typing import TypeVar
13
+
14
+ import anyio
15
+
16
# Return type of the wrapped blocking callable, carried through ``run`` and
# ``run_cancellable``.
T = TypeVar("T")

# anyio renamed ``cancellable`` -> ``abandon_on_cancel`` in 4.1. Detect once.
# The name is picked by inspecting the installed ``run_sync`` signature at import time.
_ABANDON_KW = (
    "abandon_on_cancel"
    if "abandon_on_cancel" in inspect.signature(anyio.to_thread.run_sync).parameters
    else "cancellable"
)
24
+
25
+
26
async def run(func: Callable[..., T], *args: object, **kwargs: object) -> T:
    """Execute a blocking callable in a worker thread and await its result.

    The thread is not abandoned on cancellation. Positional and keyword arguments
    are bound to ``func`` up front.
    """
    bound_call = functools.partial(func, *args, **kwargs)
    result = await anyio.to_thread.run_sync(bound_call)
    return result
30
+
31
+
32
async def run_cancellable(func: Callable[..., T], *args: object, **kwargs: object) -> T:
    """Execute a blocking callable in a worker thread that is abandoned on cancel.

    Used for the latency-bounded recall path: when the await is cancelled (for
    example on timeout) the caller stops waiting, while the abandoned thread finishes
    harmlessly in the background.
    """
    bound_call = functools.partial(func, *args, **kwargs)
    # ``_ABANDON_KW`` is whichever flag name the installed anyio accepts
    # ("abandon_on_cancel" or the older "cancellable"); pass it as True.
    return await anyio.to_thread.run_sync(bound_call, **{_ABANDON_KW: True})
@@ -0,0 +1 @@
1
+ """Measurement layer: calibration, savings accounting, routing health."""
@@ -0,0 +1,415 @@
1
+ """Calibration and routing-health metrics over the decision log.
2
+
3
+ Pure functions over reconciled ``DecisionRecord`` rows — no state of their own, so the
4
+ same code powers the tenant-scoped ``GET /v1/calibration`` endpoint and the ops-side
5
+ ``minima-calibration-report`` console script.
6
+
7
+ A recommendation is "reconciled" once feedback arrived; only reconciled rows carry a
8
+ realized label. Calibration compares the chosen candidate's predicted_success at
9
+ decision time against that label (success=1 primary; quality-weighted alongside).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import bisect
15
+ from dataclasses import dataclass, field
16
+
17
+ from minima.memory.records import clamp01
18
+ from minima.recommender.decisionlog import DecisionRecord
19
+
20
+
21
@dataclass(slots=True)
class ReliabilityBin:
    """One equal-width bin of a reliability diagram, as filled in by ``_ece``."""

    # Lower and upper edge of the bin on the predicted-probability axis.
    lo: float
    hi: float
    # Number of (predicted, realized) pairs that landed in this bin.
    n: int = 0
    # Mean predicted and mean realized value over those pairs (0.0 while the bin is empty).
    avg_predicted: float = 0.0
    avg_realized: float = 0.0
28
+
29
+
30
@dataclass(slots=True)
class CalibrationReport:
    """ECE + reliability for one slice (a task_type, or the global pool)."""

    # "global" for the pooled report, otherwise the task_type of the slice.
    slice_key: str
    # Number of reconciled (predicted, realized) pairs in the slice.
    n: int
    # Expected calibration error against the binary success label.
    ece: float
    # ``ece`` pulled toward the global ECE (equal to ``ece`` on the global report).
    ece_shrunk: float
    # Expected calibration error against the realized quality instead of the label.
    ece_quality: float
    # Reliability bins behind ``ece``.
    bins: list[ReliabilityBin] = field(default_factory=list)
40
+
41
+
42
@dataclass(slots=True)
class CusumFlag:
    """A CUSUM alarm for one (cluster, model) residual series, built by ``cusum_flags``."""

    cluster: str
    # The realized model id of the series (``cusum_flags`` keys on ``realized_model_id``).
    model_id: str
    # Number of residual points in the series.
    n: int
    # Peak value reached by the CUSUM statistic, rounded to 4 decimals.
    statistic: float
    direction: str  # "over_predicting" | "under_predicting"
49
+
50
+
51
+ def _pairs(rows: list[DecisionRecord]) -> list[tuple[float, float, float]]:
52
+ """(predicted, realized_label, realized_quality) for reconciled rows."""
53
+ out: list[tuple[float, float, float]] = []
54
+ for r in rows:
55
+ if not r.reconciled:
56
+ continue
57
+ predicted = r.predicted_success_chosen
58
+ if predicted is None:
59
+ continue
60
+ label = 1.0 if r.realized_outcome == "success" else 0.0
61
+ quality = r.realized_quality if r.realized_quality is not None else label
62
+ out.append((predicted, label, quality))
63
+ return out
64
+
65
+
66
def _ece(
    pairs: list[tuple[float, float]], n_bins: int
) -> tuple[float, list[ReliabilityBin]]:
    """Compute expected calibration error over equal-width probability bins.

    Args:
        pairs: ``(predicted, realized)`` values; predictions are expected in [0, 1].
        n_bins: Number of equal-width bins. Values below 1 are treated as 1.

    Returns:
        ``(ece, bins)``: the count-weighted mean absolute gap between each bin's
        average prediction and average realized value, and the populated bins.
        With no pairs the ECE is 0.0 and every bin is empty.
    """
    # Use the guarded count for the bin edges as well as the bin count, so
    # n_bins <= 0 yields one [0, 1] bin instead of a division by zero.
    count = max(1, n_bins)
    bins = [ReliabilityBin(lo=i / count, hi=(i + 1) / count) for i in range(count)]
    sums_p = [0.0] * count
    sums_y = [0.0] * count
    for p, y in pairs:
        # Clamp on both sides: p == 1.0 (or above) belongs in the last bin, and a
        # slightly negative p must land in the first bin, not wrap via a negative index.
        idx = max(0, min(count - 1, int(p * count)))
        bins[idx].n += 1
        sums_p[idx] += p
        sums_y[idx] += y
    total = sum(b.n for b in bins)
    if total == 0:
        return 0.0, bins
    ece = 0.0
    for i, b in enumerate(bins):
        if b.n == 0:
            continue
        b.avg_predicted = sums_p[i] / b.n
        b.avg_realized = sums_y[i] / b.n
        ece += (b.n / total) * abs(b.avg_predicted - b.avg_realized)
    return ece, bins
89
+
90
+
91
def calibration_by_task_type(
    rows: list[DecisionRecord],
    *,
    n_bins: int = 10,
    shrinkage_k: float = 20.0,
) -> list[CalibrationReport]:
    """Report ECE globally and per task_type, shrinking sparse slices toward global.

    The first report (``slice_key="global"``) is the unshrunk pool over all rows.
    One report per task_type follows, in sorted order, for every slice with at least
    one reconciled pair. A slice's ``ece_shrunk`` blends its own ECE with the global
    ECE using weight ``n / (n + shrinkage_k)``, so a task_type with only a few
    feedbacks does not read as perfectly (mis)calibrated.
    """
    pooled = _pairs(rows)
    global_ece, global_bins = _ece([(p, y) for p, y, _ in pooled], n_bins)
    global_ece_q, _ = _ece([(p, q) for p, _, q in pooled], n_bins)

    reports = [
        CalibrationReport(
            slice_key="global",
            n=len(pooled),
            ece=round(global_ece, 4),
            ece_shrunk=round(global_ece, 4),
            ece_quality=round(global_ece_q, 4),
            bins=global_bins,
        )
    ]

    rows_by_type: dict[str, list[DecisionRecord]] = {}
    for row in rows:
        rows_by_type.setdefault(row.task_type, []).append(row)

    for task_type in sorted(rows_by_type):
        slice_pairs = _pairs(rows_by_type[task_type])
        if not slice_pairs:
            continue
        size = len(slice_pairs)
        slice_ece, slice_bins = _ece([(p, y) for p, y, _ in slice_pairs], n_bins)
        slice_ece_q, _ = _ece([(p, q) for p, _, q in slice_pairs], n_bins)
        # Hierarchical shrinkage: weight n on the slice, shrinkage_k on the global ECE.
        shrunk = (size * slice_ece + shrinkage_k * global_ece) / (size + shrinkage_k)
        reports.append(
            CalibrationReport(
                slice_key=task_type,
                n=size,
                ece=round(slice_ece, 4),
                ece_shrunk=round(shrunk, 4),
                ece_quality=round(slice_ece_q, 4),
                bins=slice_bins,
            )
        )
    return reports
141
+
142
+
143
def cusum_flags(
    rows: list[DecisionRecord],
    *,
    k: float = 0.25,
    h: float = 2.0,
) -> list[CusumFlag]:
    """Run a two-sided CUSUM on prediction residuals per (cluster, realized model).

    The residual is ``predicted - label`` with label 1.0 for "success" and 0.0
    otherwise. Rows that are unreconciled, lack a prediction or lack a realized model
    id are ignored. Each series is ordered by feedback time (falling back to the
    decision time) before accumulation.

    ``k`` is the slack subtracted at every step and ``h`` the alarm threshold. A
    series is flagged "over_predicting" when its upper statistic ever peaks above
    ``h`` and "under_predicting" when its lower one does. The defaults are sized for
    binary residuals (|resid| up to 1): the slack absorbs routine misses and the
    threshold requires a sustained run. This only detects drift; acting on a flag is
    left to the caller.

    Returns the flags sorted by descending peak statistic.
    """
    residuals: dict[tuple[str, str], list[tuple[float, float]]] = {}
    for row in rows:
        if not row.reconciled:
            continue
        predicted = row.predicted_success_chosen
        if predicted is None or row.realized_model_id is None:
            continue
        outcome_value = 1.0 if row.realized_outcome == "success" else 0.0
        bucket = residuals.setdefault((row.cluster, row.realized_model_id), [])
        bucket.append((row.feedback_ts or row.ts, predicted - outcome_value))

    flags: list[CusumFlag] = []
    for (cluster, model_id), points in residuals.items():
        ordered = sorted(points, key=lambda point: point[0])
        upper = lower = 0.0
        upper_peak = lower_peak = 0.0
        for _, resid in ordered:
            upper = max(0.0, upper + resid - k)
            lower = max(0.0, lower - resid - k)
            upper_peak = max(upper_peak, upper)
            lower_peak = max(lower_peak, lower)
        # Over-prediction is reported before under-prediction for the same series.
        for peak, direction in (
            (upper_peak, "over_predicting"),
            (lower_peak, "under_predicting"),
        ):
            if peak > h:
                flags.append(
                    CusumFlag(
                        cluster=cluster,
                        model_id=model_id,
                        n=len(ordered),
                        statistic=round(peak, 4),
                        direction=direction,
                    )
                )
    return sorted(flags, key=lambda flag: flag.statistic, reverse=True)
201
+
202
+
203
def routing_health(rows: list[DecisionRecord]) -> dict[str, float | int]:
    """Summarise the decision stream as a dict of health rates.

    ``feedback_coverage`` is the share of recommendations that ever received
    feedback — the statistic that decides whether the calibration/MNAR machinery is
    fit for purpose. All rates are rounded to 4 decimals; with no rows every rate
    is 0.0 and ``recommendations`` is 0.
    """
    total = len(rows)
    if total == 0:
        rate_keys = (
            "feedback_coverage",
            "late_feedback_share",
            "escalation_rate",
            "exploration_share",
            "epsilon_policy_share",
            "success_rate",
            "top_model_share",
            "cheapest_model_share",
            "cost_position",
            "shadow_agreement",
        )
        return {"recommendations": 0, **dict.fromkeys(rate_keys, 0.0)}

    def _share(count: int, denominator: int) -> float:
        return round(count / denominator, 4) if denominator else 0.0

    reconciled = len([r for r in rows if r.reconciled])
    late = len([r for r in rows if r.late_feedback])
    escalated = len([r for r in rows if r.escalated])
    successes = len([r for r in rows if r.realized_outcome == "success"])
    # exploration_share = picks actually changed by the epsilon branch (~epsilon when
    # active); epsilon_policy_share = share of decisions where exploration was possible.
    explored = len([r for r in rows if r.explored])
    epsilon_policy = len([r for r in rows if r.policy == "epsilon_softmax"])
    top_share, cheapest_share, cost_position = _cost_metrics(rows)

    health: dict[str, float | int] = {"recommendations": total}
    health["feedback_coverage"] = _share(reconciled, total)
    health["late_feedback_share"] = _share(late, reconciled)
    health["escalation_rate"] = _share(escalated, total)
    health["exploration_share"] = _share(explored, total)
    health["epsilon_policy_share"] = _share(epsilon_policy, total)
    # success_rate over reconciled rows — pair with cost_position for the Pareto view.
    health["success_rate"] = _share(successes, reconciled)
    # Routing-optimality signals over the candidate price ladder:
    #   top_model_share      — share picking the MOST expensive candidate (collapse
    #                          signal, arXiv 2602.03478).
    #   cheapest_model_share — share picking the CHEAPEST candidate (aggressive saving).
    #   cost_position        — mean normalized position 0=cheapest .. 1=priciest. The
    #                          honest online "how far up the price ladder do we
    #                          routinely pick" number; pair with success_rate (true
    #                          regret-vs-oracle needs counterfactuals — that lives in
    #                          the offline RouterBench eval).
    health["top_model_share"] = top_share
    health["cheapest_model_share"] = cheapest_share
    health["cost_position"] = cost_position
    # Share of decisions where the advisory shadow bandit agreed with the deployed pick
    # (over rows that logged a shadow pick). Low agreement => the policies diverge; pair
    # with offline regret before considering promotion.
    health["shadow_agreement"] = _shadow_agreement(rows)
    return health
258
+
259
+
260
+ def _shadow_agreement(rows: list[DecisionRecord]) -> float:
261
+ counted = agree = 0
262
+ for r in rows:
263
+ if r.shadow_chosen_model_id is None:
264
+ continue
265
+ counted += 1
266
+ if r.shadow_chosen_model_id == r.chosen_model_id:
267
+ agree += 1
268
+ return round(agree / counted, 4) if counted else 0.0
269
+
270
+
271
+ def _cost_metrics(rows: list[DecisionRecord]) -> tuple[float, float, float]:
272
+ """(top_model_share, cheapest_model_share, mean cost_position) over rows with candidates."""
273
+ counted = picked_top = picked_cheap = 0
274
+ position_sum = 0.0
275
+ for r in rows:
276
+ if not r.candidates:
277
+ continue
278
+ counted += 1
279
+ costs = [c.est_cost_usd for c in r.candidates]
280
+ lo, hi = min(costs), max(costs)
281
+ chosen = next((c for c in r.candidates if c.model_id == r.chosen_model_id), None)
282
+ if chosen is None:
283
+ continue
284
+ if chosen.est_cost_usd >= hi - 1e-12:
285
+ picked_top += 1
286
+ if chosen.est_cost_usd <= lo + 1e-12:
287
+ picked_cheap += 1
288
+ position_sum += (chosen.est_cost_usd - lo) / (hi - lo) if hi > lo else 0.0
289
+ if not counted:
290
+ return 0.0, 0.0, 0.0
291
+ return (
292
+ round(picked_top / counted, 4),
293
+ round(picked_cheap / counted, 4),
294
+ round(position_sum / counted, 4),
295
+ )
296
+
297
+
298
+ # --------------------------------------------------------------------------- calibration FIT
299
+ # The reports above MEASURE calibration; the machinery below FITS a monotonic remap that the
300
+ # recommender applies to predicted_success before the tau-clearing decision, so a "0.7" really
301
+ # means ~70% realized success. Isotonic regression (pool-adjacent-violators) is non-parametric
302
+ # and monotonic; we shrink it toward the identity by n / (n + k) so a sparse slice barely moves
303
+ # (the same hierarchical-shrinkage instinct as ``calibration_by_task_type``). Pure stdlib — no
304
+ # numpy/sklearn — to stay on the recommend() hot path's dependency budget.
305
+
306
+
307
def _isotonic_pav(pairs: list[tuple[float, float]]) -> tuple[list[float], list[float]]:
    """Pool-adjacent-violators isotonic regression of y on x.

    Points that share the same x are pooled into one weighted point before the PAV
    sweep. Without that, the fit depends on the input order of tied points and can
    emit several blocks with the same right edge, of which a left-bisect lookup only
    ever sees the first. Input without ties is fitted exactly as a plain PAV sweep.

    Args:
        pairs: ``(x, y)`` observations in any order.

    Returns:
        ``(xs, ys)`` where ``xs`` are block right-edges (strictly ascending) and
        ``ys`` the block means (non-decreasing, clipped to [0, 1]) — a monotonic step
        function. Both lists are empty when there are no pairs.
    """
    pts = sorted(pairs, key=lambda t: t[0])
    if not pts:
        return [], []

    def _mean(b: list[float]) -> float:
        return b[0] / b[1]

    # Pass 1 — pool ties. Each entry: [sum_y, count, x].
    pooled: list[list[float]] = []
    for x, y in pts:
        if pooled and pooled[-1][2] == x:
            pooled[-1][0] += y
            pooled[-1][1] += 1.0
        else:
            pooled.append([y, 1.0, x])

    # Pass 2 — PAV sweep. Each block: [sum_y, count, right_edge_x].
    blocks: list[list[float]] = []
    for point in pooled:
        blocks.append(point)
        # Merge backwards while the newest block does not exceed its left neighbour.
        while len(blocks) >= 2 and _mean(blocks[-2]) >= _mean(blocks[-1]):
            sy2, c2, x2 = blocks.pop()
            sy1, c1, x1 = blocks.pop()
            blocks.append([sy1 + sy2, c1 + c2, max(x1, x2)])
    xs = [b[2] for b in blocks]
    ys = [clamp01(b[0] / b[1]) for b in blocks]
    return xs, ys
331
+
332
+
333
@dataclass(slots=True)
class IsotonicCalibrator:
    """A monotonic predicted->realized step map, blended with the identity.

    ``xs`` holds ascending block right-edges and ``ys`` the matching block values.
    ``weight`` is the trust placed in the fitted map; the remainder goes to the raw
    prediction, so a low ``weight`` barely moves the prediction.
    """

    xs: list[float]
    ys: list[float]
    weight: float  # shrinkage toward identity: n / (n + k), in [0, 1]
    n: int

    def transform(self, p: float) -> float:
        """Map a raw prediction ``p`` to its calibrated value, clipped to [0, 1]."""
        if not self.xs:
            # Nothing fitted: behave as the (clipped) identity.
            return clamp01(p)
        # First block whose right edge is >= p; beyond the last edge, reuse the last block.
        last = len(self.ys) - 1
        step = min(bisect.bisect_left(self.xs, p), last)
        blended = self.weight * self.ys[step] + (1.0 - self.weight) * p
        return clamp01(blended)
350
+
351
+
352
+ @dataclass(slots=True)
353
+ class CalibratorSet:
354
+ """Per-task_type calibrators with a global fallback; identity when a slice is unknown."""
355
+
356
+ by_task_type: dict[str, IsotonicCalibrator]
357
+ global_map: IsotonicCalibrator | None
358
+ fitted_at: float
359
+ n: int
360
+
361
+ def transform(self, task_type: str, p: float) -> float:
362
+ m = self.by_task_type.get(task_type) or self.global_map
363
+ return m.transform(p) if m is not None else clamp01(p)
364
+
365
+
366
+ def _raw_label_pairs(rows: list[DecisionRecord]) -> list[tuple[float, float, str]]:
367
+ """(raw_predicted_chosen, realized_label, task_type) over reconciled rows."""
368
+ out: list[tuple[float, float, str]] = []
369
+ for r in rows:
370
+ if not r.reconciled:
371
+ continue
372
+ raw = r.raw_predicted_success_chosen
373
+ if raw is None:
374
+ continue
375
+ label = 1.0 if r.realized_outcome == "success" else 0.0
376
+ out.append((raw, label, r.task_type))
377
+ return out
378
+
379
+
380
def _fit_one(pairs: list[tuple[float, float]], shrinkage_k: float) -> IsotonicCalibrator | None:
    """Fit one isotonic calibrator from ``(predicted, label)`` pairs.

    Returns ``None`` when there are no pairs. Otherwise the calibrator's weight is
    ``n / (n + shrinkage_k)``, so fewer pairs leave the map closer to the identity.
    """
    if not pairs:
        return None
    count = len(pairs)
    edges, levels = _isotonic_pav(pairs)
    return IsotonicCalibrator(
        xs=edges,
        ys=levels,
        weight=count / (count + shrinkage_k),
        n=count,
    )
387
+
388
+
389
def fit_calibrators(
    rows: list[DecisionRecord],
    *,
    min_n: int,
    shrinkage_k: float,
    now: float,
) -> CalibratorSet | None:
    """Fit the global calibrator and one per sufficiently populated task_type.

    Returns ``None`` (meaning: identity everywhere) when fewer than ``min_n``
    reconciled pairs exist overall. A task_type gets its own map only when its slice
    alone reaches ``min_n`` pairs; every other slice falls back to the global map.
    ``now`` is stored as the set's ``fitted_at``.
    """
    observations = _raw_label_pairs(rows)
    total = len(observations)
    if total < min_n:
        return None

    global_map = _fit_one([(raw, label) for raw, label, _ in observations], shrinkage_k)

    pairs_by_type: dict[str, list[tuple[float, float]]] = {}
    for raw, label, task_type in observations:
        pairs_by_type.setdefault(task_type, []).append((raw, label))

    by_type: dict[str, IsotonicCalibrator] = {}
    for task_type, slice_pairs in pairs_by_type.items():
        if len(slice_pairs) < min_n:
            continue  # too sparse: this slice will use the global map
        calibrator = _fit_one(slice_pairs, shrinkage_k)
        if calibrator is not None:
            by_type[task_type] = calibrator

    return CalibratorSet(
        by_task_type=by_type,
        global_map=global_map,
        fitted_at=now,
        n=total,
    )