minima-cli 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. minima/__init__.py +5 -0
  2. minima/api/__init__.py +1 -0
  3. minima/api/auth.py +39 -0
  4. minima/api/errors.py +40 -0
  5. minima/api/routers/__init__.py +1 -0
  6. minima/api/routers/calibration.py +50 -0
  7. minima/api/routers/feedback.py +279 -0
  8. minima/api/routers/health.py +50 -0
  9. minima/api/routers/models.py +42 -0
  10. minima/api/routers/recommend.py +66 -0
  11. minima/api/routers/savings.py +55 -0
  12. minima/api/routers/strategies.py +33 -0
  13. minima/catalog/__init__.py +1 -0
  14. minima/catalog/data/capability_priors.json +210 -0
  15. minima/catalog/data/model_aliases.json +12 -0
  16. minima/catalog/merge.py +69 -0
  17. minima/catalog/refresh.py +54 -0
  18. minima/catalog/sources/__init__.py +1 -0
  19. minima/catalog/sources/litellm.py +19 -0
  20. minima/catalog/sources/openrouter.py +25 -0
  21. minima/catalog/store.py +86 -0
  22. minima/config.py +288 -0
  23. minima/deps.py +35 -0
  24. minima/llm/__init__.py +1 -0
  25. minima/llm/anthropic.py +106 -0
  26. minima/llm/base.py +196 -0
  27. minima/llm/gemini.py +124 -0
  28. minima/llm/registry.py +54 -0
  29. minima/logging.py +28 -0
  30. minima/main.py +109 -0
  31. minima/memory/__init__.py +1 -0
  32. minima/memory/adapter.py +572 -0
  33. minima/memory/keys.py +83 -0
  34. minima/memory/records.py +190 -0
  35. minima/memory/threadpool.py +41 -0
  36. minima/metrics/__init__.py +1 -0
  37. minima/metrics/calibration.py +415 -0
  38. minima/metrics/report.py +116 -0
  39. minima/metrics/savings.py +98 -0
  40. minima/recommender/__init__.py +1 -0
  41. minima/recommender/_pg_pool.py +38 -0
  42. minima/recommender/_redis_client.py +32 -0
  43. minima/recommender/aggregate.py +157 -0
  44. minima/recommender/classify.py +165 -0
  45. minima/recommender/decisionlog.py +505 -0
  46. minima/recommender/durablerefs.py +312 -0
  47. minima/recommender/engine.py +997 -0
  48. minima/recommender/escalation.py +83 -0
  49. minima/recommender/propensity.py +189 -0
  50. minima/recommender/recstore.py +368 -0
  51. minima/recommender/score.py +318 -0
  52. minima/recommender/types.py +166 -0
  53. minima/schemas/__init__.py +1 -0
  54. minima/schemas/common.py +73 -0
  55. minima/schemas/feedback.py +34 -0
  56. minima/schemas/models_catalog.py +36 -0
  57. minima/schemas/recommend.py +104 -0
  58. minima/schemas/savings.py +39 -0
  59. minima/schemas/strategies.py +57 -0
  60. minima/schemas/workflow.py +43 -0
  61. minima/seeding/__init__.py +1 -0
  62. minima/seeding/items.py +42 -0
  63. minima/seeding/llmrouterbench.py +232 -0
  64. minima/seeding/routerbench.py +141 -0
  65. minima/seeding/run_seed.py +56 -0
  66. minima/seeding/synthetic.py +70 -0
  67. minima/tenancy/__init__.py +8 -0
  68. minima/tenancy/context.py +37 -0
  69. minima/tenancy/passthrough.py +110 -0
  70. minima/version.py +3 -0
  71. minima_cli-0.4.9.dist-info/METADATA +275 -0
  72. minima_cli-0.4.9.dist-info/RECORD +161 -0
  73. minima_cli-0.4.9.dist-info/WHEEL +4 -0
  74. minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
  75. minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
  76. minima_client/__init__.py +19 -0
  77. minima_client/autocapture.py +101 -0
  78. minima_client/client.py +301 -0
  79. minima_client/errors.py +23 -0
  80. minima_harness/LICENSE_PI +32 -0
  81. minima_harness/__init__.py +16 -0
  82. minima_harness/agent/__init__.py +72 -0
  83. minima_harness/agent/agent.py +276 -0
  84. minima_harness/agent/events.py +124 -0
  85. minima_harness/agent/loop.py +311 -0
  86. minima_harness/agent/state.py +79 -0
  87. minima_harness/agent/tools.py +97 -0
  88. minima_harness/ai/__init__.py +66 -0
  89. minima_harness/ai/compat.py +71 -0
  90. minima_harness/ai/errors.py +96 -0
  91. minima_harness/ai/events.py +117 -0
  92. minima_harness/ai/openrouter_catalog.py +153 -0
  93. minima_harness/ai/provider_catalog.py +299 -0
  94. minima_harness/ai/provider_quirks.py +37 -0
  95. minima_harness/ai/providers/__init__.py +75 -0
  96. minima_harness/ai/providers/_common.py +48 -0
  97. minima_harness/ai/providers/anthropic.py +290 -0
  98. minima_harness/ai/providers/base.py +65 -0
  99. minima_harness/ai/providers/faux.py +173 -0
  100. minima_harness/ai/providers/google.py +221 -0
  101. minima_harness/ai/providers/openai_compat.py +278 -0
  102. minima_harness/ai/registry.py +184 -0
  103. minima_harness/ai/stream.py +82 -0
  104. minima_harness/ai/tools.py +51 -0
  105. minima_harness/ai/types.py +204 -0
  106. minima_harness/ai/usage.py +41 -0
  107. minima_harness/minima/__init__.py +40 -0
  108. minima_harness/minima/cache.py +102 -0
  109. minima_harness/minima/config.py +85 -0
  110. minima_harness/minima/goals.py +226 -0
  111. minima_harness/minima/judge.py +144 -0
  112. minima_harness/minima/mapping.py +147 -0
  113. minima_harness/minima/meter.py +143 -0
  114. minima_harness/minima/router.py +220 -0
  115. minima_harness/minima/runtime.py +544 -0
  116. minima_harness/minima/signals.py +195 -0
  117. minima_harness/session/__init__.py +14 -0
  118. minima_harness/session/format.py +35 -0
  119. minima_harness/session/store.py +236 -0
  120. minima_harness/tasks/__init__.py +17 -0
  121. minima_harness/tasks/task_set.py +78 -0
  122. minima_harness/tools/__init__.py +7 -0
  123. minima_harness/tools/_io.py +34 -0
  124. minima_harness/tools/bash.py +70 -0
  125. minima_harness/tools/builtin.py +23 -0
  126. minima_harness/tools/edit.py +50 -0
  127. minima_harness/tools/find.py +38 -0
  128. minima_harness/tools/grep.py +73 -0
  129. minima_harness/tools/ls.py +35 -0
  130. minima_harness/tools/read.py +38 -0
  131. minima_harness/tools/tasks.py +75 -0
  132. minima_harness/tools/write.py +36 -0
  133. minima_harness/tui/__init__.py +3 -0
  134. minima_harness/tui/analytics.py +111 -0
  135. minima_harness/tui/app.py +1927 -0
  136. minima_harness/tui/bridge.py +103 -0
  137. minima_harness/tui/cli.py +227 -0
  138. minima_harness/tui/clipboard.py +60 -0
  139. minima_harness/tui/commands.py +49 -0
  140. minima_harness/tui/compaction.py +17 -0
  141. minima_harness/tui/config_cli.py +141 -0
  142. minima_harness/tui/config_store.py +237 -0
  143. minima_harness/tui/context.py +93 -0
  144. minima_harness/tui/customize.py +95 -0
  145. minima_harness/tui/diff.py +53 -0
  146. minima_harness/tui/editor.py +43 -0
  147. minima_harness/tui/extensions.py +84 -0
  148. minima_harness/tui/extra_models.py +52 -0
  149. minima_harness/tui/history.py +71 -0
  150. minima_harness/tui/mubit.py +295 -0
  151. minima_harness/tui/overlays.py +593 -0
  152. minima_harness/tui/packages.py +59 -0
  153. minima_harness/tui/run_modes.py +66 -0
  154. minima_harness/tui/theme.py +77 -0
  155. minima_harness/tui/welcome.py +83 -0
  156. minima_harness/tui/widgets/__init__.py +3 -0
  157. minima_harness/tui/widgets/banner.py +38 -0
  158. minima_harness/tui/widgets/editor.py +83 -0
  159. minima_harness/tui/widgets/footer.py +73 -0
  160. minima_harness/tui/widgets/messages.py +151 -0
  161. minima_harness/tui/widgets/status.py +57 -0
@@ -0,0 +1,997 @@
1
+ """The recommendation orchestrator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import math
7
+ import random
8
+ import time
9
+ import uuid
10
+
11
+ from minima.catalog.store import CatalogStore
12
+ from minima.config import Settings
13
+ from minima.llm.base import CandidateView, Reasoner
14
+ from minima.logging import get_logger
15
+ from minima.memory.adapter import Memory
16
+ from minima.memory.keys import build_content, salient_signature, task_cluster, task_fingerprint
17
+ from minima.memory.records import clamp01
18
+ from minima.metrics.calibration import CalibratorSet, fit_calibrators
19
+ from minima.recommender import escalation, score
20
+ from minima.recommender.aggregate import aggregate_by_model, apply_ipw
21
+ from minima.recommender.classify import classify, classify_from_neighbors
22
+ from minima.recommender.decisionlog import CandidateSnapshot, DecisionLog, DecisionRecord
23
+ from minima.recommender.durablerefs import DurableRefs
24
+ from minima.recommender.propensity import Propensity, PropensityTracker
25
+ from minima.recommender.recstore import RecStore, StoredRecommendation
26
+ from minima.recommender.types import CandidateScore, ModelAggregate
27
+ from minima.schemas.common import DecisionBasis, Difficulty, TaskType
28
+ from minima.schemas.models_catalog import ModelCard
29
+ from minima.schemas.recommend import (
30
+ EvidenceRef,
31
+ RankedModel,
32
+ RecommendRequest,
33
+ RecommendResponse,
34
+ )
35
+
36
+ log = get_logger("minima.recommender")
37
+
38
# Any positive recalled-outcome mass makes a candidate's prediction "memory-driven";
# the confidence field separately conveys how strong that evidence is.
# NOTE(review): not referenced in the part of the module visible here — presumably
# compared against an aggregate's evidence weight when the decision basis is chosen;
# confirm at the use site.
MEMORY_WEIGHT_MIN = 0.0
# Max neighbors echoed back per candidate in the explained response.
# NOTE(review): the cap is applied outside the visible part of the module — confirm
# where (likely the response-building helper).
MAX_EVIDENCE_PER_CANDIDATE = 5
43
+
44
+
45
class NoCandidatesError(ValueError):
    """Signal that no catalog model survived the request's constraints.

    Subclasses ``ValueError``, so an ``except ValueError`` handler also catches it.
    """
47
+
48
+
49
+ class Recommender:
50
def __init__(
    self,
    settings: Settings,
    memory: Memory,
    catalog_store: CatalogStore,
    recstore: RecStore,
    reasoner: Reasoner | None = None,
    propensity: Propensity | None = None,
    decision_log: DecisionLog | None = None,
    org_id: str = "default",
    rng: random.Random | None = None,
    durable_refs: DurableRefs | None = None,
):
    """Wire up the collaborators and resolve per-org selection-policy flags.

    ``propensity`` and ``rng`` fall back to a fresh ``PropensityTracker`` and a fresh
    ``random.Random`` when not supplied. The epsilon and Thompson selection policies
    are opt-in: each is enabled only when ``org_id`` appears in the matching
    comma-separated settings list.
    """

    def _org_listed(csv: str) -> bool:
        # Entries are whitespace-trimmed and blanks dropped before the membership test.
        trimmed = (entry.strip() for entry in csv.split(","))
        return org_id in {entry for entry in trimmed if entry}

    # Collaborators, stored as given.
    self._settings = settings
    self._memory = memory
    self._catalog_store = catalog_store
    self._recstore = recstore
    self._reasoner = reasoner
    self._propensity = propensity if propensity else PropensityTracker()
    self._decision_log = decision_log
    self._org_id = org_id
    self._durable_refs = durable_refs
    self._rng = rng if rng else random.Random()  # noqa: S311 — exploration sampling, not crypto

    # Per-org opt-ins for the stochastic selection policies.
    self._epsilon_enabled = _org_listed(settings.minima_epsilon_selection_orgs)
    self._thompson_enabled = _org_listed(settings.minima_thompson_selection_orgs)

    # Lazily-fit, cached calibrator (org-scoped via this Recommender's decision log).
    self._calibrators: CalibratorSet | None = None
    self._calibrators_fitted_at: float = 0.0
84
+
85
async def recommend(self, req: RecommendRequest) -> RecommendResponse:
    """Pick a model for ``req``, persist the decision, and return the ranked response.

    Steps, in order: classify the task, select catalog candidates, recall outcome
    evidence from memory, aggregate and score the candidates, apply the caller's
    cost/latency constraints, optionally consult the reasoner when escalation is
    suggested, apply the org's selection policy (argmin, Thompson, or
    epsilon-softmax), then store the recommendation and write the decision log.

    Non-fatal conditions are reported through the response's ``warnings`` list
    rather than raised.

    Raises:
        NoCandidatesError: when candidate selection returns no models.
    """
    started = time.monotonic()
    settings = self._settings
    warnings: list[str] = []

    # Heuristic classification first; the reasoner may refine it (best-effort).
    task_type, difficulty = classify(req.task)
    task_type, difficulty = await self._maybe_llm_classify(req, task_type, difficulty, warnings)
    # A salient signature is only part of the cluster key in "fine" granularity.
    signature = (
        salient_signature(req.task.task, settings.minima_cluster_signature_tokens)
        if settings.minima_cluster_granularity.lower() == "fine"
        else None
    )
    cluster = task_cluster(task_type, difficulty, signature)
    fingerprint = task_fingerprint(req.task.task)
    lane = settings.lane(req.namespace)

    catalog = self._catalog_store.get()
    candidates = _select_candidates(catalog.cards, req, task_type, req.max_candidates)
    if not candidates:
        raise NoCandidatesError("no models match the supplied constraints")
    candidate_ids = {c.model_id for c in candidates}

    recall, fastpath_evidence = await self._recall_with_fastpath(
        req=req, lane=lane, cluster=cluster, candidate_ids=candidate_ids
    )
    # A timeout takes precedence over a generic memory error in the warnings.
    if recall.timed_out:
        warnings.append("recall_timeout")
    elif recall.error:
        warnings.append("memory_unavailable")
    evidence = recall.outcome_evidence + fastpath_evidence

    # Neighbor-vote refinement: if the heuristic couldn't place the task, let the
    # ANN-recalled semantic neighbors vote on its type (free; the cluster key then
    # becomes coherent for scoring + the stored outcome). Caller-supplied types win.
    if (
        req.task.task_type is None
        and task_type == TaskType.other
        and settings.minima_neighbor_classify
        and evidence
    ):
        voted = classify_from_neighbors(
            [(ev.record.task_type, ev.score) for ev in evidence if ev.record is not None]
        )
        if voted is not None and voted != task_type:
            task_type = voted
            # Rebuild the cluster key so everything downstream uses the voted type.
            cluster = task_cluster(task_type, difficulty, signature)
            warnings.append("neighbor_classified")

    # Remember durable-record ids surfaced by recall so the fast path can
    # dereference them next time (live records only — seeds are per-row inserts,
    # not the durable (cluster, model) upsert). Bookkeeping only: a store failure
    # must never break the recommendation.
    if self._durable_refs is not None:
        try:
            for ev in recall.outcome_evidence:
                rec = ev.record
                if (
                    rec is not None
                    and rec.task_cluster == cluster
                    and rec.source_dataset is None
                    and (ev.reference_id or ev.referenceable)
                ):
                    self._durable_refs.upsert(
                        lane, cluster, rec.model_id, ev.entry_id, ev.reference_id or ""
                    )
        except Exception as exc:  # noqa: BLE001
            log.warning("durable_ref_upsert_failed", error=str(exc))

    aggregates = aggregate_by_model(
        evidence,
        candidate_ids,
        half_life_days=settings.minima_evidence_half_life_days,
        decay_floor=settings.minima_evidence_decay_floor,
        seed_weight=settings.minima_seed_weight,
        seed_crowdout_n=settings.minima_seed_crowdout_n,
    )
    if settings.minima_ipw_enabled and aggregates:
        apply_ipw(
            aggregates,
            self._propensity.propensities(lane, cluster, candidate_ids),
            settings.minima_ipw_clip_low,
            settings.minima_ipw_clip_high,
        )

    # Caller-supplied token estimates win; otherwise use the configured defaults,
    # with the output default scaled by the per-difficulty multiplier (1.0 if unset).
    input_tokens = req.task.expected_input_tokens or settings.minima_default_input_tokens
    output_tokens = req.task.expected_output_tokens or int(
        settings.minima_default_output_tokens
        * settings.minima_difficulty_output_multipliers.get(difficulty.value, 1.0)
    )
    scored = self._score_candidates(
        candidates, aggregates, task_type, input_tokens, output_tokens, req
    )
    # Premium counterfactual baseline, captured BEFORE the cost/latency filters
    # shrink the set — otherwise the baseline itself would shift with the caller's
    # constraints and savings would not be comparable across requests.
    est_cost_premium = max((c.est_cost_usd for c in scored), default=0.0)

    # The cost budget is soft: if nothing fits, keep the full set and warn.
    if req.constraints.max_cost_per_call is not None:
        affordable = [c for c in scored if c.est_cost_usd <= req.constraints.max_cost_per_call]
        if affordable:
            scored = affordable
        else:
            warnings.append("no_model_within_cost_budget")

    if req.constraints.max_latency_ms is not None:
        # Only exclude candidates with OBSERVED latency evidence above the budget —
        # a model is never condemned without data (its est_latency_ms stays None).
        within = [
            c
            for c in scored
            if c.est_latency_ms is None or c.est_latency_ms <= req.constraints.max_latency_ms
        ]
        if within:
            scored = within
        else:
            # Soft budget, same as cost: keep the current set and warn.
            warnings.append("no_model_within_latency_budget")

    tau = score.threshold_from_slider(
        req.cost_quality_tradeoff,
        settings.minima_tau_min,
        settings.minima_tau_max,
        req.constraints.min_quality,
    )

    recommended, fallback, ranked, opt_warnings = self._finalize(
        scored, tau, req.cost_quality_tradeoff
    )
    overall_basis = recommended.decision_basis

    esc = escalation.evaluate(
        settings=settings,
        allow=req.allow_llm_escalation,
        total_weight=sum(a.weight_sum for a in aggregates.values()),
        distinct_models_with_evidence=sum(1 for a in aggregates.values() if a.weight_sum > 0),
        recommended_confidence=recommended.confidence,
        ranked=ranked,
        aggregates=aggregates,
        recommended_interval_width=score.posterior_interval_width(
            aggregates.get(recommended.card.model_id),
            score.capability_prior(recommended.card, task_type),
            settings.minima_beta_pseudocount,
        ),
        recommended_predicted_success=recommended.predicted_success,
        tau=tau,
    )
    if esc.should_escalate:
        warnings.extend(f"escalation_suggested:{reason}" for reason in esc.reasons)
        if self._reasoner is not None and settings.reasoner_enabled:
            consulted = await self._consult_reasoner(
                scored=scored, task_type=task_type, difficulty=difficulty, lane=lane, req=req
            )
            if consulted:
                # The reasoner adjusted candidates in `scored`; re-rank with the
                # blended predictions. This replaces the earlier opt_warnings too.
                recommended, fallback, ranked, opt_warnings = self._finalize(
                    scored, tau, req.cost_quality_tradeoff
                )
                overall_basis = DecisionBasis.llm
                warnings.append("reasoner_consulted")
            else:
                warnings.append("reasoner_failed")
        else:
            warnings.append("reasoner_disabled")
    # Emitted once, from whichever _finalize call produced the final ranking.
    warnings.extend(opt_warnings)

    if not evidence:
        warnings.append("cold_start")
    if catalog.stale:
        warnings.append("prices_stale")

    # Selection policy: deterministic argmin everywhere; epsilon-softmax over the
    # tau-ELIGIBLE set for opted-in orgs (the safety floor is eligibility itself).
    # The propensity vector is logged either way so off-policy evaluation can tell
    # a degenerate (deterministic) log from a stochastic one.
    selection_policy = "argmin"
    explored_pick = False
    sel_propensities: dict[str, float] = dict.fromkeys(
        (c.card.model_id for c in ranked), 0.0
    )
    sel_propensities[recommended.card.model_id] = 1.0
    # Thompson takes precedence over epsilon when an org is opted into both.
    if self._thompson_enabled and len(scored) >= 2:
        # Posterior-sampling selection: sample each candidate's success, pick cheapest
        # clearing tau under the sample. MC frequencies are the logged propensities.
        selection_policy = "thompson"
        items = [(c.card.model_id, c.alpha, c.beta, c.est_cost_usd) for c in scored]
        pick_id, pi = score.thompson_select(
            items, tau, self._rng, settings.minima_thompson_samples
        )
        sel_propensities = dict.fromkeys((c.card.model_id for c in ranked), 0.0)
        sel_propensities.update(pi)
        if pick_id and pick_id != recommended.card.model_id:
            sampled = next((c for c in scored if c.card.model_id == pick_id), None)
            if sampled is not None:
                fallback = recommended  # the deterministic pick is the natural retry
                recommended = sampled
                overall_basis = recommended.decision_basis
                explored_pick = True
                warnings.append("thompson_pick")
    elif self._epsilon_enabled:
        eligible = [c for c in ranked if c.predicted_success >= tau]
        # With fewer than two eligible candidates the policy stays "argmin".
        if len(eligible) >= 2:
            selection_policy = "epsilon_softmax"
            argmin_id = recommended.card.model_id
            pi = score.softmax_propensities(
                {c.card.model_id: c.score for c in eligible},
                argmin_id,
                settings.minima_epsilon,
                settings.minima_epsilon_softmax_temperature,
            )
            sel_propensities.update(pi)
            sampled = self._maybe_explore(eligible, argmin_id)
            if sampled is not None and sampled.card.model_id != argmin_id:
                fallback = recommended  # the deterministic pick is the natural retry
                recommended = sampled
                overall_basis = recommended.decision_basis
                explored_pick = True
                warnings.append("exploration_pick")

    # Record the model actually returned (after any exploration override).
    self._propensity.record(lane, cluster, recommended.card.model_id)

    # Advisory shadow bandit: log what a UCB policy WOULD pick (never overrides).
    shadow_pick: str | None = None
    if settings.minima_shadow_bandit and ranked:
        shadow_pick = _shadow_pick(
            ranked, req.cost_quality_tradeoff, settings.minima_shadow_ucb_alpha
        )
        if shadow_pick is not None and shadow_pick != recommended.card.model_id:
            warnings.append("shadow_disagree")

    recommendation_id = uuid.uuid4().hex
    # Store the recommendation with the (entry_id, reference_id) pairs of the
    # evidence behind each model's aggregate.
    self._recstore.put(
        StoredRecommendation(
            recommendation_id=recommendation_id,
            lane=lane,
            user_id=req.user_id,
            task_type=task_type.value,
            difficulty=difficulty.value,
            task_cluster=cluster,
            task_fingerprint=fingerprint,
            content=build_content(task_type.value, difficulty.value, req.task.task),
            env_tags=list(req.task.tags or []),
            recommended_model_id=recommended.card.model_id,
            neighbors_by_model={
                mid: [(ev.entry_id, ev.reference_id) for ev in agg.evidence]
                for mid, agg in aggregates.items()
            },
        )
    )
    self._log_decision(
        recommendation_id=recommendation_id,
        req=req,
        lane=lane,
        cluster=cluster,
        task_type=task_type,
        difficulty=difficulty,
        fingerprint=fingerprint,
        tau=tau,
        selection_policy=selection_policy,
        explored_pick=explored_pick,
        sel_propensities=sel_propensities,
        recommended=recommended,
        ranked=ranked,
        esc=esc,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        est_cost_premium=est_cost_premium,
        shadow_chosen_model_id=shadow_pick,
    )

    confidence = _overall_confidence(overall_basis, recommended.confidence)
    return RecommendResponse(
        recommendation_id=recommendation_id,
        recommended_model=_to_ranked_model(recommended, req.explain),
        ranked=[_to_ranked_model(c, req.explain) for c in ranked],
        fallback_model=_to_ranked_model(fallback, req.explain) if fallback else None,
        confidence=round(confidence, 4),
        decision_basis=overall_basis,
        threshold_used=round(tau, 4),
        classified_task_type=task_type,
        classified_difficulty=difficulty,
        catalog_version=catalog.version,
        catalog_stale=catalog.stale,
        latency_ms=int((time.monotonic() - started) * 1000),
        warnings=warnings,
        selection_policy=selection_policy,
        recommended_actions=_actions_for(recommended.card),
    )
370
+
371
def _maybe_explore(
    self, eligible: list[CandidateScore], argmin_id: str
) -> CandidateScore | None:
    """Draw the epsilon branch and, if taken, sample one eligible candidate.

    One uniform draw decides the branch: when it is at or above
    ``minima_epsilon`` the deterministic branch applies and ``None`` is returned.
    Otherwise a candidate is sampled with weight ``exp((score - max_score) / T)``,
    where ``T`` is the softmax temperature floored at ``1e-6``. The sampled
    candidate may be the argmin itself; ``argmin_id`` is accepted but not read here.
    """
    cfg = self._settings
    take_deterministic = self._rng.random() >= cfg.minima_epsilon
    if take_deterministic:
        return None

    temperature = max(cfg.minima_epsilon_softmax_temperature, 1e-6)
    # Subtracting the top score keeps every exponent <= 0.
    top_score = max(cand.score for cand in eligible)
    softmax_weights = []
    for cand in eligible:
        softmax_weights.append(math.exp((cand.score - top_score) / temperature))
    drawn = self._rng.choices(eligible, weights=softmax_weights, k=1)
    return drawn[0]
386
+
387
async def _recall_with_fastpath(
    self,
    *,
    req: RecommendRequest,
    lane: str,
    cluster: str,
    candidate_ids: set[str],
):
    """ANN recall joined by a deterministic keyed lookup for the current cluster.

    The lookup (POST /v2/core/lookup) fetches outcome records for all candidate
    models in this cluster straight from storage — no ANN, no flicker. Records
    already returned by ANN are deduped by entry_id so they are never double-counted.

    The old dereference-based fastpath (MINIMA_DURABLE_FASTPATH=on/shadow) is
    retained for backward compatibility and runs concurrently when configured.
    In "shadow" mode dereferenced records are only logged; in "on" mode the ones
    ANN missed are added to the returned evidence.

    Returns:
        ``(recall, extra)`` — the recall result as returned by memory, and the list
        of additional evidence items (keyed-lookup and, in "on" mode, dereferenced)
        that were not already present in ``recall.evidence``.
    """
    settings = self._settings
    mode = settings.minima_durable_fastpath.lower()

    # Read the durable refs BEFORE creating any coroutine: the ref store is
    # bookkeeping, so a failure here degrades to the plain recall + lookup path
    # instead of breaking the recommendation, and no coroutine is left un-awaited.
    refs = []
    if mode in ("shadow", "on") and self._durable_refs is not None:
        try:
            refs = self._durable_refs.refs(
                lane, cluster, settings.minima_durable_fastpath_max_refs
            )
        except Exception as exc:  # noqa: BLE001
            log.warning("durable_ref_read_failed", error=str(exc))
            refs = []

    recall_coro = self._memory.recall(
        query=req.task.task,
        lane=lane,
        user_id=req.user_id,
        limit=settings.minima_memory_recall_limit,
        env_tags=req.task.tags or None,
    )

    # Keyed lookup: one filter clause per candidate model in this cluster.
    # OR-combined on the server; returns all matching non-deleted records.
    lookup_coro = self._memory.lookup(
        lane=lane,
        match=[
            {"kind": "outcome", "task_cluster": cluster, "model_id": mid}
            for mid in candidate_ids
        ],
    )

    if not refs:
        # Fast common path: recall + lookup, no dereferences.
        recall, lookup_evidence = await asyncio.gather(recall_coro, lookup_coro)
        ann_ids = {ev.entry_id for ev in recall.evidence}
        extra = [ev for ev in lookup_evidence if ev.entry_id not in ann_ids]
        if extra:
            log.info(
                "keyed_lookup_delta",
                cluster=cluster,
                added=len(extra),
                models=[ev.record.model_id for ev in extra if ev.record],
            )
        return recall, extra

    # Old dereference fastpath active: run all three concurrently, share the
    # recall latency budget, then merge — lookup results deduped against both
    # ANN and dereference results so no evidence is double-counted.
    started = time.monotonic()
    budget_s = settings.minima_memory_recall_timeout_ms / 1000.0
    recall_task = asyncio.ensure_future(recall_coro)
    lookup_task = asyncio.ensure_future(lookup_coro)
    deref_tasks = [
        asyncio.ensure_future(
            self._memory.dereference(lane=lane, reference_id=r.reference_id or r.entry_id)
        )
        for r in refs
    ]
    try:
        recall = await recall_task
        lookup_evidence = await lookup_task
        # Dereferences only get what is left of the budget (at least 50 ms).
        remaining = max(0.05, budget_s - (time.monotonic() - started))
        done, pending = await asyncio.wait(deref_tasks, timeout=remaining)
    except BaseException:
        # Recall/lookup failed or this request was cancelled: do not leave the
        # other in-flight tasks orphaned. cancel() is a no-op on finished tasks.
        for task in (recall_task, lookup_task, *deref_tasks):
            task.cancel()
        raise
    for task in pending:
        task.cancel()
    if pending:
        log.warning("durable_fastpath_timeout", cluster=cluster, dropped=len(pending))
    # Failed dereferences are dropped silently; only successful results are used.
    derefs = [
        task.result()
        for task in done
        if not task.cancelled() and task.exception() is None
    ]
    fetched = [d for d in derefs if d is not None and d.record is not None]
    ann_ids = {ev.entry_id for ev in recall.evidence}
    missed = [d for d in fetched if d.entry_id not in ann_ids]
    if missed:
        log.info(
            "durable_fastpath_delta",
            mode=mode,
            cluster=cluster,
            fetched=len(fetched),
            ann_missed=len(missed),
            missed_models=[d.record.model_id for d in missed if d.record],
        )
    # Shadow mode measures the delta but never feeds it into scoring.
    deref_extra = missed if mode == "on" else []
    seen_ids = ann_ids | {ev.entry_id for ev in deref_extra}
    lookup_extra = [ev for ev in lookup_evidence if ev.entry_id not in seen_ids]
    if lookup_extra:
        log.info(
            "keyed_lookup_delta",
            cluster=cluster,
            added=len(lookup_extra),
            models=[ev.record.model_id for ev in lookup_extra if ev.record],
        )
    return recall, lookup_extra + deref_extra
493
+
494
def _log_decision(
    self,
    *,
    recommendation_id: str,
    req: RecommendRequest,
    lane: str,
    cluster: str,
    task_type: TaskType,
    difficulty: Difficulty,
    fingerprint: str,
    tau: float,
    selection_policy: str,
    explored_pick: bool,
    sel_propensities: dict[str, float],
    recommended: CandidateScore,
    ranked: list[CandidateScore],
    esc: escalation.EscalationDecision,
    input_tokens: int,
    output_tokens: int,
    est_cost_premium: float,
    shadow_chosen_model_id: str | None = None,
) -> None:
    """Persist the decision row (best-effort — never breaks a recommendation).

    Does nothing when no decision log is configured. Both the declared-baseline
    cost lookup and the log write are guarded: a failure in either is logged as a
    warning and swallowed. If only the baseline lookup fails, the row is still
    written with ``est_cost_baseline_declared=None``.

    One ``CandidateSnapshot`` is written per entry of ``ranked``; a candidate
    missing from ``sel_propensities`` is logged with propensity 0.0.
    """
    if self._decision_log is None:
        return
    settings = self._settings
    # Counterfactual baselines on the same cost basis as the candidate set: premium =
    # the most expensive scored candidate BEFORE constraint filters (mirrors the
    # workflow endpoint's total_est_cost_if_all_premium); declared = the caller's
    # stated default model.
    baseline_cost: float | None = None
    if req.baseline_model_id:
        # Guarded separately so a catalog/cost-estimation failure cannot escape this
        # analytics-only method, and the decision row is still written.
        try:
            in_ranked = next(
                (c for c in ranked if c.card.model_id == req.baseline_model_id), None
            )
            if in_ranked is not None:
                baseline_cost = in_ranked.est_cost_usd
            else:
                # Baseline was filtered out of (or never in) the ranking: price it
                # from its catalog card with the same token estimates.
                card = next(
                    (
                        m
                        for m in self._catalog_store.get().cards
                        if m.model_id == req.baseline_model_id
                    ),
                    None,
                )
                if card is not None:
                    baseline_cost = score.estimate_cost(card, input_tokens, output_tokens)[0]
        except Exception as exc:  # noqa: BLE001 — analytics must never break the hot path
            log.warning("decision_baseline_cost_failed", error=str(exc))
            baseline_cost = None
    try:
        self._decision_log.put(
            DecisionRecord(
                recommendation_id=recommendation_id,
                org_id=self._org_id,
                lane=lane,
                cluster=cluster,
                task_type=task_type.value,
                difficulty=difficulty.value,
                fingerprint=fingerprint,
                ts=time.time(),
                tau=tau,
                policy=selection_policy,
                # NOTE(review): epsilon is also logged for "thompson" rows although
                # that policy does not read it — confirm consumers key off `policy`.
                epsilon=settings.minima_epsilon if selection_policy != "argmin" else 0.0,
                chosen_model_id=recommended.card.model_id,
                escalated=esc.should_escalate,
                shadow_chosen_model_id=shadow_chosen_model_id,
                explored=explored_pick,
                escalation_reasons=list(esc.reasons),
                candidates=[
                    CandidateSnapshot(
                        model_id=c.card.model_id,
                        predicted_success=round(c.predicted_success, 6),
                        confidence=round(c.confidence, 6),
                        est_cost_usd=c.est_cost_usd,
                        propensity=round(sel_propensities.get(c.card.model_id, 0.0), 6),
                        raw_predicted_success=(
                            round(c.raw_predicted_success, 6)
                            if c.raw_predicted_success is not None
                            else None
                        ),
                        est_cost_low=c.est_cost_low,
                        est_cost_high=c.est_cost_high,
                    )
                    for c in ranked
                ],
                est_cost_recommended=recommended.est_cost_usd,
                est_cost_premium=est_cost_premium,
                baseline_model_id=req.baseline_model_id,
                est_cost_baseline_declared=baseline_cost,
                user_id=req.user_id,
                env_tags=list(req.task.tags or []),
                content=build_content(task_type.value, difficulty.value, req.task.task),
            )
        )
    except Exception as exc:  # noqa: BLE001 — analytics must never break the hot path
        log.warning("decision_log_write_failed", error=str(exc))
589
+
590
async def _maybe_llm_classify(
    self,
    req: RecommendRequest,
    task_type: TaskType,
    difficulty: Difficulty,
    warnings: list[str],
) -> tuple[TaskType, Difficulty]:
    """Ask the reasoner to classify a task the heuristic left as ``other``.

    The reasoner is consulted only when all of these hold: reasoner classification
    is switched on, the request allows LLM escalation, the caller supplied no task
    type, the heuristic result is ``TaskType.other``, and an enabled reasoner
    exposing ``classify`` is available. On any miss, on a reasoner exception
    (logged), or on a ``None`` answer, the incoming pair is returned unchanged.
    On success ``"llm_classified"`` is appended to ``warnings`` and the reasoner's
    result is returned as-is.
    """
    unchanged = (task_type, difficulty)
    cfg = self._settings

    # Feature switch and per-request permission.
    if not (cfg.minima_reasoner_classify and req.allow_llm_escalation):
        return unchanged
    # Only refine when nobody — caller or heuristic — has placed the task.
    if not (req.task.task_type is None and task_type == TaskType.other):
        return unchanged
    # A usable reasoner must exist, be enabled, and support classification.
    reasoner = self._reasoner
    if not (reasoner is not None and cfg.reasoner_enabled and hasattr(reasoner, "classify")):
        return unchanged

    try:
        verdict = await reasoner.classify(task=req.task.task)
    except Exception as exc:  # noqa: BLE001
        log.warning("llm_classify_failed", error=str(exc))
        return unchanged

    if verdict is None:
        return unchanged
    warnings.append("llm_classified")
    return verdict
617
+
618
async def _consult_reasoner(
    self,
    *,
    scored: list[CandidateScore],
    task_type: TaskType,
    difficulty: Difficulty,
    lane: str,
    req: RecommendRequest,
) -> bool:
    """Blend the reasoner's per-model success estimates into ``scored`` in place.

    Fetches a memory context block, asks the reasoner to rank the candidates, and
    for every candidate the reasoner ranked: blends ``predicted_success`` toward the
    reasoner's estimate, marks its decision basis as ``DecisionBasis.llm``, and
    adopts the reasoner's rationale when one is given.

    Best-effort: a missing reasoner, a failure while fetching context or ranking
    (logged as a warning), or an empty result all return ``False`` and leave
    ``scored`` untouched, so the caller can fall back to the deterministic ranking.

    Returns:
        ``True`` if at least one candidate was updated, else ``False``.
    """
    reasoner = self._reasoner
    if reasoner is None:
        return False

    try:
        memory_block = await self._memory.get_context(
            query=req.task.task, lane=lane, user_id=req.user_id, max_token_budget=1500
        )
    except Exception as exc:  # noqa: BLE001 — consultation is optional
        log.warning("reasoner_context_failed", error=str(exc))
        return False

    views = [
        CandidateView(
            model_id=c.card.model_id,
            provider=c.card.provider,
            input_cost_per_mtok=c.card.input_cost_per_mtok,
            output_cost_per_mtok=c.card.output_cost_per_mtok,
            context_window=c.card.context_window,
            capability_prior=score.capability_prior(c.card, task_type),
            est_cost_usd=c.est_cost_usd,
            predicted_success=c.predicted_success,
            est_latency_ms=c.est_latency_ms,
        )
        for c in scored
    ]

    try:
        result = await reasoner.rank(
            task=req.task.task,
            task_type=task_type.value,
            difficulty=difficulty.value,
            candidates=views,
            memory_block=memory_block,
            cost_quality_tradeoff=req.cost_quality_tradeoff,
        )
    except Exception as exc:  # noqa: BLE001 — consultation is optional
        log.warning("reasoner_rank_failed", error=str(exc))
        return False

    if not result or not result.rankings:
        return False
    settings = self._settings
    rankings = result.by_model()
    changed = False
    for c in scored:
        ranking = rankings.get(c.card.model_id)
        if ranking is None:
            # The reasoner did not rank this candidate; keep its deterministic score.
            continue
        if settings.minima_reasoner_blend_adaptive:
            # Evidence-mass-adaptive: a candidate backed by heavy deterministic
            # evidence (confidence -> 1) barely moves toward the LLM's estimate; a
            # cold candidate (confidence -> 0) leans on it. Replaces the fixed blend
            # that weighted a 50-outcome aggregate and a guess identically.
            raw = settings.minima_reasoner_blend_max * (1.0 - c.confidence)
            # The blend weight is kept within [0.1, 0.9].
            blend = min(0.9, max(0.1, raw))
        else:
            blend = settings.minima_reasoner_blend
        c.predicted_success = clamp01(
            blend * ranking.predicted_success + (1.0 - blend) * c.predicted_success
        )
        c.decision_basis = DecisionBasis.llm
        if ranking.rationale:
            c.rationale = ranking.rationale
        changed = True
    return changed
678
+
679
def _finalize(
    self, scored: list[CandidateScore], tau: float, cost_quality_tradeoff: float
) -> tuple[CandidateScore, CandidateScore | None, list[CandidateScore], list[str]]:
    """Assign each candidate its ranking score, then run the tau-based selection.

    Costs are normalised by the most expensive candidate before scoring; when that
    maximum is zero (or there are no candidates) the divisor falls back to 1.0.
    The result is whatever ``_optimize`` returns for the configured collapse margin.
    """
    costs = [cand.est_cost_usd for cand in scored]
    ceiling = (max(costs) if costs else 0.0) or 1.0
    for cand in scored:
        cand.score = score.ranking_score(
            cand.predicted_success, cand.est_cost_usd / ceiling, cost_quality_tradeoff
        )
    return _optimize(scored, tau, self._settings.minima_collapse_margin)
688
+
689
+ # --------------------------------------------------------------- calibration
690
def _calibrate(self, task_type_value: str, predicted: float) -> float:
    """Map a raw success estimate through the fitted calibrator, if one is available.

    Returns ``predicted`` unchanged when ``_get_calibrators()`` yields ``None``.
    """
    calibrators = self._get_calibrators()
    return predicted if calibrators is None else calibrators.transform(task_type_value, predicted)
696
+
697
def _get_calibrators(self) -> CalibratorSet | None:
    """Return the cached calibrator set, refitting it first when missing or stale.

    Yields ``None`` when calibration is switched off in settings or no decision
    log is attached. Staleness is measured on the monotonic clock against
    ``minima_calibration_refresh_seconds``.
    """
    cfg = self._settings
    if not cfg.minima_calibration_apply or self._decision_log is None:
        return None

    now = time.monotonic()
    # NOTE(review): while ``_calibrators`` is still None (e.g. after a failed refit)
    # this is true on every call, so the timestamp below does not throttle retries.
    needs_refit = (
        self._calibrators is None
        or now - self._calibrators_fitted_at > cfg.minima_calibration_refresh_seconds
    )
    if needs_refit:
        # Stamp BEFORE refit so concurrent requests for this org don't all refit at once.
        self._calibrators_fitted_at = now
        self._refit_calibrators()
    return self._calibrators
710
+
711
def _refit_calibrators(self) -> None:
    """Rebuild ``self._calibrators`` from recent decision-log rows (best-effort).

    Rows are read from the decision log for the trailing
    ``minima_calibration_window_days`` and passed to ``fit_calibrators``. Any
    exception is logged and swallowed, leaving the previously stored calibrators
    in place.
    """
    cfg = self._settings
    assert self._decision_log is not None
    try:
        window_start = time.time() - cfg.minima_calibration_window_days * 86_400.0
        history = self._decision_log.rows(since=window_start)
        self._calibrators = fit_calibrators(
            history,
            min_n=cfg.minima_calibration_min_n,
            shrinkage_k=cfg.minima_calibration_shrinkage_k,
            now=time.time(),
        )
    except Exception as exc:  # noqa: BLE001 — calibration must never break a recommendation
        log.warning("calibrator_refit_failed", error=str(exc))
726
+
727
def _score_candidates(
    self,
    candidates: list[ModelCard],
    aggregates: dict[str, ModelAggregate],
    task_type: TaskType,
    input_tokens: int,
    output_tokens: int,
    req: RecommendRequest,
) -> list[CandidateScore]:
    """Build one ``CandidateScore`` per candidate card, in input order.

    For each card this combines the capability prior with the card's aggregate
    (looked up in ``aggregates`` by ``model_id``; may be absent) to obtain a
    predicted success, confidence, interval width and Beta parameters, then
    calibrates the prediction and adds the exploration bonus. Cost, cost band and
    latency are estimated via the ``score`` helpers and the aggregate.

    Args:
        candidates: Model cards to score.
        aggregates: Per-model aggregates keyed by ``model_id``; missing keys are
            treated as "no aggregate".
        task_type: Task type used for the capability prior and calibration.
        input_tokens: Input token count passed to the cost helpers.
        output_tokens: Output token count passed to ``score.effective_cost``.
        req: The request; only ``req.constraints.require_prompt_caching`` is read here.

    Returns:
        The list of ``CandidateScore`` objects, one per candidate.
    """
    settings = self._settings
    scored: list[CandidateScore] = []
    min_cost_n = settings.minima_observed_cost_min_n
    # Decide the cost basis ONCE for the whole candidate set so all costs are compared
    # like-for-like (never mix per-request estimates with historical realized costs across
    # candidates). Prefers re-scaled observed output behavior (size-exact + reasoning-aware),
    # then robust observed $/call, else the cache-aware token estimate.
    cost_basis = score.choose_cost_basis(
        {c.model_id: aggregates.get(c.model_id) for c in candidates},
        settings.minima_use_observed_cost,
        req.constraints.require_prompt_caching,
        min_cost_n,
    )
    for card in candidates:
        agg = aggregates.get(card.model_id)
        prior = score.capability_prior(card, task_type)
        predicted, confidence = score.predicted_success(
            agg, prior, settings.minima_beta_pseudocount
        )
        # Keep the uncalibrated, bonus-free estimate; it is reported separately below.
        raw_predicted = predicted
        interval_width = score.posterior_interval_width(
            agg, prior, settings.minima_beta_pseudocount
        )
        alpha, beta = score.beta_params(agg, prior, settings.minima_beta_pseudocount)
        # Calibrate the honest Beta mean to a truthful probability BEFORE the
        # exploration bonus (deliberate optimism) is layered on for the tau decision.
        predicted = self._calibrate(task_type.value, predicted)
        predicted = score.with_exploration_bonus(
            predicted, confidence, settings.minima_exploration_bonus
        )
        # Caching is costed as "on" only when the request requires it AND the card supports it.
        use_cache = req.constraints.require_prompt_caching and card.supports_prompt_caching
        # Otherwise, a configured cached-input fraction is applied only when lever-aware
        # costing is enabled and the card supports caching; in every other case it is 0.0.
        cache_fraction = (
            settings.minima_cost_cache_input_fraction
            if settings.minima_cost_lever_aware
            and card.supports_prompt_caching
            and not use_cache
            else 0.0
        )
        est_cost, breakdown = score.effective_cost(
            card, agg, input_tokens, output_tokens, use_cache, cost_basis, min_cost_n,
            cache_fraction,
        )
        cost_band = score.effective_cost_band(
            card, agg, input_tokens, use_cache, cost_basis, min_cost_n
        )
        # A non-None band is indexed as ((low, high), basis); with no band the bounds are
        # None and the basis label is empty.
        est_cost_low, est_cost_high, cost_band_basis = (
            (cost_band[0][0], cost_band[0][1], cost_band[1])
            if cost_band is not None
            else (None, None, "")
        )
        # Label the quoted cost "obs" when the breakdown mentions an observed/rescaled
        # basis, otherwise "est".
        cost_word = "obs" if ("observed_avg" in breakdown or "rescaled" in breakdown) else "est"
        est_latency = (
            agg.observed_latency_ms(
                settings.minima_latency_min_n, settings.minima_latency_percentile
            )
            if agg is not None
            else None
        )

        # Memory-backed only when the aggregate's weight exceeds MEMORY_WEIGHT_MIN;
        # anything lighter is reported as prior-based.
        if agg is not None and agg.weight_sum > MEMORY_WEIGHT_MIN:
            basis = DecisionBasis.memory
            rationale = (
                f"{agg.n} similar past outcome(s); weighted success "
                f"{agg.weighted_success_rate:.0%}; {cost_word} ${est_cost:.5f}/call"
            )
            evidence = agg.evidence[:MAX_EVIDENCE_PER_CANDIDATE]
        else:
            basis = DecisionBasis.prior
            rationale = (
                f"no memory yet; capability prior {prior:.0%} for {task_type.value}; "
                f"{cost_word} ${est_cost:.5f}/call"
            )
            # A below-threshold aggregate still contributes whatever evidence it has.
            evidence = agg.evidence[:MAX_EVIDENCE_PER_CANDIDATE] if agg else []

        scored.append(
            CandidateScore(
                card=card,
                predicted_success=predicted,
                raw_predicted_success=raw_predicted,
                confidence=confidence,
                est_cost_usd=est_cost,
                est_cost_breakdown=breakdown,
                decision_basis=basis,
                evidence=evidence,
                rationale=rationale,
                interval_width=interval_width,
                alpha=alpha,
                beta=beta,
                est_latency_ms=est_latency,
                # e.g. a percentile setting of 0.9 yields the label "observed_p90".
                latency_basis=(
                    f"observed_p{int(settings.minima_latency_percentile * 100)}"
                    if est_latency is not None
                    else ""
                ),
                est_cost_low=est_cost_low,
                est_cost_high=est_cost_high,
                cost_band_basis=cost_band_basis,
            )
        )
    return scored
836
+
837
+
838
def _shadow_pick(
    scored: list[CandidateScore], cost_quality_tradeoff: float, alpha: float
) -> str | None:
    """Return the model id the UCB shadow policy would choose, or ``None`` if nothing was scored.

    The pick is the candidate with the highest ``score.ucb_score``; costs are
    normalised by the most expensive candidate (divisor 1.0 when that maximum is zero).
    """
    if not scored:
        return None

    ceiling = max((cand.est_cost_usd for cand in scored), default=0.0) or 1.0

    def _ucb(cand: CandidateScore) -> float:
        return score.ucb_score(
            cand.predicted_success,
            cand.interval_width,
            cand.est_cost_usd / ceiling,
            cost_quality_tradeoff,
            alpha,
        )

    return max(scored, key=_ucb).card.model_id
853
+
854
+
855
def _actions_for(card: ModelCard) -> list[str]:
    """List the near-free cost-saving actions the caller should apply for this model.

    At present the only action is ``"enable_prompt_cache"``, emitted for cards that
    support prompt caching. Batch mode is not inferred here; it is left to the
    caller's interactive/background signal.
    """
    return ["enable_prompt_cache"] if card.supports_prompt_caching else []
866
+
867
+
868
def _overall_confidence(basis: DecisionBasis, recommended_confidence: float) -> float:
    """Derive the response-level confidence from the recommended candidate's basis.

    Memory-backed picks report their confidence unchanged, LLM-backed picks are
    floored at 0.5, and every other basis is capped at 0.5.
    """
    if basis == DecisionBasis.memory:
        return recommended_confidence
    bound = max if basis == DecisionBasis.llm else min
    return bound(recommended_confidence, 0.5)
874
+
875
+
876
def _select_candidates(
    cards: list[ModelCard], req: RecommendRequest, task_type: TaskType, max_candidates: int
) -> list[ModelCard]:
    """Filter the catalog by the request's constraints and keep the top candidates.

    A constraint is applied only when it is set (truthy) on ``req.constraints``:
    an allow-list of model ids, an allow-list of providers (case-insensitive), an
    exclusion list of model ids, a prompt-caching requirement and a minimum context
    window. Survivors are ordered by descending capability prior for ``task_type``
    and truncated to ``max_candidates``.
    """
    limits = req.constraints
    wanted = set(limits.candidate_models) if limits.candidate_models else None
    allowed = {p.lower() for p in limits.allowed_providers} if limits.allowed_providers else None
    excluded = set(limits.excluded_models) if limits.excluded_models else None
    needs_cache = limits.require_prompt_caching
    min_window = limits.require_context_window

    def _keep(model: ModelCard) -> bool:
        # Checks run in the same order as the constraints are listed above.
        if wanted is not None and model.model_id not in wanted:
            return False
        if allowed is not None and model.provider.lower() not in allowed:
            return False
        if excluded is not None and model.model_id in excluded:
            return False
        if needs_cache and not model.supports_prompt_caching:
            return False
        if min_window and not (model.context_window >= min_window):
            return False
        return True

    survivors = [model for model in cards if _keep(model)]
    by_prior = sorted(
        survivors, key=lambda model: score.capability_prior(model, task_type), reverse=True
    )
    return by_prior[:max_candidates]
896
+
897
+
898
def _optimize(
    scored: list[CandidateScore], tau: float, collapse_margin: float = 0.0
) -> tuple[CandidateScore, CandidateScore | None, list[CandidateScore], list[str]]:
    """Pick the recommended and fallback candidates for quality threshold ``tau``.

    Selection rule: among candidates whose ``predicted_success`` clears ``tau``,
    recommend the cheapest (ties broken by higher predicted success, then higher
    confidence). With ``collapse_margin > 0`` a routing-collapse guard may instead
    pick a cheaper candidate whose optimistic upper bound still clears ``tau``.
    When nothing clears ``tau`` the guard is tried first, then the candidate with
    the highest predicted success is used.

    Args:
        scored: Candidates with ``score``, ``predicted_success``, ``est_cost_usd``,
            ``confidence`` and ``interval_width`` already populated. Must be non-empty.
        tau: Minimum predicted success for a candidate to count as eligible.
        collapse_margin: Strength of the optimistic rescue; ``0.0`` disables the guard.

    Returns:
        ``(recommended, fallback, ranked, warnings)`` where ``ranked`` is ``scored``
        sorted by descending ``score``, ``fallback`` is ``None`` when there is no
        other candidate, and ``warnings`` may contain ``"no_model_meets_threshold"``
        and/or ``"collapse_guard_applied"``.

    Raises:
        ValueError: If ``scored`` is empty.
    """
    if not scored:
        # Previously this surfaced as an opaque "max() arg is an empty sequence" error
        # from deep inside the fallback branch; fail early with the same exception type.
        raise ValueError("_optimize requires at least one scored candidate")

    warnings: list[str] = []
    ranked = sorted(scored, key=lambda c: c.score, reverse=True)
    eligible = [c for c in scored if c.predicted_success >= tau]

    # Tau-aware optimism: the rescue shrinks as the quality bar rises, so at a HIGH
    # cost_quality setting (user wants quality) the guard barely fires, and at a LOW bar
    # (cost-leaning) it rescues cheap-but-uncertain models freely. This is what keeps the
    # guard from trading away quality exactly where the user asked for it.
    effective_margin = collapse_margin * max(0.0, 1.0 - tau)

    def _optimistic_clears(c: CandidateScore) -> bool:
        # Upper credible-bound view: predicted + effective_margin * half-width clears tau.
        # Only applied to candidates with ACTUAL evidence (confidence > 0) — at cold start,
        # capability priors (not optimism over a maximal interval) decide, so the guard is inert.
        if c.confidence <= 0.0:
            return False
        return c.predicted_success + effective_margin * 0.5 * c.interval_width >= tau

    def _cheapest_first(c: CandidateScore) -> tuple[float, float, float]:
        # Cheapest wins; ties go to higher predicted success, then higher confidence.
        return (c.est_cost_usd, -c.predicted_success, -c.confidence)

    if eligible:
        recommended = min(eligible, key=_cheapest_first)
        # Routing-collapse guard: if the cheapest model clearing tau is ITSELF the priciest
        # candidate, prefer a cheaper candidate whose credible interval could still clear tau
        # (the judge/escalation loop catches an over-optimistic cheap pick).
        if collapse_margin > 0.0 and len(scored) > 1:
            max_cost = max(c.est_cost_usd for c in scored)
            if recommended.est_cost_usd >= max_cost - 1e-12:
                cheaper = [
                    c
                    for c in scored
                    if c.est_cost_usd < recommended.est_cost_usd and _optimistic_clears(c)
                ]
                if cheaper:
                    recommended = min(cheaper, key=_cheapest_first)
                    warnings.append("collapse_guard_applied")
    else:
        warnings.append("no_model_meets_threshold")
        # Don't default to the strongest (usually priciest) model: prefer the cheapest whose
        # optimistic upper bound could still clear tau, falling back to strongest if none.
        plausible = [c for c in scored if _optimistic_clears(c)] if collapse_margin > 0.0 else []
        if plausible:
            # This branch deliberately keeps its original two-part tie-break (no confidence).
            recommended = min(plausible, key=lambda c: (c.est_cost_usd, -c.predicted_success))
            warnings.append("collapse_guard_applied")
        else:
            recommended = max(scored, key=lambda c: c.predicted_success)

    # Fallback: the cheapest other eligible candidate that clears tau with a 0.05 cushion;
    # otherwise the remaining candidate with the highest predicted success, if any.
    others = [c for c in eligible if c.card.model_id != recommended.card.model_id]
    reliable = [c for c in others if c.predicted_success >= tau + 0.05]
    if reliable:
        fallback: CandidateScore | None = min(reliable, key=lambda c: c.est_cost_usd)
    else:
        rest = [c for c in ranked if c.card.model_id != recommended.card.model_id]
        fallback = max(rest, key=lambda c: c.predicted_success) if rest else None

    return recommended, fallback, ranked, warnings
960
+
961
+
962
def _to_ranked_model(c: CandidateScore, explain: bool) -> RankedModel:
    """Convert an internal ``CandidateScore`` into the response-facing ``RankedModel``.

    Numeric fields are rounded for output (4 decimals for probabilities and scores,
    8 for dollar amounts, 1 for latency). Evidence references are attached only
    when ``explain`` is true; otherwise the evidence list is empty.
    """
    card = c.card

    evidence = []
    if explain:
        for ev in c.evidence:
            record = ev.record
            evidence.append(
                EvidenceRef(
                    entry_id=ev.entry_id,
                    reference_id=ev.reference_id,
                    # Evidence without a record is attributed to this candidate's model.
                    model_id=record.model_id if record else card.model_id,
                    score=round(ev.score, 4),
                    knowledge_confidence=round(ev.knowledge_confidence, 4),
                    observed_success=round(record.quality_score, 4) if record else 0.0,
                    is_stale=ev.is_stale,
                )
            )

    # Optional figures stay None rather than being rounded.
    latency_ms = None if c.est_latency_ms is None else round(c.est_latency_ms, 1)
    cost_low = None if c.est_cost_low is None else round(c.est_cost_low, 8)
    cost_high = None if c.est_cost_high is None else round(c.est_cost_high, 8)

    return RankedModel(
        model_id=card.model_id,
        provider=card.provider,
        predicted_success=round(c.predicted_success, 4),
        est_cost_usd=round(c.est_cost_usd, 8),
        est_cost_breakdown=c.est_cost_breakdown,
        score=round(c.score, 4),
        rationale=c.rationale,
        decision_basis=c.decision_basis,
        evidence=evidence,
        supports_prompt_caching=card.supports_prompt_caching,
        context_window=card.context_window,
        est_latency_ms=latency_ms,
        latency_basis=c.latency_basis,
        est_cost_low=cost_low,
        est_cost_high=cost_high,
        cost_band_basis=c.cost_band_basis,
        success_interval_width=round(c.interval_width, 4),
    )