minima-cli 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minima/__init__.py +5 -0
- minima/api/__init__.py +1 -0
- minima/api/auth.py +39 -0
- minima/api/errors.py +40 -0
- minima/api/routers/__init__.py +1 -0
- minima/api/routers/calibration.py +50 -0
- minima/api/routers/feedback.py +279 -0
- minima/api/routers/health.py +50 -0
- minima/api/routers/models.py +42 -0
- minima/api/routers/recommend.py +66 -0
- minima/api/routers/savings.py +55 -0
- minima/api/routers/strategies.py +33 -0
- minima/catalog/__init__.py +1 -0
- minima/catalog/data/capability_priors.json +210 -0
- minima/catalog/data/model_aliases.json +12 -0
- minima/catalog/merge.py +69 -0
- minima/catalog/refresh.py +54 -0
- minima/catalog/sources/__init__.py +1 -0
- minima/catalog/sources/litellm.py +19 -0
- minima/catalog/sources/openrouter.py +25 -0
- minima/catalog/store.py +86 -0
- minima/config.py +288 -0
- minima/deps.py +35 -0
- minima/llm/__init__.py +1 -0
- minima/llm/anthropic.py +106 -0
- minima/llm/base.py +196 -0
- minima/llm/gemini.py +124 -0
- minima/llm/registry.py +54 -0
- minima/logging.py +28 -0
- minima/main.py +109 -0
- minima/memory/__init__.py +1 -0
- minima/memory/adapter.py +572 -0
- minima/memory/keys.py +83 -0
- minima/memory/records.py +190 -0
- minima/memory/threadpool.py +41 -0
- minima/metrics/__init__.py +1 -0
- minima/metrics/calibration.py +415 -0
- minima/metrics/report.py +116 -0
- minima/metrics/savings.py +98 -0
- minima/recommender/__init__.py +1 -0
- minima/recommender/_pg_pool.py +38 -0
- minima/recommender/_redis_client.py +32 -0
- minima/recommender/aggregate.py +157 -0
- minima/recommender/classify.py +165 -0
- minima/recommender/decisionlog.py +505 -0
- minima/recommender/durablerefs.py +312 -0
- minima/recommender/engine.py +997 -0
- minima/recommender/escalation.py +83 -0
- minima/recommender/propensity.py +189 -0
- minima/recommender/recstore.py +368 -0
- minima/recommender/score.py +318 -0
- minima/recommender/types.py +166 -0
- minima/schemas/__init__.py +1 -0
- minima/schemas/common.py +73 -0
- minima/schemas/feedback.py +34 -0
- minima/schemas/models_catalog.py +36 -0
- minima/schemas/recommend.py +104 -0
- minima/schemas/savings.py +39 -0
- minima/schemas/strategies.py +57 -0
- minima/schemas/workflow.py +43 -0
- minima/seeding/__init__.py +1 -0
- minima/seeding/items.py +42 -0
- minima/seeding/llmrouterbench.py +232 -0
- minima/seeding/routerbench.py +141 -0
- minima/seeding/run_seed.py +56 -0
- minima/seeding/synthetic.py +70 -0
- minima/tenancy/__init__.py +8 -0
- minima/tenancy/context.py +37 -0
- minima/tenancy/passthrough.py +110 -0
- minima/version.py +3 -0
- minima_cli-0.4.9.dist-info/METADATA +275 -0
- minima_cli-0.4.9.dist-info/RECORD +161 -0
- minima_cli-0.4.9.dist-info/WHEEL +4 -0
- minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
- minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
- minima_client/__init__.py +19 -0
- minima_client/autocapture.py +101 -0
- minima_client/client.py +301 -0
- minima_client/errors.py +23 -0
- minima_harness/LICENSE_PI +32 -0
- minima_harness/__init__.py +16 -0
- minima_harness/agent/__init__.py +72 -0
- minima_harness/agent/agent.py +276 -0
- minima_harness/agent/events.py +124 -0
- minima_harness/agent/loop.py +311 -0
- minima_harness/agent/state.py +79 -0
- minima_harness/agent/tools.py +97 -0
- minima_harness/ai/__init__.py +66 -0
- minima_harness/ai/compat.py +71 -0
- minima_harness/ai/errors.py +96 -0
- minima_harness/ai/events.py +117 -0
- minima_harness/ai/openrouter_catalog.py +153 -0
- minima_harness/ai/provider_catalog.py +299 -0
- minima_harness/ai/provider_quirks.py +37 -0
- minima_harness/ai/providers/__init__.py +75 -0
- minima_harness/ai/providers/_common.py +48 -0
- minima_harness/ai/providers/anthropic.py +290 -0
- minima_harness/ai/providers/base.py +65 -0
- minima_harness/ai/providers/faux.py +173 -0
- minima_harness/ai/providers/google.py +221 -0
- minima_harness/ai/providers/openai_compat.py +278 -0
- minima_harness/ai/registry.py +184 -0
- minima_harness/ai/stream.py +82 -0
- minima_harness/ai/tools.py +51 -0
- minima_harness/ai/types.py +204 -0
- minima_harness/ai/usage.py +41 -0
- minima_harness/minima/__init__.py +40 -0
- minima_harness/minima/cache.py +102 -0
- minima_harness/minima/config.py +85 -0
- minima_harness/minima/goals.py +226 -0
- minima_harness/minima/judge.py +144 -0
- minima_harness/minima/mapping.py +147 -0
- minima_harness/minima/meter.py +143 -0
- minima_harness/minima/router.py +220 -0
- minima_harness/minima/runtime.py +544 -0
- minima_harness/minima/signals.py +195 -0
- minima_harness/session/__init__.py +14 -0
- minima_harness/session/format.py +35 -0
- minima_harness/session/store.py +236 -0
- minima_harness/tasks/__init__.py +17 -0
- minima_harness/tasks/task_set.py +78 -0
- minima_harness/tools/__init__.py +7 -0
- minima_harness/tools/_io.py +34 -0
- minima_harness/tools/bash.py +70 -0
- minima_harness/tools/builtin.py +23 -0
- minima_harness/tools/edit.py +50 -0
- minima_harness/tools/find.py +38 -0
- minima_harness/tools/grep.py +73 -0
- minima_harness/tools/ls.py +35 -0
- minima_harness/tools/read.py +38 -0
- minima_harness/tools/tasks.py +75 -0
- minima_harness/tools/write.py +36 -0
- minima_harness/tui/__init__.py +3 -0
- minima_harness/tui/analytics.py +111 -0
- minima_harness/tui/app.py +1927 -0
- minima_harness/tui/bridge.py +103 -0
- minima_harness/tui/cli.py +227 -0
- minima_harness/tui/clipboard.py +60 -0
- minima_harness/tui/commands.py +49 -0
- minima_harness/tui/compaction.py +17 -0
- minima_harness/tui/config_cli.py +141 -0
- minima_harness/tui/config_store.py +237 -0
- minima_harness/tui/context.py +93 -0
- minima_harness/tui/customize.py +95 -0
- minima_harness/tui/diff.py +53 -0
- minima_harness/tui/editor.py +43 -0
- minima_harness/tui/extensions.py +84 -0
- minima_harness/tui/extra_models.py +52 -0
- minima_harness/tui/history.py +71 -0
- minima_harness/tui/mubit.py +295 -0
- minima_harness/tui/overlays.py +593 -0
- minima_harness/tui/packages.py +59 -0
- minima_harness/tui/run_modes.py +66 -0
- minima_harness/tui/theme.py +77 -0
- minima_harness/tui/welcome.py +83 -0
- minima_harness/tui/widgets/__init__.py +3 -0
- minima_harness/tui/widgets/banner.py +38 -0
- minima_harness/tui/widgets/editor.py +83 -0
- minima_harness/tui/widgets/footer.py +73 -0
- minima_harness/tui/widgets/messages.py +151 -0
- minima_harness/tui/widgets/status.py +57 -0
|
@@ -0,0 +1,997 @@
|
|
|
1
|
+
"""The recommendation orchestrator."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import math
|
|
7
|
+
import random
|
|
8
|
+
import time
|
|
9
|
+
import uuid
|
|
10
|
+
|
|
11
|
+
from minima.catalog.store import CatalogStore
|
|
12
|
+
from minima.config import Settings
|
|
13
|
+
from minima.llm.base import CandidateView, Reasoner
|
|
14
|
+
from minima.logging import get_logger
|
|
15
|
+
from minima.memory.adapter import Memory
|
|
16
|
+
from minima.memory.keys import build_content, salient_signature, task_cluster, task_fingerprint
|
|
17
|
+
from minima.memory.records import clamp01
|
|
18
|
+
from minima.metrics.calibration import CalibratorSet, fit_calibrators
|
|
19
|
+
from minima.recommender import escalation, score
|
|
20
|
+
from minima.recommender.aggregate import aggregate_by_model, apply_ipw
|
|
21
|
+
from minima.recommender.classify import classify, classify_from_neighbors
|
|
22
|
+
from minima.recommender.decisionlog import CandidateSnapshot, DecisionLog, DecisionRecord
|
|
23
|
+
from minima.recommender.durablerefs import DurableRefs
|
|
24
|
+
from minima.recommender.propensity import Propensity, PropensityTracker
|
|
25
|
+
from minima.recommender.recstore import RecStore, StoredRecommendation
|
|
26
|
+
from minima.recommender.types import CandidateScore, ModelAggregate
|
|
27
|
+
from minima.schemas.common import DecisionBasis, Difficulty, TaskType
|
|
28
|
+
from minima.schemas.models_catalog import ModelCard
|
|
29
|
+
from minima.schemas.recommend import (
|
|
30
|
+
EvidenceRef,
|
|
31
|
+
RankedModel,
|
|
32
|
+
RecommendRequest,
|
|
33
|
+
RecommendResponse,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# Module-level logger; call sites in this file pass an event name plus keyword fields.
log = get_logger("minima.recommender")

# Any positive recalled-outcome mass makes a candidate's prediction "memory-driven";
# the confidence field separately conveys how strong that evidence is.
# NOTE(review): not referenced in the visible part of this file — presumably the
# threshold a candidate's evidence weight must exceed; confirm against the scorer.
MEMORY_WEIGHT_MIN = 0.0
# Max neighbors echoed back per candidate in the explained response.
MAX_EVIDENCE_PER_CANDIDATE = 5
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class NoCandidatesError(ValueError):
    """Raised when constraints eliminate every catalog model.

    ``Recommender.recommend`` raises this when candidate selection returns
    nothing for the request. It subclasses ``ValueError``, so handlers that
    catch ``ValueError`` also catch this error.
    """
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Recommender:
|
|
50
|
+
def __init__(
    self,
    settings: Settings,
    memory: Memory,
    catalog_store: CatalogStore,
    recstore: RecStore,
    reasoner: Reasoner | None = None,
    propensity: Propensity | None = None,
    decision_log: DecisionLog | None = None,
    org_id: str = "default",
    rng: random.Random | None = None,
    durable_refs: DurableRefs | None = None,
):
    """Wire up the recommender's collaborators and per-org selection flags.

    ``propensity`` falls back to a fresh ``PropensityTracker`` and ``rng`` to
    a fresh ``random.Random`` when not supplied. The epsilon and Thompson
    selection policies are enabled only when ``org_id`` appears in the
    matching comma-separated setting (entries are stripped; blanks ignored).
    """

    def _opted_in(csv_orgs: str) -> bool:
        # Membership test against the cleaned, de-duplicated org list.
        names = (part.strip() for part in csv_orgs.split(","))
        return org_id in {name for name in names if name}

    # Injected collaborators.
    self._settings = settings
    self._memory = memory
    self._catalog_store = catalog_store
    self._recstore = recstore
    self._reasoner = reasoner
    self._propensity = propensity if propensity else PropensityTracker()
    self._decision_log = decision_log
    self._org_id = org_id
    self._durable_refs = durable_refs
    self._rng = rng if rng else random.Random()  # noqa: S311 — exploration sampling, not crypto

    # Per-org opt-in for the stochastic selection policies.
    self._epsilon_enabled = _opted_in(settings.minima_epsilon_selection_orgs)
    self._thompson_enabled = _opted_in(settings.minima_thompson_selection_orgs)

    # Lazily-fit, cached calibrator (org-scoped via this Recommender's decision log).
    self._calibrators: CalibratorSet | None = None
    self._calibrators_fitted_at: float = 0.0
|
|
84
|
+
|
|
85
|
+
async def recommend(self, req: RecommendRequest) -> RecommendResponse:
    """Produce a model recommendation for ``req``.

    Stages, in order: classify the task; select catalog candidates; recall
    outcome evidence (ANN recall plus keyed/durable lookups); optionally
    re-classify from neighbor votes; aggregate evidence per model (with
    optional IPW); score candidates; apply the cost and latency filters;
    finalize the ranking at threshold ``tau``; evaluate escalation and, when
    enabled, consult the reasoner; apply the org's selection policy (argmin,
    Thompson, or epsilon-softmax); persist the recommendation and the
    decision row; build the response.

    The cost and latency filters are soft: a filter that would remove every
    candidate is skipped and a warning is appended instead.

    Raises:
        NoCandidatesError: if candidate selection returns no models.
    """
    started = time.monotonic()
    settings = self._settings
    warnings: list[str] = []

    # Heuristic classification first; the reasoner may refine it (best-effort).
    task_type, difficulty = classify(req.task)
    task_type, difficulty = await self._maybe_llm_classify(req, task_type, difficulty, warnings)
    # The salient signature only contributes to the cluster key under "fine" granularity.
    signature = (
        salient_signature(req.task.task, settings.minima_cluster_signature_tokens)
        if settings.minima_cluster_granularity.lower() == "fine"
        else None
    )
    cluster = task_cluster(task_type, difficulty, signature)
    fingerprint = task_fingerprint(req.task.task)
    lane = settings.lane(req.namespace)

    catalog = self._catalog_store.get()
    candidates = _select_candidates(catalog.cards, req, task_type, req.max_candidates)
    if not candidates:
        raise NoCandidatesError("no models match the supplied constraints")
    candidate_ids = {c.model_id for c in candidates}

    recall, fastpath_evidence = await self._recall_with_fastpath(
        req=req, lane=lane, cluster=cluster, candidate_ids=candidate_ids
    )
    # A degraded recall is surfaced as a warning; the request still proceeds.
    if recall.timed_out:
        warnings.append("recall_timeout")
    elif recall.error:
        warnings.append("memory_unavailable")
    # Recalled outcomes plus whatever extra records the fast path surfaced.
    evidence = recall.outcome_evidence + fastpath_evidence

    # Neighbor-vote refinement: if the heuristic couldn't place the task, let the
    # ANN-recalled semantic neighbors vote on its type (free; the cluster key then
    # becomes coherent for scoring + the stored outcome). Caller-supplied types win.
    if (
        req.task.task_type is None
        and task_type == TaskType.other
        and settings.minima_neighbor_classify
        and evidence
    ):
        voted = classify_from_neighbors(
            [(ev.record.task_type, ev.score) for ev in evidence if ev.record is not None]
        )
        if voted is not None and voted != task_type:
            task_type = voted
            # The cluster key depends on the task type, so it is rebuilt here.
            cluster = task_cluster(task_type, difficulty, signature)
            warnings.append("neighbor_classified")

    # Remember durable-record ids surfaced by recall so the fast path can
    # dereference them next time (live records only — seeds are per-row inserts,
    # not the durable (cluster, model) upsert). Bookkeeping only: a store failure
    # must never break the recommendation.
    if self._durable_refs is not None:
        try:
            for ev in recall.outcome_evidence:
                rec = ev.record
                if (
                    rec is not None
                    and rec.task_cluster == cluster
                    and rec.source_dataset is None
                    and (ev.reference_id or ev.referenceable)
                ):
                    self._durable_refs.upsert(
                        lane, cluster, rec.model_id, ev.entry_id, ev.reference_id or ""
                    )
        except Exception as exc:  # noqa: BLE001
            log.warning("durable_ref_upsert_failed", error=str(exc))

    aggregates = aggregate_by_model(
        evidence,
        candidate_ids,
        half_life_days=settings.minima_evidence_half_life_days,
        decay_floor=settings.minima_evidence_decay_floor,
        seed_weight=settings.minima_seed_weight,
        seed_crowdout_n=settings.minima_seed_crowdout_n,
    )
    # Optional inverse-propensity weighting of the aggregates, with clipping bounds.
    if settings.minima_ipw_enabled and aggregates:
        apply_ipw(
            aggregates,
            self._propensity.propensities(lane, cluster, candidate_ids),
            settings.minima_ipw_clip_low,
            settings.minima_ipw_clip_high,
        )

    # Caller-supplied token expectations win; otherwise fall back to the defaults
    # (the output default is scaled by a per-difficulty multiplier, 1.0 if unlisted).
    input_tokens = req.task.expected_input_tokens or settings.minima_default_input_tokens
    output_tokens = req.task.expected_output_tokens or int(
        settings.minima_default_output_tokens
        * settings.minima_difficulty_output_multipliers.get(difficulty.value, 1.0)
    )
    scored = self._score_candidates(
        candidates, aggregates, task_type, input_tokens, output_tokens, req
    )
    # Premium counterfactual baseline, captured BEFORE the cost/latency filters
    # shrink the set — otherwise the baseline itself would shift with the caller's
    # constraints and savings would not be comparable across requests.
    est_cost_premium = max((c.est_cost_usd for c in scored), default=0.0)

    if req.constraints.max_cost_per_call is not None:
        affordable = [c for c in scored if c.est_cost_usd <= req.constraints.max_cost_per_call]
        if affordable:
            scored = affordable
        else:
            # Soft filter: nothing fits the budget, so keep the full set and warn.
            warnings.append("no_model_within_cost_budget")

    if req.constraints.max_latency_ms is not None:
        # Only exclude candidates with OBSERVED latency evidence above the budget —
        # a model is never condemned without data (its est_latency_ms stays None).
        within = [
            c
            for c in scored
            if c.est_latency_ms is None or c.est_latency_ms <= req.constraints.max_latency_ms
        ]
        if within:
            scored = within
        else:
            # Soft filter, same as the cost budget above.
            warnings.append("no_model_within_latency_budget")

    tau = score.threshold_from_slider(
        req.cost_quality_tradeoff,
        settings.minima_tau_min,
        settings.minima_tau_max,
        req.constraints.min_quality,
    )

    recommended, fallback, ranked, opt_warnings = self._finalize(
        scored, tau, req.cost_quality_tradeoff
    )
    overall_basis = recommended.decision_basis

    esc = escalation.evaluate(
        settings=settings,
        allow=req.allow_llm_escalation,
        total_weight=sum(a.weight_sum for a in aggregates.values()),
        distinct_models_with_evidence=sum(1 for a in aggregates.values() if a.weight_sum > 0),
        recommended_confidence=recommended.confidence,
        ranked=ranked,
        aggregates=aggregates,
        recommended_interval_width=score.posterior_interval_width(
            aggregates.get(recommended.card.model_id),
            score.capability_prior(recommended.card, task_type),
            settings.minima_beta_pseudocount,
        ),
        recommended_predicted_success=recommended.predicted_success,
        tau=tau,
    )
    if esc.should_escalate:
        warnings.extend(f"escalation_suggested:{reason}" for reason in esc.reasons)
        if self._reasoner is not None and settings.reasoner_enabled:
            consulted = await self._consult_reasoner(
                scored=scored, task_type=task_type, difficulty=difficulty, lane=lane, req=req
            )
            if consulted:
                # NOTE(review): re-finalizing the same `scored` list implies
                # _consult_reasoner updates it in place — confirm.
                recommended, fallback, ranked, opt_warnings = self._finalize(
                    scored, tau, req.cost_quality_tradeoff
                )
                overall_basis = DecisionBasis.llm
                warnings.append("reasoner_consulted")
            else:
                warnings.append("reasoner_failed")
        else:
            warnings.append("reasoner_disabled")
    # Warnings from whichever _finalize call produced the current ranking.
    warnings.extend(opt_warnings)

    if not evidence:
        warnings.append("cold_start")
    if catalog.stale:
        warnings.append("prices_stale")

    # Selection policy: deterministic argmin everywhere; epsilon-softmax over the
    # tau-ELIGIBLE set for opted-in orgs (the safety floor is eligibility itself).
    # The propensity vector is logged either way so off-policy evaluation can tell
    # a degenerate (deterministic) log from a stochastic one.
    selection_policy = "argmin"
    explored_pick = False
    # Degenerate default: all propensity mass on the deterministic pick.
    sel_propensities: dict[str, float] = dict.fromkeys(
        (c.card.model_id for c in ranked), 0.0
    )
    sel_propensities[recommended.card.model_id] = 1.0
    if self._thompson_enabled and len(scored) >= 2:
        # Posterior-sampling selection: sample each candidate's success, pick cheapest
        # clearing tau under the sample. MC frequencies are the logged propensities.
        selection_policy = "thompson"
        items = [(c.card.model_id, c.alpha, c.beta, c.est_cost_usd) for c in scored]
        pick_id, pi = score.thompson_select(
            items, tau, self._rng, settings.minima_thompson_samples
        )
        sel_propensities = dict.fromkeys((c.card.model_id for c in ranked), 0.0)
        sel_propensities.update(pi)
        if pick_id and pick_id != recommended.card.model_id:
            sampled = next((c for c in scored if c.card.model_id == pick_id), None)
            if sampled is not None:
                fallback = recommended  # the deterministic pick is the natural retry
                recommended = sampled
                overall_basis = recommended.decision_basis
                explored_pick = True
                warnings.append("thompson_pick")
    elif self._epsilon_enabled:
        eligible = [c for c in ranked if c.predicted_success >= tau]
        # With fewer than two eligible candidates the policy stays "argmin".
        if len(eligible) >= 2:
            selection_policy = "epsilon_softmax"
            argmin_id = recommended.card.model_id
            pi = score.softmax_propensities(
                {c.card.model_id: c.score for c in eligible},
                argmin_id,
                settings.minima_epsilon,
                settings.minima_epsilon_softmax_temperature,
            )
            sel_propensities.update(pi)
            sampled = self._maybe_explore(eligible, argmin_id)
            if sampled is not None and sampled.card.model_id != argmin_id:
                fallback = recommended  # the deterministic pick is the natural retry
                recommended = sampled
                overall_basis = recommended.decision_basis
                explored_pick = True
                warnings.append("exploration_pick")

    # Record the final pick (after any exploration) with the propensity tracker.
    self._propensity.record(lane, cluster, recommended.card.model_id)

    # Advisory shadow bandit: log what a UCB policy WOULD pick (never overrides).
    shadow_pick: str | None = None
    if settings.minima_shadow_bandit and ranked:
        shadow_pick = _shadow_pick(
            ranked, req.cost_quality_tradeoff, settings.minima_shadow_ucb_alpha
        )
        if shadow_pick is not None and shadow_pick != recommended.card.model_id:
            warnings.append("shadow_disagree")

    # Store the recommendation under a fresh id, including the evidence
    # (entry id, reference id) pairs that backed each model's aggregate.
    recommendation_id = uuid.uuid4().hex
    self._recstore.put(
        StoredRecommendation(
            recommendation_id=recommendation_id,
            lane=lane,
            user_id=req.user_id,
            task_type=task_type.value,
            difficulty=difficulty.value,
            task_cluster=cluster,
            task_fingerprint=fingerprint,
            content=build_content(task_type.value, difficulty.value, req.task.task),
            env_tags=list(req.task.tags or []),
            recommended_model_id=recommended.card.model_id,
            neighbors_by_model={
                mid: [(ev.entry_id, ev.reference_id) for ev in agg.evidence]
                for mid, agg in aggregates.items()
            },
        )
    )
    # Best-effort analytics row; _log_decision swallows its own write failures.
    self._log_decision(
        recommendation_id=recommendation_id,
        req=req,
        lane=lane,
        cluster=cluster,
        task_type=task_type,
        difficulty=difficulty,
        fingerprint=fingerprint,
        tau=tau,
        selection_policy=selection_policy,
        explored_pick=explored_pick,
        sel_propensities=sel_propensities,
        recommended=recommended,
        ranked=ranked,
        esc=esc,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        est_cost_premium=est_cost_premium,
        shadow_chosen_model_id=shadow_pick,
    )

    confidence = _overall_confidence(overall_basis, recommended.confidence)
    return RecommendResponse(
        recommendation_id=recommendation_id,
        recommended_model=_to_ranked_model(recommended, req.explain),
        ranked=[_to_ranked_model(c, req.explain) for c in ranked],
        fallback_model=_to_ranked_model(fallback, req.explain) if fallback else None,
        confidence=round(confidence, 4),
        decision_basis=overall_basis,
        threshold_used=round(tau, 4),
        classified_task_type=task_type,
        classified_difficulty=difficulty,
        catalog_version=catalog.version,
        catalog_stale=catalog.stale,
        latency_ms=int((time.monotonic() - started) * 1000),
        warnings=warnings,
        selection_policy=selection_policy,
        recommended_actions=_actions_for(recommended.card),
    )
|
|
370
|
+
|
|
371
|
+
def _maybe_explore(
    self, eligible: list[CandidateScore], argmin_id: str
) -> CandidateScore | None:
    """Draw from the epsilon branch of the epsilon-softmax policy.

    One uniform draw decides the branch: when it is at or above
    ``minima_epsilon`` the deterministic branch is taken and ``None`` is
    returned. Otherwise a single candidate is sampled from ``eligible`` with
    weights ``exp((score - max_score) / temperature)``; the sample may be the
    argmin itself. ``argmin_id`` is accepted for the caller's convenience and
    is not consulted here.
    """
    cfg = self._settings
    draw = self._rng.random()
    if draw >= cfg.minima_epsilon:
        return None

    # Floor the temperature so the division below is always defined.
    temperature = max(cfg.minima_epsilon_softmax_temperature, 1e-6)
    scores = [cand.score for cand in eligible]
    top = max(scores)
    # Shifting by the max keeps every exponent <= 0 (no overflow).
    softmax_weights = [math.exp((value - top) / temperature) for value in scores]
    (picked,) = self._rng.choices(eligible, weights=softmax_weights, k=1)
    return picked
|
|
386
|
+
|
|
387
|
+
async def _recall_with_fastpath(
    self,
    *,
    req: RecommendRequest,
    lane: str,
    cluster: str,
    candidate_ids: set[str],
):
    """ANN recall joined by a deterministic keyed lookup for the current cluster.

    The lookup (POST /v2/core/lookup) fetches outcome records for all candidate
    models in this cluster straight from storage — no ANN, no flicker. Records
    already returned by ANN are deduped by entry_id so they are never double-counted.

    The old dereference-based fastpath (MINIMA_DURABLE_FASTPATH=on/shadow) is
    retained for backward compatibility and runs concurrently when configured.
    In "shadow" mode dereferenced records are only logged; in "on" mode the
    ones ANN missed are added to the returned evidence.

    Reading the durable-ref store is best-effort: if it fails, the failure is
    logged and the call degrades to the plain recall + lookup path.

    Returns:
        A ``(recall, extra_evidence)`` pair: the recall result as returned by
        the memory adapter, and the list of evidence items not already present
        in ``recall.evidence``.
    """
    settings = self._settings

    # Resolve durable refs BEFORE creating any coroutine: a store failure must
    # neither break the recommendation nor leave un-awaited coroutines behind.
    mode = settings.minima_durable_fastpath.lower()
    refs = []
    if mode in ("shadow", "on") and self._durable_refs is not None:
        try:
            refs = self._durable_refs.refs(
                lane, cluster, settings.minima_durable_fastpath_max_refs
            )
        except Exception as exc:  # noqa: BLE001 — bookkeeping store is best-effort
            log.warning("durable_ref_read_failed", error=str(exc))
            refs = []

    recall_coro = self._memory.recall(
        query=req.task.task,
        lane=lane,
        user_id=req.user_id,
        limit=settings.minima_memory_recall_limit,
        env_tags=req.task.tags or None,
    )

    # Keyed lookup: one filter clause per candidate model in this cluster.
    # OR-combined on the server; returns all matching non-deleted records.
    lookup_coro = self._memory.lookup(
        lane=lane,
        match=[
            {"kind": "outcome", "task_cluster": cluster, "model_id": mid}
            for mid in candidate_ids
        ],
    )

    if not refs:
        # Fast common path: recall + lookup, no dereferences.
        recall, lookup_evidence = await asyncio.gather(recall_coro, lookup_coro)
        ann_ids = {ev.entry_id for ev in recall.evidence}
        extra = [ev for ev in lookup_evidence if ev.entry_id not in ann_ids]
        if extra:
            log.info(
                "keyed_lookup_delta",
                cluster=cluster,
                added=len(extra),
                models=[ev.record.model_id for ev in extra if ev.record],
            )
        return recall, extra

    # Old dereference fastpath active: run all three concurrently, share the
    # recall latency budget, then merge — lookup results deduped against both
    # ANN and dereference results so no evidence is double-counted.
    started = time.monotonic()
    budget_s = settings.minima_memory_recall_timeout_ms / 1000.0
    recall_task = asyncio.ensure_future(recall_coro)
    lookup_task = asyncio.ensure_future(lookup_coro)
    deref_tasks = [
        asyncio.ensure_future(
            self._memory.dereference(lane=lane, reference_id=r.reference_id or r.entry_id)
        )
        for r in refs
    ]
    in_flight = [recall_task, lookup_task, *deref_tasks]
    try:
        recall = await recall_task
        lookup_evidence = await lookup_task
        # Dereferences only get what is left of the budget (at least 50 ms).
        remaining = max(0.05, budget_s - (time.monotonic() - started))
        done, pending = await asyncio.wait(deref_tasks, timeout=remaining)
    finally:
        # Cancel whatever is still running: timed-out dereferences on the normal
        # path, and every sibling future if recall/lookup raised or we were cancelled.
        for task in in_flight:
            if not task.done():
                task.cancel()
    if pending:
        log.warning("durable_fastpath_timeout", cluster=cluster, dropped=len(pending))
    # Dereferences that failed or were cancelled are dropped; the fast path is additive.
    derefs = [
        task.result()
        for task in done
        if not task.cancelled() and task.exception() is None
    ]
    fetched = [d for d in derefs if d is not None and d.record is not None]
    ann_ids = {ev.entry_id for ev in recall.evidence}
    missed = [d for d in fetched if d.entry_id not in ann_ids]
    if missed:
        log.info(
            "durable_fastpath_delta",
            mode=mode,
            cluster=cluster,
            fetched=len(fetched),
            ann_missed=len(missed),
            missed_models=[d.record.model_id for d in missed if d.record],
        )
    # "shadow" only measures the delta; "on" actually contributes it.
    deref_extra = missed if mode == "on" else []
    seen_ids = ann_ids | {ev.entry_id for ev in deref_extra}
    lookup_extra = [ev for ev in lookup_evidence if ev.entry_id not in seen_ids]
    if lookup_extra:
        log.info(
            "keyed_lookup_delta",
            cluster=cluster,
            added=len(lookup_extra),
            models=[ev.record.model_id for ev in lookup_extra if ev.record],
        )
    return recall, lookup_extra + deref_extra
|
|
493
|
+
|
|
494
|
+
def _log_decision(
    self,
    *,
    recommendation_id: str,
    req: RecommendRequest,
    lane: str,
    cluster: str,
    task_type: TaskType,
    difficulty: Difficulty,
    fingerprint: str,
    tau: float,
    selection_policy: str,
    explored_pick: bool,
    sel_propensities: dict[str, float],
    recommended: CandidateScore,
    ranked: list[CandidateScore],
    esc: escalation.EscalationDecision,
    input_tokens: int,
    output_tokens: int,
    est_cost_premium: float,
    shadow_chosen_model_id: str | None = None,
) -> None:
    """Write one decision row to the decision log; write failures are only logged.

    Does nothing when no decision log is configured. The row carries a
    snapshot of every ranked candidate (with its logged selection propensity)
    plus two counterfactual cost baselines: ``est_cost_premium`` as supplied
    by the caller, and the estimated cost of the caller-declared
    ``req.baseline_model_id`` when that model can be resolved.
    """
    sink = self._decision_log
    if sink is None:
        return
    cfg = self._settings

    def _snapshot(cand: CandidateScore) -> CandidateSnapshot:
        # Per-candidate row; probabilities are rounded to 6 decimals.
        raw = cand.raw_predicted_success
        return CandidateSnapshot(
            model_id=cand.card.model_id,
            predicted_success=round(cand.predicted_success, 6),
            confidence=round(cand.confidence, 6),
            est_cost_usd=cand.est_cost_usd,
            propensity=round(sel_propensities.get(cand.card.model_id, 0.0), 6),
            raw_predicted_success=None if raw is None else round(raw, 6),
            est_cost_low=cand.est_cost_low,
            est_cost_high=cand.est_cost_high,
        )

    # Declared baseline: reuse the ranked candidate's estimate when the caller's
    # default model was ranked, otherwise price it from its catalog card. It
    # stays None when no baseline was declared or the model is unknown.
    declared_id = req.baseline_model_id
    baseline_cost: float | None = None
    if declared_id:
        for cand in ranked:
            if cand.card.model_id == declared_id:
                baseline_cost = cand.est_cost_usd
                break
        else:
            for card in self._catalog_store.get().cards:
                if card.model_id == declared_id:
                    baseline_cost = score.estimate_cost(card, input_tokens, output_tokens)[0]
                    break

    try:
        row = DecisionRecord(
            recommendation_id=recommendation_id,
            org_id=self._org_id,
            lane=lane,
            cluster=cluster,
            task_type=task_type.value,
            difficulty=difficulty.value,
            fingerprint=fingerprint,
            ts=time.time(),
            tau=tau,
            policy=selection_policy,
            epsilon=0.0 if selection_policy == "argmin" else cfg.minima_epsilon,
            chosen_model_id=recommended.card.model_id,
            escalated=esc.should_escalate,
            shadow_chosen_model_id=shadow_chosen_model_id,
            explored=explored_pick,
            escalation_reasons=list(esc.reasons),
            candidates=[_snapshot(cand) for cand in ranked],
            est_cost_recommended=recommended.est_cost_usd,
            est_cost_premium=est_cost_premium,
            baseline_model_id=req.baseline_model_id,
            est_cost_baseline_declared=baseline_cost,
            user_id=req.user_id,
            env_tags=list(req.task.tags or []),
            content=build_content(task_type.value, difficulty.value, req.task.task),
        )
        sink.put(row)
    except Exception as exc:  # noqa: BLE001 — analytics must never break the hot path
        log.warning("decision_log_write_failed", error=str(exc))
|
|
589
|
+
|
|
590
|
+
async def _maybe_llm_classify(
    self,
    req: RecommendRequest,
    task_type: TaskType,
    difficulty: Difficulty,
    warnings: list[str],
) -> tuple[TaskType, Difficulty]:
    """Best-effort reasoner refinement of a heuristic ``other`` classification.

    The reasoner's ``classify`` is awaited only when all gates hold: the
    ``minima_reasoner_classify`` setting is on, the request allows LLM
    escalation, the caller did not supply a task type, the heuristic result is
    ``TaskType.other``, a reasoner exists, ``reasoner_enabled`` is set and the
    reasoner exposes a ``classify`` attribute.

    Args:
        req: The incoming recommendation request.
        task_type: Heuristic task type.
        difficulty: Heuristic difficulty.
        warnings: Mutable list; ``"llm_classified"`` is appended when the
            reasoner's answer is used.

    Returns:
        The reasoner's result when it produced one, otherwise the unchanged
        ``(task_type, difficulty)`` pair. Reasoner exceptions are logged and
        never propagated.
    """
    # Gates are checked in the original order so later attributes are not
    # touched once an earlier gate has failed.
    if (
        not self._settings.minima_reasoner_classify
        or not req.allow_llm_escalation
        or req.task.task_type is not None
        or task_type != TaskType.other
        or self._reasoner is None
        or not self._settings.reasoner_enabled
        or not hasattr(self._reasoner, "classify")
    ):
        return task_type, difficulty

    try:
        refined = await self._reasoner.classify(task=req.task.task)
    except Exception as exc:  # noqa: BLE001
        log.warning("llm_classify_failed", error=str(exc))
        refined = None

    if refined is None:
        # Either the call failed or the reasoner declined to answer.
        return task_type, difficulty

    warnings.append("llm_classified")
    return refined
|
|
617
|
+
|
|
618
|
+
async def _consult_reasoner(
    self,
    *,
    scored: list[CandidateScore],
    task_type: TaskType,
    difficulty: Difficulty,
    lane: str,
    req: RecommendRequest,
) -> bool:
    """Blend the reasoner's per-model success estimates into ``scored`` in place.

    Fetches a memory context block, presents every scored candidate to the
    reasoner's ``rank`` call, and for each candidate the reasoner ranked,
    mixes the reasoner's ``predicted_success`` with the existing one, marks the
    candidate's decision basis as ``DecisionBasis.llm`` and adopts the
    reasoner's rationale when it is non-empty.

    Returns:
        ``True`` when at least one candidate was updated, ``False`` when the
        reasoner returned nothing usable or ranked none of the candidates.
    """
    memory_block = await self._memory.get_context(
        query=req.task.task, lane=lane, user_id=req.user_id, max_token_budget=1500
    )

    views = []
    for candidate in scored:
        model_card = candidate.card
        views.append(
            CandidateView(
                model_id=model_card.model_id,
                provider=model_card.provider,
                input_cost_per_mtok=model_card.input_cost_per_mtok,
                output_cost_per_mtok=model_card.output_cost_per_mtok,
                context_window=model_card.context_window,
                capability_prior=score.capability_prior(model_card, task_type),
                est_cost_usd=candidate.est_cost_usd,
                predicted_success=candidate.predicted_success,
                est_latency_ms=candidate.est_latency_ms,
            )
        )

    result = await self._reasoner.rank(  # type: ignore[union-attr]
        task=req.task.task,
        task_type=task_type.value,
        difficulty=difficulty.value,
        candidates=views,
        memory_block=memory_block,
        cost_quality_tradeoff=req.cost_quality_tradeoff,
    )
    if not result or not result.rankings:
        return False

    settings = self._settings
    by_model = result.by_model()
    any_updated = False
    for candidate in scored:
        verdict = by_model.get(candidate.card.model_id)
        if verdict is None:
            # The reasoner did not rank this model; leave it untouched.
            continue

        if settings.minima_reasoner_blend_adaptive:
            # Evidence-mass-adaptive weight: high confidence (-> 1) keeps the
            # candidate close to its deterministic estimate, low confidence
            # (-> 0) leans on the reasoner. The weight is kept within [0.1, 0.9].
            raw = settings.minima_reasoner_blend_max * (1.0 - candidate.confidence)
            floored = max(0.1, raw)
            blend = min(0.9, floored)
        else:
            blend = settings.minima_reasoner_blend

        llm_term = blend * verdict.predicted_success
        own_term = (1.0 - blend) * candidate.predicted_success
        candidate.predicted_success = clamp01(llm_term + own_term)
        candidate.decision_basis = DecisionBasis.llm
        if verdict.rationale:
            candidate.rationale = verdict.rationale
        any_updated = True

    return any_updated
|
|
678
|
+
|
|
679
|
+
def _finalize(
    self, scored: list[CandidateScore], tau: float, cost_quality_tradeoff: float
) -> tuple[CandidateScore, CandidateScore | None, list[CandidateScore], list[str]]:
    """Assign each candidate its ranking score, then run the optimizer.

    Costs are normalised by the largest estimated cost among ``scored``
    (falling back to 1.0 when that maximum is zero or the list is empty)
    before being passed to ``score.ranking_score``. The result of
    ``_optimize`` is returned unchanged.
    """
    cost_ceiling = max((item.est_cost_usd for item in scored), default=0.0)
    if not cost_ceiling:
        # Avoid dividing by zero when every candidate is free.
        cost_ceiling = 1.0

    for item in scored:
        relative_cost = item.est_cost_usd / cost_ceiling
        item.score = score.ranking_score(
            item.predicted_success, relative_cost, cost_quality_tradeoff
        )

    collapse_margin = self._settings.minima_collapse_margin
    return _optimize(scored, tau, collapse_margin)
|
|
688
|
+
|
|
689
|
+
# --------------------------------------------------------------- calibration
|
|
690
|
+
def _calibrate(self, task_type_value: str, predicted: float) -> float:
    """Map a raw predicted success through the fitted calibrator set.

    Returns ``predicted`` unchanged when ``_get_calibrators`` yields ``None``;
    otherwise returns the calibrator set's ``transform`` of it for the given
    task type.
    """
    calibrators = self._get_calibrators()
    return (
        predicted
        if calibrators is None
        else calibrators.transform(task_type_value, predicted)
    )
|
|
696
|
+
|
|
697
|
+
def _get_calibrators(self) -> CalibratorSet | None:
    """Return the cached calibrator set, refitting it when missing or stale.

    Returns ``None`` when calibration is disabled in settings or when there is
    no decision log. Otherwise a refit is triggered if no calibrators are
    cached yet, or if more than ``minima_calibration_refresh_seconds`` have
    elapsed (monotonic clock) since the last refit was started.
    """
    settings = self._settings
    if not settings.minima_calibration_apply or self._decision_log is None:
        return None

    now = time.monotonic()
    needs_refit = self._calibrators is None
    if not needs_refit:
        # Only consult the timestamp once a calibrator set exists.
        elapsed = now - self._calibrators_fitted_at
        needs_refit = elapsed > settings.minima_calibration_refresh_seconds

    if needs_refit:
        # The timestamp is written before the refit runs so that concurrent
        # requests for this org don't all refit at once.
        # NOTE(review): if the refit fails while no calibrators are cached,
        # `_calibrators` stays None and the next call refits again regardless
        # of this timestamp — confirm whether that retry rate is acceptable.
        self._calibrators_fitted_at = now
        self._refit_calibrators()

    return self._calibrators
|
|
710
|
+
|
|
711
|
+
def _refit_calibrators(self) -> None:
    """Refit the calibrators from recent decision-log rows (best-effort).

    Rows newer than ``minima_calibration_window_days`` days are read from the
    decision log and passed to ``fit_calibrators``. Any exception is logged as
    ``calibrator_refit_failed`` and swallowed, leaving the previously stored
    calibrators in place.
    """
    settings = self._settings
    assert self._decision_log is not None
    try:
        window_seconds = settings.minima_calibration_window_days * 86_400.0
        cutoff = time.time() - window_seconds
        recent_rows = self._decision_log.rows(since=cutoff)
        fitted = fit_calibrators(
            recent_rows,
            min_n=settings.minima_calibration_min_n,
            shrinkage_k=settings.minima_calibration_shrinkage_k,
            now=time.time(),
        )
        self._calibrators = fitted
    except Exception as exc:  # noqa: BLE001 — calibration must never break a recommendation
        log.warning("calibrator_refit_failed", error=str(exc))
|
|
726
|
+
|
|
727
|
+
def _score_candidates(
    self,
    candidates: list[ModelCard],
    aggregates: dict[str, ModelAggregate],
    task_type: TaskType,
    input_tokens: int,
    output_tokens: int,
    req: RecommendRequest,
) -> list[CandidateScore]:
    """Build one ``CandidateScore`` per candidate model card.

    For every card this computes the predicted success (raw, then calibrated,
    then with the exploration bonus applied), the posterior interval width and
    Beta parameters, the estimated cost with its optional low/high band, an
    optional observed latency, and a human-readable rationale.

    Args:
        candidates: Model cards to score, in output order.
        aggregates: Per-model aggregates keyed by model id; a model may be absent.
        task_type: Task type used for the capability prior and calibration.
        input_tokens: Input token count passed to the cost helpers.
        output_tokens: Output token count passed to the cost estimate.
        req: The request; only its ``constraints.require_prompt_caching`` is read.

    Returns:
        The scored candidates, in the same order as ``candidates``.
    """
    settings = self._settings
    min_cost_n = settings.minima_observed_cost_min_n

    # The cost basis is chosen ONCE for the whole candidate set so that every
    # candidate's cost is compared like-for-like (never a per-request estimate
    # for one model against a historical realized cost for another).
    aggregate_by_model = {}
    for card in candidates:
        aggregate_by_model[card.model_id] = aggregates.get(card.model_id)
    cost_basis = score.choose_cost_basis(
        aggregate_by_model,
        settings.minima_use_observed_cost,
        req.constraints.require_prompt_caching,
        min_cost_n,
    )

    results: list[CandidateScore] = []
    for card in candidates:
        agg = aggregates.get(card.model_id)
        prior = score.capability_prior(card, task_type)
        pseudocount = settings.minima_beta_pseudocount

        raw_predicted, confidence = score.predicted_success(agg, prior, pseudocount)
        interval_width = score.posterior_interval_width(agg, prior, pseudocount)
        alpha, beta = score.beta_params(agg, prior, pseudocount)

        # Calibration is applied to the raw estimate BEFORE the exploration
        # bonus (deliberate optimism) is layered on for the tau decision.
        calibrated = self._calibrate(task_type.value, raw_predicted)
        predicted = score.with_exploration_bonus(
            calibrated, confidence, settings.minima_exploration_bonus
        )

        use_cache = req.constraints.require_prompt_caching and card.supports_prompt_caching
        if (
            settings.minima_cost_lever_aware
            and card.supports_prompt_caching
            and not use_cache
        ):
            cache_fraction = settings.minima_cost_cache_input_fraction
        else:
            cache_fraction = 0.0

        est_cost, breakdown = score.effective_cost(
            card,
            agg,
            input_tokens,
            output_tokens,
            use_cache,
            cost_basis,
            min_cost_n,
            cache_fraction,
        )
        cost_band = score.effective_cost_band(
            card, agg, input_tokens, use_cache, cost_basis, min_cost_n
        )
        if cost_band is None:
            est_cost_low, est_cost_high, cost_band_basis = None, None, ""
        else:
            bounds = cost_band[0]
            est_cost_low = bounds[0]
            est_cost_high = bounds[1]
            cost_band_basis = cost_band[1]

        observed_cost = any(
            marker in breakdown for marker in ("observed_avg", "rescaled")
        )
        cost_word = "obs" if observed_cost else "est"

        if agg is None:
            est_latency = None
        else:
            est_latency = agg.observed_latency_ms(
                settings.minima_latency_min_n, settings.minima_latency_percentile
            )

        if agg is not None and agg.weight_sum > MEMORY_WEIGHT_MIN:
            basis = DecisionBasis.memory
            rationale = (
                f"{agg.n} similar past outcome(s); weighted success "
                f"{agg.weighted_success_rate:.0%}; {cost_word} ${est_cost:.5f}/call"
            )
            evidence = agg.evidence[:MAX_EVIDENCE_PER_CANDIDATE]
        else:
            basis = DecisionBasis.prior
            rationale = (
                f"no memory yet; capability prior {prior:.0%} for {task_type.value}; "
                f"{cost_word} ${est_cost:.5f}/call"
            )
            evidence = agg.evidence[:MAX_EVIDENCE_PER_CANDIDATE] if agg else []

        if est_latency is not None:
            latency_basis = f"observed_p{int(settings.minima_latency_percentile * 100)}"
        else:
            latency_basis = ""

        results.append(
            CandidateScore(
                card=card,
                predicted_success=predicted,
                raw_predicted_success=raw_predicted,
                confidence=confidence,
                est_cost_usd=est_cost,
                est_cost_breakdown=breakdown,
                decision_basis=basis,
                evidence=evidence,
                rationale=rationale,
                interval_width=interval_width,
                alpha=alpha,
                beta=beta,
                est_latency_ms=est_latency,
                latency_basis=latency_basis,
                est_cost_low=est_cost_low,
                est_cost_high=est_cost_high,
                cost_band_basis=cost_band_basis,
            )
        )
    return results
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
def _shadow_pick(
    scored: list[CandidateScore], cost_quality_tradeoff: float, alpha: float
) -> str | None:
    """Return the model id the UCB shadow policy would choose.

    The pick is the candidate with the highest ``score.ucb_score``; costs are
    normalised by the largest estimated cost (1.0 when that maximum is zero).
    Returns ``None`` when there are no scored candidates.
    """
    if not scored:
        return None

    cost_ceiling = max((item.est_cost_usd for item in scored), default=0.0)
    if not cost_ceiling:
        cost_ceiling = 1.0

    def _ucb(item: CandidateScore) -> float:
        return score.ucb_score(
            item.predicted_success,
            item.interval_width,
            item.est_cost_usd / cost_ceiling,
            cost_quality_tradeoff,
            alpha,
        )

    winner = max(scored, key=_ucb)
    return winner.card.model_id
|
|
853
|
+
|
|
854
|
+
|
|
855
|
+
def _actions_for(card: ModelCard) -> list[str]:
    """List the near-free cost-saving actions the caller should apply.

    At present the only action is ``"enable_prompt_cache"``, emitted for cards
    that support prompt caching. Batch mode is not inferred here; it is left
    to the caller's own interactive/background signal.
    """
    return ["enable_prompt_cache"] if card.supports_prompt_caching else []
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
def _overall_confidence(basis: DecisionBasis, recommended_confidence: float) -> float:
    """Derive the response-level confidence from the recommended candidate's.

    A memory-based decision reports the confidence as-is, an LLM-based one is
    floored at 0.5, and any other basis is capped at 0.5.
    """
    if basis == DecisionBasis.memory:
        overall = recommended_confidence
    elif basis == DecisionBasis.llm:
        overall = max(recommended_confidence, 0.5)
    else:
        overall = min(recommended_confidence, 0.5)
    return overall
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
def _select_candidates(
    cards: list[ModelCard], req: RecommendRequest, task_type: TaskType, max_candidates: int
) -> list[ModelCard]:
    """Filter the catalog by the request's constraints and keep the top models.

    A card survives when it passes every constraint that is set on
    ``req.constraints``: explicit candidate list, allowed providers
    (case-insensitive), excluded models, prompt-caching support, and minimum
    context window. Survivors are ordered by descending capability prior for
    ``task_type`` and truncated to ``max_candidates``.
    """
    constraints = req.constraints
    wanted = set(constraints.candidate_models) if constraints.candidate_models else None
    providers = (
        {name.lower() for name in constraints.allowed_providers}
        if constraints.allowed_providers
        else None
    )
    banned = set(constraints.excluded_models) if constraints.excluded_models else None
    need_cache = bool(constraints.require_prompt_caching)
    min_window = constraints.require_context_window

    def _passes(model: ModelCard) -> bool:
        # Checks run in the same order as the constraint list above.
        if wanted is not None and model.model_id not in wanted:
            return False
        if providers is not None and model.provider.lower() not in providers:
            return False
        if banned is not None and model.model_id in banned:
            return False
        if need_cache and not model.supports_prompt_caching:
            return False
        if min_window and not model.context_window >= min_window:
            return False
        return True

    survivors = [model for model in cards if _passes(model)]
    by_prior = sorted(
        survivors,
        key=lambda model: score.capability_prior(model, task_type),
        reverse=True,
    )
    return by_prior[:max_candidates]
|
|
896
|
+
|
|
897
|
+
|
|
898
|
+
def _optimize(
    scored: list[CandidateScore], tau: float, collapse_margin: float = 0.0
) -> tuple[CandidateScore, CandidateScore | None, list[CandidateScore], list[str]]:
    """Choose the recommended model, a fallback, the ranked list and warnings.

    Selection rules:

    * Candidates whose ``predicted_success`` is at least ``tau`` are eligible;
      the cheapest eligible one is recommended (ties broken by higher
      predicted success, then higher confidence).
    * Routing-collapse guard (only when ``collapse_margin > 0``): if that pick
      is also the priciest candidate overall, a cheaper candidate whose
      optimistic upper bound still clears ``tau`` is preferred.
    * If nothing is eligible, ``"no_model_meets_threshold"`` is emitted and the
      cheapest optimistically-plausible candidate is used, falling back to the
      candidate with the highest predicted success.

    The fallback is the cheapest other eligible candidate clearing
    ``tau + 0.05``; failing that, the remaining candidate with the highest
    predicted success, or ``None`` when there is no other candidate.

    Args:
        scored: Candidates with ``score`` already assigned; must be non-empty.
        tau: Minimum acceptable predicted success.
        collapse_margin: Strength of the optimistic rescue; 0 disables it.

    Returns:
        ``(recommended, fallback, ranked, warnings)`` where ``ranked`` is
        ``scored`` sorted by descending ``score``.

    Raises:
        ValueError: If ``scored`` is empty.
    """
    if not scored:
        # Same exception type the bare max() raised on empty input, but with
        # a message that names the actual problem.
        raise ValueError("_optimize requires at least one scored candidate")

    warnings: list[str] = []
    ranked = sorted(scored, key=lambda c: c.score, reverse=True)
    eligible = [c for c in scored if c.predicted_success >= tau]

    # Tau-aware optimism: the rescue shrinks as the quality bar rises, so at a
    # HIGH tau the guard barely fires and at a LOW tau it rescues
    # cheap-but-uncertain models freely.
    effective_margin = collapse_margin * max(0.0, 1.0 - tau)

    def _optimistic_clears(c: CandidateScore) -> bool:
        # Upper-bound view: predicted + effective_margin * half-width clears tau.
        # Candidates without evidence (confidence <= 0) are never rescued.
        if c.confidence <= 0.0:
            return False
        return c.predicted_success + effective_margin * 0.5 * c.interval_width >= tau

    def _cheapest_first(c: CandidateScore) -> tuple[float, float, float]:
        # One shared ordering for every "cheapest" pick so exact ties resolve
        # the same way everywhere: higher success, then higher confidence.
        return (c.est_cost_usd, -c.predicted_success, -c.confidence)

    if eligible:
        recommended = min(eligible, key=_cheapest_first)
        # Routing-collapse guard: only relevant when the cheapest eligible
        # model is itself the priciest candidate.
        if collapse_margin > 0.0 and len(scored) > 1:
            max_cost = max(c.est_cost_usd for c in scored)
            if recommended.est_cost_usd >= max_cost - 1e-12:
                cheaper = [
                    c
                    for c in scored
                    if c.est_cost_usd < recommended.est_cost_usd and _optimistic_clears(c)
                ]
                if cheaper:
                    recommended = min(cheaper, key=_cheapest_first)
                    warnings.append("collapse_guard_applied")
    else:
        warnings.append("no_model_meets_threshold")
        # Don't default to the strongest (usually priciest) model: prefer the
        # cheapest whose optimistic upper bound could still clear tau.
        plausible = (
            [c for c in scored if _optimistic_clears(c)] if collapse_margin > 0.0 else []
        )
        if plausible:
            recommended = min(plausible, key=_cheapest_first)
            warnings.append("collapse_guard_applied")
        else:
            recommended = max(scored, key=lambda c: c.predicted_success)

    others = [c for c in eligible if c.card.model_id != recommended.card.model_id]
    reliable = [c for c in others if c.predicted_success >= tau + 0.05]
    if reliable:
        fallback: CandidateScore | None = min(reliable, key=lambda c: c.est_cost_usd)
    else:
        rest = [c for c in ranked if c.card.model_id != recommended.card.model_id]
        fallback = max(rest, key=lambda c: c.predicted_success) if rest else None

    return recommended, fallback, ranked, warnings
|
|
960
|
+
|
|
961
|
+
|
|
962
|
+
def _to_ranked_model(c: CandidateScore, explain: bool) -> RankedModel:
    """Convert an internal ``CandidateScore`` into the API-facing ``RankedModel``.

    Numeric fields are rounded for presentation (4 decimals for probabilities
    and scores, 8 for costs, 1 for latency). Evidence references are included
    only when ``explain`` is true; otherwise the evidence list is empty.
    """
    evidence = []
    if explain:
        for ev in c.evidence:
            record = ev.record
            evidence.append(
                EvidenceRef(
                    entry_id=ev.entry_id,
                    reference_id=ev.reference_id,
                    # Evidence without a record is attributed to this candidate.
                    model_id=record.model_id if record else c.card.model_id,
                    score=round(ev.score, 4),
                    knowledge_confidence=round(ev.knowledge_confidence, 4),
                    observed_success=round(record.quality_score, 4) if record else 0.0,
                    is_stale=ev.is_stale,
                )
            )

    latency_ms = None if c.est_latency_ms is None else round(c.est_latency_ms, 1)
    cost_low = None if c.est_cost_low is None else round(c.est_cost_low, 8)
    cost_high = None if c.est_cost_high is None else round(c.est_cost_high, 8)
    model_card = c.card

    return RankedModel(
        model_id=model_card.model_id,
        provider=model_card.provider,
        predicted_success=round(c.predicted_success, 4),
        est_cost_usd=round(c.est_cost_usd, 8),
        est_cost_breakdown=c.est_cost_breakdown,
        score=round(c.score, 4),
        rationale=c.rationale,
        decision_basis=c.decision_basis,
        evidence=evidence,
        supports_prompt_caching=model_card.supports_prompt_caching,
        context_window=model_card.context_window,
        est_latency_ms=latency_ms,
        latency_basis=c.latency_basis,
        est_cost_low=cost_low,
        est_cost_high=cost_high,
        cost_band_basis=c.cost_band_basis,
        success_interval_width=round(c.interval_width, 4),
    )
|