minima-cli 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minima/__init__.py +5 -0
- minima/api/__init__.py +1 -0
- minima/api/auth.py +39 -0
- minima/api/errors.py +40 -0
- minima/api/routers/__init__.py +1 -0
- minima/api/routers/calibration.py +50 -0
- minima/api/routers/feedback.py +279 -0
- minima/api/routers/health.py +50 -0
- minima/api/routers/models.py +42 -0
- minima/api/routers/recommend.py +66 -0
- minima/api/routers/savings.py +55 -0
- minima/api/routers/strategies.py +33 -0
- minima/catalog/__init__.py +1 -0
- minima/catalog/data/capability_priors.json +210 -0
- minima/catalog/data/model_aliases.json +12 -0
- minima/catalog/merge.py +69 -0
- minima/catalog/refresh.py +54 -0
- minima/catalog/sources/__init__.py +1 -0
- minima/catalog/sources/litellm.py +19 -0
- minima/catalog/sources/openrouter.py +25 -0
- minima/catalog/store.py +86 -0
- minima/config.py +288 -0
- minima/deps.py +35 -0
- minima/llm/__init__.py +1 -0
- minima/llm/anthropic.py +106 -0
- minima/llm/base.py +196 -0
- minima/llm/gemini.py +124 -0
- minima/llm/registry.py +54 -0
- minima/logging.py +28 -0
- minima/main.py +109 -0
- minima/memory/__init__.py +1 -0
- minima/memory/adapter.py +572 -0
- minima/memory/keys.py +83 -0
- minima/memory/records.py +190 -0
- minima/memory/threadpool.py +41 -0
- minima/metrics/__init__.py +1 -0
- minima/metrics/calibration.py +415 -0
- minima/metrics/report.py +116 -0
- minima/metrics/savings.py +98 -0
- minima/recommender/__init__.py +1 -0
- minima/recommender/_pg_pool.py +38 -0
- minima/recommender/_redis_client.py +32 -0
- minima/recommender/aggregate.py +157 -0
- minima/recommender/classify.py +165 -0
- minima/recommender/decisionlog.py +505 -0
- minima/recommender/durablerefs.py +312 -0
- minima/recommender/engine.py +997 -0
- minima/recommender/escalation.py +83 -0
- minima/recommender/propensity.py +189 -0
- minima/recommender/recstore.py +368 -0
- minima/recommender/score.py +318 -0
- minima/recommender/types.py +166 -0
- minima/schemas/__init__.py +1 -0
- minima/schemas/common.py +73 -0
- minima/schemas/feedback.py +34 -0
- minima/schemas/models_catalog.py +36 -0
- minima/schemas/recommend.py +104 -0
- minima/schemas/savings.py +39 -0
- minima/schemas/strategies.py +57 -0
- minima/schemas/workflow.py +43 -0
- minima/seeding/__init__.py +1 -0
- minima/seeding/items.py +42 -0
- minima/seeding/llmrouterbench.py +232 -0
- minima/seeding/routerbench.py +141 -0
- minima/seeding/run_seed.py +56 -0
- minima/seeding/synthetic.py +70 -0
- minima/tenancy/__init__.py +8 -0
- minima/tenancy/context.py +37 -0
- minima/tenancy/passthrough.py +110 -0
- minima/version.py +3 -0
- minima_cli-0.4.9.dist-info/METADATA +275 -0
- minima_cli-0.4.9.dist-info/RECORD +161 -0
- minima_cli-0.4.9.dist-info/WHEEL +4 -0
- minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
- minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
- minima_client/__init__.py +19 -0
- minima_client/autocapture.py +101 -0
- minima_client/client.py +301 -0
- minima_client/errors.py +23 -0
- minima_harness/LICENSE_PI +32 -0
- minima_harness/__init__.py +16 -0
- minima_harness/agent/__init__.py +72 -0
- minima_harness/agent/agent.py +276 -0
- minima_harness/agent/events.py +124 -0
- minima_harness/agent/loop.py +311 -0
- minima_harness/agent/state.py +79 -0
- minima_harness/agent/tools.py +97 -0
- minima_harness/ai/__init__.py +66 -0
- minima_harness/ai/compat.py +71 -0
- minima_harness/ai/errors.py +96 -0
- minima_harness/ai/events.py +117 -0
- minima_harness/ai/openrouter_catalog.py +153 -0
- minima_harness/ai/provider_catalog.py +299 -0
- minima_harness/ai/provider_quirks.py +37 -0
- minima_harness/ai/providers/__init__.py +75 -0
- minima_harness/ai/providers/_common.py +48 -0
- minima_harness/ai/providers/anthropic.py +290 -0
- minima_harness/ai/providers/base.py +65 -0
- minima_harness/ai/providers/faux.py +173 -0
- minima_harness/ai/providers/google.py +221 -0
- minima_harness/ai/providers/openai_compat.py +278 -0
- minima_harness/ai/registry.py +184 -0
- minima_harness/ai/stream.py +82 -0
- minima_harness/ai/tools.py +51 -0
- minima_harness/ai/types.py +204 -0
- minima_harness/ai/usage.py +41 -0
- minima_harness/minima/__init__.py +40 -0
- minima_harness/minima/cache.py +102 -0
- minima_harness/minima/config.py +85 -0
- minima_harness/minima/goals.py +226 -0
- minima_harness/minima/judge.py +144 -0
- minima_harness/minima/mapping.py +147 -0
- minima_harness/minima/meter.py +143 -0
- minima_harness/minima/router.py +220 -0
- minima_harness/minima/runtime.py +544 -0
- minima_harness/minima/signals.py +195 -0
- minima_harness/session/__init__.py +14 -0
- minima_harness/session/format.py +35 -0
- minima_harness/session/store.py +236 -0
- minima_harness/tasks/__init__.py +17 -0
- minima_harness/tasks/task_set.py +78 -0
- minima_harness/tools/__init__.py +7 -0
- minima_harness/tools/_io.py +34 -0
- minima_harness/tools/bash.py +70 -0
- minima_harness/tools/builtin.py +23 -0
- minima_harness/tools/edit.py +50 -0
- minima_harness/tools/find.py +38 -0
- minima_harness/tools/grep.py +73 -0
- minima_harness/tools/ls.py +35 -0
- minima_harness/tools/read.py +38 -0
- minima_harness/tools/tasks.py +75 -0
- minima_harness/tools/write.py +36 -0
- minima_harness/tui/__init__.py +3 -0
- minima_harness/tui/analytics.py +111 -0
- minima_harness/tui/app.py +1927 -0
- minima_harness/tui/bridge.py +103 -0
- minima_harness/tui/cli.py +227 -0
- minima_harness/tui/clipboard.py +60 -0
- minima_harness/tui/commands.py +49 -0
- minima_harness/tui/compaction.py +17 -0
- minima_harness/tui/config_cli.py +141 -0
- minima_harness/tui/config_store.py +237 -0
- minima_harness/tui/context.py +93 -0
- minima_harness/tui/customize.py +95 -0
- minima_harness/tui/diff.py +53 -0
- minima_harness/tui/editor.py +43 -0
- minima_harness/tui/extensions.py +84 -0
- minima_harness/tui/extra_models.py +52 -0
- minima_harness/tui/history.py +71 -0
- minima_harness/tui/mubit.py +295 -0
- minima_harness/tui/overlays.py +593 -0
- minima_harness/tui/packages.py +59 -0
- minima_harness/tui/run_modes.py +66 -0
- minima_harness/tui/theme.py +77 -0
- minima_harness/tui/welcome.py +83 -0
- minima_harness/tui/widgets/__init__.py +3 -0
- minima_harness/tui/widgets/banner.py +38 -0
- minima_harness/tui/widgets/editor.py +83 -0
- minima_harness/tui/widgets/footer.py +73 -0
- minima_harness/tui/widgets/messages.py +151 -0
- minima_harness/tui/widgets/status.py +57 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""Scoring: capability prior + memory -> predicted success; cost; slider -> threshold."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
import random
|
|
7
|
+
|
|
8
|
+
from minima.memory.records import clamp01
|
|
9
|
+
from minima.recommender.types import ModelAggregate
|
|
10
|
+
from minima.schemas.common import TaskType
|
|
11
|
+
from minima.schemas.models_catalog import ModelCard
|
|
12
|
+
|
|
13
|
+
# Fallback capability prior, returned by ``capability_prior`` when a model card has
# neither a per-task-type capability nor an "intelligence_index" prior.
_DEFAULT_PRIOR = 0.5
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def capability_prior(card: ModelCard, task_type: TaskType) -> float:
    """Prior probability that ``card`` handles ``task_type`` well, in [0, 1].

    Lookup order:

    1. the card's per-task-type capability for ``task_type``;
    2. the card's general ``"intelligence_index"`` prior;
    3. the module default ``_DEFAULT_PRIOR``.

    Values found on the card are passed through ``clamp01``; the module default is
    returned as-is.
    """
    task_specific = card.capability_by_task_type.get(task_type)
    if task_specific is None:
        # No task-specific number: fall back to the general intelligence prior.
        general = card.capability_priors.get("intelligence_index")
        if general is None:
            return _DEFAULT_PRIOR
        return clamp01(general)
    return clamp01(task_specific)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def predicted_success(
    agg: ModelAggregate | None, prior: float, pseudocount: float
) -> tuple[float, float]:
    """Blend recalled outcomes with the capability prior via Beta smoothing.

    The prior contributes ``pseudocount`` virtual observations, split into
    ``prior * pseudocount`` successes and ``(1 - prior) * pseudocount`` failures.

    Returns:
        ``(predicted_success, confidence)``. Without evidence (``agg`` is ``None`` or its
        ``weight_sum`` is not positive) the prediction is the prior and confidence is 0.
        Otherwise confidence is ``1 - 1 / (1 + weight_sum)``. Both values are passed
        through ``clamp01``.
    """
    prior_successes = prior * pseudocount
    prior_failures = (1.0 - prior) * pseudocount
    no_evidence = agg is None or agg.weight_sum <= 0.0
    if no_evidence:
        return clamp01(prior), 0.0
    numerator = agg.weighted_success + prior_successes
    denominator = agg.weight_sum + prior_successes + prior_failures
    posterior_mean = numerator / denominator
    # Saturating confidence: 0 with no weight, approaching 1 as weight grows.
    evidence_confidence = 1.0 - 1.0 / (1.0 + agg.weight_sum)
    return clamp01(posterior_mean), clamp01(evidence_confidence)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def estimate_cost(
    card: ModelCard,
    input_tokens: int,
    output_tokens: int,
    use_cache: bool = False,
    cache_fraction: float = 0.0,
) -> tuple[float, dict[str, float]]:
    """Flat per-token cost estimate for one call.

    Input pricing, in priority order:

    * ``use_cache`` (caching is REQUIRED): all input is priced at the cache-read rate;
    * ``cache_fraction`` > 0 (the lever-aware blend): that fraction of input, capped
      at 1, is priced at the cache-read rate and the rest at the full input rate;
    * otherwise, or when the card has no cache-read rate: the full input rate.

    Returns:
        ``(total_usd, breakdown)`` where ``breakdown`` holds the ``"input"`` and
        ``"output"`` components rounded to 8 decimals; the total is unrounded.
    """
    if use_cache and (cached_rate := card.cache_read_cost_per_mtok) is not None:
        input_rate = cached_rate
    elif cache_fraction > 0.0 and (cached_rate := card.cache_read_cost_per_mtok) is not None:
        cached_share = min(1.0, cache_fraction)
        input_rate = cached_share * cached_rate + (1.0 - cached_share) * card.input_cost_per_mtok
    else:
        input_rate = card.input_cost_per_mtok

    # Rates are quoted per million tokens.
    input_cost = (input_tokens / 1_000_000.0) * input_rate
    output_cost = (output_tokens / 1_000_000.0) * card.output_cost_per_mtok
    return input_cost + output_cost, {
        "input": round(input_cost, 8),
        "output": round(output_cost, 8),
    }
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def choose_cost_basis(
    aggs_by_id: dict[str, ModelAggregate | None],
    use_observed: bool,
    require_caching: bool,
    min_cost_n: int,
) -> str:
    """Select a single cost basis for the whole candidate set (like-for-like comparison).

    The result is the best tier that EVERY candidate supports:

    - ``"rescaled"``: observed output-token behavior re-priced for THIS request
      (size-exact AND reasoning-aware). Needs every candidate to have at least
      ``min_cost_n`` output-token observations.
    - ``"observed"``: robust median realized $/call (reasoning-aware, size-approximate).
      Needs every candidate to have at least ``min_cost_n`` cost observations, and is
      skipped when caching is requested (recalled history is non-cached, so the
      cache-aware estimate is the right basis there).
    - ``"estimate"``: flat (cache-aware) token estimate, the cold-start / mixed-evidence
      fallback. Also returned when ``use_observed`` is false or there are no candidates.
    """
    candidates = list(aggs_by_id.values()) if use_observed else []
    if not candidates:
        return "estimate"

    def _every(has_evidence) -> bool:
        # A candidate without an aggregate (None) never supports an observed tier.
        return all(agg is not None and has_evidence(agg) for agg in candidates)

    if _every(lambda agg: agg.observed_output_tokens(min_cost_n) is not None):
        return "rescaled"
    if not require_caching and _every(lambda agg: agg.observed_cost(min_cost_n) is not None):
        return "observed"
    return "estimate"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def rescaled_cost(
    card: ModelCard, agg: ModelAggregate, input_tokens: int, use_cache: bool, min_cost_n: int
) -> float | None:
    """Price observed output behavior for the current request.

    Cost = this request's ``input_tokens`` at the (cache-aware) input rate, plus the
    model's observed median output tokens at the output rate. The cache-read rate is
    used for input only when ``use_cache`` is set and the card defines one.

    Returns ``None`` when ``agg`` has too few output-token observations
    (fewer than ``min_cost_n``).
    """
    median_output_tokens = agg.observed_output_tokens(min_cost_n)
    if median_output_tokens is None:
        return None
    if use_cache and (cached_rate := card.cache_read_cost_per_mtok) is not None:
        input_rate = cached_rate
    else:
        input_rate = card.input_cost_per_mtok
    # Rates are quoted per million tokens.
    input_cost = (input_tokens / 1_000_000.0) * input_rate
    output_cost = (median_output_tokens / 1_000_000.0) * card.output_cost_per_mtok
    return input_cost + output_cost
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def effective_cost(
    card: ModelCard,
    agg: ModelAggregate | None,
    input_tokens: int,
    output_tokens: int,
    use_cache: bool,
    basis: str,
    min_cost_n: int,
    cache_fraction: float = 0.0,
) -> tuple[float, dict[str, float]]:
    """Ranking cost on the caller-chosen ``basis`` (the same basis for all candidates).

    The flat token estimate assumes a fixed completion length, so it understates models
    that spend many output tokens on internal reasoning/thinking. The evidence-based
    bases correct for that:

    * ``"rescaled"`` re-prices observed output behavior for this request and reports
      ``{"rescaled", "obs_output_tokens"}``;
    * ``"observed"`` uses the robust median realized $/call and reports
      ``{"observed_avg"}``.

    Either one falls through to the (cache-aware) ``estimate_cost`` result when its
    evidence is absent; any other ``basis`` goes straight to the estimate.
    """
    if agg is not None:
        if basis == "rescaled":
            repriced = rescaled_cost(card, agg, input_tokens, use_cache, min_cost_n)
            if repriced is not None:
                median_output = agg.observed_output_tokens(min_cost_n) or 0.0
                detail = {
                    "rescaled": round(repriced, 8),
                    "obs_output_tokens": round(median_output, 1),
                }
                return repriced, detail
        elif basis == "observed":
            realized = agg.observed_cost(min_cost_n)
            if realized is not None:
                return realized, {"observed_avg": round(realized, 8)}
    # Cold start, thin evidence, or the explicit "estimate" basis.
    return estimate_cost(card, input_tokens, output_tokens, use_cache, cache_fraction)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def effective_cost_band(
    card: ModelCard,
    agg: ModelAggregate | None,
    input_tokens: int,
    use_cache: bool,
    basis: str,
    min_cost_n: int,
    q_low: float = 0.25,
    q_high: float = 0.75,
) -> tuple[tuple[float, float], str] | None:
    """Data-grounded cost band ``((low, high), basis_label)`` matching the ranking ``basis``.

    This is the honest range behind the point value from ``effective_cost``:

    * ``"rescaled"``: the observed output-token band re-priced for this request. The
      input cost is fixed (cache-aware) and only the output term varies. The label is
      ``"rescaled_p<low>_p<high>"``.
    * ``"observed"``: the realized $/call band used directly. The label is
      ``"observed_p<low>_p<high>"``.

    Returns ``None`` for the ``"estimate"`` basis, when ``agg`` is ``None``, or when the
    aggregate reports no band for ``min_cost_n``. The caller renders "no range yet"
    rather than fabricating one.
    """
    if agg is None:
        return None
    low_pct, high_pct = int(round(q_low * 100)), int(round(q_high * 100))
    percentile_tag = f"p{low_pct}_p{high_pct}"

    if basis == "observed":
        dollar_band = agg.observed_cost_band(min_cost_n, q_low, q_high)
        if dollar_band is None:
            return None
        return dollar_band, f"observed_{percentile_tag}"

    if basis != "rescaled":
        return None

    token_band = agg.observed_output_tokens_band(min_cost_n, q_low, q_high)
    if token_band is None:
        return None
    low_tokens, high_tokens = token_band
    if use_cache and card.cache_read_cost_per_mtok is not None:
        input_rate = card.cache_read_cost_per_mtok
    else:
        input_rate = card.input_cost_per_mtok
    # Input cost is identical at both ends of the band; only output volume varies.
    fixed_input_cost = (input_tokens / 1_000_000.0) * input_rate
    band_low = fixed_input_cost + (low_tokens / 1_000_000.0) * card.output_cost_per_mtok
    band_high = fixed_input_cost + (high_tokens / 1_000_000.0) * card.output_cost_per_mtok
    return (band_low, band_high), f"rescaled_{percentile_tag}"
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def threshold_from_slider(
    cost_quality_tradeoff: float, tau_min: float, tau_max: float, min_quality: float | None = None
) -> float:
    """Translate the 0..10 slider into a minimum acceptable predicted-success threshold.

    The slider is clamped to [0, 10] and interpolated linearly between the bounds:
    0 = accept the cheapest model clearing ``tau_min``; 10 = require ``tau_max``.
    When ``min_quality`` is given, the result is never below it.
    """
    slider = max(0.0, min(10.0, cost_quality_tradeoff))
    threshold = tau_min + (slider / 10.0) * (tau_max - tau_min)
    return threshold if min_quality is None else max(threshold, min_quality)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def with_exploration_bonus(predicted: float, confidence: float, bonus: float) -> float:
    """Optimistically raise predicted success for under-explored candidates.

    The added amount is ``bonus * (1 - confidence)``: well-evidenced models are barely
    touched, while models with little or no recalled evidence get the full nudge. That
    is enough to occasionally clear the threshold and earn a recommendation (and thus
    feedback). A ``bonus`` of 0 (or below) disables exploration entirely and returns
    ``predicted`` unchanged (pure exploitation).
    """
    exploration_disabled = bonus <= 0.0
    if exploration_disabled:
        return predicted
    uncertainty = 1.0 - clamp01(confidence)
    return clamp01(predicted + bonus * uncertainty)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def ranking_score(predicted: float, normalized_cost: float, cost_quality_tradeoff: float) -> float:
    """Smooth quality/cost blend used to order the returned list.

    This is distinct from the hard threshold. The slider is clamped to [0, 10] and
    mapped to a quality weight: slider 0 gives 0.3 (cost-leaning), slider 10 gives
    1.0 (quality-only). The cost term is weighted by the complement and subtracted.
    """
    slider = max(0.0, min(10.0, cost_quality_tradeoff))
    quality_weight = 0.3 + 0.07 * slider
    cost_weight = 1.0 - quality_weight
    return quality_weight * predicted - cost_weight * normalized_cost
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def ucb_score(
    predicted: float,
    interval_width: float,
    normalized_cost: float,
    cost_quality_tradeoff: float,
    alpha: float,
) -> float:
    """Upper-confidence-bound contextual-bandit score (optimism under uncertainty).

    Uses the same cost/quality scalarization as :func:`ranking_score`, except that the
    success term is first raised by ``alpha`` times the interval half-width (and passed
    through ``clamp01``), so under-explored arms are favoured for exploration.

    Used by the SHADOW bandit policy: logged for regret comparison, never overriding
    the deployed conjugate pick.
    """
    slider = max(0.0, min(10.0, cost_quality_tradeoff))
    quality_weight = 0.3 + 0.07 * slider
    optimism = alpha * 0.5 * interval_width
    boosted = clamp01(predicted + optimism)
    return quality_weight * boosted - (1.0 - quality_weight) * normalized_cost
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def posterior_interval_width(
    agg: ModelAggregate | None, prior: float, pseudocount: float
) -> float:
    """Approximate 95% credible-interval width of the Beta-smoothed success estimate.

    Normal approximation on the posterior mean: width = 2 * 1.96 * sqrt(p(1-p)/n_eff),
    where n_eff = weight_sum + pseudocount, capped at 1.0.

    With no evidence (``agg`` is ``None`` or its ``weight_sum`` is not positive) the
    width is maximal (1.0): "we know nothing" reads as full uncertainty, the natural
    escalation signal. This case is returned explicitly. Deriving it from the formula
    would give a width below 1.0 for a large ``pseudocount`` or an extreme ``prior``,
    making a never-observed model look partially known.

    Args:
        agg: Recalled-outcome aggregate for the candidate, or ``None``.
        prior: Capability prior used for smoothing.
        pseudocount: Weight of the prior, in virtual observations.

    Returns:
        The interval width, in (0, 1].
    """
    # Same no-evidence condition predicted_success uses to report confidence 0.
    if agg is None or agg.weight_sum <= 0.0:
        return 1.0
    p, _ = predicted_success(agg, prior, pseudocount)
    # Floors keep the expression finite for a zero pseudocount or a degenerate p of 0/1.
    n_eff = agg.weight_sum + max(pseudocount, 1e-9)
    width = 2.0 * 1.96 * (max(p * (1.0 - p), 1e-9) / n_eff) ** 0.5
    return min(1.0, width)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def softmax_propensities(
    scores: dict[str, float], argmin_id: str, epsilon: float, temperature: float
) -> dict[str, float]:
    """Selection propensities of the epsilon-softmax policy over the eligible set.

    pi(m) = (1 - eps) * 1[m == argmin] + eps * softmax(score(m) / temperature).

    ``epsilon`` is clamped to [0, 1] and ``temperature`` is floored at 1e-6. The
    deterministic policy is the eps=0 special case (a degenerate vector). Propensities
    sum to 1 over the eligible candidates. An empty ``scores`` yields an empty dict.
    """
    if not scores:
        return {}
    temp = max(temperature, 1e-6)
    top = max(scores.values())
    # Subtracting the max keeps every exponent <= 0, so exp() cannot overflow.
    weights = {model_id: math.exp((score - top) / temp) for model_id, score in scores.items()}
    norm = sum(weights.values()) or 1.0
    explore = max(0.0, min(1.0, epsilon))
    greedy = 1.0 - explore
    propensities: dict[str, float] = {}
    for model_id, weight in weights.items():
        indicator = 1.0 if model_id == argmin_id else 0.0
        propensities[model_id] = greedy * indicator + explore * (weight / norm)
    return propensities
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def beta_params(
    agg: ModelAggregate | None, prior: float, pseudocount: float
) -> tuple[float, float]:
    """Beta posterior ``(alpha, beta)`` for a candidate's success.

    This is the conjugate counterpart of :func:`predicted_success`, whose mean is
    alpha / (alpha + beta). The prior contributes ``prior * pseudocount`` to alpha and
    ``(1 - prior) * pseudocount`` to beta. With evidence, the weighted successes and
    weighted failures (``weight_sum - weighted_success``) are added on top. Both
    parameters are floored at 1e-6 so they stay valid Beta parameters for sampling.
    """
    prior_alpha = prior * pseudocount
    prior_beta = (1.0 - prior) * pseudocount
    floor = 1e-6
    if agg is None or agg.weight_sum <= 0.0:
        posterior = (prior_alpha, prior_beta)
    else:
        weighted_failures = agg.weight_sum - agg.weighted_success
        posterior = (agg.weighted_success + prior_alpha, weighted_failures + prior_beta)
    return max(posterior[0], floor), max(posterior[1], floor)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def thompson_select(
    items: list[tuple[str, float, float, float]],
    tau: float,
    rng: random.Random,
    samples: int = 128,
) -> tuple[str, dict[str, float]]:
    """Posterior-sampling (Thompson) selection over the cost-aware objective.

    Args:
        items: One ``(model_id, alpha, beta, est_cost_usd)`` tuple per candidate.
        tau: Success threshold a sampled value must reach to be eligible.
        rng: Random source used for the Beta draws and for the final pick.
        samples: Number of Monte-Carlo rounds; at least one round always runs.

    Each round draws theta_m ~ Beta(alpha_m, beta_m) for every candidate and picks the
    cheapest model whose draw clears ``tau``. Cost ties go to the higher draw. When no
    draw clears, the highest draw wins.

    Returns:
        ``(pick_id, propensities)``. The selection frequencies ARE the propensities (so
        IPW/off-policy evaluation stay valid), and the returned pick is sampled
        proportional to those frequencies, which keeps it consistent with them.
        ``("", {})`` when ``items`` is empty.
    """
    if not items:
        return "", {}
    tally = {model_id: 0 for model_id, _, _, _ in items}
    rounds = max(1, samples)
    for _ in range(rounds):
        draws = {}
        for model_id, a, b, _cost in items:
            draws[model_id] = rng.betavariate(a, b)
        eligible = [
            (model_id, cost) for model_id, _a, _b, cost in items if draws[model_id] >= tau
        ]
        if eligible:
            # Cheapest first; among equal costs prefer the higher sampled success.
            winner = min(eligible, key=lambda pair: (pair[1], -draws[pair[0]]))[0]
        else:
            winner = max(items, key=lambda item: draws[item[0]])[0]
        tally[winner] += 1
    denominator = sum(tally.values()) or 1
    propensities = {model_id: hits / denominator for model_id, hits in tally.items()}
    pick_id = rng.choices(list(tally), weights=list(tally.values()), k=1)[0]
    return pick_id, propensities
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Internal dataclasses shared across the recommender stages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
from minima.memory.records import RecalledEvidence
|
|
8
|
+
from minima.schemas.common import DecisionBasis
|
|
9
|
+
from minima.schemas.models_catalog import ModelCard
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _weighted_quantile(pairs: list[tuple[float, float]], q: float) -> float:
    """Lower weighted ``q``-quantile of ``(value, weight)`` pairs (robust to outliers).

    ``q`` is clamped to [0, 1]. Pairs are ordered by value, and the result is the first
    value whose cumulative weight reaches ``q`` times the total weight. When the total
    weight is not positive, a plain positional quantile over the sorted values is used
    instead.
    """
    ordered = sorted(pairs, key=lambda pair: pair[0])
    weight_total = sum(weight for _, weight in ordered)
    fraction = max(0.0, min(1.0, q))
    if weight_total <= 0.0:
        # All-zero weights carry no information: fall back to position in sorted order.
        position = min(len(ordered) - 1, int(fraction * len(ordered)))
        return ordered[position][0]
    threshold = weight_total * fraction
    running = 0.0
    for value, weight in ordered:
        running += weight
        if running >= threshold:
            return value
    # Reached only if float accumulation never met the threshold.
    return ordered[-1][0]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _weighted_median(pairs: list[tuple[float, float]]) -> float:
    """Lower weighted median of ``(value, weight)`` pairs: the 0.5 weighted quantile."""
    return _weighted_quantile(pairs, q=0.5)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(slots=True)
|
|
35
|
+
class ModelAggregate:
|
|
36
|
+
"""Weighted summary of recalled outcomes for one candidate model."""
|
|
37
|
+
|
|
38
|
+
model_id: str
|
|
39
|
+
weight_sum: float = 0.0
|
|
40
|
+
weighted_success: float = 0.0
|
|
41
|
+
n: int = 0
|
|
42
|
+
avg_knowledge_confidence: float = 0.0
|
|
43
|
+
evidence: list[RecalledEvidence] = field(default_factory=list)
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def weighted_success_rate(self) -> float:
|
|
47
|
+
if self.weight_sum <= 0:
|
|
48
|
+
return 0.0
|
|
49
|
+
return self.weighted_success / self.weight_sum
|
|
50
|
+
|
|
51
|
+
def observed_cost(self, min_n: int) -> float | None:
|
|
52
|
+
"""Robust realized $/call over cost-bearing neighbors: a similarity-weighted MEDIAN.
|
|
53
|
+
|
|
54
|
+
A realized cost is an objective measurement, so it is weighted by topical similarity
|
|
55
|
+
only — NOT by the staleness/knowledge-confidence factors that legitimately discount the
|
|
56
|
+
*success* signal (a past call's dollar amount doesn't get cheaper because the record is
|
|
57
|
+
old). The median keeps a single mis-recorded or pathological cost_usd (wrong units, a
|
|
58
|
+
cumulative total, a timed-out retry) from dominating. Returns None when fewer than
|
|
59
|
+
``min_n`` recalled neighbors carry a positive cost.
|
|
60
|
+
"""
|
|
61
|
+
pairs = [
|
|
62
|
+
(ev.record.cost_usd, max(0.0, ev.score))
|
|
63
|
+
for ev in self.evidence
|
|
64
|
+
if ev.record is not None and ev.record.cost_usd and ev.record.cost_usd > 0.0
|
|
65
|
+
]
|
|
66
|
+
if len(pairs) < min_n:
|
|
67
|
+
return None
|
|
68
|
+
return _weighted_median(pairs)
|
|
69
|
+
|
|
70
|
+
def observed_output_tokens(self, min_n: int) -> float | None:
|
|
71
|
+
"""Robust median realized OUTPUT tokens/call (incl. reasoning/thinking) over neighbors.
|
|
72
|
+
|
|
73
|
+
Captures the model's true output behavior on similar tasks — the part a flat token
|
|
74
|
+
estimate misses — so cost can be re-scaled to the current request's input size while
|
|
75
|
+
keeping the realized output (thinking) volume. Similarity-weighted median; None when
|
|
76
|
+
fewer than ``min_n`` recalled neighbors carry an output-token count.
|
|
77
|
+
"""
|
|
78
|
+
pairs = [
|
|
79
|
+
(float(ev.record.output_tokens), max(0.0, ev.score))
|
|
80
|
+
for ev in self.evidence
|
|
81
|
+
if ev.record is not None and ev.record.output_tokens and ev.record.output_tokens > 0
|
|
82
|
+
]
|
|
83
|
+
if len(pairs) < min_n:
|
|
84
|
+
return None
|
|
85
|
+
return _weighted_median(pairs)
|
|
86
|
+
|
|
87
|
+
def observed_latency_ms(self, min_n: int, q: float = 0.75) -> float | None:
|
|
88
|
+
"""Robust observed latency percentile (default p75) over latency-bearing neighbors.
|
|
89
|
+
|
|
90
|
+
Like realized cost, latency is an objective measurement: weighted by topical
|
|
91
|
+
similarity only, not by staleness/knowledge-confidence. A high percentile (not
|
|
92
|
+
the median) is deliberate — SLA enforcement cares about the typical-worst case.
|
|
93
|
+
None when fewer than ``min_n`` recalled neighbors carry a latency.
|
|
94
|
+
"""
|
|
95
|
+
pairs = [
|
|
96
|
+
(float(ev.record.latency_ms), max(0.0, ev.score))
|
|
97
|
+
for ev in self.evidence
|
|
98
|
+
if ev.record is not None and ev.record.latency_ms and ev.record.latency_ms > 0
|
|
99
|
+
]
|
|
100
|
+
if len(pairs) < min_n:
|
|
101
|
+
return None
|
|
102
|
+
return _weighted_quantile(pairs, q)
|
|
103
|
+
|
|
104
|
+
def observed_cost_band(
|
|
105
|
+
self, min_n: int, q_low: float = 0.25, q_high: float = 0.75
|
|
106
|
+
) -> tuple[float, float] | None:
|
|
107
|
+
"""Robust p_low–p_high band of realized $/call (default p25–p75) — the data-grounded
|
|
108
|
+
predictable cost range. Same (cost, similarity) pairs and similarity-only weighting as
|
|
109
|
+
:meth:`observed_cost`; None when fewer than ``min_n`` neighbors carry a positive cost.
|
|
110
|
+
"""
|
|
111
|
+
pairs = [
|
|
112
|
+
(ev.record.cost_usd, max(0.0, ev.score))
|
|
113
|
+
for ev in self.evidence
|
|
114
|
+
if ev.record is not None and ev.record.cost_usd and ev.record.cost_usd > 0.0
|
|
115
|
+
]
|
|
116
|
+
if len(pairs) < min_n:
|
|
117
|
+
return None
|
|
118
|
+
return (_weighted_quantile(pairs, q_low), _weighted_quantile(pairs, q_high))
|
|
119
|
+
|
|
120
|
+
def observed_output_tokens_band(
|
|
121
|
+
self, min_n: int, q_low: float = 0.25, q_high: float = 0.75
|
|
122
|
+
) -> tuple[float, float] | None:
|
|
123
|
+
"""Robust p_low–p_high band of realized output tokens/call — for re-pricing the cost
|
|
124
|
+
band to the current request's input size (rescaled basis). None below ``min_n``."""
|
|
125
|
+
pairs = [
|
|
126
|
+
(float(ev.record.output_tokens), max(0.0, ev.score))
|
|
127
|
+
for ev in self.evidence
|
|
128
|
+
if ev.record is not None and ev.record.output_tokens and ev.record.output_tokens > 0
|
|
129
|
+
]
|
|
130
|
+
if len(pairs) < min_n:
|
|
131
|
+
return None
|
|
132
|
+
return (_weighted_quantile(pairs, q_low), _weighted_quantile(pairs, q_high))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass(slots=True)
class CandidateScore:
    """Scoring result for one candidate model: its card plus the predicted success,
    cost, latency and uncertainty figures attached to it.
    """

    # Catalog card of the model being scored.
    card: ModelCard
    # Deployed success probability (calibrated + exploration bonus); see
    # ``raw_predicted_success`` below for the pre-calibration value.
    predicted_success: float
    confidence: float
    # Point cost estimate in USD and its per-component breakdown.
    est_cost_usd: float
    est_cost_breakdown: dict[str, float]
    # Which path produced this recommendation (memory / prior / llm).
    decision_basis: DecisionBasis
    evidence: list[RecalledEvidence] = field(default_factory=list)
    score: float = 0.0
    rationale: str = ""
    # Observed latency percentile (ms) from recalled outcomes; None without evidence.
    est_latency_ms: float | None = None
    latency_basis: str = ""
    # Data-grounded predictable cost band (low, high) matching the chosen basis; None when
    # evidence is too thin to estimate a range. ``cost_band_basis`` labels its source
    # (e.g. "observed_p25_p75", "rescaled_p25_p75", "heuristic").
    est_cost_low: float | None = None
    est_cost_high: float | None = None
    cost_band_basis: str = ""
    # 95% credible-interval width of the success estimate (1.0 = no evidence). Powers the
    # routing-collapse margin guard and the harness green/amber/red confidence signal.
    interval_width: float = 1.0
    # Beta posterior parameters for the (uncalibrated) success estimate — used by Thompson
    # sampling when that selection policy is enabled.
    alpha: float = 0.0
    beta: float = 0.0
    # The pre-calibration, pre-exploration-bonus Beta-posterior mean — the HONEST
    # evidence-based probability. ``predicted_success`` above is the deployed value
    # (calibrated + exploration bonus); this raw value is what calibration is fit on,
    # so the recalibration loop converges instead of oscillating. None when unset.
    raw_predicted_success: float | None = None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Public request/response schemas (Pydantic v2)."""
|
minima/schemas/common.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Shared enums and request building blocks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TaskType(StrEnum):
    """Closed set of task categories a request can be tagged with.

    Being a ``StrEnum``, each member's value is its lowercase name as a plain string.
    """

    code = "code"
    summarization = "summarization"
    extraction = "extraction"
    qa = "qa"
    reasoning = "reasoning"
    classification = "classification"
    translation = "translation"
    creative = "creative"
    rag = "rag"
    tool_use = "tool_use"
    # Catch-all member for tasks outside the categories above.
    other = "other"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Difficulty(StrEnum):
    """Task difficulty levels, declared from ``trivial`` up to ``expert``."""

    trivial = "trivial"
    easy = "easy"
    medium = "medium"
    hard = "hard"
    expert = "expert"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class OutcomeLabel(StrEnum):
|
|
33
|
+
success = "success"
|
|
34
|
+
partial = "partial"
|
|
35
|
+
failure = "failure"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DecisionBasis(StrEnum):
|
|
39
|
+
"""Which path produced a recommendation."""
|
|
40
|
+
|
|
41
|
+
memory = "memory" # driven by empirical recalled outcomes
|
|
42
|
+
prior = "prior" # driven by capability priors (thin/no memory)
|
|
43
|
+
llm = "llm" # cheap-LLM reasoner was consulted
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Constraints(BaseModel):
    """Optional hard limits a caller can place on the candidate set.

    Every field defaults to "no constraint": ``None`` for the optional
    fields and ``False`` for the boolean flag.
    """

    allowed_providers: list[str] | None = None
    candidate_models: list[str] | None = None
    excluded_models: list[str] | None = None
    max_cost_per_call: float | None = Field(None, ge=0, description="USD; hard filter")
    min_quality: float | None = Field(None, ge=0, le=1, description="predicted_success floor")
    require_prompt_caching: bool = False
    max_latency_ms: int | None = Field(None, gt=0)
    require_context_window: int | None = Field(None, gt=0)

    def merged_over(self, base: Constraints) -> Constraints:
        """Return self with any unset field inherited from ``base``.

        A field is treated as unset when its value is ``None`` or ``False``.
        The checks are identity checks, so a numeric ``0`` / ``0.0`` and an
        empty list still count as set and override ``base``.

        NOTE(review): because ``False`` means "unset", an explicit
        ``require_prompt_caching=False`` cannot switch off a ``True`` coming
        from ``base`` -- confirm this is the intended merge rule.

        Args:
            base: Constraints supplying the fallback value for each field.

        Returns:
            A new ``Constraints`` instance; neither ``self`` nor ``base`` is
            modified.
        """
        merged = base.model_dump()
        # Overlay only the fields this instance actually sets.
        merged.update(
            (name, value)
            for name, value in self.model_dump().items()
            if value is not None and value is not False
        )
        return Constraints(**merged)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class TaskInput(BaseModel):
    """Caller-supplied description of a task.

    Only ``task`` is required; the remaining fields are optional hints
    (type, difficulty, expected token counts) plus a free-form tag list.
    """

    # Required and non-empty (min_length=1).
    task: str = Field(..., min_length=1, description="Raw task/prompt text; embedded by Mubit")
    task_type: TaskType | None = None
    difficulty: Difficulty | None = None
    # Token estimates must be non-negative when provided.
    expected_input_tokens: int | None = Field(None, ge=0)
    expected_output_tokens: int | None = Field(None, ge=0)
    # default_factory gives each instance its own list.
    tags: list[str] = Field(default_factory=list, description="-> Mubit env_tags")
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Schemas for the feedback / learning-loop endpoint."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from minima.schemas.common import OutcomeLabel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FeedbackRequest(BaseModel):
    """Request body for the feedback endpoint.

    ``recommendation_id``, ``chosen_model_id`` and ``outcome`` are required;
    every other field is an optional measurement or annotation. Numeric
    fields are validated as non-negative, and ``quality_score`` as 0..1.
    """

    recommendation_id: str = Field(..., min_length=1)
    chosen_model_id: str = Field(..., min_length=1, description="model actually run (may differ)")
    outcome: OutcomeLabel
    quality_score: float | None = Field(None, ge=0, le=1, description="caller-supplied; no judge")
    input_tokens: int | None = Field(None, ge=0)
    output_tokens: int | None = Field(None, ge=0)
    actual_cost_usd: float | None = Field(None, ge=0)
    latency_ms: int | None = Field(None, ge=0)
    iterations: int | None = Field(
        None, ge=0, description="agent loop turns to resolution (token-yield signal)"
    )
    verified_in_production: bool = False
    notes: str | None = None
    idempotency_key: str | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class FeedbackResponse(BaseModel):
    """Response body for the feedback endpoint.

    ``accepted`` is the only required field; the rest default to
    ``None`` / ``False`` / an empty list.
    """

    accepted: bool
    record_id: str | None = None
    # default_factory gives each instance its own list.
    reinforced_entry_ids: list[str] = Field(default_factory=list)
    updated_confidence: float | None = None
    reflection_triggered: bool = False
    lesson_promoted: bool = False
    warnings: list[str] = Field(default_factory=list)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Schemas for the model catalog endpoint."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
8
|
+
|
|
9
|
+
from minima.schemas.common import TaskType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ModelCard(BaseModel):
    """One model's catalog entry: identity, pricing, limits and capability data.

    ``model_id``, ``provider`` and the two per-MTok cost fields are required;
    everything else has a default.
    """

    # Pydantic reserves the ``model_`` prefix by default; clearing the protected
    # namespaces lets the ``model_id`` field be declared without a warning.
    model_config = ConfigDict(protected_namespaces=())

    model_id: str
    provider: str
    display_name: str = ""
    # Required and validated as non-negative.
    input_cost_per_mtok: float = Field(..., ge=0)
    output_cost_per_mtok: float = Field(..., ge=0)
    cache_read_cost_per_mtok: float | None = None
    supports_prompt_caching: bool = False
    context_window: int = 0
    max_output_tokens: int | None = None
    capability_priors: dict[str, float] = Field(default_factory=dict)
    capability_by_task_type: dict[TaskType, float] = Field(default_factory=dict)
    cost_source: str = ""
    cost_fetched_at: datetime | None = None
    cost_stale: bool = False
    capability_source: str = ""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ModelsResponse(BaseModel):
    """Response body for the model catalog endpoint.

    Carries the list of model cards and a catalog version string, plus an
    optional refresh timestamp and a staleness flag.
    """

    models: list[ModelCard]
    catalog_version: str
    refreshed_at: datetime | None = None
    stale: bool = False
|