minima-cli 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. minima/__init__.py +5 -0
  2. minima/api/__init__.py +1 -0
  3. minima/api/auth.py +39 -0
  4. minima/api/errors.py +40 -0
  5. minima/api/routers/__init__.py +1 -0
  6. minima/api/routers/calibration.py +50 -0
  7. minima/api/routers/feedback.py +279 -0
  8. minima/api/routers/health.py +50 -0
  9. minima/api/routers/models.py +42 -0
  10. minima/api/routers/recommend.py +66 -0
  11. minima/api/routers/savings.py +55 -0
  12. minima/api/routers/strategies.py +33 -0
  13. minima/catalog/__init__.py +1 -0
  14. minima/catalog/data/capability_priors.json +210 -0
  15. minima/catalog/data/model_aliases.json +12 -0
  16. minima/catalog/merge.py +69 -0
  17. minima/catalog/refresh.py +54 -0
  18. minima/catalog/sources/__init__.py +1 -0
  19. minima/catalog/sources/litellm.py +19 -0
  20. minima/catalog/sources/openrouter.py +25 -0
  21. minima/catalog/store.py +86 -0
  22. minima/config.py +288 -0
  23. minima/deps.py +35 -0
  24. minima/llm/__init__.py +1 -0
  25. minima/llm/anthropic.py +106 -0
  26. minima/llm/base.py +196 -0
  27. minima/llm/gemini.py +124 -0
  28. minima/llm/registry.py +54 -0
  29. minima/logging.py +28 -0
  30. minima/main.py +109 -0
  31. minima/memory/__init__.py +1 -0
  32. minima/memory/adapter.py +572 -0
  33. minima/memory/keys.py +83 -0
  34. minima/memory/records.py +190 -0
  35. minima/memory/threadpool.py +41 -0
  36. minima/metrics/__init__.py +1 -0
  37. minima/metrics/calibration.py +415 -0
  38. minima/metrics/report.py +116 -0
  39. minima/metrics/savings.py +98 -0
  40. minima/recommender/__init__.py +1 -0
  41. minima/recommender/_pg_pool.py +38 -0
  42. minima/recommender/_redis_client.py +32 -0
  43. minima/recommender/aggregate.py +157 -0
  44. minima/recommender/classify.py +165 -0
  45. minima/recommender/decisionlog.py +505 -0
  46. minima/recommender/durablerefs.py +312 -0
  47. minima/recommender/engine.py +997 -0
  48. minima/recommender/escalation.py +83 -0
  49. minima/recommender/propensity.py +189 -0
  50. minima/recommender/recstore.py +368 -0
  51. minima/recommender/score.py +318 -0
  52. minima/recommender/types.py +166 -0
  53. minima/schemas/__init__.py +1 -0
  54. minima/schemas/common.py +73 -0
  55. minima/schemas/feedback.py +34 -0
  56. minima/schemas/models_catalog.py +36 -0
  57. minima/schemas/recommend.py +104 -0
  58. minima/schemas/savings.py +39 -0
  59. minima/schemas/strategies.py +57 -0
  60. minima/schemas/workflow.py +43 -0
  61. minima/seeding/__init__.py +1 -0
  62. minima/seeding/items.py +42 -0
  63. minima/seeding/llmrouterbench.py +232 -0
  64. minima/seeding/routerbench.py +141 -0
  65. minima/seeding/run_seed.py +56 -0
  66. minima/seeding/synthetic.py +70 -0
  67. minima/tenancy/__init__.py +8 -0
  68. minima/tenancy/context.py +37 -0
  69. minima/tenancy/passthrough.py +110 -0
  70. minima/version.py +3 -0
  71. minima_cli-0.4.9.dist-info/METADATA +275 -0
  72. minima_cli-0.4.9.dist-info/RECORD +161 -0
  73. minima_cli-0.4.9.dist-info/WHEEL +4 -0
  74. minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
  75. minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
  76. minima_client/__init__.py +19 -0
  77. minima_client/autocapture.py +101 -0
  78. minima_client/client.py +301 -0
  79. minima_client/errors.py +23 -0
  80. minima_harness/LICENSE_PI +32 -0
  81. minima_harness/__init__.py +16 -0
  82. minima_harness/agent/__init__.py +72 -0
  83. minima_harness/agent/agent.py +276 -0
  84. minima_harness/agent/events.py +124 -0
  85. minima_harness/agent/loop.py +311 -0
  86. minima_harness/agent/state.py +79 -0
  87. minima_harness/agent/tools.py +97 -0
  88. minima_harness/ai/__init__.py +66 -0
  89. minima_harness/ai/compat.py +71 -0
  90. minima_harness/ai/errors.py +96 -0
  91. minima_harness/ai/events.py +117 -0
  92. minima_harness/ai/openrouter_catalog.py +153 -0
  93. minima_harness/ai/provider_catalog.py +299 -0
  94. minima_harness/ai/provider_quirks.py +37 -0
  95. minima_harness/ai/providers/__init__.py +75 -0
  96. minima_harness/ai/providers/_common.py +48 -0
  97. minima_harness/ai/providers/anthropic.py +290 -0
  98. minima_harness/ai/providers/base.py +65 -0
  99. minima_harness/ai/providers/faux.py +173 -0
  100. minima_harness/ai/providers/google.py +221 -0
  101. minima_harness/ai/providers/openai_compat.py +278 -0
  102. minima_harness/ai/registry.py +184 -0
  103. minima_harness/ai/stream.py +82 -0
  104. minima_harness/ai/tools.py +51 -0
  105. minima_harness/ai/types.py +204 -0
  106. minima_harness/ai/usage.py +41 -0
  107. minima_harness/minima/__init__.py +40 -0
  108. minima_harness/minima/cache.py +102 -0
  109. minima_harness/minima/config.py +85 -0
  110. minima_harness/minima/goals.py +226 -0
  111. minima_harness/minima/judge.py +144 -0
  112. minima_harness/minima/mapping.py +147 -0
  113. minima_harness/minima/meter.py +143 -0
  114. minima_harness/minima/router.py +220 -0
  115. minima_harness/minima/runtime.py +544 -0
  116. minima_harness/minima/signals.py +195 -0
  117. minima_harness/session/__init__.py +14 -0
  118. minima_harness/session/format.py +35 -0
  119. minima_harness/session/store.py +236 -0
  120. minima_harness/tasks/__init__.py +17 -0
  121. minima_harness/tasks/task_set.py +78 -0
  122. minima_harness/tools/__init__.py +7 -0
  123. minima_harness/tools/_io.py +34 -0
  124. minima_harness/tools/bash.py +70 -0
  125. minima_harness/tools/builtin.py +23 -0
  126. minima_harness/tools/edit.py +50 -0
  127. minima_harness/tools/find.py +38 -0
  128. minima_harness/tools/grep.py +73 -0
  129. minima_harness/tools/ls.py +35 -0
  130. minima_harness/tools/read.py +38 -0
  131. minima_harness/tools/tasks.py +75 -0
  132. minima_harness/tools/write.py +36 -0
  133. minima_harness/tui/__init__.py +3 -0
  134. minima_harness/tui/analytics.py +111 -0
  135. minima_harness/tui/app.py +1927 -0
  136. minima_harness/tui/bridge.py +103 -0
  137. minima_harness/tui/cli.py +227 -0
  138. minima_harness/tui/clipboard.py +60 -0
  139. minima_harness/tui/commands.py +49 -0
  140. minima_harness/tui/compaction.py +17 -0
  141. minima_harness/tui/config_cli.py +141 -0
  142. minima_harness/tui/config_store.py +237 -0
  143. minima_harness/tui/context.py +93 -0
  144. minima_harness/tui/customize.py +95 -0
  145. minima_harness/tui/diff.py +53 -0
  146. minima_harness/tui/editor.py +43 -0
  147. minima_harness/tui/extensions.py +84 -0
  148. minima_harness/tui/extra_models.py +52 -0
  149. minima_harness/tui/history.py +71 -0
  150. minima_harness/tui/mubit.py +295 -0
  151. minima_harness/tui/overlays.py +593 -0
  152. minima_harness/tui/packages.py +59 -0
  153. minima_harness/tui/run_modes.py +66 -0
  154. minima_harness/tui/theme.py +77 -0
  155. minima_harness/tui/welcome.py +83 -0
  156. minima_harness/tui/widgets/__init__.py +3 -0
  157. minima_harness/tui/widgets/banner.py +38 -0
  158. minima_harness/tui/widgets/editor.py +83 -0
  159. minima_harness/tui/widgets/footer.py +73 -0
  160. minima_harness/tui/widgets/messages.py +151 -0
  161. minima_harness/tui/widgets/status.py +57 -0
@@ -0,0 +1,318 @@
1
+ """Scoring: capability prior + memory -> predicted success; cost; slider -> threshold."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ import random
7
+
8
+ from minima.memory.records import clamp01
9
+ from minima.recommender.types import ModelAggregate
10
+ from minima.schemas.common import TaskType
11
+ from minima.schemas.models_catalog import ModelCard
12
+
13
+ _DEFAULT_PRIOR = 0.5
14
+
15
+
16
def capability_prior(card: ModelCard, task_type: TaskType) -> float:
    """Return the prior chance, clamped to [0, 1], that ``card`` suits ``task_type``.

    Lookup order: the card's per-task-type capability entry, then its generic
    ``"intelligence_index"`` prior, then the module default ``_DEFAULT_PRIOR``.
    """
    task_specific = card.capability_by_task_type.get(task_type)
    if task_specific is None:
        # No task-specific number: fall back to the generic intelligence prior.
        general = card.capability_priors.get("intelligence_index")
        if general is None:
            return _DEFAULT_PRIOR
        return clamp01(general)
    return clamp01(task_specific)
23
+
24
+
25
def predicted_success(
    agg: ModelAggregate | None, prior: float, pseudocount: float
) -> tuple[float, float]:
    """Blend recalled outcomes with the capability prior via Beta smoothing.

    The prior contributes ``pseudocount`` virtual observations, split into
    ``prior * pseudocount`` successes and the remainder as failures.

    Returns ``(predicted_success, confidence)``, both clamped to [0, 1]. When there is
    no aggregate, or its ``weight_sum`` is not positive, the prediction is the prior
    itself and the confidence is 0. Otherwise confidence is
    ``1 - 1 / (1 + weight_sum)``, which grows towards 1 as evidence accumulates.
    """
    if agg is None or agg.weight_sum <= 0.0:
        return clamp01(prior), 0.0
    prior_successes = prior * pseudocount
    prior_failures = (1.0 - prior) * pseudocount
    denominator = agg.weight_sum + prior_successes + prior_failures
    smoothed = (agg.weighted_success + prior_successes) / denominator
    certainty = 1.0 - 1.0 / (1.0 + agg.weight_sum)
    return clamp01(smoothed), clamp01(certainty)
40
+
41
+
42
def estimate_cost(
    card: ModelCard,
    input_tokens: int,
    output_tokens: int,
    use_cache: bool = False,
    cache_fraction: float = 0.0,
) -> tuple[float, dict[str, float]]:
    """Flat token-count cost estimate from the card's per-million-token prices.

    Input pricing, in priority order (the cache options only apply when the card has a
    cache-read price):

    * ``use_cache``: all input is priced at the cache-read rate.
    * ``cache_fraction`` > 0: that share of the input (capped at 1) is priced at the
      cache-read rate and the remainder at the full input rate.
    * otherwise: the full input rate.

    Returns ``(total, breakdown)`` where ``breakdown`` holds the ``"input"`` and
    ``"output"`` parts rounded to 8 decimals; the total itself is not rounded.
    """
    read_rate = card.cache_read_cost_per_mtok
    if read_rate is None:
        input_rate = card.input_cost_per_mtok
    elif use_cache:
        input_rate = read_rate
    elif cache_fraction > 0.0:
        share = min(1.0, cache_fraction)
        input_rate = share * read_rate + (1.0 - share) * card.input_cost_per_mtok
    else:
        input_rate = card.input_cost_per_mtok

    input_part = (input_tokens / 1_000_000.0) * input_rate
    output_part = (output_tokens / 1_000_000.0) * card.output_cost_per_mtok
    parts = {"input": round(input_part, 8), "output": round(output_part, 8)}
    return input_part + output_part, parts
63
+
64
+
65
def choose_cost_basis(
    aggs_by_id: dict[str, ModelAggregate | None],
    use_observed: bool,
    require_caching: bool,
    min_cost_n: int,
) -> str:
    """Select a single cost basis shared by all candidates, so costs compare like-for-like.

    The result is the strongest tier that EVERY candidate supports:

    - ``"rescaled"``: every candidate has an output-token observation available at
      ``min_cost_n`` (observed output behavior re-priced for this request).
    - ``"observed"``: every candidate has a realized-cost observation available at
      ``min_cost_n`` and caching is not required (recalled history is non-cached, so
      the cache-aware estimate is the right basis when caching is requested).
    - ``"estimate"``: the flat token estimate; also returned when ``use_observed`` is
      false, when there are no candidates, or when evidence is mixed.
    """
    if not use_observed or not aggs_by_id:
        return "estimate"
    candidates = tuple(aggs_by_id.values())

    # Tier 1: all candidates must carry enough output-token observations.
    for agg in candidates:
        if agg is None or agg.observed_output_tokens(min_cost_n) is None:
            break
    else:
        return "rescaled"

    # Tier 2 is unavailable when the caller requires prompt caching.
    if require_caching:
        return "estimate"
    for agg in candidates:
        if agg is None or agg.observed_cost(min_cost_n) is None:
            return "estimate"
    return "observed"
93
+
94
+
95
def rescaled_cost(
    card: ModelCard, agg: ModelAggregate, input_tokens: int, use_cache: bool, min_cost_n: int
) -> float | None:
    """Price this request using the model's observed output volume.

    The input side is this request's ``input_tokens`` at the input rate (the cache-read
    rate when ``use_cache`` is set and the card has one); the output side is
    ``agg.observed_output_tokens(min_cost_n)`` at the output rate.

    Returns ``None`` when the aggregate has no output-token observation to offer.
    """
    typical_output = agg.observed_output_tokens(min_cost_n)
    if typical_output is None:
        return None
    input_rate = card.input_cost_per_mtok
    if use_cache and card.cache_read_cost_per_mtok is not None:
        input_rate = card.cache_read_cost_per_mtok
    input_part = (input_tokens / 1_000_000.0) * input_rate
    output_part = (typical_output / 1_000_000.0) * card.output_cost_per_mtok
    return input_part + output_part
112
+
113
+
114
def effective_cost(
    card: ModelCard,
    agg: ModelAggregate | None,
    input_tokens: int,
    output_tokens: int,
    use_cache: bool,
    basis: str,
    min_cost_n: int,
    cache_fraction: float = 0.0,
) -> tuple[float, dict[str, float]]:
    """Return the ranking cost and its breakdown on the caller-chosen ``basis``.

    The flat token estimate assumes a fixed completion length, so it understates models
    that spend many output tokens on internal reasoning. The evidence-backed bases
    correct for that:

    * ``"rescaled"``: :func:`rescaled_cost`; the breakdown carries ``"rescaled"`` and the
      observed output tokens (``"obs_output_tokens"``).
    * ``"observed"``: ``agg.observed_cost``; the breakdown carries ``"observed_avg"``.

    Any other basis, a missing aggregate, or missing evidence falls through to
    :func:`estimate_cost`.
    """
    if agg is not None:
        if basis == "rescaled":
            repriced = rescaled_cost(card, agg, input_tokens, use_cache, min_cost_n)
            if repriced is not None:
                typical_output = agg.observed_output_tokens(min_cost_n) or 0.0
                return repriced, {
                    "rescaled": round(repriced, 8),
                    "obs_output_tokens": round(typical_output, 1),
                }
        elif basis == "observed":
            realized = agg.observed_cost(min_cost_n)
            if realized is not None:
                return realized, {"observed_avg": round(realized, 8)}
    return estimate_cost(card, input_tokens, output_tokens, use_cache, cache_fraction)
141
+
142
+
143
def effective_cost_band(
    card: ModelCard,
    agg: ModelAggregate | None,
    input_tokens: int,
    use_cache: bool,
    basis: str,
    min_cost_n: int,
    q_low: float = 0.25,
    q_high: float = 0.75,
) -> tuple[tuple[float, float], str] | None:
    """Return the cost range ``((low, high), basis_label)`` that matches the ranking ``basis``.

    * ``"rescaled"``: the observed output-token band re-priced for this request; the
      input cost is fixed and only the output side varies. Label ``rescaled_pXX_pYY``.
    * ``"observed"``: the realized $/call band taken as-is. Label ``observed_pXX_pYY``.

    Returns ``None`` for any other basis, when ``agg`` is ``None``, or when the aggregate
    has no band to offer, so the caller can render "no range yet" instead of inventing one.
    """
    if agg is None:
        return None
    low_pct = int(round(q_low * 100))
    high_pct = int(round(q_high * 100))
    suffix = f"p{low_pct}_p{high_pct}"

    if basis == "observed":
        dollars = agg.observed_cost_band(min_cost_n, q_low, q_high)
        return None if dollars is None else (dollars, f"observed_{suffix}")
    if basis != "rescaled":
        return None

    token_band = agg.observed_output_tokens_band(min_cost_n, q_low, q_high)
    if token_band is None:
        return None
    low_tokens, high_tokens = token_band
    if use_cache and card.cache_read_cost_per_mtok is not None:
        input_rate = card.cache_read_cost_per_mtok
    else:
        input_rate = card.input_cost_per_mtok
    fixed_input = (input_tokens / 1_000_000.0) * input_rate
    cheapest = fixed_input + (low_tokens / 1_000_000.0) * card.output_cost_per_mtok
    dearest = fixed_input + (high_tokens / 1_000_000.0) * card.output_cost_per_mtok
    return (cheapest, dearest), f"rescaled_{suffix}"
180
+
181
+
182
def threshold_from_slider(
    cost_quality_tradeoff: float, tau_min: float, tau_max: float, min_quality: float | None = None
) -> float:
    """Convert the 0..10 cost/quality slider into a minimum predicted-success threshold.

    The slider is clamped to [0, 10] and interpolated linearly: 0 maps to ``tau_min``
    (accept the cheapest model clearing it) and 10 maps to ``tau_max``. When
    ``min_quality`` is given, the result is never lower than it.
    """
    slider = max(0.0, min(10.0, cost_quality_tradeoff))
    threshold = tau_min + (slider / 10.0) * (tau_max - tau_min)
    if min_quality is None:
        return threshold
    return max(threshold, min_quality)
194
+
195
+
196
def with_exploration_bonus(predicted: float, confidence: float, bonus: float) -> float:
    """Raise predicted success for candidates that have little supporting evidence.

    The added amount is ``bonus * (1 - confidence)`` with ``confidence`` clamped to
    [0, 1]: a fully confident estimate is left untouched, while a zero-confidence one
    receives the whole bonus, giving it a chance to clear the threshold and so earn a
    recommendation and feedback. The boosted value is clamped to [0, 1].

    A ``bonus`` of 0 or less disables exploration: ``predicted`` is returned unchanged
    (and unclamped).
    """
    if bonus <= 0.0:
        return predicted
    uncertainty = 1.0 - clamp01(confidence)
    return clamp01(predicted + bonus * uncertainty)
207
+
208
+
209
def ranking_score(predicted: float, normalized_cost: float, cost_quality_tradeoff: float) -> float:
    """Smooth quality-versus-cost blend used to order the returned list.

    This is separate from the hard success threshold. The slider is clamped to [0, 10]
    and sets the quality weight linearly from 0.3 (slider 0, cost-leaning) to 1.0
    (slider 10, quality-only); the cost term is weighted by the complement.
    """
    slider = max(0.0, min(10.0, cost_quality_tradeoff))
    quality_weight = 0.3 + 0.07 * slider
    cost_weight = 1.0 - quality_weight
    return quality_weight * predicted - cost_weight * normalized_cost
214
+
215
+
216
def ucb_score(
    predicted: float,
    interval_width: float,
    normalized_cost: float,
    cost_quality_tradeoff: float,
    alpha: float,
) -> float:
    """Upper-confidence-bound score: optimism in the face of uncertainty.

    Uses the same slider-driven quality/cost weighting as :func:`ranking_score`, but
    the success term is first raised by ``alpha`` times the half-width of the interval
    (then clamped to [0, 1]), so under-explored arms are favoured. Per the module's
    design notes this feeds the SHADOW bandit policy, which is logged for regret
    comparison and never overrides the deployed pick.
    """
    slider = max(0.0, min(10.0, cost_quality_tradeoff))
    quality_weight = 0.3 + 0.07 * slider
    boosted = clamp01(predicted + alpha * 0.5 * interval_width)
    cost_weight = 1.0 - quality_weight
    return quality_weight * boosted - cost_weight * normalized_cost
234
+
235
+
236
def posterior_interval_width(
    agg: ModelAggregate | None, prior: float, pseudocount: float
) -> float:
    """Approximate 95% credible-interval width of the Beta-smoothed success estimate.

    Normal approximation on the posterior mean: width = 2 * 1.96 * sqrt(p(1-p)/n_eff)
    where n_eff = weight_sum + pseudocount, capped at 1.0.

    With no evidence (``agg`` is ``None`` or its ``weight_sum`` is not positive) the
    width is exactly 1.0: "we know nothing" reads as full uncertainty, the natural
    escalation signal. This case is returned explicitly because the formula alone would
    give a narrower width whenever the pseudocount is large or the prior is extreme.
    """
    if agg is None or agg.weight_sum <= 0.0:
        return 1.0
    p, _ = predicted_success(agg, prior, pseudocount)
    # Floor the pseudocount and the variance term so the ratio is always well defined.
    n_eff = agg.weight_sum + max(pseudocount, 1e-9)
    width = 2.0 * 1.96 * (max(p * (1.0 - p), 1e-9) / n_eff) ** 0.5
    return min(1.0, width)
249
+
250
+
251
def softmax_propensities(
    scores: dict[str, float], argmin_id: str, epsilon: float, temperature: float
) -> dict[str, float]:
    """Selection propensities for the epsilon-softmax policy over the eligible set.

    pi(m) = (1 - eps) * 1[m == argmin] + eps * softmax(score(m) / temperature).
    The deterministic policy is the eps=0 special case (degenerate vector).

    ``epsilon`` is clamped to [0, 1] and ``temperature`` is floored at 1e-6. Returns an
    empty dict for empty ``scores``; otherwise the propensities sum to 1 over the
    eligible candidates.

    If ``argmin_id`` is not one of the scored candidates, its ``(1 - eps)`` mass has
    nowhere to land, so the plain softmax is returned instead. Without this the vector
    would sum to ``eps`` only, and would be all zeros at ``eps == 0``.
    """
    if not scores:
        return {}
    t = max(temperature, 1e-6)
    # Subtract the peak before exponentiating so exp() cannot overflow.
    peak = max(scores.values())
    exps = {mid: math.exp((s - peak) / t) for mid, s in scores.items()}
    total = sum(exps.values()) or 1.0
    soft = {mid: e / total for mid, e in exps.items()}
    if argmin_id not in scores:
        return soft
    eps = max(0.0, min(1.0, epsilon))
    return {
        mid: (1.0 - eps) * (1.0 if mid == argmin_id else 0.0) + eps * soft[mid]
        for mid in scores
    }
272
+
273
+
274
def beta_params(
    agg: ModelAggregate | None, prior: float, pseudocount: float
) -> tuple[float, float]:
    """Return the Beta posterior ``(alpha, beta)`` for a candidate's success.

    This is the conjugate counterpart of :func:`predicted_success`, whose mean is
    ``alpha / (alpha + beta)``. The prior contributes ``prior * pseudocount`` to alpha
    and the rest of the pseudocount to beta; when the aggregate carries positive weight,
    its weighted successes and failures are added on top. Both values are floored at
    1e-6 so they remain valid Beta parameters for sampling.
    """
    alpha = prior * pseudocount
    beta = (1.0 - prior) * pseudocount
    if not (agg is None or agg.weight_sum <= 0.0):
        alpha = agg.weighted_success + alpha
        beta = (agg.weight_sum - agg.weighted_success) + beta
    return max(alpha, 1e-6), max(beta, 1e-6)
288
+
289
+
290
def thompson_select(
    items: list[tuple[str, float, float, float]],
    tau: float,
    rng: random.Random,
    samples: int = 128,
) -> tuple[str, dict[str, float]]:
    """Thompson (posterior-sampling) selection over the cost-aware objective.

    Each entry of ``items`` is ``(model_id, alpha, beta, est_cost_usd)``. In every one of
    ``max(1, samples)`` Monte-Carlo rounds a success rate is drawn per candidate from
    ``Beta(alpha, beta)``; the round is won by the cheapest candidate whose draw reaches
    ``tau`` (ties on cost go to the higher draw), or by the highest draw when none does.

    Returns ``(pick_id, propensities)``. The propensities are the win frequencies, and
    the pick is drawn from ``rng`` in proportion to them, so the two stay consistent
    (which keeps IPW / off-policy evaluation valid). Empty ``items`` yields ``("", {})``.
    """
    if not items:
        return "", {}

    wins = {model_id: 0 for model_id, _a, _b, _c in items}
    for _ in range(max(1, samples)):
        # One posterior draw per candidate, in input order.
        draws = {}
        for model_id, a, b, _cost in items:
            draws[model_id] = rng.betavariate(a, b)
        passing = [
            (model_id, cost) for model_id, _a, _b, cost in items if draws[model_id] >= tau
        ]
        if passing:
            winner, _cost = min(passing, key=lambda pair: (pair[1], -draws[pair[0]]))
        else:
            winner = max(items, key=lambda entry: draws[entry[0]])[0]
        wins[winner] += 1

    rounds_counted = sum(wins.values()) or 1
    shares = {model_id: hits / rounds_counted for model_id, hits in wins.items()}
    ids = list(wins)
    chosen = rng.choices(ids, weights=[wins[model_id] for model_id in ids], k=1)[0]
    return chosen, shares
@@ -0,0 +1,166 @@
1
+ """Internal dataclasses shared across the recommender stages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+ from minima.memory.records import RecalledEvidence
8
+ from minima.schemas.common import DecisionBasis
9
+ from minima.schemas.models_catalog import ModelCard
10
+
11
+
12
def _weighted_quantile(pairs: list[tuple[float, float]], q: float) -> float:
    """Return the lower weighted ``q``-quantile of ``(value, weight)`` pairs.

    ``q`` is clamped to [0, 1]. Values are walked in ascending order and the first one
    whose cumulative weight reaches ``q`` times the total weight is returned. When the
    total weight is not positive, a plain positional quantile over the sorted values is
    used instead.
    """
    ordered = sorted(pairs, key=lambda pair: pair[0])
    weight_total = sum(weight for _value, weight in ordered)
    q = max(0.0, min(1.0, q))

    if weight_total <= 0.0:
        # No usable weights: index straight into the sorted values.
        position = min(len(ordered) - 1, int(q * len(ordered)))
        return ordered[position][0]

    cutoff = weight_total * q
    running = 0.0
    for value, weight in ordered:
        running += weight
        if running >= cutoff:
            return value
    return ordered[-1][0]
27
+
28
+
29
def _weighted_median(pairs: list[tuple[float, float]]) -> float:
    """Return the lower weighted median (the 0.5 quantile) of ``(value, weight)`` pairs."""
    midpoint = 0.5
    return _weighted_quantile(pairs, midpoint)
32
+
33
+
34
+ @dataclass(slots=True)
35
+ class ModelAggregate:
36
+ """Weighted summary of recalled outcomes for one candidate model."""
37
+
38
+ model_id: str
39
+ weight_sum: float = 0.0
40
+ weighted_success: float = 0.0
41
+ n: int = 0
42
+ avg_knowledge_confidence: float = 0.0
43
+ evidence: list[RecalledEvidence] = field(default_factory=list)
44
+
45
+ @property
46
+ def weighted_success_rate(self) -> float:
47
+ if self.weight_sum <= 0:
48
+ return 0.0
49
+ return self.weighted_success / self.weight_sum
50
+
51
+ def observed_cost(self, min_n: int) -> float | None:
52
+ """Robust realized $/call over cost-bearing neighbors: a similarity-weighted MEDIAN.
53
+
54
+ A realized cost is an objective measurement, so it is weighted by topical similarity
55
+ only — NOT by the staleness/knowledge-confidence factors that legitimately discount the
56
+ *success* signal (a past call's dollar amount doesn't get cheaper because the record is
57
+ old). The median keeps a single mis-recorded or pathological cost_usd (wrong units, a
58
+ cumulative total, a timed-out retry) from dominating. Returns None when fewer than
59
+ ``min_n`` recalled neighbors carry a positive cost.
60
+ """
61
+ pairs = [
62
+ (ev.record.cost_usd, max(0.0, ev.score))
63
+ for ev in self.evidence
64
+ if ev.record is not None and ev.record.cost_usd and ev.record.cost_usd > 0.0
65
+ ]
66
+ if len(pairs) < min_n:
67
+ return None
68
+ return _weighted_median(pairs)
69
+
70
+ def observed_output_tokens(self, min_n: int) -> float | None:
71
+ """Robust median realized OUTPUT tokens/call (incl. reasoning/thinking) over neighbors.
72
+
73
+ Captures the model's true output behavior on similar tasks — the part a flat token
74
+ estimate misses — so cost can be re-scaled to the current request's input size while
75
+ keeping the realized output (thinking) volume. Similarity-weighted median; None when
76
+ fewer than ``min_n`` recalled neighbors carry an output-token count.
77
+ """
78
+ pairs = [
79
+ (float(ev.record.output_tokens), max(0.0, ev.score))
80
+ for ev in self.evidence
81
+ if ev.record is not None and ev.record.output_tokens and ev.record.output_tokens > 0
82
+ ]
83
+ if len(pairs) < min_n:
84
+ return None
85
+ return _weighted_median(pairs)
86
+
87
+ def observed_latency_ms(self, min_n: int, q: float = 0.75) -> float | None:
88
+ """Robust observed latency percentile (default p75) over latency-bearing neighbors.
89
+
90
+ Like realized cost, latency is an objective measurement: weighted by topical
91
+ similarity only, not by staleness/knowledge-confidence. A high percentile (not
92
+ the median) is deliberate — SLA enforcement cares about the typical-worst case.
93
+ None when fewer than ``min_n`` recalled neighbors carry a latency.
94
+ """
95
+ pairs = [
96
+ (float(ev.record.latency_ms), max(0.0, ev.score))
97
+ for ev in self.evidence
98
+ if ev.record is not None and ev.record.latency_ms and ev.record.latency_ms > 0
99
+ ]
100
+ if len(pairs) < min_n:
101
+ return None
102
+ return _weighted_quantile(pairs, q)
103
+
104
+ def observed_cost_band(
105
+ self, min_n: int, q_low: float = 0.25, q_high: float = 0.75
106
+ ) -> tuple[float, float] | None:
107
+ """Robust p_low–p_high band of realized $/call (default p25–p75) — the data-grounded
108
+ predictable cost range. Same (cost, similarity) pairs and similarity-only weighting as
109
+ :meth:`observed_cost`; None when fewer than ``min_n`` neighbors carry a positive cost.
110
+ """
111
+ pairs = [
112
+ (ev.record.cost_usd, max(0.0, ev.score))
113
+ for ev in self.evidence
114
+ if ev.record is not None and ev.record.cost_usd and ev.record.cost_usd > 0.0
115
+ ]
116
+ if len(pairs) < min_n:
117
+ return None
118
+ return (_weighted_quantile(pairs, q_low), _weighted_quantile(pairs, q_high))
119
+
120
+ def observed_output_tokens_band(
121
+ self, min_n: int, q_low: float = 0.25, q_high: float = 0.75
122
+ ) -> tuple[float, float] | None:
123
+ """Robust p_low–p_high band of realized output tokens/call — for re-pricing the cost
124
+ band to the current request's input size (rescaled basis). None below ``min_n``."""
125
+ pairs = [
126
+ (float(ev.record.output_tokens), max(0.0, ev.score))
127
+ for ev in self.evidence
128
+ if ev.record is not None and ev.record.output_tokens and ev.record.output_tokens > 0
129
+ ]
130
+ if len(pairs) < min_n:
131
+ return None
132
+ return (_weighted_quantile(pairs, q_low), _weighted_quantile(pairs, q_high))
133
+
134
+
135
@dataclass(slots=True)
class CandidateScore:
    """Per-candidate scoring record: the model card plus its success, cost and latency estimates."""

    card: ModelCard
    # Deployed success value (calibrated + exploration bonus); see ``raw_predicted_success``.
    predicted_success: float
    confidence: float
    est_cost_usd: float
    est_cost_breakdown: dict[str, float]
    decision_basis: DecisionBasis
    evidence: list[RecalledEvidence] = field(default_factory=list)
    score: float = 0.0
    rationale: str = ""
    # Observed latency percentile (ms) from recalled outcomes; None without evidence.
    est_latency_ms: float | None = None
    latency_basis: str = ""
    # Data-grounded predictable cost band (low, high) matching the chosen basis; None when
    # evidence is too thin to estimate a range. ``cost_band_basis`` labels its source
    # (e.g. "observed_p25_p75", "rescaled_p25_p75", "heuristic").
    est_cost_low: float | None = None
    est_cost_high: float | None = None
    cost_band_basis: str = ""
    # 95% credible-interval width of the success estimate (1.0 = no evidence). Powers the
    # routing-collapse margin guard and the harness green/amber/red confidence signal.
    interval_width: float = 1.0
    # Beta posterior parameters for the (uncalibrated) success estimate; used by Thompson
    # sampling when that selection policy is enabled.
    alpha: float = 0.0
    beta: float = 0.0
    # The pre-calibration, pre-exploration-bonus Beta-posterior mean, i.e. the HONEST
    # evidence-based probability. ``predicted_success`` above is the deployed value
    # (calibrated + exploration bonus); this raw value is what calibration is fit on,
    # so the recalibration loop converges instead of oscillating. None when unset.
    raw_predicted_success: float | None = None
@@ -0,0 +1 @@
1
+ """Public request/response schemas (Pydantic v2)."""
@@ -0,0 +1,73 @@
1
+ """Shared enums and request building blocks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import StrEnum
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class TaskType(StrEnum):
    """Closed set of task categories; each member's value equals its name."""

    code = "code"
    summarization = "summarization"
    extraction = "extraction"
    qa = "qa"
    reasoning = "reasoning"
    classification = "classification"
    translation = "translation"
    creative = "creative"
    rag = "rag"
    tool_use = "tool_use"
    other = "other"
22
+
23
+
24
class Difficulty(StrEnum):
    """Closed set of task difficulty levels, declared from ``trivial`` up to ``expert``."""

    trivial = "trivial"
    easy = "easy"
    medium = "medium"
    hard = "hard"
    expert = "expert"
30
+
31
+
32
class OutcomeLabel(StrEnum):
    """Closed set of outcome labels a caller can report for a completed call."""

    success = "success"
    partial = "partial"
    failure = "failure"
36
+
37
+
38
class DecisionBasis(StrEnum):
    """Which path produced a recommendation."""

    memory = "memory"  # driven by empirical recalled outcomes
    prior = "prior"  # driven by capability priors (thin/no memory)
    llm = "llm"  # cheap-LLM reasoner was consulted
44
+
45
+
46
class Constraints(BaseModel):
    """Optional hard limits a caller can place on the candidate set."""

    # Allow/deny lists; ``None`` leaves the corresponding filter off.
    allowed_providers: list[str] | None = None
    candidate_models: list[str] | None = None
    excluded_models: list[str] | None = None
    # Numeric limits; validation rejects negative costs and quality outside [0, 1].
    max_cost_per_call: float | None = Field(None, ge=0, description="USD; hard filter")
    min_quality: float | None = Field(None, ge=0, le=1, description="predicted_success floor")
    require_prompt_caching: bool = False
    max_latency_ms: int | None = Field(None, gt=0)
    require_context_window: int | None = Field(None, gt=0)

    def merged_over(self, base: Constraints) -> Constraints:
        """Build a new ``Constraints`` from ``base`` with this instance's set fields on top.

        A field of ``self`` counts as unset when it is ``None`` or ``False``; such
        fields inherit the value from ``base``. As a consequence, ``self`` cannot use
        ``None``/``False`` to switch off something ``base`` has enabled.
        """
        overrides = {
            name: value
            for name, value in self.model_dump().items()
            if not (value is None or value is False)
        }
        return Constraints(**{**base.model_dump(), **overrides})
65
+
66
+
67
class TaskInput(BaseModel):
    # Required, non-empty task text.
    task: str = Field(..., min_length=1, description="Raw task/prompt text; embedded by Mubit")
    # Optional caller-supplied hints; all default to None when omitted.
    task_type: TaskType | None = None
    difficulty: Difficulty | None = None
    # Token expectations must be non-negative when given.
    expected_input_tokens: int | None = Field(None, ge=0)
    expected_output_tokens: int | None = Field(None, ge=0)
    tags: list[str] = Field(default_factory=list, description="-> Mubit env_tags")
@@ -0,0 +1,34 @@
1
+ """Schemas for the feedback / learning-loop endpoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from minima.schemas.common import OutcomeLabel
8
+
9
+
10
class FeedbackRequest(BaseModel):
    # Both identifiers are required and must be non-empty.
    recommendation_id: str = Field(..., min_length=1)
    chosen_model_id: str = Field(..., min_length=1, description="model actually run (may differ)")
    outcome: OutcomeLabel
    # Optional quality score in [0, 1].
    quality_score: float | None = Field(None, ge=0, le=1, description="caller-supplied; no judge")
    # Optional realized measurements; each must be non-negative when given.
    input_tokens: int | None = Field(None, ge=0)
    output_tokens: int | None = Field(None, ge=0)
    actual_cost_usd: float | None = Field(None, ge=0)
    latency_ms: int | None = Field(None, ge=0)
    iterations: int | None = Field(
        None, ge=0, description="agent loop turns to resolution (token-yield signal)"
    )
    verified_in_production: bool = False
    notes: str | None = None
    # NOTE(review): presumably used to de-duplicate retried submissions; confirm against the router.
    idempotency_key: str | None = None
25
+
26
+
27
class FeedbackResponse(BaseModel):
    # ``accepted`` is the only required field; everything else has a default.
    accepted: bool
    record_id: str | None = None
    reinforced_entry_ids: list[str] = Field(default_factory=list)
    updated_confidence: float | None = None
    reflection_triggered: bool = False
    lesson_promoted: bool = False
    # Warning messages; empty list by default.
    warnings: list[str] = Field(default_factory=list)
@@ -0,0 +1,36 @@
1
+ """Schemas for the model catalog endpoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
+ from minima.schemas.common import TaskType
10
+
11
+
12
class ModelCard(BaseModel):
    # Clears pydantic's protected ``model_`` namespace so the ``model_id`` field below
    # does not trigger a protected-namespace warning.
    model_config = ConfigDict(protected_namespaces=())

    model_id: str
    provider: str
    display_name: str = ""
    # Prices per million tokens (presumably USD, verify against the catalog sources).
    # Input and output prices are required and must be non-negative.
    input_cost_per_mtok: float = Field(..., ge=0)
    output_cost_per_mtok: float = Field(..., ge=0)
    # Optional cache-read price; None means the card has no cache-read price. Unlike the
    # two prices above, this field carries no ``ge=0`` validation.
    cache_read_cost_per_mtok: float | None = None
    supports_prompt_caching: bool = False
    context_window: int = 0
    max_output_tokens: int | None = None
    # Capability numbers: a generic name -> value map, and a per-task-type map.
    capability_priors: dict[str, float] = Field(default_factory=dict)
    capability_by_task_type: dict[TaskType, float] = Field(default_factory=dict)
    # Provenance / freshness metadata for the price and capability data.
    cost_source: str = ""
    cost_fetched_at: datetime | None = None
    cost_stale: bool = False
    capability_source: str = ""
30
+
31
+
32
class ModelsResponse(BaseModel):
    # The model cards plus catalog-level version and freshness metadata.
    models: list[ModelCard]
    catalog_version: str
    refreshed_at: datetime | None = None
    stale: bool = False