minima-cli 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minima/__init__.py +5 -0
- minima/api/__init__.py +1 -0
- minima/api/auth.py +39 -0
- minima/api/errors.py +40 -0
- minima/api/routers/__init__.py +1 -0
- minima/api/routers/calibration.py +50 -0
- minima/api/routers/feedback.py +279 -0
- minima/api/routers/health.py +50 -0
- minima/api/routers/models.py +42 -0
- minima/api/routers/recommend.py +66 -0
- minima/api/routers/savings.py +55 -0
- minima/api/routers/strategies.py +33 -0
- minima/catalog/__init__.py +1 -0
- minima/catalog/data/capability_priors.json +210 -0
- minima/catalog/data/model_aliases.json +12 -0
- minima/catalog/merge.py +69 -0
- minima/catalog/refresh.py +54 -0
- minima/catalog/sources/__init__.py +1 -0
- minima/catalog/sources/litellm.py +19 -0
- minima/catalog/sources/openrouter.py +25 -0
- minima/catalog/store.py +86 -0
- minima/config.py +288 -0
- minima/deps.py +35 -0
- minima/llm/__init__.py +1 -0
- minima/llm/anthropic.py +106 -0
- minima/llm/base.py +196 -0
- minima/llm/gemini.py +124 -0
- minima/llm/registry.py +54 -0
- minima/logging.py +28 -0
- minima/main.py +109 -0
- minima/memory/__init__.py +1 -0
- minima/memory/adapter.py +572 -0
- minima/memory/keys.py +83 -0
- minima/memory/records.py +190 -0
- minima/memory/threadpool.py +41 -0
- minima/metrics/__init__.py +1 -0
- minima/metrics/calibration.py +415 -0
- minima/metrics/report.py +116 -0
- minima/metrics/savings.py +98 -0
- minima/recommender/__init__.py +1 -0
- minima/recommender/_pg_pool.py +38 -0
- minima/recommender/_redis_client.py +32 -0
- minima/recommender/aggregate.py +157 -0
- minima/recommender/classify.py +165 -0
- minima/recommender/decisionlog.py +505 -0
- minima/recommender/durablerefs.py +312 -0
- minima/recommender/engine.py +997 -0
- minima/recommender/escalation.py +83 -0
- minima/recommender/propensity.py +189 -0
- minima/recommender/recstore.py +368 -0
- minima/recommender/score.py +318 -0
- minima/recommender/types.py +166 -0
- minima/schemas/__init__.py +1 -0
- minima/schemas/common.py +73 -0
- minima/schemas/feedback.py +34 -0
- minima/schemas/models_catalog.py +36 -0
- minima/schemas/recommend.py +104 -0
- minima/schemas/savings.py +39 -0
- minima/schemas/strategies.py +57 -0
- minima/schemas/workflow.py +43 -0
- minima/seeding/__init__.py +1 -0
- minima/seeding/items.py +42 -0
- minima/seeding/llmrouterbench.py +232 -0
- minima/seeding/routerbench.py +141 -0
- minima/seeding/run_seed.py +56 -0
- minima/seeding/synthetic.py +70 -0
- minima/tenancy/__init__.py +8 -0
- minima/tenancy/context.py +37 -0
- minima/tenancy/passthrough.py +110 -0
- minima/version.py +3 -0
- minima_cli-0.4.9.dist-info/METADATA +275 -0
- minima_cli-0.4.9.dist-info/RECORD +161 -0
- minima_cli-0.4.9.dist-info/WHEEL +4 -0
- minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
- minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
- minima_client/__init__.py +19 -0
- minima_client/autocapture.py +101 -0
- minima_client/client.py +301 -0
- minima_client/errors.py +23 -0
- minima_harness/LICENSE_PI +32 -0
- minima_harness/__init__.py +16 -0
- minima_harness/agent/__init__.py +72 -0
- minima_harness/agent/agent.py +276 -0
- minima_harness/agent/events.py +124 -0
- minima_harness/agent/loop.py +311 -0
- minima_harness/agent/state.py +79 -0
- minima_harness/agent/tools.py +97 -0
- minima_harness/ai/__init__.py +66 -0
- minima_harness/ai/compat.py +71 -0
- minima_harness/ai/errors.py +96 -0
- minima_harness/ai/events.py +117 -0
- minima_harness/ai/openrouter_catalog.py +153 -0
- minima_harness/ai/provider_catalog.py +299 -0
- minima_harness/ai/provider_quirks.py +37 -0
- minima_harness/ai/providers/__init__.py +75 -0
- minima_harness/ai/providers/_common.py +48 -0
- minima_harness/ai/providers/anthropic.py +290 -0
- minima_harness/ai/providers/base.py +65 -0
- minima_harness/ai/providers/faux.py +173 -0
- minima_harness/ai/providers/google.py +221 -0
- minima_harness/ai/providers/openai_compat.py +278 -0
- minima_harness/ai/registry.py +184 -0
- minima_harness/ai/stream.py +82 -0
- minima_harness/ai/tools.py +51 -0
- minima_harness/ai/types.py +204 -0
- minima_harness/ai/usage.py +41 -0
- minima_harness/minima/__init__.py +40 -0
- minima_harness/minima/cache.py +102 -0
- minima_harness/minima/config.py +85 -0
- minima_harness/minima/goals.py +226 -0
- minima_harness/minima/judge.py +144 -0
- minima_harness/minima/mapping.py +147 -0
- minima_harness/minima/meter.py +143 -0
- minima_harness/minima/router.py +220 -0
- minima_harness/minima/runtime.py +544 -0
- minima_harness/minima/signals.py +195 -0
- minima_harness/session/__init__.py +14 -0
- minima_harness/session/format.py +35 -0
- minima_harness/session/store.py +236 -0
- minima_harness/tasks/__init__.py +17 -0
- minima_harness/tasks/task_set.py +78 -0
- minima_harness/tools/__init__.py +7 -0
- minima_harness/tools/_io.py +34 -0
- minima_harness/tools/bash.py +70 -0
- minima_harness/tools/builtin.py +23 -0
- minima_harness/tools/edit.py +50 -0
- minima_harness/tools/find.py +38 -0
- minima_harness/tools/grep.py +73 -0
- minima_harness/tools/ls.py +35 -0
- minima_harness/tools/read.py +38 -0
- minima_harness/tools/tasks.py +75 -0
- minima_harness/tools/write.py +36 -0
- minima_harness/tui/__init__.py +3 -0
- minima_harness/tui/analytics.py +111 -0
- minima_harness/tui/app.py +1927 -0
- minima_harness/tui/bridge.py +103 -0
- minima_harness/tui/cli.py +227 -0
- minima_harness/tui/clipboard.py +60 -0
- minima_harness/tui/commands.py +49 -0
- minima_harness/tui/compaction.py +17 -0
- minima_harness/tui/config_cli.py +141 -0
- minima_harness/tui/config_store.py +237 -0
- minima_harness/tui/context.py +93 -0
- minima_harness/tui/customize.py +95 -0
- minima_harness/tui/diff.py +53 -0
- minima_harness/tui/editor.py +43 -0
- minima_harness/tui/extensions.py +84 -0
- minima_harness/tui/extra_models.py +52 -0
- minima_harness/tui/history.py +71 -0
- minima_harness/tui/mubit.py +295 -0
- minima_harness/tui/overlays.py +593 -0
- minima_harness/tui/packages.py +59 -0
- minima_harness/tui/run_modes.py +66 -0
- minima_harness/tui/theme.py +77 -0
- minima_harness/tui/welcome.py +83 -0
- minima_harness/tui/widgets/__init__.py +3 -0
- minima_harness/tui/widgets/banner.py +38 -0
- minima_harness/tui/widgets/editor.py +83 -0
- minima_harness/tui/widgets/footer.py +73 -0
- minima_harness/tui/widgets/messages.py +151 -0
- minima_harness/tui/widgets/status.py +57 -0
minima/memory/records.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Mapping between Minima's internal outcome model and Mubit memory metadata.
|
|
2
|
+
|
|
3
|
+
All three intake paths (explicit feedback, auto-capture, offline seed) converge on
|
|
4
|
+
this one record shape, so the recommender is agnostic to where evidence came from.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from collections.abc import Mapping
|
|
11
|
+
from dataclasses import asdict, dataclass, field
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
# Default value of OutcomeRecord.schema_version for records built in this module.
SCHEMA_VERSION = 2  # v2 adds recorded_at (unix seconds); v1 records parse unchanged

# Label-based fallback quality used by quality_from_outcome() when the caller supplies
# no score; labels missing from this table fall back to 0.5 there.
_OUTCOME_DEFAULT_QUALITY = {"success": 0.9, "partial": 0.5, "failure": 0.1}

# Caller-supplied quality scores that flatly contradict the outcome label are clamped
# (never rejected — nuanced feedback like "succeeded but mediocre" must survive).
# reconcile_quality() caps a "failure" at the first bound and lifts a "success" to the
# second.
_FAILURE_QUALITY_CAP = 0.6
_SUCCESS_QUALITY_FLOOR = 0.4
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _clamp(x: float, lo: float, hi: float) -> float:
|
|
25
|
+
return max(lo, min(hi, x))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def clamp01(x: float) -> float:
    """Limit ``x`` to the unit interval ``[0.0, 1.0]``."""
    upper_bounded = min(1.0, x)
    return max(0.0, upper_bounded)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def quality_from_outcome(outcome: str, quality_score: float | None) -> float:
    """Caller-supplied quality wins; else a label-based default.

    Args:
        outcome: Outcome label; used only to look up the default quality.
        quality_score: Optional caller-supplied score. It is converted with
            ``float()`` and clamped into ``[0, 1]``.

    Returns:
        The clamped caller score, or the default for ``outcome`` (0.5 for a label
        with no table entry) when no usable score was supplied.
    """
    import math

    if quality_score is not None:
        score = float(quality_score)
        # NaN compares false against every bound, so clamping it would return the
        # upper bound (1.0) and record a meaningless score as a perfect one.
        # Treat it as "not supplied" and fall through to the label default.
        if not math.isnan(score):
            return clamp01(score)
    return _OUTCOME_DEFAULT_QUALITY.get(outcome, 0.5)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def reconcile_quality(outcome: str, quality: float) -> tuple[float, str | None]:
    """Clamp a quality score that contradicts its outcome label.

    A "failure" scored above the failure cap is lowered to the cap, and a "success"
    scored below the success floor is raised to the floor. Mixing such a label/score
    pair into a weighted-success aggregate would combine two claims that cannot both
    be true.

    Returns:
        ``(quality, warning)``. ``warning`` is ``"quality_outcome_mismatch"`` when
        the score was adjusted and ``None`` when it is returned unchanged. Any other
        outcome label is never adjusted.
    """
    warning = "quality_outcome_mismatch"
    if outcome == "failure":
        if quality > _FAILURE_QUALITY_CAP:
            return _FAILURE_QUALITY_CAP, warning
    elif outcome == "success":
        if quality < _SUCCESS_QUALITY_FLOOR:
            return _SUCCESS_QUALITY_FLOOR, warning
    return quality, None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def signal_from_outcome(outcome: str, quality: float) -> float:
    """Translate an outcome label plus quality into a reinforcement signal in [-1, 1].

    * ``"success"`` always yields ``1.0``; quality is ignored.
    * ``"partial"`` yields ``2 * quality - 1`` limited to ``[-1, 1]``.
    * Any other label is handled as a failure: ``quality - 1`` limited to ``[-1, 0]``.
    """
    if outcome == "success":
        return 1.0
    if outcome == "partial":
        centered = 2.0 * quality - 1.0
        return max(-1.0, min(1.0, centered))
    # Failure, and every unrecognized label.
    shortfall = quality - 1.0
    return max(-1.0, min(0.0, shortfall))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass(slots=True)
class OutcomeRecord:
    """A single (task, model, outcome) observation.

    ``to_metadata`` flattens the record into a metadata dict and ``from_metadata``
    parses such a dict (or its JSON text) back into a record.
    """

    model_id: str
    provider: str = ""
    task_type: str = "other"
    difficulty: str = "medium"
    task_fingerprint: str = ""
    task_cluster: str = ""
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    latency_ms: int | None = None
    quality_score: float = 0.0
    outcome: str = "success"
    recommendation_id: str | None = None
    verified_in_production: bool = False
    source_dataset: str | None = None
    # Agent loop turns to resolution (token-yield signal; a cheap model that takes many
    # turns to resolve can cost more than one frontier turn). Backward-compatible: None
    # on legacy records.
    iterations: int | None = None
    # Unix seconds when the outcome was observed. Powers evidence age decay; None on
    # legacy (schema v1) records, which fall back to the binary staleness penalty.
    recorded_at: float | None = None
    kind: str = "outcome"
    schema_version: int = SCHEMA_VERSION
    extra: dict = field(default_factory=dict)

    def to_metadata(self) -> dict:
        """Flatten the record into a metadata dict.

        Keys from ``extra`` are merged at the top level, and declared fields win on a
        name clash. Fields whose value is ``None`` are omitted.
        """
        data = asdict(self)
        extra = data.pop("extra", {}) or {}
        return {**extra, **{k: v for k, v in data.items() if v is not None}}

    @classmethod
    def from_metadata(cls, meta: Mapping | str | None) -> OutcomeRecord | None:
        """Parse a Mubit ``metadata_json`` (string or dict) into an OutcomeRecord.

        Returns ``None`` when the entry is not a Minima outcome record: unparseable
        or empty metadata, ``kind`` other than ``"outcome"``, or a missing/empty
        ``model_id``.

        ``kind``, ``schema_version`` and ``extra`` are not read back; the parsed
        record carries their defaults. ``latency_ms``, ``recorded_at`` and
        ``iterations`` become ``None`` when absent or falsy (including 0).
        """
        parsed = _coerce_mapping(meta)
        if not parsed:
            return None
        if parsed.get("kind") != "outcome":
            return None
        model_id = parsed.get("model_id")
        if not model_id:
            return None

        def _text(key: str, default: str) -> str:
            # An explicit JSON null must behave like a missing key. str(None) would
            # otherwise store the literal text "None" as a provider / task_type /
            # outcome label.
            value = parsed.get(key)
            return default if value is None else str(value)

        def _optional(key: str, convert: Any) -> Any:
            # Absent or falsy values are reported as "unknown" (None).
            raw = parsed.get(key)
            return convert(raw) if raw else None

        return cls(
            model_id=str(model_id),
            provider=_text("provider", ""),
            task_type=_text("task_type", "other"),
            difficulty=_text("difficulty", "medium"),
            task_fingerprint=_text("task_fingerprint", ""),
            task_cluster=_text("task_cluster", ""),
            input_tokens=_as_int(parsed.get("input_tokens")),
            output_tokens=_as_int(parsed.get("output_tokens")),
            cost_usd=_as_float(parsed.get("cost_usd")),
            latency_ms=_optional("latency_ms", _as_int),
            quality_score=clamp01(_as_float(parsed.get("quality_score"))),
            outcome=_text("outcome", "success"),
            recommendation_id=parsed.get("recommendation_id"),
            verified_in_production=bool(parsed.get("verified_in_production", False)),
            source_dataset=parsed.get("source_dataset"),
            recorded_at=_optional("recorded_at", _as_float),
            iterations=_optional("iterations", _as_int),
        )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass(slots=True)
class RecalledEvidence:
    """One recalled Mubit entry, with its parsed outcome record (if any)."""

    # Identifiers and ranking data of the recalled entry. NOTE(review): these are
    # populated by the recall adapter, which is not visible here — confirm the exact
    # semantics of score / knowledge_confidence / is_stale there.
    entry_id: str
    reference_id: str | None
    score: float
    knowledge_confidence: float
    is_stale: bool
    content: str
    # Parsed outcome record; None when the entry carries none
    # (RecallResult.outcome_evidence filters on this).
    record: OutcomeRecord | None
    # Whether this entry can be re-read exactly via Dereference (durable fast path).
    referenceable: bool = False
    entry_type: str = ""
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass(slots=True)
|
|
151
|
+
class RecallResult:
|
|
152
|
+
evidence: list[RecalledEvidence]
|
|
153
|
+
degraded: bool = False
|
|
154
|
+
raw_confidence: float = 0.0
|
|
155
|
+
timed_out: bool = False
|
|
156
|
+
error: str | None = None
|
|
157
|
+
|
|
158
|
+
@property
|
|
159
|
+
def outcome_evidence(self) -> list[RecalledEvidence]:
|
|
160
|
+
return [e for e in self.evidence if e.record is not None]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _coerce_mapping(meta: Mapping | str | None) -> dict | None:
|
|
164
|
+
if meta is None:
|
|
165
|
+
return None
|
|
166
|
+
if isinstance(meta, str):
|
|
167
|
+
if not meta.strip():
|
|
168
|
+
return None
|
|
169
|
+
try:
|
|
170
|
+
loaded = json.loads(meta)
|
|
171
|
+
except (json.JSONDecodeError, ValueError):
|
|
172
|
+
return None
|
|
173
|
+
return loaded if isinstance(loaded, dict) else None
|
|
174
|
+
if isinstance(meta, Mapping):
|
|
175
|
+
return dict(meta)
|
|
176
|
+
return None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _as_int(value: Any, default: int = 0) -> int:
|
|
180
|
+
try:
|
|
181
|
+
return int(value)
|
|
182
|
+
except (TypeError, ValueError):
|
|
183
|
+
return default
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _as_float(value: Any, default: float = 0.0) -> float:
|
|
187
|
+
try:
|
|
188
|
+
return float(value)
|
|
189
|
+
except (TypeError, ValueError):
|
|
190
|
+
return default
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Run the synchronous Mubit SDK off the event loop.
|
|
2
|
+
|
|
3
|
+
The Mubit Python SDK is blocking (``requests``/``grpc``). Every adapter call goes
|
|
4
|
+
through a worker thread so FastAPI's event loop stays responsive.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import functools
|
|
10
|
+
import inspect
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from typing import TypeVar
|
|
13
|
+
|
|
14
|
+
import anyio
|
|
15
|
+
|
|
16
|
+
# Return type of the wrapped blocking callable; run()/run_cancellable() return the same type.
T = TypeVar("T")

# anyio renamed ``cancellable`` -> ``abandon_on_cancel`` in 4.1. Detect once.
# The installed anyio's run_sync signature is inspected at import time, and
# run_cancellable() branches on the resulting keyword name.
_ABANDON_KW = (
    "abandon_on_cancel"
    if "abandon_on_cancel" in inspect.signature(anyio.to_thread.run_sync).parameters
    else "cancellable"
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def run(func: Callable[..., T], *args: object, **kwargs: object) -> T:
    """Execute a blocking callable on a worker thread and await its result.

    The arguments are bound up front into a zero-argument callable. ``run_sync`` is
    invoked without the abandon/cancellable flag, so the call is not abandoned when
    the awaiting task is cancelled.
    """
    bound_call = functools.partial(func, *args, **kwargs)
    outcome = await anyio.to_thread.run_sync(bound_call)
    return outcome
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
async def run_cancellable(func: Callable[..., T], *args: object, **kwargs: object) -> T:
    """Execute a blocking callable on a worker thread, abandoning it on cancellation.

    Used for the latency-bounded recall path: when the await is cancelled (for
    example by a timeout) the caller stops waiting and the worker thread is left to
    finish in the background.
    """
    bound_call = functools.partial(func, *args, **kwargs)
    # _ABANDON_KW holds whichever flag name the installed anyio accepts
    # ("abandon_on_cancel" or the older "cancellable"), so pass it by name.
    return await anyio.to_thread.run_sync(bound_call, **{_ABANDON_KW: True})
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Measurement layer: calibration, savings accounting, routing health."""
|
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
"""Calibration and routing-health metrics over the decision log.
|
|
2
|
+
|
|
3
|
+
Pure functions over reconciled ``DecisionRecord`` rows — no state of their own, so the
|
|
4
|
+
same code powers the tenant-scoped ``GET /v1/calibration`` endpoint and the ops-side
|
|
5
|
+
``minima-calibration-report`` console script.
|
|
6
|
+
|
|
7
|
+
A recommendation is "reconciled" once feedback arrived; only reconciled rows carry a
|
|
8
|
+
realized label. Calibration compares the chosen candidate's predicted_success at
|
|
9
|
+
decision time against that label (success=1 primary; quality-weighted alongside).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import bisect
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
|
|
17
|
+
from minima.memory.records import clamp01
|
|
18
|
+
from minima.recommender.decisionlog import DecisionRecord
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(slots=True)
class ReliabilityBin:
    """One predicted-probability bucket of a reliability table, filled in by ``_ece``."""

    # Bucket edges; _ece() sets them to i / n_bins and (i + 1) / n_bins.
    lo: float
    hi: float
    # Number of (predicted, realized) pairs assigned to this bucket.
    n: int = 0
    # Mean predicted and mean realized value over the bucket's pairs; both stay 0.0
    # while the bucket is empty.
    avg_predicted: float = 0.0
    avg_realized: float = 0.0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(slots=True)
class CalibrationReport:
    """ECE + reliability for one slice (a task_type, or the global pool)."""

    # "global" for the pooled report, otherwise the task_type of the slice.
    slice_key: str
    # Number of reconciled (predicted, realized) pairs in the slice.
    n: int
    # Expected calibration error against the binary success label.
    ece: float
    # ece pulled toward the global ECE; equal to ece on the global report.
    ece_shrunk: float
    # Expected calibration error against realized quality instead of the label.
    ece_quality: float
    # Reliability bins from the label-based ECE computation.
    bins: list[ReliabilityBin] = field(default_factory=list)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(slots=True)
class CusumFlag:
    """One CUSUM alarm raised by ``cusum_flags`` for a (cluster, model) residual series."""

    cluster: str
    model_id: str
    # Number of residual points in the series that raised the flag.
    n: int
    # Peak value reached by the one-sided CUSUM statistic, rounded to 4 places.
    statistic: float
    direction: str  # "over_predicting" | "under_predicting"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _pairs(rows: list[DecisionRecord]) -> list[tuple[float, float, float]]:
|
|
52
|
+
"""(predicted, realized_label, realized_quality) for reconciled rows."""
|
|
53
|
+
out: list[tuple[float, float, float]] = []
|
|
54
|
+
for r in rows:
|
|
55
|
+
if not r.reconciled:
|
|
56
|
+
continue
|
|
57
|
+
predicted = r.predicted_success_chosen
|
|
58
|
+
if predicted is None:
|
|
59
|
+
continue
|
|
60
|
+
label = 1.0 if r.realized_outcome == "success" else 0.0
|
|
61
|
+
quality = r.realized_quality if r.realized_quality is not None else label
|
|
62
|
+
out.append((predicted, label, quality))
|
|
63
|
+
return out
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _ece(pairs: list[tuple[float, float]], n_bins: int) -> tuple[float, list[ReliabilityBin]]:
    """Compute expected calibration error over equal-width bins on [0, 1].

    Args:
        pairs: ``(predicted, realized)`` values.
        n_bins: Requested number of bins; values below 1 are treated as 1.

    Returns:
        ``(ece, bins)``. ``ece`` is the sum over non-empty bins of
        ``(bin.n / total) * |avg_predicted - avg_realized|``, and ``0.0`` when there
        are no pairs. ``bins`` always holds every bin, including empty ones.
    """
    # Use the same guarded count for the edges and for indexing. The raw n_bins would
    # divide by zero at 0 and produce negative edges below it.
    count = max(1, n_bins)
    bins = [ReliabilityBin(lo=i / count, hi=(i + 1) / count) for i in range(count)]
    sums_p = [0.0] * count
    sums_y = [0.0] * count
    for p, y in pairs:
        # Clamp on both sides: p == 1.0 belongs in the top bin, and a prediction
        # below 0 must land in the bottom bin instead of wrapping around through a
        # negative list index.
        idx = max(0, min(count - 1, int(p * count)))
        bins[idx].n += 1
        sums_p[idx] += p
        sums_y[idx] += y
    total = sum(b.n for b in bins)
    if total == 0:
        return 0.0, bins
    ece = 0.0
    for i, b in enumerate(bins):
        if b.n == 0:
            continue
        b.avg_predicted = sums_p[i] / b.n
        b.avg_realized = sums_y[i] / b.n
        ece += (b.n / total) * abs(b.avg_predicted - b.avg_realized)
    return ece, bins
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def calibration_by_task_type(
    rows: list[DecisionRecord],
    *,
    n_bins: int = 10,
    shrinkage_k: float = 20.0,
) -> list[CalibrationReport]:
    """Report ECE for the global pool and for each task_type.

    The first report (``slice_key="global"``) is the unshrunk pool over all rows. One
    report per task_type follows, in sorted order, for every task_type that has at
    least one reconciled pair. Each slice's ``ece_shrunk`` blends its own ECE with
    the global ECE as ``(n * ece + k * global) / (n + k)``, so a sparse slice stays
    close to the global estimate.
    """

    def _measure(triples: list[tuple[float, float, float]]) -> tuple[float, float, list[ReliabilityBin]]:
        # Label-based ECE (with its bins) plus the quality-based ECE.
        label_ece, label_bins = _ece([(p, y) for p, y, _ in triples], n_bins)
        quality_ece, _ = _ece([(p, q) for p, _, q in triples], n_bins)
        return label_ece, quality_ece, label_bins

    pooled = _pairs(rows)
    global_ece, global_ece_q, global_bins = _measure(pooled)
    reports = [
        CalibrationReport(
            slice_key="global",
            n=len(pooled),
            ece=round(global_ece, 4),
            ece_shrunk=round(global_ece, 4),
            ece_quality=round(global_ece_q, 4),
            bins=global_bins,
        )
    ]

    rows_by_type: dict[str, list[DecisionRecord]] = {}
    for row in rows:
        rows_by_type.setdefault(row.task_type, []).append(row)

    for slice_key in sorted(rows_by_type):
        triples = _pairs(rows_by_type[slice_key])
        if not triples:
            continue
        slice_ece, slice_ece_q, slice_bins = _measure(triples)
        size = len(triples)
        blended = (size * slice_ece + shrinkage_k * global_ece) / (size + shrinkage_k)
        reports.append(
            CalibrationReport(
                slice_key=slice_key,
                n=size,
                ece=round(slice_ece, 4),
                ece_shrunk=round(blended, 4),
                ece_quality=round(slice_ece_q, 4),
                bins=slice_bins,
            )
        )
    return reports
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def cusum_flags(
    rows: list[DecisionRecord],
    *,
    k: float = 0.25,
    h: float = 2.0,
) -> list[CusumFlag]:
    """Run a two-sided CUSUM over ``predicted - realized`` residuals.

    Residuals are grouped per ``(cluster, realized_model_id)`` and ordered by
    feedback time (``feedback_ts``, falling back to ``ts``). Only reconciled rows
    with a prediction and a realized model contribute. ``k`` is the slack subtracted
    at each step and ``h`` the threshold a running sum must exceed to be flagged.

    A series whose upper sum peaks above ``h`` is flagged ``"over_predicting"``; one
    whose lower sum peaks above ``h`` is flagged ``"under_predicting"``. Flags are
    returned with the largest statistic first. This function only detects; it takes
    no action on a flag.
    """
    tracks: dict[tuple[str, str], list[tuple[float, float]]] = {}
    for row in rows:
        if not row.reconciled:
            continue
        predicted = row.predicted_success_chosen
        if predicted is None or row.realized_model_id is None:
            continue
        hit = 1.0 if row.realized_outcome == "success" else 0.0
        key = (row.cluster, row.realized_model_id)
        stamp = row.feedback_ts or row.ts
        tracks.setdefault(key, []).append((stamp, predicted - hit))

    raised: list[CusumFlag] = []
    for (cluster, model_id), points in tracks.items():
        upper = lower = 0.0
        upper_peak = lower_peak = 0.0
        for _, resid in sorted(points, key=lambda point: point[0]):
            upper = max(0.0, upper + resid - k)
            lower = max(0.0, lower - resid - k)
            upper_peak = max(upper_peak, upper)
            lower_peak = max(lower_peak, lower)
        verdicts = (
            (upper_peak, "over_predicting"),
            (lower_peak, "under_predicting"),
        )
        for peak, direction in verdicts:
            if peak > h:
                raised.append(
                    CusumFlag(
                        cluster=cluster,
                        model_id=model_id,
                        n=len(points),
                        statistic=round(peak, 4),
                        direction=direction,
                    )
                )
    return sorted(raised, key=lambda flag: flag.statistic, reverse=True)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def routing_health(rows: list[DecisionRecord]) -> dict[str, float | int]:
    """Summarize the decision stream as a dict of health rates.

    ``feedback_coverage`` is the share of recommendations that ever got feedback —
    the statistic that decides whether the calibration machinery is fit for purpose.
    With no rows, every rate is ``0.0`` and ``recommendations`` is ``0``.
    """
    total = len(rows)
    if total == 0:
        rate_keys = (
            "feedback_coverage",
            "late_feedback_share",
            "escalation_rate",
            "exploration_share",
            "epsilon_policy_share",
            "success_rate",
            "top_model_share",
            "cheapest_model_share",
            "cost_position",
            "shadow_agreement",
        )
        return {"recommendations": 0, **dict.fromkeys(rate_keys, 0.0)}

    def _count(matches) -> int:
        return sum(1 for row in rows if matches(row))

    def _share(part: int, whole: int) -> float:
        return round(part / whole, 4) if whole else 0.0

    reconciled = _count(lambda row: row.reconciled)
    late = _count(lambda row: row.late_feedback)
    escalated = _count(lambda row: row.escalated)
    successes = _count(lambda row: row.realized_outcome == "success")
    # explored counts picks actually changed by the epsilon branch; epsilon_policy
    # counts decisions where exploration was possible at all.
    explored = _count(lambda row: row.explored)
    epsilon_policy = _count(lambda row: row.policy == "epsilon_softmax")
    top_share, cheapest_share, cost_position = _cost_metrics(rows)

    health: dict[str, float | int] = {"recommendations": total}
    health["feedback_coverage"] = _share(reconciled, total)
    health["late_feedback_share"] = _share(late, reconciled)
    health["escalation_rate"] = _share(escalated, total)
    health["exploration_share"] = _share(explored, total)
    health["epsilon_policy_share"] = _share(epsilon_policy, total)
    # Measured over reconciled rows — pair with cost_position for the Pareto view.
    health["success_rate"] = _share(successes, reconciled)
    # Routing-optimality signals over the candidate price ladder:
    #   top_model_share      — share picking the MOST expensive candidate (collapse signal).
    #   cheapest_model_share — share picking the CHEAPEST candidate (aggressive saving).
    #   cost_position        — mean normalized position, 0=cheapest .. 1=priciest.
    health["top_model_share"] = top_share
    health["cheapest_model_share"] = cheapest_share
    health["cost_position"] = cost_position
    # Share of decisions where the advisory shadow pick matched the deployed pick,
    # over rows that logged a shadow pick.
    health["shadow_agreement"] = _shadow_agreement(rows)
    return health
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _shadow_agreement(rows: list[DecisionRecord]) -> float:
|
|
261
|
+
counted = agree = 0
|
|
262
|
+
for r in rows:
|
|
263
|
+
if r.shadow_chosen_model_id is None:
|
|
264
|
+
continue
|
|
265
|
+
counted += 1
|
|
266
|
+
if r.shadow_chosen_model_id == r.chosen_model_id:
|
|
267
|
+
agree += 1
|
|
268
|
+
return round(agree / counted, 4) if counted else 0.0
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _cost_metrics(rows: list[DecisionRecord]) -> tuple[float, float, float]:
|
|
272
|
+
"""(top_model_share, cheapest_model_share, mean cost_position) over rows with candidates."""
|
|
273
|
+
counted = picked_top = picked_cheap = 0
|
|
274
|
+
position_sum = 0.0
|
|
275
|
+
for r in rows:
|
|
276
|
+
if not r.candidates:
|
|
277
|
+
continue
|
|
278
|
+
counted += 1
|
|
279
|
+
costs = [c.est_cost_usd for c in r.candidates]
|
|
280
|
+
lo, hi = min(costs), max(costs)
|
|
281
|
+
chosen = next((c for c in r.candidates if c.model_id == r.chosen_model_id), None)
|
|
282
|
+
if chosen is None:
|
|
283
|
+
continue
|
|
284
|
+
if chosen.est_cost_usd >= hi - 1e-12:
|
|
285
|
+
picked_top += 1
|
|
286
|
+
if chosen.est_cost_usd <= lo + 1e-12:
|
|
287
|
+
picked_cheap += 1
|
|
288
|
+
position_sum += (chosen.est_cost_usd - lo) / (hi - lo) if hi > lo else 0.0
|
|
289
|
+
if not counted:
|
|
290
|
+
return 0.0, 0.0, 0.0
|
|
291
|
+
return (
|
|
292
|
+
round(picked_top / counted, 4),
|
|
293
|
+
round(picked_cheap / counted, 4),
|
|
294
|
+
round(position_sum / counted, 4),
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
# --------------------------------------------------------------------------- calibration FIT
|
|
299
|
+
# The reports above MEASURE calibration; the machinery below FITS a monotonic remap that the
|
|
300
|
+
# recommender applies to predicted_success before the tau-clearing decision, so a "0.7" really
|
|
301
|
+
# means ~70% realized success. Isotonic regression (pool-adjacent-violators) is non-parametric
|
|
302
|
+
# and monotonic; we shrink it toward the identity by n / (n + k) so a sparse slice barely moves
|
|
303
|
+
# (the same hierarchical-shrinkage instinct as ``calibration_by_task_type``). Pure stdlib — no
|
|
304
|
+
# numpy/sklearn — to stay on the recommend() hot path's dependency budget.
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _isotonic_pav(pairs: list[tuple[float, float]]) -> tuple[list[float], list[float]]:
|
|
308
|
+
"""Pool-adjacent-violators isotonic regression of y on x.
|
|
309
|
+
|
|
310
|
+
Returns ``(xs, ys)`` where ``xs`` are block right-edges (ascending) and ``ys`` the
|
|
311
|
+
block means (non-decreasing) — a monotonic step function. Empty when no pairs.
|
|
312
|
+
"""
|
|
313
|
+
pts = sorted(pairs, key=lambda t: t[0])
|
|
314
|
+
if not pts:
|
|
315
|
+
return [], []
|
|
316
|
+
|
|
317
|
+
def _mean(b: list[float]) -> float:
|
|
318
|
+
return b[0] / b[1]
|
|
319
|
+
|
|
320
|
+
# Each block: [sum_y, count, right_edge_x].
|
|
321
|
+
blocks: list[list[float]] = []
|
|
322
|
+
for x, y in pts:
|
|
323
|
+
blocks.append([y, 1.0, x])
|
|
324
|
+
while len(blocks) >= 2 and _mean(blocks[-2]) >= _mean(blocks[-1]):
|
|
325
|
+
sy2, c2, x2 = blocks.pop()
|
|
326
|
+
sy1, c1, x1 = blocks.pop()
|
|
327
|
+
blocks.append([sy1 + sy2, c1 + c2, max(x1, x2)])
|
|
328
|
+
xs = [b[2] for b in blocks]
|
|
329
|
+
ys = [clamp01(b[0] / b[1]) for b in blocks]
|
|
330
|
+
return xs, ys
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
@dataclass(slots=True)
|
|
334
|
+
class IsotonicCalibrator:
|
|
335
|
+
"""A monotonic predicted->realized remap, shrunk toward the identity at low n."""
|
|
336
|
+
|
|
337
|
+
xs: list[float]
|
|
338
|
+
ys: list[float]
|
|
339
|
+
weight: float # shrinkage toward identity: n / (n + k), in [0, 1]
|
|
340
|
+
n: int
|
|
341
|
+
|
|
342
|
+
def transform(self, p: float) -> float:
|
|
343
|
+
if not self.xs:
|
|
344
|
+
return clamp01(p)
|
|
345
|
+
i = bisect.bisect_left(self.xs, p)
|
|
346
|
+
if i >= len(self.ys):
|
|
347
|
+
i = len(self.ys) - 1
|
|
348
|
+
iso = self.ys[i]
|
|
349
|
+
return clamp01(self.weight * iso + (1.0 - self.weight) * p)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
@dataclass(slots=True)
|
|
353
|
+
class CalibratorSet:
|
|
354
|
+
"""Per-task_type calibrators with a global fallback; identity when a slice is unknown."""
|
|
355
|
+
|
|
356
|
+
by_task_type: dict[str, IsotonicCalibrator]
|
|
357
|
+
global_map: IsotonicCalibrator | None
|
|
358
|
+
fitted_at: float
|
|
359
|
+
n: int
|
|
360
|
+
|
|
361
|
+
def transform(self, task_type: str, p: float) -> float:
|
|
362
|
+
m = self.by_task_type.get(task_type) or self.global_map
|
|
363
|
+
return m.transform(p) if m is not None else clamp01(p)
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def _raw_label_pairs(rows: list[DecisionRecord]) -> list[tuple[float, float, str]]:
|
|
367
|
+
"""(raw_predicted_chosen, realized_label, task_type) over reconciled rows."""
|
|
368
|
+
out: list[tuple[float, float, str]] = []
|
|
369
|
+
for r in rows:
|
|
370
|
+
if not r.reconciled:
|
|
371
|
+
continue
|
|
372
|
+
raw = r.raw_predicted_success_chosen
|
|
373
|
+
if raw is None:
|
|
374
|
+
continue
|
|
375
|
+
label = 1.0 if r.realized_outcome == "success" else 0.0
|
|
376
|
+
out.append((raw, label, r.task_type))
|
|
377
|
+
return out
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _fit_one(pairs: list[tuple[float, float]], shrinkage_k: float) -> IsotonicCalibrator | None:
|
|
381
|
+
n = len(pairs)
|
|
382
|
+
if n == 0:
|
|
383
|
+
return None
|
|
384
|
+
xs, ys = _isotonic_pav(pairs)
|
|
385
|
+
weight = n / (n + shrinkage_k)
|
|
386
|
+
return IsotonicCalibrator(xs=xs, ys=ys, weight=weight, n=n)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def fit_calibrators(
    rows: list[DecisionRecord],
    *,
    min_n: int,
    shrinkage_k: float,
    now: float,
) -> CalibratorSet | None:
    """Fit a global isotonic calibrator plus one per sufficiently large task_type.

    Returns ``None`` (meaning identity everywhere) when fewer than ``min_n``
    reconciled pairs exist. A task_type gets its own calibrator only when its slice
    also reaches ``min_n`` pairs; every other task_type relies on the global map.
    ``now`` is stored as the set's ``fitted_at``.
    """
    observations = _raw_label_pairs(rows)
    total = len(observations)
    if total < min_n:
        return None

    global_map = _fit_one([(raw, label) for raw, label, _ in observations], shrinkage_k)

    samples_by_type: dict[str, list[tuple[float, float]]] = {}
    for raw, label, task_type in observations:
        samples_by_type.setdefault(task_type, []).append((raw, label))

    fitted_by_type: dict[str, IsotonicCalibrator] = {}
    for task_type, sample in samples_by_type.items():
        if len(sample) < min_n:
            continue
        calibrator = _fit_one(sample, shrinkage_k)
        if calibrator is not None:
            fitted_by_type[task_type] = calibrator

    return CalibratorSet(
        by_task_type=fitted_by_type,
        global_map=global_map,
        fitted_at=now,
        n=total,
    )
|