minima-cli 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minima/__init__.py +5 -0
- minima/api/__init__.py +1 -0
- minima/api/auth.py +39 -0
- minima/api/errors.py +40 -0
- minima/api/routers/__init__.py +1 -0
- minima/api/routers/calibration.py +50 -0
- minima/api/routers/feedback.py +279 -0
- minima/api/routers/health.py +50 -0
- minima/api/routers/models.py +42 -0
- minima/api/routers/recommend.py +66 -0
- minima/api/routers/savings.py +55 -0
- minima/api/routers/strategies.py +33 -0
- minima/catalog/__init__.py +1 -0
- minima/catalog/data/capability_priors.json +210 -0
- minima/catalog/data/model_aliases.json +12 -0
- minima/catalog/merge.py +69 -0
- minima/catalog/refresh.py +54 -0
- minima/catalog/sources/__init__.py +1 -0
- minima/catalog/sources/litellm.py +19 -0
- minima/catalog/sources/openrouter.py +25 -0
- minima/catalog/store.py +86 -0
- minima/config.py +288 -0
- minima/deps.py +35 -0
- minima/llm/__init__.py +1 -0
- minima/llm/anthropic.py +106 -0
- minima/llm/base.py +196 -0
- minima/llm/gemini.py +124 -0
- minima/llm/registry.py +54 -0
- minima/logging.py +28 -0
- minima/main.py +109 -0
- minima/memory/__init__.py +1 -0
- minima/memory/adapter.py +572 -0
- minima/memory/keys.py +83 -0
- minima/memory/records.py +190 -0
- minima/memory/threadpool.py +41 -0
- minima/metrics/__init__.py +1 -0
- minima/metrics/calibration.py +415 -0
- minima/metrics/report.py +116 -0
- minima/metrics/savings.py +98 -0
- minima/recommender/__init__.py +1 -0
- minima/recommender/_pg_pool.py +38 -0
- minima/recommender/_redis_client.py +32 -0
- minima/recommender/aggregate.py +157 -0
- minima/recommender/classify.py +165 -0
- minima/recommender/decisionlog.py +505 -0
- minima/recommender/durablerefs.py +312 -0
- minima/recommender/engine.py +997 -0
- minima/recommender/escalation.py +83 -0
- minima/recommender/propensity.py +189 -0
- minima/recommender/recstore.py +368 -0
- minima/recommender/score.py +318 -0
- minima/recommender/types.py +166 -0
- minima/schemas/__init__.py +1 -0
- minima/schemas/common.py +73 -0
- minima/schemas/feedback.py +34 -0
- minima/schemas/models_catalog.py +36 -0
- minima/schemas/recommend.py +104 -0
- minima/schemas/savings.py +39 -0
- minima/schemas/strategies.py +57 -0
- minima/schemas/workflow.py +43 -0
- minima/seeding/__init__.py +1 -0
- minima/seeding/items.py +42 -0
- minima/seeding/llmrouterbench.py +232 -0
- minima/seeding/routerbench.py +141 -0
- minima/seeding/run_seed.py +56 -0
- minima/seeding/synthetic.py +70 -0
- minima/tenancy/__init__.py +8 -0
- minima/tenancy/context.py +37 -0
- minima/tenancy/passthrough.py +110 -0
- minima/version.py +3 -0
- minima_cli-0.4.9.dist-info/METADATA +275 -0
- minima_cli-0.4.9.dist-info/RECORD +161 -0
- minima_cli-0.4.9.dist-info/WHEEL +4 -0
- minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
- minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
- minima_client/__init__.py +19 -0
- minima_client/autocapture.py +101 -0
- minima_client/client.py +301 -0
- minima_client/errors.py +23 -0
- minima_harness/LICENSE_PI +32 -0
- minima_harness/__init__.py +16 -0
- minima_harness/agent/__init__.py +72 -0
- minima_harness/agent/agent.py +276 -0
- minima_harness/agent/events.py +124 -0
- minima_harness/agent/loop.py +311 -0
- minima_harness/agent/state.py +79 -0
- minima_harness/agent/tools.py +97 -0
- minima_harness/ai/__init__.py +66 -0
- minima_harness/ai/compat.py +71 -0
- minima_harness/ai/errors.py +96 -0
- minima_harness/ai/events.py +117 -0
- minima_harness/ai/openrouter_catalog.py +153 -0
- minima_harness/ai/provider_catalog.py +299 -0
- minima_harness/ai/provider_quirks.py +37 -0
- minima_harness/ai/providers/__init__.py +75 -0
- minima_harness/ai/providers/_common.py +48 -0
- minima_harness/ai/providers/anthropic.py +290 -0
- minima_harness/ai/providers/base.py +65 -0
- minima_harness/ai/providers/faux.py +173 -0
- minima_harness/ai/providers/google.py +221 -0
- minima_harness/ai/providers/openai_compat.py +278 -0
- minima_harness/ai/registry.py +184 -0
- minima_harness/ai/stream.py +82 -0
- minima_harness/ai/tools.py +51 -0
- minima_harness/ai/types.py +204 -0
- minima_harness/ai/usage.py +41 -0
- minima_harness/minima/__init__.py +40 -0
- minima_harness/minima/cache.py +102 -0
- minima_harness/minima/config.py +85 -0
- minima_harness/minima/goals.py +226 -0
- minima_harness/minima/judge.py +144 -0
- minima_harness/minima/mapping.py +147 -0
- minima_harness/minima/meter.py +143 -0
- minima_harness/minima/router.py +220 -0
- minima_harness/minima/runtime.py +544 -0
- minima_harness/minima/signals.py +195 -0
- minima_harness/session/__init__.py +14 -0
- minima_harness/session/format.py +35 -0
- minima_harness/session/store.py +236 -0
- minima_harness/tasks/__init__.py +17 -0
- minima_harness/tasks/task_set.py +78 -0
- minima_harness/tools/__init__.py +7 -0
- minima_harness/tools/_io.py +34 -0
- minima_harness/tools/bash.py +70 -0
- minima_harness/tools/builtin.py +23 -0
- minima_harness/tools/edit.py +50 -0
- minima_harness/tools/find.py +38 -0
- minima_harness/tools/grep.py +73 -0
- minima_harness/tools/ls.py +35 -0
- minima_harness/tools/read.py +38 -0
- minima_harness/tools/tasks.py +75 -0
- minima_harness/tools/write.py +36 -0
- minima_harness/tui/__init__.py +3 -0
- minima_harness/tui/analytics.py +111 -0
- minima_harness/tui/app.py +1927 -0
- minima_harness/tui/bridge.py +103 -0
- minima_harness/tui/cli.py +227 -0
- minima_harness/tui/clipboard.py +60 -0
- minima_harness/tui/commands.py +49 -0
- minima_harness/tui/compaction.py +17 -0
- minima_harness/tui/config_cli.py +141 -0
- minima_harness/tui/config_store.py +237 -0
- minima_harness/tui/context.py +93 -0
- minima_harness/tui/customize.py +95 -0
- minima_harness/tui/diff.py +53 -0
- minima_harness/tui/editor.py +43 -0
- minima_harness/tui/extensions.py +84 -0
- minima_harness/tui/extra_models.py +52 -0
- minima_harness/tui/history.py +71 -0
- minima_harness/tui/mubit.py +295 -0
- minima_harness/tui/overlays.py +593 -0
- minima_harness/tui/packages.py +59 -0
- minima_harness/tui/run_modes.py +66 -0
- minima_harness/tui/theme.py +77 -0
- minima_harness/tui/welcome.py +83 -0
- minima_harness/tui/widgets/__init__.py +3 -0
- minima_harness/tui/widgets/banner.py +38 -0
- minima_harness/tui/widgets/editor.py +83 -0
- minima_harness/tui/widgets/footer.py +73 -0
- minima_harness/tui/widgets/messages.py +151 -0
- minima_harness/tui/widgets/status.py +57 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""Goal / task tracking for the harness — the data model behind ``/goals``.
|
|
2
|
+
|
|
3
|
+
A :class:`Goal` is a high-level objective plus a checklist of :class:`GoalTask` items the agent
|
|
4
|
+
maintains as it works (one ``in_progress`` at a time). Phase 1 uses this purely to keep the
|
|
5
|
+
agent on-track and show progress; Phase 2 adds cost fields (``est_cost_usd`` / ``actual_cost_usd``
|
|
6
|
+
/ ``budget_usd``) so a goal can be tracked against a budget — Minima's differentiator.
|
|
7
|
+
|
|
8
|
+
State is owned by :class:`GoalStore`, which (de)serializes to the per-session store so a goal
|
|
9
|
+
survives ``--continue`` / ``--resume``.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from dataclasses import asdict, dataclass, field
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
# Closed set of task statuses. GoalTask.make coerces anything else to "pending";
# GoalStore.update_task rejects anything else.
_STATUSES = ("pending", "in_progress", "completed", "blocked")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _slug(text: str, n: int = 24) -> str:
|
|
22
|
+
s = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-")
|
|
23
|
+
return (s[:n].rstrip("-")) or "task"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class GoalTask:
    """One checklist item of a goal, with its status and cost bookkeeping."""

    id: str
    content: str  # imperative form ("Add OAuth login")
    active_form: str = ""  # present-continuous ("Adding OAuth login"); falls back to content
    status: str = "pending"  # pending | in_progress | completed | blocked
    est_cost_usd: float = 0.0  # routing estimate captured while worked (Phase 2)
    actual_cost_usd: float = 0.0  # realized cost attributed to this task (Phase 2)

    @property
    def label(self) -> str:
        """Display text: the active form while in progress (when one is set), else the content."""
        show_active = self.status == "in_progress" and self.active_form
        if show_active:
            return self.active_form
        return self.content

    @classmethod
    def make(cls, content: str, active_form: str = "", status: str = "pending") -> GoalTask:
        """Build a task whose id is the slug of ``content``.

        A ``status`` outside the known set is replaced by ``"pending"``.
        """
        if status not in _STATUSES:
            status = "pending"
        return cls(id=_slug(content), content=content, active_form=active_form, status=status)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class Goal:
    """A high-level objective plus its task checklist and cost bookkeeping."""

    title: str
    tasks: list[GoalTask] = field(default_factory=list)
    task_type: str | None = None
    tags: list[str] = field(default_factory=list)
    budget_usd: float | None = None
    started_ts: float = 0.0
    done: bool = False
    # Cost attributed to turns that ran while no task was in_progress (Phase 2).
    spent_extra_usd: float = 0.0

    def progress(self) -> tuple[int, int]:
        """(completed, total)."""
        return (sum(1 for t in self.tasks if t.status == "completed"), len(self.tasks))

    def active(self) -> GoalTask | None:
        """Return the first task whose status is ``in_progress``, or ``None``."""
        return next((t for t in self.tasks if t.status == "in_progress"), None)

    def routing_signals(self) -> tuple[str | None, list[str]]:
        """(task_type, tags) to feed the router so a goal's turns cluster + route coherently."""
        tags = list(self.tags)
        if self.title:
            # A goal-specific tag is prepended; the stored tag list itself is not mutated.
            tags = [f"goal:{_slug(self.title)}", *tags]
        return self.task_type, tags

    def record_turn_cost(
        self, actual_usd: float, est_usd: float, newly_completed_ids: list[str] | None = None
    ) -> None:
        """Attribute a turn's realized cost. Order of preference:

        1. the in_progress task (the model marked one — ideal);
        2. else split evenly across tasks that flipped to completed THIS turn (the common case
           where a model batches: plan → do the work → mark several done at once);
        3. else the goal at large (``spent_extra_usd``), so goal-level spend is always accurate.

        The estimate is only stored on a task whose ``est_cost_usd`` is still 0.0, so the first
        estimate captured for a task is kept.
        """
        task = self.active()
        if task is not None:
            task.actual_cost_usd += actual_usd
            if task.est_cost_usd == 0.0:
                task.est_cost_usd = est_usd
            return
        # Set membership instead of scanning the id list once per task.
        completed_now = set(newly_completed_ids or ())
        targets = [t for t in self.tasks if t.id in completed_now]
        if targets:
            share_a, share_e = actual_usd / len(targets), est_usd / len(targets)
            for t in targets:
                t.actual_cost_usd += share_a
                if t.est_cost_usd == 0.0:
                    t.est_cost_usd = share_e
            return
        self.spent_extra_usd += actual_usd

    def spent_usd(self) -> float:
        """Total realized spend: every task's actual cost plus the goal-level extra."""
        return sum(t.actual_cost_usd for t in self.tasks) + self.spent_extra_usd

    def projected_total_usd(self) -> float | None:
        """Linear extrapolation of total goal cost from progress (None until ≥1 task done)."""
        done, total = self.progress()
        if done <= 0 or total <= 0:
            return None
        return self.spent_usd() / done * total

    def to_dict(self) -> dict[str, Any]:
        """Serialize the goal (tasks included) to plain dicts/lists via ``dataclasses.asdict``."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Goal:
        """Rebuild a goal from :meth:`to_dict` output.

        Tolerant of payloads that did not come straight from this version of the code:
        explicit ``None`` values (JSON ``null``) for ``title`` / ``tasks`` / ``tags`` /
        ``started_ts`` / ``spent_extra_usd`` fall back to the field defaults, task entries that
        are not dicts are skipped, and unknown task keys are ignored instead of raising.
        """
        raw_tasks = data.get("tasks") or []
        tasks = [cls._task_from_dict(raw) for raw in raw_tasks if isinstance(raw, dict)]
        return cls(
            title=data.get("title") or "",
            tasks=tasks,
            task_type=data.get("task_type"),
            tags=list(data.get("tags") or []),
            budget_usd=data.get("budget_usd"),
            started_ts=float(data.get("started_ts") or 0.0),
            done=bool(data.get("done", False)),
            spent_extra_usd=float(data.get("spent_extra_usd") or 0.0),
        )

    @staticmethod
    def _task_from_dict(raw: dict[str, Any]) -> GoalTask:
        """Build one task from a dict, dropping keys that are not GoalTask fields."""
        from dataclasses import fields  # function-scope: not in the module's import list

        known = {f.name for f in fields(GoalTask)}
        return GoalTask(**{key: value for key, value in raw.items() if key in known})
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class GoalStore:
    """Owns the active goal and (de)serializes it to/from the session store.

    The session store is the single source of truth: :meth:`save` appends a GOAL entry and
    :meth:`load` reads the latest one, so a goal survives resume with no extra storage.
    """

    def __init__(self, goal: Goal | None = None) -> None:
        self.goal = goal

    @property
    def active(self) -> bool:
        """True while a goal exists and has not been marked done."""
        return self.goal is not None and not self.goal.done

    @staticmethod
    def _log_failure(event: str) -> None:
        """Record a swallowed persistence error (with traceback) at DEBUG level."""
        import logging  # function-scope: this module has no file-level logging import

        logging.getLogger("minima_harness.goals").debug(event, exc_info=True)

    # ---- mutation (driven by the `tasks` tool and the /goals command) ----
    def start(self, title: str, *, now: float = 0.0) -> Goal:
        """Replace the current goal with a fresh one titled ``title`` and return it."""
        self.goal = Goal(title=title, started_ts=now)
        return self.goal

    def clear(self) -> None:
        """Mark the current goal done (it stays attached but is no longer active)."""
        if self.goal is not None:
            self.goal.done = True

    def set_budget(self, amount: float | None) -> None:
        """Set (or clear, with ``None``) the current goal's budget; no-op without a goal."""
        if self.goal is not None:
            self.goal.budget_usd = amount

    def completed_ids(self) -> set[str]:
        """Ids of the tasks currently marked completed (empty when there is no goal)."""
        return {t.id for t in self.goal.tasks if t.status == "completed"} if self.goal else set()

    def record_turn_cost(
        self, actual_usd: float, est_usd: float, newly_completed_ids: list[str] | None = None
    ) -> None:
        """Forward a turn's cost to the goal, but only while a goal is active."""
        if self.active and self.goal is not None:
            self.goal.record_turn_cost(actual_usd, est_usd, newly_completed_ids)

    def set_tasks(self, items: list[dict[str, Any]]) -> None:
        """Replace the task list (the model's `tasks set` op).

        Items with blank ``content`` are dropped. Cost already attributed to a previous task
        is carried over to the new task with the same id; cost of previous tasks that no
        longer appear is folded into the goal's ``spent_extra_usd``. Without this, every
        ``set`` would reset per-task costs to zero and the goal's total spend would shrink.
        """
        if self.goal is None:
            self.goal = Goal(title="")
        # Previous tasks not yet claimed by a new task, grouped by id (ids can repeat because
        # they are derived from the task text).
        unclaimed: dict[str, list[GoalTask]] = {}
        for old in self.goal.tasks:
            unclaimed.setdefault(old.id, []).append(old)
        rebuilt: list[GoalTask] = []
        for it in items:
            content = str(it.get("content", "")).strip()
            if not content:
                continue
            task = GoalTask.make(
                content,
                str(it.get("active_form", "")).strip(),
                str(it.get("status", "pending")),
            )
            candidates = unclaimed.get(task.id)
            if candidates:
                previous = candidates.pop(0)
                task.est_cost_usd = previous.est_cost_usd
                task.actual_cost_usd = previous.actual_cost_usd
            rebuilt.append(task)
        orphaned_spend = sum(
            old.actual_cost_usd for leftovers in unclaimed.values() for old in leftovers
        )
        self.goal.spent_extra_usd += orphaned_spend
        self.goal.tasks = rebuilt

    def update_task(self, task_id: str, status: str) -> bool:
        """Set one task's status; enforces the single-in_progress invariant. Returns matched."""
        if self.goal is None or status not in _STATUSES:
            return False
        match = next((t for t in self.goal.tasks if t.id == task_id), None)
        if match is None:
            return False
        if status == "in_progress":  # demote any other in_progress task
            for t in self.goal.tasks:
                if t.status == "in_progress":
                    t.status = "pending"
        match.status = status
        return True

    # ---- persistence ----
    def save(self, session: Any) -> None:
        """Append the current goal to ``session`` as a GOAL entry (best-effort, never raises)."""
        if self.goal is None:
            return
        try:
            from minima_harness.session.format import EntryType

            session.append(EntryType.GOAL, self.goal.to_dict())
        except Exception:  # noqa: BLE001 - goal persistence must never break a turn
            self._log_failure("goal_save_failed")

    def load(self, session: Any) -> None:
        """Restore the goal from the last GOAL entry in ``session.entries``.

        Best-effort: on any failure the current goal is left untouched and the error is only
        logged at DEBUG.
        """
        try:
            from minima_harness.session.format import EntryType

            latest = None
            for entry in getattr(session, "entries", []):
                if entry.type == EntryType.GOAL:
                    latest = entry
            if latest is not None:
                self.goal = Goal.from_dict(latest.payload)
        except Exception:  # noqa: BLE001 - a bad or missing session must not break startup
            self._log_failure("goal_load_failed")

    # ---- prompt rendering ----
    def prompt_block(self) -> str:
        """The goal + open tasks, injected into the system prompt each turn to re-anchor."""
        if not self.active or self.goal is None:
            return ""
        lines = [f"# Current goal: {self.goal.title}".rstrip()]
        if self.goal.tasks:
            lines.append("Task list (keep it current via the `tasks` tool; one in_progress):")
            mark = {"completed": "[x]", "in_progress": "[~]", "blocked": "[!]", "pending": "[ ]"}
            for t in self.goal.tasks:
                lines.append(f" {mark.get(t.status, '[ ]')} {t.content}")
        return "\n".join(lines)
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Quality judging for the Minima feedback loop.
|
|
2
|
+
|
|
3
|
+
A judge turns a model's output into a [0, 1] quality score, which the router folds into
|
|
4
|
+
the outcome label it sends to ``POST /v1/feedback``. Three implementations cover the
|
|
5
|
+
common cases: an LLM grader (default when a key is present), a deterministic scorer
|
|
6
|
+
(wraps a ``quality_fn``, matching ``minima_harness.tasks``), and a constant for when
|
|
7
|
+
judging is disabled.
|
|
8
|
+
|
|
9
|
+
``grade`` returns ``float | None``: ``None`` means the judge ABSTAINS — it could not
|
|
10
|
+
produce a trustworthy score (LLM call failed, output unparseable, or no judge
|
|
11
|
+
configured). Abstention is NOT a failure: feeding a fabricated 0.0 (API error) or a
|
|
12
|
+
neutral 0.5 (unparseable) into ``/v1/feedback`` poisons the learning loop, so the caller
|
|
13
|
+
records the realized cost/latency but sends NO quality/outcome signal on abstention.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import re
|
|
20
|
+
from typing import TYPE_CHECKING, Protocol, runtime_checkable
|
|
21
|
+
|
|
22
|
+
from minima_harness.ai import Context, Message, complete
|
|
23
|
+
from minima_harness.ai.types import Model
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from collections.abc import Callable
|
|
27
|
+
|
|
28
|
+
# Module logger; the judges below report failed scoring attempts through it.
_log = logging.getLogger("minima_harness.judge")

# System prompt for the LLM grader: asks for a bare 0-10 integer so the reply is easy to parse.
JUDGE_SYSTEM = (
    "You grade an AI assistant's response to a task on a 0-10 scale: "
    "10 excellent, 5 acceptable, 0 wrong. "
    "Judge correctness, completeness, and adherence to any rubric. "
    "Reply with ONLY a single integer 0-10, nothing else."
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@runtime_checkable
class QualityJudge(Protocol):
    """Structural interface for quality judges: any object with a matching ``grade`` coroutine."""

    async def grade(
        self,
        task: str,
        output: str,
        *,
        rubric: str = "",
        expected: str = "",
    ) -> float | None:
        """Score ``output`` for ``task`` in [0, 1]; ``None`` means the judge abstains."""
        ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def clamp01(x: float) -> float:
    """Clamp ``x`` to the closed interval [0, 1]."""
    capped = min(1.0, x)  # never above 1
    return max(0.0, capped)  # never below 0
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DeterministicJudge:
    """Wraps a ``quality_fn(output) -> float`` callable (the tasks/task_set convention)."""

    def __init__(self, fn: Callable[[str], float]) -> None:
        self._fn = fn

    async def grade(
        self, task: str, output: str, *, rubric: str = "", expected: str = ""
    ) -> float | None:
        """Score ``output`` with the wrapped callable, clamped to [0, 1].

        Only ``output`` is passed to the callable; ``task``, ``rubric`` and ``expected`` are
        accepted for interface compatibility. Returns ``None`` (abstain) when the callable
        raises, returns something ``float()`` rejects, or returns NaN.
        """
        import math  # function-scope: the module does not import math at file level

        try:
            score = float(self._fn(output))
        except Exception:  # noqa: BLE001 - a broken scorer must ABSTAIN, not record a failure
            _log.warning("deterministic_judge_failed", exc_info=True)
            return None
        if math.isnan(score):
            # Every comparison with NaN is False, so min(1.0, NaN) yields 1.0 and clamping
            # would turn NaN into a fabricated perfect score. Abstain instead.
            return None
        return clamp01(score)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ConstJudge:
    """Returns a fixed quality (or ``None`` to abstain). ``ConstJudge(None)`` = always abstain."""

    def __init__(self, quality: float | None = 0.5) -> None:
        # The value is clamped once here; None is kept as-is to mean "abstain".
        if quality is None:
            self._quality = None
        else:
            self._quality = clamp01(quality)

    async def grade(
        self, task: str, output: str, *, rubric: str = "", expected: str = ""
    ) -> float | None:
        """Return the stored value; all arguments are ignored."""
        return self._quality
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class LLMJudge:
    """Grades via a cheap independent model (default claude-haiku). 0-10 -> /10 -> clamp.

    Uses the harness's own ``ai.complete`` so it shares provider plumbing; pick a
    different provider than your candidates to avoid self-grading bias.
    """

    def __init__(
        self,
        model: Model,
        *,
        api_key: str | None = None,
        timeout: float = 30.0,
    ) -> None:
        self._model = model
        self._api_key = api_key
        self._timeout = timeout

    async def grade(
        self, task: str, output: str, *, rubric: str = "", expected: str = ""
    ) -> float | None:
        """Ask the judge model for a 0-10 score and return it scaled to [0, 1].

        Returns ``None`` (abstain) when the model call raises or no score can be parsed
        from the reply.
        """
        # Each section is truncated so the judge prompt stays bounded.
        sections = [f"TASK:\n{task[:4000]}", f"RESPONSE:\n{output[:4000]}"]
        if rubric:
            sections.append(f"RUBRIC:\n{rubric[:1000]}")
        if expected:
            sections.append(f"EXPECTED:\n{expected[:1000]}")
        user = "\n\n".join(sections)

        # Judge inputs (task + response) are unique per turn, so prompt caching would only
        # incur a cache-write with no future read — disable it.
        options: dict = {"timeout": self._timeout, "prompt_cache": False}
        if self._api_key:
            options["api_key"] = self._api_key

        try:
            context = Context(
                system_prompt=JUDGE_SYSTEM,
                messages=[Message(role="user", content=user)],
            )
            resp = await complete(self._model, context, options=options)
        except Exception:  # noqa: BLE001 - a judge API error is NOT a model failure: abstain
            _log.warning("llm_judge_call_failed", exc_info=True)
            return None

        score = _parse_score(resp.text)
        if score is None:
            # Unparseable judge output -> abstain rather than fabricate a neutral 0.5.
            return None
        return clamp01(score / 10.0)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _parse_score(text: str) -> float | None:
|
|
127
|
+
"""Extract a 0-10 integer score from the judge's reply; ``None`` when none is found.
|
|
128
|
+
|
|
129
|
+
The judge is asked for a bare integer, but real replies vary. Prefer, in order:
|
|
130
|
+
an exact single integer, an ``N/10`` form, a ``score/rating/grade: N`` form, and
|
|
131
|
+
finally the LAST standalone 0-10 integer (judges tend to conclude with the score,
|
|
132
|
+
e.g. "there were 3 issues, so 7"). Returns ``None`` only when no 0-10 integer exists.
|
|
133
|
+
"""
|
|
134
|
+
t = text.strip()
|
|
135
|
+
if re.fullmatch(r"\d+", t) and 0 <= int(t) <= 10:
|
|
136
|
+
return float(t)
|
|
137
|
+
m = re.search(r"\b(\d+)\s*/\s*10\b", t)
|
|
138
|
+
if m and 0 <= int(m.group(1)) <= 10:
|
|
139
|
+
return float(m.group(1))
|
|
140
|
+
m = re.search(r"(?:score|rating|grade)\D{0,5}(\d+)", t, re.IGNORECASE)
|
|
141
|
+
if m and 0 <= int(m.group(1)) <= 10:
|
|
142
|
+
return float(m.group(1))
|
|
143
|
+
candidates = [int(x) for x in re.findall(r"\d+", t) if 0 <= int(x) <= 10]
|
|
144
|
+
return float(candidates[-1]) if candidates else None
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Map a Minima ``RankedModel`` to a harness :class:`~minima_harness.ai.types.Model`.
|
|
2
|
+
|
|
3
|
+
Minima's catalog and the harness registry are kept deliberately separate: Minima is the
|
|
4
|
+
source of truth for *routing*, the harness registry for *calling*. This module bridges
|
|
5
|
+
them with a tolerant lookup (exact -> id-only -> ``provider/model`` split -> fallback) so
|
|
6
|
+
a recommendation resolves to a callable model even when ids drift slightly between the
|
|
7
|
+
two catalogs.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
from minima_harness.ai.provider_catalog import provider_key_present
|
|
16
|
+
from minima_harness.ai.registry import all_models, find_model_by_id, try_get_model
|
|
17
|
+
from minima_harness.ai.types import Model
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from minima.schemas.recommend import RankedModel
|
|
21
|
+
|
|
22
|
+
# Module logger; offline-default fallbacks and catalog-overlay outcomes are logged at DEBUG.
_log = logging.getLogger("minima_harness.mapping")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _has_provider_key(model: Model) -> bool:
    """Report whether a key for ``model``'s OWN provider is available.

    The check is delegated to the provider catalog's ``provider_key_present`` and is keyed on
    ``model.provider``, so a key for a different provider never counts. Per the original
    note, providers that need no key (e.g. a local runtime) are expected to pass as well;
    that rule lives in ``provider_key_present``, not here.
    """
    provider = model.provider
    return provider_key_present(provider)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _fallback_cost(model: Model) -> float:
|
|
35
|
+
"""Sort key for the offline fallback: combined per-token cost, but treat an unpriced
|
|
36
|
+
(cost 0) model as most-expensive so a local/custom 0-cost stub isn't mistaken for the
|
|
37
|
+
cheapest runnable default."""
|
|
38
|
+
total = model.cost.input + model.cost.output
|
|
39
|
+
return float("inf") if total <= 0 else total
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ModelMapping:
    """Resolve Minima's pick to a callable harness model."""

    def to_model(
        self,
        ranked: RankedModel,
        *,
        offline_default: Model | None = None,
    ) -> Model:
        """Return the harness model for ``ranked``.

        Falls back to ``offline_default`` when the pick cannot be resolved; raises
        ``KeyError`` when it cannot be resolved and no fallback was given.
        """
        resolved = self._resolve(ranked.provider, ranked.model_id)
        if resolved is not None:
            return resolved
        if offline_default is None:
            raise KeyError(
                f"no harness model for minima pick {ranked.provider}/{ranked.model_id!r}; "
                "register it or pass an offline_default"
            )
        _log.debug(
            "mapping_fallback_to_offline_default provider=%s model_id=%s",
            ranked.provider,
            ranked.model_id,
        )
        return offline_default

    def default_model(self) -> Model:
        """Offline fallback: the cheapest registered model the user can actually run.

        Models are ordered by fallback cost, then id. The first one whose provider key is
        configured wins, so an offline fallback doesn't pick (say) gpt-4o-mini when only
        Anthropic/Gemini keys are set. If no provider key is present the globally cheapest
        model is returned (the run will then surface a clear provider-auth error rather than
        a silent mismatch). Raises ``KeyError`` when the registry is empty.
        """
        models = all_models()
        if not models:
            raise KeyError("harness model registry is empty")
        by_cost = sorted(models, key=lambda m: (_fallback_cost(m), m.id))
        cheapest_overall = by_cost[0]
        return next((m for m in by_cost if _has_provider_key(m)), cheapest_overall)

    def _resolve(self, provider: str, model_id: str) -> Model | None:
        """Tolerant lookup: exact pair, then id only, then a ``provider/model`` split."""
        # 1. exact (provider, id)
        found = try_get_model(provider, model_id)
        if found is None:
            # 2. id-only (Minima's provider string may differ from ours)
            found = find_model_by_id(model_id)
        if found is None and "/" in model_id:
            # 3. openrouter-style "provider/model" ids
            prefix, _, bare_id = model_id.partition("/")
            found = (
                try_get_model(prefix, model_id)
                or try_get_model(prefix, bare_id)
                or find_model_by_id(bare_id)
            )
        return found
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def sync_catalog(client: object, mapping: ModelMapping | None = None) -> int:
    """Overlay Minima's authoritative live pricing onto the registered harness models.

    Minima's ``GET /v1/models`` carries cost/context that the server overlays from live
    LiteLLM pricing and *scores routing against*. The harness registry is hand-seeded and can
    drift from it, so the cost the harness reports for a call can disagree with the cost the
    server routed on — which corrupts the est-vs-actual loop. This pulls the catalog and
    overlays cost/context/max_output onto each matching registered model (tolerant id match,
    reusing :meth:`ModelMapping._resolve`). Returns the number of models updated.

    Offline-safe: a failed catalog fetch (unreachable Minima, bad response shape) is logged at
    DEBUG and returns 0, leaving the seeded prices in place. A single malformed card is
    logged at DEBUG and skipped without aborting the rest of the overlay. A card price that
    is ``None`` keeps the seeded price for that field. ``client`` is duck-typed on a sync
    ``.models()``.
    """
    from minima_harness.ai.registry import register_model
    from minima_harness.ai.types import ModelCost

    mapping = mapping or ModelMapping()
    try:
        resp = client.models(include_stale=True)  # type: ignore[attr-defined]
        cards = list(getattr(resp, "models", None) or [])
    except Exception:  # noqa: BLE001 - the harness must run on the seeded catalog if this fails
        _log.debug("catalog_overlay_skipped", exc_info=True)
        return 0
    updated = 0
    for card in cards:
        try:
            model = mapping._resolve(card.provider, card.model_id)
            if model is None:
                continue
            # Read every card field before touching the model, so a card that is missing an
            # attribute cannot leave the model half-updated.
            input_cost = card.input_cost_per_mtok
            output_cost = card.output_cost_per_mtok
            cache_read = card.cache_read_cost_per_mtok
            context_window = card.context_window
            max_output = card.max_output_tokens
            seeded = model.cost
            overlay = ModelCost(
                input=input_cost if input_cost is not None else seeded.input,
                output=output_cost if output_cost is not None else seeded.output,
                cache_read=cache_read if cache_read is not None else seeded.cache_read,
                cache_write=seeded.cache_write,
            )
            model.cost = overlay
            if context_window:
                model.context_window = context_window
            if max_output:
                model.max_tokens = max_output
            register_model(model)  # re-register (same instance) so the overlay is authoritative
        except Exception:  # noqa: BLE001 - one malformed card must not abort the overlay
            _log.debug("catalog_overlay_card_skipped", exc_info=True)
            continue
        updated += 1
    _log.debug("catalog_overlay matched %d of %d minima catalog models", updated, len(cards))
    return updated
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""CostMeter — per-prompt cost observability for a MinimaAgent run.
|
|
2
|
+
|
|
3
|
+
Owned by :class:`MinimaAgent` (the routing decision isn't part of the ``AgentEvent``
|
|
4
|
+
stream, so the meter is fed directly from ``prompt()`` rather than via ``subscribe()``).
|
|
5
|
+
Accumulates one row per prompt — model picked, why, est vs actual $, savings vs the
|
|
6
|
+
configured baseline, quality, outcome — and renders a report + summary totals. This is
|
|
7
|
+
the "see exactly what you spend and why" surface: the data already flowed to Minima; the
|
|
8
|
+
meter just surfaces it to the human.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from minima_harness.minima.router import RoutingResult
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(slots=True)
|
|
21
|
+
class CostRow:
|
|
22
|
+
label: str
|
|
23
|
+
model: str
|
|
24
|
+
decision_basis: str
|
|
25
|
+
est_cost_usd: float
|
|
26
|
+
actual_cost_usd: float
|
|
27
|
+
baseline_cost_usd: float | None
|
|
28
|
+
quality: float | None
|
|
29
|
+
outcome: str
|
|
30
|
+
turns: int = 0
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(slots=True)
|
|
34
|
+
class CostTotals:
|
|
35
|
+
n: int = 0
|
|
36
|
+
est_cost_usd: float = 0.0
|
|
37
|
+
actual_cost_usd: float = 0.0
|
|
38
|
+
baseline_cost_usd: float = 0.0
|
|
39
|
+
baseline_rows: int = 0 # prompts that had a baseline to compare against
|
|
40
|
+
successes: int = 0
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def savings_usd(self) -> float:
|
|
44
|
+
return self.baseline_cost_usd - self.actual_cost_usd
|
|
45
|
+
|
|
46
|
+
@property
|
|
47
|
+
def savings_pct(self) -> float:
|
|
48
|
+
if self.baseline_cost_usd <= 0:
|
|
49
|
+
return 0.0
|
|
50
|
+
return 100.0 * self.savings_usd / self.baseline_cost_usd
|
|
51
|
+
|
|
52
|
+
@property
|
|
53
|
+
def success_rate(self) -> float:
|
|
54
|
+
return (100.0 * self.successes / self.n) if self.n else 0.0
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class CostMeter:
|
|
58
|
+
def __init__(self) -> None:
|
|
59
|
+
self.rows: list[CostRow] = []
|
|
60
|
+
|
|
61
|
+
def record(
|
|
62
|
+
self,
|
|
63
|
+
*,
|
|
64
|
+
label: str,
|
|
65
|
+
routing: RoutingResult | None,
|
|
66
|
+
actual_cost_usd: float,
|
|
67
|
+
quality: float | None,
|
|
68
|
+
outcome: str,
|
|
69
|
+
turns: int = 0,
|
|
70
|
+
) -> CostRow:
|
|
71
|
+
row = CostRow(
|
|
72
|
+
label=label,
|
|
73
|
+
model=(routing.chosen_model_id if routing else None) or "(offline)",
|
|
74
|
+
decision_basis=routing.decision_basis if routing else "-",
|
|
75
|
+
est_cost_usd=routing.est_cost_usd if routing else 0.0,
|
|
76
|
+
actual_cost_usd=actual_cost_usd,
|
|
77
|
+
baseline_cost_usd=routing.baseline_cost_usd if routing else None,
|
|
78
|
+
quality=quality,
|
|
79
|
+
outcome=outcome,
|
|
80
|
+
turns=turns,
|
|
81
|
+
)
|
|
82
|
+
self.rows.append(row)
|
|
83
|
+
return row
|
|
84
|
+
|
|
85
|
+
def totals(self) -> CostTotals:
|
|
86
|
+
t = CostTotals()
|
|
87
|
+
for r in self.rows:
|
|
88
|
+
t.n += 1
|
|
89
|
+
t.est_cost_usd += r.est_cost_usd
|
|
90
|
+
t.actual_cost_usd += r.actual_cost_usd
|
|
91
|
+
if r.baseline_cost_usd is not None:
|
|
92
|
+
t.baseline_cost_usd += r.baseline_cost_usd
|
|
93
|
+
t.baseline_rows += 1
|
|
94
|
+
if r.outcome == "success":
|
|
95
|
+
t.successes += 1
|
|
96
|
+
return t
|
|
97
|
+
|
|
98
|
+
def report(self) -> str:
|
|
99
|
+
if not self.rows:
|
|
100
|
+
return "(cost meter: no prompts recorded)"
|
|
101
|
+
cols = [
|
|
102
|
+
"label",
|
|
103
|
+
"model",
|
|
104
|
+
"basis",
|
|
105
|
+
"est$",
|
|
106
|
+
"actual$",
|
|
107
|
+
"save$",
|
|
108
|
+
"turns",
|
|
109
|
+
"quality",
|
|
110
|
+
"outcome",
|
|
111
|
+
]
|
|
112
|
+
rendered = [
|
|
113
|
+
{
|
|
114
|
+
"label": r.label,
|
|
115
|
+
"model": r.model,
|
|
116
|
+
"basis": r.decision_basis,
|
|
117
|
+
"est$": f"{r.est_cost_usd:.6f}",
|
|
118
|
+
"actual$": f"{r.actual_cost_usd:.6f}",
|
|
119
|
+
"save$": (
|
|
120
|
+
f"{r.baseline_cost_usd - r.actual_cost_usd:.6f}"
|
|
121
|
+
if r.baseline_cost_usd is not None
|
|
122
|
+
else "-"
|
|
123
|
+
),
|
|
124
|
+
"turns": str(r.turns),
|
|
125
|
+
"quality": f"{r.quality:.2f}" if r.quality is not None else "-",
|
|
126
|
+
"outcome": r.outcome,
|
|
127
|
+
}
|
|
128
|
+
for r in self.rows
|
|
129
|
+
]
|
|
130
|
+
widths = {c: max(len(c), max(len(str(row[c])) for row in rendered)) for c in cols}
|
|
131
|
+
header = " ".join(c.ljust(widths[c]) for c in cols)
|
|
132
|
+
lines = [header, "-" * len(header)]
|
|
133
|
+
for row in rendered:
|
|
134
|
+
lines.append(" ".join(str(row[c]).ljust(widths[c]) for c in cols))
|
|
135
|
+
t = self.totals()
|
|
136
|
+
lines.append("")
|
|
137
|
+
lines.append(
|
|
138
|
+
f"total actual ${t.actual_cost_usd:.6f} | "
|
|
139
|
+
f"baseline ${t.baseline_cost_usd:.6f} ({t.baseline_rows} rows) | "
|
|
140
|
+
f"savings {t.savings_pct:.1f}% (${t.savings_usd:.6f}) | "
|
|
141
|
+
f"success {t.success_rate:.1f}% ({t.successes}/{t.n})"
|
|
142
|
+
)
|
|
143
|
+
return "\n".join(lines)
|