minima-cli 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. minima/__init__.py +5 -0
  2. minima/api/__init__.py +1 -0
  3. minima/api/auth.py +39 -0
  4. minima/api/errors.py +40 -0
  5. minima/api/routers/__init__.py +1 -0
  6. minima/api/routers/calibration.py +50 -0
  7. minima/api/routers/feedback.py +279 -0
  8. minima/api/routers/health.py +50 -0
  9. minima/api/routers/models.py +42 -0
  10. minima/api/routers/recommend.py +66 -0
  11. minima/api/routers/savings.py +55 -0
  12. minima/api/routers/strategies.py +33 -0
  13. minima/catalog/__init__.py +1 -0
  14. minima/catalog/data/capability_priors.json +210 -0
  15. minima/catalog/data/model_aliases.json +12 -0
  16. minima/catalog/merge.py +69 -0
  17. minima/catalog/refresh.py +54 -0
  18. minima/catalog/sources/__init__.py +1 -0
  19. minima/catalog/sources/litellm.py +19 -0
  20. minima/catalog/sources/openrouter.py +25 -0
  21. minima/catalog/store.py +86 -0
  22. minima/config.py +288 -0
  23. minima/deps.py +35 -0
  24. minima/llm/__init__.py +1 -0
  25. minima/llm/anthropic.py +106 -0
  26. minima/llm/base.py +196 -0
  27. minima/llm/gemini.py +124 -0
  28. minima/llm/registry.py +54 -0
  29. minima/logging.py +28 -0
  30. minima/main.py +109 -0
  31. minima/memory/__init__.py +1 -0
  32. minima/memory/adapter.py +572 -0
  33. minima/memory/keys.py +83 -0
  34. minima/memory/records.py +190 -0
  35. minima/memory/threadpool.py +41 -0
  36. minima/metrics/__init__.py +1 -0
  37. minima/metrics/calibration.py +415 -0
  38. minima/metrics/report.py +116 -0
  39. minima/metrics/savings.py +98 -0
  40. minima/recommender/__init__.py +1 -0
  41. minima/recommender/_pg_pool.py +38 -0
  42. minima/recommender/_redis_client.py +32 -0
  43. minima/recommender/aggregate.py +157 -0
  44. minima/recommender/classify.py +165 -0
  45. minima/recommender/decisionlog.py +505 -0
  46. minima/recommender/durablerefs.py +312 -0
  47. minima/recommender/engine.py +997 -0
  48. minima/recommender/escalation.py +83 -0
  49. minima/recommender/propensity.py +189 -0
  50. minima/recommender/recstore.py +368 -0
  51. minima/recommender/score.py +318 -0
  52. minima/recommender/types.py +166 -0
  53. minima/schemas/__init__.py +1 -0
  54. minima/schemas/common.py +73 -0
  55. minima/schemas/feedback.py +34 -0
  56. minima/schemas/models_catalog.py +36 -0
  57. minima/schemas/recommend.py +104 -0
  58. minima/schemas/savings.py +39 -0
  59. minima/schemas/strategies.py +57 -0
  60. minima/schemas/workflow.py +43 -0
  61. minima/seeding/__init__.py +1 -0
  62. minima/seeding/items.py +42 -0
  63. minima/seeding/llmrouterbench.py +232 -0
  64. minima/seeding/routerbench.py +141 -0
  65. minima/seeding/run_seed.py +56 -0
  66. minima/seeding/synthetic.py +70 -0
  67. minima/tenancy/__init__.py +8 -0
  68. minima/tenancy/context.py +37 -0
  69. minima/tenancy/passthrough.py +110 -0
  70. minima/version.py +3 -0
  71. minima_cli-0.4.9.dist-info/METADATA +275 -0
  72. minima_cli-0.4.9.dist-info/RECORD +161 -0
  73. minima_cli-0.4.9.dist-info/WHEEL +4 -0
  74. minima_cli-0.4.9.dist-info/entry_points.txt +5 -0
  75. minima_cli-0.4.9.dist-info/licenses/LICENSE +295 -0
  76. minima_client/__init__.py +19 -0
  77. minima_client/autocapture.py +101 -0
  78. minima_client/client.py +301 -0
  79. minima_client/errors.py +23 -0
  80. minima_harness/LICENSE_PI +32 -0
  81. minima_harness/__init__.py +16 -0
  82. minima_harness/agent/__init__.py +72 -0
  83. minima_harness/agent/agent.py +276 -0
  84. minima_harness/agent/events.py +124 -0
  85. minima_harness/agent/loop.py +311 -0
  86. minima_harness/agent/state.py +79 -0
  87. minima_harness/agent/tools.py +97 -0
  88. minima_harness/ai/__init__.py +66 -0
  89. minima_harness/ai/compat.py +71 -0
  90. minima_harness/ai/errors.py +96 -0
  91. minima_harness/ai/events.py +117 -0
  92. minima_harness/ai/openrouter_catalog.py +153 -0
  93. minima_harness/ai/provider_catalog.py +299 -0
  94. minima_harness/ai/provider_quirks.py +37 -0
  95. minima_harness/ai/providers/__init__.py +75 -0
  96. minima_harness/ai/providers/_common.py +48 -0
  97. minima_harness/ai/providers/anthropic.py +290 -0
  98. minima_harness/ai/providers/base.py +65 -0
  99. minima_harness/ai/providers/faux.py +173 -0
  100. minima_harness/ai/providers/google.py +221 -0
  101. minima_harness/ai/providers/openai_compat.py +278 -0
  102. minima_harness/ai/registry.py +184 -0
  103. minima_harness/ai/stream.py +82 -0
  104. minima_harness/ai/tools.py +51 -0
  105. minima_harness/ai/types.py +204 -0
  106. minima_harness/ai/usage.py +41 -0
  107. minima_harness/minima/__init__.py +40 -0
  108. minima_harness/minima/cache.py +102 -0
  109. minima_harness/minima/config.py +85 -0
  110. minima_harness/minima/goals.py +226 -0
  111. minima_harness/minima/judge.py +144 -0
  112. minima_harness/minima/mapping.py +147 -0
  113. minima_harness/minima/meter.py +143 -0
  114. minima_harness/minima/router.py +220 -0
  115. minima_harness/minima/runtime.py +544 -0
  116. minima_harness/minima/signals.py +195 -0
  117. minima_harness/session/__init__.py +14 -0
  118. minima_harness/session/format.py +35 -0
  119. minima_harness/session/store.py +236 -0
  120. minima_harness/tasks/__init__.py +17 -0
  121. minima_harness/tasks/task_set.py +78 -0
  122. minima_harness/tools/__init__.py +7 -0
  123. minima_harness/tools/_io.py +34 -0
  124. minima_harness/tools/bash.py +70 -0
  125. minima_harness/tools/builtin.py +23 -0
  126. minima_harness/tools/edit.py +50 -0
  127. minima_harness/tools/find.py +38 -0
  128. minima_harness/tools/grep.py +73 -0
  129. minima_harness/tools/ls.py +35 -0
  130. minima_harness/tools/read.py +38 -0
  131. minima_harness/tools/tasks.py +75 -0
  132. minima_harness/tools/write.py +36 -0
  133. minima_harness/tui/__init__.py +3 -0
  134. minima_harness/tui/analytics.py +111 -0
  135. minima_harness/tui/app.py +1927 -0
  136. minima_harness/tui/bridge.py +103 -0
  137. minima_harness/tui/cli.py +227 -0
  138. minima_harness/tui/clipboard.py +60 -0
  139. minima_harness/tui/commands.py +49 -0
  140. minima_harness/tui/compaction.py +17 -0
  141. minima_harness/tui/config_cli.py +141 -0
  142. minima_harness/tui/config_store.py +237 -0
  143. minima_harness/tui/context.py +93 -0
  144. minima_harness/tui/customize.py +95 -0
  145. minima_harness/tui/diff.py +53 -0
  146. minima_harness/tui/editor.py +43 -0
  147. minima_harness/tui/extensions.py +84 -0
  148. minima_harness/tui/extra_models.py +52 -0
  149. minima_harness/tui/history.py +71 -0
  150. minima_harness/tui/mubit.py +295 -0
  151. minima_harness/tui/overlays.py +593 -0
  152. minima_harness/tui/packages.py +59 -0
  153. minima_harness/tui/run_modes.py +66 -0
  154. minima_harness/tui/theme.py +77 -0
  155. minima_harness/tui/welcome.py +83 -0
  156. minima_harness/tui/widgets/__init__.py +3 -0
  157. minima_harness/tui/widgets/banner.py +38 -0
  158. minima_harness/tui/widgets/editor.py +83 -0
  159. minima_harness/tui/widgets/footer.py +73 -0
  160. minima_harness/tui/widgets/messages.py +151 -0
  161. minima_harness/tui/widgets/status.py +57 -0
@@ -0,0 +1,226 @@
1
+ """Goal / task tracking for the harness — the data model behind ``/goals``.
2
+
3
+ A :class:`Goal` is a high-level objective plus a checklist of :class:`GoalTask` items the agent
4
+ maintains as it works (one ``in_progress`` at a time). Phase 1 uses this purely to keep the
5
+ agent on-track and show progress; Phase 2 adds cost fields (``est_cost_usd`` / ``actual_cost_usd``
6
+ / ``budget_usd``) so a goal can be tracked against a budget — Minima's differentiator.
7
+
8
+ State is owned by :class:`GoalStore`, which (de)serializes to the per-session store so a goal
9
+ survives ``--continue`` / ``--resume``.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from dataclasses import asdict, dataclass, field
16
+ from typing import Any
17
+
18
+ # Allowed task states: GoalTask.make coerces any other value to "pending" and
+ # GoalStore.update_task rejects any other value.
+ _STATUSES = ("pending", "in_progress", "completed", "blocked")
19
+
20
+
21
+ def _slug(text: str, n: int = 24) -> str:
22
+ s = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-")
23
+ return (s[:n].rstrip("-")) or "task"
24
+
25
+
26
@dataclass
class GoalTask:
    """One checklist item tracked under a goal.

    :meth:`make` derives ``id`` from ``content``; both cost fields start at zero.
    """

    id: str
    content: str  # imperative form ("Add OAuth login")
    active_form: str = ""  # present-continuous ("Adding OAuth login"); falls back to content
    status: str = "pending"  # pending | in_progress | completed | blocked
    est_cost_usd: float = 0.0  # routing estimate captured while worked (Phase 2)
    actual_cost_usd: float = 0.0  # realized cost attributed to this task (Phase 2)

    @property
    def label(self) -> str:
        """Display text: ``active_form`` while in progress (when set), else ``content``."""
        if self.status != "in_progress" or not self.active_form:
            return self.content
        return self.active_form

    @classmethod
    def make(cls, content: str, active_form: str = "", status: str = "pending") -> GoalTask:
        """Build a task whose id is the slug of ``content``; unknown statuses become pending."""
        if status not in _STATUSES:
            status = "pending"
        return cls(id=_slug(content), content=content, active_form=active_form, status=status)
45
+
46
+
47
@dataclass
class Goal:
    """A high-level objective plus the checklist of tasks tracked against it.

    Turn cost is attributed to tasks where possible and to ``spent_extra_usd`` otherwise,
    so :meth:`spent_usd` always covers the whole goal.
    """

    title: str
    tasks: list[GoalTask] = field(default_factory=list)
    task_type: str | None = None
    tags: list[str] = field(default_factory=list)
    budget_usd: float | None = None
    started_ts: float = 0.0
    done: bool = False
    # Cost attributed to turns that ran while no task was in_progress (Phase 2).
    spent_extra_usd: float = 0.0

    def progress(self) -> tuple[int, int]:
        """(completed, total)."""
        return (sum(1 for t in self.tasks if t.status == "completed"), len(self.tasks))

    def active(self) -> GoalTask | None:
        """Return the first task whose status is ``in_progress``, or ``None``."""
        return next((t for t in self.tasks if t.status == "in_progress"), None)

    def routing_signals(self) -> tuple[str | None, list[str]]:
        """(task_type, tags) to feed the router so a goal's turns cluster + route coherently."""
        tags = list(self.tags)
        if self.title:
            tags = [f"goal:{_slug(self.title)}", *tags]
        return self.task_type, tags

    def record_turn_cost(
        self, actual_usd: float, est_usd: float, newly_completed_ids: list[str] | None = None
    ) -> None:
        """Attribute a turn's realized cost. Order of preference:

        1. the in_progress task (the model marked one — ideal);
        2. else split evenly across tasks that flipped to completed THIS turn (the common case
           where a model batches: plan → do the work → mark several done at once);
        3. else the goal at large (``spent_extra_usd``), so goal-level spend is always accurate.

        A task's ``est_cost_usd`` is only written while it is still ``0.0``; in case 3 the
        estimate is not stored anywhere.
        """
        task = self.active()
        if task is not None:
            task.actual_cost_usd += actual_usd
            if task.est_cost_usd == 0.0:
                task.est_cost_usd = est_usd
            return
        # Set lookup: one pass over the tasks instead of a list scan per task.
        wanted = set(newly_completed_ids or ())
        targets = [t for t in self.tasks if t.id in wanted]
        if targets:
            share_a, share_e = actual_usd / len(targets), est_usd / len(targets)
            for t in targets:
                t.actual_cost_usd += share_a
                if t.est_cost_usd == 0.0:
                    t.est_cost_usd = share_e
            return
        self.spent_extra_usd += actual_usd

    def spent_usd(self) -> float:
        """Total realized spend: the sum of every task's cost plus ``spent_extra_usd``."""
        return sum(t.actual_cost_usd for t in self.tasks) + self.spent_extra_usd

    def projected_total_usd(self) -> float | None:
        """Linear extrapolation of total goal cost from progress (None until ≥1 task done)."""
        done, total = self.progress()
        if done <= 0 or total <= 0:
            return None
        return self.spent_usd() / done * total

    def to_dict(self) -> dict[str, Any]:
        """Serialize the goal, including nested tasks, to plain dicts and lists."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Goal:
        """Rebuild a goal from :meth:`to_dict` output, tolerating drifted payloads.

        Unknown task keys are ignored, non-dict task entries are skipped, and ``None``
        for a list or numeric field falls back to that field's default. This keeps one
        unexpected field from discarding the whole goal when a stored payload is reloaded.

        Raises:
            TypeError: if a task entry lacks the required ``id`` or ``content`` key.
        """
        # Local import: the module-level import only brings in asdict/dataclass/field.
        from dataclasses import fields

        raw_tasks = [t for t in (data.get("tasks") or []) if isinstance(t, dict)]
        tasks: list[GoalTask] = []
        if raw_tasks:
            known = {f.name for f in fields(GoalTask)}
            tasks = [GoalTask(**{k: v for k, v in t.items() if k in known}) for t in raw_tasks]
        return cls(
            title=data.get("title") or "",
            tasks=tasks,
            task_type=data.get("task_type"),
            tags=list(data.get("tags") or []),
            budget_usd=data.get("budget_usd"),
            started_ts=float(data.get("started_ts") or 0.0),
            done=bool(data.get("done", False)),
            spent_extra_usd=float(data.get("spent_extra_usd") or 0.0),
        )
125
+
126
+
127
class GoalStore:
    """Owns the active goal and (de)serializes it to/from the session store.

    The session store is the single source of truth: :meth:`save` appends a GOAL entry and
    :meth:`load` reads the latest one, so a goal survives resume with no extra storage.
    """

    def __init__(self, goal: Goal | None = None) -> None:
        self.goal = goal

    @property
    def active(self) -> bool:
        """True while a goal exists and has not been marked done."""
        return self.goal is not None and not self.goal.done

    # ---- mutation (driven by the `tasks` tool and the /goals command) ----
    def start(self, title: str, *, now: float = 0.0) -> Goal:
        """Replace the current goal with a fresh one titled ``title`` and return it."""
        self.goal = Goal(title=title, started_ts=now)
        return self.goal

    def clear(self) -> None:
        """Mark the current goal done (it is kept, not deleted); no-op without a goal."""
        if self.goal is not None:
            self.goal.done = True

    def set_budget(self, amount: float | None) -> None:
        """Set or clear (``None``) the current goal's budget; no-op without a goal."""
        if self.goal is not None:
            self.goal.budget_usd = amount

    def completed_ids(self) -> set[str]:
        """Ids of the tasks currently marked ``completed`` (empty without a goal)."""
        return {t.id for t in self.goal.tasks if t.status == "completed"} if self.goal else set()

    def record_turn_cost(
        self, actual_usd: float, est_usd: float, newly_completed_ids: list[str] | None = None
    ) -> None:
        """Forward a turn's cost to the goal, but only while the goal is active."""
        if self.active and self.goal is not None:
            self.goal.record_turn_cost(actual_usd, est_usd, newly_completed_ids)

    def set_tasks(self, items: list[dict[str, Any]]) -> None:
        """Replace the task list (the model's `tasks set` op).

        Items whose stripped ``content`` is empty are dropped. Creates an untitled goal
        when none exists. Ids come from the content slug; when two items would share an
        id, later ones get a ``-2``, ``-3``, … suffix so that :meth:`update_task` and cost
        attribution can address each task individually.

        NOTE(review): every task is rebuilt from scratch, so per-task cost fields restart
        at zero on each replacement — confirm callers do not expect them to carry over.
        """
        if self.goal is None:
            self.goal = Goal(title="")
        used_ids: set[str] = set()
        rebuilt: list[GoalTask] = []
        for it in items:
            content = str(it.get("content", "")).strip()
            if not content:
                continue
            task = GoalTask.make(
                content,
                str(it.get("active_form", "")).strip(),
                str(it.get("status", "pending")),
            )
            base_id, attempt = task.id, 1
            while task.id in used_ids:  # slug collision: probe for the next free suffix
                attempt += 1
                task.id = f"{base_id}-{attempt}"
            used_ids.add(task.id)
            rebuilt.append(task)
        self.goal.tasks = rebuilt

    def update_task(self, task_id: str, status: str) -> bool:
        """Set one task's status; enforces the single-in_progress invariant. Returns matched."""
        if self.goal is None or status not in _STATUSES:
            return False
        match = next((t for t in self.goal.tasks if t.id == task_id), None)
        if match is None:
            return False
        if status == "in_progress":  # demote any other in_progress task
            for t in self.goal.tasks:
                if t.status == "in_progress":
                    t.status = "pending"
        match.status = status
        return True

    # ---- persistence ----
    def save(self, session: Any) -> None:
        """Append the current goal to ``session`` as a GOAL entry (best effort).

        Failures are logged at DEBUG and otherwise ignored.
        """
        if self.goal is None:
            return
        try:
            from minima_harness.session.format import EntryType

            session.append(EntryType.GOAL, self.goal.to_dict())
        except Exception:  # noqa: BLE001 - goal persistence must never break a turn
            import logging

            logging.getLogger("minima_harness.goals").debug("goal_save_failed", exc_info=True)

    def load(self, session: Any) -> None:
        """Restore the goal from the last GOAL entry in ``session.entries`` (best effort).

        Leaves ``self.goal`` untouched when there is no such entry or loading fails;
        failures are logged at DEBUG.
        """
        try:
            from minima_harness.session.format import EntryType

            latest = None
            for entry in getattr(session, "entries", []):
                if entry.type == EntryType.GOAL:
                    latest = entry
            if latest is not None:
                self.goal = Goal.from_dict(latest.payload)
        except Exception:  # noqa: BLE001 - a broken entry must not break resume
            import logging

            logging.getLogger("minima_harness.goals").debug("goal_load_failed", exc_info=True)

    # ---- prompt rendering ----
    def prompt_block(self) -> str:
        """The goal + open tasks, injected into the system prompt each turn to re-anchor."""
        if not self.active or self.goal is None:
            return ""
        lines = [f"# Current goal: {self.goal.title}".rstrip()]
        if self.goal.tasks:
            lines.append("Task list (keep it current via the `tasks` tool; one in_progress):")
            mark = {"completed": "[x]", "in_progress": "[~]", "blocked": "[!]", "pending": "[ ]"}
            for t in self.goal.tasks:
                lines.append(f" {mark.get(t.status, '[ ]')} {t.content}")
        return "\n".join(lines)
@@ -0,0 +1,144 @@
1
+ """Quality judging for the Minima feedback loop.
2
+
3
+ A judge turns a model's output into a [0, 1] quality score, which the router folds into
4
+ the outcome label it sends to ``POST /v1/feedback``. Three implementations cover the
5
+ common cases: an LLM grader (default when a key is present), a deterministic scorer
6
+ (wraps a ``quality_fn``, matching ``minima_harness.tasks``), and a constant for when
7
+ judging is disabled.
8
+
9
+ ``grade`` returns ``float | None``: ``None`` means the judge ABSTAINS — it could not
10
+ produce a trustworthy score (LLM call failed, output unparseable, or no judge
11
+ configured). Abstention is NOT a failure: feeding a fabricated 0.0 (API error) or a
12
+ neutral 0.5 (unparseable) into ``/v1/feedback`` poisons the learning loop, so the caller
13
+ records the realized cost/latency but sends NO quality/outcome signal on abstention.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import re
20
+ from typing import TYPE_CHECKING, Protocol, runtime_checkable
21
+
22
+ from minima_harness.ai import Context, Message, complete
23
+ from minima_harness.ai.types import Model
24
+
25
+ if TYPE_CHECKING:
26
+ from collections.abc import Callable
27
+
28
+ _log = logging.getLogger("minima_harness.judge")
29
+
30
+ # System prompt sent by LLMJudge.grade; it asks for a bare 0-10 integer, which the
+ # judge then parses and divides by 10.
+ JUDGE_SYSTEM = (
31
+ "You grade an AI assistant's response to a task on a 0-10 scale: 10 excellent, "
32
+ "5 acceptable, 0 wrong. Judge correctness, completeness, and adherence to any rubric. "
33
+ "Reply with ONLY a single integer 0-10, nothing else."
34
+ )
35
+
36
+
37
@runtime_checkable
class QualityJudge(Protocol):
    """Structural interface for judges: anything exposing an async ``grade`` method."""

    async def grade(
        self,
        task: str,
        output: str,
        *,
        rubric: str = "",
        expected: str = "",
    ) -> float | None:
        """Score ``output`` for ``task`` in [0, 1]; ``None`` means the judge abstains."""
        ...
44
+
45
+
46
def clamp01(x: float) -> float:
    """Clamp ``x`` into the closed interval [0, 1]."""
    capped = min(1.0, x)
    return max(0.0, capped)
48
+
49
+
50
class DeterministicJudge:
    """Wraps a ``quality_fn(output) -> float`` callable (the tasks/task_set convention)."""

    def __init__(self, fn: Callable[[str], float]) -> None:
        self._fn = fn

    async def grade(
        self, task: str, output: str, *, rubric: str = "", expected: str = ""
    ) -> float | None:
        """Score ``output`` with the wrapped callable, clamped to [0, 1].

        Only ``output`` is passed to the callable. Abstains (returns ``None``) when the
        callable raises, returns something ``float()`` rejects, or returns NaN.
        """
        import math

        try:
            score = float(self._fn(output))
        except Exception:  # noqa: BLE001 - a broken scorer must ABSTAIN, not record a failure
            _log.warning("deterministic_judge_failed", exc_info=True)
            return None
        if math.isnan(score):
            # min(1.0, nan) evaluates to 1.0, so clamping would report a perfect score.
            _log.warning("deterministic_judge_nan")
            return None
        return clamp01(score)
64
+
65
+
66
class ConstJudge:
    """Returns a fixed quality (or ``None`` to abstain). ``ConstJudge(None)`` = always abstain."""

    def __init__(self, quality: float | None = 0.5) -> None:
        # Clamp once at construction time; None is kept as the "always abstain" marker.
        if quality is None:
            self._quality = None
        else:
            self._quality = clamp01(quality)

    async def grade(
        self, task: str, output: str, *, rubric: str = "", expected: str = ""
    ) -> float | None:
        """Return the configured value, ignoring every argument."""
        return self._quality
76
+
77
+
78
class LLMJudge:
    """Grades via a cheap independent model (default claude-haiku). 0-10 -> /10 -> clamp.

    Uses the harness's own ``ai.complete`` so it shares provider plumbing; pick a
    different provider than your candidates to avoid self-grading bias.
    """

    def __init__(
        self,
        model: Model,
        *,
        api_key: str | None = None,
        timeout: float = 30.0,
    ) -> None:
        self._model = model
        self._api_key = api_key
        self._timeout = timeout

    async def grade(
        self, task: str, output: str, *, rubric: str = "", expected: str = ""
    ) -> float | None:
        """Ask the judge model for a 0-10 score and return it scaled to [0, 1].

        ``task`` and ``output`` are truncated to 4000 characters, ``rubric`` and
        ``expected`` to 1000. Returns ``None`` (abstain) when the call raises or the
        reply contains no parseable score.
        """
        sections = [f"TASK:\n{task[:4000]}", f"RESPONSE:\n{output[:4000]}"]
        if rubric:
            sections.append(f"RUBRIC:\n{rubric[:1000]}")
        if expected:
            sections.append(f"EXPECTED:\n{expected[:1000]}")
        user = "\n\n".join(sections)
        # Judge inputs (task + response) are unique per turn, so prompt caching would only
        # incur a cache-write with no future read — disable it.
        options: dict = {"timeout": self._timeout, "prompt_cache": False}
        if self._api_key:
            options["api_key"] = self._api_key
        try:
            resp = await complete(
                self._model,
                Context(
                    system_prompt=JUDGE_SYSTEM,
                    messages=[Message(role="user", content=user)],
                ),
                options=options,
            )
        except Exception:  # noqa: BLE001 - a judge API error is NOT a model failure: abstain
            _log.warning("llm_judge_call_failed", exc_info=True)
            return None
        score = _parse_score(resp.text)
        if score is None:
            # Unparseable judge output -> abstain rather than fabricate a neutral 0.5.
            return None
        return clamp01(score / 10.0)
124
+
125
+
126
+ def _parse_score(text: str) -> float | None:
127
+ """Extract a 0-10 integer score from the judge's reply; ``None`` when none is found.
128
+
129
+ The judge is asked for a bare integer, but real replies vary. Prefer, in order:
130
+ an exact single integer, an ``N/10`` form, a ``score/rating/grade: N`` form, and
131
+ finally the LAST standalone 0-10 integer (judges tend to conclude with the score,
132
+ e.g. "there were 3 issues, so 7"). Returns ``None`` only when no 0-10 integer exists.
133
+ """
134
+ t = text.strip()
135
+ if re.fullmatch(r"\d+", t) and 0 <= int(t) <= 10:
136
+ return float(t)
137
+ m = re.search(r"\b(\d+)\s*/\s*10\b", t)
138
+ if m and 0 <= int(m.group(1)) <= 10:
139
+ return float(m.group(1))
140
+ m = re.search(r"(?:score|rating|grade)\D{0,5}(\d+)", t, re.IGNORECASE)
141
+ if m and 0 <= int(m.group(1)) <= 10:
142
+ return float(m.group(1))
143
+ candidates = [int(x) for x in re.findall(r"\d+", t) if 0 <= int(x) <= 10]
144
+ return float(candidates[-1]) if candidates else None
@@ -0,0 +1,147 @@
1
+ """Map a Minima ``RankedModel`` to a harness :class:`~minima_harness.ai.types.Model`.
2
+
3
+ Minima's catalog and the harness registry are kept deliberately separate: Minima is the
4
+ source of truth for *routing*, the harness registry for *calling*. This module bridges
5
+ them with a tolerant lookup (exact -> id-only -> ``provider/model`` split -> fallback) so
6
+ a recommendation resolves to a callable model even when ids drift slightly between the
7
+ two catalogs.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ from typing import TYPE_CHECKING
14
+
15
+ from minima_harness.ai.provider_catalog import provider_key_present
16
+ from minima_harness.ai.registry import all_models, find_model_by_id, try_get_model
17
+ from minima_harness.ai.types import Model
18
+
19
+ if TYPE_CHECKING:
20
+ from minima.schemas.recommend import RankedModel
21
+
22
+ _log = logging.getLogger("minima_harness.mapping")
23
+
24
+
25
def _has_provider_key(model: Model) -> bool:
    """Report whether the provider catalog considers ``model``'s own provider usable.

    The check is keyed on ``model.provider`` alone and delegated to
    ``provider_key_present``, so a key for one provider is never looked up on behalf of
    a model that belongs to another.
    """
    own_provider = model.provider
    return provider_key_present(own_provider)
32
+
33
+
34
+ def _fallback_cost(model: Model) -> float:
35
+ """Sort key for the offline fallback: combined per-token cost, but treat an unpriced
36
+ (cost 0) model as most-expensive so a local/custom 0-cost stub isn't mistaken for the
37
+ cheapest runnable default."""
38
+ total = model.cost.input + model.cost.output
39
+ return float("inf") if total <= 0 else total
40
+
41
+
42
class ModelMapping:
    """Resolve Minima's pick to a callable harness model."""

    def to_model(
        self,
        ranked: RankedModel,
        *,
        offline_default: Model | None = None,
    ) -> Model:
        """Return the harness model for ``ranked``, else ``offline_default``.

        Raises:
            KeyError: when the pick cannot be resolved and no ``offline_default`` is given.
        """
        resolved = self._resolve(ranked.provider, ranked.model_id)
        if resolved is not None:
            return resolved
        if offline_default is None:
            raise KeyError(
                f"no harness model for minima pick {ranked.provider}/{ranked.model_id!r}; "
                "register it or pass an offline_default"
            )
        _log.debug(
            "mapping_fallback_to_offline_default provider=%s model_id=%s",
            ranked.provider,
            ranked.model_id,
        )
        return offline_default

    def default_model(self) -> Model:
        """Offline fallback: the cheapest registered model the user can actually run.

        Models are ordered by fallback cost, then id. The first one whose provider key
        is present wins; when none has a key, the cheapest overall is returned.

        Raises:
            KeyError: when the registry holds no models.
        """
        registered = all_models()
        if not registered:
            raise KeyError("harness model registry is empty")
        cheapest_first = sorted(registered, key=lambda m: (_fallback_cost(m), m.id))
        runnable = (m for m in cheapest_first if _has_provider_key(m))
        return next(runnable, cheapest_first[0])

    def _resolve(self, provider: str, model_id: str) -> Model | None:
        """Tolerant lookup: exact pair, then id only, then ``provider/model`` split."""
        # 1. exact (provider, id)
        hit = try_get_model(provider, model_id)
        if hit is None:
            # 2. id-only (Minima's provider string may differ from ours)
            hit = find_model_by_id(model_id)
        if hit is None and "/" in model_id:
            # 3. openrouter-style "provider/model" ids
            prefix, _, bare_id = model_id.partition("/")
            hit = (
                try_get_model(prefix, model_id)
                or try_get_model(prefix, bare_id)
                or find_model_by_id(bare_id)
            )
        return hit
100
+
101
+
102
def sync_catalog(client: object, mapping: ModelMapping | None = None) -> int:
    """Overlay Minima's authoritative live pricing onto the registered harness models.

    Minima's ``GET /v1/models`` carries cost/context that the server overlays from live
    LiteLLM pricing and *scores routing against*. The harness registry is hand-seeded and can
    drift from it, so the cost the harness reports for a call can disagree with the cost the
    server routed on — which corrupts the est-vs-actual loop. This pulls the catalog and
    overlays cost/context/max_output onto each matching registered model (tolerant id match,
    reusing :meth:`ModelMapping._resolve`). Returns the number of models updated.

    Offline-safe: a failed catalog fetch is logged at DEBUG and returns 0, leaving the
    seeded prices in place. A single malformed card (missing attribute, bad value) is
    logged at DEBUG and skipped without touching its model, so it cannot abort the
    overlay for the remaining cards. A price the card reports as ``None`` keeps the
    seeded price. ``client`` is duck-typed on a sync ``.models()``.
    """
    from minima_harness.ai.registry import register_model
    from minima_harness.ai.types import ModelCost

    mapping = mapping or ModelMapping()
    try:
        resp = client.models(include_stale=True)  # type: ignore[attr-defined]
        cards = list(getattr(resp, "models", None) or [])
    except Exception:  # noqa: BLE001 - the harness must run on the seeded catalog if this fails
        _log.debug("catalog_overlay_skipped", exc_info=True)
        return 0
    updated = 0
    for card in cards:
        # Read everything from the card BEFORE mutating the model, so a card that fails
        # halfway leaves its model fully on the seeded values.
        try:
            model = mapping._resolve(card.provider, card.model_id)
            if model is None:
                continue
            seeded = model.cost
            new_cost = ModelCost(
                input=(
                    card.input_cost_per_mtok
                    if card.input_cost_per_mtok is not None
                    else seeded.input
                ),
                output=(
                    card.output_cost_per_mtok
                    if card.output_cost_per_mtok is not None
                    else seeded.output
                ),
                cache_read=(
                    card.cache_read_cost_per_mtok
                    if card.cache_read_cost_per_mtok is not None
                    else seeded.cache_read
                ),
                cache_write=seeded.cache_write,
            )
            context_window = card.context_window
            max_output_tokens = card.max_output_tokens
        except Exception:  # noqa: BLE001 - one bad card must not abort the whole overlay
            _log.debug("catalog_overlay_card_skipped", exc_info=True)
            continue
        model.cost = new_cost
        if context_window:
            model.context_window = context_window
        if max_output_tokens:
            model.max_tokens = max_output_tokens
        register_model(model)  # re-register (same instance) so the overlay is authoritative
        updated += 1
    _log.debug("catalog_overlay matched %d of %d minima catalog models", updated, len(cards))
    return updated
@@ -0,0 +1,143 @@
1
+ """CostMeter — per-prompt cost observability for a MinimaAgent run.
2
+
3
+ Owned by :class:`MinimaAgent` (the routing decision isn't part of the ``AgentEvent``
4
+ stream, so the meter is fed directly from ``prompt()`` rather than via ``subscribe()``).
5
+ Accumulates one row per prompt — model picked, why, est vs actual $, savings vs the
6
+ configured baseline, quality, outcome — and renders a report + summary totals. This is
7
+ the "see exactly what you spend and why" surface: the data already flowed to Minima; the
8
+ meter just surfaces it to the human.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from typing import TYPE_CHECKING
15
+
16
+ if TYPE_CHECKING:
17
+ from minima_harness.minima.router import RoutingResult
18
+
19
+
20
+ @dataclass(slots=True)
21
+ class CostRow:
22
+ label: str
23
+ model: str
24
+ decision_basis: str
25
+ est_cost_usd: float
26
+ actual_cost_usd: float
27
+ baseline_cost_usd: float | None
28
+ quality: float | None
29
+ outcome: str
30
+ turns: int = 0
31
+
32
+
33
+ @dataclass(slots=True)
34
+ class CostTotals:
35
+ n: int = 0
36
+ est_cost_usd: float = 0.0
37
+ actual_cost_usd: float = 0.0
38
+ baseline_cost_usd: float = 0.0
39
+ baseline_rows: int = 0 # prompts that had a baseline to compare against
40
+ successes: int = 0
41
+
42
+ @property
43
+ def savings_usd(self) -> float:
44
+ return self.baseline_cost_usd - self.actual_cost_usd
45
+
46
+ @property
47
+ def savings_pct(self) -> float:
48
+ if self.baseline_cost_usd <= 0:
49
+ return 0.0
50
+ return 100.0 * self.savings_usd / self.baseline_cost_usd
51
+
52
+ @property
53
+ def success_rate(self) -> float:
54
+ return (100.0 * self.successes / self.n) if self.n else 0.0
55
+
56
+
57
+ class CostMeter:
58
+ def __init__(self) -> None:
59
+ self.rows: list[CostRow] = []
60
+
61
+ def record(
62
+ self,
63
+ *,
64
+ label: str,
65
+ routing: RoutingResult | None,
66
+ actual_cost_usd: float,
67
+ quality: float | None,
68
+ outcome: str,
69
+ turns: int = 0,
70
+ ) -> CostRow:
71
+ row = CostRow(
72
+ label=label,
73
+ model=(routing.chosen_model_id if routing else None) or "(offline)",
74
+ decision_basis=routing.decision_basis if routing else "-",
75
+ est_cost_usd=routing.est_cost_usd if routing else 0.0,
76
+ actual_cost_usd=actual_cost_usd,
77
+ baseline_cost_usd=routing.baseline_cost_usd if routing else None,
78
+ quality=quality,
79
+ outcome=outcome,
80
+ turns=turns,
81
+ )
82
+ self.rows.append(row)
83
+ return row
84
+
85
+ def totals(self) -> CostTotals:
86
+ t = CostTotals()
87
+ for r in self.rows:
88
+ t.n += 1
89
+ t.est_cost_usd += r.est_cost_usd
90
+ t.actual_cost_usd += r.actual_cost_usd
91
+ if r.baseline_cost_usd is not None:
92
+ t.baseline_cost_usd += r.baseline_cost_usd
93
+ t.baseline_rows += 1
94
+ if r.outcome == "success":
95
+ t.successes += 1
96
+ return t
97
+
98
+ def report(self) -> str:
99
+ if not self.rows:
100
+ return "(cost meter: no prompts recorded)"
101
+ cols = [
102
+ "label",
103
+ "model",
104
+ "basis",
105
+ "est$",
106
+ "actual$",
107
+ "save$",
108
+ "turns",
109
+ "quality",
110
+ "outcome",
111
+ ]
112
+ rendered = [
113
+ {
114
+ "label": r.label,
115
+ "model": r.model,
116
+ "basis": r.decision_basis,
117
+ "est$": f"{r.est_cost_usd:.6f}",
118
+ "actual$": f"{r.actual_cost_usd:.6f}",
119
+ "save$": (
120
+ f"{r.baseline_cost_usd - r.actual_cost_usd:.6f}"
121
+ if r.baseline_cost_usd is not None
122
+ else "-"
123
+ ),
124
+ "turns": str(r.turns),
125
+ "quality": f"{r.quality:.2f}" if r.quality is not None else "-",
126
+ "outcome": r.outcome,
127
+ }
128
+ for r in self.rows
129
+ ]
130
+ widths = {c: max(len(c), max(len(str(row[c])) for row in rendered)) for c in cols}
131
+ header = " ".join(c.ljust(widths[c]) for c in cols)
132
+ lines = [header, "-" * len(header)]
133
+ for row in rendered:
134
+ lines.append(" ".join(str(row[c]).ljust(widths[c]) for c in cols))
135
+ t = self.totals()
136
+ lines.append("")
137
+ lines.append(
138
+ f"total actual ${t.actual_cost_usd:.6f} | "
139
+ f"baseline ${t.baseline_cost_usd:.6f} ({t.baseline_rows} rows) | "
140
+ f"savings {t.savings_pct:.1f}% (${t.savings_usd:.6f}) | "
141
+ f"success {t.success_rate:.1f}% ({t.successes}/{t.n})"
142
+ )
143
+ return "\n".join(lines)