metaensemble 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evals/README.md +147 -0
- evals/__init__.py +0 -0
- evals/cassettes/README.md +10 -0
- evals/cassettes/bootstrap.jsonl +800 -0
- evals/configs/default.yaml +59 -0
- evals/datasets/__init__.py +0 -0
- evals/datasets/suite_a/tasks.yaml +123 -0
- evals/datasets/suite_b/items.yaml +90 -0
- evals/runners/__init__.py +12 -0
- evals/runners/api.py +518 -0
- evals/runners/metrics.py +132 -0
- metaensemble/__init__.py +13 -0
- metaensemble/cli.py +1362 -0
- metaensemble/commands/dispatch.md +39 -0
- metaensemble/commands/executors.md +12 -0
- metaensemble/commands/ledger.md +19 -0
- metaensemble/commands/limits.md +12 -0
- metaensemble/commands/perf.md +12 -0
- metaensemble/commands/relaunch.md +29 -0
- metaensemble/commands/standup.md +14 -0
- metaensemble/config/budgets.example.yaml +72 -0
- metaensemble/config/quality.example.yaml +82 -0
- metaensemble/hooks/__init__.py +1 -0
- metaensemble/hooks/_common.py +148 -0
- metaensemble/hooks/deliverable_sync.py +73 -0
- metaensemble/hooks/file_event.py +303 -0
- metaensemble/hooks/post_task.py +460 -0
- metaensemble/hooks/pre_task.py +548 -0
- metaensemble/hooks/session_start.py +212 -0
- metaensemble/hooks/session_summary.py +392 -0
- metaensemble/hooks/subagent_stop.py +94 -0
- metaensemble/lib/__init__.py +1 -0
- metaensemble/lib/config.py +414 -0
- metaensemble/lib/cost_gate.py +299 -0
- metaensemble/lib/dispatch.py +341 -0
- metaensemble/lib/doctor.py +1563 -0
- metaensemble/lib/file_events.py +395 -0
- metaensemble/lib/ids.py +91 -0
- metaensemble/lib/installer.py +5018 -0
- metaensemble/lib/ledger.py +812 -0
- metaensemble/lib/manifest.py +141 -0
- metaensemble/lib/native_state.py +463 -0
- metaensemble/lib/overlaps.py +155 -0
- metaensemble/lib/quality_gate.py +155 -0
- metaensemble/lib/quality_runners.py +446 -0
- metaensemble/lib/reconcile.py +420 -0
- metaensemble/lib/recording.py +422 -0
- metaensemble/lib/relaunch.py +174 -0
- metaensemble/lib/runtime_payload.py +42 -0
- metaensemble/lib/runtime_state.py +308 -0
- metaensemble/lib/sidecar.py +166 -0
- metaensemble/lib/topology.py +181 -0
- metaensemble/lib/transcript.py +432 -0
- metaensemble/output-styles/deliverable.md +33 -0
- metaensemble/output-styles/wire.md +38 -0
- metaensemble/roles/architect.md +52 -0
- metaensemble/roles/backend.md +43 -0
- metaensemble/roles/code-quality.md +49 -0
- metaensemble/roles/data-engineer.md +42 -0
- metaensemble/roles/devops.md +42 -0
- metaensemble/roles/docs.md +41 -0
- metaensemble/roles/frontend.md +42 -0
- metaensemble/roles/ml-engineer.md +42 -0
- metaensemble/roles/test-engineer.md +42 -0
- metaensemble/schemas/brief.schema.json +80 -0
- metaensemble/schemas/manifest.schema.json +142 -0
- metaensemble/schemas/role.schema.json +84 -0
- metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
- metaensemble/state/migrations/001_init.sql +72 -0
- metaensemble/state/migrations/002_outcome_extended.sql +86 -0
- metaensemble/state/migrations/003_run_provenance.sql +36 -0
- metaensemble/statusline/me_status.py +187 -0
- metaensemble/tools/__init__.py +7 -0
- metaensemble/tools/executors.py +62 -0
- metaensemble/tools/ledger.py +121 -0
- metaensemble/tools/limits.py +165 -0
- metaensemble/tools/perf.py +150 -0
- metaensemble/tools/standup.py +177 -0
- metaensemble/tools/stats.py +115 -0
- metaensemble-0.2.0.dist-info/METADATA +221 -0
- metaensemble-0.2.0.dist-info/RECORD +85 -0
- metaensemble-0.2.0.dist-info/WHEEL +5 -0
- metaensemble-0.2.0.dist-info/entry_points.txt +2 -0
- metaensemble-0.2.0.dist-info/licenses/LICENSE +21 -0
- metaensemble-0.2.0.dist-info/top_level.txt +2 -0
evals/runners/api.py
ADDED
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
"""Tiered runner dispatch for the evaluation harness.
|
|
2
|
+
|
|
3
|
+
Three tiers correspond to three failure-mode budgets. `replay` reads
|
|
4
|
+
cassette responses recorded from a prior live run — zero API spend,
|
|
5
|
+
deterministic, suitable for PR-gate CI. `smoke` runs one seed against
|
|
6
|
+
the smoke suite to verify the live pipeline still works. `full` runs
|
|
7
|
+
the release-gated cycle with every cell × every seed.
|
|
8
|
+
|
|
9
|
+
Live API calls are issued through the `anthropic` SDK, which the
|
|
10
|
+
package already imports indirectly via the runtime. The runner does
|
|
11
|
+
not bundle a vendored SDK so production and eval use the same client.
|
|
12
|
+
|
|
13
|
+
The replay path is deterministic and CI-safe. The live smoke path uses
|
|
14
|
+
Claude Code directly with tools disabled so smoke/full metrics can be
|
|
15
|
+
measured without silently changing the project under evaluation.
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import math
|
|
21
|
+
import re
|
|
22
|
+
import subprocess
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from enum import Enum
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Callable
|
|
27
|
+
|
|
28
|
+
from evals.runners.metrics import (
|
|
29
|
+
CellMetrics,
|
|
30
|
+
RunOutcome,
|
|
31
|
+
compute_cell_metrics,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Tier(str, Enum):
    """Evaluation tier that selects how a harness cycle obtains its outcomes.

    The ``str`` mixin makes every member compare equal to its plain string
    value, so a tier can be matched against raw configuration text.
    """

    # Per the module docstring: reads recorded cassettes, no API spend.
    REPLAY = "replay"
    # Per the module docstring: one seed against the smoke suite.
    SMOKE = "smoke"
    # Per the module docstring: every cell x every seed, release-gated.
    FULL = "full"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True)
class CellSpec:
    """One cell of the (baseline × suite) matrix."""

    # Cell identifier: keys the cassette lookup and selects the Suite B prompt
    # framing in this module.
    id: str
    kind: str  # "baseline" | "full_system" | "ablation"
    dispatch_fn: str  # symbolic name of the dispatch strategy
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class TaskSpec:
    """One evaluation task (or classification item) belonging to a suite."""

    # Task identifier: keys the cassette lookup and the per-item predictions.
    id: str
    suite: str  # "suite_a" | "suite_b"
    # Task text; embedded (JSON-escaped) in the Suite B prompt.
    description: str
    # Acceptance entries; not read by the runner code in this file.
    acceptance: list[dict]
    # Labels that count as a correct Suite B prediction; None means no labels.
    acceptable_labels: list[str] | None = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass(frozen=True)
class HarnessReport:
    """One eval cycle's full result. Rendered to Markdown by `render_report`."""

    tier: Tier  # tier the cycle ran under; shown in the report title
    cells: list[CellMetrics]  # one aggregated metrics row per cell
    notes: list[str]  # rendered as a bullet list when non-empty
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def evaluate_release_gates(
    report: HarnessReport,
    *,
    failed_run_waste_threshold: float | None = None,
    overhead_ratio_ceiling: float | None = None,
) -> tuple[bool, list[str]]:
    """Check a report against the D-9 waste gate and the D-8 overhead gate.

    A gate is only considered when its threshold argument is not None.

    Args:
        report: Report whose ``cells`` carry the aggregated metrics.
        failed_run_waste_threshold: Maximum tolerated share of all tokens that
            may belong to failed runs (D-9), summed across cells.
        overhead_ratio_ceiling: Maximum tolerated per-cell
            ``orchestration_overhead_ratio`` (D-8).

    Returns:
        ``(failed, notes)``: ``failed`` is True when any evaluated gate
        failed; ``notes`` holds one human-readable line per requested gate.
        When no cell has an overhead ratio, D-8 is reported as "not
        evaluated" and does not influence ``failed``.
    """
    gate_failed = False
    messages: list[str] = []

    if failed_run_waste_threshold is not None:
        spent = sum(cell.total_tokens for cell in report.cells)
        wasted = sum(cell.failed_run_token_waste for cell in report.cells)
        # A report without any tokens counts as zero waste.
        fraction = (wasted / spent) if spent else 0.0
        over_limit = fraction > failed_run_waste_threshold
        verdict = "FAIL" if over_limit else "PASS"
        gate_failed = gate_failed or over_limit
        messages.append(
            "D-9 failed-run waste gate: "
            f"{verdict} ({fraction:.1%} of tokens; "
            f"threshold {failed_run_waste_threshold:.1%})."
        )

    if overhead_ratio_ceiling is None:
        return gate_failed, messages

    with_ratio = [
        cell for cell in report.cells
        if cell.orchestration_overhead_ratio is not None
    ]
    if not with_ratio:
        messages.append(
            "D-8 orchestration-overhead gate: not evaluated "
            "(best-prompt baseline tokens unavailable in this run)."
        )
        return gate_failed, messages

    offenders = [
        cell for cell in with_ratio
        if cell.orchestration_overhead_ratio > overhead_ratio_ceiling
    ]
    if not offenders:
        messages.append(
            "D-8 orchestration-overhead gate: "
            f"PASS (ceiling {overhead_ratio_ceiling:.2f}x)."
        )
        return gate_failed, messages

    listing = ", ".join(
        f"{cell.cell_id}={cell.orchestration_overhead_ratio:.2f}x"
        for cell in offenders
    )
    messages.append(
        "D-8 orchestration-overhead gate: "
        f"FAIL ({listing}; ceiling {overhead_ratio_ceiling:.2f}x)."
    )
    return True, messages
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def run_cell_replay(
    cell: CellSpec,
    tasks: list[TaskSpec],
    cassette_dir: Path,
    seeds: int = 5,
) -> list[RunOutcome]:
    """Replay tier: read cassettes from disk. No API calls.

    For every task and every seed in ``range(seeds)`` the outcome is loaded
    from ``cassette_dir/<cell.id>/<task.id>/<seed>.json``. When that file is
    absent, the ``*.jsonl`` packs directly under ``cassette_dir`` are searched
    for a matching record instead.

    Args:
        cell: Cell whose ``id`` selects the cassette sub-directory.
        tasks: Tasks to replay, in output order.
        cassette_dir: Root directory of the recorded cassettes.
        seeds: Number of seeds to replay per task (seed values ``0..seeds-1``).

    Returns:
        One ``RunOutcome`` per (task, seed), task-major.

    Raises:
        FileNotFoundError: A cassette exists neither as a per-file JSON nor in
            any pack, so a PR that adds a task without recording its cassette
            fails CI deterministically.
    """
    outcomes: list[RunOutcome] = []
    for task in tasks:
        for seed in range(seeds):
            path = cassette_dir / cell.id / task.id / f"{seed}.json"
            if path.exists():
                # JSON is UTF-8 by specification; do not depend on the
                # platform's locale encoding when decoding cassettes.
                data = json.loads(path.read_text(encoding="utf-8"))
                outcomes.append(RunOutcome(**data))
                continue
            packed = _load_packed_replay(cassette_dir, cell.id, task.id, seed)
            if packed is None:
                raise FileNotFoundError(
                    f"replay cassette missing: {path}. Record it with "
                    "`metaensemble eval --tier smoke --record-cassettes` "
                    "or add an entry to evals/cassettes/*.jsonl."
                )
            outcomes.append(packed)
    return outcomes
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _load_packed_replay(
|
|
162
|
+
cassette_dir: Path,
|
|
163
|
+
cell_id: str,
|
|
164
|
+
task_id: str,
|
|
165
|
+
seed: int,
|
|
166
|
+
) -> RunOutcome | None:
|
|
167
|
+
"""Read compact JSONL cassette packs.
|
|
168
|
+
|
|
169
|
+
The per-file cassette path is the canonical recorder output. The shipped
|
|
170
|
+
v0.1.0 bootstrap pack uses JSONL to avoid hundreds of tiny fixture files
|
|
171
|
+
while still exercising the same replay parser and metrics code in CI.
|
|
172
|
+
"""
|
|
173
|
+
if not cassette_dir.exists():
|
|
174
|
+
return None
|
|
175
|
+
for pack in sorted(cassette_dir.glob("*.jsonl")):
|
|
176
|
+
try:
|
|
177
|
+
lines = pack.read_text().splitlines()
|
|
178
|
+
except OSError:
|
|
179
|
+
continue
|
|
180
|
+
for line in lines:
|
|
181
|
+
if not line.strip() or line.lstrip().startswith("#"):
|
|
182
|
+
continue
|
|
183
|
+
try:
|
|
184
|
+
record = json.loads(line)
|
|
185
|
+
except json.JSONDecodeError:
|
|
186
|
+
continue
|
|
187
|
+
try:
|
|
188
|
+
record_seed = int(record.get("seed", -1))
|
|
189
|
+
except (TypeError, ValueError):
|
|
190
|
+
continue
|
|
191
|
+
if (
|
|
192
|
+
record.get("cell_id") == cell_id
|
|
193
|
+
and record.get("task_id") == task_id
|
|
194
|
+
and record_seed == seed
|
|
195
|
+
):
|
|
196
|
+
outcome = dict(record)
|
|
197
|
+
outcome.pop("cell_id", None)
|
|
198
|
+
outcome.pop("source", None)
|
|
199
|
+
return RunOutcome(**outcome)
|
|
200
|
+
return None
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def run_cell_live(
    cell: CellSpec,
    tasks: list[TaskSpec],
    *,
    seeds: int,
    budget_usd: float,
    dispatch_fn: Callable[[CellSpec, TaskSpec, int], RunOutcome],
) -> list[RunOutcome]:
    """Live tier: collect one outcome per (task, seed) from ``dispatch_fn``.

    The actual work is injected through ``dispatch_fn`` so tests can exercise
    live aggregation without spending money. The production smoke-suite live
    path is `run_suite_b_live_claude`.

    Args:
        cell: Cell handed through to ``dispatch_fn``.
        tasks: Tasks to run, in output order.
        seeds: Number of seeds per task (seed values ``0..seeds-1``).
        budget_usd: Accepted for signature parity; not consulted here.
        dispatch_fn: Callable producing the outcome for ``(cell, task, seed)``.

    Returns:
        The outcomes in task-major, seed-minor order.
    """
    return [
        dispatch_fn(cell, task, seed)
        for task in tasks
        for seed in range(seeds)
    ]
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def run_suite_b_live_claude(
    cell: CellSpec,
    tasks: list[TaskSpec],
    *,
    seeds: int,
    budget_usd: float,
    cwd: Path,
) -> list[RunOutcome]:
    """Run a live classification smoke cell through Claude Code.

    Only tasks whose ``suite`` is ``"suite_b"`` take part. For each seed the
    whole batch is classified by one call, and the per-item outcomes of that
    call are appended to the result. Smoke is meant to be a real behavioral
    check with bounded spend; the eval harness should not silently write
    Manifests, reports, or project files during a metrics run.

    Args:
        cell: Cell being evaluated.
        tasks: Candidate tasks; non-Suite-B entries are ignored.
        seeds: Number of batch calls (seed values ``0..seeds-1``).
        budget_usd: Budget handed to every batch call.
        cwd: Working directory for the Claude process.

    Returns:
        Item-level outcomes, seed-major; empty when no Suite B task is given.
    """
    classification_items = [task for task in tasks if task.suite == "suite_b"]
    results: list[RunOutcome] = []
    if classification_items:
        for seed in range(seeds):
            results += _invoke_claude_suite_b_cell(
                cell=cell,
                tasks=classification_items,
                seed=seed,
                budget_usd=budget_usd,
                cwd=cwd,
            )
    return results
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _invoke_claude_suite_b_cell(
    *,
    cell: CellSpec,
    tasks: list[TaskSpec],
    seed: int,
    budget_usd: float,
    cwd: Path,
) -> list[RunOutcome]:
    """Classify one Suite B batch with a single ``claude -p`` call and grade it.

    The whole batch goes out as one prompt. The JSON printed by the CLI is
    parsed for duration, cost, token usage and the per-item predictions.
    Measured tokens and duration are then split evenly across the items
    (tokens rounded up), so every task gets its own ``RunOutcome``.

    A call that does not finish within the timeout is recorded as a failed
    batch rather than raised, so one hung call cannot abort the cycle and
    discard the outcomes already collected.

    Args:
        cell: Cell that selects the prompt framing.
        tasks: Suite B items to classify together.
        seed: Seed recorded on the outcomes and embedded in the prompt.
        budget_usd: Value passed as ``--max-budget-usd`` and compared with the
            reported cost.
        cwd: Working directory of the subprocess.

    Returns:
        One ``RunOutcome`` per task, in task order. An item passes when its
        predicted label is one of the task's ``acceptable_labels``.
    """
    prompt = _suite_b_prompt(cell, tasks, seed)
    cmd = [
        "claude", "-p",
        "--output-format", "json",
        "--json-schema", json.dumps(_suite_b_json_schema()),
        "--max-budget-usd", f"{budget_usd:.4f}",
        "--model", "haiku",
        "--no-session-persistence",
        "--disable-slash-commands",
        prompt,
    ]
    duration_ms = 0.0
    cost_usd = 0.0
    tokens_in = 0
    tokens_out = 0
    failure_reason: str | None = None
    predictions: dict[str, str] = {}
    quality: dict[str, float] = {}

    try:
        proc = subprocess.run(
            cmd,
            cwd=str(cwd),
            capture_output=True,
            text=True,
            timeout=240,
        )
    except subprocess.TimeoutExpired as exc:
        # No payload to parse: every item of the batch is graded as failed
        # with zero measured tokens and duration.
        proc = None
        failure_reason = f"claude_timeout: no result after {exc.timeout}s"

    if proc is not None:
        try:
            payload = json.loads(proc.stdout)
            duration_ms = float(payload.get("duration_ms") or 0.0)
            cost_usd = float(payload.get("total_cost_usd") or 0.0)
            tokens_in, tokens_out = _tokens_from_claude_payload(payload)
            if proc.returncode == 0 and not payload.get("is_error"):
                if isinstance(payload.get("structured_output"), dict):
                    predictions, quality = _suite_b_predictions_from_data(payload["structured_output"])
                else:
                    # No structured output: recover JSON from the text result.
                    result = payload.get("result") or ""
                    predictions, quality = _parse_suite_b_predictions(result)
            else:
                errors = payload.get("errors") or []
                failure_reason = (
                    "; ".join(str(e) for e in errors)
                    or str(payload.get("result") or "")
                    or str(payload.get("subtype") or "")
                    or "claude_failed"
                )
        except Exception as exc:
            # Best-effort boundary: unusable output becomes a recorded failure.
            failure_reason = f"claude_output_parse_failed: {exc}"
            duration_ms = 0.0
            tokens_in = 0
            tokens_out = 0
        if proc.returncode != 0 and failure_reason is None:
            failure_reason = (proc.stderr or "claude_failed").strip()[:500]

    # Prorate the batch measurements across items; max(1, ...) guards against
    # an empty batch.
    per_task_in = math.ceil(tokens_in / max(1, len(tasks)))
    per_task_out = math.ceil(tokens_out / max(1, len(tasks)))
    per_task_ms = duration_ms / max(1, len(tasks))
    budget_exceeded = cost_usd > budget_usd or "maximum budget" in (failure_reason or "").lower()
    outcomes: list[RunOutcome] = []
    for task in tasks:
        acceptable = set(task.acceptable_labels or [])
        label = predictions.get(task.id)
        passed = bool(label and label in acceptable)
        outcomes.append(RunOutcome(
            task_id=task.id,
            seed=seed,
            passed=passed,
            quality_score=(quality.get(task.id, 1.0) if passed else 0.0),
            minimum_useful_answer_score=(1.0 if label else 0.0),
            tokens_in=per_task_in,
            tokens_out=per_task_out,
            budget_exceeded=budget_exceeded,
            duration_ms=per_task_ms,
            failure_reason=None if passed else (failure_reason or f"predicted={label!r}"),
        ))
    return outcomes
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _suite_b_prompt(cell: CellSpec, tasks: list[TaskSpec], seed: int) -> str:
|
|
343
|
+
"""Build the current classification-smoke fixture prompt.
|
|
344
|
+
|
|
345
|
+
Suite B is a concrete classification smoke fixture. The prompt is
|
|
346
|
+
intentionally fixture-specific; it is not MetaEnsemble's product scope.
|
|
347
|
+
"""
|
|
348
|
+
items = "\n".join(
|
|
349
|
+
f"- id: {t.id}\n text: {json.dumps(t.description, ensure_ascii=False)}\n"
|
|
350
|
+
f" acceptable_labels: {', '.join(t.acceptable_labels or [])}"
|
|
351
|
+
for t in tasks
|
|
352
|
+
)
|
|
353
|
+
base = (
|
|
354
|
+
"Classify each Somali text into exactly one dialect label from its "
|
|
355
|
+
"acceptable_labels list. Return only JSON matching the requested schema. "
|
|
356
|
+
"No prose outside JSON. Use concise rationales."
|
|
357
|
+
)
|
|
358
|
+
if cell.id == "MM_full":
|
|
359
|
+
base = (
|
|
360
|
+
"Use the full MetaEnsemble rubric in one side-effect-free eval call: "
|
|
361
|
+
"state the task contract internally, classify as a domain specialist, "
|
|
362
|
+
"self-check every label against the allowed labels, then emit the "
|
|
363
|
+
"machine-readable result. "
|
|
364
|
+
+ base
|
|
365
|
+
)
|
|
366
|
+
elif cell.id == "B1_single_agent":
|
|
367
|
+
base = "Classify directly. " + base
|
|
368
|
+
elif cell.id == "B2_single_agent_prompted":
|
|
369
|
+
base = (
|
|
370
|
+
"You are a careful Somali dialect classifier. Check morphology, "
|
|
371
|
+
"focus markers, register, and negative forms before assigning a label. "
|
|
372
|
+
+ base
|
|
373
|
+
)
|
|
374
|
+
elif cell.id == "B4_best_prompt":
|
|
375
|
+
base = (
|
|
376
|
+
"Use a best-effort rubric: identify dialectal markers, compare against "
|
|
377
|
+
"the allowed labels, then output the label only in the JSON field. "
|
|
378
|
+
+ base
|
|
379
|
+
)
|
|
380
|
+
else:
|
|
381
|
+
base = (
|
|
382
|
+
f"Run the `{cell.id}` evaluation strategy as a read-only classification "
|
|
383
|
+
"pass. " + base
|
|
384
|
+
)
|
|
385
|
+
return f"{base}\n\nseed: {seed}\nitems:\n{items}"
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _suite_b_json_schema() -> dict:
|
|
389
|
+
return {
|
|
390
|
+
"type": "object",
|
|
391
|
+
"properties": {
|
|
392
|
+
"predictions": {
|
|
393
|
+
"type": "array",
|
|
394
|
+
"items": {
|
|
395
|
+
"type": "object",
|
|
396
|
+
"properties": {
|
|
397
|
+
"id": {"type": "string"},
|
|
398
|
+
"label": {"type": "string"},
|
|
399
|
+
"confidence": {"type": "number"},
|
|
400
|
+
"rationale": {"type": "string"},
|
|
401
|
+
},
|
|
402
|
+
"required": ["id", "label", "confidence"],
|
|
403
|
+
"additionalProperties": False,
|
|
404
|
+
},
|
|
405
|
+
}
|
|
406
|
+
},
|
|
407
|
+
"required": ["predictions"],
|
|
408
|
+
"additionalProperties": False,
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _parse_suite_b_predictions(text: str) -> tuple[dict[str, str], dict[str, float]]:
    """Turn free-form Claude result text into ``(predictions, quality)`` maps.

    The JSON object is first located in ``text`` and then converted; any
    error raised by either step propagates to the caller.
    """
    return _suite_b_predictions_from_data(_extract_json_object(text))
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def _suite_b_predictions_from_data(data: dict) -> tuple[dict[str, str], dict[str, float]]:
|
|
418
|
+
predictions: dict[str, str] = {}
|
|
419
|
+
quality: dict[str, float] = {}
|
|
420
|
+
for item in data.get("predictions") or []:
|
|
421
|
+
item_id = str(item.get("id", "")).strip()
|
|
422
|
+
label = str(item.get("label", "")).strip()
|
|
423
|
+
if not item_id or not label:
|
|
424
|
+
continue
|
|
425
|
+
predictions[item_id] = label
|
|
426
|
+
try:
|
|
427
|
+
conf = float(item.get("confidence", 1.0))
|
|
428
|
+
except (TypeError, ValueError):
|
|
429
|
+
conf = 1.0
|
|
430
|
+
quality[item_id] = max(0.0, min(1.0, conf))
|
|
431
|
+
return predictions, quality
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def _extract_json_object(text: str) -> dict:
|
|
435
|
+
try:
|
|
436
|
+
return json.loads(text)
|
|
437
|
+
except json.JSONDecodeError:
|
|
438
|
+
pass
|
|
439
|
+
match = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
|
440
|
+
if not match:
|
|
441
|
+
raise ValueError("no JSON object found in Claude result")
|
|
442
|
+
return json.loads(match.group(0))
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def _tokens_from_claude_payload(payload: dict) -> tuple[int, int]:
|
|
446
|
+
model_usage = payload.get("modelUsage") or {}
|
|
447
|
+
in_total = 0
|
|
448
|
+
out_total = 0
|
|
449
|
+
if isinstance(model_usage, dict):
|
|
450
|
+
for usage in model_usage.values():
|
|
451
|
+
if not isinstance(usage, dict):
|
|
452
|
+
continue
|
|
453
|
+
in_total += int(usage.get("inputTokens") or 0)
|
|
454
|
+
in_total += int(usage.get("cacheReadInputTokens") or 0)
|
|
455
|
+
in_total += int(usage.get("cacheCreationInputTokens") or 0)
|
|
456
|
+
out_total += int(usage.get("outputTokens") or 0)
|
|
457
|
+
usage = payload.get("usage") or {}
|
|
458
|
+
if not in_total and isinstance(usage, dict):
|
|
459
|
+
in_total = int(usage.get("input_tokens") or 0)
|
|
460
|
+
in_total += int(usage.get("cache_read_input_tokens") or 0)
|
|
461
|
+
in_total += int(usage.get("cache_creation_input_tokens") or 0)
|
|
462
|
+
if not out_total and isinstance(usage, dict):
|
|
463
|
+
out_total = int(usage.get("output_tokens") or 0)
|
|
464
|
+
return in_total, out_total
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def assemble_report(
    tier: Tier,
    cells_with_outcomes: list[tuple[CellSpec, list[RunOutcome]]],
    baseline_total_tokens_lookup: dict[str, int] | None = None,
) -> HarnessReport:
    """Aggregate per-cell outcome lists into a HarnessReport.

    Args:
        tier: Tier the outcomes were produced under.
        cells_with_outcomes: ``(cell, outcomes)`` pairs, one per cell, in
            report order.
        baseline_total_tokens_lookup: Maps cell.id → baseline total tokens
            (typically B4's tokens for the suite). When given and non-empty,
            the looked-up value is passed on so
            `orchestration_overhead_ratio` can be computed per cell.

    Returns:
        A report with one metrics row per input pair and an empty notes list.
    """

    def baseline_for(cell_id: str) -> int | None:
        # A missing or empty lookup means no baseline for any cell.
        if not baseline_total_tokens_lookup:
            return None
        return baseline_total_tokens_lookup.get(cell_id)

    rows: list[CellMetrics] = [
        compute_cell_metrics(
            cell_id=cell.id,
            runs=outcomes,
            baseline_total_tokens=baseline_for(cell.id),
        )
        for cell, outcomes in cells_with_outcomes
    ]
    return HarnessReport(tier=tier, cells=rows, notes=[])
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def render_report(report: HarnessReport) -> str:
    """Render the report as Markdown. Stable format for `evals/reports/<date>.md`.

    The output is a title naming the tier, a six-column table with one row
    per cell, and, only when the report has notes, a "Notes" bullet list.

    Args:
        report: Report to render.

    Returns:
        The Markdown text, lines joined with ``\\n`` and no trailing newline.
    """

    def table_row(cell) -> str:
        ratio = cell.orchestration_overhead_ratio
        # Cells without a baseline show a dash, not a number.
        overhead = "—" if ratio is None else f"{ratio:.2f}×"
        return (
            f"| `{cell.cell_id}` | {cell.pass_at_budget} | "
            f"{cell.quality_per_1k_tokens:.2f} | {overhead} | "
            f"{cell.failed_run_token_waste:,} | "
            f"{cell.time_to_useful_deliverable_ms_p50:.0f} |"
        )

    rendered = [
        f"# Evaluation report ({report.tier.value})",
        "",
        "| Cell | pass@budget | quality/1k tokens | overhead | waste tokens | p50 ms |",
        "|---|---|---|---|---|---|",
    ]
    rendered.extend(table_row(cell) for cell in report.cells)
    if report.notes:
        rendered += ["", "## Notes"]
        rendered.extend(f"- {note}" for note in report.notes)
    return "\n".join(rendered)
|
evals/runners/metrics.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Headline metrics for the evaluation harness.
|
|
2
|
+
|
|
3
|
+
`pass@budget` is the primary correctness metric (no overspending wins);
|
|
4
|
+
`quality_per_1k_tokens` and
|
|
5
|
+
`orchestration_overhead_ratio` are the efficiency primaries. The
|
|
6
|
+
supporting metrics expose reliability and concision so a "pass" that
|
|
7
|
+
came from a 20-page report carries less weight than a one-line answer.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class WilsonInterval:
    """Point estimate of a proportion with its Wilson score interval bounds."""

    point: float  # observed proportion
    lo: float  # lower interval bound
    hi: float  # upper interval bound
    n: int  # number of observations behind the estimate

    def __str__(self) -> str:
        """Format as ``point (95% CI: lo–hi, n=N)`` with three decimals."""
        bounds = f"{self.lo:.3f}–{self.hi:.3f}"
        return f"{self.point:.3f} (95% CI: {bounds}, n={self.n})"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def wilson_95(successes: int, n: int) -> WilsonInterval:
    """Compute the 95% Wilson score interval for ``successes`` out of ``n``.

    Uses the standard recipe with z = 1.96. For ``n <= 0`` the degenerate
    interval (point 0.0, bounds 0.0–1.0, n 0) is returned so the cell still
    produces a number rather than a NaN that breaks markdown rendering.

    Args:
        successes: Number of successful trials.
        n: Total number of trials.

    Returns:
        The point estimate ``successes / n`` with bounds clipped to [0, 1].
    """
    if n <= 0:
        return WilsonInterval(point=0.0, lo=0.0, hi=1.0, n=0)

    z = 1.96
    z_sq = z * z
    p_hat = successes / n
    denom = 1.0 + z_sq / n
    center = (p_hat + z_sq / (2 * n)) / denom
    spread = (p_hat * (1 - p_hat) + z_sq / (4 * n)) / n
    halfwidth = (z * math.sqrt(spread)) / denom
    lower = max(0.0, center - halfwidth)
    upper = min(1.0, center + halfwidth)
    return WilsonInterval(point=p_hat, lo=lower, hi=upper, n=n)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class CellMetrics:
    """Per-cell aggregate over `n` seeds."""

    cell_id: str
    # Share of runs that passed without exceeding the budget, with its interval.
    pass_at_budget: WilsonInterval
    # Summed quality score of passed runs per 1000 tokens those runs used.
    quality_per_1k_tokens: float
    # Cell total tokens divided by the baseline total; None without a baseline.
    orchestration_overhead_ratio: float | None
    # Tokens spent by runs that failed or exceeded the budget.
    failed_run_token_waste: int
    # Middle element of the sorted passed-run durations (0.0 when none passed).
    time_to_useful_deliverable_ms_p50: float
    # Mean of the passed runs' minimum-useful-answer scores (0.0 when none passed).
    minimum_useful_answer_score: float
    # Input plus output tokens over all runs of the cell.
    total_tokens: int
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def compute_cell_metrics(
    *,
    cell_id: str,
    runs: list["RunOutcome"],
    baseline_total_tokens: int | None = None,
) -> CellMetrics:
    """Collapse one cell's seeded runs into a single CellMetrics row.

    Args:
        cell_id: Identifier copied onto the row.
        runs: `RunOutcome` records of the cell.
        baseline_total_tokens: Token total of the best-single-agent baseline
            for the same suite. When None or zero (e.g. when the cell IS the
            baseline), `orchestration_overhead_ratio` is None.

    Returns:
        The aggregated row. Quality, latency and minimum-useful-answer
        figures are derived from passed runs only and are 0.0 when no run
        passed.
    """

    def tokens_of(run: "RunOutcome") -> int:
        return run.tokens_in + run.tokens_out

    successful = [run for run in runs if run.passed]

    # pass@budget: a pass that overspent does not count as a success.
    within_budget = sum(1 for run in successful if not run.budget_exceeded)
    pass_at_budget = wilson_95(within_budget, len(runs))

    successful_tokens = sum(tokens_of(run) for run in successful)
    quality_per_1k = 0.0
    if successful and successful_tokens > 0:
        total_quality = sum(run.quality_score for run in successful)
        quality_per_1k = total_quality / (successful_tokens / 1000.0)

    total_tokens = sum(tokens_of(run) for run in runs)
    if baseline_total_tokens:
        overhead = total_tokens / baseline_total_tokens
    else:
        overhead = None

    failed_waste = sum(
        tokens_of(run) for run in runs
        if not run.passed or run.budget_exceeded
    )

    p50 = 0.0
    muas = 0.0
    if successful:
        ordered = sorted(run.duration_ms for run in successful)
        p50 = ordered[len(ordered) // 2]
        muas = sum(run.minimum_useful_answer_score for run in successful) / len(successful)

    return CellMetrics(
        cell_id=cell_id,
        pass_at_budget=pass_at_budget,
        quality_per_1k_tokens=quality_per_1k,
        orchestration_overhead_ratio=overhead,
        failed_run_token_waste=failed_waste,
        time_to_useful_deliverable_ms_p50=p50,
        minimum_useful_answer_score=muas,
        total_tokens=total_tokens,
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass(frozen=True)
class RunOutcome:
    """One seed × one task outcome that feeds into `compute_cell_metrics`."""

    task_id: str
    seed: int
    passed: bool
    quality_score: float  # 0.0 - 1.0, per the task's acceptance grading
    minimum_useful_answer_score: float  # 0.0 - 1.0, concision/brevity reward
    tokens_in: int
    tokens_out: int
    # True when the run overspent; such a run is excluded from pass@budget
    # and its tokens count as failed-run waste.
    budget_exceeded: bool
    duration_ms: float
    # Optional explanation for a run that did not pass.
    failure_reason: str | None = None
|
metaensemble/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""MetaEnsemble core package.
|
|
2
|
+
|
|
3
|
+
Project-agnostic substrate: schemas, SQLite Ledger, identity generation,
|
|
4
|
+
and Manifest validation. See ARCHITECTURE.md and PERFORMANCE.md at the
|
|
5
|
+
repo root for the binding design and engineering contracts.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
9
|
+
|
|
10
|
+
# Prefer the version recorded in the installed distribution's metadata.
try:
    __version__ = version("metaensemble")
except PackageNotFoundError:
    # No installed distribution metadata was found for "metaensemble": fall
    # back to a hard-coded version.
    # NOTE(review): this literal must be bumped by hand with every release.
    __version__ = "0.2.0"
|