metaensemble 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. evals/README.md +147 -0
  2. evals/__init__.py +0 -0
  3. evals/cassettes/README.md +10 -0
  4. evals/cassettes/bootstrap.jsonl +800 -0
  5. evals/configs/default.yaml +59 -0
  6. evals/datasets/__init__.py +0 -0
  7. evals/datasets/suite_a/tasks.yaml +123 -0
  8. evals/datasets/suite_b/items.yaml +90 -0
  9. evals/runners/__init__.py +12 -0
  10. evals/runners/api.py +518 -0
  11. evals/runners/metrics.py +132 -0
  12. metaensemble/__init__.py +13 -0
  13. metaensemble/cli.py +1362 -0
  14. metaensemble/commands/dispatch.md +39 -0
  15. metaensemble/commands/executors.md +12 -0
  16. metaensemble/commands/ledger.md +19 -0
  17. metaensemble/commands/limits.md +12 -0
  18. metaensemble/commands/perf.md +12 -0
  19. metaensemble/commands/relaunch.md +29 -0
  20. metaensemble/commands/standup.md +14 -0
  21. metaensemble/config/budgets.example.yaml +72 -0
  22. metaensemble/config/quality.example.yaml +82 -0
  23. metaensemble/hooks/__init__.py +1 -0
  24. metaensemble/hooks/_common.py +148 -0
  25. metaensemble/hooks/deliverable_sync.py +73 -0
  26. metaensemble/hooks/file_event.py +303 -0
  27. metaensemble/hooks/post_task.py +460 -0
  28. metaensemble/hooks/pre_task.py +548 -0
  29. metaensemble/hooks/session_start.py +212 -0
  30. metaensemble/hooks/session_summary.py +392 -0
  31. metaensemble/hooks/subagent_stop.py +94 -0
  32. metaensemble/lib/__init__.py +1 -0
  33. metaensemble/lib/config.py +414 -0
  34. metaensemble/lib/cost_gate.py +299 -0
  35. metaensemble/lib/dispatch.py +341 -0
  36. metaensemble/lib/doctor.py +1563 -0
  37. metaensemble/lib/file_events.py +395 -0
  38. metaensemble/lib/ids.py +91 -0
  39. metaensemble/lib/installer.py +5018 -0
  40. metaensemble/lib/ledger.py +812 -0
  41. metaensemble/lib/manifest.py +141 -0
  42. metaensemble/lib/native_state.py +463 -0
  43. metaensemble/lib/overlaps.py +155 -0
  44. metaensemble/lib/quality_gate.py +155 -0
  45. metaensemble/lib/quality_runners.py +446 -0
  46. metaensemble/lib/reconcile.py +420 -0
  47. metaensemble/lib/recording.py +422 -0
  48. metaensemble/lib/relaunch.py +174 -0
  49. metaensemble/lib/runtime_payload.py +42 -0
  50. metaensemble/lib/runtime_state.py +308 -0
  51. metaensemble/lib/sidecar.py +166 -0
  52. metaensemble/lib/topology.py +181 -0
  53. metaensemble/lib/transcript.py +432 -0
  54. metaensemble/output-styles/deliverable.md +33 -0
  55. metaensemble/output-styles/wire.md +38 -0
  56. metaensemble/roles/architect.md +52 -0
  57. metaensemble/roles/backend.md +43 -0
  58. metaensemble/roles/code-quality.md +49 -0
  59. metaensemble/roles/data-engineer.md +42 -0
  60. metaensemble/roles/devops.md +42 -0
  61. metaensemble/roles/docs.md +41 -0
  62. metaensemble/roles/frontend.md +42 -0
  63. metaensemble/roles/ml-engineer.md +42 -0
  64. metaensemble/roles/test-engineer.md +42 -0
  65. metaensemble/schemas/brief.schema.json +80 -0
  66. metaensemble/schemas/manifest.schema.json +142 -0
  67. metaensemble/schemas/role.schema.json +84 -0
  68. metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
  69. metaensemble/state/migrations/001_init.sql +72 -0
  70. metaensemble/state/migrations/002_outcome_extended.sql +86 -0
  71. metaensemble/state/migrations/003_run_provenance.sql +36 -0
  72. metaensemble/statusline/me_status.py +187 -0
  73. metaensemble/tools/__init__.py +7 -0
  74. metaensemble/tools/executors.py +62 -0
  75. metaensemble/tools/ledger.py +121 -0
  76. metaensemble/tools/limits.py +165 -0
  77. metaensemble/tools/perf.py +150 -0
  78. metaensemble/tools/standup.py +177 -0
  79. metaensemble/tools/stats.py +115 -0
  80. metaensemble-0.2.0.dist-info/METADATA +221 -0
  81. metaensemble-0.2.0.dist-info/RECORD +85 -0
  82. metaensemble-0.2.0.dist-info/WHEEL +5 -0
  83. metaensemble-0.2.0.dist-info/entry_points.txt +2 -0
  84. metaensemble-0.2.0.dist-info/licenses/LICENSE +21 -0
  85. metaensemble-0.2.0.dist-info/top_level.txt +2 -0
evals/runners/api.py ADDED
@@ -0,0 +1,518 @@
1
+ """Tiered runner dispatch for the evaluation harness.
2
+
3
+ Three tiers correspond to three failure-mode budgets. `replay` reads
4
+ cassette responses recorded from a prior live run — zero API spend,
5
+ deterministic, suitable for PR-gate CI. `smoke` runs one seed against
6
+ the smoke suite to verify the live pipeline still works. `full` runs
7
+ the release-gated cycle with every cell × every seed.
8
+
9
+ Live API calls are issued through the `anthropic` SDK, which the
10
+ package already imports indirectly via the runtime. The runner does
11
+ not bundle a vendored SDK so production and eval use the same client.
12
+
13
+ The replay path is deterministic and CI-safe. The live smoke path uses
14
+ Claude Code directly with tools disabled so smoke/full metrics can be
15
+ measured without silently changing the project under evaluation.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import math
21
+ import re
22
+ import subprocess
23
+ from dataclasses import dataclass
24
+ from enum import Enum
25
+ from pathlib import Path
26
+ from typing import Callable
27
+
28
+ from evals.runners.metrics import (
29
+ CellMetrics,
30
+ RunOutcome,
31
+ compute_cell_metrics,
32
+ )
33
+
34
+
35
class Tier(str, Enum):
    """Evaluation tier selecting how a harness cycle is executed.

    Members are also plain strings, so ``Tier("smoke")`` and comparison
    against the raw tier name both work.
    """

    # Reads recorded cassettes; no API calls.
    REPLAY = "replay"
    # Live run against the smoke suite.
    SMOKE = "smoke"
    # Live release-gated cycle.
    FULL = "full"
39
+
40
+
41
@dataclass(frozen=True)
class CellSpec:
    """Immutable description of one cell of the (baseline × suite) matrix."""

    # Cell identifier; replay uses it as the cassette sub-directory name.
    id: str
    # Cell category: "baseline" | "full_system" | "ablation".
    kind: str
    # Symbolic name of the dispatch strategy.
    dispatch_fn: str
48
+
49
+
50
@dataclass(frozen=True)
class TaskSpec:
    """Immutable description of a single evaluation task."""

    # Task identifier; replay uses it as the per-task cassette directory name.
    id: str
    # Owning suite: "suite_a" | "suite_b".
    suite: str
    # Task text; the suite-B prompt embeds it as the item to classify.
    description: str
    # Acceptance criteria (schema not visible in this module — TODO confirm).
    acceptance: list[dict]
    # Labels counted as correct by the suite-B live scorer; None means no labels.
    acceptable_labels: list[str] | None = None
57
+
58
+
59
@dataclass(frozen=True)
class HarnessReport:
    """Complete result of one eval cycle; `render_report` turns it into Markdown."""

    # Tier the cycle ran under.
    tier: Tier
    # One aggregated metrics row per evaluated cell.
    cells: list[CellMetrics]
    # Free-form notes; rendered under a "## Notes" heading when non-empty.
    notes: list[str]
66
+
67
+
68
def evaluate_release_gates(
    report: HarnessReport,
    *,
    failed_run_waste_threshold: float | None = None,
    overhead_ratio_ceiling: float | None = None,
) -> tuple[bool, list[str]]:
    """Check the D-8 and D-9 release gates against a metric report.

    A gate is only evaluated when its threshold argument is given. The D-9
    gate compares the share of tokens spent on failed runs (summed over all
    cells) with `failed_run_waste_threshold`. The D-8 gate compares each
    cell's `orchestration_overhead_ratio` with `overhead_ratio_ceiling`;
    when no cell carries a ratio the gate is reported as "not evaluated"
    and counts as neither pass nor fail.

    Returns `(failed, notes)`: `failed` is True when any evaluated gate
    failed, and `notes` holds one human-readable line per requested gate.
    """
    gate_failed = False
    messages: list[str] = []

    if failed_run_waste_threshold is not None:
        token_sum = 0
        wasted_sum = 0
        for cell in report.cells:
            token_sum += cell.total_tokens
            wasted_sum += cell.failed_run_token_waste
        # An empty report (zero tokens) counts as zero waste.
        fraction = (wasted_sum / token_sum) if token_sum else 0.0
        if fraction > failed_run_waste_threshold:
            verdict = "FAIL"
            gate_failed = True
        else:
            verdict = "PASS"
        messages.append(
            "D-9 failed-run waste gate: "
            f"{verdict} ({fraction:.1%} of tokens; "
            f"threshold {failed_run_waste_threshold:.1%})."
        )

    if overhead_ratio_ceiling is None:
        return gate_failed, messages

    with_ratio = [
        cell for cell in report.cells
        if cell.orchestration_overhead_ratio is not None
    ]
    if not with_ratio:
        messages.append(
            "D-8 orchestration-overhead gate: not evaluated "
            "(best-prompt baseline tokens unavailable in this run)."
        )
        return gate_failed, messages

    over_ceiling = [
        f"{cell.cell_id}={cell.orchestration_overhead_ratio:.2f}x"
        for cell in with_ratio
        if cell.orchestration_overhead_ratio > overhead_ratio_ceiling
    ]
    if over_ceiling:
        gate_failed = True
        listing = ", ".join(over_ceiling)
        messages.append(
            "D-8 orchestration-overhead gate: "
            f"FAIL ({listing}; ceiling {overhead_ratio_ceiling:.2f}x)."
        )
    else:
        messages.append(
            "D-8 orchestration-overhead gate: "
            f"PASS (ceiling {overhead_ratio_ceiling:.2f}x)."
        )
    return gate_failed, messages
127
+
128
+
129
def run_cell_replay(
    cell: CellSpec,
    tasks: list[TaskSpec],
    cassette_dir: Path,
    seeds: int = 5,
) -> list[RunOutcome]:
    """Replay tier: rebuild outcomes from cassettes on disk. No API calls.

    For every task and every seed in `range(seeds)` the per-file cassette
    `cassette_dir/<cell.id>/<task.id>/<seed>.json` is read and its JSON
    object is passed to `RunOutcome` as keyword arguments. When that file
    does not exist, the packed `*.jsonl` cassettes in `cassette_dir` are
    searched instead.

    Raises:
        FileNotFoundError: neither a per-file nor a packed cassette exists
            for some (task, seed), so a PR that adds a task without
            recording its cassette fails CI deterministically.
    """
    outcomes: list[RunOutcome] = []
    for task in tasks:
        for seed in range(seeds):
            path = cassette_dir / cell.id / task.id / f"{seed}.json"
            if path.exists():
                # JSON is UTF-8; decode explicitly so replay does not depend
                # on the locale encoding of the machine running CI.
                data = json.loads(path.read_text(encoding="utf-8"))
                outcomes.append(RunOutcome(**data))
                continue
            packed = _load_packed_replay(cassette_dir, cell.id, task.id, seed)
            if packed is None:
                raise FileNotFoundError(
                    f"replay cassette missing: {path}. Record it with "
                    "`metaensemble eval --tier smoke --record-cassettes` "
                    "or add an entry to evals/cassettes/*.jsonl."
                )
            outcomes.append(packed)
    return outcomes
159
+
160
+
161
+ def _load_packed_replay(
162
+ cassette_dir: Path,
163
+ cell_id: str,
164
+ task_id: str,
165
+ seed: int,
166
+ ) -> RunOutcome | None:
167
+ """Read compact JSONL cassette packs.
168
+
169
+ The per-file cassette path is the canonical recorder output. The shipped
170
+ v0.1.0 bootstrap pack uses JSONL to avoid hundreds of tiny fixture files
171
+ while still exercising the same replay parser and metrics code in CI.
172
+ """
173
+ if not cassette_dir.exists():
174
+ return None
175
+ for pack in sorted(cassette_dir.glob("*.jsonl")):
176
+ try:
177
+ lines = pack.read_text().splitlines()
178
+ except OSError:
179
+ continue
180
+ for line in lines:
181
+ if not line.strip() or line.lstrip().startswith("#"):
182
+ continue
183
+ try:
184
+ record = json.loads(line)
185
+ except json.JSONDecodeError:
186
+ continue
187
+ try:
188
+ record_seed = int(record.get("seed", -1))
189
+ except (TypeError, ValueError):
190
+ continue
191
+ if (
192
+ record.get("cell_id") == cell_id
193
+ and record.get("task_id") == task_id
194
+ and record_seed == seed
195
+ ):
196
+ outcome = dict(record)
197
+ outcome.pop("cell_id", None)
198
+ outcome.pop("source", None)
199
+ return RunOutcome(**outcome)
200
+ return None
201
+
202
+
203
def run_cell_live(
    cell: CellSpec,
    tasks: list[TaskSpec],
    *,
    seeds: int,
    budget_usd: float,
    dispatch_fn: Callable[[CellSpec, TaskSpec, int], RunOutcome],
) -> list[RunOutcome]:
    """Live tier: collect one outcome per (task, seed) from `dispatch_fn`.

    The actual call is delegated to `dispatch_fn(cell, task, seed)` so tests
    can exercise live aggregation without spending money. The production
    smoke-suite live path is `run_suite_b_live_claude`. `budget_usd` is
    accepted but not consulted in this function.

    Outcomes are returned task by task, with seeds `0..seeds-1` in order
    within each task.
    """
    return [
        dispatch_fn(cell, task, seed)
        for task in tasks
        for seed in range(seeds)
    ]
223
+
224
+
225
def run_suite_b_live_claude(
    cell: CellSpec,
    tasks: list[TaskSpec],
    *,
    seeds: int,
    budget_usd: float,
    cwd: Path,
) -> list[RunOutcome]:
    """Run a live classification smoke cell through Claude Code.

    Only tasks whose `suite` is "suite_b" take part; when there are none the
    result is an empty list and nothing is invoked. Otherwise one batch call
    is made per seed in `range(seeds)`, each receiving the full `budget_usd`,
    and the per-item outcomes of all batches are concatenated in seed order.

    Smoke is meant to be a real behavioral check, so the whole batch is
    classified in a single Claude call per seed and measured tokens are
    prorated across the item-level outcomes by the batch helper.
    """
    selected = [task for task in tasks if task.suite == "suite_b"]
    collected: list[RunOutcome] = []
    if selected:
        for seed in range(seeds):
            collected += _invoke_claude_suite_b_cell(
                cell=cell,
                tasks=selected,
                seed=seed,
                budget_usd=budget_usd,
                cwd=cwd,
            )
    return collected
256
+
257
+
258
def _invoke_claude_suite_b_cell(
    *,
    cell: CellSpec,
    tasks: list[TaskSpec],
    seed: int,
    budget_usd: float,
    cwd: Path,
) -> list[RunOutcome]:
    """Classify one suite-B batch with a single `claude -p` call and score it.

    The CLI is run in `cwd` with JSON output, the suite-B JSON schema, a
    `--max-budget-usd` cap of `budget_usd`, the `haiku` model, no session
    persistence and slash commands disabled. Its JSON payload supplies the
    duration, cost, token usage and either a `structured_output` object or
    a `result` text from which the predictions are parsed.

    Measured tokens and duration are prorated evenly across `tasks` (token
    counts rounded up per task). A task passes when its predicted label is
    in its `acceptable_labels`. The batch is flagged `budget_exceeded` when
    the reported cost is above `budget_usd` or the failure text mentions
    "maximum budget".

    Failures never raise out of the payload handling: a timed-out call, a
    non-zero exit, an error payload or unparseable output all yield failed
    outcomes carrying a `failure_reason`. A failure to launch the CLI at
    all (`OSError`) still propagates.

    Returns one `RunOutcome` per task, in `tasks` order.
    """
    timeout_s = 240
    cmd = [
        "claude", "-p",
        "--output-format", "json",
        "--json-schema", json.dumps(_suite_b_json_schema()),
        "--max-budget-usd", f"{budget_usd:.4f}",
        "--model", "haiku",
        "--no-session-persistence",
        "--disable-slash-commands",
        _suite_b_prompt(cell, tasks, seed),
    ]
    duration_ms = 0.0
    cost_usd = 0.0
    tokens_in = 0
    tokens_out = 0
    failure_reason: str | None = None
    predictions: dict[str, str] = {}
    quality: dict[str, float] = {}

    try:
        proc = subprocess.run(
            cmd,
            cwd=str(cwd),
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        # A hung call is a failed run for this batch, not a reason to abort
        # the whole eval cycle; no usage figures are available for it.
        proc = None
        failure_reason = f"claude_timeout: no result within {timeout_s}s"

    if proc is not None:
        try:
            payload = json.loads(proc.stdout)
            duration_ms = float(payload.get("duration_ms") or 0.0)
            cost_usd = float(payload.get("total_cost_usd") or 0.0)
            tokens_in, tokens_out = _tokens_from_claude_payload(payload)
            if proc.returncode == 0 and not payload.get("is_error"):
                if isinstance(payload.get("structured_output"), dict):
                    predictions, quality = _suite_b_predictions_from_data(
                        payload["structured_output"]
                    )
                else:
                    result = payload.get("result") or ""
                    predictions, quality = _parse_suite_b_predictions(result)
            else:
                errors = payload.get("errors") or []
                failure_reason = (
                    "; ".join(str(e) for e in errors)
                    or str(payload.get("result") or "")
                    or str(payload.get("subtype") or "")
                    or "claude_failed"
                )
        except Exception as exc:
            # Best-effort boundary: any malformed payload becomes a failed
            # batch. When the CLI itself exited non-zero, its stderr explains
            # the failure better than the JSON parse error does.
            stderr_text = (proc.stderr or "").strip()
            if proc.returncode != 0 and stderr_text:
                failure_reason = stderr_text[:500]
            else:
                failure_reason = f"claude_output_parse_failed: {exc}"
            duration_ms = 0.0
            tokens_in = 0
            tokens_out = 0

    task_count = max(1, len(tasks))
    per_task_in = math.ceil(tokens_in / task_count)
    per_task_out = math.ceil(tokens_out / task_count)
    per_task_ms = duration_ms / task_count
    budget_exceeded = (
        cost_usd > budget_usd
        or "maximum budget" in (failure_reason or "").lower()
    )

    outcomes: list[RunOutcome] = []
    for task in tasks:
        acceptable = set(task.acceptable_labels or [])
        label = predictions.get(task.id)
        passed = bool(label and label in acceptable)
        outcomes.append(RunOutcome(
            task_id=task.id,
            seed=seed,
            passed=passed,
            # Reported confidence doubles as the quality score of a pass.
            quality_score=(quality.get(task.id, 1.0) if passed else 0.0),
            minimum_useful_answer_score=(1.0 if label else 0.0),
            tokens_in=per_task_in,
            tokens_out=per_task_out,
            budget_exceeded=budget_exceeded,
            duration_ms=per_task_ms,
            failure_reason=None if passed else (failure_reason or f"predicted={label!r}"),
        ))
    return outcomes
340
+
341
+
342
+ def _suite_b_prompt(cell: CellSpec, tasks: list[TaskSpec], seed: int) -> str:
343
+ """Build the current classification-smoke fixture prompt.
344
+
345
+ Suite B is a concrete classification smoke fixture. The prompt is
346
+ intentionally fixture-specific; it is not MetaEnsemble's product scope.
347
+ """
348
+ items = "\n".join(
349
+ f"- id: {t.id}\n text: {json.dumps(t.description, ensure_ascii=False)}\n"
350
+ f" acceptable_labels: {', '.join(t.acceptable_labels or [])}"
351
+ for t in tasks
352
+ )
353
+ base = (
354
+ "Classify each Somali text into exactly one dialect label from its "
355
+ "acceptable_labels list. Return only JSON matching the requested schema. "
356
+ "No prose outside JSON. Use concise rationales."
357
+ )
358
+ if cell.id == "MM_full":
359
+ base = (
360
+ "Use the full MetaEnsemble rubric in one side-effect-free eval call: "
361
+ "state the task contract internally, classify as a domain specialist, "
362
+ "self-check every label against the allowed labels, then emit the "
363
+ "machine-readable result. "
364
+ + base
365
+ )
366
+ elif cell.id == "B1_single_agent":
367
+ base = "Classify directly. " + base
368
+ elif cell.id == "B2_single_agent_prompted":
369
+ base = (
370
+ "You are a careful Somali dialect classifier. Check morphology, "
371
+ "focus markers, register, and negative forms before assigning a label. "
372
+ + base
373
+ )
374
+ elif cell.id == "B4_best_prompt":
375
+ base = (
376
+ "Use a best-effort rubric: identify dialectal markers, compare against "
377
+ "the allowed labels, then output the label only in the JSON field. "
378
+ + base
379
+ )
380
+ else:
381
+ base = (
382
+ f"Run the `{cell.id}` evaluation strategy as a read-only classification "
383
+ "pass. " + base
384
+ )
385
+ return f"{base}\n\nseed: {seed}\nitems:\n{items}"
386
+
387
+
388
+ def _suite_b_json_schema() -> dict:
389
+ return {
390
+ "type": "object",
391
+ "properties": {
392
+ "predictions": {
393
+ "type": "array",
394
+ "items": {
395
+ "type": "object",
396
+ "properties": {
397
+ "id": {"type": "string"},
398
+ "label": {"type": "string"},
399
+ "confidence": {"type": "number"},
400
+ "rationale": {"type": "string"},
401
+ },
402
+ "required": ["id", "label", "confidence"],
403
+ "additionalProperties": False,
404
+ },
405
+ }
406
+ },
407
+ "required": ["predictions"],
408
+ "additionalProperties": False,
409
+ }
410
+
411
+
412
def _parse_suite_b_predictions(text: str) -> tuple[dict[str, str], dict[str, float]]:
    """Parse free-text Claude output into `(predictions, quality)` maps.

    The JSON object is first extracted from `text`, then handed to the
    structured-data parser; errors from either step propagate to the caller.
    """
    return _suite_b_predictions_from_data(_extract_json_object(text))
415
+
416
+
417
+ def _suite_b_predictions_from_data(data: dict) -> tuple[dict[str, str], dict[str, float]]:
418
+ predictions: dict[str, str] = {}
419
+ quality: dict[str, float] = {}
420
+ for item in data.get("predictions") or []:
421
+ item_id = str(item.get("id", "")).strip()
422
+ label = str(item.get("label", "")).strip()
423
+ if not item_id or not label:
424
+ continue
425
+ predictions[item_id] = label
426
+ try:
427
+ conf = float(item.get("confidence", 1.0))
428
+ except (TypeError, ValueError):
429
+ conf = 1.0
430
+ quality[item_id] = max(0.0, min(1.0, conf))
431
+ return predictions, quality
432
+
433
+
434
+ def _extract_json_object(text: str) -> dict:
435
+ try:
436
+ return json.loads(text)
437
+ except json.JSONDecodeError:
438
+ pass
439
+ match = re.search(r"\{.*\}", text, flags=re.DOTALL)
440
+ if not match:
441
+ raise ValueError("no JSON object found in Claude result")
442
+ return json.loads(match.group(0))
443
+
444
+
445
+ def _tokens_from_claude_payload(payload: dict) -> tuple[int, int]:
446
+ model_usage = payload.get("modelUsage") or {}
447
+ in_total = 0
448
+ out_total = 0
449
+ if isinstance(model_usage, dict):
450
+ for usage in model_usage.values():
451
+ if not isinstance(usage, dict):
452
+ continue
453
+ in_total += int(usage.get("inputTokens") or 0)
454
+ in_total += int(usage.get("cacheReadInputTokens") or 0)
455
+ in_total += int(usage.get("cacheCreationInputTokens") or 0)
456
+ out_total += int(usage.get("outputTokens") or 0)
457
+ usage = payload.get("usage") or {}
458
+ if not in_total and isinstance(usage, dict):
459
+ in_total = int(usage.get("input_tokens") or 0)
460
+ in_total += int(usage.get("cache_read_input_tokens") or 0)
461
+ in_total += int(usage.get("cache_creation_input_tokens") or 0)
462
+ if not out_total and isinstance(usage, dict):
463
+ out_total = int(usage.get("output_tokens") or 0)
464
+ return in_total, out_total
465
+
466
+
467
def assemble_report(
    tier: Tier,
    cells_with_outcomes: list[tuple[CellSpec, list[RunOutcome]]],
    baseline_total_tokens_lookup: dict[str, int] | None = None,
) -> HarnessReport:
    """Aggregate per-cell outcome lists into a `HarnessReport`.

    `baseline_total_tokens_lookup` maps cell.id → baseline total tokens
    (typically B4's tokens for the suite). A cell found in the lookup gets
    that total passed to `compute_cell_metrics`, which uses it to derive
    `orchestration_overhead_ratio`; with no lookup, an empty lookup, or a
    cell missing from it, None is passed instead.

    The returned report carries one metrics row per input cell, in input
    order, and an empty notes list.
    """

    def _baseline_for(cell_id: str) -> int | None:
        if not baseline_total_tokens_lookup:
            return None
        return baseline_total_tokens_lookup.get(cell_id)

    rows: list[CellMetrics] = [
        compute_cell_metrics(
            cell_id=cell.id,
            runs=outcomes,
            baseline_total_tokens=_baseline_for(cell.id),
        )
        for cell, outcomes in cells_with_outcomes
    ]
    return HarnessReport(tier=tier, cells=rows, notes=[])
494
+
495
+
496
def render_report(report: HarnessReport) -> str:
    """Render `report` as Markdown. Stable format for `evals/reports/<date>.md`.

    The output is a title naming the tier, a six-column table with one row
    per cell, and — only when the report has notes — a "## Notes" bullet
    list. A cell without an overhead ratio shows an em dash in that column.
    """
    header = [
        f"# Evaluation report ({report.tier.value})",
        "",
        "| Cell | pass@budget | quality/1k tokens | overhead | waste tokens | p50 ms |",
        "|---|---|---|---|---|---|",
    ]

    rows = []
    for cell in report.cells:
        ratio = cell.orchestration_overhead_ratio
        if ratio is None:
            overhead = "—"
        else:
            overhead = f"{ratio:.2f}×"
        rows.append(
            f"| `{cell.cell_id}` | {cell.pass_at_budget} | "
            f"{cell.quality_per_1k_tokens:.2f} | {overhead} | "
            f"{cell.failed_run_token_waste:,} | "
            f"{cell.time_to_useful_deliverable_ms_p50:.0f} |"
        )

    footer = []
    if report.notes:
        footer = ["", "## Notes"] + [f"- {note}" for note in report.notes]

    return "\n".join(header + rows + footer)
@@ -0,0 +1,132 @@
1
+ """Headline metrics for the evaluation harness.
2
+
3
+ `pass@budget` is the primary correctness metric (no overspending wins);
4
+ `quality_per_1k_tokens` and
5
+ `orchestration_overhead_ratio` are the efficiency primaries. The
6
+ supporting metrics expose reliability and concision so a "pass" that
7
+ came from a 20-page report carries less weight than a one-line answer.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ from dataclasses import dataclass
13
+
14
+
15
@dataclass(frozen=True)
class WilsonInterval:
    """A Wilson score interval together with its point estimate."""

    # Observed proportion.
    point: float
    # Lower and upper interval bounds.
    lo: float
    hi: float
    # Number of trials the interval is based on.
    n: int

    def __str__(self) -> str:
        """Format as ``point (95% CI: lo–hi, n=N)`` with three decimals."""
        estimate = format(self.point, ".3f")
        lower = format(self.lo, ".3f")
        upper = format(self.hi, ".3f")
        return f"{estimate} (95% CI: {lower}–{upper}, n={self.n})"
26
+
27
+
28
def wilson_95(successes: int, n: int) -> WilsonInterval:
    """Compute the 95% Wilson score interval for `successes` out of `n`.

    Uses the standard recipe with z = 1.96 and clamps the bounds to
    [0.0, 1.0]. For `n <= 0` the result is the vacuous interval
    (point 0.0, lo 0.0, hi 1.0, n 0), so an empty cell still produces
    numbers rather than a NaN that breaks markdown rendering.
    """
    if n <= 0:
        return WilsonInterval(point=0.0, lo=0.0, hi=1.0, n=0)

    z = 1.96
    z_sq = z * z
    p_hat = successes / n
    denom = 1.0 + z_sq / n
    center = (p_hat + z_sq / (2 * n)) / denom
    spread = math.sqrt((p_hat * (1 - p_hat) + z_sq / (4 * n)) / n)
    halfwidth = (z * spread) / denom

    lower = max(0.0, center - halfwidth)
    upper = min(1.0, center + halfwidth)
    return WilsonInterval(point=p_hat, lo=lower, hi=upper, n=n)
48
+
49
+
50
@dataclass(frozen=True)
class CellMetrics:
    """Aggregated metrics row for one cell, built by `compute_cell_metrics`."""

    cell_id: str
    # Share of runs that passed without exceeding budget, with its 95% interval.
    pass_at_budget: WilsonInterval
    # Summed quality score of passed runs per 1,000 tokens those runs used.
    quality_per_1k_tokens: float
    # Cell tokens divided by baseline tokens; None when no baseline was given.
    orchestration_overhead_ratio: float | None
    # Tokens spent on runs that failed or exceeded budget.
    failed_run_token_waste: int
    # Middle element of the sorted passed-run durations, in milliseconds.
    time_to_useful_deliverable_ms_p50: float
    # Mean minimum-useful-answer score over passed runs.
    minimum_useful_answer_score: float
    # Tokens (input + output) over all runs.
    total_tokens: int
62
+
63
+
64
def compute_cell_metrics(
    *,
    cell_id: str,
    runs: list["RunOutcome"],
    baseline_total_tokens: int | None = None,
) -> CellMetrics:
    """Fold one cell's run outcomes into a single `CellMetrics` row.

    `runs` is the list of `RunOutcome` records for the cell.
    `baseline_total_tokens` is the baseline's token total for the same
    suite; the overhead ratio is the cell's total tokens divided by it, or
    None when it is None or zero (e.g. when the cell IS the baseline).

    A run counts toward pass@budget only if it passed without exceeding
    budget. Quality, latency and minimum-useful-answer figures are taken
    over all passed runs; waste is the token total of runs that failed or
    exceeded budget. With no passed runs those figures are 0.0.
    """
    total_tokens = 0
    passed_tokens = 0
    wasted_tokens = 0
    within_budget_passes = 0
    passed_runs = []
    for run in runs:
        spent = run.tokens_in + run.tokens_out
        total_tokens += spent
        if run.passed:
            passed_runs.append(run)
            passed_tokens += spent
            if not run.budget_exceeded:
                within_budget_passes += 1
        if not run.passed or run.budget_exceeded:
            wasted_tokens += spent

    quality_per_1k = 0.0
    if passed_runs and passed_tokens > 0:
        score_sum = sum(r.quality_score for r in passed_runs)
        quality_per_1k = score_sum / (passed_tokens / 1000.0)

    overhead = None
    if baseline_total_tokens:
        overhead = total_tokens / baseline_total_tokens

    p50 = 0.0
    muas = 0.0
    if passed_runs:
        durations = sorted(r.duration_ms for r in passed_runs)
        # Upper-middle element for even-sized samples.
        p50 = durations[len(durations) // 2]
        muas = sum(r.minimum_useful_answer_score for r in passed_runs) / len(passed_runs)

    return CellMetrics(
        cell_id=cell_id,
        pass_at_budget=wilson_95(within_budget_passes, len(runs)),
        quality_per_1k_tokens=quality_per_1k,
        orchestration_overhead_ratio=overhead,
        failed_run_token_waste=wasted_tokens,
        time_to_useful_deliverable_ms_p50=p50,
        minimum_useful_answer_score=muas,
        total_tokens=total_tokens,
    )
117
+
118
+
119
@dataclass(frozen=True)
class RunOutcome:
    """Result of one task under one seed; the input to `compute_cell_metrics`."""

    task_id: str
    seed: int
    passed: bool
    # 0.0 - 1.0, per the task's acceptance grading.
    quality_score: float
    # 0.0 - 1.0, concision/brevity reward.
    minimum_useful_answer_score: float
    tokens_in: int
    tokens_out: int
    budget_exceeded: bool
    duration_ms: float
    # Explanation attached to a run that did not pass; None by default.
    failure_reason: str | None = None
@@ -0,0 +1,13 @@
1
+ """MetaEnsemble core package.
2
+
3
+ Project-agnostic substrate: schemas, SQLite Ledger, identity generation,
4
+ and Manifest validation. See ARCHITECTURE.md and PERFORMANCE.md at the
5
+ repo root for the binding design and engineering contracts.
6
+ """
7
+
8
+ from importlib.metadata import PackageNotFoundError, version
9
+
10
try:
    # Prefer the version recorded in the installed distribution's metadata.
    __version__ = version("metaensemble")
except PackageNotFoundError:
    # No installed metadata for "metaensemble" was found; fall back to the
    # hard-coded release string.
    __version__ = "0.2.0"