metaensemble 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. metaensemble-0.2.0/LICENSE +21 -0
  2. metaensemble-0.2.0/PKG-INFO +221 -0
  3. metaensemble-0.2.0/README.md +194 -0
  4. metaensemble-0.2.0/evals/README.md +147 -0
  5. metaensemble-0.2.0/evals/__init__.py +0 -0
  6. metaensemble-0.2.0/evals/cassettes/README.md +10 -0
  7. metaensemble-0.2.0/evals/cassettes/bootstrap.jsonl +800 -0
  8. metaensemble-0.2.0/evals/configs/default.yaml +59 -0
  9. metaensemble-0.2.0/evals/datasets/__init__.py +0 -0
  10. metaensemble-0.2.0/evals/datasets/suite_a/tasks.yaml +123 -0
  11. metaensemble-0.2.0/evals/datasets/suite_b/items.yaml +90 -0
  12. metaensemble-0.2.0/evals/runners/__init__.py +12 -0
  13. metaensemble-0.2.0/evals/runners/api.py +518 -0
  14. metaensemble-0.2.0/evals/runners/metrics.py +132 -0
  15. metaensemble-0.2.0/metaensemble/__init__.py +13 -0
  16. metaensemble-0.2.0/metaensemble/cli.py +1362 -0
  17. metaensemble-0.2.0/metaensemble/commands/dispatch.md +39 -0
  18. metaensemble-0.2.0/metaensemble/commands/executors.md +12 -0
  19. metaensemble-0.2.0/metaensemble/commands/ledger.md +19 -0
  20. metaensemble-0.2.0/metaensemble/commands/limits.md +12 -0
  21. metaensemble-0.2.0/metaensemble/commands/perf.md +12 -0
  22. metaensemble-0.2.0/metaensemble/commands/relaunch.md +29 -0
  23. metaensemble-0.2.0/metaensemble/commands/standup.md +14 -0
  24. metaensemble-0.2.0/metaensemble/config/budgets.example.yaml +72 -0
  25. metaensemble-0.2.0/metaensemble/config/quality.example.yaml +82 -0
  26. metaensemble-0.2.0/metaensemble/hooks/__init__.py +1 -0
  27. metaensemble-0.2.0/metaensemble/hooks/_common.py +148 -0
  28. metaensemble-0.2.0/metaensemble/hooks/deliverable_sync.py +73 -0
  29. metaensemble-0.2.0/metaensemble/hooks/file_event.py +303 -0
  30. metaensemble-0.2.0/metaensemble/hooks/post_task.py +460 -0
  31. metaensemble-0.2.0/metaensemble/hooks/pre_task.py +548 -0
  32. metaensemble-0.2.0/metaensemble/hooks/session_start.py +212 -0
  33. metaensemble-0.2.0/metaensemble/hooks/session_summary.py +392 -0
  34. metaensemble-0.2.0/metaensemble/hooks/subagent_stop.py +94 -0
  35. metaensemble-0.2.0/metaensemble/lib/__init__.py +1 -0
  36. metaensemble-0.2.0/metaensemble/lib/config.py +414 -0
  37. metaensemble-0.2.0/metaensemble/lib/cost_gate.py +299 -0
  38. metaensemble-0.2.0/metaensemble/lib/dispatch.py +341 -0
  39. metaensemble-0.2.0/metaensemble/lib/doctor.py +1563 -0
  40. metaensemble-0.2.0/metaensemble/lib/file_events.py +395 -0
  41. metaensemble-0.2.0/metaensemble/lib/ids.py +91 -0
  42. metaensemble-0.2.0/metaensemble/lib/installer.py +5018 -0
  43. metaensemble-0.2.0/metaensemble/lib/ledger.py +812 -0
  44. metaensemble-0.2.0/metaensemble/lib/manifest.py +141 -0
  45. metaensemble-0.2.0/metaensemble/lib/native_state.py +463 -0
  46. metaensemble-0.2.0/metaensemble/lib/overlaps.py +155 -0
  47. metaensemble-0.2.0/metaensemble/lib/quality_gate.py +155 -0
  48. metaensemble-0.2.0/metaensemble/lib/quality_runners.py +446 -0
  49. metaensemble-0.2.0/metaensemble/lib/reconcile.py +420 -0
  50. metaensemble-0.2.0/metaensemble/lib/recording.py +422 -0
  51. metaensemble-0.2.0/metaensemble/lib/relaunch.py +174 -0
  52. metaensemble-0.2.0/metaensemble/lib/runtime_payload.py +42 -0
  53. metaensemble-0.2.0/metaensemble/lib/runtime_state.py +308 -0
  54. metaensemble-0.2.0/metaensemble/lib/sidecar.py +166 -0
  55. metaensemble-0.2.0/metaensemble/lib/topology.py +181 -0
  56. metaensemble-0.2.0/metaensemble/lib/transcript.py +432 -0
  57. metaensemble-0.2.0/metaensemble/output-styles/deliverable.md +33 -0
  58. metaensemble-0.2.0/metaensemble/output-styles/wire.md +38 -0
  59. metaensemble-0.2.0/metaensemble/roles/architect.md +52 -0
  60. metaensemble-0.2.0/metaensemble/roles/backend.md +43 -0
  61. metaensemble-0.2.0/metaensemble/roles/code-quality.md +49 -0
  62. metaensemble-0.2.0/metaensemble/roles/data-engineer.md +42 -0
  63. metaensemble-0.2.0/metaensemble/roles/devops.md +42 -0
  64. metaensemble-0.2.0/metaensemble/roles/docs.md +41 -0
  65. metaensemble-0.2.0/metaensemble/roles/frontend.md +42 -0
  66. metaensemble-0.2.0/metaensemble/roles/ml-engineer.md +42 -0
  67. metaensemble-0.2.0/metaensemble/roles/test-engineer.md +42 -0
  68. metaensemble-0.2.0/metaensemble/schemas/brief.schema.json +80 -0
  69. metaensemble-0.2.0/metaensemble/schemas/manifest.schema.json +142 -0
  70. metaensemble-0.2.0/metaensemble/schemas/role.schema.json +84 -0
  71. metaensemble-0.2.0/metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
  72. metaensemble-0.2.0/metaensemble/state/migrations/001_init.sql +72 -0
  73. metaensemble-0.2.0/metaensemble/state/migrations/002_outcome_extended.sql +86 -0
  74. metaensemble-0.2.0/metaensemble/state/migrations/003_run_provenance.sql +36 -0
  75. metaensemble-0.2.0/metaensemble/statusline/me_status.py +187 -0
  76. metaensemble-0.2.0/metaensemble/tools/__init__.py +7 -0
  77. metaensemble-0.2.0/metaensemble/tools/executors.py +62 -0
  78. metaensemble-0.2.0/metaensemble/tools/ledger.py +121 -0
  79. metaensemble-0.2.0/metaensemble/tools/limits.py +165 -0
  80. metaensemble-0.2.0/metaensemble/tools/perf.py +150 -0
  81. metaensemble-0.2.0/metaensemble/tools/standup.py +177 -0
  82. metaensemble-0.2.0/metaensemble/tools/stats.py +115 -0
  83. metaensemble-0.2.0/metaensemble.egg-info/PKG-INFO +221 -0
  84. metaensemble-0.2.0/metaensemble.egg-info/SOURCES.txt +88 -0
  85. metaensemble-0.2.0/metaensemble.egg-info/dependency_links.txt +1 -0
  86. metaensemble-0.2.0/metaensemble.egg-info/entry_points.txt +2 -0
  87. metaensemble-0.2.0/metaensemble.egg-info/requires.txt +21 -0
  88. metaensemble-0.2.0/metaensemble.egg-info/top_level.txt +2 -0
  89. metaensemble-0.2.0/pyproject.toml +88 -0
  90. metaensemble-0.2.0/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 MetaEnsemble contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.4
2
+ Name: metaensemble
3
+ Version: 0.2.0
4
+ Summary: A typed runtime for ensembles of cognitive agents
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: jsonschema>=4.20
10
+ Requires-Dist: pyyaml>=6.0
11
+ Provides-Extra: test
12
+ Requires-Dist: pytest>=7.0; extra == "test"
13
+ Requires-Dist: pytest-cov>=4.0; extra == "test"
14
+ Requires-Dist: bandit>=1.7; extra == "test"
15
+ Requires-Dist: ruff>=0.5; extra == "test"
16
+ Requires-Dist: radon>=6.0; extra == "test"
17
+ Requires-Dist: coverage>=7.0; extra == "test"
18
+ Requires-Dist: build>=1.0; extra == "test"
19
+ Requires-Dist: tomli>=2.0; python_version < "3.11" and extra == "test"
20
+ Provides-Extra: quality
21
+ Requires-Dist: bandit>=1.7; extra == "quality"
22
+ Requires-Dist: ruff>=0.5; extra == "quality"
23
+ Requires-Dist: radon>=6.0; extra == "quality"
24
+ Requires-Dist: coverage>=7.0; extra == "quality"
25
+ Requires-Dist: pip-audit>=2.6; extra == "quality"
26
+ Dynamic: license-file
27
+
28
+ # MetaEnsemble
29
+
30
+ **Stable identity, typed contracts, and observable runs for ensembles of cognitive agents.**
31
+
32
+ MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
33
+
34
+ **v0.2.0 status:** feedback-first release. The software records and gates local agent work, but measured quality-per-token improvements remain a product hypothesis until the live evaluation set is larger and fully baseline-comparable. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md).
35
+
36
+ ---
37
+
38
+ ## Why MetaEnsemble exists
39
+
40
+ Coordinating multiple cognitive agents tends to fail in the same three places:
41
+
42
+ 1. **No stable identity.** Each agent invocation is anonymous. No way to say "follow up with the same Executor next week."
43
+ 2. **No typed handoffs.** Context passes between agents as free-form prose. Every receiver re-derives state by searching, re-reading, re-grepping.
44
+ 3. **No observability.** Token spend, model choice, outcome — nothing captured per run. Optimization is guesswork.
45
+
46
+ MetaEnsemble fixes all three at the substrate, not as features. Every primitive in the system carries an ID, every transport is schema-validated, every execution lands in the Ledger.
47
+
48
+ ---
49
+
50
+ ## What MetaEnsemble gives you
51
+
52
+ - **Persistent identities.** Every Executor has a UUIDv7 and a short alias (`arch-7b3`). Resume any past Executor across sessions with `/relaunch arch-7b3`.
53
+ - **Typed contracts.** Handoffs travel as YAML Manifests validated against a JSON Schema. Inter-Executor messages travel as terse JSON Briefs. No prose context-injection, no re-search on the receiving side.
54
+ - **Observable runs.** Append-only Ledger (SQLite live, JSONL mirror for replay) records every Run with token cost, requested model tier, runtime-observed model when available, outcome, and links to its Deliverable.
55
+ - **MetaEnsemble dispatch.** Spawn N Executors from one Role spec for parallel hypothesis exploration, consensus review, or fan-out implementation. Default is N=1; multi-instance is opt-in and currently validated at the planning/protocol layer.
56
+ - **Cross-session continuity.** An Executor's identity is a Ledger row, not a live process. Relaunch is cheap (last Brief + last Deliverable summary) by default, deep (`--full`) when needed.
57
+ - **Two-channel design.** Machine-to-machine traffic (Briefs) stays terse and structured. Human-facing output (Deliverables) stays full English. Same Run produces both. No "compression tier" knob to mis-set.
58
+ - **Threshold-based cost gating.** The Coordinator auto-decides cheap, reversible work. It surfaces only the calls that warrant Principal judgment, in a structured options table — never as conversational back-and-forth.
59
+
60
+ ---
61
+
62
+ ## Primitives
63
+
64
+ | Term | Shape | What it is |
65
+ |---|---|---|
66
+ | **Principal** | The human running the system | The person who dispatches work and approves above-threshold decisions. Maps to the IAM Principal concept. |
67
+ | **Coordinator** | The main agent in the active session | Plans Tasks, dispatches Executors, validates contracts, synthesizes Deliverables. Maps to the Kafka / ZooKeeper / Cassandra coordinator pattern. |
68
+ | **Role** | Markdown file with frontmatter spec | The Job Description. Declarative, versioned. Maps to a Kubernetes Deployment spec or IAM Role. |
69
+ | **Executor** | Row in the registry, identified by UUIDv7 + alias | A live instance of a Role. Multiple per Role per Task. Survives sessions. Maps to a Spark Executor or K8s Pod. |
70
+ | **Task** | Unit of work | What the Principal asks the ensemble to do. Has dependencies, expected deliverables, budget. |
71
+ | **Run** | Row in the Ledger | One execution attempt by one Executor for one Task. Maps to an MLflow run. |
72
+ | **Brief** | Schema-validated JSON | Wire-format message between Executors. Terse, machine-targeted. |
73
+ | **Manifest** | Schema-validated YAML | Handoff contract. Typed pointers to files, line ranges, schemas, prior runs. Maps to a dbt or OpenAPI manifest. |
74
+ | **Deliverable** | Markdown report | Human-readable output. English prose. Institutional memory. |
75
+ | **Ledger** | SQLite + JSONL mirror | Append-only log of every Run. Queryable, replayable. Maps to MLflow tracking. |
76
+ | **Registry** | View over the Ledger + Executor table | Current-state snapshot. Live Executors, open Tasks, dependencies. Maps to a service-mesh control-plane view. |
77
+ | **Dispatch** | Verb / slash command | The act of launching N Executors of a Role for a Task. |
78
+
79
+ ---
80
+
81
+ ## High-level flow
82
+
83
+ ```
84
+ ┌─────────────────────┐
85
+ │ Principal │ (you)
86
+ └──────────┬──────────┘
87
+ │ intent
88
+ ┌──────────▼──────────┐
89
+ │ Coordinator │ plans, dispatches, synthesizes
90
+ └─────┬────────┬──────┘
91
+ │ │
92
+ ┌─────────────┘ └───────────┐
93
+ │ │
94
+ ┌────▼─────────┐ ┌───────▼──────┐
95
+ │ Role: backend│ │ Role: review │
96
+ │ spec file │ │ spec file │
97
+ └────┬─────────┘ └───────┬──────┘
98
+ │ dispatch N=2 │ dispatch N=3
99
+ ┌────┴────┐ ┌─────┼─────┐
100
+ ▼ ▼ ▼ ▼ ▼
101
+ ┌─────┐ ┌─────┐ ┌────┐┌────┐┌────┐
102
+ │be-1 │ │be-2 │ │rv-1││rv-2││rv-3│
103
+ └──┬──┘ └──┬──┘ └─┬──┘└─┬──┘└─┬──┘
104
+ │ Brief │ Brief │ │ │
105
+ ▼ ▼ ▼ ▼ ▼
106
+ ┌────────────────────────────────────────────────┐
107
+ │ Ledger (SQLite + JSONL) │
108
+ └────────────────────────────────────────────────┘
109
+
110
+
111
+ ┌──────────────┐
112
+ │ Deliverables │ English, for humans
113
+ └──────────────┘
114
+ ```
115
+
116
+ A single `/dispatch` produces N Executors across one or more Roles. Each Executor emits a Brief downstream and a Deliverable upstream. Every Run is logged. The Principal sees Deliverables and the standup view; never the wire traffic.
117
+
118
+ ---
119
+
120
+ ## Why two channels
121
+
122
+ A single Run produces two artifacts:
123
+
124
+ - The **Brief** is what the next Executor receives. Terse JSON. Schema-validated. Machine-targeted. Cheap to emit, cheap to parse.
125
+ - The **Deliverable** is what you, the Principal, read. Full English. Prose. Institutional memory.
126
+
127
+ These are not intensity tiers. They are different artifacts for different audiences, produced together. The receiving Executor does not parse English; the human does not parse JSON. Each gets the format that earns its place.
128
+
129
+ ---
130
+
131
+ ## How it runs
132
+
133
+ MetaEnsemble runs entirely on your laptop. Clone the repo, drop the conventions into your local agent runtime configuration, and dispatch. No servers, no cloud accounts, no hosting. Your Ledger, your Executors, your Briefs all live on your filesystem. State is portable: copy the repo and the state directory, and MetaEnsemble runs anywhere the agent runtime is installed.
134
+
135
+ ---
136
+
137
+ ## Adopting MetaEnsemble in your project
138
+
139
+ MetaEnsemble is project-agnostic by design. Three layers, with project-specific knowledge confined to the project layer:
140
+
141
+ ```
142
+ metaensemble/ # shipped with MetaEnsemble; project-agnostic
143
+ ~/.metaensemble/ # per-engineer preferences; the vendored runtime (runtime/, runtime-versions/); the runner at runtime/bin/me-run
144
+ <your-project>/.metaensemble/ # project-specific state, manifests, and install decisions
145
+ ```
146
+
147
+ The adoption flow has two steps — machine-wide `user-setup` and per-project `adopt` — which the interactive `setup` wizard runs for you:
148
+
149
+ ```bash
150
+ metaensemble setup # interactive wizard: picks a project, asks for layout, runs the two steps below
151
+ ```
152
+
153
+ The wizard lists every Claude Code project on this machine, lets you pick one, asks once for the layout (namespaced or top-level), and then runs `user-setup` and `adopt` in sequence. The two underlying commands are explicit if you prefer them:
154
+
155
+ ```bash
156
+ metaensemble user-setup --layout=namespaced # once per machine: vendors runtime to ~/.metaensemble/runtime/, wires commands/hooks/statusline
157
+ # or
158
+ metaensemble user-setup --layout=top-level # same, but slash commands install top-level under ~/.claude/commands/
159
+
160
+ metaensemble adopt # per project: writes <project>/.metaensemble/ and honors install-decisions
161
+ ```
162
+
163
+ `user-setup` is global (one layout for the whole machine; re-run with a different layout to switch). `adopt` is per-project and portable — run it once per project you want to register.
164
+
165
+ The project inspection that runs during adoption is the load-bearing piece. It writes two files into `<project>/.metaensemble/`:
166
+
167
+ - A short Markdown report naming what was found, what we recommend, and why.
168
+ - `install-decisions.yaml`, the editable choice surface. Every agent in your setup and every curated Role MetaEnsemble ships gets one entry with a sensible default. It also records the project's memory surfaces (`CLAUDE.md` and friends) so dispatch contracts hand Executors your existing project memory instead of rebuilding it. Read once, edit only what you disagree with.
169
+
170
+ Per-agent decisions span four cases (`collision`, `user_unique`, `curated_relevant`, `curated_optional`) and seven actions (`keep_yours`, `take_ours`, `keep_both`, `preserve`, `convert`, `activate`, `retire`). The installer reads the file and honors every choice. Nothing the user authored is silently converted; the default for every collision is to keep the user's agent.
171
+
172
+ Recovery mirrors the install split. `metaensemble unadopt` reverses one project's adoption: it walks `<project>/.metaensemble/backups/` in reverse, reverses project-scope actions, strips the managed `.gitignore` block, and leaves user-level integration intact. `metaensemble user-teardown` reverses `user-setup` by removing managed `~/.claude/` symlinks and hook entries. Each command accepts `--purge-state` for the matching `.metaensemble/` directory. For a full local rollback after live testing, run `metaensemble reconcile --older-than-minutes 0` first so stranded pending Runs are written to the Ledger, then run `metaensemble unadopt --purge-state` from the project root and `metaensemble user-teardown --purge-state` from anywhere. `metaensemble export-agents` reverse-converts MetaEnsemble Roles back to Claude Code agent files, even when the install's backups directory is missing. Every contract above is tested.
173
+
174
+ Starter packs (`--pack ml`, `--pack web`, `--pack data`) are planned for a future release.
175
+
176
+ If your project lives in an iCloud-synced directory (e.g., `~/Desktop/` with iCloud Desktop & Documents Sync enabled), consider excluding `.venv/` from iCloud sync. iCloud's conflict-resolution against rapid `pip install` file churn produces phantom duplicate files in `site-packages`; MetaEnsemble filters them correctly but they consume iCloud quota and slow installs. `metaensemble doctor` C11 surfaces this state as a WARN with remediation. The same caveat applies more strongly to `.metaensemble/state/`: when iCloud places `department.db` into a dataless placeholder state, SQLite's `open()` can fail intermittently and PreToolUse hooks surface as `Agent hook error` with no stderr. The robust fix is to host active MetaEnsemble projects outside iCloud-synced paths, or exclude the project from iCloud Drive. `metaensemble doctor` C4 names this cause when it detects the layout. See [USER-GUIDE.md — When something feels off](./docs/USER-GUIDE.md) for the troubleshooting recipe.
177
+
178
+ See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the full reference. See [ARCHITECTURE.md §4 — Portability](./docs/ARCHITECTURE.md) for the layering, merge order, and the hard rule that keeps Core project-agnostic.
179
+
180
+ ---
181
+
182
+ ## Status
183
+
184
+ v0.2.0. All core phases are complete and tested:
185
+
186
+ - Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
187
+ - Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
188
+ - Principal-facing surface: seven slash commands plus CLI subcommands including `metaensemble setup`, `metaensemble user-setup`, `metaensemble adopt`, `metaensemble unadopt`, `metaensemble user-teardown`, `metaensemble reconcile`, `metaensemble eval`, `metaensemble stats`, and `metaensemble projects`.
189
+ - Multi-instance patterns (fanout / consensus / shadow / peer-review) with the `N ≥ 2` guard enforced deterministically by the PreToolUse marker hook.
190
+ - Installer with idempotent re-runs, explicit purge modes, and a residue report after every uninstall.
191
+ - Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
192
+ - Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
193
+ - Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
194
+ - Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The shipped replay pack is a non-empirical bootstrap fixture. Live smoke/full runs are wired for side-effect-free classification-smoke checks; calibration and baseline-superiority claims still require larger labeled/fixture sets.
195
+
196
+ v0.2.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
197
+
198
+ See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
199
+ Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
200
+
201
+ ---
202
+
203
+ ## Where to start
204
+
205
+ - **[ARCHITECTURE.md](./docs/ARCHITECTURE.md)** — the layered design, the data model, the lifecycle, what MetaEnsemble is and is not.
206
+ - **[USER-GUIDE.md](./docs/USER-GUIDE.md)** — a friendly Principal guide for day-one users.
207
+ - **[PERFORMANCE.md](./docs/PERFORMANCE.md)** — the binding engineering contract: token budgets, time budgets, query rules, and CI-gated benchmarks. Required reading before changing performance-sensitive code.
208
+ - **[RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md)** — artifact, security, installer, and live-eval gates for publishing a release.
209
+ - **[GLOSSARY.md](./docs/GLOSSARY.md)** — every term defined precisely, every analog named.
210
+
211
+ ---
212
+
213
+ ## Operating principles
214
+
215
+ Three values drive every design choice in MetaEnsemble:
216
+
217
+ 1. **Conserve the budget.** The constraint is context-window exhaustion, not dollars. Per-Executor model tiering, terse wire format, schema-driven handoffs that eliminate re-search — all designed to fit more useful work in fewer tokens.
218
+ 2. **Move fast.** Parallel dispatch is a primitive, not a workaround. Hooks fire on lifecycle events automatically. The Principal never types boilerplate.
219
+ 3. **Hold the line on quality.** Speed and budget never come at the cost of standards. The schema layer enforces correctness; the Ledger enforces accountability; the Deliverable channel preserves institutional memory at full fidelity.
220
+
221
+ If a proposed feature compromises any of these three, it does not ship.
@@ -0,0 +1,194 @@
1
+ # MetaEnsemble
2
+
3
+ **Stable identity, typed contracts, and observable runs for ensembles of cognitive agents.**
4
+
5
+ MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
6
+
7
+ **v0.2.0 status:** feedback-first release. The software records and gates local agent work, but measured quality-per-token improvements remain a product hypothesis until the live evaluation set is larger and fully baseline-comparable. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md).
8
+
9
+ ---
10
+
11
+ ## Why MetaEnsemble exists
12
+
13
+ Coordinating multiple cognitive agents tends to fail in the same three places:
14
+
15
+ 1. **No stable identity.** Each agent invocation is anonymous. No way to say "follow up with the same Executor next week."
16
+ 2. **No typed handoffs.** Context passes between agents as free-form prose. Every receiver re-derives state by searching, re-reading, re-grepping.
17
+ 3. **No observability.** Token spend, model choice, outcome — nothing captured per run. Optimization is guesswork.
18
+
19
+ MetaEnsemble fixes all three at the substrate, not as features. Every primitive in the system carries an ID, every transport is schema-validated, every execution lands in the Ledger.
20
+
21
+ ---
22
+
23
+ ## What MetaEnsemble gives you
24
+
25
+ - **Persistent identities.** Every Executor has a UUIDv7 and a short alias (`arch-7b3`). Resume any past Executor across sessions with `/relaunch arch-7b3`.
26
+ - **Typed contracts.** Handoffs travel as YAML Manifests validated against a JSON Schema. Inter-Executor messages travel as terse JSON Briefs. No prose context-injection, no re-search on the receiving side.
27
+ - **Observable runs.** Append-only Ledger (SQLite live, JSONL mirror for replay) records every Run with token cost, requested model tier, runtime-observed model when available, outcome, and links to its Deliverable.
28
+ - **MetaEnsemble dispatch.** Spawn N Executors from one Role spec for parallel hypothesis exploration, consensus review, or fan-out implementation. Default is N=1; multi-instance is opt-in and currently validated at the planning/protocol layer.
29
+ - **Cross-session continuity.** An Executor's identity is a Ledger row, not a live process. Relaunch is cheap (last Brief + last Deliverable summary) by default, deep (`--full`) when needed.
30
+ - **Two-channel design.** Machine-to-machine traffic (Briefs) stays terse and structured. Human-facing output (Deliverables) stays full English. Same Run produces both. No "compression tier" knob to mis-set.
31
+ - **Threshold-based cost gating.** The Coordinator auto-decides cheap, reversible work. It surfaces only the calls that warrant Principal judgment, in a structured options table — never as conversational back-and-forth.
32
+
33
+ ---
34
+
35
+ ## Primitives
36
+
37
+ | Term | Shape | What it is |
38
+ |---|---|---|
39
+ | **Principal** | The human running the system | The person who dispatches work and approves above-threshold decisions. Maps to the IAM Principal concept. |
40
+ | **Coordinator** | The main agent in the active session | Plans Tasks, dispatches Executors, validates contracts, synthesizes Deliverables. Maps to the Kafka / ZooKeeper / Cassandra coordinator pattern. |
41
+ | **Role** | Markdown file with frontmatter spec | The Job Description. Declarative, versioned. Maps to a Kubernetes Deployment spec or IAM Role. |
42
+ | **Executor** | Row in the registry, identified by UUIDv7 + alias | A live instance of a Role. Multiple per Role per Task. Survives sessions. Maps to a Spark Executor or K8s Pod. |
43
+ | **Task** | Unit of work | What the Principal asks the ensemble to do. Has dependencies, expected deliverables, budget. |
44
+ | **Run** | Row in the Ledger | One execution attempt by one Executor for one Task. Maps to an MLflow run. |
45
+ | **Brief** | Schema-validated JSON | Wire-format message between Executors. Terse, machine-targeted. |
46
+ | **Manifest** | Schema-validated YAML | Handoff contract. Typed pointers to files, line ranges, schemas, prior runs. Maps to a dbt or OpenAPI manifest. |
47
+ | **Deliverable** | Markdown report | Human-readable output. English prose. Institutional memory. |
48
+ | **Ledger** | SQLite + JSONL mirror | Append-only log of every Run. Queryable, replayable. Maps to MLflow tracking. |
49
+ | **Registry** | View over the Ledger + Executor table | Current-state snapshot. Live Executors, open Tasks, dependencies. Maps to a service-mesh control-plane view. |
50
+ | **Dispatch** | Verb / slash command | The act of launching N Executors of a Role for a Task. |
51
+
52
+ ---
53
+
54
+ ## High-level flow
55
+
56
+ ```
57
+ ┌─────────────────────┐
58
+ │ Principal │ (you)
59
+ └──────────┬──────────┘
60
+ │ intent
61
+ ┌──────────▼──────────┐
62
+ │ Coordinator │ plans, dispatches, synthesizes
63
+ └─────┬────────┬──────┘
64
+ │ │
65
+ ┌─────────────┘ └───────────┐
66
+ │ │
67
+ ┌────▼─────────┐ ┌───────▼──────┐
68
+ │ Role: backend│ │ Role: review │
69
+ │ spec file │ │ spec file │
70
+ └────┬─────────┘ └───────┬──────┘
71
+ │ dispatch N=2 │ dispatch N=3
72
+ ┌────┴────┐ ┌─────┼─────┐
73
+ ▼ ▼ ▼ ▼ ▼
74
+ ┌─────┐ ┌─────┐ ┌────┐┌────┐┌────┐
75
+ │be-1 │ │be-2 │ │rv-1││rv-2││rv-3│
76
+ └──┬──┘ └──┬──┘ └─┬──┘└─┬──┘└─┬──┘
77
+ │ Brief │ Brief │ │ │
78
+ ▼ ▼ ▼ ▼ ▼
79
+ ┌────────────────────────────────────────────────┐
80
+ │ Ledger (SQLite + JSONL) │
81
+ └────────────────────────────────────────────────┘
82
+
83
+
84
+ ┌──────────────┐
85
+ │ Deliverables │ English, for humans
86
+ └──────────────┘
87
+ ```
88
+
89
+ A single `/dispatch` produces N Executors across one or more Roles. Each Executor emits a Brief downstream and a Deliverable upstream. Every Run is logged. The Principal sees Deliverables and the standup view; never the wire traffic.
90
+
91
+ ---
92
+
93
+ ## Why two channels
94
+
95
+ A single Run produces two artifacts:
96
+
97
+ - The **Brief** is what the next Executor receives. Terse JSON. Schema-validated. Machine-targeted. Cheap to emit, cheap to parse.
98
+ - The **Deliverable** is what you, the Principal, read. Full English. Prose. Institutional memory.
99
+
100
+ These are not intensity tiers. They are different artifacts for different audiences, produced together. The receiving Executor does not parse English; the human does not parse JSON. Each gets the format that earns its place.
101
+
102
+ ---
103
+
104
+ ## How it runs
105
+
106
+ MetaEnsemble runs entirely on your laptop. Clone the repo, drop the conventions into your local agent runtime configuration, and dispatch. No servers, no cloud accounts, no hosting. Your Ledger, your Executors, your Briefs all live on your filesystem. State is portable: copy the repo and the state directory, and MetaEnsemble runs anywhere the agent runtime is installed.
107
+
108
+ ---
109
+
110
+ ## Adopting MetaEnsemble in your project
111
+
112
+ MetaEnsemble is project-agnostic by design. Three layers, with project-specific knowledge confined to the project layer:
113
+
114
+ ```
115
+ metaensemble/ # shipped with MetaEnsemble; project-agnostic
116
+ ~/.metaensemble/ # per-engineer preferences; the vendored runtime (runtime/, runtime-versions/); the runner at runtime/bin/me-run
117
+ <your-project>/.metaensemble/ # project-specific state, manifests, and install decisions
118
+ ```
119
+
120
+ The adoption flow has two steps — one per machine, one per project — which the setup wizard runs in sequence:
121
+
122
+ ```bash
123
+ metaensemble setup # interactive wizard: picks a project, asks for layout, runs the two steps below
124
+ ```
125
+
126
+ The wizard lists every Claude Code project on this machine, lets you pick one, asks once for the layout (namespaced or top-level), and then runs `user-setup` and `adopt` in sequence. The two underlying commands are explicit if you prefer them:
127
+
128
+ ```bash
129
+ metaensemble user-setup --layout=namespaced # once per machine: vendors runtime to ~/.metaensemble/runtime/, wires commands/hooks/statusline
130
+ # or
131
+ metaensemble user-setup --layout=top-level # same, but slash commands install top-level under ~/.claude/commands/
132
+
133
+ metaensemble adopt # per project: writes <project>/.metaensemble/ and honors install-decisions
134
+ ```
135
+
136
+ `user-setup` is global (one layout for the whole machine; re-run with a different layout to switch). `adopt` is per-project and portable — run it once per project you want to register.
137
+
138
+ The inspection is the load-bearing piece. It writes two files into `<project>/.metaensemble/`:
139
+
140
+ - A short Markdown report naming what was found, what we recommend, and why.
141
+ - `install-decisions.yaml`, the editable choice surface. Every agent in your setup and every curated Role MetaEnsemble ships gets one entry with a sensible default. It also records the project's memory surfaces (`CLAUDE.md` and friends) so dispatch contracts hand Executors your existing project memory instead of rebuilding it. Read once, edit only what you disagree with.
142
+
143
+ Per-agent decisions span four cases (`collision`, `user_unique`, `curated_relevant`, `curated_optional`) and seven actions (`keep_yours`, `take_ours`, `keep_both`, `preserve`, `convert`, `activate`, `retire`). The installer reads the file and honors every choice. Nothing the user authored is silently converted; the default for every collision is to keep the user's agent.
144
+
145
+ Recovery mirrors the install split. `metaensemble unadopt` reverses one project's adoption: it walks `<project>/.metaensemble/backups/` in reverse, reverses project-scope actions, strips the managed `.gitignore` block, and leaves user-level integration intact. `metaensemble user-teardown` reverses `user-setup` by removing managed `~/.claude/` symlinks and hook entries. Each command accepts `--purge-state` for the matching `.metaensemble/` directory. For a full local rollback after live testing, run `metaensemble reconcile --older-than-minutes 0` first so stranded pending Runs are written to the Ledger, then run `metaensemble unadopt --purge-state` from the project root and `metaensemble user-teardown --purge-state` from anywhere. `metaensemble export-agents` reverse-converts MetaEnsemble Roles back to Claude Code agent files, even when the install's backups directory is missing. Every contract above is tested.
146
+
147
+ Starter packs (`--pack ml`, `--pack web`, `--pack data`) are planned for a future release.
148
+
149
+ If your project lives in an iCloud-synced directory (e.g., `~/Desktop/` with iCloud Desktop & Documents Sync enabled), consider excluding `.venv/` from iCloud sync. iCloud's conflict-resolution against rapid `pip install` file churn produces phantom duplicate files in `site-packages`; MetaEnsemble filters them correctly but they consume iCloud quota and slow installs. `metaensemble doctor` C11 surfaces this state as a WARN with remediation. The same caveat applies more strongly to `.metaensemble/state/`: when iCloud places `department.db` into a dataless placeholder state, SQLite's `open()` can fail intermittently and PreToolUse hooks surface as `Agent hook error` with no stderr. The robust fix is to host active MetaEnsemble projects outside iCloud-synced paths, or exclude the project from iCloud Drive. `metaensemble doctor` C4 names this cause when it detects the layout. See [USER-GUIDE.md — When something feels off](./docs/USER-GUIDE.md) for the troubleshooting recipe.
150
+
151
+ See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the full reference. See [ARCHITECTURE.md §4 — Portability](./docs/ARCHITECTURE.md) for the layering, merge order, and the hard rule that keeps Core project-agnostic.
152
+
153
+ ---
154
+
155
+ ## Status
156
+
157
+ v0.2.0. All core phases complete and tested:
158
+
159
+ - Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
160
+ - Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
161
+ - Principal-facing surface: seven slash commands plus CLI subcommands including `metaensemble setup`, `metaensemble user-setup`, `metaensemble adopt`, `metaensemble unadopt`, `metaensemble user-teardown`, `metaensemble reconcile`, `metaensemble eval`, `metaensemble stats`, and `metaensemble projects`.
162
+ - Multi-instance patterns (fanout / consensus / shadow / peer-review) with the `N ≥ 2` guard enforced deterministically by the PreToolUse marker hook.
163
+ - Installer with idempotent re-runs, explicit purge modes, and a residue report after every uninstall.
164
+ - Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
165
+ - Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
166
+ - Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
167
+ - Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The shipped replay pack is a non-empirical bootstrap fixture. Live smoke/full runs are wired for side-effect-free classification-smoke checks; calibration and baseline-superiority claims still require larger labeled/fixture sets.
168
+
169
+ v0.2.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
170
+
171
+ See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
172
+ Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
173
+
174
+ ---
175
+
176
+ ## Where to start
177
+
178
+ - **[ARCHITECTURE.md](./docs/ARCHITECTURE.md)** — the layered design, the data model, the lifecycle, what MetaEnsemble is and is not.
179
+ - **[USER-GUIDE.md](./docs/USER-GUIDE.md)** — a friendly Principal guide for day-one users.
180
+ - **[PERFORMANCE.md](./docs/PERFORMANCE.md)** — the binding engineering contract: token budgets, time budgets, query rules, and CI-gated benchmarks. Required reading before changing performance-sensitive code.
181
+ - **[RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md)** — artifact, security, installer, and live-eval gates for publishing a release.
182
+ - **[GLOSSARY.md](./docs/GLOSSARY.md)** — every term defined precisely, every analog named.
183
+
184
+ ---
185
+
186
+ ## Operating principles
187
+
188
+ Three values drive every design choice in MetaEnsemble:
189
+
190
+ 1. **Conserve the budget.** The constraint is window exhaustion, not dollars. Per-Executor model tiering, terse wire format, schema-driven handoffs that eliminate re-search — all designed to fit more useful work in fewer tokens.
191
+ 2. **Move fast.** Parallel dispatch is a primitive, not a workaround. Hooks fire on lifecycle events automatically. The Principal never types boilerplate.
192
+ 3. **Hold the line on quality.** Speed and budget never come at the cost of standards. The schema layer enforces correctness; the Ledger enforces accountability; the Deliverable channel preserves institutional memory at full fidelity.
193
+
194
+ If a proposed feature compromises any of these three, it does not ship.
@@ -0,0 +1,147 @@
1
+ # MetaEnsemble Evaluation Harness
2
+
3
+ The harness exists so the quality-per-token claim — *the system around
4
+ the model is strong enough to deploy the competence the model already
5
+ has* — can be tested rather than asserted. Replay keeps the harness
6
+ deterministic in CI. Smoke and full tiers make live, side-effect-free
7
+ Claude Code calls and write measured reports under the caller's
8
+ `evals/reports/`.
9
+ The shipped classification data is one narrow smoke fixture, not a calibration set
10
+ and not a statement of product scope. MetaEnsemble is project-agnostic.
11
+
12
+ ## Directory layout
13
+
14
+ ```
15
+ evals/
16
+ ├── README.md # this file
17
+ ├── configs/
18
+ │ └── default.yaml # eval-cycle parameters (seeds, budget, model routing)
19
+ ├── datasets/
20
+ │ ├── suite_a/ # 8 software-engineering tasks
21
+ │ │ ├── README.md
22
+ │ │ └── tasks.yaml
23
+ │ └── suite_b/ # domain-specific classification smoke set
24
+ │ ├── README.md
25
+ │ └── items.yaml
26
+ ├── baselines/ # B1 / B2 / B3 / B4 baseline definitions
27
+ │ ├── b1_single_agent.yaml
28
+ │ ├── b2_single_agent_prompted.yaml
29
+ │ ├── b3_subagent_default.yaml
30
+ │ └── b4_best_prompt.yaml # best-single-agent baseline
31
+ ├── cassettes/ # replay fixtures; bootstrap pack is non-empirical
32
+ ├── runners/ # cell × seed executors
33
+ │ ├── __init__.py
34
+ │ ├── api.py # tiered runner: replay / smoke / full
35
+ │ ├── metrics.py # Wilson CI, pass@budget, quality_per_1k_tokens
36
+ │ └── replay.py # cassette-based PR runner
37
+ └── reports/ # generated reports per cycle (gitignored)
38
+ ```
39
+
40
+ ## Tiered evaluation
41
+
42
+ | Tier | When it runs | Live API calls | Budget |
43
+ |---|---|---|---|
44
+ | Replay | Every PR; reads recorded cassette responses. | No. | $0 |
45
+ | Smoke | Nightly cron or local preflight. 1 seed, `MM_full`, classification smoke set only. | Yes. | ~$0.30 default cap |
46
+ | Full | Release-gated. Defaults to 5 seeds × every configured cell. | Yes. | Principal-set cap |
47
+
48
+ The PR tier exists to keep regressions cheap to catch; the full tier
49
+ exists to certify a release. A release candidate is not allowed
50
+ to claim quality-per-token superiority unless the same report includes
51
+ baseline cells and MetaEnsemble cells over the same task set.
52
+
53
+ The release ships a compact `evals/cassettes/bootstrap.jsonl` pack so the
54
+ replay tier works in a clean checkout. That pack is deliberately marked
55
+ non-empirical; it verifies the harness mechanics, not MetaEnsemble's
56
+ quality claim. Live smoke/full reports are empirical for the cells and
57
+ datasets actually run; the report notes any skipped deferred fixtures.
58
+
59
+ ## Headline metrics
60
+
61
+ The harness reports three co-primary metrics per cell:
62
+
63
+ | Metric | Definition |
64
+ |---|---|
65
+ | `pass@budget` | Pass-rate against the cell's per-task budget. A "win" that overspends does not count. |
66
+ | `quality_per_1k_tokens` | Average score across passing runs, divided by (tokens / 1000). Directly tests the efficiency thesis. |
67
+ | `orchestration_overhead_ratio` | MetaEnsemble token cost over the best single-agent baseline's token cost, on the same task. |
68
+
69
+ Plus the supporting metrics in `runners/metrics.py`:
70
+ `failed_run_token_waste`, `time_to_useful_deliverable`,
71
+ `minimum_useful_answer_score`.
72
+
73
+ For live reports, include these context fields in the release note or
74
+ system-card link: exact model IDs when the runtime exposes them, seed
75
+ count, cells run, skipped fixtures, total observed tokens, estimated vs
76
+ observed token error where available, and any cost-gate or Python
77
+ deliverable-check interventions.
78
+
79
+ ## Suite A — software engineering (8 tasks)
80
+
81
+ Eight tasks drawn from the project's own backlog and from small
82
+ open-source repos. Each task has:
83
+
84
+ - A one-paragraph description (English).
85
+ - A frozen-commit starting state (commit SHA of the project under test).
86
+ - Graded acceptance criteria (build passes, test count ≥ N, lint
87
+ clean, manifest exists, deliverable file present).
88
+
89
+ See `evals/datasets/suite_a/tasks.yaml` for the current set.
90
+
91
+ The current Suite-A rows still contain deferred fixture SHAs. The live
92
+ full tier names those skipped tasks in the report rather than treating
93
+ them as passed or failed. Release certification across software tasks
94
+ requires replacing the deferred SHAs with real fixture repositories.
95
+
96
+ ## Suite B — domain-specific classification (12 items, *smoke only*)
97
+
98
+ Twelve items is too few for calibration claims. The 12-item set in
99
+ `evals/datasets/suite_b/items.yaml` is the **smoke suite** that proves
100
+ the pipeline end-to-end. It is intentionally narrow; it does not make
101
+ MetaEnsemble domain-specific. Any release claim about a particular domain
102
+ needs its own independently labeled calibration set. The system card states
103
+ this limitation explicitly so no calibration claim is implied by the smoke
104
+ set.
105
+
106
+ ## Running the harness
107
+
108
+ ```bash
109
+ # PR-tier replay (no API calls).
110
+ metaensemble eval --tier replay --cells all
111
+
112
+ # Nightly smoke (one cell × one seed × classification smoke set only).
113
+ metaensemble eval --tier smoke
114
+
115
+ # Constrained full-tier check.
116
+ metaensemble eval --tier full --allow-live --cells MM_full --seeds 1 --budget-usd 0.30
117
+
118
+ # Release-gated full cycle once fixture SHAs and budget are set.
119
+ metaensemble eval --tier full --allow-live --cells all --seeds 5 --budget-usd 0.30
120
+ ```
121
+
122
+ The output report lands in the current working directory at
123
+ `evals/reports/<UTC-date>-<tier>.md` and is linked from
124
+ `PERFORMANCE.md §4` once a cycle ships.
125
+
126
+ Supported flags:
127
+
128
+ | Flag | Meaning |
129
+ |---|---|
130
+ | `--cells all` or `--cells A,B` | Select all cells or a comma-separated subset. Smoke defaults to `MM_full`; replay/full default to all. |
131
+ | `--seeds N` | Override seed count. Smoke defaults to 1; replay/full default to `evals/configs/default.yaml`. |
132
+ | `--budget-usd X` | Override the live-tier per-run budget shown in preflight. |
133
+ | `--allow-live` | Required before the full tier proceeds past preflight. |
134
+
135
+ ## Sign-off thresholds (D-8, D-9)
136
+
137
+ D-8 and D-9 are numerical full-tier release gates:
138
+
139
+ - **D-8 orchestration overhead**: any measured MetaEnsemble cell above
140
+ `2.0x` the best single-agent prompt baseline (`B4`) blocks the full
141
+ tier's ship verdict.
142
+ - **D-9 failed-run waste**: failed and budget-exceeded runs above `10%`
143
+ of total evaluated tokens block the full tier's ship verdict.
144
+
145
+ The thresholds live in `evals/configs/default.yaml`. If a full run does
146
+ not include the `B4_best_prompt` baseline, D-8 is reported as not
147
+ evaluated rather than silently passing.
File without changes
@@ -0,0 +1,10 @@
1
+ # Eval Cassettes
2
+
3
+ `bootstrap.jsonl` is a v0.1.0 replay fixture pack. It exists so the
4
+ zero-cost replay tier exercises task loading, cell selection, metrics,
5
+ and report rendering in a clean checkout.
6
+
7
+ It is not empirical benchmark evidence. Each record is marked
8
+ `source: bootstrap_fixture_not_empirical`; the first live smoke/full
9
+ cycle should replace or supplement this pack with recorded cassette
10
+ outputs from real runs.