metaensemble 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaensemble-0.2.0/LICENSE +21 -0
- metaensemble-0.2.0/PKG-INFO +221 -0
- metaensemble-0.2.0/README.md +194 -0
- metaensemble-0.2.0/evals/README.md +147 -0
- metaensemble-0.2.0/evals/__init__.py +0 -0
- metaensemble-0.2.0/evals/cassettes/README.md +10 -0
- metaensemble-0.2.0/evals/cassettes/bootstrap.jsonl +800 -0
- metaensemble-0.2.0/evals/configs/default.yaml +59 -0
- metaensemble-0.2.0/evals/datasets/__init__.py +0 -0
- metaensemble-0.2.0/evals/datasets/suite_a/tasks.yaml +123 -0
- metaensemble-0.2.0/evals/datasets/suite_b/items.yaml +90 -0
- metaensemble-0.2.0/evals/runners/__init__.py +12 -0
- metaensemble-0.2.0/evals/runners/api.py +518 -0
- metaensemble-0.2.0/evals/runners/metrics.py +132 -0
- metaensemble-0.2.0/metaensemble/__init__.py +13 -0
- metaensemble-0.2.0/metaensemble/cli.py +1362 -0
- metaensemble-0.2.0/metaensemble/commands/dispatch.md +39 -0
- metaensemble-0.2.0/metaensemble/commands/executors.md +12 -0
- metaensemble-0.2.0/metaensemble/commands/ledger.md +19 -0
- metaensemble-0.2.0/metaensemble/commands/limits.md +12 -0
- metaensemble-0.2.0/metaensemble/commands/perf.md +12 -0
- metaensemble-0.2.0/metaensemble/commands/relaunch.md +29 -0
- metaensemble-0.2.0/metaensemble/commands/standup.md +14 -0
- metaensemble-0.2.0/metaensemble/config/budgets.example.yaml +72 -0
- metaensemble-0.2.0/metaensemble/config/quality.example.yaml +82 -0
- metaensemble-0.2.0/metaensemble/hooks/__init__.py +1 -0
- metaensemble-0.2.0/metaensemble/hooks/_common.py +148 -0
- metaensemble-0.2.0/metaensemble/hooks/deliverable_sync.py +73 -0
- metaensemble-0.2.0/metaensemble/hooks/file_event.py +303 -0
- metaensemble-0.2.0/metaensemble/hooks/post_task.py +460 -0
- metaensemble-0.2.0/metaensemble/hooks/pre_task.py +548 -0
- metaensemble-0.2.0/metaensemble/hooks/session_start.py +212 -0
- metaensemble-0.2.0/metaensemble/hooks/session_summary.py +392 -0
- metaensemble-0.2.0/metaensemble/hooks/subagent_stop.py +94 -0
- metaensemble-0.2.0/metaensemble/lib/__init__.py +1 -0
- metaensemble-0.2.0/metaensemble/lib/config.py +414 -0
- metaensemble-0.2.0/metaensemble/lib/cost_gate.py +299 -0
- metaensemble-0.2.0/metaensemble/lib/dispatch.py +341 -0
- metaensemble-0.2.0/metaensemble/lib/doctor.py +1563 -0
- metaensemble-0.2.0/metaensemble/lib/file_events.py +395 -0
- metaensemble-0.2.0/metaensemble/lib/ids.py +91 -0
- metaensemble-0.2.0/metaensemble/lib/installer.py +5018 -0
- metaensemble-0.2.0/metaensemble/lib/ledger.py +812 -0
- metaensemble-0.2.0/metaensemble/lib/manifest.py +141 -0
- metaensemble-0.2.0/metaensemble/lib/native_state.py +463 -0
- metaensemble-0.2.0/metaensemble/lib/overlaps.py +155 -0
- metaensemble-0.2.0/metaensemble/lib/quality_gate.py +155 -0
- metaensemble-0.2.0/metaensemble/lib/quality_runners.py +446 -0
- metaensemble-0.2.0/metaensemble/lib/reconcile.py +420 -0
- metaensemble-0.2.0/metaensemble/lib/recording.py +422 -0
- metaensemble-0.2.0/metaensemble/lib/relaunch.py +174 -0
- metaensemble-0.2.0/metaensemble/lib/runtime_payload.py +42 -0
- metaensemble-0.2.0/metaensemble/lib/runtime_state.py +308 -0
- metaensemble-0.2.0/metaensemble/lib/sidecar.py +166 -0
- metaensemble-0.2.0/metaensemble/lib/topology.py +181 -0
- metaensemble-0.2.0/metaensemble/lib/transcript.py +432 -0
- metaensemble-0.2.0/metaensemble/output-styles/deliverable.md +33 -0
- metaensemble-0.2.0/metaensemble/output-styles/wire.md +38 -0
- metaensemble-0.2.0/metaensemble/roles/architect.md +52 -0
- metaensemble-0.2.0/metaensemble/roles/backend.md +43 -0
- metaensemble-0.2.0/metaensemble/roles/code-quality.md +49 -0
- metaensemble-0.2.0/metaensemble/roles/data-engineer.md +42 -0
- metaensemble-0.2.0/metaensemble/roles/devops.md +42 -0
- metaensemble-0.2.0/metaensemble/roles/docs.md +41 -0
- metaensemble-0.2.0/metaensemble/roles/frontend.md +42 -0
- metaensemble-0.2.0/metaensemble/roles/ml-engineer.md +42 -0
- metaensemble-0.2.0/metaensemble/roles/test-engineer.md +42 -0
- metaensemble-0.2.0/metaensemble/schemas/brief.schema.json +80 -0
- metaensemble-0.2.0/metaensemble/schemas/manifest.schema.json +142 -0
- metaensemble-0.2.0/metaensemble/schemas/role.schema.json +84 -0
- metaensemble-0.2.0/metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
- metaensemble-0.2.0/metaensemble/state/migrations/001_init.sql +72 -0
- metaensemble-0.2.0/metaensemble/state/migrations/002_outcome_extended.sql +86 -0
- metaensemble-0.2.0/metaensemble/state/migrations/003_run_provenance.sql +36 -0
- metaensemble-0.2.0/metaensemble/statusline/me_status.py +187 -0
- metaensemble-0.2.0/metaensemble/tools/__init__.py +7 -0
- metaensemble-0.2.0/metaensemble/tools/executors.py +62 -0
- metaensemble-0.2.0/metaensemble/tools/ledger.py +121 -0
- metaensemble-0.2.0/metaensemble/tools/limits.py +165 -0
- metaensemble-0.2.0/metaensemble/tools/perf.py +150 -0
- metaensemble-0.2.0/metaensemble/tools/standup.py +177 -0
- metaensemble-0.2.0/metaensemble/tools/stats.py +115 -0
- metaensemble-0.2.0/metaensemble.egg-info/PKG-INFO +221 -0
- metaensemble-0.2.0/metaensemble.egg-info/SOURCES.txt +88 -0
- metaensemble-0.2.0/metaensemble.egg-info/dependency_links.txt +1 -0
- metaensemble-0.2.0/metaensemble.egg-info/entry_points.txt +2 -0
- metaensemble-0.2.0/metaensemble.egg-info/requires.txt +21 -0
- metaensemble-0.2.0/metaensemble.egg-info/top_level.txt +2 -0
- metaensemble-0.2.0/pyproject.toml +88 -0
- metaensemble-0.2.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 MetaEnsemble contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: metaensemble
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A typed runtime for ensembles of cognitive agents
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: jsonschema>=4.20
|
|
10
|
+
Requires-Dist: pyyaml>=6.0
|
|
11
|
+
Provides-Extra: test
|
|
12
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
13
|
+
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
14
|
+
Requires-Dist: bandit>=1.7; extra == "test"
|
|
15
|
+
Requires-Dist: ruff>=0.5; extra == "test"
|
|
16
|
+
Requires-Dist: radon>=6.0; extra == "test"
|
|
17
|
+
Requires-Dist: coverage>=7.0; extra == "test"
|
|
18
|
+
Requires-Dist: build>=1.0; extra == "test"
|
|
19
|
+
Requires-Dist: tomli>=2.0; python_version < "3.11" and extra == "test"
|
|
20
|
+
Provides-Extra: quality
|
|
21
|
+
Requires-Dist: bandit>=1.7; extra == "quality"
|
|
22
|
+
Requires-Dist: ruff>=0.5; extra == "quality"
|
|
23
|
+
Requires-Dist: radon>=6.0; extra == "quality"
|
|
24
|
+
Requires-Dist: coverage>=7.0; extra == "quality"
|
|
25
|
+
Requires-Dist: pip-audit>=2.6; extra == "quality"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# MetaEnsemble
|
|
29
|
+
|
|
30
|
+
**Stable identity, typed contracts, and observable runs for ensembles of cognitive agents.**
|
|
31
|
+
|
|
32
|
+
MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
|
|
33
|
+
|
|
34
|
+
**v0.2.0 status:** feedback-first release. The software records and gates local agent work, but measured quality-per-token improvements remain a product hypothesis until the live evaluation set is larger and fully baseline-comparable. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md).
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Why MetaEnsemble exists
|
|
39
|
+
|
|
40
|
+
Coordinating multiple cognitive agents tends to fail in the same three places:
|
|
41
|
+
|
|
42
|
+
1. **No stable identity.** Each agent invocation is anonymous. No way to say "follow up with the same Executor next week."
|
|
43
|
+
2. **No typed handoffs.** Context passes between agents as free-form prose. Every receiver re-derives state by searching, re-reading, re-grepping.
|
|
44
|
+
3. **No observability.** Token spend, model choice, outcome — nothing captured per run. Optimization is guesswork.
|
|
45
|
+
|
|
46
|
+
MetaEnsemble fixes all three at the substrate, not as features. Every primitive in the system carries an ID, every transport is schema-validated, every execution lands in the Ledger.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## What MetaEnsemble gives you
|
|
51
|
+
|
|
52
|
+
- **Persistent identities.** Every Executor has a UUIDv7 and a short alias (`arch-7b3`). Resume any past Executor across sessions with `/relaunch arch-7b3`.
|
|
53
|
+
- **Typed contracts.** Handoffs travel as YAML Manifests validated against a JSON Schema. Inter-Executor messages travel as terse JSON Briefs. No prose context-injection, no re-search on the receiving side.
|
|
54
|
+
- **Observable runs.** Append-only Ledger (SQLite live, JSONL mirror for replay) records every Run with token cost, requested model tier, runtime-observed model when available, outcome, and links to its Deliverable.
|
|
55
|
+
- **MetaEnsemble dispatch.** Spawn N Executors from one Role spec for parallel hypothesis exploration, consensus review, or fan-out implementation. Default is N=1; multi-instance is opt-in and currently validated at the planning/protocol layer.
|
|
56
|
+
- **Cross-session continuity.** An Executor's identity is a Ledger row, not a live process. Relaunch is cheap (last Brief + last Deliverable summary) by default, deep (`--full`) when needed.
|
|
57
|
+
- **Two-channel design.** Machine-to-machine traffic (Briefs) stays terse and structured. Human-facing output (Deliverables) stays full English. Same Run produces both. No "compression tier" knob to misset.
|
|
58
|
+
- **Threshold-based cost gating.** The Coordinator auto-decides cheap, reversible work. It surfaces only the calls that warrant Principal judgment, in a structured options table — never as conversational back-and-forth.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Primitives
|
|
63
|
+
|
|
64
|
+
| Term | Shape | What it is |
|
|
65
|
+
|---|---|---|
|
|
66
|
+
| **Principal** | The human running the system | The person who dispatches work and approves above-threshold decisions. Maps to the IAM Principal concept. |
|
|
67
|
+
| **Coordinator** | The main agent in the active session | Plans Tasks, dispatches Executors, validates contracts, synthesizes Deliverables. Maps to the Kafka / ZooKeeper / Cassandra coordinator pattern. |
|
|
68
|
+
| **Role** | Markdown file with frontmatter spec | The Job Description. Declarative, versioned. Maps to a Kubernetes Deployment spec or IAM Role. |
|
|
69
|
+
| **Executor** | Row in the registry, identified by UUIDv7 + alias | A live instance of a Role. Multiple per Role per Task. Survives sessions. Maps to a Spark Executor or K8s Pod. |
|
|
70
|
+
| **Task** | Unit of work | What the Principal asks the ensemble to do. Has dependencies, expected deliverables, budget. |
|
|
71
|
+
| **Run** | Row in the Ledger | One execution attempt by one Executor for one Task. Maps to an MLflow run. |
|
|
72
|
+
| **Brief** | Schema-validated JSON | Wire-format message between Executors. Terse, machine-targeted. |
|
|
73
|
+
| **Manifest** | Schema-validated YAML | Handoff contract. Typed pointers to files, line ranges, schemas, prior runs. Maps to a dbt or OpenAPI manifest. |
|
|
74
|
+
| **Deliverable** | Markdown report | Human-readable output. English prose. Institutional memory. |
|
|
75
|
+
| **Ledger** | SQLite + JSONL mirror | Append-only log of every Run. Queryable, replayable. Maps to MLflow tracking. |
|
|
76
|
+
| **Registry** | View over the Ledger + Executor table | Current-state snapshot. Live Executors, open Tasks, dependencies. Maps to a service-mesh control-plane view. |
|
|
77
|
+
| **Dispatch** | Verb / slash command | The act of launching N Executors of a Role for a Task. |
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## High-level flow
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
┌─────────────────────┐
|
|
85
|
+
│ Principal │ (you)
|
|
86
|
+
└──────────┬──────────┘
|
|
87
|
+
│ intent
|
|
88
|
+
┌──────────▼──────────┐
|
|
89
|
+
│ Coordinator │ plans, dispatches, synthesizes
|
|
90
|
+
└─────┬────────┬──────┘
|
|
91
|
+
│ │
|
|
92
|
+
┌─────────────┘ └───────────┐
|
|
93
|
+
│ │
|
|
94
|
+
┌────▼─────────┐ ┌───────▼──────┐
|
|
95
|
+
│ Role: backend│ │ Role: review │
|
|
96
|
+
│ spec file │ │ spec file │
|
|
97
|
+
└────┬─────────┘ └───────┬──────┘
|
|
98
|
+
│ dispatch N=2 │ dispatch N=3
|
|
99
|
+
┌────┴────┐ ┌─────┼─────┐
|
|
100
|
+
▼ ▼ ▼ ▼ ▼
|
|
101
|
+
┌─────┐ ┌─────┐ ┌────┐┌────┐┌────┐
|
|
102
|
+
│be-1 │ │be-2 │ │rv-1││rv-2││rv-3│
|
|
103
|
+
└──┬──┘ └──┬──┘ └─┬──┘└─┬──┘└─┬──┘
|
|
104
|
+
│ Brief │ Brief │ │ │
|
|
105
|
+
▼ ▼ ▼ ▼ ▼
|
|
106
|
+
┌────────────────────────────────────────────────┐
|
|
107
|
+
│ Ledger (SQLite + JSONL) │
|
|
108
|
+
└────────────────────────────────────────────────┘
|
|
109
|
+
│
|
|
110
|
+
▼
|
|
111
|
+
┌──────────────┐
|
|
112
|
+
│ Deliverables │ English, for humans
|
|
113
|
+
└──────────────┘
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
A single `/dispatch` produces N Executors across one or more Roles. Each Executor emits a Brief downstream and a Deliverable upstream. Every Run is logged. The Principal sees Deliverables and the standup view, never the wire traffic.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Why two channels
|
|
121
|
+
|
|
122
|
+
A single Run produces two artifacts:
|
|
123
|
+
|
|
124
|
+
- The **Brief** is what the next Executor receives. Terse JSON. Schema-validated. Machine-targeted. Cheap to emit, cheap to parse.
|
|
125
|
+
- The **Deliverable** is what you, the Principal, read. Full English. Prose. Institutional memory.
|
|
126
|
+
|
|
127
|
+
These are not intensity tiers. They are different artifacts for different audiences, produced together. The receiving Executor does not parse English; the human does not parse JSON. Each gets the format that earns its place.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## How it runs
|
|
132
|
+
|
|
133
|
+
MetaEnsemble runs entirely on your laptop. Clone the repo, drop the conventions into your local agent runtime configuration, and dispatch. No servers, no cloud accounts, no hosting. Your Ledger, your Executors, your Briefs all live on your filesystem. State is portable: copy the repo and the state directory, and MetaEnsemble runs anywhere the agent runtime is installed.
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Adopting MetaEnsemble in your project
|
|
138
|
+
|
|
139
|
+
MetaEnsemble is project-agnostic by design. Three layers, with project-specific knowledge confined to the project layer:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
metaensemble/ # shipped with MetaEnsemble; project-agnostic
|
|
143
|
+
~/.metaensemble/ # per-engineer preferences; the vendored runtime (runtime/, runtime-versions/); the runner at runtime/bin/me-run
|
|
144
|
+
<your-project>/.metaensemble/ # project-specific state, manifests, and install decisions
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
The adoption flow has two layers — machine-level (`user-setup`) and per-project (`adopt`) — asked separately:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
metaensemble setup # interactive wizard: picks a project, asks for layout, runs the two steps below
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
The wizard lists every Claude Code project on this machine, lets you pick one, asks once for the layout (namespaced or top-level), and then runs `user-setup` and `adopt` in sequence. If you prefer, you can run the two underlying commands explicitly:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
metaensemble user-setup --layout=namespaced # once per machine: vendors runtime to ~/.metaensemble/runtime/, wires commands/hooks/statusline
|
|
157
|
+
# or
|
|
158
|
+
metaensemble user-setup --layout=top-level # same, but slash commands install top-level under ~/.claude/commands/
|
|
159
|
+
|
|
160
|
+
metaensemble adopt # per project: writes <project>/.metaensemble/ and honors install-decisions
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
`user-setup` is global (one layout for the whole machine; re-run with a different layout to switch). `adopt` is per-project and portable — run it once per project you want to register.
|
|
164
|
+
|
|
165
|
+
The project inspection is the load-bearing piece of adoption. It writes two files into `<project>/.metaensemble/`:
|
|
166
|
+
|
|
167
|
+
- A short Markdown report naming what was found, what we recommend, and why.
|
|
168
|
+
- `install-decisions.yaml`, the editable choice surface. Every agent in your setup and every curated Role MetaEnsemble ships gets one entry with a sensible default. It also records the project's memory surfaces (`CLAUDE.md` and friends) so dispatch contracts hand Executors your existing project memory instead of rebuilding it. Read once, edit only what you disagree with.
|
|
169
|
+
|
|
170
|
+
Per-agent decisions span four cases (`collision`, `user_unique`, `curated_relevant`, `curated_optional`) and seven actions (`keep_yours`, `take_ours`, `keep_both`, `preserve`, `convert`, `activate`, `retire`). The installer reads the file and honors every choice. Nothing the user authored is silently converted; the default for every collision is to keep the user's agent.
|
|
171
|
+
|
|
172
|
+
Recovery mirrors the install split. `metaensemble unadopt` reverses one project's adoption: it walks `<project>/.metaensemble/backups/` in reverse, reverses project-scope actions, strips the managed `.gitignore` block, and leaves user-level integration intact. `metaensemble user-teardown` reverses `user-setup` by removing managed `~/.claude/` symlinks and hook entries. Each command accepts `--purge-state` for the matching `.metaensemble/` directory. For a full local rollback after live testing, run `metaensemble reconcile --older-than-minutes 0` first so stranded pending Runs are written to the Ledger, then run `metaensemble unadopt --purge-state` from the project root and `metaensemble user-teardown --purge-state` from anywhere. `metaensemble export-agents` reverse-converts MetaEnsemble Roles back to Claude Code agent files, even when the install's backups directory is missing. Every contract above is tested.
|
|
173
|
+
|
|
174
|
+
Starter packs (`--pack ml`, `--pack web`, `--pack data`) are planned for a future release.
|
|
175
|
+
|
|
176
|
+
If your project lives in an iCloud-synced directory (e.g., `~/Desktop/` with iCloud Desktop & Documents Sync enabled), consider excluding `.venv/` from iCloud sync. iCloud's conflict-resolution against rapid `pip install` file churn produces phantom duplicate files in `site-packages`; MetaEnsemble filters them correctly but they consume iCloud quota and slow installs. `metaensemble doctor` C11 surfaces this state as a WARN with remediation. The same caveat applies more strongly to `.metaensemble/state/`: when iCloud places `department.db` into a dataless placeholder state, SQLite's `open()` can fail intermittently and PreToolUse hooks surface as `Agent hook error` with no stderr. The robust fix is to host active MetaEnsemble projects outside iCloud-synced paths, or exclude the project from iCloud Drive. `metaensemble doctor` C4 names this cause when it detects the layout. See [USER-GUIDE.md — When something feels off](./docs/USER-GUIDE.md) for the troubleshooting recipe.
|
|
177
|
+
|
|
178
|
+
See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the full reference. See [ARCHITECTURE.md §4 — Portability](./docs/ARCHITECTURE.md) for the layering, merge order, and the hard rule that keeps Core project-agnostic.
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## Status
|
|
183
|
+
|
|
184
|
+
v0.2.0. All core phases complete and tested:
|
|
185
|
+
|
|
186
|
+
- Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
|
|
187
|
+
- Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
|
|
188
|
+
- Principal-facing surface: seven slash commands plus CLI subcommands including `metaensemble setup`, `metaensemble user-setup`, `metaensemble adopt`, `metaensemble unadopt`, `metaensemble user-teardown`, `metaensemble reconcile`, `metaensemble eval`, `metaensemble stats`, and `metaensemble projects`.
|
|
189
|
+
- Multi-instance patterns (fanout / consensus / shadow / peer-review) with the `N ≥ 2` guard enforced deterministically by the PreToolUse marker hook.
|
|
190
|
+
- Installer with idempotent re-runs, explicit purge modes, and a residue report after every uninstall.
|
|
191
|
+
- Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
|
|
192
|
+
- Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
|
|
193
|
+
- Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
|
|
194
|
+
- Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The shipped replay pack is a non-empirical bootstrap fixture. Live smoke/full runs are wired for side-effect-free classification-smoke checks; calibration and baseline-superiority claims still require larger labeled/fixture sets.
|
|
195
|
+
|
|
196
|
+
v0.2.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
|
|
197
|
+
|
|
198
|
+
See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
|
|
199
|
+
Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## Where to start
|
|
204
|
+
|
|
205
|
+
- **[ARCHITECTURE.md](./docs/ARCHITECTURE.md)** — the layered design, the data model, the lifecycle, what MetaEnsemble is and is not.
|
|
206
|
+
- **[USER-GUIDE.md](./docs/USER-GUIDE.md)** — a friendly Principal guide for day-one users.
|
|
207
|
+
- **[PERFORMANCE.md](./docs/PERFORMANCE.md)** — the binding engineering contract: token budgets, time budgets, query rules, and CI-gated benchmarks. Required reading before changing performance-sensitive code.
|
|
208
|
+
- **[RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md)** — artifact, security, installer, and live-eval gates for publishing a release.
|
|
209
|
+
- **[GLOSSARY.md](./docs/GLOSSARY.md)** — every term defined precisely, every analog named.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Operating principles
|
|
214
|
+
|
|
215
|
+
Three values drive every design choice in MetaEnsemble:
|
|
216
|
+
|
|
217
|
+
1. **Conserve the budget.** The constraint is context-window exhaustion, not dollars. Per-Executor model tiering, terse wire format, schema-driven handoffs that eliminate re-search — all designed to fit more useful work in fewer tokens.
|
|
218
|
+
2. **Move fast.** Parallel dispatch is a primitive, not a workaround. Hooks fire on lifecycle events automatically. The Principal never types boilerplate.
|
|
219
|
+
3. **Hold the line on quality.** Speed and budget never come at the cost of standards. The schema layer enforces correctness; the Ledger enforces accountability; the Deliverable channel preserves institutional memory at full fidelity.
|
|
220
|
+
|
|
221
|
+
If a proposed feature compromises any of these three, it does not ship.
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# MetaEnsemble
|
|
2
|
+
|
|
3
|
+
**Stable identity, typed contracts, and observable runs for ensembles of cognitive agents.**
|
|
4
|
+
|
|
5
|
+
MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
|
|
6
|
+
|
|
7
|
+
**v0.2.0 status:** feedback-first release. The software records and gates local agent work, but measured quality-per-token improvements remain a product hypothesis until the live evaluation set is larger and fully baseline-comparable. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md).
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Why MetaEnsemble exists
|
|
12
|
+
|
|
13
|
+
Coordinating multiple cognitive agents tends to fail in the same three places:
|
|
14
|
+
|
|
15
|
+
1. **No stable identity.** Each agent invocation is anonymous. No way to say "follow up with the same Executor next week."
|
|
16
|
+
2. **No typed handoffs.** Context passes between agents as free-form prose. Every receiver re-derives state by searching, re-reading, re-grepping.
|
|
17
|
+
3. **No observability.** Token spend, model choice, outcome — nothing captured per run. Optimization is guesswork.
|
|
18
|
+
|
|
19
|
+
MetaEnsemble fixes all three at the substrate, not as features. Every primitive in the system carries an ID, every transport is schema-validated, every execution lands in the Ledger.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## What MetaEnsemble gives you
|
|
24
|
+
|
|
25
|
+
- **Persistent identities.** Every Executor has a UUIDv7 and a short alias (`arch-7b3`). Resume any past Executor across sessions with `/relaunch arch-7b3`.
|
|
26
|
+
- **Typed contracts.** Handoffs travel as YAML Manifests validated against a JSON Schema. Inter-Executor messages travel as terse JSON Briefs. No prose context-injection, no re-search on the receiving side.
|
|
27
|
+
- **Observable runs.** Append-only Ledger (SQLite live, JSONL mirror for replay) records every Run with token cost, requested model tier, runtime-observed model when available, outcome, and links to its Deliverable.
|
|
28
|
+
- **MetaEnsemble dispatch.** Spawn N Executors from one Role spec for parallel hypothesis exploration, consensus review, or fan-out implementation. Default is N=1; multi-instance is opt-in and currently validated at the planning/protocol layer.
|
|
29
|
+
- **Cross-session continuity.** An Executor's identity is a Ledger row, not a live process. Relaunch is cheap (last Brief + last Deliverable summary) by default, deep (`--full`) when needed.
|
|
30
|
+
- **Two-channel design.** Machine-to-machine traffic (Briefs) stays terse and structured. Human-facing output (Deliverables) stays full English. Same Run produces both. No "compression tier" knob to misset.
|
|
31
|
+
- **Threshold-based cost gating.** The Coordinator auto-decides cheap, reversible work. It surfaces only the calls that warrant Principal judgment, in a structured options table — never as conversational back-and-forth.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Primitives
|
|
36
|
+
|
|
37
|
+
| Term | Shape | What it is |
|
|
38
|
+
|---|---|---|
|
|
39
|
+
| **Principal** | The human running the system | The person who dispatches work and approves above-threshold decisions. Maps to the IAM Principal concept. |
|
|
40
|
+
| **Coordinator** | The main agent in the active session | Plans Tasks, dispatches Executors, validates contracts, synthesizes Deliverables. Maps to the Kafka / ZooKeeper / Cassandra coordinator pattern. |
|
|
41
|
+
| **Role** | Markdown file with frontmatter spec | The Job Description. Declarative, versioned. Maps to a Kubernetes Deployment spec or IAM Role. |
|
|
42
|
+
| **Executor** | Row in the registry, identified by UUIDv7 + alias | A live instance of a Role. Multiple per Role per Task. Survives sessions. Maps to a Spark Executor or K8s Pod. |
|
|
43
|
+
| **Task** | Unit of work | What the Principal asks the ensemble to do. Has dependencies, expected deliverables, budget. |
|
|
44
|
+
| **Run** | Row in the Ledger | One execution attempt by one Executor for one Task. Maps to an MLflow run. |
|
|
45
|
+
| **Brief** | Schema-validated JSON | Wire-format message between Executors. Terse, machine-targeted. |
|
|
46
|
+
| **Manifest** | Schema-validated YAML | Handoff contract. Typed pointers to files, line ranges, schemas, prior runs. Maps to a dbt or OpenAPI manifest. |
|
|
47
|
+
| **Deliverable** | Markdown report | Human-readable output. English prose. Institutional memory. |
|
|
48
|
+
| **Ledger** | SQLite + JSONL mirror | Append-only log of every Run. Queryable, replayable. Maps to MLflow tracking. |
|
|
49
|
+
| **Registry** | View over the Ledger + Executor table | Current-state snapshot. Live Executors, open Tasks, dependencies. Maps to a service-mesh control-plane view. |
|
|
50
|
+
| **Dispatch** | Verb / slash command | The act of launching N Executors of a Role for a Task. |
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## High-level flow
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
┌─────────────────────┐
|
|
58
|
+
│ Principal │ (you)
|
|
59
|
+
└──────────┬──────────┘
|
|
60
|
+
│ intent
|
|
61
|
+
┌──────────▼──────────┐
|
|
62
|
+
│ Coordinator │ plans, dispatches, synthesizes
|
|
63
|
+
└─────┬────────┬──────┘
|
|
64
|
+
│ │
|
|
65
|
+
┌─────────────┘ └───────────┐
|
|
66
|
+
│ │
|
|
67
|
+
┌────▼─────────┐ ┌───────▼──────┐
|
|
68
|
+
│ Role: backend│ │ Role: review │
|
|
69
|
+
│ spec file │ │ spec file │
|
|
70
|
+
└────┬─────────┘ └───────┬──────┘
|
|
71
|
+
│ dispatch N=2 │ dispatch N=3
|
|
72
|
+
┌────┴────┐ ┌─────┼─────┐
|
|
73
|
+
▼ ▼ ▼ ▼ ▼
|
|
74
|
+
┌─────┐ ┌─────┐ ┌────┐┌────┐┌────┐
|
|
75
|
+
│be-1 │ │be-2 │ │rv-1││rv-2││rv-3│
|
|
76
|
+
└──┬──┘ └──┬──┘ └─┬──┘└─┬──┘└─┬──┘
|
|
77
|
+
│ Brief │ Brief │ │ │
|
|
78
|
+
▼ ▼ ▼ ▼ ▼
|
|
79
|
+
┌────────────────────────────────────────────────┐
|
|
80
|
+
│ Ledger (SQLite + JSONL) │
|
|
81
|
+
└────────────────────────────────────────────────┘
|
|
82
|
+
│
|
|
83
|
+
▼
|
|
84
|
+
┌──────────────┐
|
|
85
|
+
│ Deliverables │ English, for humans
|
|
86
|
+
└──────────────┘
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
A single `/dispatch` produces N Executors across one or more Roles. Each Executor emits a Brief downstream and a Deliverable upstream. Every Run is logged. The Principal sees Deliverables and the standup view, never the wire traffic.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Why two channels
|
|
94
|
+
|
|
95
|
+
A single Run produces two artifacts:
|
|
96
|
+
|
|
97
|
+
- The **Brief** is what the next Executor receives. Terse JSON. Schema-validated. Machine-targeted. Cheap to emit, cheap to parse.
|
|
98
|
+
- The **Deliverable** is what you, the Principal, read. Full English. Prose. Institutional memory.
|
|
99
|
+
|
|
100
|
+
These are not intensity tiers. They are different artifacts for different audiences, produced together. The receiving Executor does not parse English; the human does not parse JSON. Each gets the format that earns its place.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## How it runs
|
|
105
|
+
|
|
106
|
+
MetaEnsemble runs entirely on your laptop. Clone the repo, drop the conventions into your local agent runtime configuration, and dispatch. No servers, no cloud accounts, no hosting. Your Ledger, your Executors, your Briefs all live on your filesystem. State is portable: copy the repo and the state directory, and MetaEnsemble runs anywhere the agent runtime is installed.
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Adopting MetaEnsemble in your project
|
|
111
|
+
|
|
112
|
+
MetaEnsemble is project-agnostic by design. Three layers, with project-specific knowledge confined to the project layer:
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
metaensemble/ # shipped with MetaEnsemble; project-agnostic
|
|
116
|
+
~/.metaensemble/ # per-engineer preferences; the vendored runtime (runtime/, runtime-versions/); the runner at runtime/bin/me-run
|
|
117
|
+
<your-project>/.metaensemble/ # project-specific state, manifests, and install decisions
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The adoption flow has two steps — machine-level `user-setup` and per-project `adopt` — which the setup wizard runs in sequence:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
metaensemble setup # interactive wizard: picks a project, asks for layout, runs the two steps below
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
The wizard lists every Claude Code project on this machine, lets you pick one, asks once for the layout (namespaced or top-level), and then runs `user-setup` and `adopt` in sequence. The two underlying commands are explicit if you prefer them:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
metaensemble user-setup --layout=namespaced # once per machine: vendors runtime to ~/.metaensemble/runtime/, wires commands/hooks/statusline
|
|
130
|
+
# or
|
|
131
|
+
metaensemble user-setup --layout=top-level # same, but slash commands install top-level under ~/.claude/commands/
|
|
132
|
+
|
|
133
|
+
metaensemble adopt # per project: writes <project>/.metaensemble/ and honors install-decisions
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
`user-setup` is global (one layout for the whole machine; re-run with a different layout to switch). `adopt` is per-project and portable — run it once per project you want to register.
|
|
137
|
+
|
|
138
|
+
The inspection is the load-bearing piece. It writes two files into `<project>/.metaensemble/`:
|
|
139
|
+
|
|
140
|
+
- A short Markdown report naming what was found, what we recommend, and why.
|
|
141
|
+
- `install-decisions.yaml`, the editable choice surface. Every agent in your setup and every curated Role MetaEnsemble ships gets one entry with a sensible default. It also records the project's memory surfaces (`CLAUDE.md` and friends) so dispatch contracts hand Executors your existing project memory instead of rebuilding it. Read once, edit only what you disagree with.
|
|
142
|
+
|
|
143
|
+
Per-agent decisions span four cases (`collision`, `user_unique`, `curated_relevant`, `curated_optional`) and seven actions (`keep_yours`, `take_ours`, `keep_both`, `preserve`, `convert`, `activate`, `retire`). The installer reads the file and honors every choice. Nothing the user authored is silently converted; the default for every collision is to keep the user's agent.
|
|
144
|
+
|
|
145
|
+
Recovery mirrors the install split. `metaensemble unadopt` reverses one project's adoption: it walks `<project>/.metaensemble/backups/` in reverse, reverses project-scope actions, strips the managed `.gitignore` block, and leaves user-level integration intact. `metaensemble user-teardown` reverses `user-setup` by removing managed `~/.claude/` symlinks and hook entries. Each command accepts `--purge-state` for the matching `.metaensemble/` directory. For a full local rollback after live testing, run `metaensemble reconcile --older-than-minutes 0` first so stranded pending Runs are written to the Ledger, then run `metaensemble unadopt --purge-state` from the project root and `metaensemble user-teardown --purge-state` from anywhere. `metaensemble export-agents` reverse-converts MetaEnsemble Roles back to Claude Code agent files, even when the install's backups directory is missing. Every contract above is tested.
|
|
146
|
+
|
|
147
|
+
Starter packs (`--pack ml`, `--pack web`, `--pack data`) are planned for a future release.
|
|
148
|
+
|
|
149
|
+
If your project lives in an iCloud-synced directory (e.g., `~/Desktop/` with iCloud Desktop & Documents Sync enabled), consider excluding `.venv/` from iCloud sync. iCloud's conflict-resolution against rapid `pip install` file churn produces phantom duplicate files in `site-packages`; MetaEnsemble filters them correctly but they consume iCloud quota and slow installs. `metaensemble doctor` C11 surfaces this state as a WARN with remediation. The same caveat applies more strongly to `.metaensemble/state/`: when iCloud places `department.db` into a dataless placeholder state, SQLite's `open()` can fail intermittently and PreToolUse hooks surface as `Agent hook error` with no stderr. The robust fix is to host active MetaEnsemble projects outside iCloud-synced paths, or exclude the project from iCloud Drive. `metaensemble doctor` C4 names this cause when it detects the layout. See [USER-GUIDE.md — When something feels off](./docs/USER-GUIDE.md) for the troubleshooting recipe.
|
|
150
|
+
|
|
151
|
+
See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the full reference. See [ARCHITECTURE.md §4 — Portability](./docs/ARCHITECTURE.md) for the layering, merge order, and the hard rule that keeps Core project-agnostic.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Status
|
|
156
|
+
|
|
157
|
+
v0.2.0. All core phases complete and tested:
|
|
158
|
+
|
|
159
|
+
- Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
|
|
160
|
+
- Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
|
|
161
|
+
- Principal-facing surface: seven slash commands plus CLI subcommands including `metaensemble setup`, `metaensemble user-setup`, `metaensemble adopt`, `metaensemble unadopt`, `metaensemble user-teardown`, `metaensemble reconcile`, `metaensemble eval`, `metaensemble stats`, and `metaensemble projects`.
|
|
162
|
+
- Multi-instance patterns (fanout / consensus / shadow / peer-review) with the `N ≥ 2` guard enforced deterministically by the PreToolUse marker hook.
|
|
163
|
+
- Installer with idempotent re-runs, explicit purge modes, and a residue report after every uninstall.
|
|
164
|
+
- Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
|
|
165
|
+
- Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
|
|
166
|
+
- Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
|
|
167
|
+
- Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The shipped replay pack is a non-empirical bootstrap fixture. Live smoke/full runs are wired for side-effect-free classification-smoke checks; calibration and baseline-superiority claims still require larger labeled/fixture sets.
|
|
168
|
+
|
|
169
|
+
v0.2.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
|
|
170
|
+
|
|
171
|
+
See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
|
|
172
|
+
Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Where to start
|
|
177
|
+
|
|
178
|
+
- **[ARCHITECTURE.md](./docs/ARCHITECTURE.md)** — the layered design, the data model, the lifecycle, what MetaEnsemble is and is not.
|
|
179
|
+
- **[USER-GUIDE.md](./docs/USER-GUIDE.md)** — a friendly Principal guide for day-one users.
|
|
180
|
+
- **[PERFORMANCE.md](./docs/PERFORMANCE.md)** — the binding engineering contract: token budgets, time budgets, query rules, and CI-gated benchmarks. Required reading before changing performance-sensitive code.
|
|
181
|
+
- **[RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md)** — artifact, security, installer, and live-eval gates for publishing a release.
|
|
182
|
+
- **[GLOSSARY.md](./docs/GLOSSARY.md)** — every term defined precisely, every analog named.
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Operating principles
|
|
187
|
+
|
|
188
|
+
Three values drive every design choice in MetaEnsemble:
|
|
189
|
+
|
|
190
|
+
1. **Conserve the budget.** The constraint is window exhaustion, not dollars. Per-Executor model tiering, terse wire format, schema-driven handoffs that eliminate re-search — all designed to fit more useful work in fewer tokens.
|
|
191
|
+
2. **Move fast.** Parallel dispatch is a primitive, not a workaround. Hooks fire on lifecycle events automatically. The Principal never types boilerplate.
|
|
192
|
+
3. **Hold the line on quality.** Speed and budget never come at the cost of standards. The schema layer enforces correctness; the Ledger enforces accountability; the Deliverable channel preserves institutional memory at full fidelity.
|
|
193
|
+
|
|
194
|
+
If a proposed feature compromises any of these three, it does not ship.
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# MetaEnsemble Evaluation Harness
|
|
2
|
+
|
|
3
|
+
The harness exists so the quality-per-token claim — *the system around
|
|
4
|
+
the model is strong enough to deploy the competence the model already
|
|
5
|
+
has* — can be tested rather than asserted. Replay keeps the harness
|
|
6
|
+
deterministic in CI. Smoke and full tiers make live, side-effect-free
|
|
7
|
+
Claude Code calls and write measured reports under the caller's
|
|
8
|
+
`evals/reports/`.
|
|
9
|
+
The shipped classification data is one narrow smoke fixture, not a calibration set
|
|
10
|
+
and not a statement of product scope. MetaEnsemble is project-agnostic.
|
|
11
|
+
|
|
12
|
+
## Directory layout
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
evals/
|
|
16
|
+
├── README.md # this file
|
|
17
|
+
├── configs/
|
|
18
|
+
│ └── default.yaml # eval-cycle parameters (seeds, budget, model routing)
|
|
19
|
+
├── datasets/
|
|
20
|
+
│ ├── suite_a/ # 8 software-engineering tasks
|
|
21
|
+
│ │ ├── README.md
|
|
22
|
+
│ │ └── tasks.yaml
|
|
23
|
+
│ └── suite_b/ # domain-specific classification smoke set
|
|
24
|
+
│ ├── README.md
|
|
25
|
+
│ └── items.yaml
|
|
26
|
+
├── baselines/ # B1 / B2 / B3 / B4 baseline definitions
|
|
27
|
+
│ ├── b1_single_agent.yaml
|
|
28
|
+
│ ├── b2_single_agent_prompted.yaml
|
|
29
|
+
│ ├── b3_subagent_default.yaml
|
|
30
|
+
│ └── b4_best_prompt.yaml # best-single-agent baseline
|
|
31
|
+
├── cassettes/ # replay fixtures; bootstrap pack is non-empirical
|
|
32
|
+
├── runners/ # cell × seed executors
|
|
33
|
+
│ ├── __init__.py
|
|
34
|
+
│ ├── api.py # tiered runner: replay / smoke / full
|
|
35
|
+
│ ├── metrics.py # Wilson CI, pass@budget, quality_per_1k_tokens
|
|
36
|
+
│ └── replay.py # cassette-based PR runner
|
|
37
|
+
└── reports/ # generated reports per cycle (gitignored)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Tiered evaluation
|
|
41
|
+
|
|
42
|
+
| Tier | When it runs | Live API calls | Budget |
|
|
43
|
+
|---|---|---|---|
|
|
44
|
+
| Replay | Every PR; reads recorded cassette responses. | No. | $0 |
|
|
45
|
+
| Smoke | Nightly cron or local preflight. 1 seed, `MM_full`, classification smoke set only. | Yes. | ~$0.30 default cap |
|
|
46
|
+
| Full | Release-gated. Defaults to 5 seeds × every configured cell. | Yes. | Principal-set cap |
|
|
47
|
+
|
|
48
|
+
The PR (replay) tier exists to keep regressions cheap to catch; the full tier
|
|
49
|
+
exists to certify a release. A release candidate is not allowed
|
|
50
|
+
to claim quality-per-token superiority unless the same report includes
|
|
51
|
+
baseline cells and MetaEnsemble cells over the same task set.
|
|
52
|
+
|
|
53
|
+
The release ships a compact `evals/cassettes/bootstrap.jsonl` pack so the
|
|
54
|
+
replay tier works in a clean checkout. That pack is deliberately marked
|
|
55
|
+
non-empirical; it verifies the harness mechanics, not MetaEnsemble's
|
|
56
|
+
quality claim. Live smoke/full reports are empirical for the cells and
|
|
57
|
+
datasets actually run; the report notes any skipped deferred fixtures.
|
|
58
|
+
|
|
59
|
+
## Headline metrics
|
|
60
|
+
|
|
61
|
+
The harness reports three co-primary metrics per cell:
|
|
62
|
+
|
|
63
|
+
| Metric | Definition |
|
|
64
|
+
|---|---|
|
|
65
|
+
| `pass@budget` | Pass-rate against the cell's per-task budget. A "win" that overspends does not count. |
|
|
66
|
+
| `quality_per_1k_tokens` | Average score across passing runs divided by (tokens / 1000), i.e. score per thousand tokens. Directly tests the efficiency thesis. |
|
|
67
|
+
| `orchestration_overhead_ratio` | MetaEnsemble token cost over the best single-agent baseline's token cost, on the same task. |
|
|
68
|
+
|
|
69
|
+
Plus the supporting metrics in `runners/metrics.py`:
|
|
70
|
+
`failed_run_token_waste`, `time_to_useful_deliverable`,
|
|
71
|
+
`minimum_useful_answer_score`.
|
|
72
|
+
|
|
73
|
+
For live reports, include these context fields in the release note or
|
|
74
|
+
system-card link: exact model IDs when the runtime exposes them, seed
|
|
75
|
+
count, cells run, skipped fixtures, total observed tokens, estimated vs
|
|
76
|
+
observed token error where available, and any cost-gate or Python
|
|
77
|
+
deliverable-check interventions.
|
|
78
|
+
|
|
79
|
+
## Suite A — software engineering (8 tasks)
|
|
80
|
+
|
|
81
|
+
Eight tasks drawn from the project's own backlog and from small
|
|
82
|
+
open-source repos. Each task has:
|
|
83
|
+
|
|
84
|
+
- A one-paragraph description (English).
|
|
85
|
+
- A frozen-commit starting state (commit SHA of the project under test).
|
|
86
|
+
- Graded acceptance criteria (build passes, test count ≥ N, lint
|
|
87
|
+
clean, manifest exists, deliverable file present).
|
|
88
|
+
|
|
89
|
+
See `evals/datasets/suite_a/tasks.yaml` for the current set.
|
|
90
|
+
|
|
91
|
+
The current Suite-A rows still contain deferred fixture SHAs. The live
|
|
92
|
+
full tier names those skipped tasks in the report rather than treating
|
|
93
|
+
them as passed or failed. Release certification across software tasks
|
|
94
|
+
requires replacing the deferred SHAs with real fixture repositories.
|
|
95
|
+
|
|
96
|
+
## Suite B — domain-specific classification (12 items, *smoke only*)
|
|
97
|
+
|
|
98
|
+
Twelve items is too few for calibration claims. The 12-item set in
|
|
99
|
+
`evals/datasets/suite_b/items.yaml` is the **smoke suite** that proves
|
|
100
|
+
the pipeline end-to-end. It is intentionally narrow; it does not make
|
|
101
|
+
MetaEnsemble domain-specific. Any release claim about a particular domain
|
|
102
|
+
needs its own independently labeled calibration set. The system card states
|
|
103
|
+
this limitation explicitly so no calibration claim is implied by the smoke
|
|
104
|
+
set.
|
|
105
|
+
|
|
106
|
+
## Running the harness
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# PR-tier replay (no API calls).
|
|
110
|
+
metaensemble eval --tier replay --cells all
|
|
111
|
+
|
|
112
|
+
# Nightly smoke (one cell × one seed × classification smoke set only).
|
|
113
|
+
metaensemble eval --tier smoke
|
|
114
|
+
|
|
115
|
+
# Constrained full-tier check.
|
|
116
|
+
metaensemble eval --tier full --allow-live --cells MM_full --seeds 1 --budget-usd 0.30
|
|
117
|
+
|
|
118
|
+
# Release-gated full cycle once fixture SHAs and budget are set.
|
|
119
|
+
metaensemble eval --tier full --allow-live --cells all --seeds 5 --budget-usd 0.30
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
The output report lands in the current working directory at
|
|
123
|
+
`evals/reports/<UTC-date>-<tier>.md` and is linked from
|
|
124
|
+
`PERFORMANCE.md §4` once a cycle ships.
|
|
125
|
+
|
|
126
|
+
Supported flags:
|
|
127
|
+
|
|
128
|
+
| Flag | Meaning |
|
|
129
|
+
|---|---|
|
|
130
|
+
| `--cells all` or `--cells A,B` | Select all cells or a comma-separated subset. Smoke defaults to `MM_full`; replay/full default to all. |
|
|
131
|
+
| `--seeds N` | Override seed count. Smoke defaults to 1; replay/full default to `evals/configs/default.yaml`. |
|
|
132
|
+
| `--budget-usd X` | Override the live-tier per-run budget shown in preflight. |
|
|
133
|
+
| `--allow-live` | Required before the full tier proceeds past preflight. |
|
|
134
|
+
|
|
135
|
+
## Sign-off thresholds (D-8, D-9)
|
|
136
|
+
|
|
137
|
+
D-8 and D-9 are numerical full-tier release gates:
|
|
138
|
+
|
|
139
|
+
- **D-8 orchestration overhead**: any measured MetaEnsemble cell above
|
|
140
|
+
`2.0x` the best single-agent prompt baseline (`B4`) blocks the full
|
|
141
|
+
tier's ship verdict.
|
|
142
|
+
- **D-9 failed-run waste**: failed and budget-exceeded runs above `10%`
|
|
143
|
+
of total evaluated tokens block the full tier's ship verdict.
|
|
144
|
+
|
|
145
|
+
The thresholds live in `evals/configs/default.yaml`. If a full run does
|
|
146
|
+
not include the `B4_best_prompt` baseline, D-8 is reported as not
|
|
147
|
+
evaluated rather than silently passing.
|
|
File without changes
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Eval Cassettes
|
|
2
|
+
|
|
3
|
+
`bootstrap.jsonl` is a v0.1.0 replay fixture pack. It exists so the
|
|
4
|
+
zero-cost replay tier exercises task loading, cell selection, metrics,
|
|
5
|
+
and report rendering in a clean checkout.
|
|
6
|
+
|
|
7
|
+
It is not empirical benchmark evidence. Each record is marked
|
|
8
|
+
`source: bootstrap_fixture_not_empirical`; the first live smoke/full
|
|
9
|
+
cycle should replace or supplement this pack with recorded cassette
|
|
10
|
+
outputs from real runs.
|