metaensemble 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {metaensemble-0.2.0 → metaensemble-0.3.0}/PKG-INFO +5 -5
- {metaensemble-0.2.0 → metaensemble-0.3.0}/README.md +4 -4
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/README.md +29 -14
- metaensemble-0.3.0/evals/datasets/suite_a/tasks.yaml +157 -0
- metaensemble-0.3.0/evals/fixtures/__init__.py +7 -0
- metaensemble-0.3.0/evals/fixtures/build.py +140 -0
- metaensemble-0.3.0/evals/fixtures/legacy/legacy/__init__.py +1 -0
- metaensemble-0.3.0/evals/fixtures/legacy/legacy/big_module.py +680 -0
- metaensemble-0.3.0/evals/fixtures/legacy/test_big_module.py +147 -0
- metaensemble-0.3.0/evals/fixtures/paginator/pagination.py +43 -0
- metaensemble-0.3.0/evals/fixtures/paginator/test_pagination.py +24 -0
- metaensemble-0.3.0/evals/runners/acceptance.py +528 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/runners/api.py +4 -1
- metaensemble-0.3.0/evals/runners/suite_a.py +552 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/__init__.py +1 -1
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/cli.py +78 -13
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/PKG-INFO +5 -5
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/SOURCES.txt +9 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/pyproject.toml +1 -1
- metaensemble-0.2.0/evals/datasets/suite_a/tasks.yaml +0 -123
- {metaensemble-0.2.0 → metaensemble-0.3.0}/LICENSE +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/__init__.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/cassettes/README.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/cassettes/bootstrap.jsonl +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/configs/default.yaml +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/datasets/__init__.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/datasets/suite_b/items.yaml +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/runners/__init__.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/runners/metrics.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/dispatch.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/executors.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/ledger.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/limits.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/perf.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/relaunch.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/standup.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/config/budgets.example.yaml +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/config/quality.example.yaml +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/__init__.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/_common.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/deliverable_sync.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/file_event.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/post_task.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/pre_task.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/session_start.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/session_summary.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/subagent_stop.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/__init__.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/config.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/cost_gate.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/dispatch.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/doctor.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/file_events.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/ids.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/installer.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/ledger.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/manifest.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/native_state.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/overlaps.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/quality_gate.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/quality_runners.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/reconcile.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/recording.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/relaunch.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/runtime_payload.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/runtime_state.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/sidecar.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/topology.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/transcript.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/output-styles/deliverable.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/output-styles/wire.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/architect.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/backend.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/code-quality.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/data-engineer.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/devops.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/docs.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/frontend.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/ml-engineer.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/test-engineer.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/schemas/brief.schema.json +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/schemas/manifest.schema.json +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/schemas/role.schema.json +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/skills/metaensemble-protocol/SKILL.md +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/state/migrations/001_init.sql +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/state/migrations/002_outcome_extended.sql +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/state/migrations/003_run_provenance.sql +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/statusline/me_status.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/__init__.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/executors.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/ledger.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/limits.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/perf.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/standup.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/stats.py +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/dependency_links.txt +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/entry_points.txt +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/requires.txt +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/top_level.txt +0 -0
- {metaensemble-0.2.0 → metaensemble-0.3.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: metaensemble
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: A typed runtime for ensembles of cognitive agents
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -31,7 +31,7 @@ Dynamic: license-file
|
|
|
31
31
|
|
|
32
32
|
MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
|
|
33
33
|
|
|
34
|
-
**v0.
|
|
34
|
+
**v0.3.0 status:** feedback-first release, now with a measured calibration. The first full-tier cycle (320 live runs, 8 cells × 8 software tasks × 5 seeds) found acceptance-quality parity with strong single-agent baselines at a 1.55× token premium on tasks that fit one context, that every protocol primitive is load-bearing (ablations degrade it — the Manifest most), and that the full protocol more than doubles the runtime's default-subagent baseline. No quality-per-token superiority is claimed for this task class. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) and `evals/reports/20260704T140844Z-full.md`.
|
|
35
35
|
|
|
36
36
|
---
|
|
37
37
|
|
|
@@ -181,7 +181,7 @@ See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the f
|
|
|
181
181
|
|
|
182
182
|
## Status
|
|
183
183
|
|
|
184
|
-
v0.2.0. All core phases complete and tested:
|
|
184
|
+
v0.3.0. All core phases complete and tested:
|
|
185
185
|
|
|
186
186
|
- Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
|
|
187
187
|
- Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
|
|
@@ -191,9 +191,9 @@ v0.2.0. All core phases complete and tested:
|
|
|
191
191
|
- Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
|
|
192
192
|
- Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
|
|
193
193
|
- Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
|
|
194
|
-
- Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The
|
|
194
|
+
- Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The full tier runs Suite-A software tasks live in sandboxed per-run workspaces with graded acceptance and honest cell isolation; the first calibration cycle is checked in at `evals/reports/20260704T140844Z-full.md`. Domain-classification calibration still requires an independently labeled set.
|
|
195
195
|
|
|
196
|
-
v0.
|
|
196
|
+
v0.3.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
|
|
197
197
|
|
|
198
198
|
See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
|
|
199
199
|
Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
|
|
6
6
|
|
|
7
|
-
**v0.
|
|
7
|
+
**v0.3.0 status:** feedback-first release, now with a measured calibration. The first full-tier cycle (320 live runs, 8 cells × 8 software tasks × 5 seeds) found acceptance-quality parity with strong single-agent baselines at a 1.55× token premium on tasks that fit one context, that every protocol primitive is load-bearing (ablations degrade it — the Manifest most), and that the full protocol more than doubles the runtime's default-subagent baseline. No quality-per-token superiority is claimed for this task class. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) and `evals/reports/20260704T140844Z-full.md`.
|
|
8
8
|
|
|
9
9
|
---
|
|
10
10
|
|
|
@@ -154,7 +154,7 @@ See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the f
|
|
|
154
154
|
|
|
155
155
|
## Status
|
|
156
156
|
|
|
157
|
-
v0.2.0. All core phases complete and tested:
|
|
157
|
+
v0.3.0. All core phases complete and tested:
|
|
158
158
|
|
|
159
159
|
- Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
|
|
160
160
|
- Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
|
|
@@ -164,9 +164,9 @@ v0.2.0. All core phases complete and tested:
|
|
|
164
164
|
- Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
|
|
165
165
|
- Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
|
|
166
166
|
- Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
|
|
167
|
-
- Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The
|
|
167
|
+
- Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The full tier runs Suite-A software tasks live in sandboxed per-run workspaces with graded acceptance and honest cell isolation; the first calibration cycle is checked in at `evals/reports/20260704T140844Z-full.md`. Domain-classification calibration still requires an independently labeled set.
|
|
168
168
|
|
|
169
|
-
v0.
|
|
169
|
+
v0.3.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
|
|
170
170
|
|
|
171
171
|
See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
|
|
172
172
|
Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
|
|
@@ -23,20 +23,26 @@ evals/
|
|
|
23
23
|
│ └── suite_b/ # domain-specific classification smoke set
|
|
24
24
|
│ ├── README.md
|
|
25
25
|
│ └── items.yaml
|
|
26
|
-
├──
|
|
27
|
-
│ ├──
|
|
28
|
-
│
|
|
29
|
-
│ ├──
|
|
30
|
-
│ └──
|
|
26
|
+
├── fixtures/ # deterministic Suite-A fixture repos
|
|
27
|
+
│ ├── build.py # single-commit builder; SHAs identical on every
|
|
28
|
+
│ │ # machine (pinned author/committer/date)
|
|
29
|
+
│ ├── paginator/ # oss-fixture-paginator source tree
|
|
30
|
+
│ └── legacy/ # oss-fixture-legacy source tree
|
|
31
31
|
├── cassettes/ # replay fixtures; bootstrap pack is non-empirical
|
|
32
32
|
├── runners/ # cell × seed executors
|
|
33
33
|
│ ├── __init__.py
|
|
34
|
-
│ ├── api.py # tiered
|
|
35
|
-
│ ├──
|
|
36
|
-
│
|
|
34
|
+
│ ├── api.py # tiered dispatch: replay / live smoke (suite B)
|
|
35
|
+
│ ├── suite_a.py # live Suite-A: sandboxed workspaces, per-cell
|
|
36
|
+
│ │ # prompts, hook isolation via --setting-sources
|
|
37
|
+
│ ├── acceptance.py # graded acceptance checkers (build, tests, lint,
|
|
38
|
+
│ │ # API surface, links, perf, CI matrix)
|
|
39
|
+
│ └── metrics.py # Wilson CI, pass@budget, quality_per_1k_tokens
|
|
37
40
|
└── reports/ # generated reports per cycle (gitignored)
|
|
38
41
|
```
|
|
39
42
|
|
|
43
|
+
The cell matrix (B1–B4 baselines, `MM_full`, three ablations) is defined
|
|
44
|
+
in `configs/default.yaml`, not in separate baseline files.
|
|
45
|
+
|
|
40
46
|
## Tiered evaluation
|
|
41
47
|
|
|
42
48
|
| Tier | When it runs | Live API calls | Budget |
|
|
@@ -54,7 +60,7 @@ The release ships a compact `evals/cassettes/bootstrap.jsonl` pack so the
|
|
|
54
60
|
replay tier works in a clean checkout. That pack is deliberately marked
|
|
55
61
|
non-empirical; it verifies the harness mechanics, not MetaEnsemble's
|
|
56
62
|
quality claim. Live smoke/full reports are empirical for the cells and
|
|
57
|
-
datasets actually run
|
|
63
|
+
datasets actually run.
|
|
58
64
|
|
|
59
65
|
## Headline metrics
|
|
60
66
|
|
|
@@ -88,10 +94,18 @@ open-source repos. Each task has:
|
|
|
88
94
|
|
|
89
95
|
See `evals/datasets/suite_a/tasks.yaml` for the current set.
|
|
90
96
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
97
|
+
Every Suite-A row pins a resolved starting SHA: tasks a1/a2 pin the
|
|
98
|
+
deterministic fixture commits from `evals/fixtures/build.py` (the builder
|
|
99
|
+
produces byte-identical trees and therefore identical SHAs on every
|
|
100
|
+
machine), and tasks a3–a8 pin the v0.2.0 release commit of this
|
|
101
|
+
repository, with each description verified true at that SHA. Live runs
|
|
102
|
+
materialize a fresh sandbox workspace per cell × task × seed (local
|
|
103
|
+
clones only — no network), grade the result with
|
|
104
|
+
`evals/runners/acceptance.py`, and keep every workspace on disk beside a
|
|
105
|
+
`run-manifest.jsonl` for post-hoc inspection. Baseline cells run with
|
|
106
|
+
`--setting-sources project,local` so the user-level MetaEnsemble hooks
|
|
107
|
+
are excluded; MM cells run with all setting sources — the cell
|
|
108
|
+
difference is the real orchestration layer, not only the prompt.
|
|
95
109
|
|
|
96
110
|
## Suite B — domain-specific classification (12 items, *smoke only*)
|
|
97
111
|
|
|
@@ -121,7 +135,8 @@ metaensemble eval --tier full --allow-live --cells all --seeds 5 --budget-usd 0.
|
|
|
121
135
|
|
|
122
136
|
The output report lands in the current working directory at
|
|
123
137
|
`evals/reports/<UTC-date>-<tier>.md` and is linked from
|
|
124
|
-
`PERFORMANCE.md §4
|
|
138
|
+
`PERFORMANCE.md §4`. The first full-tier cycle shipped 2026-07-04 and is
|
|
139
|
+
checked in at `evals/reports/20260704T140844Z-full.md`.
|
|
125
140
|
|
|
126
141
|
Supported flags:
|
|
127
142
|
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# Suite A — eight software-engineering tasks.
|
|
2
|
+
#
|
|
3
|
+
# Each task gates the cell's pass-rate against measurable acceptance
|
|
4
|
+
# criteria. The starting state is a frozen Git SHA so re-runs are
|
|
5
|
+
# reproducible; the acceptance criteria are executed by `runners/acceptance.py`
|
|
6
|
+
# after the cell's deliverable lands.
|
|
7
|
+
#
|
|
8
|
+
# Starting states:
|
|
9
|
+
# - `oss-fixture-*` repos are materialized by `evals/fixtures/build.py`;
|
|
10
|
+
# their SHAs are the deterministic single-commit values in
|
|
11
|
+
# `evals.fixtures.build.FIXTURE_SHAS`.
|
|
12
|
+
# - `metaensemble` tasks pin the v0.2.0 release commit
|
|
13
|
+
# (`git rev-parse v0.2.0^{commit}`). Every description below was
|
|
14
|
+
# reconciled against that tree.
|
|
15
|
+
|
|
16
|
+
tasks:
|
|
17
|
+
- id: a1_bugfix_off_by_one
|
|
18
|
+
title: Fix an off-by-one in a paginator
|
|
19
|
+
description: |
|
|
20
|
+
The paginator at `pagination.py:42` returns one fewer item than
|
|
21
|
+
expected on every page boundary. Locate and fix the bug; add a
|
|
22
|
+
regression test.
|
|
23
|
+
starting_repo: oss-fixture-paginator
|
|
24
|
+
starting_sha: cbb6c2178af85ab778dd215379bf0928b6e52268
|
|
25
|
+
acceptance:
|
|
26
|
+
- kind: build_passes
|
|
27
|
+
- kind: test_count_at_least
|
|
28
|
+
value: 5
|
|
29
|
+
- kind: lint_clean
|
|
30
|
+
- kind: file_modified
|
|
31
|
+
path: pagination.py
|
|
32
|
+
- kind: file_modified
|
|
33
|
+
path: test_pagination.py
|
|
34
|
+
|
|
35
|
+
- id: a2_refactor_module
|
|
36
|
+
title: Split a 680-line module into three cohesive files
|
|
37
|
+
description: |
|
|
38
|
+
The module `legacy/big_module.py` (680 lines) has three
|
|
39
|
+
responsibilities: parsing, validation, and rendering of the
|
|
40
|
+
recfile record format. Split it into three files preserving the
|
|
41
|
+
public API listed in `api_manifest.json`. No behavior change.
|
|
42
|
+
starting_repo: oss-fixture-legacy
|
|
43
|
+
starting_sha: c04afa1fb995fc47c53a7336dcb5873c4a4bdeb4
|
|
44
|
+
acceptance:
|
|
45
|
+
- kind: build_passes
|
|
46
|
+
- kind: test_count_at_least
|
|
47
|
+
value: 12
|
|
48
|
+
- kind: api_surface_preserved
|
|
49
|
+
|
|
50
|
+
- id: a3_doc_update
|
|
51
|
+
title: Document rollback verification in the USER-GUIDE
|
|
52
|
+
description: |
|
|
53
|
+
The "When all else fails — recovery" section of `docs/USER-GUIDE.md`
|
|
54
|
+
lists the split rollback commands (`metaensemble unadopt
|
|
55
|
+
--purge-state`, `metaensemble user-teardown --purge-state`) but
|
|
56
|
+
never tells the reader how to confirm a completed rollback.
|
|
57
|
+
`docs/DEPLOYMENT.md` covers this under "Verifying the rollback".
|
|
58
|
+
Add a short paragraph to the recovery section of
|
|
59
|
+
`docs/USER-GUIDE.md` explaining how to verify both rollback
|
|
60
|
+
scopes, with a link to the DEPLOYMENT.md section.
|
|
61
|
+
starting_repo: metaensemble
|
|
62
|
+
starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
|
|
63
|
+
acceptance:
|
|
64
|
+
- kind: markdown_links_resolve
|
|
65
|
+
- kind: file_modified
|
|
66
|
+
path: docs/USER-GUIDE.md
|
|
67
|
+
|
|
68
|
+
- id: a4_test_addition
|
|
69
|
+
title: Add a reconcile provenance test
|
|
70
|
+
description: |
|
|
71
|
+
`metaensemble/tests/test_reconcile.py` has 20 tests covering
|
|
72
|
+
session and stale reconciliation. Reconciliation copies a pending
|
|
73
|
+
sidecar's `brief_in_path` into the recorded Run row, but no test
|
|
74
|
+
asserts it. Add a test that asserts a sidecar's `brief_in_path`
|
|
75
|
+
survives reconciliation into the Run row.
|
|
76
|
+
starting_repo: metaensemble
|
|
77
|
+
starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
|
|
78
|
+
acceptance:
|
|
79
|
+
- kind: build_passes
|
|
80
|
+
- kind: test_count_delta_at_least
|
|
81
|
+
value: 1
|
|
82
|
+
- kind: file_modified
|
|
83
|
+
path: metaensemble/tests/test_reconcile.py
|
|
84
|
+
|
|
85
|
+
- id: a5_design_review
|
|
86
|
+
title: Review the uninstall and rollback design
|
|
87
|
+
description: |
|
|
88
|
+
Read the "Recovery and rollback" section of `docs/DEPLOYMENT.md`,
|
|
89
|
+
including its "Verifying the rollback" and "Full local rollback
|
|
90
|
+
after live testing" subsections, and produce a one-page review at
|
|
91
|
+
`reports/<date>-uninstall-review.md` naming at least three risks
|
|
92
|
+
the documented design does not address. Create the workspace-local
|
|
93
|
+
`reports/` directory if it does not exist.
|
|
94
|
+
starting_repo: metaensemble
|
|
95
|
+
starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
|
|
96
|
+
acceptance:
|
|
97
|
+
- kind: file_exists
|
|
98
|
+
glob: reports/*-uninstall-review.md
|
|
99
|
+
- kind: word_count_at_least
|
|
100
|
+
value: 300
|
|
101
|
+
|
|
102
|
+
- id: a6_security_review
|
|
103
|
+
title: Security-review the transcript walker
|
|
104
|
+
description: |
|
|
105
|
+
`metaensemble/lib/transcript.py` reads the runtime's session
|
|
106
|
+
transcript JSONL from the hook-supplied `transcript_path`. Write
|
|
107
|
+
`reports/<date>-transcript-security.md` listing every defensive
|
|
108
|
+
assumption the walker makes and one concrete attack it survives.
|
|
109
|
+
Create the workspace-local `reports/` directory if it does not
|
|
110
|
+
exist.
|
|
111
|
+
starting_repo: metaensemble
|
|
112
|
+
starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
|
|
113
|
+
acceptance:
|
|
114
|
+
- kind: file_exists
|
|
115
|
+
glob: reports/*-transcript-security.md
|
|
116
|
+
|
|
117
|
+
- id: a7_perf_tune
|
|
118
|
+
title: Tighten the get_window_burn p95 budget
|
|
119
|
+
description: |
|
|
120
|
+
`metaensemble/tests/test_perf_ledger.py` benchmarks
|
|
121
|
+
`get_window_burn` against the module-wide 10ms p95 budget on a
|
|
122
|
+
10k-row Ledger, and `idx_runs_window` already keeps the query
|
|
123
|
+
indexed. Measure the actual headroom, then tighten
|
|
124
|
+
`test_get_window_burn_meets_p95` to assert a dedicated 5ms p95
|
|
125
|
+
budget for `get_window_burn` without loosening any other
|
|
126
|
+
benchmark's budget.
|
|
127
|
+
starting_repo: metaensemble
|
|
128
|
+
starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
|
|
129
|
+
acceptance:
|
|
130
|
+
- kind: build_passes
|
|
131
|
+
- kind: perf_benchmark_passes
|
|
132
|
+
benchmark: test_get_window_burn_meets_p95
|
|
133
|
+
- kind: file_modified
|
|
134
|
+
path: metaensemble/tests/test_perf_ledger.py
|
|
135
|
+
|
|
136
|
+
- id: a8_infra_change
|
|
137
|
+
title: Add the no-quality CI matrix axis
|
|
138
|
+
description: |
|
|
139
|
+
`.github/workflows/ci.yml` runs one test job over a
|
|
140
|
+
python-version matrix, and every leg installs the quality
|
|
141
|
+
runners (ruff, bandit, radon) via the `[test]` extra. Add a
|
|
142
|
+
`no-quality` matrix axis that runs pytest a second time with the
|
|
143
|
+
quality runners absent, and add `@pytest.mark.requires_radon` /
|
|
144
|
+
`requires_bandit` markers to the tests that need those tools so
|
|
145
|
+
the new leg can deselect them.
|
|
146
|
+
starting_repo: metaensemble
|
|
147
|
+
starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
|
|
148
|
+
acceptance:
|
|
149
|
+
- kind: ci_yaml_has_matrix_axis
|
|
150
|
+
axis: no-quality
|
|
151
|
+
|
|
152
|
+
# Every starting_sha above is a resolved, frozen commit: a1/a2 pin the
|
|
153
|
+
# deterministic fixture commits built by `evals/fixtures/build.py`
|
|
154
|
+
# (FIXTURE_SHAS), and a3-a8 pin the v0.2.0 release commit. The loader
|
|
155
|
+
# needs no network round-trip, and
|
|
156
|
+
# `metaensemble/tests/test_eval_fixtures.py` gates this file against
|
|
157
|
+
# placeholder or drifted SHAs.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Suite-A fixture source trees and the deterministic workspace builder.
|
|
2
|
+
|
|
3
|
+
`build.build_fixture` materializes a fixture source tree into a
|
|
4
|
+
single-commit git repository whose SHA is identical on every machine;
|
|
5
|
+
`build.FIXTURE_SHAS` records the expected SHAs that
|
|
6
|
+
`evals/datasets/suite_a/tasks.yaml` pins as `starting_sha`.
|
|
7
|
+
"""
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Materialize Suite-A fixture workspaces as deterministic git repos.
|
|
2
|
+
|
|
3
|
+
Each fixture source tree under ``evals/fixtures/`` is plain files with
|
|
4
|
+
no ``.git``. :func:`build_fixture` copies a tree into a destination
|
|
5
|
+
directory, normalizes file modes, and creates exactly one commit with a
|
|
6
|
+
fixed author/committer identity and date, so the resulting commit SHA
|
|
7
|
+
is identical on every machine. ``FIXTURE_SHAS`` records the expected
|
|
8
|
+
SHAs; ``evals/datasets/suite_a/tasks.yaml`` pins the same values as
|
|
9
|
+
``starting_sha`` for the ``oss-fixture-*`` tasks.
|
|
10
|
+
|
|
11
|
+
Recompute the SHAs after editing a fixture source tree with::
|
|
12
|
+
|
|
13
|
+
python -m evals.fixtures.build --print-shas
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
import tempfile
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
_FIXTURES_ROOT = Path(__file__).resolve().parent
|
|
25
|
+
|
|
26
|
+
_SOURCE_DIRS: dict[str, Path] = {
|
|
27
|
+
"oss-fixture-paginator": _FIXTURES_ROOT / "paginator",
|
|
28
|
+
"oss-fixture-legacy": _FIXTURES_ROOT / "legacy",
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
# Expected deterministic single-commit SHAs, produced by running this
|
|
32
|
+
# builder. `metaensemble/tests/test_eval_fixtures.py` fails when a
|
|
33
|
+
# fixture source tree drifts from these values without re-pinning.
|
|
34
|
+
FIXTURE_SHAS: dict[str, str] = {
|
|
35
|
+
"oss-fixture-paginator": "cbb6c2178af85ab778dd215379bf0928b6e52268",
|
|
36
|
+
"oss-fixture-legacy": "c04afa1fb995fc47c53a7336dcb5873c4a4bdeb4",
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
# Fixed commit identity: with author, committer, and both dates pinned,
|
|
40
|
+
# the commit SHA depends only on the tree contents and the message.
|
|
41
|
+
_COMMIT_ENV = {
|
|
42
|
+
"GIT_AUTHOR_NAME": "MetaEnsemble Fixtures",
|
|
43
|
+
"GIT_AUTHOR_EMAIL": "fixtures@metaensemble.invalid",
|
|
44
|
+
"GIT_AUTHOR_DATE": "2026-01-01T00:00:00 +0000",
|
|
45
|
+
"GIT_COMMITTER_NAME": "MetaEnsemble Fixtures",
|
|
46
|
+
"GIT_COMMITTER_EMAIL": "fixtures@metaensemble.invalid",
|
|
47
|
+
"GIT_COMMITTER_DATE": "2026-01-01T00:00:00 +0000",
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
_IGNORED_NAMES = ("__pycache__", "*.pyc", ".pytest_cache", ".DS_Store", ".git")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _git(args: list[str], cwd: Path) -> str:
|
|
54
|
+
"""Run git with the pinned identity and no user/system config."""
|
|
55
|
+
env = dict(os.environ)
|
|
56
|
+
env.update(_COMMIT_ENV)
|
|
57
|
+
# Isolate from user- and machine-level git config (gpg signing,
|
|
58
|
+
# autocrlf, templates) so the commit is byte-identical everywhere.
|
|
59
|
+
env["GIT_CONFIG_GLOBAL"] = os.devnull
|
|
60
|
+
env["GIT_CONFIG_SYSTEM"] = os.devnull
|
|
61
|
+
proc = subprocess.run(
|
|
62
|
+
["git", *args],
|
|
63
|
+
cwd=str(cwd),
|
|
64
|
+
env=env,
|
|
65
|
+
capture_output=True,
|
|
66
|
+
text=True,
|
|
67
|
+
)
|
|
68
|
+
if proc.returncode != 0:
|
|
69
|
+
raise RuntimeError(
|
|
70
|
+
f"git {' '.join(args)} failed in {cwd}: {proc.stderr.strip()}"
|
|
71
|
+
)
|
|
72
|
+
return proc.stdout.strip()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def build_fixture(name: str, dest: Path) -> str:
|
|
76
|
+
"""Materialize fixture ``name`` into ``dest`` as a one-commit git repo.
|
|
77
|
+
|
|
78
|
+
``name`` is one of ``FIXTURE_SHAS``'s keys. ``dest`` is created if
|
|
79
|
+
needed and must be empty. Returns the full 40-character commit SHA,
|
|
80
|
+
which is deterministic across machines.
|
|
81
|
+
"""
|
|
82
|
+
source = _SOURCE_DIRS.get(name)
|
|
83
|
+
if source is None:
|
|
84
|
+
known = ", ".join(sorted(_SOURCE_DIRS))
|
|
85
|
+
raise ValueError(f"unknown fixture {name!r}; expected one of: {known}")
|
|
86
|
+
dest = Path(dest)
|
|
87
|
+
dest.mkdir(parents=True, exist_ok=True)
|
|
88
|
+
if any(dest.iterdir()):
|
|
89
|
+
raise ValueError(f"fixture destination {dest} is not empty")
|
|
90
|
+
shutil.copytree(
|
|
91
|
+
source,
|
|
92
|
+
dest,
|
|
93
|
+
dirs_exist_ok=True,
|
|
94
|
+
ignore=shutil.ignore_patterns(*_IGNORED_NAMES),
|
|
95
|
+
)
|
|
96
|
+
# Normalize modes so umask and checkout quirks cannot change the
|
|
97
|
+
# tree hash: directories 755, files 644.
|
|
98
|
+
for path in sorted(dest.rglob("*")):
|
|
99
|
+
if path.is_dir():
|
|
100
|
+
path.chmod(0o755)
|
|
101
|
+
elif path.is_file():
|
|
102
|
+
path.chmod(0o644)
|
|
103
|
+
_git(["init", "-q"], dest)
|
|
104
|
+
_git(["add", "-A"], dest)
|
|
105
|
+
_git(
|
|
106
|
+
["commit", "-q", "--no-gpg-sign", "-m", f"fixture: {name} frozen starting state"],
|
|
107
|
+
dest,
|
|
108
|
+
)
|
|
109
|
+
sha = _git(["rev-parse", "HEAD"], dest)
|
|
110
|
+
if len(sha) != 40:
|
|
111
|
+
raise RuntimeError(f"unexpected rev-parse output: {sha!r}")
|
|
112
|
+
return sha
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def main(argv: list[str] | None = None) -> int:
|
|
116
|
+
parser = argparse.ArgumentParser(
|
|
117
|
+
prog="python -m evals.fixtures.build",
|
|
118
|
+
description=__doc__,
|
|
119
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
120
|
+
)
|
|
121
|
+
parser.add_argument(
|
|
122
|
+
"--print-shas",
|
|
123
|
+
action="store_true",
|
|
124
|
+
help="build every fixture into a temp dir and print `name sha` lines",
|
|
125
|
+
)
|
|
126
|
+
args = parser.parse_args(argv)
|
|
127
|
+
if not args.print_shas:
|
|
128
|
+
parser.print_help()
|
|
129
|
+
return 2
|
|
130
|
+
for name in sorted(_SOURCE_DIRS):
|
|
131
|
+
with tempfile.TemporaryDirectory(prefix="me-fixture-") as tmp:
|
|
132
|
+
sha = build_fixture(name, Path(tmp) / "repo")
|
|
133
|
+
expected = FIXTURE_SHAS.get(name)
|
|
134
|
+
marker = "" if sha == expected else " (differs from FIXTURE_SHAS)"
|
|
135
|
+
print(f"{name} {sha}{marker}")
|
|
136
|
+
return 0
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
if __name__ == "__main__":
|
|
140
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""The ``legacy`` package. Public API lives in ``legacy.big_module``."""
|