metaensemble 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. {metaensemble-0.2.0 → metaensemble-0.3.0}/PKG-INFO +5 -5
  2. {metaensemble-0.2.0 → metaensemble-0.3.0}/README.md +4 -4
  3. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/README.md +29 -14
  4. metaensemble-0.3.0/evals/datasets/suite_a/tasks.yaml +157 -0
  5. metaensemble-0.3.0/evals/fixtures/__init__.py +7 -0
  6. metaensemble-0.3.0/evals/fixtures/build.py +140 -0
  7. metaensemble-0.3.0/evals/fixtures/legacy/legacy/__init__.py +1 -0
  8. metaensemble-0.3.0/evals/fixtures/legacy/legacy/big_module.py +680 -0
  9. metaensemble-0.3.0/evals/fixtures/legacy/test_big_module.py +147 -0
  10. metaensemble-0.3.0/evals/fixtures/paginator/pagination.py +43 -0
  11. metaensemble-0.3.0/evals/fixtures/paginator/test_pagination.py +24 -0
  12. metaensemble-0.3.0/evals/runners/acceptance.py +528 -0
  13. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/runners/api.py +4 -1
  14. metaensemble-0.3.0/evals/runners/suite_a.py +552 -0
  15. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/__init__.py +1 -1
  16. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/cli.py +78 -13
  17. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/PKG-INFO +5 -5
  18. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/SOURCES.txt +9 -0
  19. {metaensemble-0.2.0 → metaensemble-0.3.0}/pyproject.toml +1 -1
  20. metaensemble-0.2.0/evals/datasets/suite_a/tasks.yaml +0 -123
  21. {metaensemble-0.2.0 → metaensemble-0.3.0}/LICENSE +0 -0
  22. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/__init__.py +0 -0
  23. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/cassettes/README.md +0 -0
  24. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/cassettes/bootstrap.jsonl +0 -0
  25. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/configs/default.yaml +0 -0
  26. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/datasets/__init__.py +0 -0
  27. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/datasets/suite_b/items.yaml +0 -0
  28. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/runners/__init__.py +0 -0
  29. {metaensemble-0.2.0 → metaensemble-0.3.0}/evals/runners/metrics.py +0 -0
  30. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/dispatch.md +0 -0
  31. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/executors.md +0 -0
  32. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/ledger.md +0 -0
  33. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/limits.md +0 -0
  34. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/perf.md +0 -0
  35. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/relaunch.md +0 -0
  36. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/commands/standup.md +0 -0
  37. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/config/budgets.example.yaml +0 -0
  38. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/config/quality.example.yaml +0 -0
  39. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/__init__.py +0 -0
  40. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/_common.py +0 -0
  41. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/deliverable_sync.py +0 -0
  42. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/file_event.py +0 -0
  43. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/post_task.py +0 -0
  44. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/pre_task.py +0 -0
  45. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/session_start.py +0 -0
  46. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/session_summary.py +0 -0
  47. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/hooks/subagent_stop.py +0 -0
  48. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/__init__.py +0 -0
  49. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/config.py +0 -0
  50. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/cost_gate.py +0 -0
  51. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/dispatch.py +0 -0
  52. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/doctor.py +0 -0
  53. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/file_events.py +0 -0
  54. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/ids.py +0 -0
  55. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/installer.py +0 -0
  56. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/ledger.py +0 -0
  57. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/manifest.py +0 -0
  58. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/native_state.py +0 -0
  59. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/overlaps.py +0 -0
  60. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/quality_gate.py +0 -0
  61. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/quality_runners.py +0 -0
  62. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/reconcile.py +0 -0
  63. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/recording.py +0 -0
  64. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/relaunch.py +0 -0
  65. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/runtime_payload.py +0 -0
  66. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/runtime_state.py +0 -0
  67. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/sidecar.py +0 -0
  68. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/topology.py +0 -0
  69. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/lib/transcript.py +0 -0
  70. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/output-styles/deliverable.md +0 -0
  71. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/output-styles/wire.md +0 -0
  72. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/architect.md +0 -0
  73. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/backend.md +0 -0
  74. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/code-quality.md +0 -0
  75. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/data-engineer.md +0 -0
  76. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/devops.md +0 -0
  77. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/docs.md +0 -0
  78. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/frontend.md +0 -0
  79. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/ml-engineer.md +0 -0
  80. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/roles/test-engineer.md +0 -0
  81. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/schemas/brief.schema.json +0 -0
  82. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/schemas/manifest.schema.json +0 -0
  83. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/schemas/role.schema.json +0 -0
  84. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/skills/metaensemble-protocol/SKILL.md +0 -0
  85. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/state/migrations/001_init.sql +0 -0
  86. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/state/migrations/002_outcome_extended.sql +0 -0
  87. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/state/migrations/003_run_provenance.sql +0 -0
  88. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/statusline/me_status.py +0 -0
  89. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/__init__.py +0 -0
  90. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/executors.py +0 -0
  91. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/ledger.py +0 -0
  92. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/limits.py +0 -0
  93. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/perf.py +0 -0
  94. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/standup.py +0 -0
  95. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble/tools/stats.py +0 -0
  96. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/dependency_links.txt +0 -0
  97. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/entry_points.txt +0 -0
  98. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/requires.txt +0 -0
  99. {metaensemble-0.2.0 → metaensemble-0.3.0}/metaensemble.egg-info/top_level.txt +0 -0
  100. {metaensemble-0.2.0 → metaensemble-0.3.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: metaensemble
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: A typed runtime for ensembles of cognitive agents
5
5
  License-Expression: MIT
6
6
  Requires-Python: >=3.10
@@ -31,7 +31,7 @@ Dynamic: license-file
31
31
 
32
32
  MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
33
33
 
34
- **v0.2.0 status:** feedback-first release. The software records and gates local agent work, but measured quality-per-token improvements remain a product hypothesis until the live evaluation set is larger and fully baseline-comparable. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md).
34
+ **v0.3.0 status:** feedback-first release, now with a measured calibration. The first full-tier cycle (320 live runs, 8 cells × 8 software tasks × 5 seeds) found acceptance-quality parity with strong single-agent baselines at a 1.55× token premium on tasks that fit one context, that every protocol primitive is load-bearing (ablations degrade it — the Manifest most), and that the full protocol more than doubles the runtime's default-subagent baseline. No quality-per-token superiority is claimed for this task class. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) and `evals/reports/20260704T140844Z-full.md`.
35
35
 
36
36
  ---
37
37
 
@@ -181,7 +181,7 @@ See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the f
181
181
 
182
182
  ## Status
183
183
 
184
- v0.2.0. All core phases complete and tested:
184
+ v0.3.0. All core phases complete and tested:
185
185
 
186
186
  - Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
187
187
  - Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
@@ -191,9 +191,9 @@ v0.2.0. All core phases complete and tested:
191
191
  - Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
192
192
  - Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
193
193
  - Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
194
- - Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The shipped replay pack is a non-empirical bootstrap fixture. Live smoke/full runs are wired for side-effect-free classification-smoke checks; calibration and baseline-superiority claims still require larger labeled/fixture sets.
194
+ - Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The full tier runs Suite-A software tasks live in sandboxed per-run workspaces with graded acceptance and honest cell isolation; the first calibration cycle is checked in at `evals/reports/20260704T140844Z-full.md`. Domain-classification calibration still requires an independently labeled set.
195
195
 
196
- v0.2.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
196
+ v0.3.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
197
197
 
198
198
  See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
199
199
  Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
@@ -4,7 +4,7 @@
4
4
 
5
5
  MetaEnsemble gives every agent a persistent ID, every handoff a schema-validated contract, and every run an entry in an append-only ledger. Multiple agents instantiated from one Role specification execute in parallel. Identities survive across sessions. Token-efficient by construction.
6
6
 
7
- **v0.2.0 status:** feedback-first release. The software records and gates local agent work, but measured quality-per-token improvements remain a product hypothesis until the live evaluation set is larger and fully baseline-comparable. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md).
7
+ **v0.3.0 status:** feedback-first release, now with a measured calibration. The first full-tier cycle (320 live runs, 8 cells × 8 software tasks × 5 seeds) found acceptance-quality parity with strong single-agent baselines at a 1.55× token premium on tasks that fit one context, that every protocol primitive is load-bearing (ablations degrade it — the Manifest most), and that the full protocol more than doubles the runtime's default-subagent baseline. No quality-per-token superiority is claimed for this task class. See [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) and `evals/reports/20260704T140844Z-full.md`.
8
8
 
9
9
  ---
10
10
 
@@ -154,7 +154,7 @@ See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the per-action behaviour and the f
154
154
 
155
155
  ## Status
156
156
 
157
- v0.2.0. All core phases complete and tested:
157
+ v0.3.0. All core phases complete and tested:
158
158
 
159
159
  - Typed substrate (Manifest YAML, Brief JSON, Ledger SQLite + JSONL).
160
160
  - Lifecycle hooks for SessionStart, PreToolUse, PostToolUse, Write/deliverable-sync, file-tool provenance, SubagentStop (background-dispatch finalization), and Stop, with command-injection invariants enforced by an audit test.
@@ -164,9 +164,9 @@ v0.2.0. All core phases complete and tested:
164
164
  - Five-axis deliverable check on successful Runs: pytest, bandit, ruff, radon, and coverage for `.py` deliverables, plus project-configured per-axis commands (`axis_commands` in `quality.yaml`) so non-Python deliverables are checked across the same correctness/security/maintainability/complexity/coverage axes; quality runners ship in the `[test]` extras so CI runs the real tools.
165
165
  - Failed-run accounting via the `interrupted` and `budget_exceeded` outcomes (schema migration 002) plus the two-layer reconcile module.
166
166
  - Ledger field completeness — every documented Ledger field (Role version, model, tool use, files touched, output, gate state, review findings) is a column with an assertion test.
167
- - Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The shipped replay pack is a non-empirical bootstrap fixture. Live smoke/full runs are wired for side-effect-free classification-smoke checks; calibration and baseline-superiority claims still require larger labeled/fixture sets.
167
+ - Evaluation harness under `evals/` with replay/smoke/full tiers, Wilson confidence intervals, and `pass@budget` / `quality_per_1k_tokens` / `orchestration_overhead_ratio` metrics. The full tier runs Suite-A software tasks live in sandboxed per-run workspaces with graded acceptance and honest cell isolation; the first calibration cycle is checked in at `evals/reports/20260704T140844Z-full.md`. Domain-classification calibration still requires an independently labeled set.
168
168
 
169
- v0.2.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
169
+ v0.3.0 is feedback-first. Issues are welcome; see [CONTRIBUTING.md](./CONTRIBUTING.md) to get started.
170
170
 
171
171
  See [PERFORMANCE.md](./docs/PERFORMANCE.md) for the engineering contract and benchmark numbers, [SYSTEM-CARD.md](./docs/SYSTEM-CARD.md) for known limitations and intended-use boundaries, and [SECURITY.md](./SECURITY.md) for the trust model.
172
172
  Release publication is gated by [RELEASE-CHECKLIST.md](./docs/RELEASE-CHECKLIST.md).
@@ -23,20 +23,26 @@ evals/
23
23
  │ └── suite_b/ # domain-specific classification smoke set
24
24
  │ ├── README.md
25
25
  │ └── items.yaml
26
- ├── baselines/ # B1 / B2 / B3 baseline definitions
27
- │ ├── b1_single_agent.yaml
28
- ├── b2_single_agent_prompted.yaml
29
- │ ├── b3_subagent_default.yaml
30
- │ └── b4_best_prompt.yaml # best-single-agent baseline
26
+ ├── fixtures/ # deterministic Suite-A fixture repos
27
+ │ ├── build.py # single-commit builder; SHAs identical on every
28
+ │ # machine (pinned author/committer/date)
29
+ │ ├── paginator/ # oss-fixture-paginator source tree
30
+ │ └── legacy/ # oss-fixture-legacy source tree
31
31
  ├── cassettes/ # replay fixtures; bootstrap pack is non-empirical
32
32
  ├── runners/ # cell × seed executors
33
33
  │ ├── __init__.py
34
- │ ├── api.py # tiered runner: replay / live / smoke
35
- │ ├── metrics.py # Wilson CI, pass@budget, quality_per_1k_tokens
36
- └── replay.py # cassette-based PR runner
34
+ │ ├── api.py # tiered dispatch: replay / live smoke (suite B)
35
+ │ ├── suite_a.py # live Suite-A: sandboxed workspaces, per-cell
36
+ │ │ # prompts, hook isolation via --setting-sources
37
+ │ ├── acceptance.py # graded acceptance checkers (build, tests, lint,
38
+ │ │ # API surface, links, perf, CI matrix)
39
+ │ └── metrics.py # Wilson CI, pass@budget, quality_per_1k_tokens
37
40
  └── reports/ # generated reports per cycle (gitignored, except checked-in cycle reports)
38
41
  ```
39
42
 
43
+ The cell matrix (B1–B4 baselines, `MM_full`, three ablations) is defined
44
+ in `configs/default.yaml`, not in separate baseline files.
45
+
40
46
  ## Tiered evaluation
41
47
 
42
48
  | Tier | When it runs | Live API calls | Budget |
@@ -54,7 +60,7 @@ The release ships a compact `evals/cassettes/bootstrap.jsonl` pack so the
54
60
  replay tier works in a clean checkout. That pack is deliberately marked
55
61
  non-empirical; it verifies the harness mechanics, not MetaEnsemble's
56
62
  quality claim. Live smoke/full reports are empirical for the cells and
57
- datasets actually run; the report notes any skipped deferred fixtures.
63
+ datasets actually run.
58
64
 
59
65
  ## Headline metrics
60
66
 
@@ -88,10 +94,18 @@ open-source repos. Each task has:
88
94
 
89
95
  See `evals/datasets/suite_a/tasks.yaml` for the current set.
90
96
 
91
- The current Suite-A rows still contain deferred fixture SHAs. The live
92
- full tier names those skipped tasks in the report rather than treating
93
- them as passed or failed. Release certification across software tasks
94
- requires replacing the deferred SHAs with real fixture repositories.
97
+ Every Suite-A row pins a resolved starting SHA: tasks a1/a2 pin the
98
+ deterministic fixture commits from `evals/fixtures/build.py` (the builder
99
+ produces byte-identical trees and therefore identical SHAs on every
100
+ machine), and tasks a3–a8 pin the v0.2.0 release commit of this
101
+ repository, with each description verified true at that SHA. Live runs
102
+ materialize a fresh sandbox workspace per cell × task × seed (local
103
+ clones only — no network), grade the result with
104
+ `evals/runners/acceptance.py`, and keep every workspace on disk beside a
105
+ `run-manifest.jsonl` for post-hoc inspection. Baseline cells run with
106
+ `--setting-sources project,local` so the user-level MetaEnsemble hooks
107
+ are excluded; MM cells run with all setting sources — the cell
108
+ difference is the real orchestration layer, not only the prompt.
95
109
 
96
110
  ## Suite B — domain-specific classification (12 items, *smoke only*)
97
111
 
@@ -121,7 +135,8 @@ metaensemble eval --tier full --allow-live --cells all --seeds 5 --budget-usd 0.
121
135
 
122
136
  The output report lands in the current working directory at
123
137
  `evals/reports/<UTC-timestamp>-<tier>.md` and is linked from
124
- `PERFORMANCE.md §4` once a cycle ships.
138
+ `PERFORMANCE.md §4`. The first full-tier cycle shipped 2026-07-04 and is
139
+ checked in at `evals/reports/20260704T140844Z-full.md`.
125
140
 
126
141
  Supported flags:
127
142
 
@@ -0,0 +1,157 @@
1
+ # Suite A — eight software-engineering tasks.
2
+ #
3
+ # Each task gates the cell's pass-rate against measurable acceptance
4
+ # criteria. The starting state is a frozen Git SHA so re-runs are
5
+ # reproducible; the acceptance criteria are executed by `runners/acceptance.py`
6
+ # after the cell's deliverable lands.
7
+ #
8
+ # Starting states:
9
+ # - `oss-fixture-*` repos are materialized by `evals/fixtures/build.py`;
10
+ # their SHAs are the deterministic single-commit values in
11
+ # `evals.fixtures.build.FIXTURE_SHAS`.
12
+ # - `metaensemble` tasks pin the v0.2.0 release commit
13
+ # (`git rev-parse v0.2.0^{commit}`). Every description below was
14
+ # reconciled against that tree.
15
+
16
+ tasks:
17
+ - id: a1_bugfix_off_by_one
18
+ title: Fix an off-by-one in a paginator
19
+ description: |
20
+ The paginator at `pagination.py:42` returns one fewer item than
21
+ expected on every page boundary. Locate and fix the bug; add a
22
+ regression test.
23
+ starting_repo: oss-fixture-paginator
24
+ starting_sha: cbb6c2178af85ab778dd215379bf0928b6e52268
25
+ acceptance:
26
+ - kind: build_passes
27
+ - kind: test_count_at_least
28
+ value: 5
29
+ - kind: lint_clean
30
+ - kind: file_modified
31
+ path: pagination.py
32
+ - kind: file_modified
33
+ path: test_pagination.py
34
+
35
+ - id: a2_refactor_module
36
+ title: Split a 680-line module into three cohesive files
37
+ description: |
38
+ The module `legacy/big_module.py` (680 lines) has three
39
+ responsibilities: parsing, validation, and rendering of the
40
+ recfile record format. Split it into three files preserving the
41
+ public API listed in `api_manifest.json`. No behavior change.
42
+ starting_repo: oss-fixture-legacy
43
+ starting_sha: c04afa1fb995fc47c53a7336dcb5873c4a4bdeb4
44
+ acceptance:
45
+ - kind: build_passes
46
+ - kind: test_count_at_least
47
+ value: 12
48
+ - kind: api_surface_preserved
49
+
50
+ - id: a3_doc_update
51
+ title: Document rollback verification in the USER-GUIDE
52
+ description: |
53
+ The "When all else fails — recovery" section of `docs/USER-GUIDE.md`
54
+ lists the split rollback commands (`metaensemble unadopt
55
+ --purge-state`, `metaensemble user-teardown --purge-state`) but
56
+ never tells the reader how to confirm a completed rollback.
57
+ `docs/DEPLOYMENT.md` covers this under "Verifying the rollback".
58
+ Add a short paragraph to the recovery section of
59
+ `docs/USER-GUIDE.md` explaining how to verify both rollback
60
+ scopes, with a link to the DEPLOYMENT.md section.
61
+ starting_repo: metaensemble
62
+ starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
63
+ acceptance:
64
+ - kind: markdown_links_resolve
65
+ - kind: file_modified
66
+ path: docs/USER-GUIDE.md
67
+
68
+ - id: a4_test_addition
69
+ title: Add a reconcile provenance test
70
+ description: |
71
+ `metaensemble/tests/test_reconcile.py` has 20 tests covering
72
+ session and stale reconciliation. Reconciliation copies a pending
73
+ sidecar's `brief_in_path` into the recorded Run row, but no test
74
+ asserts it. Add a test that asserts a sidecar's `brief_in_path`
75
+ survives reconciliation into the Run row.
76
+ starting_repo: metaensemble
77
+ starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
78
+ acceptance:
79
+ - kind: build_passes
80
+ - kind: test_count_delta_at_least
81
+ value: 1
82
+ - kind: file_modified
83
+ path: metaensemble/tests/test_reconcile.py
84
+
85
+ - id: a5_design_review
86
+ title: Review the uninstall and rollback design
87
+ description: |
88
+ Read the "Recovery and rollback" section of `docs/DEPLOYMENT.md`,
89
+ including its "Verifying the rollback" and "Full local rollback
90
+ after live testing" subsections, and produce a one-page review at
91
+ `reports/<date>-uninstall-review.md` naming at least three risks
92
+ the documented design does not address. Create the workspace-local
93
+ `reports/` directory if it does not exist.
94
+ starting_repo: metaensemble
95
+ starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
96
+ acceptance:
97
+ - kind: file_exists
98
+ glob: reports/*-uninstall-review.md
99
+ - kind: word_count_at_least
100
+ value: 300
101
+
102
+ - id: a6_security_review
103
+ title: Security-review the transcript walker
104
+ description: |
105
+ `metaensemble/lib/transcript.py` reads the runtime's session
106
+ transcript JSONL from the hook-supplied `transcript_path`. Write
107
+ `reports/<date>-transcript-security.md` listing every defensive
108
+ assumption the walker makes and one concrete attack it survives.
109
+ Create the workspace-local `reports/` directory if it does not
110
+ exist.
111
+ starting_repo: metaensemble
112
+ starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
113
+ acceptance:
114
+ - kind: file_exists
115
+ glob: reports/*-transcript-security.md
116
+
117
+ - id: a7_perf_tune
118
+ title: Tighten the get_window_burn p95 budget
119
+ description: |
120
+ `metaensemble/tests/test_perf_ledger.py` benchmarks
121
+ `get_window_burn` against the module-wide 10ms p95 budget on a
122
+ 10k-row Ledger, and `idx_runs_window` already keeps the query
123
+ indexed. Measure the actual headroom, then tighten
124
+ `test_get_window_burn_meets_p95` to assert a dedicated 5ms p95
125
+ budget for `get_window_burn` without loosening any other
126
+ benchmark's budget.
127
+ starting_repo: metaensemble
128
+ starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
129
+ acceptance:
130
+ - kind: build_passes
131
+ - kind: perf_benchmark_passes
132
+ benchmark: test_get_window_burn_meets_p95
133
+ - kind: file_modified
134
+ path: metaensemble/tests/test_perf_ledger.py
135
+
136
+ - id: a8_infra_change
137
+ title: Add the no-quality CI matrix axis
138
+ description: |
139
+ `.github/workflows/ci.yml` runs one test job over a
140
+ python-version matrix, and every leg installs the quality
141
+ runners (ruff, bandit, radon) via the `[test]` extra. Add a
142
+ `no-quality` matrix axis that runs pytest a second time with the
143
+ quality runners absent, and add `@pytest.mark.requires_radon` /
144
+ `requires_bandit` markers to the tests that need those tools so
145
+ the new leg can deselect them.
146
+ starting_repo: metaensemble
147
+ starting_sha: 27ac404d80312028eff49a5dca3a04338ff8f8ed
148
+ acceptance:
149
+ - kind: ci_yaml_has_matrix_axis
150
+ axis: no-quality
151
+
152
+ # Every starting_sha above is a resolved, frozen commit: a1/a2 pin the
153
+ # deterministic fixture commits built by `evals/fixtures/build.py`
154
+ # (FIXTURE_SHAS), and a3-a8 pin the v0.2.0 release commit. The loader
155
+ # needs no network round-trip, and
156
+ # `metaensemble/tests/test_eval_fixtures.py` gates this file against
157
+ # placeholder or drifted SHAs.
@@ -0,0 +1,7 @@
1
+ """Suite-A fixture source trees and the deterministic workspace builder.
2
+
3
+ `build.build_fixture` materializes a fixture source tree into a
4
+ single-commit git repository whose SHA is identical on every machine;
5
+ `build.FIXTURE_SHAS` records the expected SHAs that
6
+ `evals/datasets/suite_a/tasks.yaml` pins as `starting_sha`.
7
+ """
@@ -0,0 +1,140 @@
1
+ """Materialize Suite-A fixture workspaces as deterministic git repos.
2
+
3
+ Each fixture source tree under ``evals/fixtures/`` is plain files with
4
+ no ``.git``. :func:`build_fixture` copies a tree into a destination
5
+ directory, normalizes file modes, and creates exactly one commit with a
6
+ fixed author/committer identity and date, so the resulting commit SHA
7
+ is identical on every machine. ``FIXTURE_SHAS`` records the expected
8
+ SHAs; ``evals/datasets/suite_a/tasks.yaml`` pins the same values as
9
+ ``starting_sha`` for the ``oss-fixture-*`` tasks.
10
+
11
+ Recompute the SHAs after editing a fixture source tree with::
12
+
13
+ python -m evals.fixtures.build --print-shas
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import os
19
+ import shutil
20
+ import subprocess
21
+ import tempfile
22
+ from pathlib import Path
23
+
24
+ _FIXTURES_ROOT = Path(__file__).resolve().parent
25
+
26
+ _SOURCE_DIRS: dict[str, Path] = {
27
+ "oss-fixture-paginator": _FIXTURES_ROOT / "paginator",
28
+ "oss-fixture-legacy": _FIXTURES_ROOT / "legacy",
29
+ }
30
+
31
+ # Expected deterministic single-commit SHAs, produced by running this
32
+ # builder. `metaensemble/tests/test_eval_fixtures.py` fails when a
33
+ # fixture source tree drifts from these values without re-pinning.
34
+ FIXTURE_SHAS: dict[str, str] = {
35
+ "oss-fixture-paginator": "cbb6c2178af85ab778dd215379bf0928b6e52268",
36
+ "oss-fixture-legacy": "c04afa1fb995fc47c53a7336dcb5873c4a4bdeb4",
37
+ }
38
+
39
+ # Fixed commit identity: with author, committer, and both dates pinned,
40
+ # the commit SHA depends only on the tree contents and the message.
41
+ _COMMIT_ENV = {
42
+ "GIT_AUTHOR_NAME": "MetaEnsemble Fixtures",
43
+ "GIT_AUTHOR_EMAIL": "fixtures@metaensemble.invalid",
44
+ "GIT_AUTHOR_DATE": "2026-01-01T00:00:00 +0000",
45
+ "GIT_COMMITTER_NAME": "MetaEnsemble Fixtures",
46
+ "GIT_COMMITTER_EMAIL": "fixtures@metaensemble.invalid",
47
+ "GIT_COMMITTER_DATE": "2026-01-01T00:00:00 +0000",
48
+ }
49
+
50
+ _IGNORED_NAMES = ("__pycache__", "*.pyc", ".pytest_cache", ".DS_Store", ".git")
51
+
52
+
53
+ def _git(args: list[str], cwd: Path) -> str:
54
+ """Run git with the pinned identity and no user/system config."""
55
+ env = dict(os.environ)
56
+ env.update(_COMMIT_ENV)
57
+ # Isolate from user- and machine-level git config (gpg signing,
58
+ # autocrlf, templates) so the commit is byte-identical everywhere.
59
+ env["GIT_CONFIG_GLOBAL"] = os.devnull
60
+ env["GIT_CONFIG_SYSTEM"] = os.devnull
61
+ proc = subprocess.run(
62
+ ["git", *args],
63
+ cwd=str(cwd),
64
+ env=env,
65
+ capture_output=True,
66
+ text=True,
67
+ )
68
+ if proc.returncode != 0:
69
+ raise RuntimeError(
70
+ f"git {' '.join(args)} failed in {cwd}: {proc.stderr.strip()}"
71
+ )
72
+ return proc.stdout.strip()
73
+
74
+
75
+ def build_fixture(name: str, dest: Path) -> str:
76
+ """Materialize fixture ``name`` into ``dest`` as a one-commit git repo.
77
+
78
+ ``name`` is one of ``FIXTURE_SHAS``'s keys. ``dest`` is created if
79
+ needed and must be empty. Returns the full 40-character commit SHA,
80
+ which is deterministic across machines.
81
+ """
82
+ source = _SOURCE_DIRS.get(name)
83
+ if source is None:
84
+ known = ", ".join(sorted(_SOURCE_DIRS))
85
+ raise ValueError(f"unknown fixture {name!r}; expected one of: {known}")
86
+ dest = Path(dest)
87
+ dest.mkdir(parents=True, exist_ok=True)
88
+ if any(dest.iterdir()):
89
+ raise ValueError(f"fixture destination {dest} is not empty")
90
+ shutil.copytree(
91
+ source,
92
+ dest,
93
+ dirs_exist_ok=True,
94
+ ignore=shutil.ignore_patterns(*_IGNORED_NAMES),
95
+ )
96
+ # Normalize modes so umask and checkout quirks cannot change the
97
+ # tree hash: directories 755, files 644.
98
+ for path in sorted(dest.rglob("*")):
99
+ if path.is_dir():
100
+ path.chmod(0o755)
101
+ elif path.is_file():
102
+ path.chmod(0o644)
103
+ _git(["init", "-q"], dest)
104
+ _git(["add", "-A"], dest)
105
+ _git(
106
+ ["commit", "-q", "--no-gpg-sign", "-m", f"fixture: {name} frozen starting state"],
107
+ dest,
108
+ )
109
+ sha = _git(["rev-parse", "HEAD"], dest)
110
+ if len(sha) != 40:
111
+ raise RuntimeError(f"unexpected rev-parse output: {sha!r}")
112
+ return sha
113
+
114
+
115
+ def main(argv: list[str] | None = None) -> int:
116
+ parser = argparse.ArgumentParser(
117
+ prog="python -m evals.fixtures.build",
118
+ description=__doc__,
119
+ formatter_class=argparse.RawDescriptionHelpFormatter,
120
+ )
121
+ parser.add_argument(
122
+ "--print-shas",
123
+ action="store_true",
124
+ help="build every fixture into a temp dir and print `name sha` lines",
125
+ )
126
+ args = parser.parse_args(argv)
127
+ if not args.print_shas:
128
+ parser.print_help()
129
+ return 2
130
+ for name in sorted(_SOURCE_DIRS):
131
+ with tempfile.TemporaryDirectory(prefix="me-fixture-") as tmp:
132
+ sha = build_fixture(name, Path(tmp) / "repo")
133
+ expected = FIXTURE_SHAS.get(name)
134
+ marker = "" if sha == expected else " (differs from FIXTURE_SHAS)"
135
+ print(f"{name} {sha}{marker}")
136
+ return 0
137
+
138
+
139
+ if __name__ == "__main__":
140
+ raise SystemExit(main())
@@ -0,0 +1 @@
1
+ """The ``legacy`` package. Public API lives in ``legacy.big_module``."""