metaensemble 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. evals/README.md +147 -0
  2. evals/__init__.py +0 -0
  3. evals/cassettes/README.md +10 -0
  4. evals/cassettes/bootstrap.jsonl +800 -0
  5. evals/configs/default.yaml +59 -0
  6. evals/datasets/__init__.py +0 -0
  7. evals/datasets/suite_a/tasks.yaml +123 -0
  8. evals/datasets/suite_b/items.yaml +90 -0
  9. evals/runners/__init__.py +12 -0
  10. evals/runners/api.py +518 -0
  11. evals/runners/metrics.py +132 -0
  12. metaensemble/__init__.py +13 -0
  13. metaensemble/cli.py +1362 -0
  14. metaensemble/commands/dispatch.md +39 -0
  15. metaensemble/commands/executors.md +12 -0
  16. metaensemble/commands/ledger.md +19 -0
  17. metaensemble/commands/limits.md +12 -0
  18. metaensemble/commands/perf.md +12 -0
  19. metaensemble/commands/relaunch.md +29 -0
  20. metaensemble/commands/standup.md +14 -0
  21. metaensemble/config/budgets.example.yaml +72 -0
  22. metaensemble/config/quality.example.yaml +82 -0
  23. metaensemble/hooks/__init__.py +1 -0
  24. metaensemble/hooks/_common.py +148 -0
  25. metaensemble/hooks/deliverable_sync.py +73 -0
  26. metaensemble/hooks/file_event.py +303 -0
  27. metaensemble/hooks/post_task.py +460 -0
  28. metaensemble/hooks/pre_task.py +548 -0
  29. metaensemble/hooks/session_start.py +212 -0
  30. metaensemble/hooks/session_summary.py +392 -0
  31. metaensemble/hooks/subagent_stop.py +94 -0
  32. metaensemble/lib/__init__.py +1 -0
  33. metaensemble/lib/config.py +414 -0
  34. metaensemble/lib/cost_gate.py +299 -0
  35. metaensemble/lib/dispatch.py +341 -0
  36. metaensemble/lib/doctor.py +1563 -0
  37. metaensemble/lib/file_events.py +395 -0
  38. metaensemble/lib/ids.py +91 -0
  39. metaensemble/lib/installer.py +5018 -0
  40. metaensemble/lib/ledger.py +812 -0
  41. metaensemble/lib/manifest.py +141 -0
  42. metaensemble/lib/native_state.py +463 -0
  43. metaensemble/lib/overlaps.py +155 -0
  44. metaensemble/lib/quality_gate.py +155 -0
  45. metaensemble/lib/quality_runners.py +446 -0
  46. metaensemble/lib/reconcile.py +420 -0
  47. metaensemble/lib/recording.py +422 -0
  48. metaensemble/lib/relaunch.py +174 -0
  49. metaensemble/lib/runtime_payload.py +42 -0
  50. metaensemble/lib/runtime_state.py +308 -0
  51. metaensemble/lib/sidecar.py +166 -0
  52. metaensemble/lib/topology.py +181 -0
  53. metaensemble/lib/transcript.py +432 -0
  54. metaensemble/output-styles/deliverable.md +33 -0
  55. metaensemble/output-styles/wire.md +38 -0
  56. metaensemble/roles/architect.md +52 -0
  57. metaensemble/roles/backend.md +43 -0
  58. metaensemble/roles/code-quality.md +49 -0
  59. metaensemble/roles/data-engineer.md +42 -0
  60. metaensemble/roles/devops.md +42 -0
  61. metaensemble/roles/docs.md +41 -0
  62. metaensemble/roles/frontend.md +42 -0
  63. metaensemble/roles/ml-engineer.md +42 -0
  64. metaensemble/roles/test-engineer.md +42 -0
  65. metaensemble/schemas/brief.schema.json +80 -0
  66. metaensemble/schemas/manifest.schema.json +142 -0
  67. metaensemble/schemas/role.schema.json +84 -0
  68. metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
  69. metaensemble/state/migrations/001_init.sql +72 -0
  70. metaensemble/state/migrations/002_outcome_extended.sql +86 -0
  71. metaensemble/state/migrations/003_run_provenance.sql +36 -0
  72. metaensemble/statusline/me_status.py +187 -0
  73. metaensemble/tools/__init__.py +7 -0
  74. metaensemble/tools/executors.py +62 -0
  75. metaensemble/tools/ledger.py +121 -0
  76. metaensemble/tools/limits.py +165 -0
  77. metaensemble/tools/perf.py +150 -0
  78. metaensemble/tools/standup.py +177 -0
  79. metaensemble/tools/stats.py +115 -0
  80. metaensemble-0.2.0.dist-info/METADATA +221 -0
  81. metaensemble-0.2.0.dist-info/RECORD +85 -0
  82. metaensemble-0.2.0.dist-info/WHEEL +5 -0
  83. metaensemble-0.2.0.dist-info/entry_points.txt +2 -0
  84. metaensemble-0.2.0.dist-info/licenses/LICENSE +21 -0
  85. metaensemble-0.2.0.dist-info/top_level.txt +2 -0
evals/README.md ADDED
@@ -0,0 +1,147 @@
1
+ # MetaEnsemble Evaluation Harness
2
+
3
+ The harness exists so the quality-per-token claim — *the system around
4
+ the model is strong enough to deploy the competence the model already
5
+ has* — can be tested rather than asserted. Replay keeps the harness
6
+ deterministic in CI. Smoke and full tiers make live, side-effect-free
7
+ Claude Code calls and write measured reports under the caller's
8
+ `evals/reports/`.
9
+ The shipped classification data is one narrow smoke fixture, not a calibration set
10
+ and not a statement of product scope. MetaEnsemble is project-agnostic.
11
+
12
+ ## Directory layout
13
+
14
+ ```
15
+ evals/
16
+ ├── README.md # this file
17
+ ├── configs/
18
+ │ └── default.yaml # eval-cycle parameters (seeds, budget, model routing)
19
+ ├── datasets/
20
+ │ ├── suite_a/ # 8 software-engineering tasks
21
+ │ │ ├── README.md
22
+ │ │ └── tasks.yaml
23
+ │ └── suite_b/ # domain-specific classification smoke set
24
+ │ ├── README.md
25
+ │ └── items.yaml
26
+ ├── baselines/ # B1 / B2 / B3 / B4 baseline definitions
27
+ │ ├── b1_single_agent.yaml
28
+ │ ├── b2_single_agent_prompted.yaml
29
+ │ ├── b3_subagent_default.yaml
30
+ │ └── b4_best_prompt.yaml # best-single-agent baseline
31
+ ├── cassettes/ # replay fixtures; bootstrap pack is non-empirical
32
+ ├── runners/ # cell × seed executors
33
+ │ ├── __init__.py
34
+ │ ├── api.py # tiered runner: replay / smoke / full
35
+ │ ├── metrics.py # Wilson CI, pass@budget, quality_per_1k_tokens
36
+ │ └── replay.py # cassette-based PR runner
37
+ └── reports/ # generated reports per cycle (gitignored)
38
+ ```
39
+
40
+ ## Tiered evaluation
41
+
42
+ | Tier | When it runs | Live API calls | Budget |
43
+ |---|---|---|---|
44
+ | Replay | Every PR; reads recorded cassette responses. | No. | $0 |
45
+ | Smoke | Nightly cron or local preflight. 1 seed, `MM_full`, classification smoke set only. | Yes. | ~$0.30 default cap |
46
+ | Full | Release-gated. Defaults to 5 seeds × every configured cell. | Yes. | Principal-set cap |
47
+
48
+ The replay (PR) tier exists to keep regressions cheap to catch; the full tier
49
+ exists to certify a release. A release candidate is not allowed
50
+ to claim quality-per-token superiority unless the same report includes
51
+ baseline cells and MetaEnsemble cells over the same task set.
52
+
53
+ The release ships a compact `evals/cassettes/bootstrap.jsonl` pack so the
54
+ replay tier works in a clean checkout. That pack is deliberately marked
55
+ non-empirical; it verifies the harness mechanics, not MetaEnsemble's
56
+ quality claim. Live smoke/full reports are empirical for the cells and
57
+ datasets actually run; the report notes any skipped deferred fixtures.
58
+
59
+ ## Headline metrics
60
+
61
+ The harness reports three co-primary metrics per cell:
62
+
63
+ | Metric | Definition |
64
+ |---|---|
65
+ | `pass@budget` | Pass-rate against the cell's per-task budget. A "win" that overspends does not count. |
66
+ | `quality_per_1k_tokens` | Average score across passing runs divided by (tokens / 1000). Directly tests the efficiency thesis. |
67
+ | `orchestration_overhead_ratio` | MetaEnsemble token cost over the best single-agent baseline's token cost, on the same task. |
68
+
69
+ Plus the supporting metrics in `runners/metrics.py`:
70
+ `failed_run_token_waste`, `time_to_useful_deliverable`,
71
+ `minimum_useful_answer_score`.
72
+
73
+ For live reports, include these context fields in the release note or
74
+ system-card link: exact model IDs when the runtime exposes them, seed
75
+ count, cells run, skipped fixtures, total observed tokens, estimated vs
76
+ observed token error where available, and any cost-gate or Python
77
+ deliverable-check interventions.
78
+
79
+ ## Suite A — software engineering (8 tasks)
80
+
81
+ Eight tasks drawn from the project's own backlog and from small
82
+ open-source repos. Each task has:
83
+
84
+ - A one-paragraph description (English).
85
+ - A frozen-commit starting state (commit SHA of the project under test).
86
+ - Graded acceptance criteria (build passes, test count ≥ N, lint
87
+ clean, manifest exists, deliverable file present).
88
+
89
+ See `evals/datasets/suite_a/tasks.yaml` for the current set.
90
+
91
+ The current Suite-A rows still contain deferred fixture SHAs. The live
92
+ full tier names those skipped tasks in the report rather than treating
93
+ them as passed or failed. Release certification across software tasks
94
+ requires replacing the deferred SHAs with real fixture repositories.
95
+
96
+ ## Suite B — domain-specific classification (12 items, *smoke only*)
97
+
98
+ Twelve items is too few for calibration claims. The 12-item set in
99
+ `evals/datasets/suite_b/items.yaml` is the **smoke suite** that proves
100
+ the pipeline end-to-end. It is intentionally narrow; it does not make
101
+ MetaEnsemble domain-specific. Any release claim about a particular domain
102
+ needs its own independently labeled calibration set. The system card states
103
+ this limitation explicitly so no calibration claim is implied by the smoke
104
+ set.
105
+
106
+ ## Running the harness
107
+
108
+ ```bash
109
+ # PR-tier replay (no API calls).
110
+ metaensemble eval --tier replay --cells all
111
+
112
+ # Nightly smoke (one cell × one seed × classification smoke set only).
113
+ metaensemble eval --tier smoke
114
+
115
+ # Constrained full-tier check.
116
+ metaensemble eval --tier full --allow-live --cells MM_full --seeds 1 --budget-usd 0.30
117
+
118
+ # Release-gated full cycle once fixture SHAs and budget are set.
119
+ metaensemble eval --tier full --allow-live --cells all --seeds 5 --budget-usd 0.30
120
+ ```
121
+
122
+ The output report lands in the current working directory at
123
+ `evals/reports/<UTC-date>-<tier>.md` and is linked from
124
+ `PERFORMANCE.md §4` once a cycle ships.
125
+
126
+ Supported flags:
127
+
128
+ | Flag | Meaning |
129
+ |---|---|
130
+ | `--cells all` or `--cells A,B` | Select all cells or a comma-separated subset. Smoke defaults to `MM_full`; replay/full default to all. |
131
+ | `--seeds N` | Override seed count. Smoke defaults to 1; replay/full default to `evals/configs/default.yaml`. |
132
+ | `--budget-usd X` | Override the live-tier per-run budget shown in preflight. |
133
+ | `--allow-live` | Required before the full tier proceeds past preflight. |
134
+
135
+ ## Sign-off thresholds (D-8, D-9)
136
+
137
+ D-8 and D-9 are numerical full-tier release gates:
138
+
139
+ - **D-8 orchestration overhead**: any measured MetaEnsemble cell above
140
+ `2.0x` the best single-agent prompt baseline (`B4`) blocks the full
141
+ tier's ship verdict.
142
+ - **D-9 failed-run waste**: tokens spent on failed and budget-exceeded runs above `10%`
143
+ of total evaluated tokens block the full tier's ship verdict.
144
+
145
+ The thresholds live in `evals/configs/default.yaml`. If a full run does
146
+ not include the `B4_best_prompt` baseline, D-8 is reported as not
147
+ evaluated rather than silently passing.
evals/__init__.py ADDED
File without changes
@@ -0,0 +1,10 @@
1
+ # Eval Cassettes
2
+
3
+ `bootstrap.jsonl` is a v0.1.0 replay fixture pack. It exists so the
4
+ zero-cost replay tier exercises task loading, cell selection, metrics,
5
+ and report rendering in a clean checkout.
6
+
7
+ It is not empirical benchmark evidence. Each record is marked
8
+ `source: bootstrap_fixture_not_empirical`; the first live smoke/full
9
+ cycle should replace or supplement this pack with recorded cassette
10
+ outputs from real runs.