metaensemble 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evals/README.md +147 -0
- evals/__init__.py +0 -0
- evals/cassettes/README.md +10 -0
- evals/cassettes/bootstrap.jsonl +800 -0
- evals/configs/default.yaml +59 -0
- evals/datasets/__init__.py +0 -0
- evals/datasets/suite_a/tasks.yaml +123 -0
- evals/datasets/suite_b/items.yaml +90 -0
- evals/runners/__init__.py +12 -0
- evals/runners/api.py +518 -0
- evals/runners/metrics.py +132 -0
- metaensemble/__init__.py +13 -0
- metaensemble/cli.py +1362 -0
- metaensemble/commands/dispatch.md +39 -0
- metaensemble/commands/executors.md +12 -0
- metaensemble/commands/ledger.md +19 -0
- metaensemble/commands/limits.md +12 -0
- metaensemble/commands/perf.md +12 -0
- metaensemble/commands/relaunch.md +29 -0
- metaensemble/commands/standup.md +14 -0
- metaensemble/config/budgets.example.yaml +72 -0
- metaensemble/config/quality.example.yaml +82 -0
- metaensemble/hooks/__init__.py +1 -0
- metaensemble/hooks/_common.py +148 -0
- metaensemble/hooks/deliverable_sync.py +73 -0
- metaensemble/hooks/file_event.py +303 -0
- metaensemble/hooks/post_task.py +460 -0
- metaensemble/hooks/pre_task.py +548 -0
- metaensemble/hooks/session_start.py +212 -0
- metaensemble/hooks/session_summary.py +392 -0
- metaensemble/hooks/subagent_stop.py +94 -0
- metaensemble/lib/__init__.py +1 -0
- metaensemble/lib/config.py +414 -0
- metaensemble/lib/cost_gate.py +299 -0
- metaensemble/lib/dispatch.py +341 -0
- metaensemble/lib/doctor.py +1563 -0
- metaensemble/lib/file_events.py +395 -0
- metaensemble/lib/ids.py +91 -0
- metaensemble/lib/installer.py +5018 -0
- metaensemble/lib/ledger.py +812 -0
- metaensemble/lib/manifest.py +141 -0
- metaensemble/lib/native_state.py +463 -0
- metaensemble/lib/overlaps.py +155 -0
- metaensemble/lib/quality_gate.py +155 -0
- metaensemble/lib/quality_runners.py +446 -0
- metaensemble/lib/reconcile.py +420 -0
- metaensemble/lib/recording.py +422 -0
- metaensemble/lib/relaunch.py +174 -0
- metaensemble/lib/runtime_payload.py +42 -0
- metaensemble/lib/runtime_state.py +308 -0
- metaensemble/lib/sidecar.py +166 -0
- metaensemble/lib/topology.py +181 -0
- metaensemble/lib/transcript.py +432 -0
- metaensemble/output-styles/deliverable.md +33 -0
- metaensemble/output-styles/wire.md +38 -0
- metaensemble/roles/architect.md +52 -0
- metaensemble/roles/backend.md +43 -0
- metaensemble/roles/code-quality.md +49 -0
- metaensemble/roles/data-engineer.md +42 -0
- metaensemble/roles/devops.md +42 -0
- metaensemble/roles/docs.md +41 -0
- metaensemble/roles/frontend.md +42 -0
- metaensemble/roles/ml-engineer.md +42 -0
- metaensemble/roles/test-engineer.md +42 -0
- metaensemble/schemas/brief.schema.json +80 -0
- metaensemble/schemas/manifest.schema.json +142 -0
- metaensemble/schemas/role.schema.json +84 -0
- metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
- metaensemble/state/migrations/001_init.sql +72 -0
- metaensemble/state/migrations/002_outcome_extended.sql +86 -0
- metaensemble/state/migrations/003_run_provenance.sql +36 -0
- metaensemble/statusline/me_status.py +187 -0
- metaensemble/tools/__init__.py +7 -0
- metaensemble/tools/executors.py +62 -0
- metaensemble/tools/ledger.py +121 -0
- metaensemble/tools/limits.py +165 -0
- metaensemble/tools/perf.py +150 -0
- metaensemble/tools/standup.py +177 -0
- metaensemble/tools/stats.py +115 -0
- metaensemble-0.2.0.dist-info/METADATA +221 -0
- metaensemble-0.2.0.dist-info/RECORD +85 -0
- metaensemble-0.2.0.dist-info/WHEEL +5 -0
- metaensemble-0.2.0.dist-info/entry_points.txt +2 -0
- metaensemble-0.2.0.dist-info/licenses/LICENSE +21 -0
- metaensemble-0.2.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# MetaEnsemble evaluation default config.
|
|
2
|
+
#
|
|
3
|
+
# Read by `metaensemble eval` to set per-cycle parameters. Override on
|
|
4
|
+
# the command line with `--seeds`, `--budget-usd`, etc.
|
|
5
|
+
|
|
6
|
+
cycle:
|
|
7
|
+
# Number of seeds per cell. Smoke = 1, full = 5, very thorough = 10.
|
|
8
|
+
seeds: 5
|
|
9
|
+
# Per-run USD budget passed to `claude --max-budget-usd`. A run that
|
|
10
|
+
# exceeds the budget counts as a failure for `pass@budget`.
|
|
11
|
+
budget_usd: 0.30
|
|
12
|
+
# Default models for each role in a cell. Cheap models handle
|
|
13
|
+
# parsing/pre-flight; the executor uses the manifest tier;
|
|
14
|
+
# the reviewer and synthesizer go higher only when needed.
|
|
15
|
+
model_routing:
|
|
16
|
+
parser: claude-haiku-4-5
|
|
17
|
+
executor: claude-sonnet-4-6
|
|
18
|
+
reviewer: claude-sonnet-4-6
|
|
19
|
+
synthesizer: claude-opus-4-7
|
|
20
|
+
|
|
21
|
+
cells:
|
|
22
|
+
# Four baselines plus the canonical full system.
|
|
23
|
+
- id: B1_single_agent
|
|
24
|
+
kind: baseline
|
|
25
|
+
description: One model call, no Coordinator protocol, no Manifest, no Ledger.
|
|
26
|
+
- id: B2_single_agent_prompted
|
|
27
|
+
kind: baseline
|
|
28
|
+
description: B1 plus a strong system prompt naming role + acceptance criteria.
|
|
29
|
+
- id: B3_subagent_default
|
|
30
|
+
kind: baseline
|
|
31
|
+
description: Runtime's default subagent path; no MetaEnsemble layer.
|
|
32
|
+
- id: B4_best_prompt
|
|
33
|
+
kind: baseline
|
|
34
|
+
description: Best-effort single-agent baseline with same Manifest pointers
|
|
35
|
+
and acceptance criteria as MM. The strongest competitor MM has to beat.
|
|
36
|
+
- id: MM_full
|
|
37
|
+
kind: full_system
|
|
38
|
+
description: Coordinator + Manifest + Ledger + Quality gate.
|
|
39
|
+
# Three ablations.
|
|
40
|
+
- id: MM_minus_manifest
|
|
41
|
+
kind: ablation
|
|
42
|
+
description: Coordinator dispatches without composing a typed contract.
|
|
43
|
+
- id: MM_minus_ledger
|
|
44
|
+
kind: ablation
|
|
45
|
+
description: Coordinator dispatches without recording Run rows.
|
|
46
|
+
# Harness logs externally so removing the Ledger does not break measurement.
|
|
47
|
+
- id: MM_minus_quality_gate
|
|
48
|
+
kind: ablation
|
|
49
|
+
description: Coordinator dispatches without post-Deliverable quality check.
|
|
50
|
+
|
|
51
|
+
reporting:
|
|
52
|
+
# Confidence interval method for pass-rates.
|
|
53
|
+
confidence: wilson_95
|
|
54
|
+
# Token-waste threshold (D-9). Failed-run waste above this fraction of
|
|
55
|
+
# total tokens blocks a "ship" verdict.
|
|
56
|
+
failed_run_waste_threshold: 0.10
|
|
57
|
+
# Orchestration-overhead ceiling (D-8). MM tokens / B4 tokens above
|
|
58
|
+
# this ratio blocks a "ship" verdict.
|
|
59
|
+
overhead_ratio_ceiling: 2.0
|
|
File without changes
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Suite A — eight software-engineering tasks.
|
|
2
|
+
#
|
|
3
|
+
# Each task gates the cell's pass-rate against measurable acceptance
|
|
4
|
+
# criteria. The starting state is a frozen Git SHA so re-runs are
|
|
5
|
+
# reproducible; the acceptance criteria are executed by `runners/api.py`
|
|
6
|
+
# after the cell's deliverable lands.
|
|
7
|
+
|
|
8
|
+
tasks:
|
|
9
|
+
- id: a1_bugfix_off_by_one
|
|
10
|
+
title: Fix an off-by-one in a paginator
|
|
11
|
+
description: |
|
|
12
|
+
The paginator at `pagination.py:42` returns one fewer item than
|
|
13
|
+
expected on every page boundary. Locate and fix the bug; add a
|
|
14
|
+
regression test.
|
|
15
|
+
starting_repo: oss-fixture-paginator
|
|
16
|
+
starting_sha: __DEFERRED__
|
|
17
|
+
acceptance:
|
|
18
|
+
- kind: build_passes
|
|
19
|
+
- kind: test_count_at_least
|
|
20
|
+
value: 5
|
|
21
|
+
- kind: lint_clean
|
|
22
|
+
- kind: file_modified
|
|
23
|
+
path: pagination.py
|
|
24
|
+
- kind: file_modified
|
|
25
|
+
path: test_pagination.py
|
|
26
|
+
|
|
27
|
+
- id: a2_refactor_module
|
|
28
|
+
title: Split a 600-line module into three cohesive files
|
|
29
|
+
description: |
|
|
30
|
+
The module `legacy/big_module.py` (612 lines) has three responsibilities.
|
|
31
|
+
Split it into three files preserving the public API. No behavior change.
|
|
32
|
+
starting_repo: oss-fixture-legacy
|
|
33
|
+
starting_sha: __DEFERRED__
|
|
34
|
+
acceptance:
|
|
35
|
+
- kind: build_passes
|
|
36
|
+
- kind: test_count_at_least
|
|
37
|
+
value: 12
|
|
38
|
+
- kind: api_surface_preserved
|
|
39
|
+
|
|
40
|
+
- id: a3_doc_update
|
|
41
|
+
title: Document split rollback commands in USER-GUIDE
|
|
42
|
+
description: |
|
|
43
|
+
The CLI has separate project and user rollback commands:
|
|
44
|
+
`metaensemble unadopt --purge-state` and
|
|
45
|
+
`metaensemble user-teardown --purge-state`. Add a paragraph to
|
|
46
|
+
USER-GUIDE.md under the rollback section.
|
|
47
|
+
starting_repo: metaensemble
|
|
48
|
+
starting_sha: __DEFERRED__
|
|
49
|
+
acceptance:
|
|
50
|
+
- kind: markdown_links_resolve
|
|
51
|
+
- kind: file_modified
|
|
52
|
+
path: USER-GUIDE.md
|
|
53
|
+
|
|
54
|
+
- id: a4_test_addition
|
|
55
|
+
title: Add tests for the reconcile module
|
|
56
|
+
description: |
|
|
57
|
+
The `metaensemble.lib.reconcile` module has 8 tests covering Layer-1 and
|
|
58
|
+
Layer-2 reconcile. Add a test that asserts a sidecar's
|
|
59
|
+
`manifest_id` survives reconciliation into the Run row.
|
|
60
|
+
starting_repo: metaensemble
|
|
61
|
+
starting_sha: __DEFERRED__
|
|
62
|
+
acceptance:
|
|
63
|
+
- kind: build_passes
|
|
64
|
+
- kind: test_count_delta_at_least
|
|
65
|
+
value: 1
|
|
66
|
+
- kind: file_modified
|
|
67
|
+
path: metaensemble/tests/test_reconcile.py
|
|
68
|
+
|
|
69
|
+
- id: a5_design_review
|
|
70
|
+
title: Review the uninstall-mode design
|
|
71
|
+
description: |
|
|
72
|
+
Read the "Uninstall modes" and "Recovery and rollback" sections of
|
|
73
|
+
DEPLOYMENT.md and produce a one-page review
|
|
74
|
+
(`reports/<date>-uninstall-review.md`) naming at least three risks
|
|
75
|
+
the documented design does not address.
|
|
76
|
+
starting_repo: metaensemble
|
|
77
|
+
starting_sha: __DEFERRED__
|
|
78
|
+
acceptance:
|
|
79
|
+
- kind: file_exists
|
|
80
|
+
glob: reports/*-uninstall-review.md
|
|
81
|
+
- kind: word_count_at_least
|
|
82
|
+
value: 300
|
|
83
|
+
|
|
84
|
+
- id: a6_security_review
|
|
85
|
+
title: Security-review the new transcript walker
|
|
86
|
+
description: |
|
|
87
|
+
The new `metaensemble/lib/transcript.py` reads JSONL from `transcript_path`.
|
|
88
|
+
Write `reports/<date>-transcript-security.md` listing every defensive
|
|
89
|
+
assumption and one concrete attack the walker survives.
|
|
90
|
+
starting_repo: metaensemble
|
|
91
|
+
starting_sha: __DEFERRED__
|
|
92
|
+
acceptance:
|
|
93
|
+
- kind: file_exists
|
|
94
|
+
glob: reports/*-transcript-security.md
|
|
95
|
+
|
|
96
|
+
- id: a7_perf_tune
|
|
97
|
+
title: Tune `get_window_burn` to <5ms p95 on 100k Runs
|
|
98
|
+
description: |
|
|
99
|
+
Current p95 is 9ms on 100k Runs. Add the missing index and assert the
|
|
100
|
+
new p95 in `metaensemble/tests/test_perf_ledger.py`.
|
|
101
|
+
starting_repo: metaensemble
|
|
102
|
+
starting_sha: __DEFERRED__
|
|
103
|
+
acceptance:
|
|
104
|
+
- kind: build_passes
|
|
105
|
+
- kind: perf_benchmark_passes
|
|
106
|
+
benchmark: test_get_window_burn_meets_p95
|
|
107
|
+
|
|
108
|
+
- id: a8_infra_change
|
|
109
|
+
title: Add the no-quality CI matrix axis
|
|
110
|
+
description: |
|
|
111
|
+
The CI workflow should run pytest a second time with the quality
|
|
112
|
+
runners absent. Update `.github/workflows/ci.yml` and
|
|
113
|
+
add `@pytest.mark.requires_radon` / `requires_bandit` markers to
|
|
114
|
+
the tests that need them.
|
|
115
|
+
starting_repo: metaensemble
|
|
116
|
+
starting_sha: __DEFERRED__
|
|
117
|
+
acceptance:
|
|
118
|
+
- kind: ci_yaml_has_matrix_axis
|
|
119
|
+
axis: no-quality
|
|
120
|
+
|
|
121
|
+
# Per W8 sign-off, every starting_sha must resolve to a real commit before
|
|
122
|
+
# the harness ships its first benchmark. The __DEFERRED__ values are
|
|
123
|
+
# placeholders so the test-time loader does not need a network round-trip.
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Suite B — domain-specific classification smoke fixture.
|
|
2
|
+
#
|
|
3
|
+
# STATUS: SMOKE SUITE.
|
|
4
|
+
#
|
|
5
|
+
# Twelve labeled items is not enough to make calibration claims. This
|
|
6
|
+
# file is the smoke suite: it proves the
|
|
7
|
+
# pipeline end-to-end and lets a regression run in CI without
|
|
8
|
+
# real-money API spend. It is NOT the calibration set.
|
|
9
|
+
#
|
|
10
|
+
# This fixture's current labels happen to use a narrow Somali dialect
|
|
11
|
+
# classification task. That is an eval fixture, not MetaEnsemble's product
|
|
12
|
+
# scope. MetaEnsemble is project-agnostic.
|
|
13
|
+
#
|
|
14
|
+
# Smoke items are written by a non-native Somali reader using
|
|
15
|
+
# published dialect references. They are deliberately easy — clearly
|
|
16
|
+
# Northern/Standard Somali — so a pass means the pipeline works,
|
|
17
|
+
# not that the model is calibrated.
|
|
18
|
+
|
|
19
|
+
items:
|
|
20
|
+
- id: b_smoke_01
|
|
21
|
+
text: "Maxaa waaye sababta aad maanta ii soo wici weyday, ma mashquul baad ahayd?"
|
|
22
|
+
gold_label: northern_standard
|
|
23
|
+
acceptable_labels: [northern_standard]
|
|
24
|
+
notes: |
|
|
25
|
+
Clear Northern Somali markers: "wici weyday" (negative habitual)
|
|
26
|
+
and the interrogative "ma … baad ahayd" pattern.
|
|
27
|
+
|
|
28
|
+
- id: b_smoke_02
|
|
29
|
+
text: "Meeqo qof baa shirka yimid, yaase ka hadlay arrintaas?"
|
|
30
|
+
gold_label: northern_standard
|
|
31
|
+
acceptable_labels: [northern_standard]
|
|
32
|
+
notes: |
|
|
33
|
+
"Baa" focus marker + "yimid" verb form pin the dialect to
|
|
34
|
+
Northern/Standard.
|
|
35
|
+
|
|
36
|
+
- id: b_smoke_03
|
|
37
|
+
text: "Sidee laga yaabaa in arrintaan loo xalliyo si toosan?"
|
|
38
|
+
gold_label: northern_standard
|
|
39
|
+
acceptable_labels: [northern_standard]
|
|
40
|
+
notes: Standard register; common in news/formal speech.
|
|
41
|
+
|
|
42
|
+
- id: b_smoke_04
|
|
43
|
+
text: "Markii uu yimid, waxaan u sheegay inay tahay arrin muhiim ah."
|
|
44
|
+
gold_label: northern_standard
|
|
45
|
+
acceptable_labels: [northern_standard]
|
|
46
|
+
|
|
47
|
+
- id: b_smoke_05
|
|
48
|
+
text: "Waxaan tagay suuqa subaxnimo, badda agteeda."
|
|
49
|
+
gold_label: northern_standard
|
|
50
|
+
acceptable_labels: [northern_standard]
|
|
51
|
+
|
|
52
|
+
- id: b_smoke_06
|
|
53
|
+
text: "Maxaad u maleyneysaa inay ka jirto fursad cusub?"
|
|
54
|
+
gold_label: northern_standard
|
|
55
|
+
acceptable_labels: [northern_standard]
|
|
56
|
+
|
|
57
|
+
- id: b_smoke_07
|
|
58
|
+
text: "Sannadkaan dugsiga ardayda waa la kordhay."
|
|
59
|
+
gold_label: northern_standard
|
|
60
|
+
acceptable_labels: [northern_standard]
|
|
61
|
+
|
|
62
|
+
- id: b_smoke_08
|
|
63
|
+
text: "Markaa sidee ayaad mooddaa arrintaas?"
|
|
64
|
+
gold_label: northern_standard
|
|
65
|
+
acceptable_labels: [northern_standard]
|
|
66
|
+
|
|
67
|
+
- id: b_smoke_09
|
|
68
|
+
text: "Wax ka qabasho ah waa la sameeyay, balse natiijada lama hubo."
|
|
69
|
+
gold_label: northern_standard
|
|
70
|
+
acceptable_labels: [northern_standard]
|
|
71
|
+
|
|
72
|
+
- id: b_smoke_10
|
|
73
|
+
text: "Haddii aanan tegin, ma waxaa lagaa filan inaad ka soo qaybgasho?"
|
|
74
|
+
gold_label: northern_standard
|
|
75
|
+
acceptable_labels: [northern_standard]
|
|
76
|
+
|
|
77
|
+
- id: b_smoke_11
|
|
78
|
+
text: "Cuntada caawa waa fiicnaa, mahad waxaad leedahay."
|
|
79
|
+
gold_label: northern_standard
|
|
80
|
+
acceptable_labels: [northern_standard]
|
|
81
|
+
|
|
82
|
+
- id: b_smoke_12
|
|
83
|
+
text: "Khayraadka dalka ku jira waxay u baahan yihiin maamul fiican."
|
|
84
|
+
gold_label: northern_standard
|
|
85
|
+
acceptable_labels: [northern_standard]
|
|
86
|
+
|
|
87
|
+
# Leaving acceptable_labels as a singleton set on every item is part of the
|
|
88
|
+
# smoke-only contract. The calibration set will carry richer alternative
|
|
89
|
+
# labels (Maay, Benadiri, Northern, mixed, code-switching) per the
|
|
90
|
+
# failure-mode catalog the system card promises.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""MetaEnsemble evaluation runners.
|
|
2
|
+
|
|
3
|
+
Three tiers:
|
|
4
|
+
- replay: cassette-based, no API spend. PR gate.
|
|
5
|
+
- smoke: one seed × classification smoke set. Nightly.
|
|
6
|
+
- full: N seeds × every cell × every suite. Release gate.
|
|
7
|
+
|
|
8
|
+
Modules:
|
|
9
|
+
- api: tiered runner dispatch.
|
|
10
|
+
- metrics: Wilson CI, pass@budget, quality_per_1k_tokens, overhead ratio.
|
|
11
|
+
- replay: cassette reader for the PR tier (NOTE: no runners/replay.py appears in this package's file listing — confirm where it lives).
|
|
12
|
+
"""
|