metaensemble 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. evals/README.md +147 -0
  2. evals/__init__.py +0 -0
  3. evals/cassettes/README.md +10 -0
  4. evals/cassettes/bootstrap.jsonl +800 -0
  5. evals/configs/default.yaml +59 -0
  6. evals/datasets/__init__.py +0 -0
  7. evals/datasets/suite_a/tasks.yaml +123 -0
  8. evals/datasets/suite_b/items.yaml +90 -0
  9. evals/runners/__init__.py +12 -0
  10. evals/runners/api.py +518 -0
  11. evals/runners/metrics.py +132 -0
  12. metaensemble/__init__.py +13 -0
  13. metaensemble/cli.py +1362 -0
  14. metaensemble/commands/dispatch.md +39 -0
  15. metaensemble/commands/executors.md +12 -0
  16. metaensemble/commands/ledger.md +19 -0
  17. metaensemble/commands/limits.md +12 -0
  18. metaensemble/commands/perf.md +12 -0
  19. metaensemble/commands/relaunch.md +29 -0
  20. metaensemble/commands/standup.md +14 -0
  21. metaensemble/config/budgets.example.yaml +72 -0
  22. metaensemble/config/quality.example.yaml +82 -0
  23. metaensemble/hooks/__init__.py +1 -0
  24. metaensemble/hooks/_common.py +148 -0
  25. metaensemble/hooks/deliverable_sync.py +73 -0
  26. metaensemble/hooks/file_event.py +303 -0
  27. metaensemble/hooks/post_task.py +460 -0
  28. metaensemble/hooks/pre_task.py +548 -0
  29. metaensemble/hooks/session_start.py +212 -0
  30. metaensemble/hooks/session_summary.py +392 -0
  31. metaensemble/hooks/subagent_stop.py +94 -0
  32. metaensemble/lib/__init__.py +1 -0
  33. metaensemble/lib/config.py +414 -0
  34. metaensemble/lib/cost_gate.py +299 -0
  35. metaensemble/lib/dispatch.py +341 -0
  36. metaensemble/lib/doctor.py +1563 -0
  37. metaensemble/lib/file_events.py +395 -0
  38. metaensemble/lib/ids.py +91 -0
  39. metaensemble/lib/installer.py +5018 -0
  40. metaensemble/lib/ledger.py +812 -0
  41. metaensemble/lib/manifest.py +141 -0
  42. metaensemble/lib/native_state.py +463 -0
  43. metaensemble/lib/overlaps.py +155 -0
  44. metaensemble/lib/quality_gate.py +155 -0
  45. metaensemble/lib/quality_runners.py +446 -0
  46. metaensemble/lib/reconcile.py +420 -0
  47. metaensemble/lib/recording.py +422 -0
  48. metaensemble/lib/relaunch.py +174 -0
  49. metaensemble/lib/runtime_payload.py +42 -0
  50. metaensemble/lib/runtime_state.py +308 -0
  51. metaensemble/lib/sidecar.py +166 -0
  52. metaensemble/lib/topology.py +181 -0
  53. metaensemble/lib/transcript.py +432 -0
  54. metaensemble/output-styles/deliverable.md +33 -0
  55. metaensemble/output-styles/wire.md +38 -0
  56. metaensemble/roles/architect.md +52 -0
  57. metaensemble/roles/backend.md +43 -0
  58. metaensemble/roles/code-quality.md +49 -0
  59. metaensemble/roles/data-engineer.md +42 -0
  60. metaensemble/roles/devops.md +42 -0
  61. metaensemble/roles/docs.md +41 -0
  62. metaensemble/roles/frontend.md +42 -0
  63. metaensemble/roles/ml-engineer.md +42 -0
  64. metaensemble/roles/test-engineer.md +42 -0
  65. metaensemble/schemas/brief.schema.json +80 -0
  66. metaensemble/schemas/manifest.schema.json +142 -0
  67. metaensemble/schemas/role.schema.json +84 -0
  68. metaensemble/skills/metaensemble-protocol/SKILL.md +226 -0
  69. metaensemble/state/migrations/001_init.sql +72 -0
  70. metaensemble/state/migrations/002_outcome_extended.sql +86 -0
  71. metaensemble/state/migrations/003_run_provenance.sql +36 -0
  72. metaensemble/statusline/me_status.py +187 -0
  73. metaensemble/tools/__init__.py +7 -0
  74. metaensemble/tools/executors.py +62 -0
  75. metaensemble/tools/ledger.py +121 -0
  76. metaensemble/tools/limits.py +165 -0
  77. metaensemble/tools/perf.py +150 -0
  78. metaensemble/tools/standup.py +177 -0
  79. metaensemble/tools/stats.py +115 -0
  80. metaensemble-0.2.0.dist-info/METADATA +221 -0
  81. metaensemble-0.2.0.dist-info/RECORD +85 -0
  82. metaensemble-0.2.0.dist-info/WHEEL +5 -0
  83. metaensemble-0.2.0.dist-info/entry_points.txt +2 -0
  84. metaensemble-0.2.0.dist-info/licenses/LICENSE +21 -0
  85. metaensemble-0.2.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,59 @@
1
+ # MetaEnsemble evaluation default config.
2
+ #
3
+ # Read by `metaensemble eval` to set per-cycle parameters. Override on
4
+ # the command line with `--seeds`, `--budget-usd`, etc.
5
+
6
+ cycle:
7
+ # Number of seeds per cell. Smoke = 1, full = 5, very thorough = 10.
8
+ seeds: 5
9
+ # Per-run USD budget passed to `claude --max-budget-usd`. A run that
10
+ # exceeds the budget counts as a failure for `pass@budget`.
11
+ budget_usd: 0.30
12
+ # Default models for each role in a cell. Cheap models handle
13
+ # parsing/pre-flight; the executor uses the manifest tier;
14
+ # the reviewer and synthesizer go higher only when needed.
15
+ model_routing:
16
+ parser: claude-haiku-4-5
17
+ executor: claude-sonnet-4-6
18
+ reviewer: claude-sonnet-4-6
19
+ synthesizer: claude-opus-4-7
20
+
21
+ cells:
22
+ # Four baselines plus the canonical full system.
23
+ - id: B1_single_agent
24
+ kind: baseline
25
+ description: One model call, no Coordinator protocol, no Manifest, no Ledger.
26
+ - id: B2_single_agent_prompted
27
+ kind: baseline
28
+ description: B1 plus a strong system prompt naming role + acceptance criteria.
29
+ - id: B3_subagent_default
30
+ kind: baseline
31
+ description: Runtime's default subagent path; no MetaEnsemble layer.
32
+ - id: B4_best_prompt
33
+ kind: baseline
34
+ description: Best-effort single-agent baseline with same Manifest pointers
35
+ and acceptance criteria as MM. The strongest competitor MM has to beat.
36
+ - id: MM_full
37
+ kind: full_system
38
+ description: Coordinator + Manifest + Ledger + Quality gate.
39
+ # Three ablations.
40
+ - id: MM_minus_manifest
41
+ kind: ablation
42
+ description: Coordinator dispatches without composing a typed contract.
43
+ - id: MM_minus_ledger
44
+ kind: ablation
45
+ description: Coordinator dispatches without recording Run rows.
46
+ # Harness logs externally so removing the Ledger does not break measurement.
47
+ - id: MM_minus_quality_gate
48
+ kind: ablation
49
+ description: Coordinator dispatches without post-Deliverable quality check.
50
+
51
+ reporting:
52
+ # Confidence interval method for pass-rates.
53
+ confidence: wilson_95
54
+ # Token-waste threshold (D-9). Failed-run waste above this fraction of
55
+ # total tokens blocks a "ship" verdict.
56
+ failed_run_waste_threshold: 0.10
57
+ # Orchestration-overhead ceiling (D-8). MM tokens / B4 tokens above
58
+ # this ratio blocks a "ship" verdict.
59
+ overhead_ratio_ceiling: 2.0
File without changes
@@ -0,0 +1,123 @@
1
+ # Suite A — eight software-engineering tasks.
2
+ #
3
+ # Each task gates the cell's pass-rate against measurable acceptance
4
+ # criteria. The starting state is a frozen Git SHA so re-runs are
5
+ # reproducible; the acceptance criteria are executed by `runners/api.py`
6
+ # after the cell's deliverable lands.
7
+
8
+ tasks:
9
+ - id: a1_bugfix_off_by_one
10
+ title: Fix an off-by-one in a paginator
11
+ description: |
12
+ The paginator at `pagination.py:42` returns one fewer item than
13
+ expected on every page boundary. Locate and fix the bug; add a
14
+ regression test.
15
+ starting_repo: oss-fixture-paginator
16
+ starting_sha: __DEFERRED__
17
+ acceptance:
18
+ - kind: build_passes
19
+ - kind: test_count_at_least
20
+ value: 5
21
+ - kind: lint_clean
22
+ - kind: file_modified
23
+ path: pagination.py
24
+ - kind: file_modified
25
+ path: test_pagination.py
26
+
27
+ - id: a2_refactor_module
28
+ title: Split a 600-line module into three cohesive files
29
+ description: |
30
+ The module `legacy/big_module.py` (612 lines) has three responsibilities.
31
+ Split it into three files preserving the public API. No behavior change.
32
+ starting_repo: oss-fixture-legacy
33
+ starting_sha: __DEFERRED__
34
+ acceptance:
35
+ - kind: build_passes
36
+ - kind: test_count_at_least
37
+ value: 12
38
+ - kind: api_surface_preserved
39
+
40
+ - id: a3_doc_update
41
+ title: Document split rollback commands in USER-GUIDE
42
+ description: |
43
+ The CLI has separate project and user rollback commands:
44
+ `metaensemble unadopt --purge-state` and
45
+ `metaensemble user-teardown --purge-state`. Add a paragraph to
46
+ USER-GUIDE.md under the rollback section.
47
+ starting_repo: metaensemble
48
+ starting_sha: __DEFERRED__
49
+ acceptance:
50
+ - kind: markdown_links_resolve
51
+ - kind: file_modified
52
+ path: USER-GUIDE.md
53
+
54
+ - id: a4_test_addition
55
+ title: Add tests for the reconcile module
56
+ description: |
57
+ The `metaensemble.lib.reconcile` module has 8 tests covering Layer-1 and
58
+ Layer-2 reconcile. Add a test that asserts a sidecar's
59
+ `manifest_id` survives reconciliation into the Run row.
60
+ starting_repo: metaensemble
61
+ starting_sha: __DEFERRED__
62
+ acceptance:
63
+ - kind: build_passes
64
+ - kind: test_count_delta_at_least
65
+ value: 1
66
+ - kind: file_modified
67
+ path: metaensemble/tests/test_reconcile.py
68
+
69
+ - id: a5_design_review
70
+ title: Review the uninstall-mode design
71
+ description: |
72
+ Read the "Uninstall modes" and "Recovery and rollback" sections of
73
+ DEPLOYMENT.md and produce a one-page review
74
+ (`reports/<date>-uninstall-review.md`) naming at least three risks
75
+ the documented design does not address.
76
+ starting_repo: metaensemble
77
+ starting_sha: __DEFERRED__
78
+ acceptance:
79
+ - kind: file_exists
80
+ glob: reports/*-uninstall-review.md
81
+ - kind: word_count_at_least
82
+ value: 300
83
+
84
+ - id: a6_security_review
85
+ title: Security-review the new transcript walker
86
+ description: |
87
+ The new `metaensemble/lib/transcript.py` reads JSONL from `transcript_path`.
88
+ Write `reports/<date>-transcript-security.md` listing every defensive
89
+ assumption and one concrete attack the walker survives.
90
+ starting_repo: metaensemble
91
+ starting_sha: __DEFERRED__
92
+ acceptance:
93
+ - kind: file_exists
94
+ glob: reports/*-transcript-security.md
95
+
96
+ - id: a7_perf_tune
97
+ title: Tune `get_window_burn` to <5ms p95 on 100k Runs
98
+ description: |
99
+ Current p95 is 9ms on 100k Runs. Add the missing index and assert the
100
+ new p95 in `metaensemble/tests/test_perf_ledger.py`.
101
+ starting_repo: metaensemble
102
+ starting_sha: __DEFERRED__
103
+ acceptance:
104
+ - kind: build_passes
105
+ - kind: perf_benchmark_passes
106
+ benchmark: test_get_window_burn_meets_p95
107
+
108
+ - id: a8_infra_change
109
+ title: Add the no-quality CI matrix axis
110
+ description: |
111
+ The CI workflow should run pytest a second time with the quality
112
+ runners absent. Update `.github/workflows/ci.yml` and
113
+ add `@pytest.mark.requires_radon` / `requires_bandit` markers to
114
+ the tests that need them.
115
+ starting_repo: metaensemble
116
+ starting_sha: __DEFERRED__
117
+ acceptance:
118
+ - kind: ci_yaml_has_matrix_axis
119
+ axis: no-quality
120
+
121
+ # Per W8 sign-off, every starting_sha must resolve to a real commit before
122
+ # the harness ships its first benchmark. The __DEFERRED__ values are
123
+ # placeholders so the test-time loader does not need a network round-trip.
@@ -0,0 +1,90 @@
1
+ # Suite B — domain-specific classification smoke fixture.
2
+ #
3
+ # STATUS: SMOKE SUITE.
4
+ #
5
+ # Twelve labeled items is not enough to make calibration claims. This
6
+ # file is the smoke suite: it proves the
7
+ # pipeline end-to-end and lets a regression run in CI without
8
+ # real-money API spend. It is NOT the calibration set.
9
+ #
10
+ # This fixture's current items happen to come from a narrow Somali dialect
11
+ # classification task. That is an eval fixture, not MetaEnsemble's product
12
+ # scope. MetaEnsemble is project-agnostic.
13
+ #
14
+ # Smoke items are written by a non-native Somali reader using
15
+ # published dialect references. They are deliberately easy — clearly
16
+ # Northern/Standard Somali — so a pass means the pipeline works,
17
+ # not that the model is calibrated.
18
+
19
+ items:
20
+ - id: b_smoke_01
21
+ text: "Maxaa waaye sababta aad maanta ii soo wici weyday, ma mashquul baad ahayd?"
22
+ gold_label: northern_standard
23
+ acceptable_labels: [northern_standard]
24
+ notes: |
25
+ Clear Northern Somali markers: "wici weyday" (negative habitual)
26
+ and the interrogative "ma … baad ahayd" pattern.
27
+
28
+ - id: b_smoke_02
29
+ text: "Meeqo qof baa shirka yimid, yaase ka hadlay arrintaas?"
30
+ gold_label: northern_standard
31
+ acceptable_labels: [northern_standard]
32
+ notes: |
33
+ "Baa" focus marker + "yimid" verb form pin the dialect to
34
+ Northern/Standard.
35
+
36
+ - id: b_smoke_03
37
+ text: "Sidee laga yaabaa in arrintaan loo xalliyo si toosan?"
38
+ gold_label: northern_standard
39
+ acceptable_labels: [northern_standard]
40
+ notes: Standard register; common in news/formal speech.
41
+
42
+ - id: b_smoke_04
43
+ text: "Markii uu yimid, waxaan u sheegay inay tahay arrin muhiim ah."
44
+ gold_label: northern_standard
45
+ acceptable_labels: [northern_standard]
46
+
47
+ - id: b_smoke_05
48
+ text: "Waxaan tagay suuqa subaxnimo, badda agteeda."
49
+ gold_label: northern_standard
50
+ acceptable_labels: [northern_standard]
51
+
52
+ - id: b_smoke_06
53
+ text: "Maxaad u maleyneysaa inay ka jirto fursad cusub?"
54
+ gold_label: northern_standard
55
+ acceptable_labels: [northern_standard]
56
+
57
+ - id: b_smoke_07
58
+ text: "Sannadkaan dugsiga ardayda waa la kordhay."
59
+ gold_label: northern_standard
60
+ acceptable_labels: [northern_standard]
61
+
62
+ - id: b_smoke_08
63
+ text: "Markaa sidee ayaad mooddaa arrintaas?"
64
+ gold_label: northern_standard
65
+ acceptable_labels: [northern_standard]
66
+
67
+ - id: b_smoke_09
68
+ text: "Wax ka qabasho ah waa la sameeyay, balse natiijada lama hubo."
69
+ gold_label: northern_standard
70
+ acceptable_labels: [northern_standard]
71
+
72
+ - id: b_smoke_10
73
+ text: "Haddii aanan tegin, ma waxaa lagaa filan inaad ka soo qaybgasho?"
74
+ gold_label: northern_standard
75
+ acceptable_labels: [northern_standard]
76
+
77
+ - id: b_smoke_11
78
+ text: "Cuntada caawa waa fiicnaa, mahad waxaad leedahay."
79
+ gold_label: northern_standard
80
+ acceptable_labels: [northern_standard]
81
+
82
+ - id: b_smoke_12
83
+ text: "Khayraadka dalka ku jira waxay u baahan yihiin maamul fiican."
84
+ gold_label: northern_standard
85
+ acceptable_labels: [northern_standard]
86
+
87
+ # Leaving `acceptable_labels` a singleton set on every item is part of the
88
+ # smoke-only contract. The calibration set will carry richer alternative
89
+ # labels (Maay, Benadiri, Northern, mixed, code-switching) per the
90
+ # failure-mode catalog the system card promises.
@@ -0,0 +1,12 @@
1
+ """MetaEnsemble evaluation runners.
2
+
3
+ Three tiers:
4
+ - replay: cassette-based, no API spend. PR gate.
5
+ - smoke: one seed × classification smoke set. Nightly.
6
+ - full: N seeds × every cell × every suite. Release gate.
7
+
8
+ Modules:
9
+ - api: tiered runner dispatch.
10
+ - metrics: Wilson CI, pass@budget, quality_per_1k_tokens, overhead ratio.
11
+ - replay: cassette reader for the PR tier. (TODO confirm: no `replay.py` is packaged under `evals/runners/`.)
12
+ """