superagentic-metaharness 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- superagentic_metaharness-0.1.0/LICENSE +21 -0
- superagentic_metaharness-0.1.0/PKG-INFO +355 -0
- superagentic_metaharness-0.1.0/README.md +327 -0
- superagentic_metaharness-0.1.0/pyproject.toml +62 -0
- superagentic_metaharness-0.1.0/setup.cfg +4 -0
- superagentic_metaharness-0.1.0/src/metaharness/__init__.py +27 -0
- superagentic_metaharness-0.1.0/src/metaharness/api.py +39 -0
- superagentic_metaharness-0.1.0/src/metaharness/bootstrap.py +260 -0
- superagentic_metaharness-0.1.0/src/metaharness/cli.py +437 -0
- superagentic_metaharness-0.1.0/src/metaharness/core/__init__.py +1 -0
- superagentic_metaharness-0.1.0/src/metaharness/core/engine.py +266 -0
- superagentic_metaharness-0.1.0/src/metaharness/core/protocols.py +14 -0
- superagentic_metaharness-0.1.0/src/metaharness/experiment_config.py +136 -0
- superagentic_metaharness-0.1.0/src/metaharness/experiments.py +379 -0
- superagentic_metaharness-0.1.0/src/metaharness/integrations/__init__.py +1 -0
- superagentic_metaharness-0.1.0/src/metaharness/integrations/coding_tool/__init__.py +16 -0
- superagentic_metaharness-0.1.0/src/metaharness/integrations/coding_tool/config.py +84 -0
- superagentic_metaharness-0.1.0/src/metaharness/integrations/coding_tool/runtime.py +415 -0
- superagentic_metaharness-0.1.0/src/metaharness/models.py +133 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/__init__.py +1 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/base.py +15 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/codex_exec.py +252 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/fake.py +98 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/gemini_cli.py +84 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/instructions.py +121 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/normalized_events.py +19 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/__init__.py +1 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/codex.py +144 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/gemini.py +41 -0
- superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/pi.py +38 -0
- superagentic_metaharness-0.1.0/src/metaharness/reporting.py +516 -0
- superagentic_metaharness-0.1.0/src/metaharness/scaffold.py +483 -0
- superagentic_metaharness-0.1.0/src/metaharness/store/__init__.py +1 -0
- superagentic_metaharness-0.1.0/src/metaharness/store/filesystem.py +271 -0
- superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/PKG-INFO +355 -0
- superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/SOURCES.txt +52 -0
- superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/dependency_links.txt +1 -0
- superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/entry_points.txt +2 -0
- superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/requires.txt +4 -0
- superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/top_level.txt +1 -0
- superagentic_metaharness-0.1.0/tests/test_bootstrap.py +28 -0
- superagentic_metaharness-0.1.0/tests/test_candidate_outcomes.py +172 -0
- superagentic_metaharness-0.1.0/tests/test_cli.py +388 -0
- superagentic_metaharness-0.1.0/tests/test_codex_parser.py +30 -0
- superagentic_metaharness-0.1.0/tests/test_codex_timeout.py +67 -0
- superagentic_metaharness-0.1.0/tests/test_coding_tool_config.py +114 -0
- superagentic_metaharness-0.1.0/tests/test_engine_fake_backend.py +84 -0
- superagentic_metaharness-0.1.0/tests/test_experiments.py +131 -0
- superagentic_metaharness-0.1.0/tests/test_instructions.py +38 -0
- superagentic_metaharness-0.1.0/tests/test_live_codex_smoke.py +52 -0
- superagentic_metaharness-0.1.0/tests/test_python_cli_benchmark.py +36 -0
- superagentic_metaharness-0.1.0/tests/test_python_fixture_benchmark.py +40 -0
- superagentic_metaharness-0.1.0/tests/test_reporting.py +126 -0
- superagentic_metaharness-0.1.0/tests/test_ticket_router_example.py +36 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shashi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: superagentic-metaharness
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Filesystem-first harness optimization for coding agents.
|
|
5
|
+
Author-email: Superagentic AI <hello@super-agentic.ai>, Shashi Jagtap <shashikant.jagtap@icloud.com>, Shashi Jagtap <shashi@super-agentic.ai>
|
|
6
|
+
Maintainer-email: Superagentic AI <hello@super-agentic.ai>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/SuperagenticAI/metaharness
|
|
9
|
+
Project-URL: Documentation, https://superagenticai.github.io/metaharness/
|
|
10
|
+
Project-URL: Repository, https://github.com/SuperagenticAI/metaharness
|
|
11
|
+
Project-URL: Issues, https://github.com/SuperagenticAI/metaharness/issues
|
|
12
|
+
Keywords: agents,benchmarking,codex,evaluation,harness,optimization
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: mkdocs<2,>=1.6; extra == "dev"
|
|
26
|
+
Requires-Dist: mkdocs-material<10,>=9.6; extra == "dev"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# metaharness
|
|
30
|
+
|
|
31
|
+
[CI](https://github.com/SuperagenticAI/metaharness/actions/workflows/ci.yml)
|
|
32
|
+
[Docs build](https://github.com/SuperagenticAI/metaharness/actions/workflows/pages.yml)
|
|
33
|
+
[Documentation](https://superagenticai.github.io/metaharness/)
|
|
34
|
+
[Python 3.11+](https://github.com/SuperagenticAI/metaharness/blob/main/pyproject.toml)
|
|
35
|
+
[License: MIT](https://github.com/SuperagenticAI/metaharness/blob/main/LICENSE)
|
|
36
|
+
[GitHub repository](https://github.com/SuperagenticAI/metaharness)
|
|
37
|
+
[Paper: arXiv 2603.28052](https://arxiv.org/pdf/2603.28052)
|
|
38
|
+
|
|
39
|
+
`metaharness` is an open source Python library for optimizing executable harnesses around agentic coding systems.
|
|
40
|
+
It is inspired by the [Meta Harness paper](https://arxiv.org/pdf/2603.28052) and is an unofficial open source implementation of the core ideas in that work.
|
|
41
|
+
The current implementation and benchmark evidence in this repository are centered on the Codex CLI path, including hosted Codex and Codex over local Ollama models.
|
|
42
|
+
|
|
43
|
+
It is built for teams who want to improve the code and files around an agent workflow, not just the prompt.
|
|
44
|
+
That includes instruction files, setup flows, validation scripts, test scripts, routing logic, and other executable support code.
|
|
45
|
+
|
|
46
|
+
## Why `metaharness`
|
|
47
|
+
|
|
48
|
+
Many agent failures come from the harness around the model:
|
|
49
|
+
|
|
50
|
+
- weak repository instructions
|
|
51
|
+
- missing setup steps
|
|
52
|
+
- broken validation logic
|
|
53
|
+
- incomplete test flows
|
|
54
|
+
- poor iteration memory
|
|
55
|
+
- acceptance checks that do not match the real task
|
|
56
|
+
|
|
57
|
+
`metaharness` turns those artifacts into a repeatable optimization target with stored evidence for every proposal.
|
|
58
|
+
It also captures a compact environment snapshot before each proposal so agents do not waste early turns on basic workspace discovery.
|
|
59
|
+
Projects can also declare an allowed write scope so off-target edits are rejected automatically.
|
|
60
|
+
|
|
61
|
+
## How It Works
|
|
62
|
+
|
|
63
|
+
`metaharness` runs an outer optimization loop around a harness:
|
|
64
|
+
|
|
65
|
+
1. start from a baseline workspace
|
|
66
|
+
2. ask a coding agent to improve it
|
|
67
|
+
3. validate and evaluate the result
|
|
68
|
+
4. keep the best candidate
|
|
69
|
+
5. store all artifacts on disk
|
|
70
|
+
|
|
71
|
+
The result is a practical, inspectable workflow for improving real harnesses instead of ad hoc prompt tinkering.
|
|
72
|
+
|
|
73
|
+
## Who It Is For
|
|
74
|
+
|
|
75
|
+
- developers building agentic coding systems who want to optimize harness code, workflow scripts, retrieval wrappers, routing, and evaluation flows
|
|
76
|
+
- practitioners using coding-agent tools who want to improve `AGENTS.md`, `GEMINI.md`, bootstrap scripts, validation scripts, and acceptance tests
|
|
77
|
+
|
|
78
|
+
## Quickstart
|
|
79
|
+
|
|
80
|
+
Install the project:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
uv sync
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Run the fake backend on a real benchmark:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name quickstart
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Inspect the run:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
uv run metaharness inspect examples/python_fixture_benchmark/runs/quickstart
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Export the candidate ledger:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
uv run metaharness ledger examples/python_fixture_benchmark/runs/quickstart --tsv
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Run a saved experiment matrix:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Core Capabilities
|
|
111
|
+
|
|
112
|
+
- a minimal optimization engine
|
|
113
|
+
- a filesystem-backed run store
|
|
114
|
+
- automatic environment bootstrap snapshots for each proposal
|
|
115
|
+
- optional write-scope enforcement through `allowed_write_paths`
|
|
116
|
+
- a provider-neutral proposer backend interface
|
|
117
|
+
- a real `CodexExecBackend`
|
|
118
|
+
- a deterministic `FakeBackend`
|
|
119
|
+
- a coding-tool integration for instruction files and script-based harnesses
|
|
120
|
+
- explicit per-candidate outcomes: `keep`, `discard`, `crash`, `timeout`, `no-change`, and `scope-violation`
|
|
121
|
+
- reporting commands for `inspect`, `ledger`, `summarize`, and `compare`
|
|
122
|
+
- experiment-matrix execution with JSON and TSV outputs
|
|
123
|
+
- benchmark targets and experiment records
|
|
124
|
+
|
|
125
|
+
## Current Status
|
|
126
|
+
|
|
127
|
+
The repository currently includes:
|
|
128
|
+
|
|
129
|
+
- two real coding-tool benchmark targets
|
|
130
|
+
- a smaller deterministic ticket-router example
|
|
131
|
+
- hosted Codex runs on the real benchmarks
|
|
132
|
+
- local Codex over Ollama runs with `gpt-oss:20b` and `gpt-oss:120b`
|
|
133
|
+
- a docs site published from GitHub Actions
|
|
134
|
+
|
|
135
|
+
Current documented experiments in this repository show:
|
|
136
|
+
|
|
137
|
+
- hosted Codex solves both real benchmarks in one proposal iteration
|
|
138
|
+
- local `gpt-oss:120b` solves `python_fixture_benchmark`
|
|
139
|
+
- local `gpt-oss:20b` is useful for smoke checks but timed out on the current real benchmark runs
|
|
140
|
+
|
|
141
|
+
Detailed experiment records:
|
|
142
|
+
|
|
143
|
+
- [Benchmark overview](BENCHMARKS.md)
|
|
144
|
+
- [Recorded benchmark results](BENCHMARK_RESULTS.md)
|
|
145
|
+
- [Experiment notes](docs/experiments.md)
|
|
146
|
+
|
|
147
|
+
## Provider Status
|
|
148
|
+
|
|
149
|
+
- Codex is the main validated harness path in this repository today
|
|
150
|
+
- hosted Codex is the strongest current path for real runs
|
|
151
|
+
- local Codex over Ollama works and has been exercised with `gpt-oss:20b` and `gpt-oss:120b`
|
|
152
|
+
- Gemini exists as a scaffolded backend and is not yet at parity with Codex
|
|
153
|
+
|
|
154
|
+
All real provider results currently documented in this repository were produced through the Codex CLI path.
|
|
155
|
+
That includes both hosted Codex runs and local Ollama runs driven through Codex with `gpt-oss` models.
|
|
156
|
+
Other coding-agent evaluations in the wider ecosystem often emphasize Claude Code and Opus, but this repository's current benchmark evidence is Codex-first.
|
|
157
|
+
|
|
158
|
+
## Documentation
|
|
159
|
+
|
|
160
|
+
- [Project documentation](https://superagenticai.github.io/metaharness/)
|
|
161
|
+
- [Getting started](https://superagenticai.github.io/metaharness/getting-started/)
|
|
162
|
+
- [Architecture](https://superagenticai.github.io/metaharness/architecture/)
|
|
163
|
+
- [Providers](https://superagenticai.github.io/metaharness/providers/)
|
|
164
|
+
- [Benchmarks](https://superagenticai.github.io/metaharness/benchmarks/)
|
|
165
|
+
- [CLI reference](https://superagenticai.github.io/metaharness/cli-reference/)
|
|
166
|
+
- [Experiments](https://superagenticai.github.io/metaharness/experiments/)
|
|
167
|
+
|
|
168
|
+
## Installation
|
|
169
|
+
|
|
170
|
+
Project setup:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
uv sync
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
If you want the docs toolchain too:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
uv sync --group dev
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Check the CLI:
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
uv run metaharness --help
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Editable install with `pip` also works:
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
pip install -e .
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Hosted Codex
|
|
195
|
+
|
|
196
|
+
Requirements:
|
|
197
|
+
|
|
198
|
+
- `codex` CLI installed
|
|
199
|
+
- authenticated Codex session or API key
|
|
200
|
+
- outbound network access
|
|
201
|
+
|
|
202
|
+
Run a real benchmark with hosted Codex:
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
uv run metaharness run examples/python_fixture_benchmark --backend codex --hosted --budget 1 --run-name hosted-codex
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Important:
|
|
209
|
+
|
|
210
|
+
- use `--hosted` when a project config defaults to local Ollama
|
|
211
|
+
- the library is ready for hosted Codex runs today
|
|
212
|
+
|
|
213
|
+
## Local Codex Over Ollama
|
|
214
|
+
|
|
215
|
+
Probe the local setup:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
uv run metaharness smoke codex examples/python_fixture_benchmark --probe-only --oss --local-provider ollama --model gpt-oss:20b
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Run with `gpt-oss:20b`:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:20b --proposal-timeout 240 --budget 1 --run-name ollama-20b
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Run with `gpt-oss:120b`:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:120b --proposal-timeout 420 --budget 1 --run-name ollama-120b
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Benchmarks And Examples
|
|
234
|
+
|
|
235
|
+
Real benchmarks:
|
|
236
|
+
|
|
237
|
+
- [examples/python_fixture_benchmark](examples/python_fixture_benchmark)
|
|
238
|
+
- [examples/python_cli_benchmark](examples/python_cli_benchmark)
|
|
239
|
+
|
|
240
|
+
Smaller deterministic example:
|
|
241
|
+
|
|
242
|
+
- [examples/ticket_router](examples/ticket_router)
|
|
243
|
+
|
|
244
|
+
Run the ticket router example:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
uv run python examples/ticket_router/run.py --backend fake --budget 1
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Scaffold Your Own Project
|
|
251
|
+
|
|
252
|
+
Create a coding-tool project:
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
uv run metaharness scaffold coding-tool ./my-coding-tool-optimizer
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
Available profiles:
|
|
259
|
+
|
|
260
|
+
- `standard`
|
|
261
|
+
- `local-oss-smoke`
|
|
262
|
+
- `local-oss-medium`
|
|
263
|
+
|
|
264
|
+
Run the scaffold with the fake backend:
|
|
265
|
+
|
|
266
|
+
```bash
|
|
267
|
+
uv run metaharness run ./my-coding-tool-optimizer --backend fake --budget 1
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
## CLI Overview
|
|
271
|
+
|
|
272
|
+
Create a scaffold:
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
uv run metaharness scaffold coding-tool ./my-project
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
Run a project:
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
uv run metaharness run ./my-project --backend fake --budget 1
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
Probe Codex:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
uv run metaharness smoke codex ./my-project --probe-only
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Inspect a run:
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
uv run metaharness inspect ./my-project/runs/example
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
Compare runs:
|
|
297
|
+
|
|
298
|
+
```bash
|
|
299
|
+
uv run metaharness compare \
|
|
300
|
+
./examples/python_fixture_benchmark/runs/hosted-codex-20260401 \
|
|
301
|
+
./examples/python_fixture_benchmark/runs/ollama-20b-20260401 \
|
|
302
|
+
./examples/python_fixture_benchmark/runs/ollama-120b-20260401
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
Run an experiment matrix:
|
|
306
|
+
|
|
307
|
+
```bash
|
|
308
|
+
uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## Benefits Of The Filesystem Approach
|
|
312
|
+
|
|
313
|
+
Every run stores:
|
|
314
|
+
|
|
315
|
+
- prompts
|
|
316
|
+
- candidate workspaces
|
|
317
|
+
- validation results
|
|
318
|
+
- evaluation results
|
|
319
|
+
- proposal metadata
|
|
320
|
+
- workspace diffs
|
|
321
|
+
- per-candidate manifests
|
|
322
|
+
|
|
323
|
+
That makes the optimization history reviewable, debuggable, and reusable.
|
|
324
|
+
|
|
325
|
+
## Development
|
|
326
|
+
|
|
327
|
+
Compile checks:
|
|
328
|
+
|
|
329
|
+
```bash
|
|
330
|
+
uv run python -m compileall -q src tests examples docs
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
Unit tests:
|
|
334
|
+
|
|
335
|
+
```bash
|
|
336
|
+
uv run python -m unittest discover -s tests -v
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
Docs build:
|
|
340
|
+
|
|
341
|
+
```bash
|
|
342
|
+
uv run mkdocs build --strict
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
Fake benchmark smoke runs:
|
|
346
|
+
|
|
347
|
+
```bash
|
|
348
|
+
uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name ci-fixture-local
|
|
349
|
+
uv run metaharness run examples/python_cli_benchmark --backend fake --budget 1 --run-name ci-cli-local
|
|
350
|
+
uv run python examples/ticket_router/run.py --backend fake --budget 1
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
## License
|
|
354
|
+
|
|
355
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
# metaharness
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/SuperagenticAI/metaharness/actions/workflows/ci.yml)
|
|
4
|
+
[Docs build](https://github.com/SuperagenticAI/metaharness/actions/workflows/pages.yml)
|
|
5
|
+
[Documentation](https://superagenticai.github.io/metaharness/)
|
|
6
|
+
[Python 3.11+](https://github.com/SuperagenticAI/metaharness/blob/main/pyproject.toml)
|
|
7
|
+
[License: MIT](https://github.com/SuperagenticAI/metaharness/blob/main/LICENSE)
|
|
8
|
+
[GitHub repository](https://github.com/SuperagenticAI/metaharness)
|
|
9
|
+
[Paper: arXiv 2603.28052](https://arxiv.org/pdf/2603.28052)
|
|
10
|
+
|
|
11
|
+
`metaharness` is an open source Python library for optimizing executable harnesses around agentic coding systems.
|
|
12
|
+
It is inspired by the [Meta Harness paper](https://arxiv.org/pdf/2603.28052) and is an unofficial open source implementation of the core ideas in that work.
|
|
13
|
+
The current implementation and benchmark evidence in this repository are centered on the Codex CLI path, including hosted Codex and Codex over local Ollama models.
|
|
14
|
+
|
|
15
|
+
It is built for teams who want to improve the code and files around an agent workflow, not just the prompt.
|
|
16
|
+
That includes instruction files, setup flows, validation scripts, test scripts, routing logic, and other executable support code.
|
|
17
|
+
|
|
18
|
+
## Why `metaharness`
|
|
19
|
+
|
|
20
|
+
Many agent failures come from the harness around the model:
|
|
21
|
+
|
|
22
|
+
- weak repository instructions
|
|
23
|
+
- missing setup steps
|
|
24
|
+
- broken validation logic
|
|
25
|
+
- incomplete test flows
|
|
26
|
+
- poor iteration memory
|
|
27
|
+
- acceptance checks that do not match the real task
|
|
28
|
+
|
|
29
|
+
`metaharness` turns those artifacts into a repeatable optimization target with stored evidence for every proposal.
|
|
30
|
+
It also captures a compact environment snapshot before each proposal so agents do not waste early turns on basic workspace discovery.
|
|
31
|
+
Projects can also declare an allowed write scope so off-target edits are rejected automatically.
|
|
32
|
+
|
|
33
|
+
## How It Works
|
|
34
|
+
|
|
35
|
+
`metaharness` runs an outer optimization loop around a harness:
|
|
36
|
+
|
|
37
|
+
1. start from a baseline workspace
|
|
38
|
+
2. ask a coding agent to improve it
|
|
39
|
+
3. validate and evaluate the result
|
|
40
|
+
4. keep the best candidate
|
|
41
|
+
5. store all artifacts on disk
|
|
42
|
+
|
|
43
|
+
The result is a practical, inspectable workflow for improving real harnesses instead of ad hoc prompt tinkering.
|
|
44
|
+
|
|
45
|
+
## Who It Is For
|
|
46
|
+
|
|
47
|
+
- developers building agentic coding systems who want to optimize harness code, workflow scripts, retrieval wrappers, routing, and evaluation flows
|
|
48
|
+
- practitioners using coding-agent tools who want to improve `AGENTS.md`, `GEMINI.md`, bootstrap scripts, validation scripts, and acceptance tests
|
|
49
|
+
|
|
50
|
+
## Quickstart
|
|
51
|
+
|
|
52
|
+
Install the project:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
uv sync
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Run the fake backend on a real benchmark:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name quickstart
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Inspect the run:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
uv run metaharness inspect examples/python_fixture_benchmark/runs/quickstart
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Export the candidate ledger:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
uv run metaharness ledger examples/python_fixture_benchmark/runs/quickstart --tsv
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Run a saved experiment matrix:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Core Capabilities
|
|
83
|
+
|
|
84
|
+
- a minimal optimization engine
|
|
85
|
+
- a filesystem-backed run store
|
|
86
|
+
- automatic environment bootstrap snapshots for each proposal
|
|
87
|
+
- optional write-scope enforcement through `allowed_write_paths`
|
|
88
|
+
- a provider-neutral proposer backend interface
|
|
89
|
+
- a real `CodexExecBackend`
|
|
90
|
+
- a deterministic `FakeBackend`
|
|
91
|
+
- a coding-tool integration for instruction files and script-based harnesses
|
|
92
|
+
- explicit per-candidate outcomes: `keep`, `discard`, `crash`, `timeout`, `no-change`, and `scope-violation`
|
|
93
|
+
- reporting commands for `inspect`, `ledger`, `summarize`, and `compare`
|
|
94
|
+
- experiment-matrix execution with JSON and TSV outputs
|
|
95
|
+
- benchmark targets and experiment records
|
|
96
|
+
|
|
97
|
+
## Current Status
|
|
98
|
+
|
|
99
|
+
The repository currently includes:
|
|
100
|
+
|
|
101
|
+
- two real coding-tool benchmark targets
|
|
102
|
+
- a smaller deterministic ticket-router example
|
|
103
|
+
- hosted Codex runs on the real benchmarks
|
|
104
|
+
- local Codex over Ollama runs with `gpt-oss:20b` and `gpt-oss:120b`
|
|
105
|
+
- a docs site published from GitHub Actions
|
|
106
|
+
|
|
107
|
+
Current documented experiments in this repository show:
|
|
108
|
+
|
|
109
|
+
- hosted Codex solves both real benchmarks in one proposal iteration
|
|
110
|
+
- local `gpt-oss:120b` solves `python_fixture_benchmark`
|
|
111
|
+
- local `gpt-oss:20b` is useful for smoke checks but timed out on the current real benchmark runs
|
|
112
|
+
|
|
113
|
+
Detailed experiment records:
|
|
114
|
+
|
|
115
|
+
- [Benchmark overview](BENCHMARKS.md)
|
|
116
|
+
- [Recorded benchmark results](BENCHMARK_RESULTS.md)
|
|
117
|
+
- [Experiment notes](docs/experiments.md)
|
|
118
|
+
|
|
119
|
+
## Provider Status
|
|
120
|
+
|
|
121
|
+
- Codex is the main validated harness path in this repository today
|
|
122
|
+
- hosted Codex is the strongest current path for real runs
|
|
123
|
+
- local Codex over Ollama works and has been exercised with `gpt-oss:20b` and `gpt-oss:120b`
|
|
124
|
+
- Gemini exists as a scaffolded backend and is not yet at parity with Codex
|
|
125
|
+
|
|
126
|
+
All real provider results currently documented in this repository were produced through the Codex CLI path.
|
|
127
|
+
That includes both hosted Codex runs and local Ollama runs driven through Codex with `gpt-oss` models.
|
|
128
|
+
Other coding-agent evaluations in the wider ecosystem often emphasize Claude Code and Opus, but this repository's current benchmark evidence is Codex-first.
|
|
129
|
+
|
|
130
|
+
## Documentation
|
|
131
|
+
|
|
132
|
+
- [Project documentation](https://superagenticai.github.io/metaharness/)
|
|
133
|
+
- [Getting started](https://superagenticai.github.io/metaharness/getting-started/)
|
|
134
|
+
- [Architecture](https://superagenticai.github.io/metaharness/architecture/)
|
|
135
|
+
- [Providers](https://superagenticai.github.io/metaharness/providers/)
|
|
136
|
+
- [Benchmarks](https://superagenticai.github.io/metaharness/benchmarks/)
|
|
137
|
+
- [CLI reference](https://superagenticai.github.io/metaharness/cli-reference/)
|
|
138
|
+
- [Experiments](https://superagenticai.github.io/metaharness/experiments/)
|
|
139
|
+
|
|
140
|
+
## Installation
|
|
141
|
+
|
|
142
|
+
Project setup:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
uv sync
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
If you want the docs toolchain too:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
uv sync --group dev
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Check the CLI:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
uv run metaharness --help
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Editable install with `pip` also works:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
pip install -e .
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Hosted Codex
|
|
167
|
+
|
|
168
|
+
Requirements:
|
|
169
|
+
|
|
170
|
+
- `codex` CLI installed
|
|
171
|
+
- authenticated Codex session or API key
|
|
172
|
+
- outbound network access
|
|
173
|
+
|
|
174
|
+
Run a real benchmark with hosted Codex:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
uv run metaharness run examples/python_fixture_benchmark --backend codex --hosted --budget 1 --run-name hosted-codex
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Important:
|
|
181
|
+
|
|
182
|
+
- use `--hosted` when a project config defaults to local Ollama
|
|
183
|
+
- the library is ready for hosted Codex runs today
|
|
184
|
+
|
|
185
|
+
## Local Codex Over Ollama
|
|
186
|
+
|
|
187
|
+
Probe the local setup:
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
uv run metaharness smoke codex examples/python_fixture_benchmark --probe-only --oss --local-provider ollama --model gpt-oss:20b
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Run with `gpt-oss:20b`:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:20b --proposal-timeout 240 --budget 1 --run-name ollama-20b
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Run with `gpt-oss:120b`:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:120b --proposal-timeout 420 --budget 1 --run-name ollama-120b
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Benchmarks And Examples
|
|
206
|
+
|
|
207
|
+
Real benchmarks:
|
|
208
|
+
|
|
209
|
+
- [examples/python_fixture_benchmark](examples/python_fixture_benchmark)
|
|
210
|
+
- [examples/python_cli_benchmark](examples/python_cli_benchmark)
|
|
211
|
+
|
|
212
|
+
Smaller deterministic example:
|
|
213
|
+
|
|
214
|
+
- [examples/ticket_router](examples/ticket_router)
|
|
215
|
+
|
|
216
|
+
Run the ticket router example:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
uv run python examples/ticket_router/run.py --backend fake --budget 1
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Scaffold Your Own Project
|
|
223
|
+
|
|
224
|
+
Create a coding-tool project:
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
uv run metaharness scaffold coding-tool ./my-coding-tool-optimizer
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
Available profiles:
|
|
231
|
+
|
|
232
|
+
- `standard`
|
|
233
|
+
- `local-oss-smoke`
|
|
234
|
+
- `local-oss-medium`
|
|
235
|
+
|
|
236
|
+
Run the scaffold with the fake backend:
|
|
237
|
+
|
|
238
|
+
```bash
|
|
239
|
+
uv run metaharness run ./my-coding-tool-optimizer --backend fake --budget 1
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
## CLI Overview
|
|
243
|
+
|
|
244
|
+
Create a scaffold:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
uv run metaharness scaffold coding-tool ./my-project
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
Run a project:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
uv run metaharness run ./my-project --backend fake --budget 1
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Probe Codex:
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
uv run metaharness smoke codex ./my-project --probe-only
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Inspect a run:
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
uv run metaharness inspect ./my-project/runs/example
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
Compare runs:
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
uv run metaharness compare \
|
|
272
|
+
./examples/python_fixture_benchmark/runs/hosted-codex-20260401 \
|
|
273
|
+
./examples/python_fixture_benchmark/runs/ollama-20b-20260401 \
|
|
274
|
+
./examples/python_fixture_benchmark/runs/ollama-120b-20260401
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Run an experiment matrix:
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## Benefits Of The Filesystem Approach
|
|
284
|
+
|
|
285
|
+
Every run stores:
|
|
286
|
+
|
|
287
|
+
- prompts
|
|
288
|
+
- candidate workspaces
|
|
289
|
+
- validation results
|
|
290
|
+
- evaluation results
|
|
291
|
+
- proposal metadata
|
|
292
|
+
- workspace diffs
|
|
293
|
+
- per-candidate manifests
|
|
294
|
+
|
|
295
|
+
That makes the optimization history reviewable, debuggable, and reusable.
|
|
296
|
+
|
|
297
|
+
## Development
|
|
298
|
+
|
|
299
|
+
Compile checks:
|
|
300
|
+
|
|
301
|
+
```bash
|
|
302
|
+
uv run python -m compileall -q src tests examples docs
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
Unit tests:
|
|
306
|
+
|
|
307
|
+
```bash
|
|
308
|
+
uv run python -m unittest discover -s tests -v
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
Docs build:
|
|
312
|
+
|
|
313
|
+
```bash
|
|
314
|
+
uv run mkdocs build --strict
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
Fake benchmark smoke runs:
|
|
318
|
+
|
|
319
|
+
```bash
|
|
320
|
+
uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name ci-fixture-local
|
|
321
|
+
uv run metaharness run examples/python_cli_benchmark --backend fake --budget 1 --run-name ci-cli-local
|
|
322
|
+
uv run python examples/ticket_router/run.py --backend fake --budget 1
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
## License
|
|
326
|
+
|
|
327
|
+
MIT. See [LICENSE](LICENSE).
|