superagentic-metaharness 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. superagentic_metaharness-0.1.0/LICENSE +21 -0
  2. superagentic_metaharness-0.1.0/PKG-INFO +355 -0
  3. superagentic_metaharness-0.1.0/README.md +327 -0
  4. superagentic_metaharness-0.1.0/pyproject.toml +62 -0
  5. superagentic_metaharness-0.1.0/setup.cfg +4 -0
  6. superagentic_metaharness-0.1.0/src/metaharness/__init__.py +27 -0
  7. superagentic_metaharness-0.1.0/src/metaharness/api.py +39 -0
  8. superagentic_metaharness-0.1.0/src/metaharness/bootstrap.py +260 -0
  9. superagentic_metaharness-0.1.0/src/metaharness/cli.py +437 -0
  10. superagentic_metaharness-0.1.0/src/metaharness/core/__init__.py +1 -0
  11. superagentic_metaharness-0.1.0/src/metaharness/core/engine.py +266 -0
  12. superagentic_metaharness-0.1.0/src/metaharness/core/protocols.py +14 -0
  13. superagentic_metaharness-0.1.0/src/metaharness/experiment_config.py +136 -0
  14. superagentic_metaharness-0.1.0/src/metaharness/experiments.py +379 -0
  15. superagentic_metaharness-0.1.0/src/metaharness/integrations/__init__.py +1 -0
  16. superagentic_metaharness-0.1.0/src/metaharness/integrations/coding_tool/__init__.py +16 -0
  17. superagentic_metaharness-0.1.0/src/metaharness/integrations/coding_tool/config.py +84 -0
  18. superagentic_metaharness-0.1.0/src/metaharness/integrations/coding_tool/runtime.py +415 -0
  19. superagentic_metaharness-0.1.0/src/metaharness/models.py +133 -0
  20. superagentic_metaharness-0.1.0/src/metaharness/proposer/__init__.py +1 -0
  21. superagentic_metaharness-0.1.0/src/metaharness/proposer/base.py +15 -0
  22. superagentic_metaharness-0.1.0/src/metaharness/proposer/codex_exec.py +252 -0
  23. superagentic_metaharness-0.1.0/src/metaharness/proposer/fake.py +98 -0
  24. superagentic_metaharness-0.1.0/src/metaharness/proposer/gemini_cli.py +84 -0
  25. superagentic_metaharness-0.1.0/src/metaharness/proposer/instructions.py +121 -0
  26. superagentic_metaharness-0.1.0/src/metaharness/proposer/normalized_events.py +19 -0
  27. superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/__init__.py +1 -0
  28. superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/codex.py +144 -0
  29. superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/gemini.py +41 -0
  30. superagentic_metaharness-0.1.0/src/metaharness/proposer/parsers/pi.py +38 -0
  31. superagentic_metaharness-0.1.0/src/metaharness/reporting.py +516 -0
  32. superagentic_metaharness-0.1.0/src/metaharness/scaffold.py +483 -0
  33. superagentic_metaharness-0.1.0/src/metaharness/store/__init__.py +1 -0
  34. superagentic_metaharness-0.1.0/src/metaharness/store/filesystem.py +271 -0
  35. superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/PKG-INFO +355 -0
  36. superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/SOURCES.txt +52 -0
  37. superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/dependency_links.txt +1 -0
  38. superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/entry_points.txt +2 -0
  39. superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/requires.txt +4 -0
  40. superagentic_metaharness-0.1.0/src/superagentic_metaharness.egg-info/top_level.txt +1 -0
  41. superagentic_metaharness-0.1.0/tests/test_bootstrap.py +28 -0
  42. superagentic_metaharness-0.1.0/tests/test_candidate_outcomes.py +172 -0
  43. superagentic_metaharness-0.1.0/tests/test_cli.py +388 -0
  44. superagentic_metaharness-0.1.0/tests/test_codex_parser.py +30 -0
  45. superagentic_metaharness-0.1.0/tests/test_codex_timeout.py +67 -0
  46. superagentic_metaharness-0.1.0/tests/test_coding_tool_config.py +114 -0
  47. superagentic_metaharness-0.1.0/tests/test_engine_fake_backend.py +84 -0
  48. superagentic_metaharness-0.1.0/tests/test_experiments.py +131 -0
  49. superagentic_metaharness-0.1.0/tests/test_instructions.py +38 -0
  50. superagentic_metaharness-0.1.0/tests/test_live_codex_smoke.py +52 -0
  51. superagentic_metaharness-0.1.0/tests/test_python_cli_benchmark.py +36 -0
  52. superagentic_metaharness-0.1.0/tests/test_python_fixture_benchmark.py +40 -0
  53. superagentic_metaharness-0.1.0/tests/test_reporting.py +126 -0
  54. superagentic_metaharness-0.1.0/tests/test_ticket_router_example.py +36 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Shashi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,355 @@
1
+ Metadata-Version: 2.4
2
+ Name: superagentic-metaharness
3
+ Version: 0.1.0
4
+ Summary: Filesystem-first harness optimization for coding agents.
5
+ Author-email: Superagentic AI <hello@super-agentic.ai>, Shashi Jagtap <shashikant.jagtap@icloud.com>, Shashi Jagtap <shashi@super-agentic.ai>
6
+ Maintainer-email: Superagentic AI <hello@super-agentic.ai>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/SuperagenticAI/metaharness
9
+ Project-URL: Documentation, https://superagenticai.github.io/metaharness/
10
+ Project-URL: Repository, https://github.com/SuperagenticAI/metaharness
11
+ Project-URL: Issues, https://github.com/SuperagenticAI/metaharness/issues
12
+ Keywords: agents,benchmarking,codex,evaluation,harness,optimization
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Provides-Extra: dev
25
+ Requires-Dist: mkdocs<2,>=1.6; extra == "dev"
26
+ Requires-Dist: mkdocs-material<10,>=9.6; extra == "dev"
27
+ Dynamic: license-file
28
+
29
+ # metaharness
30
+
31
+ [![CI](https://github.com/SuperagenticAI/metaharness/actions/workflows/ci.yml/badge.svg)](https://github.com/SuperagenticAI/metaharness/actions/workflows/ci.yml)
32
+ [![Docs](https://github.com/SuperagenticAI/metaharness/actions/workflows/pages.yml/badge.svg)](https://github.com/SuperagenticAI/metaharness/actions/workflows/pages.yml)
33
+ [![Documentation](https://img.shields.io/badge/docs-GitHub%20Pages-111111?logo=github&logoColor=white)](https://superagenticai.github.io/metaharness/)
34
+ [![Python](https://img.shields.io/badge/python-3.11%2B-3776AB?logo=python&logoColor=white)](https://github.com/SuperagenticAI/metaharness/blob/main/pyproject.toml)
35
+ [![License](https://img.shields.io/github/license/SuperagenticAI/metaharness)](https://github.com/SuperagenticAI/metaharness/blob/main/LICENSE)
36
+ [![Status](https://img.shields.io/badge/status-alpha-F59E0B)](https://github.com/SuperagenticAI/metaharness)
37
+ [![Paper](https://img.shields.io/badge/paper-Meta%20Harness-B31B1B)](https://arxiv.org/pdf/2603.28052)
38
+
39
+ `metaharness` is an open-source Python library for optimizing executable harnesses around agentic coding systems.
40
+ It is inspired by the [Meta Harness paper](https://arxiv.org/pdf/2603.28052) and is an unofficial open-source implementation of the core ideas in that work.
41
+ The current implementation and benchmark evidence in this repository are centered on the Codex CLI path, including hosted Codex and Codex over local Ollama models.
42
+
43
+ It is built for teams who want to improve the code and files around an agent workflow, not just the prompt.
44
+ That includes instruction files, setup flows, validation scripts, test scripts, routing logic, and other executable support code.
45
+
46
+ ## Why `metaharness`
47
+
48
+ Many agent failures come from the harness around the model:
49
+
50
+ - weak repository instructions
51
+ - missing setup steps
52
+ - broken validation logic
53
+ - incomplete test flows
54
+ - poor iteration memory
55
+ - acceptance checks that do not match the real task
56
+
57
+ `metaharness` turns those artifacts into a repeatable optimization target with stored evidence for every proposal.
58
+ It also captures a compact environment snapshot before each proposal so agents do not waste early turns on basic workspace discovery.
59
+ Projects can also declare an allowed write scope so off-target edits are rejected automatically.
60
+
61
+ ## How It Works
62
+
63
+ `metaharness` runs an outer optimization loop around a harness:
64
+
65
+ 1. start from a baseline workspace
66
+ 2. ask a coding agent to improve it
67
+ 3. validate and evaluate the result
68
+ 4. keep the best candidate
69
+ 5. store all artifacts on disk
70
+
71
+ The result is a practical, inspectable workflow for improving real harnesses instead of ad hoc prompt tinkering.
72
+
73
+ ## Who It Is For
74
+
75
+ - developers building agentic coding systems who want to optimize harness code, workflow scripts, retrieval wrappers, routing, and evaluation flows
76
+ - practitioners using coding-agent tools who want to improve `AGENTS.md`, `GEMINI.md`, bootstrap scripts, validation scripts, and acceptance tests
77
+
78
+ ## Quickstart
79
+
80
+ Install the project:
81
+
82
+ ```bash
83
+ uv sync
84
+ ```
85
+
86
+ Run the fake backend on a real benchmark:
87
+
88
+ ```bash
89
+ uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name quickstart
90
+ ```
91
+
92
+ Inspect the run:
93
+
94
+ ```bash
95
+ uv run metaharness inspect examples/python_fixture_benchmark/runs/quickstart
96
+ ```
97
+
98
+ Export the candidate ledger:
99
+
100
+ ```bash
101
+ uv run metaharness ledger examples/python_fixture_benchmark/runs/quickstart --tsv
102
+ ```
103
+
104
+ Run a saved experiment matrix:
105
+
106
+ ```bash
107
+ uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
108
+ ```
109
+
110
+ ## Core Capabilities
111
+
112
+ - a minimal optimization engine
113
+ - a filesystem-backed run store
114
+ - automatic environment bootstrap snapshots for each proposal
115
+ - optional write-scope enforcement through `allowed_write_paths`
116
+ - a provider-neutral proposer backend interface
117
+ - a real `CodexExecBackend`
118
+ - a deterministic `FakeBackend`
119
+ - a coding-tool integration for instruction files and script-based harnesses
120
+ - explicit per-candidate outcomes: `keep`, `discard`, `crash`, `timeout`, `no-change`, and `scope-violation`
121
+ - reporting commands for `inspect`, `ledger`, `summarize`, and `compare`
122
+ - experiment-matrix execution with JSON and TSV outputs
123
+ - benchmark targets and experiment records
124
+
125
+ ## Current Status
126
+
127
+ The repository currently includes:
128
+
129
+ - two real coding-tool benchmark targets
130
+ - a smaller deterministic ticket-router example
131
+ - hosted Codex runs on the real benchmarks
132
+ - local Codex over Ollama runs with `gpt-oss:20b` and `gpt-oss:120b`
133
+ - a docs site published from GitHub Actions
134
+
135
+ Current documented experiments in this repository show:
136
+
137
+ - hosted Codex solves both real benchmarks in one proposal iteration
138
+ - local `gpt-oss:120b` solves `python_fixture_benchmark`
139
+ - local `gpt-oss:20b` is useful for smoke checks but timed out on the current real benchmark runs
140
+
141
+ Detailed experiment records:
142
+
143
+ - [Benchmark overview](BENCHMARKS.md)
144
+ - [Recorded benchmark results](BENCHMARK_RESULTS.md)
145
+ - [Experiment notes](docs/experiments.md)
146
+
147
+ ## Provider Status
148
+
149
+ - Codex is the main validated harness path in this repository today
150
+ - hosted Codex is the strongest current path for real runs
151
+ - local Codex over Ollama works and has been exercised with `gpt-oss:20b` and `gpt-oss:120b`
152
+ - Gemini exists as a scaffolded backend and is not yet at parity with Codex
153
+
154
+ All real provider results currently documented in this repository were produced through the Codex CLI path.
155
+ That includes both hosted Codex runs and local Ollama runs driven through Codex with `gpt-oss` models.
156
+ Other coding-agent evaluations in the wider ecosystem often emphasize Claude Code and Opus, but this repository's current benchmark evidence is Codex-first.
157
+
158
+ ## Documentation
159
+
160
+ - [Project documentation](https://superagenticai.github.io/metaharness/)
161
+ - [Getting started](https://superagenticai.github.io/metaharness/getting-started/)
162
+ - [Architecture](https://superagenticai.github.io/metaharness/architecture/)
163
+ - [Providers](https://superagenticai.github.io/metaharness/providers/)
164
+ - [Benchmarks](https://superagenticai.github.io/metaharness/benchmarks/)
165
+ - [CLI reference](https://superagenticai.github.io/metaharness/cli-reference/)
166
+ - [Experiments](https://superagenticai.github.io/metaharness/experiments/)
167
+
168
+ ## Installation
169
+
170
+ Project setup:
171
+
172
+ ```bash
173
+ uv sync
174
+ ```
175
+
176
+ If you want the docs toolchain too:
177
+
178
+ ```bash
179
+ uv sync --extra dev
180
+ ```
181
+
182
+ Check the CLI:
183
+
184
+ ```bash
185
+ uv run metaharness --help
186
+ ```
187
+
188
+ Editable install with `pip` also works:
189
+
190
+ ```bash
191
+ pip install -e .
192
+ ```
193
+
194
+ ## Hosted Codex
195
+
196
+ Requirements:
197
+
198
+ - `codex` CLI installed
199
+ - authenticated Codex session or API key
200
+ - outbound network access
201
+
202
+ Run a real benchmark with hosted Codex:
203
+
204
+ ```bash
205
+ uv run metaharness run examples/python_fixture_benchmark --backend codex --hosted --budget 1 --run-name hosted-codex
206
+ ```
207
+
208
+ Important:
209
+
210
+ - use `--hosted` when a project config defaults to local Ollama
211
+ - the library is ready for hosted Codex runs today
212
+
213
+ ## Local Codex Over Ollama
214
+
215
+ Probe the local setup:
216
+
217
+ ```bash
218
+ uv run metaharness smoke codex examples/python_fixture_benchmark --probe-only --oss --local-provider ollama --model gpt-oss:20b
219
+ ```
220
+
221
+ Run with `gpt-oss:20b`:
222
+
223
+ ```bash
224
+ uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:20b --proposal-timeout 240 --budget 1 --run-name ollama-20b
225
+ ```
226
+
227
+ Run with `gpt-oss:120b`:
228
+
229
+ ```bash
230
+ uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:120b --proposal-timeout 420 --budget 1 --run-name ollama-120b
231
+ ```
232
+
233
+ ## Benchmarks And Examples
234
+
235
+ Real benchmarks:
236
+
237
+ - [examples/python_fixture_benchmark](examples/python_fixture_benchmark)
238
+ - [examples/python_cli_benchmark](examples/python_cli_benchmark)
239
+
240
+ Smaller deterministic example:
241
+
242
+ - [examples/ticket_router](examples/ticket_router)
243
+
244
+ Run the ticket router example:
245
+
246
+ ```bash
247
+ uv run python examples/ticket_router/run.py --backend fake --budget 1
248
+ ```
249
+
250
+ ## Scaffold Your Own Project
251
+
252
+ Create a coding-tool project:
253
+
254
+ ```bash
255
+ uv run metaharness scaffold coding-tool ./my-coding-tool-optimizer
256
+ ```
257
+
258
+ Available profiles:
259
+
260
+ - `standard`
261
+ - `local-oss-smoke`
262
+ - `local-oss-medium`
263
+
264
+ Run the scaffold with the fake backend:
265
+
266
+ ```bash
267
+ uv run metaharness run ./my-coding-tool-optimizer --backend fake --budget 1
268
+ ```
269
+
270
+ ## CLI Overview
271
+
272
+ Create a scaffold:
273
+
274
+ ```bash
275
+ uv run metaharness scaffold coding-tool ./my-project
276
+ ```
277
+
278
+ Run a project:
279
+
280
+ ```bash
281
+ uv run metaharness run ./my-project --backend fake --budget 1
282
+ ```
283
+
284
+ Probe Codex:
285
+
286
+ ```bash
287
+ uv run metaharness smoke codex ./my-project --probe-only
288
+ ```
289
+
290
+ Inspect a run:
291
+
292
+ ```bash
293
+ uv run metaharness inspect ./my-project/runs/example
294
+ ```
295
+
296
+ Compare runs:
297
+
298
+ ```bash
299
+ uv run metaharness compare \
300
+ ./examples/python_fixture_benchmark/runs/hosted-codex-20260401 \
301
+ ./examples/python_fixture_benchmark/runs/ollama-20b-20260401 \
302
+ ./examples/python_fixture_benchmark/runs/ollama-120b-20260401
303
+ ```
304
+
305
+ Run an experiment matrix:
306
+
307
+ ```bash
308
+ uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
309
+ ```
310
+
311
+ ## Benefits Of The Filesystem Approach
312
+
313
+ Every run stores:
314
+
315
+ - prompts
316
+ - candidate workspaces
317
+ - validation results
318
+ - evaluation results
319
+ - proposal metadata
320
+ - workspace diffs
321
+ - per-candidate manifests
322
+
323
+ That makes the optimization history reviewable, debuggable, and reusable.
324
+
325
+ ## Development
326
+
327
+ Compile checks:
328
+
329
+ ```bash
330
+ uv run python -m compileall -q src tests examples docs
331
+ ```
332
+
333
+ Unit tests:
334
+
335
+ ```bash
336
+ uv run python -m unittest discover -s tests -v
337
+ ```
338
+
339
+ Docs build:
340
+
341
+ ```bash
342
+ uv run mkdocs build --strict
343
+ ```
344
+
345
+ Fake benchmark smoke runs:
346
+
347
+ ```bash
348
+ uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name ci-fixture-local
349
+ uv run metaharness run examples/python_cli_benchmark --backend fake --budget 1 --run-name ci-cli-local
350
+ uv run python examples/ticket_router/run.py --backend fake --budget 1
351
+ ```
352
+
353
+ ## License
354
+
355
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,327 @@
1
+ # metaharness
2
+
3
+ [![CI](https://github.com/SuperagenticAI/metaharness/actions/workflows/ci.yml/badge.svg)](https://github.com/SuperagenticAI/metaharness/actions/workflows/ci.yml)
4
+ [![Docs](https://github.com/SuperagenticAI/metaharness/actions/workflows/pages.yml/badge.svg)](https://github.com/SuperagenticAI/metaharness/actions/workflows/pages.yml)
5
+ [![Documentation](https://img.shields.io/badge/docs-GitHub%20Pages-111111?logo=github&logoColor=white)](https://superagenticai.github.io/metaharness/)
6
+ [![Python](https://img.shields.io/badge/python-3.11%2B-3776AB?logo=python&logoColor=white)](https://github.com/SuperagenticAI/metaharness/blob/main/pyproject.toml)
7
+ [![License](https://img.shields.io/github/license/SuperagenticAI/metaharness)](https://github.com/SuperagenticAI/metaharness/blob/main/LICENSE)
8
+ [![Status](https://img.shields.io/badge/status-alpha-F59E0B)](https://github.com/SuperagenticAI/metaharness)
9
+ [![Paper](https://img.shields.io/badge/paper-Meta%20Harness-B31B1B)](https://arxiv.org/pdf/2603.28052)
10
+
11
+ `metaharness` is an open-source Python library for optimizing executable harnesses around agentic coding systems.
12
+ It is inspired by the [Meta Harness paper](https://arxiv.org/pdf/2603.28052) and is an unofficial open-source implementation of the core ideas in that work.
13
+ The current implementation and benchmark evidence in this repository are centered on the Codex CLI path, including hosted Codex and Codex over local Ollama models.
14
+
15
+ It is built for teams who want to improve the code and files around an agent workflow, not just the prompt.
16
+ That includes instruction files, setup flows, validation scripts, test scripts, routing logic, and other executable support code.
17
+
18
+ ## Why `metaharness`
19
+
20
+ Many agent failures come from the harness around the model:
21
+
22
+ - weak repository instructions
23
+ - missing setup steps
24
+ - broken validation logic
25
+ - incomplete test flows
26
+ - poor iteration memory
27
+ - acceptance checks that do not match the real task
28
+
29
+ `metaharness` turns those artifacts into a repeatable optimization target with stored evidence for every proposal.
30
+ It also captures a compact environment snapshot before each proposal so agents do not waste early turns on basic workspace discovery.
31
+ Projects can also declare an allowed write scope so off-target edits are rejected automatically.
32
+
33
+ ## How It Works
34
+
35
+ `metaharness` runs an outer optimization loop around a harness:
36
+
37
+ 1. start from a baseline workspace
38
+ 2. ask a coding agent to improve it
39
+ 3. validate and evaluate the result
40
+ 4. keep the best candidate
41
+ 5. store all artifacts on disk
42
+
43
+ The result is a practical, inspectable workflow for improving real harnesses instead of ad hoc prompt tinkering.
44
+
45
+ ## Who It Is For
46
+
47
+ - developers building agentic coding systems who want to optimize harness code, workflow scripts, retrieval wrappers, routing, and evaluation flows
48
+ - practitioners using coding-agent tools who want to improve `AGENTS.md`, `GEMINI.md`, bootstrap scripts, validation scripts, and acceptance tests
49
+
50
+ ## Quickstart
51
+
52
+ Install the project:
53
+
54
+ ```bash
55
+ uv sync
56
+ ```
57
+
58
+ Run the fake backend on a real benchmark:
59
+
60
+ ```bash
61
+ uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name quickstart
62
+ ```
63
+
64
+ Inspect the run:
65
+
66
+ ```bash
67
+ uv run metaharness inspect examples/python_fixture_benchmark/runs/quickstart
68
+ ```
69
+
70
+ Export the candidate ledger:
71
+
72
+ ```bash
73
+ uv run metaharness ledger examples/python_fixture_benchmark/runs/quickstart --tsv
74
+ ```
75
+
76
+ Run a saved experiment matrix:
77
+
78
+ ```bash
79
+ uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
80
+ ```
81
+
82
+ ## Core Capabilities
83
+
84
+ - a minimal optimization engine
85
+ - a filesystem-backed run store
86
+ - automatic environment bootstrap snapshots for each proposal
87
+ - optional write-scope enforcement through `allowed_write_paths`
88
+ - a provider-neutral proposer backend interface
89
+ - a real `CodexExecBackend`
90
+ - a deterministic `FakeBackend`
91
+ - a coding-tool integration for instruction files and script-based harnesses
92
+ - explicit per-candidate outcomes: `keep`, `discard`, `crash`, `timeout`, `no-change`, and `scope-violation`
93
+ - reporting commands for `inspect`, `ledger`, `summarize`, and `compare`
94
+ - experiment-matrix execution with JSON and TSV outputs
95
+ - benchmark targets and experiment records
96
+
97
+ ## Current Status
98
+
99
+ The repository currently includes:
100
+
101
+ - two real coding-tool benchmark targets
102
+ - a smaller deterministic ticket-router example
103
+ - hosted Codex runs on the real benchmarks
104
+ - local Codex over Ollama runs with `gpt-oss:20b` and `gpt-oss:120b`
105
+ - a docs site published from GitHub Actions
106
+
107
+ Current documented experiments in this repository show:
108
+
109
+ - hosted Codex solves both real benchmarks in one proposal iteration
110
+ - local `gpt-oss:120b` solves `python_fixture_benchmark`
111
+ - local `gpt-oss:20b` is useful for smoke checks but timed out on the current real benchmark runs
112
+
113
+ Detailed experiment records:
114
+
115
+ - [Benchmark overview](BENCHMARKS.md)
116
+ - [Recorded benchmark results](BENCHMARK_RESULTS.md)
117
+ - [Experiment notes](docs/experiments.md)
118
+
119
+ ## Provider Status
120
+
121
+ - Codex is the main validated harness path in this repository today
122
+ - hosted Codex is the strongest current path for real runs
123
+ - local Codex over Ollama works and has been exercised with `gpt-oss:20b` and `gpt-oss:120b`
124
+ - Gemini exists as a scaffolded backend and is not yet at parity with Codex
125
+
126
+ All real provider results currently documented in this repository were produced through the Codex CLI path.
127
+ That includes both hosted Codex runs and local Ollama runs driven through Codex with `gpt-oss` models.
128
+ Other coding-agent evaluations in the wider ecosystem often emphasize Claude Code and Opus, but this repository's current benchmark evidence is Codex-first.
129
+
130
+ ## Documentation
131
+
132
+ - [Project documentation](https://superagenticai.github.io/metaharness/)
133
+ - [Getting started](https://superagenticai.github.io/metaharness/getting-started/)
134
+ - [Architecture](https://superagenticai.github.io/metaharness/architecture/)
135
+ - [Providers](https://superagenticai.github.io/metaharness/providers/)
136
+ - [Benchmarks](https://superagenticai.github.io/metaharness/benchmarks/)
137
+ - [CLI reference](https://superagenticai.github.io/metaharness/cli-reference/)
138
+ - [Experiments](https://superagenticai.github.io/metaharness/experiments/)
139
+
140
+ ## Installation
141
+
142
+ Project setup:
143
+
144
+ ```bash
145
+ uv sync
146
+ ```
147
+
148
+ If you want the docs toolchain too:
149
+
150
+ ```bash
151
+ uv sync --extra dev
152
+ ```
153
+
154
+ Check the CLI:
155
+
156
+ ```bash
157
+ uv run metaharness --help
158
+ ```
159
+
160
+ Editable install with `pip` also works:
161
+
162
+ ```bash
163
+ pip install -e .
164
+ ```
165
+
166
+ ## Hosted Codex
167
+
168
+ Requirements:
169
+
170
+ - `codex` CLI installed
171
+ - authenticated Codex session or API key
172
+ - outbound network access
173
+
174
+ Run a real benchmark with hosted Codex:
175
+
176
+ ```bash
177
+ uv run metaharness run examples/python_fixture_benchmark --backend codex --hosted --budget 1 --run-name hosted-codex
178
+ ```
179
+
180
+ Important:
181
+
182
+ - use `--hosted` when a project config defaults to local Ollama
183
+ - the library is ready for hosted Codex runs today
184
+
185
+ ## Local Codex Over Ollama
186
+
187
+ Probe the local setup:
188
+
189
+ ```bash
190
+ uv run metaharness smoke codex examples/python_fixture_benchmark --probe-only --oss --local-provider ollama --model gpt-oss:20b
191
+ ```
192
+
193
+ Run with `gpt-oss:20b`:
194
+
195
+ ```bash
196
+ uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:20b --proposal-timeout 240 --budget 1 --run-name ollama-20b
197
+ ```
198
+
199
+ Run with `gpt-oss:120b`:
200
+
201
+ ```bash
202
+ uv run metaharness run examples/python_fixture_benchmark --backend codex --oss --local-provider ollama --model gpt-oss:120b --proposal-timeout 420 --budget 1 --run-name ollama-120b
203
+ ```
204
+
205
+ ## Benchmarks And Examples
206
+
207
+ Real benchmarks:
208
+
209
+ - [examples/python_fixture_benchmark](examples/python_fixture_benchmark)
210
+ - [examples/python_cli_benchmark](examples/python_cli_benchmark)
211
+
212
+ Smaller deterministic example:
213
+
214
+ - [examples/ticket_router](examples/ticket_router)
215
+
216
+ Run the ticket router example:
217
+
218
+ ```bash
219
+ uv run python examples/ticket_router/run.py --backend fake --budget 1
220
+ ```
221
+
222
+ ## Scaffold Your Own Project
223
+
224
+ Create a coding-tool project:
225
+
226
+ ```bash
227
+ uv run metaharness scaffold coding-tool ./my-coding-tool-optimizer
228
+ ```
229
+
230
+ Available profiles:
231
+
232
+ - `standard`
233
+ - `local-oss-smoke`
234
+ - `local-oss-medium`
235
+
236
+ Run the scaffold with the fake backend:
237
+
238
+ ```bash
239
+ uv run metaharness run ./my-coding-tool-optimizer --backend fake --budget 1
240
+ ```
241
+
242
+ ## CLI Overview
243
+
244
+ Create a scaffold:
245
+
246
+ ```bash
247
+ uv run metaharness scaffold coding-tool ./my-project
248
+ ```
249
+
250
+ Run a project:
251
+
252
+ ```bash
253
+ uv run metaharness run ./my-project --backend fake --budget 1
254
+ ```
255
+
256
+ Probe Codex:
257
+
258
+ ```bash
259
+ uv run metaharness smoke codex ./my-project --probe-only
260
+ ```
261
+
262
+ Inspect a run:
263
+
264
+ ```bash
265
+ uv run metaharness inspect ./my-project/runs/example
266
+ ```
267
+
268
+ Compare runs:
269
+
270
+ ```bash
271
+ uv run metaharness compare \
272
+ ./examples/python_fixture_benchmark/runs/hosted-codex-20260401 \
273
+ ./examples/python_fixture_benchmark/runs/ollama-20b-20260401 \
274
+ ./examples/python_fixture_benchmark/runs/ollama-120b-20260401
275
+ ```
276
+
277
+ Run an experiment matrix:
278
+
279
+ ```bash
280
+ uv run metaharness experiment --config examples/experiment_configs/fake-benchmarks.json
281
+ ```
282
+
283
+ ## Benefits Of The Filesystem Approach
284
+
285
+ Every run stores:
286
+
287
+ - prompts
288
+ - candidate workspaces
289
+ - validation results
290
+ - evaluation results
291
+ - proposal metadata
292
+ - workspace diffs
293
+ - per-candidate manifests
294
+
295
+ That makes the optimization history reviewable, debuggable, and reusable.
296
+
297
+ ## Development
298
+
299
+ Compile checks:
300
+
301
+ ```bash
302
+ uv run python -m compileall -q src tests examples docs
303
+ ```
304
+
305
+ Unit tests:
306
+
307
+ ```bash
308
+ uv run python -m unittest discover -s tests -v
309
+ ```
310
+
311
+ Docs build:
312
+
313
+ ```bash
314
+ uv run mkdocs build --strict
315
+ ```
316
+
317
+ Fake benchmark smoke runs:
318
+
319
+ ```bash
320
+ uv run metaharness run examples/python_fixture_benchmark --backend fake --budget 1 --run-name ci-fixture-local
321
+ uv run metaharness run examples/python_cli_benchmark --backend fake --budget 1 --run-name ci-cli-local
322
+ uv run python examples/ticket_router/run.py --backend fake --budget 1
323
+ ```
324
+
325
+ ## License
326
+
327
+ MIT. See [LICENSE](LICENSE).