polyharness 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polyharness-0.2.0/src/polyharness.egg-info → polyharness-0.2.2}/PKG-INFO +135 -43
- {polyharness-0.2.0 → polyharness-0.2.2}/README.md +134 -42
- {polyharness-0.2.0 → polyharness-0.2.2}/pyproject.toml +1 -1
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/__init__.py +1 -1
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/cli.py +226 -17
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/config.py +82 -3
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/doctor.py +2 -0
- polyharness-0.2.2/src/polyharness/orchestrator.py +544 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/__init__.py +3 -0
- polyharness-0.2.2/src/polyharness/proposer/adapters/hermes.py +29 -0
- polyharness-0.2.2/src/polyharness/proposer/bandit.py +84 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/search_log.py +25 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/workspace.py +11 -0
- {polyharness-0.2.0 → polyharness-0.2.2/src/polyharness.egg-info}/PKG-INFO +135 -43
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/SOURCES.txt +3 -0
- polyharness-0.2.2/tests/test_bandit.py +66 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_cli_adapters.py +17 -1
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_cli_features.py +57 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_config.py +38 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_evolution.py +79 -0
- polyharness-0.2.2/tests/test_orchestrator.py +555 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_search_log.py +20 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_workspace.py +8 -0
- polyharness-0.2.0/src/polyharness/orchestrator.py +0 -287
- polyharness-0.2.0/tests/test_orchestrator.py +0 -229
- {polyharness-0.2.0 → polyharness-0.2.2}/LICENSE +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/setup.cfg +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/__main__.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/collector.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/evaluator/__init__.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/evaluator/evaluator.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/__init__.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/base.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/claude_code.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/claw_code.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/codex.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/opencode.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/api_proposer.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/base.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/cli_proposer.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/local_proposer.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/openai_proposer.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/__init__.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/api-calling/base_harness/harness.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/api-calling/evaluate.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/api-calling/tasks/test_cases.json +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/code-generation/base_harness/harness.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/code-generation/evaluate.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/code-generation/tasks/test_cases.json +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/math-word-problems/base_harness/harness.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/math-word-problems/evaluate.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/math-word-problems/tasks/test_cases.json +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/rag-qa/base_harness/harness.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/rag-qa/evaluate.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/rag-qa/tasks/test_cases.json +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/text-classification/base_harness/harness.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/text-classification/evaluate.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/text-classification/tasks/test_cases.json +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/utils/__init__.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/dependency_links.txt +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/entry_points.txt +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/requires.txt +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/top_level.txt +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_collector.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_compare.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_evaluator.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_example.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_export.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_log.py +0 -0
- {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: polyharness
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Automated harness optimization for AI agents — make your agent evolve.
|
|
5
5
|
Author: weijt606
|
|
6
6
|
License-Expression: MIT
|
|
@@ -48,7 +48,7 @@ Dynamic: license-file
|
|
|
48
48
|
|
|
49
49
|
[](LICENSE)
|
|
50
50
|
[](https://www.python.org/downloads/)
|
|
51
|
-
[]()
|
|
52
52
|
[](README_CN.md)
|
|
53
53
|
|
|
54
54
|
---
|
|
@@ -63,7 +63,7 @@ Your AI agent runs the same harness every time. Same prompts, same tool config,
|
|
|
63
63
|
| | |
|
|
64
64
|
|---|---|
|
|
65
65
|
| **Self-Evolution** | Iteratively searches over harness changes and keeps the full evaluation history in one workspace. |
|
|
66
|
-
| **
|
|
66
|
+
| **8 Agent Backends** | Claude Code · Claw Code · Codex · Hermes · OpenCode · API direct · OpenAI-compatible · Local — plug in any CLI agent. |
|
|
67
67
|
| **Full History** | Every iteration's code, scores, and traces preserved. The Meta-Harness paper reports that non-Markovian search outperforms blind retries. |
|
|
68
68
|
| **Search Tree** | Visualize the optimization path. Compare any two candidates with per-task diffs. |
|
|
69
69
|
| **One-Command Setup** | `ph init --base-harness ... --task-dir ...` — copies files, configures workspace, done. |
|
|
@@ -86,13 +86,19 @@ PolyHarness fills that gap. It's the open-source engine that makes Meta-Harness
|
|
|
86
86
|
> - Memory tools (like Supermemory) give agents persistent **memory** across conversations.
|
|
87
87
|
> - **PolyHarness gives agents persistent self-evolution** — you get a repeatable way to refine how they work over time.
|
|
88
88
|
|
|
89
|
+
### Part of a wave — specialized for harnesses
|
|
90
|
+
|
|
91
|
+
PolyHarness doesn't stand alone. A wave of open-source projects has shown that pairing LLMs with evolutionary search systematically improves code and prompts: [GEPA](https://github.com/gepa-ai/gepa) (reflective prompt evolution over a Pareto frontier), [ShinkaEvolve](https://github.com/SakanaAI/ShinkaEvolve) (sample-efficient program evolution), [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) (an open AlphaEvolve), and the [Darwin Gödel Machine](https://sakana.ai/dgm/) (open-ended self-improving agents).
|
|
92
|
+
|
|
93
|
+
Most of these evolve *general* programs or algorithms. PolyHarness is the member of this wave **specialized for agent harnesses** — the prompts, tool config, and orchestration *around* an existing agent — with a focus on **online evolution from real usage** (`ph wrap` → `ph evolve`). It borrows the strongest ideas from these projects and applies them to any CLI agent on your own tasks: Pareto-frontier parent selection (GEPA), code-novelty rejection and an adaptive backend ensemble (ShinkaEvolve), and cascade evaluation (AlphaEvolve/OpenEvolve).
|
|
94
|
+
|
|
89
95
|
## What PolyHarness Is
|
|
90
96
|
|
|
91
97
|
PolyHarness is the open-source engine for iteratively searching over an agent's harness.
|
|
92
98
|
|
|
93
99
|
It builds on ideas from the Meta-Harness paper and the TBench2 results reported there, while focusing this repository on the optimization workflow itself — how harness variants are proposed, evaluated, and revised over repeated runs.
|
|
94
100
|
|
|
95
|
-
If tools like ForgeCode help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
|
|
101
|
+
If tools like [ForgeCode](https://github.com/antinomyhq/forgecode) help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
|
|
96
102
|
|
|
97
103
|
---
|
|
98
104
|
|
|
@@ -262,7 +268,7 @@ PolyHarness automatically sandboxes your agent inside this workspace, ensuring i
|
|
|
262
268
|
|
|
263
269
|
| Scenario | How to configure |
|
|
264
270
|
|----------|------------------|
|
|
265
|
-
| **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, opencode)* |
|
|
271
|
+
| **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, hermes, opencode)* |
|
|
266
272
|
| **Anthropic API** | Run `ph init --agent api`. Set `export ANTHROPIC_API_KEY="sk-ant-..."` before `ph run`. |
|
|
267
273
|
| **OpenAI / Local Models** | Run `ph init --agent openai`. Then configure the endpoint — see [Local Model Setup](#local-model-setup) below. |
|
|
268
274
|
| **Custom CLI path** | If your CLI agent uses a non-standard command, edit `config.yaml` in the workspace before running:<br>`proposer: { cli_path: "npx @anthropic-ai/claude-code" }`|
|
|
@@ -275,6 +281,34 @@ ph run
|
|
|
275
281
|
|
|
276
282
|
The orchestrator: copies your harness → asks the Proposer agent for a candidate change → evaluates the result → stores everything → repeats.
|
|
277
283
|
|
|
284
|
+
```
|
|
285
|
+
┌──────────────────────────────────────────────────────────────┐
|
|
286
|
+
│ │
|
|
287
|
+
│ You PolyHarness │
|
|
288
|
+
│ │ │ │
|
|
289
|
+
│ ├── ph init ──────────────────→│ Creates workspace │
|
|
290
|
+
│ │ (harness + tasks + eval) │ Copies files │
|
|
291
|
+
│ │ │ Injects CLAUDE.md │
|
|
292
|
+
│ │ │ │
|
|
293
|
+
│ ├── ph run ───────────────────→│ Starts search loop: │
|
|
294
|
+
│ │ │ │
|
|
295
|
+
│ │ ┌──────────────────────────┤ │
|
|
296
|
+
│ │ │ Step 1: SELECT parent │ Best or Tournament │
|
|
297
|
+
│ │ │ Step 2: COPY harness │ From parent → candidate │
|
|
298
|
+
│ │ │ Step 3: PROPOSE changes │ Agent reads all history │
|
|
299
|
+
│ │ │ Step 4: EVALUATE │ Run tasks, get scores │
|
|
300
|
+
│ │ │ Step 5: STORE results │ Code + scores + traces │
|
|
301
|
+
│ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
|
|
302
|
+
│ │ └──────────┬───────────────┤ │
|
|
303
|
+
│ │ └── loop ───────┘ │
|
|
304
|
+
│ │ │ │
|
|
305
|
+
│ ├── ph log ───────────────────→│ Shows search tree │
|
|
306
|
+
│ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
|
|
307
|
+
│ └── ph apply ─────────────────→│ Writes best back │
|
|
308
|
+
│ │
|
|
309
|
+
└──────────────────────────────────────────────────────────────┘
|
|
310
|
+
```
|
|
311
|
+
|
|
278
312
|
### 5. Inspect and apply
|
|
279
313
|
|
|
280
314
|
```bash
|
|
@@ -303,6 +337,7 @@ Just add `ph wrap --auto-evolve` in front of your agent command (pick the one ma
|
|
|
303
337
|
ph wrap --auto-evolve claude -p "Refactor the auth module to use JWT" # Claude Code
|
|
304
338
|
ph wrap --auto-evolve claw -p "Write integration tests for payments" # Claw Code
|
|
305
339
|
ph wrap --auto-evolve codex "Add retry logic to the API client" # Codex
|
|
340
|
+
ph wrap --auto-evolve hermes chat -q "Refactor the DB connection pool" # Hermes Agent
|
|
306
341
|
ph wrap --auto-evolve opencode -p "Fix the flaky parser test" # OpenCode
|
|
307
342
|
|
|
308
343
|
# Local models — wrap the CLI command directly
|
|
@@ -358,7 +393,67 @@ ph evolve # trigger evolution manually
|
|
|
358
393
|
|
|
359
394
|
> **Tip:** Use `--no-record-output` if you don't want stdout/stderr saved (e.g., for sensitive output). Metadata is always recorded.
|
|
360
395
|
|
|
361
|
-
|
|
396
|
+
#### Zero-config auto-wrap: `ph shell-hook`
|
|
397
|
+
|
|
398
|
+
Don't want to type `ph wrap --auto-evolve` every time? Install a shell hook — it auto-intercepts agent commands:
|
|
399
|
+
|
|
400
|
+
```bash
|
|
401
|
+
ph shell-hook install # one-time setup, writes to ~/.zshrc
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
After that, just use your agent as usual:
|
|
405
|
+
|
|
406
|
+
```bash
|
|
407
|
+
claude -p "Refactor auth to JWT" # automatically becomes: ph wrap --auto-evolve claude -p ...
|
|
408
|
+
claw -p "Write payment tests" # same — auto-wrapped
|
|
409
|
+
codex "Add retry logic" # same
|
|
410
|
+
hermes chat -q "Refactor pool" # same
|
|
411
|
+
opencode -p "Fix flaky test" # same
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
How it works: a `preexec` hook in your shell detects `claude`/`claw`/`codex`/`hermes`/`opencode` commands and transparently redirects them through `ph wrap --auto-evolve`. Your output is unchanged.
|
|
415
|
+
|
|
416
|
+
```bash
|
|
417
|
+
ph shell-hook status # check if installed
|
|
418
|
+
ph shell-hook uninstall # remove cleanly (restores original rc file)
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
#### Auto-Evolution flow
|
|
422
|
+
|
|
423
|
+
```
|
|
424
|
+
┌──────────────────────────────────────────────────────────────┐
|
|
425
|
+
│ │
|
|
426
|
+
│ You PolyHarness │
|
|
427
|
+
│ │ │ │
|
|
428
|
+
│ ├── ph shell-hook install ────→ │ Injects preexec hook │
|
|
429
|
+
│ │ (one-time setup) │ into ~/.zshrc │
|
|
430
|
+
│ │ │ │
|
|
431
|
+
│ ├── claude -p "Fix bug" ──────→ │ Shell hook intercepts │
|
|
432
|
+
│ │ (normal usage) │ │
|
|
433
|
+
│ │ ├── Run agent │
|
|
434
|
+
│ │ ┌─ output passes through ──┤ │
|
|
435
|
+
│ │ │ ├── Record trace │
|
|
436
|
+
│ │ │ │ (~/.polyharness/ │
|
|
437
|
+
│ │ │ │ traces/) │
|
|
438
|
+
│ │ │ │ │
|
|
439
|
+
│ │ │ ├── Check threshold │
|
|
440
|
+
│ │ │ │ traces < 50? │
|
|
441
|
+
│ │ │ │ ├─ Yes: "7/50 traces" │
|
|
442
|
+
│ │ │ │ └─ No: trigger ───┐ │
|
|
443
|
+
│ │ │ │ │ │
|
|
444
|
+
│ │ │ │ ┌─────────────────┘ │
|
|
445
|
+
│ │ │ │ │ Evolution cycle │
|
|
446
|
+
│ │ │ │ │ (same as ph run) │
|
|
447
|
+
│ │ │ │ │ Propose → Evaluate │
|
|
448
|
+
│ │ │ │ │ → Store → Repeat │
|
|
449
|
+
│ │ │ │ └────────────────── │
|
|
450
|
+
│ │ │ │ │
|
|
451
|
+
│ └───┘ │ │
|
|
452
|
+
│ │
|
|
453
|
+
└──────────────────────────────────────────────────────────────┘
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
The key difference: **you never run `ph run` manually.** You use your agent as always; PolyHarness silently collects data and triggers evolution when it has enough signal.
|
|
362
457
|
|
|
363
458
|
### Try it now (no API key needed)
|
|
364
459
|
|
|
@@ -380,35 +475,7 @@ The score path above is the current measured result of the bundled `math-word-pr
|
|
|
380
475
|
|
|
381
476
|
## How It Works
|
|
382
477
|
|
|
383
|
-
PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes
|
|
384
|
-
|
|
385
|
-
```
|
|
386
|
-
┌──────────────────────────────────────────────────────────────┐
|
|
387
|
-
│ │
|
|
388
|
-
│ You PolyHarness │
|
|
389
|
-
│ │ │ │
|
|
390
|
-
│ ├── ph init ──────────────────→│ Creates workspace │
|
|
391
|
-
│ │ (harness + tasks + eval) │ Copies files │
|
|
392
|
-
│ │ │ Injects CLAUDE.md │
|
|
393
|
-
│ │ │ │
|
|
394
|
-
│ ├── ph run ───────────────────→│ Starts search loop: │
|
|
395
|
-
│ │ │ │
|
|
396
|
-
│ │ ┌──────────────────────────┤ │
|
|
397
|
-
│ │ │ Step 1: SELECT parent │ Best or Tournament │
|
|
398
|
-
│ │ │ Step 2: COPY harness │ From parent → candidate │
|
|
399
|
-
│ │ │ Step 3: PROPOSE changes │ Agent reads all history │
|
|
400
|
-
│ │ │ Step 4: EVALUATE │ Run tasks, get scores │
|
|
401
|
-
│ │ │ Step 5: STORE results │ Code + scores + traces │
|
|
402
|
-
│ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
|
|
403
|
-
│ │ └──────────┬───────────────┤ │
|
|
404
|
-
│ │ └── loop ───────┘ │
|
|
405
|
-
│ │ │ │
|
|
406
|
-
│ ├── ph log ───────────────────→│ Shows search tree │
|
|
407
|
-
│ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
|
|
408
|
-
│ └── ph apply ─────────────────→│ Writes best back │
|
|
409
|
-
│ │
|
|
410
|
-
└──────────────────────────────────────────────────────────────┘
|
|
411
|
-
```
|
|
478
|
+
PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes. See the detailed flow diagrams above in [Step 4](#4-run-the-optimization-loop) and [Step 6](#6-auto-evolution).
|
|
412
479
|
|
|
413
480
|
### Why it works: non-Markovian search
|
|
414
481
|
|
|
@@ -433,12 +500,23 @@ The Proposer reads **all of this** before generating the next candidate. It can
|
|
|
433
500
|
| `claude-code` | `claude -p` | Official Claude Code CLI (Pro/Teams subscription) |
|
|
434
501
|
| `claw-code` | `claw -p` | Open-source Claw Code CLI |
|
|
435
502
|
| `codex` | `codex --quiet` | OpenAI Codex CLI |
|
|
503
|
+
| `hermes` | `hermes chat -q` | Nous Research [Hermes Agent](https://github.com/NousResearch/hermes-agent) CLI |
|
|
436
504
|
| `opencode` | `opencode -p` | OpenCode CLI |
|
|
437
505
|
| `local` | — | Offline rule-based engine for development & testing |
|
|
438
506
|
|
|
439
507
|
`ph doctor` auto-detects all available backends and shows their status.
|
|
440
508
|
|
|
441
|
-
When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `OPENCODE.md` — each agent's native instruction format.
|
|
509
|
+
When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `AGENTS.md` (Hermes), `OPENCODE.md` — each agent's native instruction format.
|
|
510
|
+
|
|
511
|
+
#### Backend ensemble (adaptive selection)
|
|
512
|
+
|
|
513
|
+
Don't know which backend writes the best harness changes for your task? Let PolyHarness find out. Pass several and it picks one per iteration with a **UCB bandit**, shifting picks toward whichever backend actually produces *improving* candidates:
|
|
514
|
+
|
|
515
|
+
```bash
|
|
516
|
+
ph run --ensemble "claude-code,codex,local"
|
|
517
|
+
```
|
|
518
|
+
|
|
519
|
+
At the end of the run you get a per-backend breakdown (picks + improve-rate). Selection is deterministic given the reward sequence, so runs stay reproducible. Inspired by ShinkaEvolve's adaptive LLM-ensemble selection.
|
|
442
520
|
|
|
443
521
|
### Local Model Setup
|
|
444
522
|
|
|
@@ -488,10 +566,16 @@ After `ph init`, the workspace has a `config.yaml` with these sections:
|
|
|
488
566
|
search:
|
|
489
567
|
max_iterations: 20 # Maximum search iterations
|
|
490
568
|
early_stop_patience: 5 # Stop after N iterations with no improvement
|
|
491
|
-
parent_selection: best # Strategy: best | tournament | all
|
|
569
|
+
parent_selection: best # Strategy: best | tournament | all | pareto
|
|
570
|
+
novelty_filter: false # Reject near-duplicate candidates before eval (saves budget)
|
|
571
|
+
novelty_threshold: 0.97 # Similarity ratio above which a candidate is a near-duplicate
|
|
572
|
+
novelty_max_retries: 1 # Regenerate a near-duplicate this many times before skipping
|
|
573
|
+
seed: null # RNG seed — set an int to make randomized runs reproducible
|
|
492
574
|
|
|
493
575
|
proposer:
|
|
494
|
-
backend: api # api | openai | claude-code | claw-code | codex | opencode | local
|
|
576
|
+
backend: api # api | openai | claude-code | claw-code | codex | hermes | opencode | local
|
|
577
|
+
ensemble: [] # If non-empty, pick among these backends per iteration via a UCB bandit
|
|
578
|
+
bandit_c: 1.41421356 # UCB exploration constant (higher = more exploration)
|
|
495
579
|
model: claude-sonnet-4-20250514 # Model name (for api/openai backends)
|
|
496
580
|
base_url: null # Custom API endpoint (for openai backend)
|
|
497
581
|
api_key: null # API key override (null = use env var)
|
|
@@ -503,6 +587,9 @@ evaluator:
|
|
|
503
587
|
type: python # python | docker | custom
|
|
504
588
|
entry: evaluate.py # Evaluator script entrypoint
|
|
505
589
|
timeout: 300 # Per-task timeout in seconds
|
|
590
|
+
cascade: false # Stage cheap subset first; skip rest if it fails the gate (per-task mode)
|
|
591
|
+
cascade_threshold: 0.4 # Min stage-1 mean score required to run the full task set
|
|
592
|
+
cascade_stage1: 0 # Tasks in stage 1 (0 = auto, ~1/3 of the list)
|
|
506
593
|
|
|
507
594
|
harness:
|
|
508
595
|
language: python # Harness code language
|
|
@@ -570,11 +657,11 @@ python -m polyharness --version
|
|
|
570
657
|
| `ph init` | Initialize workspace with auto-copy of harness, tasks, eval script |
|
|
571
658
|
| `ph run` | Start the optimization search loop |
|
|
572
659
|
| `ph status` | Progress table with elapsed time, improvement rate, and delta |
|
|
573
|
-
| `ph log` | Search tree with delta (Δ) column (or `--flat` for table) |
|
|
660
|
+
| `ph log` | Search tree with delta (Δ) column and Pareto-frontier (◆) markers (or `--flat` for table) |
|
|
574
661
|
| `ph best` | Show best candidate: score, per-task breakdown, changes summary |
|
|
575
662
|
| `ph compare A B` | Compare two iterations: score deltas + unified code diff |
|
|
576
663
|
| `ph diff <N>` | Shorthand for `compare 0 <N>` |
|
|
577
|
-
| `ph leaderboard` | Ranked table of all candidates (`--top N`, `--tasks` drilldown) |
|
|
664
|
+
| `ph leaderboard` | Ranked table of all candidates with Pareto (◆) and backend columns (`--top N`, `--tasks` drilldown) |
|
|
578
665
|
| `ph trace <N>` | View stdout, stderr, metrics, exit code for an iteration |
|
|
579
666
|
| `ph report` | Generate a full markdown report with score trends and per-task table |
|
|
580
667
|
| `ph apply` | Copy best harness back to `base_harness/` (or `--target` dir) |
|
|
@@ -588,6 +675,9 @@ python -m polyharness --version
|
|
|
588
675
|
| `ph traces stats` | Summary statistics: total traces, scored count, agent distribution |
|
|
589
676
|
| `ph traces clear` | Remove collected traces (`--keep N` to retain newest, `-y` to skip confirm) |
|
|
590
677
|
| `ph evolve` | Trigger an online evolution cycle using collected traces as context |
|
|
678
|
+
| `ph shell-hook install` | Install shell hook to auto-wrap agent commands (claude, claw, codex, hermes, opencode) |
|
|
679
|
+
| `ph shell-hook uninstall` | Remove the shell hook from your rc file |
|
|
680
|
+
| `ph shell-hook status` | Check if the shell hook is installed |
|
|
591
681
|
| `ph upgrade` | Upgrade PolyHarness to the latest version |
|
|
592
682
|
| `ph uninstall` | Uninstall PolyHarness from the current environment (`-y` to skip confirm) |
|
|
593
683
|
|
|
@@ -615,7 +705,8 @@ python -m polyharness --version
|
|
|
615
705
|
--dry-run Only evaluate the base harness, skip search
|
|
616
706
|
--resume Continue an interrupted search from where it left off
|
|
617
707
|
--backend <name> Override proposer backend without editing config
|
|
618
|
-
--strategy <name> Override parent selection: best | tournament | all
|
|
708
|
+
--strategy <name> Override parent selection: best | tournament | all | pareto
|
|
709
|
+
--ensemble b1,b2,... Pick among multiple backends per iteration via a UCB bandit
|
|
619
710
|
```
|
|
620
711
|
|
|
621
712
|
### `ph wrap` options
|
|
@@ -697,7 +788,7 @@ ph run --max-iterations 5
|
|
|
697
788
|
```
|
|
698
789
|
polyharness/
|
|
699
790
|
├── src/polyharness/
|
|
700
|
-
│ ├── cli.py # Click CLI —
|
|
791
|
+
│ ├── cli.py # Click CLI — 25 commands/subcommands
|
|
701
792
|
│ ├── config.py # Pydantic config models (+ EvolutionConfig)
|
|
702
793
|
│ ├── collector.py # Trace collector for online evolution
|
|
703
794
|
│ ├── orchestrator.py # Meta-Harness search loop + progress bar + error recovery
|
|
@@ -715,6 +806,7 @@ polyharness/
|
|
|
715
806
|
│ │ ├── claude_code.py # claude -p
|
|
716
807
|
│ │ ├── claw_code.py # claw -p
|
|
717
808
|
│ │ ├── codex.py # codex --quiet --auto-edit
|
|
809
|
+
│ │ ├── hermes.py # hermes chat -q
|
|
718
810
|
│ │ └── opencode.py # opencode -p
|
|
719
811
|
│ └── templates/ # 5 built-in task templates
|
|
720
812
|
│ ├── text-classification/
|
|
@@ -722,7 +814,7 @@ polyharness/
|
|
|
722
814
|
│ ├── code-generation/
|
|
723
815
|
│ ├── rag-qa/
|
|
724
816
|
│ └── api-calling/
|
|
725
|
-
├── tests/ #
|
|
817
|
+
├── tests/ # 173 tests (pytest)
|
|
726
818
|
├── bin/ # npm wrapper (ph.mjs, postinstall.mjs)
|
|
727
819
|
├── docs/
|
|
728
820
|
│ ├── development/ # Product roadmap & technical architecture
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
|
|
16
16
|
[](LICENSE)
|
|
17
17
|
[](https://www.python.org/downloads/)
|
|
18
|
-
[]()
|
|
19
19
|
[](README_CN.md)
|
|
20
20
|
|
|
21
21
|
---
|
|
@@ -30,7 +30,7 @@ Your AI agent runs the same harness every time. Same prompts, same tool config,
|
|
|
30
30
|
| | |
|
|
31
31
|
|---|---|
|
|
32
32
|
| **Self-Evolution** | Iteratively searches over harness changes and keeps the full evaluation history in one workspace. |
|
|
33
|
-
| **
|
|
33
|
+
| **8 Agent Backends** | Claude Code · Claw Code · Codex · Hermes · OpenCode · API direct · OpenAI-compatible · Local — plug in any CLI agent. |
|
|
34
34
|
| **Full History** | Every iteration's code, scores, and traces preserved. The Meta-Harness paper reports that non-Markovian search outperforms blind retries. |
|
|
35
35
|
| **Search Tree** | Visualize the optimization path. Compare any two candidates with per-task diffs. |
|
|
36
36
|
| **One-Command Setup** | `ph init --base-harness ... --task-dir ...` — copies files, configures workspace, done. |
|
|
@@ -53,13 +53,19 @@ PolyHarness fills that gap. It's the open-source engine that makes Meta-Harness
|
|
|
53
53
|
> - Memory tools (like Supermemory) give agents persistent **memory** across conversations.
|
|
54
54
|
> - **PolyHarness gives agents persistent self-evolution** — you get a repeatable way to refine how they work over time.
|
|
55
55
|
|
|
56
|
+
### Part of a wave — specialized for harnesses
|
|
57
|
+
|
|
58
|
+
PolyHarness doesn't stand alone. A wave of open-source projects has shown that pairing LLMs with evolutionary search systematically improves code and prompts: [GEPA](https://github.com/gepa-ai/gepa) (reflective prompt evolution over a Pareto frontier), [ShinkaEvolve](https://github.com/SakanaAI/ShinkaEvolve) (sample-efficient program evolution), [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) (an open AlphaEvolve), and the [Darwin Gödel Machine](https://sakana.ai/dgm/) (open-ended self-improving agents).
|
|
59
|
+
|
|
60
|
+
Most of these evolve *general* programs or algorithms. PolyHarness is the member of this wave **specialized for agent harnesses** — the prompts, tool config, and orchestration *around* an existing agent — with a focus on **online evolution from real usage** (`ph wrap` → `ph evolve`). It borrows the strongest ideas from these projects and applies them to any CLI agent on your own tasks: Pareto-frontier parent selection (GEPA), code-novelty rejection and an adaptive backend ensemble (ShinkaEvolve), and cascade evaluation (AlphaEvolve/OpenEvolve).
|
|
61
|
+
|
|
56
62
|
## What PolyHarness Is
|
|
57
63
|
|
|
58
64
|
PolyHarness is the open-source engine for iteratively searching over an agent's harness.
|
|
59
65
|
|
|
60
66
|
It builds on ideas from the Meta-Harness paper and the TBench2 results reported there, while focusing this repository on the optimization workflow itself — how harness variants are proposed, evaluated, and revised over repeated runs.
|
|
61
67
|
|
|
62
|
-
If tools like ForgeCode help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
|
|
68
|
+
If tools like [ForgeCode](https://github.com/antinomyhq/forgecode) help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
|
|
63
69
|
|
|
64
70
|
---
|
|
65
71
|
|
|
@@ -229,7 +235,7 @@ PolyHarness automatically sandboxes your agent inside this workspace, ensuring i
|
|
|
229
235
|
|
|
230
236
|
| Scenario | How to configure |
|
|
231
237
|
|----------|------------------|
|
|
232
|
-
| **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, opencode)* |
|
|
238
|
+
| **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, hermes, opencode)* |
|
|
233
239
|
| **Anthropic API** | Run `ph init --agent api`. Set `export ANTHROPIC_API_KEY="sk-ant-..."` before `ph run`. |
|
|
234
240
|
| **OpenAI / Local Models** | Run `ph init --agent openai`. Then configure the endpoint — see [Local Model Setup](#local-model-setup) below. |
|
|
235
241
|
| **Custom CLI path** | If your CLI agent uses a non-standard command, edit `config.yaml` in the workspace before running:<br>`proposer: { cli_path: "npx @anthropic-ai/claude-code" }`|
|
|
@@ -242,6 +248,34 @@ ph run
|
|
|
242
248
|
|
|
243
249
|
The orchestrator: copies your harness → asks the Proposer agent for a candidate change → evaluates the result → stores everything → repeats.
|
|
244
250
|
|
|
251
|
+
```
|
|
252
|
+
┌──────────────────────────────────────────────────────────────┐
|
|
253
|
+
│ │
|
|
254
|
+
│ You PolyHarness │
|
|
255
|
+
│ │ │ │
|
|
256
|
+
│ ├── ph init ──────────────────→│ Creates workspace │
|
|
257
|
+
│ │ (harness + tasks + eval) │ Copies files │
|
|
258
|
+
│ │ │ Injects CLAUDE.md │
|
|
259
|
+
│ │ │ │
|
|
260
|
+
│ ├── ph run ───────────────────→│ Starts search loop: │
|
|
261
|
+
│ │ │ │
|
|
262
|
+
│ │ ┌──────────────────────────┤ │
|
|
263
|
+
│ │ │ Step 1: SELECT parent │ Best or Tournament │
|
|
264
|
+
│ │ │ Step 2: COPY harness │ From parent → candidate │
|
|
265
|
+
│ │ │ Step 3: PROPOSE changes │ Agent reads all history │
|
|
266
|
+
│ │ │ Step 4: EVALUATE │ Run tasks, get scores │
|
|
267
|
+
│ │ │ Step 5: STORE results │ Code + scores + traces │
|
|
268
|
+
│ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
|
|
269
|
+
│ │ └──────────┬───────────────┤ │
|
|
270
|
+
│ │ └── loop ───────┘ │
|
|
271
|
+
│ │ │ │
|
|
272
|
+
│ ├── ph log ───────────────────→│ Shows search tree │
|
|
273
|
+
│ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
|
|
274
|
+
│ └── ph apply ─────────────────→│ Writes best back │
|
|
275
|
+
│ │
|
|
276
|
+
└──────────────────────────────────────────────────────────────┘
|
|
277
|
+
```
|
|
278
|
+
|
|
245
279
|
### 5. Inspect and apply
|
|
246
280
|
|
|
247
281
|
```bash
|
|
@@ -270,6 +304,7 @@ Just add `ph wrap --auto-evolve` in front of your agent command (pick the one ma
|
|
|
270
304
|
ph wrap --auto-evolve claude -p "Refactor the auth module to use JWT" # Claude Code
|
|
271
305
|
ph wrap --auto-evolve claw -p "Write integration tests for payments" # Claw Code
|
|
272
306
|
ph wrap --auto-evolve codex "Add retry logic to the API client" # Codex
|
|
307
|
+
ph wrap --auto-evolve hermes chat -q "Refactor the DB connection pool" # Hermes Agent
|
|
273
308
|
ph wrap --auto-evolve opencode -p "Fix the flaky parser test" # OpenCode
|
|
274
309
|
|
|
275
310
|
# Local models — wrap the CLI command directly
|
|
@@ -325,7 +360,67 @@ ph evolve # trigger evolution manually
|
|
|
325
360
|
|
|
326
361
|
> **Tip:** Use `--no-record-output` if you don't want stdout/stderr saved (e.g., for sensitive output). Metadata is always recorded.
|
|
327
362
|
|
|
328
|
-
|
|
363
|
+
#### Zero-config auto-wrap: `ph shell-hook`
|
|
364
|
+
|
|
365
|
+
Don't want to type `ph wrap --auto-evolve` every time? Install a shell hook — it auto-intercepts agent commands:
|
|
366
|
+
|
|
367
|
+
```bash
|
|
368
|
+
ph shell-hook install # one-time setup, writes to ~/.zshrc
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
After that, just use your agent as usual:
|
|
372
|
+
|
|
373
|
+
```bash
|
|
374
|
+
claude -p "Refactor auth to JWT" # automatically becomes: ph wrap --auto-evolve claude -p ...
|
|
375
|
+
claw -p "Write payment tests" # same — auto-wrapped
|
|
376
|
+
codex "Add retry logic" # same
|
|
377
|
+
hermes chat -q "Refactor pool" # same
|
|
378
|
+
opencode -p "Fix flaky test" # same
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
How it works: a `preexec` hook in your shell detects `claude`/`claw`/`codex`/`hermes`/`opencode` commands and transparently redirects them through `ph wrap --auto-evolve`. Your output is unchanged.
|
|
382
|
+
|
|
383
|
+
```bash
|
|
384
|
+
ph shell-hook status # check if installed
|
|
385
|
+
ph shell-hook uninstall # remove cleanly (restores original rc file)
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
#### Auto-Evolution flow
|
|
389
|
+
|
|
390
|
+
```
|
|
391
|
+
┌──────────────────────────────────────────────────────────────┐
|
|
392
|
+
│ │
|
|
393
|
+
│ You PolyHarness │
|
|
394
|
+
│ │ │ │
|
|
395
|
+
│ ├── ph shell-hook install ────→ │ Injects preexec hook │
|
|
396
|
+
│ │ (one-time setup) │ into ~/.zshrc │
|
|
397
|
+
│ │ │ │
|
|
398
|
+
│ ├── claude -p "Fix bug" ──────→ │ Shell hook intercepts │
|
|
399
|
+
│ │ (normal usage) │ │
|
|
400
|
+
│ │ ├── Run agent │
|
|
401
|
+
│ │ ┌─ output passes through ──┤ │
|
|
402
|
+
│ │ │ ├── Record trace │
|
|
403
|
+
│ │ │ │ (~/.polyharness/ │
|
|
404
|
+
│ │ │ │ traces/) │
|
|
405
|
+
│ │ │ │ │
|
|
406
|
+
│ │ │ ├── Check threshold │
|
|
407
|
+
│ │ │ │ traces < 50? │
|
|
408
|
+
│ │ │ │ ├─ Yes: "7/50 traces" │
|
|
409
|
+
│ │ │ │ └─ No: trigger ───┐ │
|
|
410
|
+
│ │ │ │ │ │
|
|
411
|
+
│ │ │ │ ┌─────────────────┘ │
|
|
412
|
+
│ │ │ │ │ Evolution cycle │
|
|
413
|
+
│ │ │ │ │ (same as ph run) │
|
|
414
|
+
│ │ │ │ │ Propose → Evaluate │
|
|
415
|
+
│ │ │ │ │ → Store → Repeat │
|
|
416
|
+
│ │ │ │ └────────────────── │
|
|
417
|
+
│ │ │ │ │
|
|
418
|
+
│ └───┘ │ │
|
|
419
|
+
│ │
|
|
420
|
+
└──────────────────────────────────────────────────────────────┘
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
The key difference: **you never run `ph run` manually.** You use your agent as always; PolyHarness silently collects data and triggers evolution when it has enough signal.
|
|
329
424
|
|
|
330
425
|
### Try it now (no API key needed)
|
|
331
426
|
|
|
@@ -347,35 +442,7 @@ The score path above is the current measured result of the bundled `math-word-pr
|
|
|
347
442
|
|
|
348
443
|
## How It Works
|
|
349
444
|
|
|
350
|
-
PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes
|
|
351
|
-
|
|
352
|
-
```
|
|
353
|
-
┌──────────────────────────────────────────────────────────────┐
|
|
354
|
-
│ │
|
|
355
|
-
│ You PolyHarness │
|
|
356
|
-
│ │ │ │
|
|
357
|
-
│ ├── ph init ──────────────────→│ Creates workspace │
|
|
358
|
-
│ │ (harness + tasks + eval) │ Copies files │
|
|
359
|
-
│ │ │ Injects CLAUDE.md │
|
|
360
|
-
│ │ │ │
|
|
361
|
-
│ ├── ph run ───────────────────→│ Starts search loop: │
|
|
362
|
-
│ │ │ │
|
|
363
|
-
│ │ ┌──────────────────────────┤ │
|
|
364
|
-
│ │ │ Step 1: SELECT parent │ Best or Tournament │
|
|
365
|
-
│ │ │ Step 2: COPY harness │ From parent → candidate │
|
|
366
|
-
│ │ │ Step 3: PROPOSE changes │ Agent reads all history │
|
|
367
|
-
│ │ │ Step 4: EVALUATE │ Run tasks, get scores │
|
|
368
|
-
│ │ │ Step 5: STORE results │ Code + scores + traces │
|
|
369
|
-
│ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
|
|
370
|
-
│ │ └──────────┬───────────────┤ │
|
|
371
|
-
│ │ └── loop ───────┘ │
|
|
372
|
-
│ │ │ │
|
|
373
|
-
│ ├── ph log ───────────────────→│ Shows search tree │
|
|
374
|
-
│ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
|
|
375
|
-
│ └── ph apply ─────────────────→│ Writes best back │
|
|
376
|
-
│ │
|
|
377
|
-
└──────────────────────────────────────────────────────────────┘
|
|
378
|
-
```
|
|
445
|
+
PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes. See the detailed flow diagrams above in [Step 4](#4-run-the-optimization-loop) and [Step 6](#6-auto-evolution).
|
|
379
446
|
|
|
380
447
|
### Why it works: non-Markovian search
|
|
381
448
|
|
|
@@ -400,12 +467,23 @@ The Proposer reads **all of this** before generating the next candidate. It can
|
|
|
400
467
|
| `claude-code` | `claude -p` | Official Claude Code CLI (Pro/Teams subscription) |
|
|
401
468
|
| `claw-code` | `claw -p` | Open-source Claw Code CLI |
|
|
402
469
|
| `codex` | `codex --quiet` | OpenAI Codex CLI |
|
|
470
|
+
| `hermes` | `hermes chat -q` | Nous Research [Hermes Agent](https://github.com/NousResearch/hermes-agent) CLI |
|
|
403
471
|
| `opencode` | `opencode -p` | OpenCode CLI |
|
|
404
472
|
| `local` | — | Offline rule-based engine for development & testing |
|
|
405
473
|
|
|
406
474
|
`ph doctor` auto-detects all available backends and shows their status.
|
|
407
475
|
|
|
408
|
-
When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `OPENCODE.md` — each agent's native instruction format.
|
|
476
|
+
When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `AGENTS.md` (Hermes), `OPENCODE.md` — each agent's native instruction format.
|
|
477
|
+
|
|
478
|
+
#### Backend ensemble (adaptive selection)
|
|
479
|
+
|
|
480
|
+
Don't know which backend writes the best harness changes for your task? Let PolyHarness find out. Pass several and it picks one per iteration with a **UCB bandit**, shifting picks toward whichever backend actually produces *improving* candidates:
|
|
481
|
+
|
|
482
|
+
```bash
|
|
483
|
+
ph run --ensemble "claude-code,codex,local"
|
|
484
|
+
```
|
|
485
|
+
|
|
486
|
+
At the end of the run you get a per-backend breakdown (picks + improve-rate). Selection is deterministic given the reward sequence, so runs stay reproducible. Inspired by ShinkaEvolve's adaptive LLM-ensemble selection.
|
|
409
487
|
|
|
410
488
|
### Local Model Setup
|
|
411
489
|
|
|
@@ -455,10 +533,16 @@ After `ph init`, the workspace has a `config.yaml` with these sections:
|
|
|
455
533
|
search:
|
|
456
534
|
max_iterations: 20 # Maximum search iterations
|
|
457
535
|
early_stop_patience: 5 # Stop after N iterations with no improvement
|
|
458
|
-
parent_selection: best # Strategy: best | tournament | all
|
|
536
|
+
parent_selection: best # Strategy: best | tournament | all | pareto
|
|
537
|
+
novelty_filter: false # Reject near-duplicate candidates before eval (saves budget)
|
|
538
|
+
novelty_threshold: 0.97 # Similarity ratio above which a candidate is a near-duplicate
|
|
539
|
+
novelty_max_retries: 1 # Regenerate a near-duplicate this many times before skipping
|
|
540
|
+
seed: null # RNG seed — set an int to make randomized runs reproducible
|
|
459
541
|
|
|
460
542
|
proposer:
|
|
461
|
-
backend: api # api | openai | claude-code | claw-code | codex | opencode | local
|
|
543
|
+
backend: api # api | openai | claude-code | claw-code | codex | hermes | opencode | local
|
|
544
|
+
ensemble: [] # If non-empty, pick among these backends per iteration via a UCB bandit
|
|
545
|
+
bandit_c: 1.41421356 # UCB exploration constant (higher = more exploration)
|
|
462
546
|
model: claude-sonnet-4-20250514 # Model name (for api/openai backends)
|
|
463
547
|
base_url: null # Custom API endpoint (for openai backend)
|
|
464
548
|
api_key: null # API key override (null = use env var)
|
|
@@ -470,6 +554,9 @@ evaluator:
|
|
|
470
554
|
type: python # python | docker | custom
|
|
471
555
|
entry: evaluate.py # Evaluator script entrypoint
|
|
472
556
|
timeout: 300 # Per-task timeout in seconds
|
|
557
|
+
cascade: false # Stage cheap subset first; skip rest if it fails the gate (per-task mode)
|
|
558
|
+
cascade_threshold: 0.4 # Min stage-1 mean score required to run the full task set
|
|
559
|
+
cascade_stage1: 0 # Tasks in stage 1 (0 = auto, ~1/3 of the list)
|
|
473
560
|
|
|
474
561
|
harness:
|
|
475
562
|
language: python # Harness code language
|
|
@@ -537,11 +624,11 @@ python -m polyharness --version
|
|
|
537
624
|
| `ph init` | Initialize workspace with auto-copy of harness, tasks, eval script |
|
|
538
625
|
| `ph run` | Start the optimization search loop |
|
|
539
626
|
| `ph status` | Progress table with elapsed time, improvement rate, and delta |
|
|
540
|
-
| `ph log` | Search tree with delta (Δ) column (or `--flat` for table) |
|
|
627
|
+
| `ph log` | Search tree with delta (Δ) column and Pareto-frontier (◆) markers (or `--flat` for table) |
|
|
541
628
|
| `ph best` | Show best candidate: score, per-task breakdown, changes summary |
|
|
542
629
|
| `ph compare A B` | Compare two iterations: score deltas + unified code diff |
|
|
543
630
|
| `ph diff <N>` | Shorthand for `compare 0 <N>` |
|
|
544
|
-
| `ph leaderboard` | Ranked table of all candidates (`--top N`, `--tasks` drilldown) |
|
|
631
|
+
| `ph leaderboard` | Ranked table of all candidates with Pareto (◆) and backend columns (`--top N`, `--tasks` drilldown) |
|
|
545
632
|
| `ph trace <N>` | View stdout, stderr, metrics, exit code for an iteration |
|
|
546
633
|
| `ph report` | Generate a full markdown report with score trends and per-task table |
|
|
547
634
|
| `ph apply` | Copy best harness back to `base_harness/` (or `--target` dir) |
|
|
@@ -555,6 +642,9 @@ python -m polyharness --version
|
|
|
555
642
|
| `ph traces stats` | Summary statistics: total traces, scored count, agent distribution |
|
|
556
643
|
| `ph traces clear` | Remove collected traces (`--keep N` to retain newest, `-y` to skip confirm) |
|
|
557
644
|
| `ph evolve` | Trigger an online evolution cycle using collected traces as context |
|
|
645
|
+
| `ph shell-hook install` | Install shell hook to auto-wrap agent commands (claude, claw, codex, hermes, opencode) |
|
|
646
|
+
| `ph shell-hook uninstall` | Remove the shell hook from your rc file |
|
|
647
|
+
| `ph shell-hook status` | Check if the shell hook is installed |
|
|
558
648
|
| `ph upgrade` | Upgrade PolyHarness to the latest version |
|
|
559
649
|
| `ph uninstall` | Uninstall PolyHarness from the current environment (`-y` to skip confirm) |
|
|
560
650
|
|
|
@@ -582,7 +672,8 @@ python -m polyharness --version
|
|
|
582
672
|
--dry-run Only evaluate the base harness, skip search
|
|
583
673
|
--resume Continue an interrupted search from where it left off
|
|
584
674
|
--backend <name> Override proposer backend without editing config
|
|
585
|
-
--strategy <name> Override parent selection: best | tournament | all
|
|
675
|
+
--strategy <name> Override parent selection: best | tournament | all | pareto
|
|
676
|
+
--ensemble b1,b2,... Pick among multiple backends per iteration via a UCB bandit
|
|
586
677
|
```
|
|
587
678
|
|
|
588
679
|
### `ph wrap` options
|
|
@@ -664,7 +755,7 @@ ph run --max-iterations 5
|
|
|
664
755
|
```
|
|
665
756
|
polyharness/
|
|
666
757
|
├── src/polyharness/
|
|
667
|
-
│ ├── cli.py # Click CLI —
|
|
758
|
+
│ ├── cli.py # Click CLI — 25 commands/subcommands
|
|
668
759
|
│ ├── config.py # Pydantic config models (+ EvolutionConfig)
|
|
669
760
|
│ ├── collector.py # Trace collector for online evolution
|
|
670
761
|
│ ├── orchestrator.py # Meta-Harness search loop + progress bar + error recovery
|
|
@@ -682,6 +773,7 @@ polyharness/
|
|
|
682
773
|
│ │ ├── claude_code.py # claude -p
|
|
683
774
|
│ │ ├── claw_code.py # claw -p
|
|
684
775
|
│ │ ├── codex.py # codex --quiet --auto-edit
|
|
776
|
+
│ │ ├── hermes.py # hermes chat -q
|
|
685
777
|
│ │ └── opencode.py # opencode -p
|
|
686
778
|
│ └── templates/ # 5 built-in task templates
|
|
687
779
|
│ ├── text-classification/
|
|
@@ -689,7 +781,7 @@ polyharness/
|
|
|
689
781
|
│ ├── code-generation/
|
|
690
782
|
│ ├── rag-qa/
|
|
691
783
|
│ └── api-calling/
|
|
692
|
-
├── tests/ #
|
|
784
|
+
├── tests/ # 173 tests (pytest)
|
|
693
785
|
├── bin/ # npm wrapper (ph.mjs, postinstall.mjs)
|
|
694
786
|
├── docs/
|
|
695
787
|
│ ├── development/ # Product roadmap & technical architecture
|