polyharness 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {polyharness-0.2.0/src/polyharness.egg-info → polyharness-0.2.2}/PKG-INFO +135 -43
  2. {polyharness-0.2.0 → polyharness-0.2.2}/README.md +134 -42
  3. {polyharness-0.2.0 → polyharness-0.2.2}/pyproject.toml +1 -1
  4. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/__init__.py +1 -1
  5. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/cli.py +226 -17
  6. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/config.py +82 -3
  7. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/doctor.py +2 -0
  8. polyharness-0.2.2/src/polyharness/orchestrator.py +544 -0
  9. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/__init__.py +3 -0
  10. polyharness-0.2.2/src/polyharness/proposer/adapters/hermes.py +29 -0
  11. polyharness-0.2.2/src/polyharness/proposer/bandit.py +84 -0
  12. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/search_log.py +25 -0
  13. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/workspace.py +11 -0
  14. {polyharness-0.2.0 → polyharness-0.2.2/src/polyharness.egg-info}/PKG-INFO +135 -43
  15. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/SOURCES.txt +3 -0
  16. polyharness-0.2.2/tests/test_bandit.py +66 -0
  17. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_cli_adapters.py +17 -1
  18. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_cli_features.py +57 -0
  19. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_config.py +38 -0
  20. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_evolution.py +79 -0
  21. polyharness-0.2.2/tests/test_orchestrator.py +555 -0
  22. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_search_log.py +20 -0
  23. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_workspace.py +8 -0
  24. polyharness-0.2.0/src/polyharness/orchestrator.py +0 -287
  25. polyharness-0.2.0/tests/test_orchestrator.py +0 -229
  26. {polyharness-0.2.0 → polyharness-0.2.2}/LICENSE +0 -0
  27. {polyharness-0.2.0 → polyharness-0.2.2}/setup.cfg +0 -0
  28. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/__main__.py +0 -0
  29. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/collector.py +0 -0
  30. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/evaluator/__init__.py +0 -0
  31. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/evaluator/evaluator.py +0 -0
  32. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/__init__.py +0 -0
  33. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/base.py +0 -0
  34. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/claude_code.py +0 -0
  35. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/claw_code.py +0 -0
  36. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/codex.py +0 -0
  37. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/adapters/opencode.py +0 -0
  38. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/api_proposer.py +0 -0
  39. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/base.py +0 -0
  40. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/cli_proposer.py +0 -0
  41. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/local_proposer.py +0 -0
  42. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/proposer/openai_proposer.py +0 -0
  43. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/__init__.py +0 -0
  44. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/api-calling/base_harness/harness.py +0 -0
  45. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/api-calling/evaluate.py +0 -0
  46. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/api-calling/tasks/test_cases.json +0 -0
  47. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/code-generation/base_harness/harness.py +0 -0
  48. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/code-generation/evaluate.py +0 -0
  49. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/code-generation/tasks/test_cases.json +0 -0
  50. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/math-word-problems/base_harness/harness.py +0 -0
  51. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/math-word-problems/evaluate.py +0 -0
  52. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/math-word-problems/tasks/test_cases.json +0 -0
  53. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/rag-qa/base_harness/harness.py +0 -0
  54. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/rag-qa/evaluate.py +0 -0
  55. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/rag-qa/tasks/test_cases.json +0 -0
  56. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/text-classification/base_harness/harness.py +0 -0
  57. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/text-classification/evaluate.py +0 -0
  58. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/templates/text-classification/tasks/test_cases.json +0 -0
  59. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness/utils/__init__.py +0 -0
  60. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/dependency_links.txt +0 -0
  61. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/entry_points.txt +0 -0
  62. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/requires.txt +0 -0
  63. {polyharness-0.2.0 → polyharness-0.2.2}/src/polyharness.egg-info/top_level.txt +0 -0
  64. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_collector.py +0 -0
  65. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_compare.py +0 -0
  66. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_evaluator.py +0 -0
  67. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_example.py +0 -0
  68. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_export.py +0 -0
  69. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_log.py +0 -0
  70. {polyharness-0.2.0 → polyharness-0.2.2}/tests/test_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polyharness
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Automated harness optimization for AI agents — make your agent evolve.
5
5
  Author: weijt606
6
6
  License-Expression: MIT
@@ -48,7 +48,7 @@ Dynamic: license-file
48
48
 
49
49
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
50
50
  [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
51
- [![Tests](https://img.shields.io/badge/tests-165%20passing-brightgreen.svg)]()
51
+ [![Tests](https://img.shields.io/badge/tests-206%20passing-brightgreen.svg)]()
52
52
  [![中文文档](https://img.shields.io/badge/文档-中文版-red.svg)](README_CN.md)
53
53
 
54
54
  ---
@@ -63,7 +63,7 @@ Your AI agent runs the same harness every time. Same prompts, same tool config,
63
63
  | | |
64
64
  |---|---|
65
65
  | **Self-Evolution** | Iteratively searches over harness changes and keeps the full evaluation history in one workspace. |
66
- | **7 Agent Backends** | Claude Code · Claw Code · Codex · OpenCode · API direct · OpenAI-compatible · Local — plug in any CLI agent. |
66
+ | **8 Agent Backends** | Claude Code · Claw Code · Codex · Hermes · OpenCode · API direct · OpenAI-compatible · Local — plug in any CLI agent. |
67
67
  | **Full History** | Every iteration's code, scores, and traces preserved. The Meta-Harness paper reports that non-Markovian search outperforms blind retries. |
68
68
  | **Search Tree** | Visualize the optimization path. Compare any two candidates with per-task diffs. |
69
69
  | **One-Command Setup** | `ph init --base-harness ... --task-dir ...` — copies files, configures workspace, done. |
@@ -86,13 +86,19 @@ PolyHarness fills that gap. It's the open-source engine that makes Meta-Harness
86
86
  > - Memory tools (like Supermemory) give agents persistent **memory** across conversations.
87
87
  > - **PolyHarness gives agents persistent self-evolution** — you get a repeatable way to refine how they work over time.
88
88
 
89
+ ### Part of a wave — specialized for harnesses
90
+
91
+ PolyHarness doesn't stand alone. A wave of open-source projects has shown that pairing LLMs with evolutionary search systematically improves code and prompts: [GEPA](https://github.com/gepa-ai/gepa) (reflective prompt evolution over a Pareto frontier), [ShinkaEvolve](https://github.com/SakanaAI/ShinkaEvolve) (sample-efficient program evolution), [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) (an open AlphaEvolve), and the [Darwin Gödel Machine](https://sakana.ai/dgm/) (open-ended self-improving agents).
92
+
93
+ Most of these evolve *general* programs or algorithms. PolyHarness is the member of this wave **specialized for agent harnesses** — the prompts, tool config, and orchestration *around* an existing agent — with a focus on **online evolution from real usage** (`ph wrap` → `ph evolve`). It borrows the strongest ideas from these projects and applies them to any CLI agent on your own tasks: Pareto-frontier parent selection (GEPA), code-novelty rejection and an adaptive backend ensemble (ShinkaEvolve), and cascade evaluation (AlphaEvolve/OpenEvolve).
94
+
89
95
  ## What PolyHarness Is
90
96
 
91
97
  PolyHarness is the open-source engine for iteratively searching over an agent's harness.
92
98
 
93
99
  It builds on ideas from the Meta-Harness paper and the TBench2 results reported there, while focusing this repository on the optimization workflow itself — how harness variants are proposed, evaluated, and revised over repeated runs.
94
100
 
95
- If tools like ForgeCode help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
101
+ If tools like [ForgeCode](https://github.com/antinomyhq/forgecode) help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
96
102
 
97
103
  ---
98
104
 
@@ -262,7 +268,7 @@ PolyHarness automatically sandboxes your agent inside this workspace, ensuring i
262
268
 
263
269
  | Scenario | How to configure |
264
270
  |----------|------------------|
265
- | **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, opencode)* |
271
+ | **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, hermes, opencode)* |
266
272
  | **Anthropic API** | Run `ph init --agent api`. Set `export ANTHROPIC_API_KEY="sk-ant-..."` before `ph run`. |
267
273
  | **OpenAI / Local Models** | Run `ph init --agent openai`. Then configure the endpoint — see [Local Model Setup](#local-model-setup) below. |
268
274
  | **Custom CLI path** | If your CLI agent uses a non-standard command, edit `config.yaml` in the workspace before running:<br>`proposer: { cli_path: "npx @anthropic-ai/claude-code" }`|
@@ -275,6 +281,34 @@ ph run
275
281
 
276
282
  The orchestrator: copies your harness → asks the Proposer agent for a candidate change → evaluates the result → stores everything → repeats.
277
283
 
284
+ ```
285
+ ┌──────────────────────────────────────────────────────────────┐
286
+ │ │
287
+ │ You PolyHarness │
288
+ │ │ │ │
289
+ │ ├── ph init ──────────────────→│ Creates workspace │
290
+ │ │ (harness + tasks + eval) │ Copies files │
291
+ │ │ │ Injects CLAUDE.md │
292
+ │ │ │ │
293
+ │ ├── ph run ───────────────────→│ Starts search loop: │
294
+ │ │ │ │
295
+ │ │ ┌──────────────────────────┤ │
296
+ │ │ │ Step 1: SELECT parent │ Best or Tournament │
297
+ │ │ │ Step 2: COPY harness │ From parent → candidate │
298
+ │ │ │ Step 3: PROPOSE changes │ Agent reads all history │
299
+ │ │ │ Step 4: EVALUATE │ Run tasks, get scores │
300
+ │ │ │ Step 5: STORE results │ Code + scores + traces │
301
+ │ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
302
+ │ │ └──────────┬───────────────┤ │
303
+ │ │ └── loop ───────┘ │
304
+ │ │ │ │
305
+ │ ├── ph log ───────────────────→│ Shows search tree │
306
+ │ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
307
+ │ └── ph apply ─────────────────→│ Writes best back │
308
+ │ │
309
+ └──────────────────────────────────────────────────────────────┘
310
+ ```
311
+
278
312
  ### 5. Inspect and apply
279
313
 
280
314
  ```bash
@@ -303,6 +337,7 @@ Just add `ph wrap --auto-evolve` in front of your agent command (pick the one ma
303
337
  ph wrap --auto-evolve claude -p "Refactor the auth module to use JWT" # Claude Code
304
338
  ph wrap --auto-evolve claw -p "Write integration tests for payments" # Claw Code
305
339
  ph wrap --auto-evolve codex "Add retry logic to the API client" # Codex
340
+ ph wrap --auto-evolve hermes chat -q "Refactor the DB connection pool" # Hermes Agent
306
341
  ph wrap --auto-evolve opencode -p "Fix the flaky parser test" # OpenCode
307
342
 
308
343
  # Local models — wrap the CLI command directly
@@ -358,7 +393,67 @@ ph evolve # trigger evolution manually
358
393
 
359
394
  > **Tip:** Use `--no-record-output` if you don't want stdout/stderr saved (e.g., for sensitive output). Metadata is always recorded.
360
395
 
361
- > **Tip:** Create a shell alias for even less typing: `alias cc="ph wrap --auto-evolve claude"`
396
+ #### Zero-config auto-wrap: `ph shell-hook`
397
+
398
+ Don't want to type `ph wrap --auto-evolve` every time? Install a shell hook — it auto-intercepts agent commands:
399
+
400
+ ```bash
401
+ ph shell-hook install # one-time setup, writes to ~/.zshrc
402
+ ```
403
+
404
+ After that, just use your agent as usual:
405
+
406
+ ```bash
407
+ claude -p "Refactor auth to JWT" # automatically becomes: ph wrap --auto-evolve claude -p ...
408
+ claw -p "Write payment tests" # same — auto-wrapped
409
+ codex "Add retry logic" # same
410
+ hermes chat -q "Refactor pool" # same
411
+ opencode -p "Fix flaky test" # same
412
+ ```
413
+
414
+ How it works: a `preexec` hook in your shell detects `claude`/`claw`/`codex`/`hermes`/`opencode` commands and transparently redirects them through `ph wrap --auto-evolve`. Your output is unchanged.
415
+
416
+ ```bash
417
+ ph shell-hook status # check if installed
418
+ ph shell-hook uninstall # remove cleanly (restores original rc file)
419
+ ```
420
+
421
+ #### Auto-Evolution flow
422
+
423
+ ```
424
+ ┌──────────────────────────────────────────────────────────────┐
425
+ │ │
426
+ │ You PolyHarness │
427
+ │ │ │ │
428
+ │ ├── ph shell-hook install ────→ │ Injects preexec hook │
429
+ │ │ (one-time setup) │ into ~/.zshrc │
430
+ │ │ │ │
431
+ │ ├── claude -p "Fix bug" ──────→ │ Shell hook intercepts │
432
+ │ │ (normal usage) │ │
433
+ │ │ ├── Run agent │
434
+ │ │ ┌─ output passes through ──┤ │
435
+ │ │ │ ├── Record trace │
436
+ │ │ │ │ (~/.polyharness/ │
437
+ │ │ │ │ traces/) │
438
+ │ │ │ │ │
439
+ │ │ │ ├── Check threshold │
440
+ │ │ │ │ traces < 50? │
441
+ │ │ │ │ ├─ Yes: "7/50 traces" │
442
+ │ │ │ │ └─ No: trigger ───┐ │
443
+ │ │ │ │ │ │
444
+ │ │ │ │ ┌─────────────────┘ │
445
+ │ │ │ │ │ Evolution cycle │
446
+ │ │ │ │ │ (same as ph run) │
447
+ │ │ │ │ │ Propose → Evaluate │
448
+ │ │ │ │ │ → Store → Repeat │
449
+ │ │ │ │ └────────────────── │
450
+ │ │ │ │ │
451
+ │ └───┘ │ │
452
+ │ │
453
+ └──────────────────────────────────────────────────────────────┘
454
+ ```
455
+
456
+ The key difference: **you never run `ph run` manually.** You use your agent as always; PolyHarness silently collects data and triggers evolution when it has enough signal.
362
457
 
363
458
  ### Try it now (no API key needed)
364
459
 
@@ -380,35 +475,7 @@ The score path above is the current measured result of the bundled `math-word-pr
380
475
 
381
476
  ## How It Works
382
477
 
383
- PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes:
384
-
385
- ```
386
- ┌──────────────────────────────────────────────────────────────┐
387
- │ │
388
- │ You PolyHarness │
389
- │ │ │ │
390
- │ ├── ph init ──────────────────→│ Creates workspace │
391
- │ │ (harness + tasks + eval) │ Copies files │
392
- │ │ │ Injects CLAUDE.md │
393
- │ │ │ │
394
- │ ├── ph run ───────────────────→│ Starts search loop: │
395
- │ │ │ │
396
- │ │ ┌──────────────────────────┤ │
397
- │ │ │ Step 1: SELECT parent │ Best or Tournament │
398
- │ │ │ Step 2: COPY harness │ From parent → candidate │
399
- │ │ │ Step 3: PROPOSE changes │ Agent reads all history │
400
- │ │ │ Step 4: EVALUATE │ Run tasks, get scores │
401
- │ │ │ Step 5: STORE results │ Code + scores + traces │
402
- │ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
403
- │ │ └──────────┬───────────────┤ │
404
- │ │ └── loop ───────┘ │
405
- │ │ │ │
406
- │ ├── ph log ───────────────────→│ Shows search tree │
407
- │ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
408
- │ └── ph apply ─────────────────→│ Writes best back │
409
- │ │
410
- └──────────────────────────────────────────────────────────────┘
411
- ```
478
+ PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes. See the detailed flow diagrams above in [Step 4](#4-run-the-optimization-loop) and [Step 6](#6-auto-evolution).
412
479
 
413
480
  ### Why it works: non-Markovian search
414
481
 
@@ -433,12 +500,23 @@ The Proposer reads **all of this** before generating the next candidate. It can
433
500
  | `claude-code` | `claude -p` | Official Claude Code CLI (Pro/Teams subscription) |
434
501
  | `claw-code` | `claw -p` | Open-source Claw Code CLI |
435
502
  | `codex` | `codex --quiet` | OpenAI Codex CLI |
503
+ | `hermes` | `hermes chat -q` | Nous Research [Hermes Agent](https://github.com/NousResearch/hermes-agent) CLI |
436
504
  | `opencode` | `opencode -p` | OpenCode CLI |
437
505
  | `local` | — | Offline rule-based engine for development & testing |
438
506
 
439
507
  `ph doctor` auto-detects all available backends and shows their status.
440
508
 
441
- When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `OPENCODE.md` — each agent's native instruction format.
509
+ When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `AGENTS.md` (Hermes), `OPENCODE.md` — each agent's native instruction format.
510
+
511
+ #### Backend ensemble (adaptive selection)
512
+
513
+ Don't know which backend writes the best harness changes for your task? Let PolyHarness find out. Pass several and it picks one per iteration with a **UCB bandit**, shifting picks toward whichever backend actually produces *improving* candidates:
514
+
515
+ ```bash
516
+ ph run --ensemble "claude-code,codex,local"
517
+ ```
518
+
519
+ At the end of the run you get a per-backend breakdown (picks + improve-rate). Selection is deterministic given the reward sequence, so runs stay reproducible. Inspired by ShinkaEvolve's adaptive LLM-ensemble selection.
442
520
 
443
521
  ### Local Model Setup
444
522
 
@@ -488,10 +566,16 @@ After `ph init`, the workspace has a `config.yaml` with these sections:
488
566
  search:
489
567
  max_iterations: 20 # Maximum search iterations
490
568
  early_stop_patience: 5 # Stop after N iterations with no improvement
491
- parent_selection: best # Strategy: best | tournament | all
569
+ parent_selection: best # Strategy: best | tournament | all | pareto
570
+ novelty_filter: false # Reject near-duplicate candidates before eval (saves budget)
571
+ novelty_threshold: 0.97 # Similarity ratio above which a candidate is a near-duplicate
572
+ novelty_max_retries: 1 # Regenerate a near-duplicate this many times before skipping
573
+ seed: null # RNG seed — set an int to make randomized runs reproducible
492
574
 
493
575
  proposer:
494
- backend: api # api | openai | claude-code | claw-code | codex | opencode | local
576
+ backend: api # api | openai | claude-code | claw-code | codex | hermes | opencode | local
577
+ ensemble: [] # If non-empty, pick among these backends per iteration via a UCB bandit
578
+ bandit_c: 1.41421356 # UCB exploration constant (higher = more exploration)
495
579
  model: claude-sonnet-4-20250514 # Model name (for api/openai backends)
496
580
  base_url: null # Custom API endpoint (for openai backend)
497
581
  api_key: null # API key override (null = use env var)
@@ -503,6 +587,9 @@ evaluator:
503
587
  type: python # python | docker | custom
504
588
  entry: evaluate.py # Evaluator script entrypoint
505
589
  timeout: 300 # Per-task timeout in seconds
590
+ cascade: false # Evaluate a cheap subset of tasks first (stage 1); skip the rest if it fails the gate (per-task mode)
591
+ cascade_threshold: 0.4 # Min stage-1 mean score required to run the full task set
592
+ cascade_stage1: 0 # Tasks in stage 1 (0 = auto, ~1/3 of the list)
506
593
 
507
594
  harness:
508
595
  language: python # Harness code language
@@ -570,11 +657,11 @@ python -m polyharness --version
570
657
  | `ph init` | Initialize workspace with auto-copy of harness, tasks, eval script |
571
658
  | `ph run` | Start the optimization search loop |
572
659
  | `ph status` | Progress table with elapsed time, improvement rate, and delta |
573
- | `ph log` | Search tree with delta (Δ) column (or `--flat` for table) |
660
+ | `ph log` | Search tree with delta (Δ) column and Pareto-frontier (◆) markers (or `--flat` for table) |
574
661
  | `ph best` | Show best candidate: score, per-task breakdown, changes summary |
575
662
  | `ph compare A B` | Compare two iterations: score deltas + unified code diff |
576
663
  | `ph diff <N>` | Shorthand for `compare 0 <N>` |
577
- | `ph leaderboard` | Ranked table of all candidates (`--top N`, `--tasks` drilldown) |
664
+ | `ph leaderboard` | Ranked table of all candidates with Pareto (◆) and backend columns (`--top N`, `--tasks` drilldown) |
578
665
  | `ph trace <N>` | View stdout, stderr, metrics, exit code for an iteration |
579
666
  | `ph report` | Generate a full markdown report with score trends and per-task table |
580
667
  | `ph apply` | Copy best harness back to `base_harness/` (or `--target` dir) |
@@ -588,6 +675,9 @@ python -m polyharness --version
588
675
  | `ph traces stats` | Summary statistics: total traces, scored count, agent distribution |
589
676
  | `ph traces clear` | Remove collected traces (`--keep N` to retain newest, `-y` to skip confirm) |
590
677
  | `ph evolve` | Trigger an online evolution cycle using collected traces as context |
678
+ | `ph shell-hook install` | Install shell hook to auto-wrap agent commands (claude, claw, codex, hermes, opencode) |
679
+ | `ph shell-hook uninstall` | Remove the shell hook from your rc file |
680
+ | `ph shell-hook status` | Check if the shell hook is installed |
591
681
  | `ph upgrade` | Upgrade PolyHarness to the latest version |
592
682
  | `ph uninstall` | Uninstall PolyHarness from the current environment (`-y` to skip confirm) |
593
683
 
@@ -615,7 +705,8 @@ python -m polyharness --version
615
705
  --dry-run Only evaluate the base harness, skip search
616
706
  --resume Continue an interrupted search from where it left off
617
707
  --backend <name> Override proposer backend without editing config
618
- --strategy <name> Override parent selection: best | tournament | all
708
+ --strategy <name> Override parent selection: best | tournament | all | pareto
709
+ --ensemble b1,b2,... Pick among multiple backends per iteration via a UCB bandit
619
710
  ```
620
711
 
621
712
  ### `ph wrap` options
@@ -697,7 +788,7 @@ ph run --max-iterations 5
697
788
  ```
698
789
  polyharness/
699
790
  ├── src/polyharness/
700
- │ ├── cli.py # Click CLI — 22 commands/subcommands
791
+ │ ├── cli.py # Click CLI — 25 commands/subcommands
701
792
  │ ├── config.py # Pydantic config models (+ EvolutionConfig)
702
793
  │ ├── collector.py # Trace collector for online evolution
703
794
  │ ├── orchestrator.py # Meta-Harness search loop + progress bar + error recovery
@@ -715,6 +806,7 @@ polyharness/
715
806
  │ │ ├── claude_code.py # claude -p
716
807
  │ │ ├── claw_code.py # claw -p
717
808
  │ │ ├── codex.py # codex --quiet --auto-edit
809
+ │ │ ├── hermes.py # hermes chat -q
718
810
  │ │ └── opencode.py # opencode -p
719
811
  │ └── templates/ # 5 built-in task templates
720
812
  │ ├── text-classification/
@@ -722,7 +814,7 @@ polyharness/
722
814
  │ ├── code-generation/
723
815
  │ ├── rag-qa/
724
816
  │ └── api-calling/
725
- ├── tests/ # 165 tests (pytest)
817
+ │ ├── tests/ # 206 tests (pytest)
726
818
  ├── bin/ # npm wrapper (ph.mjs, postinstall.mjs)
727
819
  ├── docs/
728
820
  │ ├── development/ # Product roadmap & technical architecture
@@ -15,7 +15,7 @@
15
15
 
16
16
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
17
17
  [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
18
- [![Tests](https://img.shields.io/badge/tests-165%20passing-brightgreen.svg)]()
18
+ [![Tests](https://img.shields.io/badge/tests-206%20passing-brightgreen.svg)]()
19
19
  [![中文文档](https://img.shields.io/badge/文档-中文版-red.svg)](README_CN.md)
20
20
 
21
21
  ---
@@ -30,7 +30,7 @@ Your AI agent runs the same harness every time. Same prompts, same tool config,
30
30
  | | |
31
31
  |---|---|
32
32
  | **Self-Evolution** | Iteratively searches over harness changes and keeps the full evaluation history in one workspace. |
33
- | **7 Agent Backends** | Claude Code · Claw Code · Codex · OpenCode · API direct · OpenAI-compatible · Local — plug in any CLI agent. |
33
+ | **8 Agent Backends** | Claude Code · Claw Code · Codex · Hermes · OpenCode · API direct · OpenAI-compatible · Local — plug in any CLI agent. |
34
34
  | **Full History** | Every iteration's code, scores, and traces preserved. The Meta-Harness paper reports that non-Markovian search outperforms blind retries. |
35
35
  | **Search Tree** | Visualize the optimization path. Compare any two candidates with per-task diffs. |
36
36
  | **One-Command Setup** | `ph init --base-harness ... --task-dir ...` — copies files, configures workspace, done. |
@@ -53,13 +53,19 @@ PolyHarness fills that gap. It's the open-source engine that makes Meta-Harness
53
53
  > - Memory tools (like Supermemory) give agents persistent **memory** across conversations.
54
54
  > - **PolyHarness gives agents persistent self-evolution** — you get a repeatable way to refine how they work over time.
55
55
 
56
+ ### Part of a wave — specialized for harnesses
57
+
58
+ PolyHarness doesn't stand alone. A wave of open-source projects has shown that pairing LLMs with evolutionary search systematically improves code and prompts: [GEPA](https://github.com/gepa-ai/gepa) (reflective prompt evolution over a Pareto frontier), [ShinkaEvolve](https://github.com/SakanaAI/ShinkaEvolve) (sample-efficient program evolution), [OpenEvolve](https://github.com/algorithmicsuperintelligence/openevolve) (an open AlphaEvolve), and the [Darwin Gödel Machine](https://sakana.ai/dgm/) (open-ended self-improving agents).
59
+
60
+ Most of these evolve *general* programs or algorithms. PolyHarness is the member of this wave **specialized for agent harnesses** — the prompts, tool config, and orchestration *around* an existing agent — with a focus on **online evolution from real usage** (`ph wrap` → `ph evolve`). It borrows the strongest ideas from these projects and applies them to any CLI agent on your own tasks: Pareto-frontier parent selection (GEPA), code-novelty rejection and an adaptive backend ensemble (ShinkaEvolve), and cascade evaluation (AlphaEvolve/OpenEvolve).
61
+
56
62
  ## What PolyHarness Is
57
63
 
58
64
  PolyHarness is the open-source engine for iteratively searching over an agent's harness.
59
65
 
60
66
  It builds on ideas from the Meta-Harness paper and the TBench2 results reported there, while focusing this repository on the optimization workflow itself — how harness variants are proposed, evaluated, and revised over repeated runs.
61
67
 
62
- If tools like ForgeCode help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
68
+ If tools like [ForgeCode](https://github.com/antinomyhq/forgecode) help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
63
69
 
64
70
  ---
65
71
 
@@ -229,7 +235,7 @@ PolyHarness automatically sandboxes your agent inside this workspace, ensuring i
229
235
 
230
236
  | Scenario | How to configure |
231
237
  |----------|------------------|
232
- | **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, opencode)* |
238
+ | **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, hermes, opencode)* |
233
239
  | **Anthropic API** | Run `ph init --agent api`. Set `export ANTHROPIC_API_KEY="sk-ant-..."` before `ph run`. |
234
240
  | **OpenAI / Local Models** | Run `ph init --agent openai`. Then configure the endpoint — see [Local Model Setup](#local-model-setup) below. |
235
241
  | **Custom CLI path** | If your CLI agent uses a non-standard command, edit `config.yaml` in the workspace before running:<br>`proposer: { cli_path: "npx @anthropic-ai/claude-code" }`|
@@ -242,6 +248,34 @@ ph run
242
248
 
243
249
  The orchestrator: copies your harness → asks the Proposer agent for a candidate change → evaluates the result → stores everything → repeats.
244
250
 
251
+ ```
252
+ ┌──────────────────────────────────────────────────────────────┐
253
+ │ │
254
+ │ You PolyHarness │
255
+ │ │ │ │
256
+ │ ├── ph init ──────────────────→│ Creates workspace │
257
+ │ │ (harness + tasks + eval) │ Copies files │
258
+ │ │ │ Injects CLAUDE.md │
259
+ │ │ │ │
260
+ │ ├── ph run ───────────────────→│ Starts search loop: │
261
+ │ │ │ │
262
+ │ │ ┌──────────────────────────┤ │
263
+ │ │ │ Step 1: SELECT parent │ Best or Tournament │
264
+ │ │ │ Step 2: COPY harness │ From parent → candidate │
265
+ │ │ │ Step 3: PROPOSE changes │ Agent reads all history │
266
+ │ │ │ Step 4: EVALUATE │ Run tasks, get scores │
267
+ │ │ │ Step 5: STORE results │ Code + scores + traces │
268
+ │ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
269
+ │ │ └──────────┬───────────────┤ │
270
+ │ │ └── loop ───────┘ │
271
+ │ │ │ │
272
+ │ ├── ph log ───────────────────→│ Shows search tree │
273
+ │ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
274
+ │ └── ph apply ─────────────────→│ Writes best back │
275
+ │ │
276
+ └──────────────────────────────────────────────────────────────┘
277
+ ```
278
+
245
279
  ### 5. Inspect and apply
246
280
 
247
281
  ```bash
@@ -270,6 +304,7 @@ Just add `ph wrap --auto-evolve` in front of your agent command (pick the one ma
270
304
  ph wrap --auto-evolve claude -p "Refactor the auth module to use JWT" # Claude Code
271
305
  ph wrap --auto-evolve claw -p "Write integration tests for payments" # Claw Code
272
306
  ph wrap --auto-evolve codex "Add retry logic to the API client" # Codex
307
+ ph wrap --auto-evolve hermes chat -q "Refactor the DB connection pool" # Hermes Agent
273
308
  ph wrap --auto-evolve opencode -p "Fix the flaky parser test" # OpenCode
274
309
 
275
310
  # Local models — wrap the CLI command directly
@@ -325,7 +360,67 @@ ph evolve # trigger evolution manually
325
360
 
326
361
  > **Tip:** Use `--no-record-output` if you don't want stdout/stderr saved (e.g., for sensitive output). Metadata is always recorded.
327
362
 
328
- > **Tip:** Create a shell alias for even less typing: `alias cc="ph wrap --auto-evolve claude"`
363
+ #### Zero-config auto-wrap: `ph shell-hook`
364
+
365
+ Don't want to type `ph wrap --auto-evolve` every time? Install a shell hook — it auto-intercepts agent commands:
366
+
367
+ ```bash
368
+ ph shell-hook install # one-time setup, writes to ~/.zshrc
369
+ ```
370
+
371
+ After that, just use your agent as usual:
372
+
373
+ ```bash
374
+ claude -p "Refactor auth to JWT" # automatically becomes: ph wrap --auto-evolve claude -p ...
375
+ claw -p "Write payment tests" # same — auto-wrapped
376
+ codex "Add retry logic" # same
377
+ hermes chat -q "Refactor pool" # same
378
+ opencode -p "Fix flaky test" # same
379
+ ```
380
+
381
+ How it works: a `preexec` hook in your shell detects `claude`/`claw`/`codex`/`hermes`/`opencode` commands and transparently redirects them through `ph wrap --auto-evolve`. Your output is unchanged.
382
+
383
+ ```bash
384
+ ph shell-hook status # check if installed
385
+ ph shell-hook uninstall # remove cleanly (restores original rc file)
386
+ ```
387
+
388
+ #### Auto-Evolution flow
389
+
390
+ ```
391
+ ┌──────────────────────────────────────────────────────────────┐
392
+ │ │
393
+ │ You PolyHarness │
394
+ │ │ │ │
395
+ │ ├── ph shell-hook install ────→ │ Injects preexec hook │
396
+ │ │ (one-time setup) │ into ~/.zshrc │
397
+ │ │ │ │
398
+ │ ├── claude -p "Fix bug" ──────→ │ Shell hook intercepts │
399
+ │ │ (normal usage) │ │
400
+ │ │ ├── Run agent │
401
+ │ │ ┌─ output passes through ──┤ │
402
+ │ │ │ ├── Record trace │
403
+ │ │ │ │ (~/.polyharness/ │
404
+ │ │ │ │ traces/) │
405
+ │ │ │ │ │
406
+ │ │ │ ├── Check threshold │
407
+ │ │ │ │ traces < 50? │
408
+ │ │ │ │ ├─ Yes: "7/50 traces" │
409
+ │ │ │ │ └─ No: trigger ───┐ │
410
+ │ │ │ │ │ │
411
+ │ │ │ │ ┌─────────────────┘ │
412
+ │ │ │ │ │ Evolution cycle │
413
+ │ │ │ │ │ (same as ph run) │
414
+ │ │ │ │ │ Propose → Evaluate │
415
+ │ │ │ │ │ → Store → Repeat │
416
+ │ │ │ │ └────────────────── │
417
+ │ │ │ │ │
418
+ │ └───┘ │ │
419
+ │ │
420
+ └──────────────────────────────────────────────────────────────┘
421
+ ```
422
+
423
+ The key difference: **you never run `ph run` manually.** You keep using your agent as usual; PolyHarness silently collects data and triggers evolution when it has enough signal.
329
424
 
330
425
  ### Try it now (no API key needed)
331
426
 
@@ -347,35 +442,7 @@ The score path above is the current measured result of the bundled `math-word-pr
347
442
 
348
443
  ## How It Works
349
444
 
350
- PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes:
351
-
352
- ```
353
- ┌──────────────────────────────────────────────────────────────┐
354
- │ │
355
- │ You PolyHarness │
356
- │ │ │ │
357
- │ ├── ph init ──────────────────→│ Creates workspace │
358
- │ │ (harness + tasks + eval) │ Copies files │
359
- │ │ │ Injects CLAUDE.md │
360
- │ │ │ │
361
- │ ├── ph run ───────────────────→│ Starts search loop: │
362
- │ │ │ │
363
- │ │ ┌──────────────────────────┤ │
364
- │ │ │ Step 1: SELECT parent │ Best or Tournament │
365
- │ │ │ Step 2: COPY harness │ From parent → candidate │
366
- │ │ │ Step 3: PROPOSE changes │ Agent reads all history │
367
- │ │ │ Step 4: EVALUATE │ Run tasks, get scores │
368
- │ │ │ Step 5: STORE results │ Code + scores + traces │
369
- │ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
370
- │ │ └──────────┬───────────────┤ │
371
- │ │ └── loop ───────┘ │
372
- │ │ │ │
373
- │ ├── ph log ───────────────────→│ Shows search tree │
374
- │ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
375
- │ └── ph apply ─────────────────→│ Writes best back │
376
- │ │
377
- └──────────────────────────────────────────────────────────────┘
378
- ```
445
+ PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes. See the detailed flow diagrams above in [Step 4](#4-run-the-optimization-loop) and [Step 6](#6-auto-evolution).
379
446
 
380
447
  ### Why it works: non-Markovian search
381
448
 
@@ -400,12 +467,23 @@ The Proposer reads **all of this** before generating the next candidate. It can
400
467
  | `claude-code` | `claude -p` | Official Claude Code CLI (Pro/Teams subscription) |
401
468
  | `claw-code` | `claw -p` | Open-source Claw Code CLI |
402
469
  | `codex` | `codex --quiet` | OpenAI Codex CLI |
470
+ | `hermes` | `hermes chat -q` | Nous Research [Hermes Agent](https://github.com/NousResearch/hermes-agent) CLI |
403
471
  | `opencode` | `opencode -p` | OpenCode CLI |
404
472
  | `local` | — | Offline rule-based engine for development & testing |
405
473
 
406
474
  `ph doctor` auto-detects all available backends and shows their status.
407
475
 
408
- When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `OPENCODE.md` — each agent's native instruction format.
476
+ When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `AGENTS.md` (Hermes), `OPENCODE.md` — each agent's native instruction format.
477
+
478
+ #### Backend ensemble (adaptive selection)
479
+
480
+ Don't know which backend writes the best harness changes for your task? Let PolyHarness find out. Pass several and it picks one per iteration with a **UCB bandit**, shifting picks toward whichever backend actually produces *improving* candidates:
481
+
482
+ ```bash
483
+ ph run --ensemble "claude-code,codex,local"
484
+ ```
485
+
486
+ At the end of the run you get a per-backend breakdown (picks + improve-rate). Selection is deterministic given the reward sequence, so runs stay reproducible. Inspired by ShinkaEvolve's adaptive LLM-ensemble selection.
409
487
 
410
488
  ### Local Model Setup
411
489
 
@@ -455,10 +533,16 @@ After `ph init`, the workspace has a `config.yaml` with these sections:
455
533
  search:
456
534
  max_iterations: 20 # Maximum search iterations
457
535
  early_stop_patience: 5 # Stop after N iterations with no improvement
458
- parent_selection: best # Strategy: best | tournament | all
536
+ parent_selection: best # Strategy: best | tournament | all | pareto
537
+ novelty_filter: false # Reject near-duplicate candidates before eval (saves budget)
538
+ novelty_threshold: 0.97 # Similarity ratio above which a candidate is a near-duplicate
539
+ novelty_max_retries: 1 # Regenerate a near-duplicate this many times before skipping
540
+ seed: null # RNG seed — set an int to make randomized runs reproducible
459
541
 
460
542
  proposer:
461
- backend: api # api | openai | claude-code | claw-code | codex | opencode | local
543
+ backend: api # api | openai | claude-code | claw-code | codex | hermes | opencode | local
544
+ ensemble: [] # If non-empty, pick among these backends per iteration via a UCB bandit
545
+ bandit_c: 1.41421356 # UCB exploration constant (higher = more exploration)
462
546
  model: claude-sonnet-4-20250514 # Model name (for api/openai backends)
463
547
  base_url: null # Custom API endpoint (for openai backend)
464
548
  api_key: null # API key override (null = use env var)
@@ -470,6 +554,9 @@ evaluator:
470
554
  type: python # python | docker | custom
471
555
  entry: evaluate.py # Evaluator script entrypoint
472
556
  timeout: 300 # Per-task timeout in seconds
557
+ cascade: false # Stage cheap subset first; skip rest if it fails the gate (per-task mode)
558
+ cascade_threshold: 0.4 # Min stage-1 mean score required to run the full task set
559
+ cascade_stage1: 0 # Tasks in stage 1 (0 = auto, ~1/3 of the list)
473
560
 
474
561
  harness:
475
562
  language: python # Harness code language
@@ -537,11 +624,11 @@ python -m polyharness --version
537
624
  | `ph init` | Initialize workspace with auto-copy of harness, tasks, eval script |
538
625
  | `ph run` | Start the optimization search loop |
539
626
  | `ph status` | Progress table with elapsed time, improvement rate, and delta |
540
- | `ph log` | Search tree with delta (Δ) column (or `--flat` for table) |
627
+ | `ph log` | Search tree with delta (Δ) column and Pareto-frontier (◆) markers (or `--flat` for table) |
541
628
  | `ph best` | Show best candidate: score, per-task breakdown, changes summary |
542
629
  | `ph compare A B` | Compare two iterations: score deltas + unified code diff |
543
630
  | `ph diff <N>` | Shorthand for `compare 0 <N>` |
544
- | `ph leaderboard` | Ranked table of all candidates (`--top N`, `--tasks` drilldown) |
631
+ | `ph leaderboard` | Ranked table of all candidates with Pareto (◆) and backend columns (`--top N`, `--tasks` drilldown) |
545
632
  | `ph trace <N>` | View stdout, stderr, metrics, exit code for an iteration |
546
633
  | `ph report` | Generate a full markdown report with score trends and per-task table |
547
634
  | `ph apply` | Copy best harness back to `base_harness/` (or `--target` dir) |
@@ -555,6 +642,9 @@ python -m polyharness --version
555
642
  | `ph traces stats` | Summary statistics: total traces, scored count, agent distribution |
556
643
  | `ph traces clear` | Remove collected traces (`--keep N` to retain newest, `-y` to skip confirm) |
557
644
  | `ph evolve` | Trigger an online evolution cycle using collected traces as context |
645
+ | `ph shell-hook install` | Install shell hook to auto-wrap agent commands (claude, claw, codex, hermes, opencode) |
646
+ | `ph shell-hook uninstall` | Remove the shell hook from your rc file |
647
+ | `ph shell-hook status` | Check if the shell hook is installed |
558
648
  | `ph upgrade` | Upgrade PolyHarness to the latest version |
559
649
  | `ph uninstall` | Uninstall PolyHarness from the current environment (`-y` to skip confirm) |
560
650
 
@@ -582,7 +672,8 @@ python -m polyharness --version
582
672
  --dry-run Only evaluate the base harness, skip search
583
673
  --resume Continue an interrupted search from where it left off
584
674
  --backend <name> Override proposer backend without editing config
585
- --strategy <name> Override parent selection: best | tournament | all
675
+ --strategy <name> Override parent selection: best | tournament | all | pareto
676
+ --ensemble b1,b2,... Pick among multiple backends per iteration via a UCB bandit
586
677
  ```
587
678
 
588
679
  ### `ph wrap` options
@@ -664,7 +755,7 @@ ph run --max-iterations 5
664
755
  ```
665
756
  polyharness/
666
757
  ├── src/polyharness/
667
- │ ├── cli.py # Click CLI — 22 commands/subcommands
758
+ │ ├── cli.py # Click CLI — 25 commands/subcommands
668
759
  │ ├── config.py # Pydantic config models (+ EvolutionConfig)
669
760
  │ ├── collector.py # Trace collector for online evolution
670
761
  │ ├── orchestrator.py # Meta-Harness search loop + progress bar + error recovery
@@ -682,6 +773,7 @@ polyharness/
682
773
  │ │ ├── claude_code.py # claude -p
683
774
  │ │ ├── claw_code.py # claw -p
684
775
  │ │ ├── codex.py # codex --quiet --auto-edit
776
+ │ │ ├── hermes.py # hermes chat -q
685
777
  │ │ └── opencode.py # opencode -p
686
778
  │ └── templates/ # 5 built-in task templates
687
779
  │ ├── text-classification/
@@ -689,7 +781,7 @@ polyharness/
689
781
  │ ├── code-generation/
690
782
  │ ├── rag-qa/
691
783
  │ └── api-calling/
692
- ├── tests/ # 165 tests (pytest)
784
+ ├── tests/ # 173 tests (pytest)
693
785
  ├── bin/ # npm wrapper (ph.mjs, postinstall.mjs)
694
786
  ├── docs/
695
787
  │ ├── development/ # Product roadmap & technical architecture
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "polyharness"
7
- version = "0.2.0"
7
+ version = "0.2.2"
8
8
  description = "Automated harness optimization for AI agents — make your agent evolve."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -1,3 +1,3 @@
1
1
  """PolyHarness — Automated harness optimization for AI agents."""
2
2
 
3
- __version__ = "0.2.0"
3
+ __version__ = "0.2.2"