polyharness 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. polyharness-0.1.1/LICENSE +21 -0
  2. polyharness-0.1.1/PKG-INFO +493 -0
  3. polyharness-0.1.1/README.md +461 -0
  4. polyharness-0.1.1/pyproject.toml +61 -0
  5. polyharness-0.1.1/setup.cfg +4 -0
  6. polyharness-0.1.1/src/polyharness/__init__.py +3 -0
  7. polyharness-0.1.1/src/polyharness/__main__.py +6 -0
  8. polyharness-0.1.1/src/polyharness/cli.py +1205 -0
  9. polyharness-0.1.1/src/polyharness/config.py +86 -0
  10. polyharness-0.1.1/src/polyharness/doctor.py +61 -0
  11. polyharness-0.1.1/src/polyharness/evaluator/__init__.py +8 -0
  12. polyharness-0.1.1/src/polyharness/evaluator/evaluator.py +153 -0
  13. polyharness-0.1.1/src/polyharness/orchestrator.py +276 -0
  14. polyharness-0.1.1/src/polyharness/proposer/__init__.py +31 -0
  15. polyharness-0.1.1/src/polyharness/proposer/adapters/__init__.py +44 -0
  16. polyharness-0.1.1/src/polyharness/proposer/adapters/base.py +74 -0
  17. polyharness-0.1.1/src/polyharness/proposer/adapters/claude_code.py +31 -0
  18. polyharness-0.1.1/src/polyharness/proposer/adapters/claw_code.py +31 -0
  19. polyharness-0.1.1/src/polyharness/proposer/adapters/codex.py +29 -0
  20. polyharness-0.1.1/src/polyharness/proposer/adapters/opencode.py +28 -0
  21. polyharness-0.1.1/src/polyharness/proposer/api_proposer.py +276 -0
  22. polyharness-0.1.1/src/polyharness/proposer/base.py +29 -0
  23. polyharness-0.1.1/src/polyharness/proposer/cli_proposer.py +143 -0
  24. polyharness-0.1.1/src/polyharness/proposer/local_proposer.py +1102 -0
  25. polyharness-0.1.1/src/polyharness/search_log.py +86 -0
  26. polyharness-0.1.1/src/polyharness/utils/__init__.py +0 -0
  27. polyharness-0.1.1/src/polyharness/workspace.py +356 -0
  28. polyharness-0.1.1/src/polyharness.egg-info/PKG-INFO +493 -0
  29. polyharness-0.1.1/src/polyharness.egg-info/SOURCES.txt +43 -0
  30. polyharness-0.1.1/src/polyharness.egg-info/dependency_links.txt +1 -0
  31. polyharness-0.1.1/src/polyharness.egg-info/entry_points.txt +2 -0
  32. polyharness-0.1.1/src/polyharness.egg-info/requires.txt +13 -0
  33. polyharness-0.1.1/src/polyharness.egg-info/top_level.txt +1 -0
  34. polyharness-0.1.1/tests/test_cli_adapters.py +273 -0
  35. polyharness-0.1.1/tests/test_cli_features.py +451 -0
  36. polyharness-0.1.1/tests/test_compare.py +120 -0
  37. polyharness-0.1.1/tests/test_config.py +44 -0
  38. polyharness-0.1.1/tests/test_evaluator.py +80 -0
  39. polyharness-0.1.1/tests/test_example.py +192 -0
  40. polyharness-0.1.1/tests/test_export.py +135 -0
  41. polyharness-0.1.1/tests/test_log.py +102 -0
  42. polyharness-0.1.1/tests/test_orchestrator.py +229 -0
  43. polyharness-0.1.1/tests/test_search_log.py +61 -0
  44. polyharness-0.1.1/tests/test_smoke.py +45 -0
  45. polyharness-0.1.1/tests/test_workspace.py +212 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 weijt606
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,493 @@
1
+ Metadata-Version: 2.4
2
+ Name: polyharness
3
+ Version: 0.1.1
4
+ Summary: Automated harness optimization for AI agents — make your agent evolve.
5
+ Author: weijt606
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/weijt606/polyharness
8
+ Project-URL: Repository, https://github.com/weijt606/polyharness
9
+ Project-URL: Issues, https://github.com/weijt606/polyharness/issues
10
+ Keywords: agent,harness,optimization,meta-harness,cli
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Libraries
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: anthropic>=0.40.0
21
+ Requires-Dist: click>=8.0
22
+ Requires-Dist: pydantic>=2.0
23
+ Requires-Dist: pyyaml>=6.0
24
+ Requires-Dist: rich>=13.0
25
+ Provides-Extra: docker
26
+ Requires-Dist: docker>=7.0; extra == "docker"
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=8.0; extra == "dev"
29
+ Requires-Dist: pytest-cov; extra == "dev"
30
+ Requires-Dist: ruff; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # PolyHarness
34
+
35
+ ```text
36
+ _____ _ _ _
37
+ | __ \ | | | | | |
38
+ | |__) |__ | |_ _ | |__| | __ _ _ __ _ __ ___ ___ ___
39
+ | ___/ _ \| | | | || __ |/ _` | '__| '_ \ / _ \/ __/ __|
40
+ | | | (_) | | |_| || | | | (_| | | | | | | __/\__ \__ \
41
+ |_| \___/|_|\__, ||_| |_|\__,_|_| |_| |_|\___||___/___/
42
+ __/ |
43
+ |___/
44
+ ```
45
+
46
+ **Make your AI Agent evolve automatically.**
47
+
48
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
49
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
50
+ [![Tests](https://img.shields.io/badge/tests-121%20passing-brightgreen.svg)]()
51
+ [![中文文档](https://img.shields.io/badge/文档-中文版-red.svg)](README_CN.md)
52
+
53
+ ---
54
+
55
+ Your AI agent runs the same harness every time. Same prompts, same tool config, same strategy — no matter how many times it fails.
56
+
57
+ **PolyHarness addresses that.** It records each iteration, evaluates candidate harness changes, and uses the accumulated history to search for better-scoring configurations. You run one command to start the loop.
58
+
59
+ | | |
60
+ |---|---|
61
+ | **Self-Evolution** | Iteratively searches over harness changes and keeps the full evaluation history in one workspace. |
62
+ | **6 Agent Backends** | Claude Code · Claw Code · Codex · OpenCode · API direct · Local — plug in any CLI agent. |
63
+ | **Full History** | Every iteration's code, scores, and traces preserved. The Meta-Harness paper reports that non-Markovian search outperforms blind retries. |
64
+ | **Search Tree** | Visualize the optimization path. Compare any two candidates with per-task diffs. |
65
+ | **One-Command Setup** | `ph init --base-harness ... --task-dir ...` — copies files, configures workspace, done. |
66
+ | **Closed Loop** | init → run → inspect → apply. You choose when to write the best-scoring candidate back to your project. |
67
+
68
+ ---
69
+
70
+ ## Backstory
71
+
72
+ Stanford's [Meta-Harness paper](https://arxiv.org/abs/2603.28052) (IRIS Lab, 2026) reported a surprising result: **harness design is the #1 lever for agent performance** — more impactful than model choice, prompt engineering, or fine-tuning.
73
+
74
+ The key insight? When you give an AI agent access to *full diagnostic history* — not just the latest score, but every past attempt's code, traces, and failure modes — it can *systematically evolve* its own harness configuration. The paper called this "non-Markovian search" and showed it outperforms simple best-of-N sampling by a wide margin.
75
+
76
+ But the paper only released the final optimized artifact (`agent.py`). **The search framework itself was never open-sourced.**
77
+
78
+ PolyHarness fills that gap. It's the open-source engine that makes Meta-Harness search available to everyone — for any agent, any task, any evaluation pipeline.
79
+
80
+ > **Think of it this way:**
81
+ > - Memory tools (like Supermemory) give agents persistent **memory** across conversations.
82
+ > - **PolyHarness gives agents persistent self-evolution** — you get a repeatable way to refine how they work over time.
83
+
84
+ ## What PolyHarness Is
85
+
86
+ PolyHarness is the open-source engine for iteratively searching over an agent's harness.
87
+
88
+ It builds on ideas from the Meta-Harness paper and the TBench2 results reported there, while focusing this repository on the optimization workflow itself — how harness variants are proposed, evaluated, and revised over repeated runs.
89
+
90
+ If tools like ForgeCode help you code, PolyHarness helps you search for task-specific harness improvements by iterating on prompts, tool use, and harness logic.
91
+
92
+ ---
93
+
94
+ ## Use PolyHarness
95
+
96
+ <table>
97
+ <tr>
98
+ <td width="50%" valign="top">
99
+
100
+ ### I use AI coding agents
101
+
102
+ You have Claude Code, Codex, or another agent.
103
+ You want to tune it for your specific tasks — without manually tweaking prompts.
104
+
105
+ ```bash
106
+ pip install polyharness
107
+ ph init --agent claude-code --task-dir ./my_tasks
108
+ ph run
109
+ ph apply
110
+ ```
111
+
112
+ You now have a repeatable optimization workspace. Inspect the results, then apply the best-scoring candidate if it improves your evaluation.
113
+
114
+ **[→ Jump to Quick Start](#quick-start)**
115
+
116
+ </td>
117
+ <td width="50%" valign="top">
118
+
119
+ ### I'm building agent frameworks
120
+
121
+ You're developing an AI agent or tool and want
122
+ to integrate automated optimization as a feature.
123
+
124
+ PolyHarness provides a pluggable adapter API —
125
+ implement 3 methods and your agent can participate in the same search loop.
126
+
127
+ ```python
128
+ class MyAgentAdapter(CLIAdapter):
129
+ def build_command(self, prompt, cwd):
130
+ return ["my-agent", "--prompt", prompt]
131
+ def parse_output(self, stdout, stderr, code):
132
+ return CLIResult(...)
133
+ ```
134
+
135
+ **[→ Jump to Architecture](#how-it-works)**
136
+
137
+ </td>
138
+ </tr>
139
+ </table>
140
+
141
+ ---
142
+
143
+ ## Quick Start
144
+
145
+ ### 1. Install
146
+
147
+ ```bash
148
+ pip install polyharness # Python >= 3.12
149
+ # or
150
+ npm install -g polyharness # Node.js wrapper, auto-installs Python package
151
+ ```
152
+
153
+ ### 2. Check your environment
154
+
155
+ ```bash
156
+ ph doctor
157
+ ```
158
+
159
+ This auto-detects which agent backends (Claude Code, Codex, etc.) are installed and shows their status.
160
+
161
+ ### 3. Initialize a workspace
162
+
163
+ ```bash
164
+ ph init --agent claude-code --base-harness ./my_harness/ --task-dir ./my_tasks/ --eval-script ./evaluate.py
165
+ ```
166
+
167
+ This copies your harness code, test cases, and evaluation script into an isolated **optimization workspace** (by default `.ph_workspace` in the current directory, or the folder specified by `--workspace`).
168
+
169
+ **Configure Your Agent**
170
+
171
+ PolyHarness automatically sandboxes your agent inside this workspace, ensuring it only edits candidate copies and safely reads history traces.
172
+
173
+ | Scenario | How to configure |
174
+ |----------|------------------|
175
+ | **Supported CLI Tools** | Run `ph init --agent <name>`. PolyHarness auto-injects required instructions (e.g., `CLAUDE.md`).<br>*(Supported: claude-code, claw-code, codex, opencode)* |
176
+ | **API / LLM Directly** | Run `ph init --agent api`. No CLI tool required, just run `export ANTHROPIC_API_KEY="sk-ant-..."` before `ph run`. |
177
+ | **Custom CLI path** | If your CLI agent uses a non-standard command, edit `config.yaml` in the workspace before running:<br>`proposer: { cli_path: "npx @anthropic-ai/claude-code" }`|
178
+
179
+ ### 4. Run the optimization loop
180
+
181
+ ```bash
182
+ ph run
183
+ ```
184
+
185
+ The orchestrator: copies your harness → asks the Proposer agent for a candidate change → evaluates the result → stores everything → repeats.
186
+
187
+ ### 5. Inspect and apply
188
+
189
+ ```bash
190
+ ph status # progress table + elapsed + improvement rate
191
+ ph log # search tree with delta (Δ) column
192
+ ph best # best candidate details
193
+ ph leaderboard # ranked table of all candidates (--tasks for drilldown)
194
+ ph compare 0 5 # diff two iterations (scores + code)
195
+ ph diff 5 # shorthand for: compare 0 5
196
+ ph trace 3 # view stdout/stderr/metrics for iter_3
197
+ ph report # generate a full markdown report
198
+
199
+ ph apply # write best harness back to base_harness/
200
+ ph export ./my-optimized # or export to any directory
201
+ ph clean --keep-best # remove candidates to free disk space
202
+ ```
203
+
204
+ ### Try it now (no API key needed)
205
+
206
+ ```bash
207
+ cd examples/math-word-problems
208
+
209
+ ph init --agent local \
210
+ --base-harness ./base_harness \
211
+ --task-dir . \
212
+ --workspace .ph_workspace
213
+ ph run --workspace .ph_workspace
214
+ ph log --workspace .ph_workspace
215
+
216
+ # Search Tree
217
+ # └── iter_0 0.3500
218
+ # └── iter_1 0.5000
219
+ # └── iter_2 0.6500
220
+ # └── iter_3 0.9000 ★
221
+ ```
222
+
223
+ The score path above is the current measured result of the bundled `math-word-problems` example with the repository's `local` backend, rounded for readability. It is not a paper benchmark or an external project result. The `local` backend is deterministic; no fixed score uplift is claimed here for Claude Code, Codex, or other real agent backends.
224
+
225
+ ---
226
+
227
+ ## How It Works
228
+
229
+ PolyHarness runs a **Meta-Harness-style search loop** — an iterative process where an AI agent proposes, evaluates, and stores harness changes:
230
+
231
+ ```
232
+ ┌──────────────────────────────────────────────────────────────┐
233
+ │ │
234
+ │ You PolyHarness │
235
+ │ │ │ │
236
+ │ ├── ph init ──────────────────→│ Creates workspace │
237
+ │ │ (harness + tasks + eval) │ Copies files │
238
+ │ │ │ Injects CLAUDE.md │
239
+ │ │ │ │
240
+ │ ├── ph run ───────────────────→│ Starts search loop: │
241
+ │ │ │ │
242
+ │ │ ┌──────────────────────────┤ │
243
+ │ │ │ Step 1: SELECT parent │ Best or Tournament │
244
+ │ │ │ Step 2: COPY harness │ From parent → candidate │
245
+ │ │ │ Step 3: PROPOSE changes │ Agent reads all history │
246
+ │ │ │ Step 4: EVALUATE │ Run tasks, get scores │
247
+ │ │ │ Step 5: STORE results │ Code + scores + traces │
248
+ │ │ │ Step 6: CHECK stopping │ Improved? Patience left? │
249
+ │ │ └──────────┬───────────────┤ │
250
+ │ │ └── loop ───────┘ │
251
+ │ │ │ │
252
+ │ ├── ph log ───────────────────→│ Shows search tree │
253
+ │ ├── ph compare 0 5 ──────────→│ Score deltas + code diff │
254
+ │ └── ph apply ─────────────────→│ Writes best back │
255
+ │ │
256
+ └──────────────────────────────────────────────────────────────┘
257
+ ```
258
+
259
+ ### Why it works: non-Markovian search
260
+
261
+ Traditional approaches: run the agent → check the score → retry. Each attempt is independent.
262
+
263
+ **PolyHarness is different.** Every iteration stores:
264
+ - The complete candidate source code
265
+ - Per-task scores (not just the overall number)
266
+ - Full execution traces (stdout, stderr, exit codes)
267
+ - Metadata (parent candidate, proposer model, changes summary)
268
+
269
+ The Proposer reads **all of this** before generating the next candidate. It can see *why* a previous attempt failed, *which specific tasks* regressed, and *what code changes* caused it. This is why the Meta-Harness paper found that full-context search outperforms scores-only search by 15+ percentage points.
270
+
271
+ ---
272
+
273
+ ## Supported Agent Backends
274
+
275
+ | Backend | Command | Use case |
276
+ |---------|---------|----------|
277
+ | `api` | — | Default. Anthropic API direct, just needs `ANTHROPIC_API_KEY` |
278
+ | `claude-code` | `claude -p` | Official Claude Code CLI (Pro/Teams subscription) |
279
+ | `claw-code` | `claw -p` | Open-source Claw Code CLI |
280
+ | `codex` | `codex --quiet` | OpenAI Codex CLI |
281
+ | `opencode` | `opencode -p` | OpenCode CLI |
282
+ | `local` | — | Offline rule-based engine for development & testing |
283
+
284
+ `ph doctor` auto-detects all available backends and shows their status.
285
+
286
+ When you run `ph init --agent claude-code`, PolyHarness automatically generates a `CLAUDE.md` instruction file in the workspace, telling the agent how to behave as an optimization Proposer. Same for `CLAW.md`, `CODEX.md`, `OPENCODE.md` — each agent's native instruction format.
287
+
288
+ ---
289
+
290
+ ## Installation
291
+
292
+ ### pip (recommended)
293
+
294
+ ```bash
295
+ pip install polyharness # Requires Python >= 3.12
296
+ ph --version
297
+ ```
298
+
299
+ ### npm / npx
300
+
301
+ ```bash
302
+ npm install -g polyharness # postinstall auto-installs Python package
303
+ npx polyharness doctor # or run without global install
304
+ ```
305
+
306
+ The npm package is a thin Node.js wrapper (`bin/ph.mjs`) that finds and invokes the Python CLI. It checks: `ph` on PATH → `python -m polyharness` → auto-discovers `.venv` in parent directories.
307
+
308
+ ### From source
309
+
310
+ ```bash
311
+ git clone https://github.com/weijt606/polyharness.git
312
+ cd polyharness
313
+
314
+ python -m venv .venv && source .venv/bin/activate
315
+ pip install -e ".[dev]"
316
+ # or: pip install anthropic click pydantic pyyaml rich && export PYTHONPATH="$PWD/src"
317
+
318
+ python -m polyharness --version
319
+ ```
320
+
321
+ ---
322
+
323
+ ## CLI Reference
324
+
325
+ | Command | Description |
326
+ |---------|-------------|
327
+ | `ph doctor` | Detect installed agents and environment status |
328
+ | `ph init` | Initialize workspace with auto-copy of harness, tasks, eval script |
329
+ | `ph run` | Start the optimization search loop |
330
+ | `ph status` | Progress table with elapsed time, improvement rate, and delta |
331
+ | `ph log` | Search tree with delta (Δ) column (or `--flat` for table) |
332
+ | `ph best` | Show best candidate: score, per-task breakdown, changes summary |
333
+ | `ph compare A B` | Compare two iterations: score deltas + unified code diff |
334
+ | `ph diff <N>` | Shorthand for `compare 0 <N>` |
335
+ | `ph leaderboard` | Ranked table of all candidates (`--top N`, `--tasks` drilldown) |
336
+ | `ph trace <N>` | View stdout, stderr, metrics, exit code for an iteration |
337
+ | `ph report` | Generate a full markdown report with score trends and per-task table |
338
+ | `ph apply` | Copy best harness back to `base_harness/` (or `--target` dir) |
339
+ | `ph export <dir>` | Export candidate to any directory (with optional `--include-meta`) |
340
+ | `ph clean` | Remove candidate dirs to free disk space (`--keep-best`, `-y`) |
341
+ | `ph config show` | Display the current workspace configuration |
342
+ | `ph config set K V` | Modify a config value via dot-notation (with validation) |
343
+
344
+ ### Global flags
345
+
346
+ ```
347
+ -v, --verbose Show detailed output
348
+ -q, --quiet Suppress non-essential output
349
+ ```
350
+
351
+ ### `ph init` options
352
+
353
+ ```
354
+ --agent <name> Backend: claude-code | claw-code | codex | opencode | api | local
355
+ --workspace <dir> Workspace directory (default: current dir)
356
+ --base-harness <dir> Copy starting harness code into workspace
357
+ --task-dir <dir> Copy tasks/ folder and evaluate.py into workspace
358
+ --eval-script <path> Copy a specific evaluate.py into workspace
359
+ ```
360
+
361
+ ### `ph run` options
362
+
363
+ ```
364
+ --max-iterations N Override max iterations
365
+ --dry-run Only evaluate the base harness, skip search
366
+ --resume Continue an interrupted search from where it left off
367
+ --backend <name> Override proposer backend without editing config
368
+ --strategy <name> Override parent selection: best | tournament | all
369
+ ```
370
+
371
+ ---
372
+
373
+ ## Examples
374
+
375
+ The score trajectories below are measured from the bundled examples using the current `local` backend and are rounded for readability. They are not borrowed from the Meta-Harness paper or from external benchmarks.
376
+
377
+ ### Text Classification (sentiment analysis)
378
+
379
+ ```bash
380
+ cd examples/text-classification
381
+ ph init --agent local --base-harness ./base_harness --task-dir .
382
+ ph run --max-iterations 3
383
+
384
+ # iter_0: 0.65 → iter_1: 1.00 ★ (naive word list → expanded lexicon)
385
+ ```
386
+
387
+ ### Math Word Problems (numerical reasoning)
388
+
389
+ ```bash
390
+ cd examples/math-word-problems
391
+ ph init --agent local --base-harness ./base_harness --task-dir .
392
+ ph run --max-iterations 5
393
+
394
+ # iter_0: 0.35 → iter_1: 0.50 → iter_2: 0.65 → iter_3: 0.90 ★
395
+ # (naive multiply → operation detection → averages/% → multi-step reasoning)
396
+ ```
397
+
398
+ ### Code Generation (function synthesis)
399
+
400
+ ```bash
401
+ cd examples/code-generation
402
+ ph init --agent local --base-harness ./base_harness --task-dir .
403
+ ph run --max-iterations 5
404
+
405
+ # iter_0: 0.27 → iter_1: 0.50 → iter_2: 0.68 → iter_3: 0.95 ★
406
+ # (5 keywords → 10 patterns → composite logic → comprehensive coverage)
407
+ ```
408
+
409
+ ### API Calling (endpoint routing + parameter extraction)
410
+
411
+ ```bash
412
+ cd examples/api-calling
413
+ ph init --agent local --base-harness ./base_harness --task-dir .
414
+ ph run --max-iterations 5
415
+
416
+ # iter_0: 0.19 → iter_1: 0.55 → iter_2: 0.77 → iter_3: 0.87 ★
417
+ # (keyword matching → broad routing → param helpers → full regex extraction)
418
+ ```
419
+
420
+ ### RAG Question Answering (retrieval + answer extraction)
421
+
422
+ ```bash
423
+ cd examples/rag-qa
424
+ ph init --agent local --base-harness ./base_harness --task-dir .
425
+ ph run --max-iterations 5
426
+
427
+ # iter_0: 0.51 → iter_1: 0.79 ★
428
+ # (word overlap → stopword-filtered retrieval + sentence scoring)
429
+ ```
430
+
431
+ ---
432
+
433
+ ## Project Structure
434
+
435
+ ```
436
+ src/polyharness/
437
+ ├── cli.py # Click CLI — 16 commands/subcommands
438
+ ├── config.py # Pydantic config models
439
+ ├── orchestrator.py # Meta-Harness search loop + progress bar + error recovery
440
+ ├── workspace.py # Filesystem workspace + agent instruction injection
441
+ ├── search_log.py # JSONL append-only search log
442
+ ├── doctor.py # Environment detection for all backends
443
+ ├── evaluator/
444
+ │ └── evaluator.py # PythonEvaluator (subprocess)
445
+ ├── proposer/
446
+ │ ├── api_proposer.py # Anthropic API direct + tool-use loop
447
+ │ ├── cli_proposer.py # CLIProposer — unified subprocess management
448
+ │ ├── local_proposer.py # Offline rule-based (5 task types)
449
+ │ └── adapters/ # Per-agent CLI adapters
450
+ │ ├── claude_code.py # claude -p
451
+ │ ├── claw_code.py # claw -p
452
+ │ ├── codex.py # codex --quiet --auto-edit
453
+ │ └── opencode.py # opencode -p
454
+
455
+ bin/
456
+ ├── ph.mjs # npm wrapper
457
+ └── postinstall.mjs # npm postinstall
458
+
459
+ examples/
460
+ ├── text-classification/ # 20 test cases
461
+ ├── math-word-problems/ # 20 test cases
462
+ ├── code-generation/ # 20 tasks × 3 inputs
463
+ ├── api-calling/ # 20 test cases
464
+ └── rag-qa/ # 20 QA pairs + 10-doc knowledge base
465
+
466
+ tests/ # 121 tests (pytest)
467
+ ```
468
+
469
+ ## Local Development
470
+
471
+ ```bash
472
+ git clone https://github.com/weijt606/polyharness.git && cd polyharness
473
+ python -m venv .venv && source .venv/bin/activate
474
+ pip install anthropic click pydantic pyyaml rich pytest pytest-cov ruff
475
+ export PYTHONPATH="$PWD/src"
476
+
477
+ python -m pytest tests/ # run tests
478
+ ruff check src/ tests/ # lint
479
+ ```
480
+
481
+ ## Documentation
482
+
483
+ - [Product Development](docs/development/product-development.md) — roadmap, user scenarios, success metrics
484
+ - [Technical Architecture](docs/development/technical-architecture.md) — system design & data flow
485
+ - [Meta-Harness Paper](docs/research/references/meta-harness-paper.md) — theoretical foundation and paper-reported reference results
486
+
487
+ ---
488
+
489
+ <p align="center"><strong>Give your agent self-evolution. It's about time.</strong></p>
490
+
491
+ ## License
492
+
493
+ MIT