harness-evolver 3.3.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +54 -29
- package/agents/evolver-architect.md +56 -23
- package/agents/evolver-consolidator.md +57 -0
- package/agents/evolver-critic.md +58 -15
- package/agents/evolver-proposer.md +21 -0
- package/agents/evolver-testgen.md +22 -0
- package/bin/install.js +8 -13
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +343 -71
- package/tools/__pycache__/add_evaluator.cpython-313.pyc +0 -0
- package/tools/__pycache__/adversarial_inject.cpython-313.pyc +0 -0
- package/tools/__pycache__/consolidate.cpython-313.pyc +0 -0
- package/tools/__pycache__/iteration_gate.cpython-313.pyc +0 -0
- package/tools/__pycache__/regression_tracker.cpython-313.pyc +0 -0
- package/tools/__pycache__/synthesize_strategy.cpython-313.pyc +0 -0
- package/tools/__pycache__/validate_state.cpython-313.pyc +0 -0
- package/tools/add_evaluator.py +103 -0
- package/tools/adversarial_inject.py +205 -0
- package/tools/consolidate.py +235 -0
- package/tools/iteration_gate.py +140 -0
- package/tools/regression_tracker.py +175 -0
- package/tools/synthesize_strategy.py +178 -0
- package/tools/validate_state.py +212 -0
- package/tools/__pycache__/detect_stack.cpython-314.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-314.pyc +0 -0
package/skills/evolve/SKILL.md
CHANGED
|
@@ -26,10 +26,16 @@ Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations.
|
|
|
26
26
|
## Parse Arguments
|
|
27
27
|
|
|
28
28
|
- `--iterations N` (default: from interactive question or 5)
|
|
29
|
+
- `--no-interactive` — skip all AskUserQuestion prompts and use defaults (iterations: the `--iterations` value or 5; target: `target_score` from `.evolver.json` if set, otherwise none; mode: interactive). Required for cron/background scheduled runs.
|
|
29
30
|
|
|
30
31
|
## Pre-Loop: Interactive Configuration
|
|
31
32
|
|
|
32
|
-
If no
|
|
33
|
+
If `--no-interactive` is set, skip all questions and use defaults:
|
|
34
|
+
- Iterations: value from `--iterations` or 5
|
|
35
|
+
- Target: value from `.evolver.json` `target_score` if set, otherwise no limit
|
|
36
|
+
- Mode: interactive (the cron itself handles scheduling)
|
|
37
|
+
|
|
38
|
+
Otherwise, if no `--iterations` argument was provided, ask the user:
|
|
33
39
|
|
|
34
40
|
```json
|
|
35
41
|
{
|
|
@@ -59,6 +65,68 @@ If no `--iterations` argument was provided, ask the user:
|
|
|
59
65
|
}
|
|
60
66
|
```
|
|
61
67
|
|
|
68
|
+
Write the target to `.evolver.json` for gate checks:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python3 -c "
|
|
72
|
+
import json
|
|
73
|
+
c = json.load(open('.evolver.json'))
|
|
74
|
+
c['target_score'] = {target_score_float} # parsed from user selection, or None for 'No limit'
|
|
75
|
+
json.dump(c, open('.evolver.json', 'w'), indent=2)
|
|
76
|
+
"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
If iterations > 3, offer execution mode:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"questions": [
|
|
84
|
+
{
|
|
85
|
+
"question": "Run mode?",
|
|
86
|
+
"header": "Execution",
|
|
87
|
+
"multiSelect": false,
|
|
88
|
+
"options": [
|
|
89
|
+
{"label": "Interactive", "description": "I'll watch. Show results after each iteration."},
|
|
90
|
+
{"label": "Background", "description": "Run all iterations in background. Notify on completion or significant improvement."},
|
|
91
|
+
{"label": "Scheduled", "description": "Schedule iterations to run on a cron (e.g., nightly optimization)."}
|
|
92
|
+
]
|
|
93
|
+
}
|
|
94
|
+
]
|
|
95
|
+
}
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**If "Background" selected:**
|
|
99
|
+
Run the evolution loop as a background task. Use the `run_in_background` parameter on the main loop execution.
|
|
100
|
+
|
|
101
|
+
**If "Scheduled" selected:**
|
|
102
|
+
Ask for schedule via AskUserQuestion:
|
|
103
|
+
```json
|
|
104
|
+
{
|
|
105
|
+
"questions": [
|
|
106
|
+
{
|
|
107
|
+
"question": "Schedule?",
|
|
108
|
+
"header": "Cron Schedule",
|
|
109
|
+
"multiSelect": false,
|
|
110
|
+
"options": [
|
|
111
|
+
{"label": "Every 6 hours", "description": "Run 1 iteration every 6 hours"},
|
|
112
|
+
{"label": "Nightly (2 AM)", "description": "Run iterations overnight"},
|
|
113
|
+
{"label": "Custom", "description": "Enter a cron expression"}
|
|
114
|
+
]
|
|
115
|
+
}
|
|
116
|
+
]
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Then create a cron trigger:
|
|
121
|
+
```
|
|
122
|
+
Use CronCreate tool to schedule:
|
|
123
|
+
- command: "/evolver:evolve --iterations 1 --no-interactive"
|
|
124
|
+
- schedule: {selected_cron}
|
|
125
|
+
- description: "Harness Evolver: scheduled optimization iteration"
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Report: "Scheduled evolution iterations. Use `/evolver:status` to check progress. Cancel with CronDelete."
|
|
129
|
+
|
|
62
130
|
## The Loop
|
|
63
131
|
|
|
64
132
|
Read config:
|
|
@@ -66,6 +134,29 @@ Read config:
|
|
|
66
134
|
python3 -c "import json; c=json.load(open('.evolver.json')); print(f'Best: {c[\"best_experiment\"]} ({c[\"best_score\"]:.3f}), Iterations: {c[\"iterations\"]}')"
|
|
67
135
|
```
|
|
68
136
|
|
|
137
|
+
### 0.5. Validate State
|
|
138
|
+
|
|
139
|
+
Before starting the loop, verify `.evolver.json` matches LangSmith reality:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
VALIDATION=$($EVOLVER_PY $TOOLS/validate_state.py --config .evolver.json 2>/dev/null)
|
|
143
|
+
VALID=$(echo "$VALIDATION" | python3 -c "import sys,json; print(json.load(sys.stdin).get('valid', False))")
|
|
144
|
+
if [ "$VALID" = "False" ]; then
|
|
145
|
+
echo "WARNING: State validation found issues:"
|
|
146
|
+
echo "$VALIDATION" | python3 -c "
|
|
147
|
+
import sys, json
|
|
148
|
+
data = json.load(sys.stdin)
|
|
149
|
+
for issue in data.get('issues', []):
|
|
150
|
+
print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
|
|
151
|
+
"
|
|
152
|
+
fi
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
If critical issues are found, ask the user via AskUserQuestion whether to continue, fix first, or abort:
|
|
156
|
+
- "Continue anyway" — proceed with warnings
|
|
157
|
+
- "Fix and retry" — attempt auto-fix with `--fix` flag
|
|
158
|
+
- "Abort" — stop the evolution loop
|
|
159
|
+
|
|
69
160
|
For each iteration:
|
|
70
161
|
|
|
71
162
|
### 1. Get Next Version
|
|
@@ -117,67 +208,153 @@ If `best_results.json` exists, parse it to find failing examples (score < 0.7).
|
|
|
117
208
|
Generate adaptive briefings for Candidates D and E (same logic as v2).
|
|
118
209
|
If no best_results.json (first iteration without baseline), all proposers work from code analysis only — no failure data available.
|
|
119
210
|
|
|
211
|
+
### 1.8a. Synthesize Strategy
|
|
212
|
+
|
|
213
|
+
Generate a targeted strategy document from all available analysis:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
$EVOLVER_PY $TOOLS/synthesize_strategy.py \
|
|
217
|
+
--config .evolver.json \
|
|
218
|
+
--trace-insights trace_insights.json \
|
|
219
|
+
--best-results best_results.json \
|
|
220
|
+
--evolution-memory evolution_memory.json \
|
|
221
|
+
--output strategy.md 2>/dev/null
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
The `strategy.md` file is included in the proposer `<files_to_read>` block via the shared context (Step 1.9). This replaces raw data dumps with a synthesized, actionable document — proposers receive specific targets, not raw traces.
|
|
225
|
+
|
|
226
|
+
### 1.9. Prepare Shared Proposer Context
|
|
227
|
+
|
|
228
|
+
Build the shared context that ALL proposers will receive as an identical prefix. This enables KV cache sharing — spawning 5 proposers costs barely more than 1.
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
# Build shared context block (identical for all 5 proposers)
|
|
232
|
+
SHARED_FILES_BLOCK="<files_to_read>
|
|
233
|
+
- .evolver.json
|
|
234
|
+
- strategy.md (if exists)
|
|
235
|
+
- evolution_memory.md (if exists)
|
|
236
|
+
- {entry_point_file}
|
|
237
|
+
</files_to_read>"
|
|
238
|
+
|
|
239
|
+
SHARED_CONTEXT_BLOCK="<context>
|
|
240
|
+
Best experiment: {best_experiment} (score: {best_score})
|
|
241
|
+
Framework: {framework}
|
|
242
|
+
Entry point: {entry_point}
|
|
243
|
+
Evaluators: {evaluators}
|
|
244
|
+
Iteration: {iteration_number} of {total_iterations}
|
|
245
|
+
Score history: {score_history_summary}
|
|
246
|
+
</context>"
|
|
247
|
+
|
|
248
|
+
SHARED_OBJECTIVE="<objective>
|
|
249
|
+
Improve the agent code to score higher on the evaluation dataset.
|
|
250
|
+
You are working in an isolated git worktree — modify any file freely.
|
|
251
|
+
</objective>"
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
**CRITICAL for cache sharing**: The `<objective>`, `<files_to_read>`, and `<context>` blocks MUST be byte-identical across all 5 proposer prompts. Only the `<strategy>` block differs. Place the `<strategy>` block after all three shared blocks (only the fixed `<output>` block follows it) so the shared prefix is maximized.
|
|
255
|
+
|
|
120
256
|
### 2. Spawn 5 Proposers in Parallel
|
|
121
257
|
|
|
122
|
-
Each proposer
|
|
258
|
+
Each proposer receives the IDENTICAL prefix (objective + files + context) followed by its unique strategy suffix.
|
|
123
259
|
|
|
124
|
-
**
|
|
260
|
+
**All 5 candidates** — `run_in_background: true, isolation: "worktree"`:
|
|
125
261
|
|
|
262
|
+
The prompt for EACH proposer follows this structure:
|
|
126
263
|
```
|
|
127
|
-
|
|
128
|
-
subagent_type: "evolver-proposer",
|
|
129
|
-
description: "Proposer A: exploit best version",
|
|
130
|
-
isolation: "worktree",
|
|
131
|
-
run_in_background: true,
|
|
132
|
-
prompt: |
|
|
133
|
-
<objective>
|
|
134
|
-
Improve the agent code to score higher on the evaluation dataset.
|
|
135
|
-
You are working in an isolated git worktree — modify any file freely.
|
|
136
|
-
</objective>
|
|
264
|
+
{SHARED_OBJECTIVE}
|
|
137
265
|
|
|
138
|
-
|
|
139
|
-
APPROACH: exploitation
|
|
140
|
-
Make targeted improvements to the current best version.
|
|
141
|
-
Focus on the specific failures identified in the results.
|
|
142
|
-
</strategy>
|
|
266
|
+
{SHARED_FILES_BLOCK}
|
|
143
267
|
|
|
144
|
-
|
|
145
|
-
- .evolver.json
|
|
146
|
-
- trace_insights.json (if exists)
|
|
147
|
-
- production_seed.json (if exists)
|
|
148
|
-
- best_results.json (if exists)
|
|
149
|
-
- {entry point file from .evolver.json}
|
|
150
|
-
</files_to_read>
|
|
268
|
+
{SHARED_CONTEXT_BLOCK}
|
|
151
269
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
Entry point: {entry_point}
|
|
156
|
-
Evaluators: {evaluators}
|
|
157
|
-
Failing examples: {failing_example_summary}
|
|
158
|
-
</context>
|
|
270
|
+
<strategy>
|
|
271
|
+
{UNIQUE PER CANDIDATE — see below}
|
|
272
|
+
</strategy>
|
|
159
273
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
274
|
+
<output>
|
|
275
|
+
1. Modify the code to improve performance
|
|
276
|
+
2. Commit your changes with a descriptive message
|
|
277
|
+
3. Write proposal.md explaining what you changed and why
|
|
278
|
+
</output>
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
**Candidate A strategy block:**
|
|
282
|
+
```
|
|
283
|
+
APPROACH: exploitation
|
|
284
|
+
Make targeted improvements to the current best version.
|
|
285
|
+
Focus on the specific failures identified in the results.
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
**Candidate B strategy block:**
|
|
289
|
+
```
|
|
290
|
+
APPROACH: exploration
|
|
291
|
+
Try a fundamentally different approach. Change algorithms, prompts, routing, architecture.
|
|
292
|
+
Don't be afraid to make big changes — this worktree is disposable.
|
|
166
293
|
```
|
|
167
294
|
|
|
168
|
-
**Candidate
|
|
169
|
-
|
|
295
|
+
**Candidate C strategy block:**
|
|
296
|
+
```
|
|
297
|
+
APPROACH: crossover
|
|
298
|
+
Combine strengths from previous iterations. Check git log for what was tried.
|
|
299
|
+
Recent changes: {git_log_last_5}
|
|
300
|
+
```
|
|
170
301
|
|
|
171
|
-
**Candidate
|
|
172
|
-
|
|
173
|
-
|
|
302
|
+
**Candidate D strategy block:**
|
|
303
|
+
```
|
|
304
|
+
APPROACH: {failure_targeted_or_creative}
|
|
305
|
+
{adaptive_briefing_d}
|
|
306
|
+
```
|
|
174
307
|
|
|
175
|
-
**
|
|
176
|
-
|
|
177
|
-
|
|
308
|
+
**Candidate E strategy block:**
|
|
309
|
+
```
|
|
310
|
+
APPROACH: {failure_targeted_or_efficiency}
|
|
311
|
+
{adaptive_briefing_e}
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
**Tool restrictions per strategy:**
|
|
315
|
+
|
|
316
|
+
| Strategy | Allowed Tools | Rationale |
|
|
317
|
+
|----------|--------------|-----------|
|
|
318
|
+
| Exploit (A) | Read, Edit, Bash, Glob, Grep | No Write — can't create new files, only edit existing |
|
|
319
|
+
| Explore (B) | Read, Write, Edit, Bash, Glob, Grep | Full access — may need new files for new architecture |
|
|
320
|
+
| Crossover (C) | Read, Edit, Bash, Glob, Grep | No Write — combines existing patterns, doesn't create |
|
|
321
|
+
| Failure-targeted (D, E) | Read, Edit, Bash, Glob, Grep | No Write — focused fixes on specific files |
|
|
322
|
+
|
|
323
|
+
Apply via the `tools` parameter in each Agent() call. Example for exploit:
|
|
324
|
+
```
|
|
325
|
+
Agent(
|
|
326
|
+
subagent_type: "evolver-proposer",
|
|
327
|
+
tools: ["Read", "Edit", "Bash", "Glob", "Grep"],
|
|
328
|
+
...
|
|
329
|
+
)
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
For explore:
|
|
333
|
+
```
|
|
334
|
+
Agent(
|
|
335
|
+
subagent_type: "evolver-proposer",
|
|
336
|
+
tools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
|
|
337
|
+
...
|
|
338
|
+
)
|
|
339
|
+
```
|
|
178
340
|
|
|
179
341
|
Wait for all 5 to complete.
|
|
180
342
|
|
|
343
|
+
**Stuck proposer detection**: If any proposer hasn't completed after 10 minutes, it may be stuck in a loop. The Claude Code runtime handles this via the agent's turn limit. If a proposer returns without committing changes, skip it — don't retry.
|
|
344
|
+
|
|
345
|
+
After all proposers complete, check which ones actually committed:
|
|
346
|
+
|
|
347
|
+
```bash
|
|
348
|
+
for WORKTREE in {worktree_paths}; do
|
|
349
|
+
CHANGES=$(cd "$WORKTREE" && git log --oneline -1 --since="10 minutes ago" 2>/dev/null | wc -l)
|
|
350
|
+
if [ "$CHANGES" -eq 0 ]; then
|
|
351
|
+
echo "Proposer in $WORKTREE made no commits — skipping"
|
|
352
|
+
fi
|
|
353
|
+
done
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
Only run evaluation (Step 3) for proposers that committed changes.
|
|
357
|
+
|
|
181
358
|
### 3. Run Target for Each Candidate
|
|
182
359
|
|
|
183
360
|
For each worktree that has changes (proposer committed something):
|
|
@@ -298,81 +475,176 @@ Iteration {i}/{N} — 5 candidates evaluated:
|
|
|
298
475
|
Per-task champion: {champion} (beats winner on {N} tasks)
|
|
299
476
|
```
|
|
300
477
|
|
|
301
|
-
### 5.5. Test Suite Growth
|
|
478
|
+
### 5.5. Regression Tracking & Test Suite Growth
|
|
302
479
|
|
|
303
|
-
If
|
|
480
|
+
If this is not the first iteration (previous experiment exists), track regressions and auto-add guards:
|
|
304
481
|
|
|
305
482
|
```bash
|
|
306
|
-
python3 -c "
|
|
307
|
-
from langsmith import Client
|
|
483
|
+
PREV_EXP=$(python3 -c "
|
|
308
484
|
import json
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
485
|
+
h = json.load(open('.evolver.json')).get('history', [])
|
|
486
|
+
print(h[-2]['experiment'] if len(h) >= 2 else '')
|
|
487
|
+
")
|
|
488
|
+
if [ -n "$PREV_EXP" ]; then
|
|
489
|
+
$EVOLVER_PY $TOOLS/regression_tracker.py \
|
|
490
|
+
--config .evolver.json \
|
|
491
|
+
--previous-experiment "$PREV_EXP" \
|
|
492
|
+
--current-experiment "{winner_experiment}" \
|
|
493
|
+
--add-guards --max-guards 5 \
|
|
494
|
+
--output regression_report.json 2>/dev/null
|
|
495
|
+
|
|
496
|
+
# Report regressions
|
|
497
|
+
python3 -c "
|
|
498
|
+
import json, os
|
|
499
|
+
if os.path.exists('regression_report.json'):
|
|
500
|
+
r = json.load(open('regression_report.json'))
|
|
501
|
+
if r['regression_count'] > 0:
|
|
502
|
+
print(f'WARNING: {r[\"regression_count\"]} regressions detected')
|
|
503
|
+
if r['guards_added'] > 0:
|
|
504
|
+
print(f' Added {r[\"guards_added\"]} regression guard examples to dataset')
|
|
505
|
+
if r['fixed_count'] > 0:
|
|
506
|
+
print(f' {r[\"fixed_count\"]} previously-failing examples now pass')
|
|
318
507
|
" 2>/dev/null
|
|
508
|
+
fi
|
|
319
509
|
```
|
|
320
510
|
|
|
321
511
|
### 6. Report
|
|
322
512
|
|
|
323
513
|
Print: `Iteration {i}/{N}: v{NNN} scored {score} (best: {best} at {best_score})`
|
|
324
514
|
|
|
325
|
-
### 6.
|
|
515
|
+
### 6.2. Consolidate Evolution Memory
|
|
326
516
|
|
|
327
|
-
|
|
517
|
+
Spawn the consolidator agent to analyze the iteration and update cross-iteration memory:
|
|
328
518
|
|
|
329
|
-
|
|
519
|
+
```
|
|
520
|
+
Agent(
|
|
521
|
+
subagent_type: "evolver-consolidator",
|
|
522
|
+
description: "Consolidate evolution memory after iteration v{NNN}",
|
|
523
|
+
run_in_background: true,
|
|
524
|
+
prompt: |
|
|
525
|
+
<objective>
|
|
526
|
+
Consolidate learnings from iteration v{NNN}.
|
|
527
|
+
Run the consolidation tool and review its output.
|
|
528
|
+
</objective>
|
|
529
|
+
|
|
530
|
+
<tools_path>
|
|
531
|
+
TOOLS={tools_path}
|
|
532
|
+
EVOLVER_PY={evolver_py_path}
|
|
533
|
+
</tools_path>
|
|
534
|
+
|
|
535
|
+
<instructions>
|
|
536
|
+
Run: $EVOLVER_PY $TOOLS/consolidate.py \
|
|
537
|
+
--config .evolver.json \
|
|
538
|
+
--comparison-files comparison.json \
|
|
539
|
+
--output evolution_memory.md \
|
|
540
|
+
--output-json evolution_memory.json
|
|
541
|
+
|
|
542
|
+
Then read the output and verify insights are accurate.
|
|
543
|
+
</instructions>
|
|
544
|
+
|
|
545
|
+
<files_to_read>
|
|
546
|
+
- .evolver.json
|
|
547
|
+
- comparison.json
|
|
548
|
+
- trace_insights.json (if exists)
|
|
549
|
+
- regression_report.json (if exists)
|
|
550
|
+
- evolution_memory.md (if exists)
|
|
551
|
+
</files_to_read>
|
|
552
|
+
)
|
|
553
|
+
```
|
|
554
|
+
|
|
555
|
+
The `evolution_memory.md` file will be included in proposer briefings for subsequent iterations.
|
|
556
|
+
|
|
557
|
+
### 6.5. Auto-trigger Active Critic
|
|
558
|
+
|
|
559
|
+
If score jumped >0.3 from previous iteration OR reached target in <3 iterations:
|
|
330
560
|
|
|
331
561
|
```
|
|
332
562
|
Agent(
|
|
333
563
|
subagent_type: "evolver-critic",
|
|
334
|
-
description: "Critic:
|
|
564
|
+
description: "Active Critic: detect and fix evaluator gaming",
|
|
335
565
|
prompt: |
|
|
336
566
|
<objective>
|
|
337
|
-
EVAL GAMING
|
|
567
|
+
EVAL GAMING CHECK: Score jumped from {prev_score} to {score}.
|
|
338
568
|
Check if the LangSmith evaluators are being gamed.
|
|
569
|
+
If gaming detected, add stricter evaluators using $TOOLS/add_evaluator.py.
|
|
339
570
|
</objective>
|
|
340
571
|
|
|
572
|
+
<tools_path>
|
|
573
|
+
TOOLS={tools_path}
|
|
574
|
+
EVOLVER_PY={evolver_py_path}
|
|
575
|
+
</tools_path>
|
|
576
|
+
|
|
341
577
|
<files_to_read>
|
|
342
578
|
- .evolver.json
|
|
343
579
|
- comparison.json
|
|
344
580
|
- trace_insights.json
|
|
581
|
+
- evolution_memory.md (if exists)
|
|
345
582
|
</files_to_read>
|
|
346
583
|
)
|
|
347
584
|
```
|
|
348
585
|
|
|
349
|
-
|
|
586
|
+
If the critic added new evaluators, log it:
|
|
587
|
+
```
|
|
588
|
+
Critic added evaluators: {new_evaluators}. Next iteration will use stricter evaluation.
|
|
589
|
+
```
|
|
590
|
+
|
|
591
|
+
### 7. Auto-trigger Architect (ULTRAPLAN Mode)
|
|
350
592
|
|
|
351
593
|
If 3 consecutive iterations within 1% OR score dropped:
|
|
352
594
|
|
|
353
595
|
```
|
|
354
596
|
Agent(
|
|
355
597
|
subagent_type: "evolver-architect",
|
|
356
|
-
|
|
598
|
+
model: "opus",
|
|
599
|
+
description: "Architect ULTRAPLAN: deep topology analysis",
|
|
357
600
|
prompt: |
|
|
358
601
|
<objective>
|
|
359
602
|
The evolution loop has stagnated after {iterations} iterations.
|
|
360
|
-
|
|
603
|
+
Scores: {last_3_scores}.
|
|
604
|
+
Perform deep architectural analysis and recommend structural changes.
|
|
605
|
+
Use extended thinking — you have more compute budget than normal agents.
|
|
361
606
|
</objective>
|
|
362
607
|
|
|
608
|
+
<tools_path>
|
|
609
|
+
TOOLS={tools_path}
|
|
610
|
+
EVOLVER_PY={evolver_py_path}
|
|
611
|
+
</tools_path>
|
|
612
|
+
|
|
363
613
|
<files_to_read>
|
|
364
614
|
- .evolver.json
|
|
365
615
|
- trace_insights.json
|
|
366
|
-
-
|
|
616
|
+
- evolution_memory.md (if exists)
|
|
617
|
+
- evolution_memory.json (if exists)
|
|
618
|
+
- strategy.md (if exists)
|
|
619
|
+
- {entry point and all related source files}
|
|
367
620
|
</files_to_read>
|
|
368
621
|
)
|
|
369
622
|
```
|
|
370
623
|
|
|
371
|
-
|
|
624
|
+
After architect completes, include `architecture.md` in proposer `<files_to_read>` for next iteration.
|
|
625
|
+
|
|
626
|
+
### 8. Gate Check (Three-Gate Trigger)
|
|
627
|
+
|
|
628
|
+
Before starting the next iteration, run the gate check:
|
|
629
|
+
|
|
630
|
+
```bash
|
|
631
|
+
GATE_RESULT=$($EVOLVER_PY $TOOLS/iteration_gate.py --config .evolver.json 2>/dev/null)
|
|
632
|
+
PROCEED=$(echo "$GATE_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('proceed', True))")
|
|
633
|
+
```
|
|
634
|
+
|
|
635
|
+
If `PROCEED` is `False`, check suggestions:
|
|
636
|
+
|
|
637
|
+
```bash
|
|
638
|
+
SUGGEST=$(echo "$GATE_RESULT" | python3 -c "import sys,json; s=json.load(sys.stdin).get('suggestions',[]); print(s[0] if s else '')")
|
|
639
|
+
```
|
|
640
|
+
|
|
641
|
+
- If `$SUGGEST` is `architect`: auto-trigger architect agent (Step 7)
|
|
642
|
+
- If `$SUGGEST` is `continue_cautious`: ask user via AskUserQuestion whether to continue
|
|
643
|
+
- Otherwise: stop the loop and report final results
|
|
372
644
|
|
|
645
|
+
Legacy stop conditions still apply:
|
|
373
646
|
- **Target**: `score >= target_score` → stop
|
|
374
|
-
- **N reached**: done
|
|
375
|
-
- **Stagnation post-architect**: 3 more iterations without improvement → stop
|
|
647
|
+
- **N reached**: all requested iterations done → stop
|
|
376
648
|
|
|
377
649
|
## When Loop Ends — Final Report
|
|
378
650
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Add a new evaluator to .evolver.json configuration.
|
|
3
|
+
|
|
4
|
+
Used by the active critic to programmatically strengthen evaluation
|
|
5
|
+
when gaming is detected.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 add_evaluator.py --config .evolver.json --evaluator factual_accuracy --type llm
|
|
9
|
+
python3 add_evaluator.py --config .evolver.json --evaluator regex_check --type code --pattern "\\d{4}"
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
CODE_EVALUATOR_TEMPLATES = {
|
|
18
|
+
"no_fabricated_references": {
|
|
19
|
+
"description": "Check output doesn't contain fabricated citation patterns",
|
|
20
|
+
"check": "not any(p in output for p in ['http://fake', 'doi.org/10.xxxx', 'ISBN 000'])",
|
|
21
|
+
},
|
|
22
|
+
"answer_not_question": {
|
|
23
|
+
"description": "Check output doesn't just repeat the input question",
|
|
24
|
+
"check": "output.strip().lower() != input_text.strip().lower()",
|
|
25
|
+
},
|
|
26
|
+
"min_length": {
|
|
27
|
+
"description": "Check output meets minimum length",
|
|
28
|
+
"check": "len(output.strip()) >= 20",
|
|
29
|
+
},
|
|
30
|
+
"no_repetition": {
|
|
31
|
+
"description": "Check output doesn't have excessive repetition",
|
|
32
|
+
"check": "len(set(output.split())) / max(len(output.split()), 1) > 0.3",
|
|
33
|
+
},
|
|
34
|
+
"no_empty_filler": {
|
|
35
|
+
"description": "Check output isn't padded with filler phrases to appear longer",
|
|
36
|
+
"check": "output.count('In conclusion') + output.count('As mentioned') + output.count('It is important to note') < 3",
|
|
37
|
+
},
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def add_evaluator(config_path, evaluator_name, eval_type, pattern=None):
    """Register ``evaluator_name`` in the JSON config at ``config_path``.

    The name is appended to the config's ``evaluators`` list. For
    ``eval_type == "code"`` a definition is also stored under
    ``code_evaluators``: a regex entry when ``pattern`` is given, otherwise
    the ``CODE_EVALUATOR_TEMPLATES`` entry of the same name when one exists.

    Args:
        config_path: Path to the JSON config file; it is read and, on
            success, rewritten in place.
        evaluator_name: Name to add to the ``evaluators`` list.
        eval_type: ``"llm"`` or ``"code"``.
        pattern: Optional regex source for a code evaluator.

    Returns:
        True if the config was updated. False if the evaluator is already
        listed or ``pattern`` does not compile as a regular expression; the
        file is left untouched in both cases.
    """
    import re  # local import: only needed to validate ``pattern``

    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    evaluators = config.get("evaluators", [])
    if evaluator_name in evaluators:
        print(f"Evaluator '{evaluator_name}' already exists", file=sys.stderr)
        return False

    definition = None
    if eval_type == "code":
        if pattern:
            # NOTE(review): assumes stored patterns are consumed by Python's
            # ``re`` module -- confirm against the evaluator runner.
            try:
                re.compile(pattern)
            except re.error as exc:
                # Reject early instead of persisting a pattern that would
                # only fail later, at evaluation time.
                print(
                    f"Invalid regex pattern {pattern!r} for evaluator "
                    f"'{evaluator_name}': {exc}",
                    file=sys.stderr,
                )
                return False
            definition = {"pattern": pattern, "type": "regex"}
        elif evaluator_name in CODE_EVALUATOR_TEMPLATES:
            definition = CODE_EVALUATOR_TEMPLATES[evaluator_name]
        else:
            # Still register the name (unchanged behavior), but make it
            # visible that no code definition accompanies it.
            print(
                f"Warning: code evaluator '{evaluator_name}' has no --pattern "
                "and no built-in template; only its name was registered",
                file=sys.stderr,
            )

    evaluators.append(evaluator_name)
    config["evaluators"] = evaluators

    if definition is not None:
        code_evals = config.get("code_evaluators", {})
        code_evals[evaluator_name] = definition
        config["code_evaluators"] = code_evals

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    return True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def main():
    """Command-line entry point: add or remove an evaluator in the config.

    With ``--remove``, the evaluator name is deleted from the config's
    ``evaluators`` list and any definition stored under ``code_evaluators``
    for the same name is deleted with it, so removal mirrors what adding
    writes. If the name is present in neither place, a message is printed
    to stderr and the file is not rewritten.

    Without ``--remove``, delegates to ``add_evaluator`` and, on success,
    prints a JSON summary (added name, type, resulting evaluator list) to
    stdout.
    """
    parser = argparse.ArgumentParser(description="Add evaluator to .evolver.json")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--evaluator", required=True, help="Evaluator name")
    parser.add_argument("--type", choices=["llm", "code"], default="llm", help="Evaluator type")
    parser.add_argument("--pattern", default=None, help="Regex pattern for code evaluators")
    parser.add_argument("--remove", action="store_true", help="Remove evaluator instead of adding")
    args = parser.parse_args()

    if args.remove:
        with open(args.config, encoding="utf-8") as f:
            config = json.load(f)

        evaluators = config.get("evaluators", [])
        code_evals = config.get("code_evaluators") or {}
        listed = args.evaluator in evaluators
        defined = args.evaluator in code_evals

        if not (listed or defined):
            print(f"Evaluator '{args.evaluator}' not found", file=sys.stderr)
            return

        if listed:
            evaluators.remove(args.evaluator)
            config["evaluators"] = evaluators
        if defined:
            # Drop the matching definition too; otherwise a stale entry
            # would outlive the evaluator it belonged to.
            del code_evals[args.evaluator]
            config["code_evaluators"] = code_evals

        with open(args.config, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        print(f"Removed evaluator: {args.evaluator}")
        return

    added = add_evaluator(args.config, args.evaluator, args.type, args.pattern)
    if added:
        # Re-read the file so the summary reflects what was actually saved;
        # use a context manager so the handle is closed deterministically.
        with open(args.config, encoding="utf-8") as f:
            current_evaluators = json.load(f)["evaluators"]
        print(json.dumps({
            "added": args.evaluator,
            "type": args.type,
            "evaluators": current_evaluators,
        }, indent=2))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
if __name__ == "__main__":
|
|
103
|
+
main()
|