harness-evolver 3.3.1 → 4.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +54 -29
- package/agents/evolver-architect.md +56 -23
- package/agents/evolver-consolidator.md +57 -0
- package/agents/evolver-critic.md +58 -15
- package/agents/evolver-proposer.md +13 -0
- package/agents/evolver-testgen.md +22 -0
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +288 -71
- package/tools/__pycache__/add_evaluator.cpython-313.pyc +0 -0
- package/tools/__pycache__/adversarial_inject.cpython-313.pyc +0 -0
- package/tools/__pycache__/consolidate.cpython-313.pyc +0 -0
- package/tools/__pycache__/iteration_gate.cpython-313.pyc +0 -0
- package/tools/__pycache__/regression_tracker.cpython-313.pyc +0 -0
- package/tools/__pycache__/synthesize_strategy.cpython-313.pyc +0 -0
- package/tools/__pycache__/validate_state.cpython-313.pyc +0 -0
- package/tools/add_evaluator.py +103 -0
- package/tools/adversarial_inject.py +205 -0
- package/tools/consolidate.py +235 -0
- package/tools/iteration_gate.py +140 -0
- package/tools/regression_tracker.py +175 -0
- package/tools/synthesize_strategy.py +224 -0
- package/tools/validate_state.py +212 -0
- package/tools/__pycache__/detect_stack.cpython-314.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-314.pyc +0 -0
package/skills/evolve/SKILL.md
CHANGED
|
@@ -26,10 +26,16 @@ Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations.
|
|
|
26
26
|
## Parse Arguments
|
|
27
27
|
|
|
28
28
|
- `--iterations N` (default: from interactive question or 5)
|
|
29
|
+
- `--no-interactive` — skip all AskUserQuestion prompts, use defaults (iterations=5, target=none, mode=interactive). Required for cron/background scheduled runs.
|
|
29
30
|
|
|
30
31
|
## Pre-Loop: Interactive Configuration
|
|
31
32
|
|
|
32
|
-
If no
|
|
33
|
+
If `--no-interactive` is set, skip all questions and use defaults:
|
|
34
|
+
- Iterations: value from `--iterations` or 5
|
|
35
|
+
- Target: value from `.evolver.json` `target_score` if set, otherwise no limit
|
|
36
|
+
- Mode: interactive (the cron itself handles scheduling)
|
|
37
|
+
|
|
38
|
+
Otherwise, if no `--iterations` argument was provided, ask the user:
|
|
33
39
|
|
|
34
40
|
```json
|
|
35
41
|
{
|
|
@@ -59,6 +65,38 @@ If no `--iterations` argument was provided, ask the user:
|
|
|
59
65
|
}
|
|
60
66
|
```
|
|
61
67
|
|
|
68
|
+
Write the target to `.evolver.json` for gate checks:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python3 -c "
|
|
72
|
+
import json
|
|
73
|
+
c = json.load(open('.evolver.json'))
|
|
74
|
+
c['target_score'] = {target_score_float} # parsed from user selection, or None for 'No limit'
|
|
75
|
+
json.dump(c, open('.evolver.json', 'w'), indent=2)
|
|
76
|
+
"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
If iterations > 3, offer execution mode:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"questions": [
|
|
84
|
+
{
|
|
85
|
+
"question": "Run mode?",
|
|
86
|
+
"header": "Execution",
|
|
87
|
+
"multiSelect": false,
|
|
88
|
+
"options": [
|
|
89
|
+
{"label": "Interactive", "description": "I'll watch. Show results after each iteration."},
|
|
90
|
+
{"label": "Background", "description": "Run all iterations in background. Notify on completion or significant improvement."}
|
|
91
|
+
]
|
|
92
|
+
}
|
|
93
|
+
]
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**If "Background" selected:**
|
|
98
|
+
Run the evolution loop as a background task. Use the `run_in_background` parameter on the main loop execution.
|
|
99
|
+
|
|
62
100
|
## The Loop
|
|
63
101
|
|
|
64
102
|
Read config:
|
|
@@ -66,6 +104,29 @@ Read config:
|
|
|
66
104
|
python3 -c "import json; c=json.load(open('.evolver.json')); print(f'Best: {c[\"best_experiment\"]} ({c[\"best_score\"]:.3f}), Iterations: {c[\"iterations\"]}')"
|
|
67
105
|
```
|
|
68
106
|
|
|
107
|
+
### 0.5. Validate State
|
|
108
|
+
|
|
109
|
+
Before starting the loop, verify `.evolver.json` matches LangSmith reality:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
VALIDATION=$($EVOLVER_PY $TOOLS/validate_state.py --config .evolver.json 2>/dev/null)
|
|
113
|
+
VALID=$(echo "$VALIDATION" | python3 -c "import sys,json; print(json.load(sys.stdin).get('valid', False))")
|
|
114
|
+
if [ "$VALID" = "False" ]; then
|
|
115
|
+
echo "WARNING: State validation found issues:"
|
|
116
|
+
echo "$VALIDATION" | python3 -c "
|
|
117
|
+
import sys, json
|
|
118
|
+
data = json.load(sys.stdin)
|
|
119
|
+
for issue in data.get('issues', []):
|
|
120
|
+
print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
|
|
121
|
+
"
|
|
122
|
+
fi
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
If critical issues are found, ask the user whether to continue or fix first via AskUserQuestion:
|
|
126
|
+
- "Continue anyway" — proceed with warnings
|
|
127
|
+
- "Fix and retry" — attempt auto-fix with `--fix` flag
|
|
128
|
+
- "Abort" — stop the evolution loop
|
|
129
|
+
|
|
69
130
|
For each iteration:
|
|
70
131
|
|
|
71
132
|
### 1. Get Next Version
|
|
@@ -117,67 +178,128 @@ If `best_results.json` exists, parse it to find failing examples (score < 0.7).
|
|
|
117
178
|
Generate adaptive briefings for Candidates D and E (same logic as v2).
|
|
118
179
|
If no best_results.json (first iteration without baseline), all proposers work from code analysis only — no failure data available.
|
|
119
180
|
|
|
181
|
+
### 1.8a. Synthesize Strategy
|
|
182
|
+
|
|
183
|
+
Generate a targeted strategy document from all available analysis:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
$EVOLVER_PY $TOOLS/synthesize_strategy.py \
|
|
187
|
+
--config .evolver.json \
|
|
188
|
+
--trace-insights trace_insights.json \
|
|
189
|
+
--best-results best_results.json \
|
|
190
|
+
--evolution-memory evolution_memory.json \
|
|
191
|
+
--production-seed production_seed.json \
|
|
192
|
+
--output strategy.md 2>/dev/null
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
The `strategy.md` file is included in the proposer `<files_to_read>` block via the shared context (Step 1.9). It synthesizes trace analysis, evolution memory, and production data into an actionable document. Proposers also receive `production_seed.json` directly for access to raw production traces.
|
|
196
|
+
|
|
197
|
+
### 1.9. Prepare Shared Proposer Context
|
|
198
|
+
|
|
199
|
+
Build the shared context that ALL proposers will receive as an identical prefix. This enables KV cache sharing — spawning 5 proposers costs barely more than 1.
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
# Build shared context block (identical for all 5 proposers)
|
|
203
|
+
SHARED_FILES_BLOCK="<files_to_read>
|
|
204
|
+
- .evolver.json
|
|
205
|
+
- strategy.md (if exists)
|
|
206
|
+
- evolution_memory.md (if exists)
|
|
207
|
+
- production_seed.json (if exists)
|
|
208
|
+
- {entry_point_file}
|
|
209
|
+
</files_to_read>"
|
|
210
|
+
|
|
211
|
+
SHARED_CONTEXT_BLOCK="<context>
|
|
212
|
+
Best experiment: {best_experiment} (score: {best_score})
|
|
213
|
+
Framework: {framework}
|
|
214
|
+
Entry point: {entry_point}
|
|
215
|
+
Evaluators: {evaluators}
|
|
216
|
+
Iteration: {iteration_number} of {total_iterations}
|
|
217
|
+
Score history: {score_history_summary}
|
|
218
|
+
</context>"
|
|
219
|
+
|
|
220
|
+
SHARED_OBJECTIVE="<objective>
|
|
221
|
+
Improve the agent code to score higher on the evaluation dataset.
|
|
222
|
+
You are working in an isolated git worktree — modify any file freely.
|
|
223
|
+
</objective>"
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
**CRITICAL for cache sharing**: The `<objective>`, `<files_to_read>`, and `<context>` blocks MUST be byte-identical across all 5 proposer prompts. Only the `<strategy>` block differs. Place the strategy block LAST in the prompt so the shared prefix is maximized.
|
|
227
|
+
|
|
120
228
|
### 2. Spawn 5 Proposers in Parallel
|
|
121
229
|
|
|
122
|
-
Each proposer
|
|
230
|
+
Each proposer receives the IDENTICAL prefix (objective + files + context) followed by its unique strategy suffix.
|
|
123
231
|
|
|
124
|
-
**
|
|
232
|
+
**All 5 candidates** — `run_in_background: true, isolation: "worktree"`:
|
|
125
233
|
|
|
234
|
+
The prompt for EACH proposer follows this structure:
|
|
126
235
|
```
|
|
127
|
-
|
|
128
|
-
subagent_type: "evolver-proposer",
|
|
129
|
-
description: "Proposer A: exploit best version",
|
|
130
|
-
isolation: "worktree",
|
|
131
|
-
run_in_background: true,
|
|
132
|
-
prompt: |
|
|
133
|
-
<objective>
|
|
134
|
-
Improve the agent code to score higher on the evaluation dataset.
|
|
135
|
-
You are working in an isolated git worktree — modify any file freely.
|
|
136
|
-
</objective>
|
|
236
|
+
{SHARED_OBJECTIVE}
|
|
137
237
|
|
|
138
|
-
|
|
139
|
-
APPROACH: exploitation
|
|
140
|
-
Make targeted improvements to the current best version.
|
|
141
|
-
Focus on the specific failures identified in the results.
|
|
142
|
-
</strategy>
|
|
238
|
+
{SHARED_FILES_BLOCK}
|
|
143
239
|
|
|
144
|
-
|
|
145
|
-
- .evolver.json
|
|
146
|
-
- trace_insights.json (if exists)
|
|
147
|
-
- production_seed.json (if exists)
|
|
148
|
-
- best_results.json (if exists)
|
|
149
|
-
- {entry point file from .evolver.json}
|
|
150
|
-
</files_to_read>
|
|
240
|
+
{SHARED_CONTEXT_BLOCK}
|
|
151
241
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
Entry point: {entry_point}
|
|
156
|
-
Evaluators: {evaluators}
|
|
157
|
-
Failing examples: {failing_example_summary}
|
|
158
|
-
</context>
|
|
242
|
+
<strategy>
|
|
243
|
+
{UNIQUE PER CANDIDATE — see below}
|
|
244
|
+
</strategy>
|
|
159
245
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
)
|
|
246
|
+
<output>
|
|
247
|
+
1. Modify the code to improve performance
|
|
248
|
+
2. Commit your changes with a descriptive message
|
|
249
|
+
3. Write proposal.md explaining what you changed and why
|
|
250
|
+
</output>
|
|
166
251
|
```
|
|
167
252
|
|
|
168
|
-
**Candidate
|
|
169
|
-
|
|
253
|
+
**Candidate A strategy block:**
|
|
254
|
+
```
|
|
255
|
+
APPROACH: exploitation
|
|
256
|
+
Make targeted improvements to the current best version.
|
|
257
|
+
Focus on the specific failures identified in the results.
|
|
258
|
+
```
|
|
170
259
|
|
|
171
|
-
**Candidate
|
|
172
|
-
|
|
173
|
-
|
|
260
|
+
**Candidate B strategy block:**
|
|
261
|
+
```
|
|
262
|
+
APPROACH: exploration
|
|
263
|
+
Try a fundamentally different approach. Change algorithms, prompts, routing, architecture.
|
|
264
|
+
Don't be afraid to make big changes — this worktree is disposable.
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
**Candidate C strategy block:**
|
|
268
|
+
```
|
|
269
|
+
APPROACH: crossover
|
|
270
|
+
Combine strengths from previous iterations. Check git log for what was tried.
|
|
271
|
+
Recent changes: {git_log_last_5}
|
|
272
|
+
```
|
|
174
273
|
|
|
175
|
-
**
|
|
176
|
-
|
|
177
|
-
|
|
274
|
+
**Candidate D strategy block:**
|
|
275
|
+
```
|
|
276
|
+
APPROACH: {failure_targeted_or_creative}
|
|
277
|
+
{adaptive_briefing_d}
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
**Candidate E strategy block:**
|
|
281
|
+
```
|
|
282
|
+
APPROACH: {failure_targeted_or_efficiency}
|
|
283
|
+
{adaptive_briefing_e}
|
|
284
|
+
```
|
|
178
285
|
|
|
179
286
|
Wait for all 5 to complete.
|
|
180
287
|
|
|
288
|
+
**Stuck proposer detection**: If any proposer hasn't completed after 10 minutes, it may be stuck in a loop. The Claude Code runtime handles this via the agent's turn limit. If a proposer returns without committing changes, skip it — don't retry.
|
|
289
|
+
|
|
290
|
+
After all proposers complete, check which ones actually committed:
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
for WORKTREE in {worktree_paths}; do
|
|
294
|
+
CHANGES=$(cd "$WORKTREE" && git log --oneline -1 --since="10 minutes ago" 2>/dev/null | wc -l)
|
|
295
|
+
if [ "$CHANGES" -eq 0 ]; then
|
|
296
|
+
echo "Proposer in $WORKTREE made no commits — skipping"
|
|
297
|
+
fi
|
|
298
|
+
done
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
Only run evaluation (Step 3) for proposers that committed changes.
|
|
302
|
+
|
|
181
303
|
### 3. Run Target for Each Candidate
|
|
182
304
|
|
|
183
305
|
For each worktree that has changes (proposer committed something):
|
|
@@ -298,81 +420,176 @@ Iteration {i}/{N} — 5 candidates evaluated:
|
|
|
298
420
|
Per-task champion: {champion} (beats winner on {N} tasks)
|
|
299
421
|
```
|
|
300
422
|
|
|
301
|
-
### 5.5. Test Suite Growth
|
|
423
|
+
### 5.5. Regression Tracking & Test Suite Growth
|
|
302
424
|
|
|
303
|
-
If
|
|
425
|
+
If this is not the first iteration (previous experiment exists), track regressions and auto-add guards:
|
|
304
426
|
|
|
305
427
|
```bash
|
|
306
|
-
python3 -c "
|
|
307
|
-
from langsmith import Client
|
|
428
|
+
PREV_EXP=$(python3 -c "
|
|
308
429
|
import json
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
430
|
+
h = json.load(open('.evolver.json')).get('history', [])
|
|
431
|
+
print(h[-2]['experiment'] if len(h) >= 2 else '')
|
|
432
|
+
")
|
|
433
|
+
if [ -n "$PREV_EXP" ]; then
|
|
434
|
+
$EVOLVER_PY $TOOLS/regression_tracker.py \
|
|
435
|
+
--config .evolver.json \
|
|
436
|
+
--previous-experiment "$PREV_EXP" \
|
|
437
|
+
--current-experiment "{winner_experiment}" \
|
|
438
|
+
--add-guards --max-guards 5 \
|
|
439
|
+
--output regression_report.json 2>/dev/null
|
|
440
|
+
|
|
441
|
+
# Report regressions
|
|
442
|
+
python3 -c "
|
|
443
|
+
import json, os
|
|
444
|
+
if os.path.exists('regression_report.json'):
|
|
445
|
+
r = json.load(open('regression_report.json'))
|
|
446
|
+
if r['regression_count'] > 0:
|
|
447
|
+
print(f'WARNING: {r[\"regression_count\"]} regressions detected')
|
|
448
|
+
if r['guards_added'] > 0:
|
|
449
|
+
print(f' Added {r[\"guards_added\"]} regression guard examples to dataset')
|
|
450
|
+
if r['fixed_count'] > 0:
|
|
451
|
+
print(f' {r[\"fixed_count\"]} previously-failing examples now pass')
|
|
318
452
|
" 2>/dev/null
|
|
453
|
+
fi
|
|
319
454
|
```
|
|
320
455
|
|
|
321
456
|
### 6. Report
|
|
322
457
|
|
|
323
458
|
Print: `Iteration {i}/{N}: v{NNN} scored {score} (best: {best} at {best_score})`
|
|
324
459
|
|
|
325
|
-
### 6.
|
|
460
|
+
### 6.2. Consolidate Evolution Memory
|
|
326
461
|
|
|
327
|
-
|
|
462
|
+
Spawn the consolidator agent to analyze the iteration and update cross-iteration memory:
|
|
463
|
+
|
|
464
|
+
```
|
|
465
|
+
Agent(
|
|
466
|
+
subagent_type: "evolver-consolidator",
|
|
467
|
+
description: "Consolidate evolution memory after iteration v{NNN}",
|
|
468
|
+
run_in_background: true,
|
|
469
|
+
prompt: |
|
|
470
|
+
<objective>
|
|
471
|
+
Consolidate learnings from iteration v{NNN}.
|
|
472
|
+
Run the consolidation tool and review its output.
|
|
473
|
+
</objective>
|
|
474
|
+
|
|
475
|
+
<tools_path>
|
|
476
|
+
TOOLS={tools_path}
|
|
477
|
+
EVOLVER_PY={evolver_py_path}
|
|
478
|
+
</tools_path>
|
|
328
479
|
|
|
329
|
-
|
|
480
|
+
<instructions>
|
|
481
|
+
Run: $EVOLVER_PY $TOOLS/consolidate.py \
|
|
482
|
+
--config .evolver.json \
|
|
483
|
+
--comparison-files comparison.json \
|
|
484
|
+
--output evolution_memory.md \
|
|
485
|
+
--output-json evolution_memory.json
|
|
486
|
+
|
|
487
|
+
Then read the output and verify insights are accurate.
|
|
488
|
+
</instructions>
|
|
489
|
+
|
|
490
|
+
<files_to_read>
|
|
491
|
+
- .evolver.json
|
|
492
|
+
- comparison.json
|
|
493
|
+
- trace_insights.json (if exists)
|
|
494
|
+
- regression_report.json (if exists)
|
|
495
|
+
- evolution_memory.md (if exists)
|
|
496
|
+
</files_to_read>
|
|
497
|
+
)
|
|
498
|
+
```
|
|
499
|
+
|
|
500
|
+
The `evolution_memory.md` file will be included in proposer briefings for subsequent iterations.
|
|
501
|
+
|
|
502
|
+
### 6.5. Auto-trigger Active Critic
|
|
503
|
+
|
|
504
|
+
If score jumped >0.3 from previous iteration OR reached target in <3 iterations:
|
|
330
505
|
|
|
331
506
|
```
|
|
332
507
|
Agent(
|
|
333
508
|
subagent_type: "evolver-critic",
|
|
334
|
-
description: "Critic:
|
|
509
|
+
description: "Active Critic: detect and fix evaluator gaming",
|
|
335
510
|
prompt: |
|
|
336
511
|
<objective>
|
|
337
|
-
EVAL GAMING
|
|
512
|
+
EVAL GAMING CHECK: Score jumped from {prev_score} to {score}.
|
|
338
513
|
Check if the LangSmith evaluators are being gamed.
|
|
514
|
+
If gaming detected, add stricter evaluators using $TOOLS/add_evaluator.py.
|
|
339
515
|
</objective>
|
|
340
516
|
|
|
517
|
+
<tools_path>
|
|
518
|
+
TOOLS={tools_path}
|
|
519
|
+
EVOLVER_PY={evolver_py_path}
|
|
520
|
+
</tools_path>
|
|
521
|
+
|
|
341
522
|
<files_to_read>
|
|
342
523
|
- .evolver.json
|
|
343
524
|
- comparison.json
|
|
344
525
|
- trace_insights.json
|
|
526
|
+
- evolution_memory.md (if exists)
|
|
345
527
|
</files_to_read>
|
|
346
528
|
)
|
|
347
529
|
```
|
|
348
530
|
|
|
349
|
-
|
|
531
|
+
If the critic added new evaluators, log it:
|
|
532
|
+
```
|
|
533
|
+
Critic added evaluators: {new_evaluators}. Next iteration will use stricter evaluation.
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
### 7. Auto-trigger Architect (ULTRAPLAN Mode)
|
|
350
537
|
|
|
351
538
|
If 3 consecutive iterations within 1% OR score dropped:
|
|
352
539
|
|
|
353
540
|
```
|
|
354
541
|
Agent(
|
|
355
542
|
subagent_type: "evolver-architect",
|
|
356
|
-
|
|
543
|
+
model: "opus",
|
|
544
|
+
description: "Architect ULTRAPLAN: deep topology analysis",
|
|
357
545
|
prompt: |
|
|
358
546
|
<objective>
|
|
359
547
|
The evolution loop has stagnated after {iterations} iterations.
|
|
360
|
-
|
|
548
|
+
Scores: {last_3_scores}.
|
|
549
|
+
Perform deep architectural analysis and recommend structural changes.
|
|
550
|
+
Use extended thinking — you have more compute budget than normal agents.
|
|
361
551
|
</objective>
|
|
362
552
|
|
|
553
|
+
<tools_path>
|
|
554
|
+
TOOLS={tools_path}
|
|
555
|
+
EVOLVER_PY={evolver_py_path}
|
|
556
|
+
</tools_path>
|
|
557
|
+
|
|
363
558
|
<files_to_read>
|
|
364
559
|
- .evolver.json
|
|
365
560
|
- trace_insights.json
|
|
366
|
-
-
|
|
561
|
+
- evolution_memory.md (if exists)
|
|
562
|
+
- evolution_memory.json (if exists)
|
|
563
|
+
- strategy.md (if exists)
|
|
564
|
+
- {entry point and all related source files}
|
|
367
565
|
</files_to_read>
|
|
368
566
|
)
|
|
369
567
|
```
|
|
370
568
|
|
|
371
|
-
|
|
569
|
+
After architect completes, include `architecture.md` in proposer `<files_to_read>` for next iteration.
|
|
570
|
+
|
|
571
|
+
### 8. Gate Check (Three-Gate Trigger)
|
|
572
|
+
|
|
573
|
+
Before starting the next iteration, run the gate check:
|
|
574
|
+
|
|
575
|
+
```bash
|
|
576
|
+
GATE_RESULT=$($EVOLVER_PY $TOOLS/iteration_gate.py --config .evolver.json 2>/dev/null)
|
|
577
|
+
PROCEED=$(echo "$GATE_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('proceed', True))")
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
If `PROCEED` is `False`, check suggestions:
|
|
581
|
+
|
|
582
|
+
```bash
|
|
583
|
+
SUGGEST=$(echo "$GATE_RESULT" | python3 -c "import sys,json; s=json.load(sys.stdin).get('suggestions',[]); print(s[0] if s else '')")
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
- If `$SUGGEST` is `architect`: auto-trigger architect agent (Step 7)
|
|
587
|
+
- If `$SUGGEST` is `continue_cautious`: ask user via AskUserQuestion whether to continue
|
|
588
|
+
- Otherwise: stop the loop and report final results
|
|
372
589
|
|
|
590
|
+
Legacy stop conditions still apply:
|
|
373
591
|
- **Target**: `score >= target_score` → stop
|
|
374
|
-
- **N reached**: done
|
|
375
|
-
- **Stagnation post-architect**: 3 more iterations without improvement → stop
|
|
592
|
+
- **N reached**: all requested iterations done → stop
|
|
376
593
|
|
|
377
594
|
## When Loop Ends — Final Report
|
|
378
595
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Add a new evaluator to .evolver.json configuration.
|
|
3
|
+
|
|
4
|
+
Used by the active critic to programmatically strengthen evaluation
|
|
5
|
+
when gaming is detected.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 add_evaluator.py --config .evolver.json --evaluator factual_accuracy --type llm
|
|
9
|
+
python3 add_evaluator.py --config .evolver.json --evaluator regex_check --type code --pattern "\\d{4}"
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Built-in "code" evaluator templates the active critic can enable by name
# (add_evaluator.py --type code --evaluator <name>) when evaluator gaming is
# detected. Each entry pairs a human-readable description with a "check"
# expression string. The checks reference `output` (and `input_text`) as free
# variables — presumably the evaluation runner evaluates them with those names
# in scope; TODO confirm against the consumer of "code_evaluators".
CODE_EVALUATOR_TEMPLATES = {
    "no_fabricated_references": {
        "description": "Check output doesn't contain fabricated citation patterns",
        "check": "not any(p in output for p in ['http://fake', 'doi.org/10.xxxx', 'ISBN 000'])",
    },
    "answer_not_question": {
        "description": "Check output doesn't just repeat the input question",
        "check": "output.strip().lower() != input_text.strip().lower()",
    },
    "min_length": {
        "description": "Check output meets minimum length",
        "check": "len(output.strip()) >= 20",
    },
    "no_repetition": {
        # Unique-to-total word ratio; > 0.3 means not pathologically repetitive.
        "description": "Check output doesn't have excessive repetition",
        "check": "len(set(output.split())) / max(len(output.split()), 1) > 0.3",
    },
    "no_empty_filler": {
        "description": "Check output isn't padded with filler phrases to appear longer",
        "check": "output.count('In conclusion') + output.count('As mentioned') + output.count('It is important to note') < 3",
    },
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def add_evaluator(config_path, evaluator_name, eval_type, pattern=None):
    """Register *evaluator_name* in the JSON config file at *config_path*.

    Appends the name to the config's "evaluators" list. For code evaluators
    it also records an implementation under "code_evaluators": a regex entry
    built from *pattern* when one is supplied, otherwise the canned template
    from CODE_EVALUATOR_TEMPLATES when the name matches one.

    Args:
        config_path: Path to the .evolver.json config file.
        evaluator_name: Name of the evaluator to register.
        eval_type: "llm" or "code".
        pattern: Optional regex pattern for a "code" evaluator.

    Returns:
        True if the evaluator was added; False if it was already present
        (in which case the config file is left untouched).
    """
    with open(config_path) as f:
        config = json.load(f)

    evaluators = config.get("evaluators", [])

    # Idempotence guard: never register the same evaluator twice.
    if evaluator_name in evaluators:
        print(f"Evaluator '{evaluator_name}' already exists", file=sys.stderr)
        return False

    evaluators.append(evaluator_name)
    config["evaluators"] = evaluators

    if eval_type == "code" and pattern:
        # An explicit regex from the command line wins over any template.
        code_evals = config.get("code_evaluators", {})
        code_evals[evaluator_name] = {"pattern": pattern, "type": "regex"}
        config["code_evaluators"] = code_evals
    elif eval_type == "code" and evaluator_name in CODE_EVALUATOR_TEMPLATES:
        code_evals = config.get("code_evaluators", {})
        code_evals[evaluator_name] = CODE_EVALUATOR_TEMPLATES[evaluator_name]
        config["code_evaluators"] = code_evals
    elif eval_type == "code":
        # Name was registered but there is no implementation to attach —
        # warn so the critic notices instead of silently running a no-op.
        print(
            f"WARNING: code evaluator '{evaluator_name}' has no --pattern and "
            "matches no template; registered without an implementation",
            file=sys.stderr,
        )

    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)

    return True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def main():
    """CLI entry point: add or remove an evaluator in .evolver.json.

    Prints a JSON summary to stdout on a successful add; failure details
    (duplicate name, unknown evaluator on --remove) go to stderr.
    """
    parser = argparse.ArgumentParser(description="Add evaluator to .evolver.json")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--evaluator", required=True, help="Evaluator name")
    parser.add_argument("--type", choices=["llm", "code"], default="llm", help="Evaluator type")
    parser.add_argument("--pattern", default=None, help="Regex pattern for code evaluators")
    parser.add_argument("--remove", action="store_true", help="Remove evaluator instead of adding")
    args = parser.parse_args()

    if args.remove:
        _remove_evaluator(args.config, args.evaluator)
        return

    added = add_evaluator(args.config, args.evaluator, args.type, args.pattern)
    if added:
        # Re-read the config so the reported list reflects exactly what was
        # written to disk; `with` avoids the leaked file handle of
        # `json.load(open(...))`.
        with open(args.config) as f:
            current_evaluators = json.load(f)["evaluators"]
        print(json.dumps({
            "added": args.evaluator,
            "type": args.type,
            "evaluators": current_evaluators,
        }, indent=2))


def _remove_evaluator(config_path, evaluator_name):
    """Drop *evaluator_name* from the config's "evaluators" list, if present.

    Note: any matching "code_evaluators" entry is intentionally left in place,
    mirroring the original behavior — only the active list is edited.
    """
    with open(config_path) as f:
        config = json.load(f)
    evaluators = config.get("evaluators", [])
    if evaluator_name not in evaluators:
        print(f"Evaluator '{evaluator_name}' not found", file=sys.stderr)
        return
    evaluators.remove(evaluator_name)
    config["evaluators"] = evaluators
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
    print(f"Removed evaluator: {evaluator_name}")


if __name__ == "__main__":
    main()