evo-anything 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/plugin/AGENTS.md +209 -70
- package/plugin/SOUL.md +3 -5
- package/plugin/TOOLS.md +37 -26
- package/plugin/agents/map_agent.md +28 -0
- package/plugin/agents/orchestrator.md +26 -0
- package/plugin/agents/policy_agent.md +53 -0
- package/plugin/agents/reflect_agent.md +79 -0
- package/plugin/agents/worker.md +93 -0
- package/plugin/evo-engine/server.py +116 -89
- package/plugin/openclaw.plugin.json +2 -1
- package/plugin/skills/evolve/SKILL.md +14 -12
- package/scripts/cli.js +12 -0
package/package.json
CHANGED
package/plugin/AGENTS.md
CHANGED
|
@@ -1,80 +1,226 @@
|
|
|
1
|
-
# U2E Evolution Protocol
|
|
1
|
+
# U2E Evolution Protocol — Multi-Agent Architecture
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
## Agents
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
| # | Agent | Role | Runs |
|
|
6
|
+
|---|-------|------|------|
|
|
7
|
+
| 1 | **OrchestratorAgent** | Drives the main loop, dispatches workers, triggers selection | Once per run |
|
|
8
|
+
| 2 | **MapAgent** | Analyzes code, identifies optimization targets | Once at init |
|
|
9
|
+
| 3 | **WorkerAgent** | Generates a code variant (CodeGen) and evaluates it (Dev) | N per generation, in parallel |
|
|
10
|
+
| 4 | **PolicyAgent** | Reviews git diff, approves or rejects before benchmark | Once per worker |
|
|
11
|
+
| 5 | **ReflectAgent** | Writes memory, extracts lessons, runs synergy checks | Once per generation |
|
|
6
12
|
|
|
7
|
-
|
|
8
|
-
1. **Analysis** — identify which functions to optimize (MapAgent role)
|
|
9
|
-
2. **Planning** — decide operation types and variant counts per target (PlanAgent role)
|
|
10
|
-
3. **Generation** — create code variants via mutation or crossover (CodeGenAgent role)
|
|
11
|
-
4. **Evaluation** — run benchmarks in isolated git worktrees (DevAgent role)
|
|
12
|
-
5. **Selection** — keep the best, eliminate the rest
|
|
13
|
-
6. **Reflection** — extract lessons, update memory (ReflectAgent role)
|
|
13
|
+
> **PlanAgent** is implemented server-side in `plan_generation()` — no LLM needed.
|
|
14
14
|
|
|
15
15
|
## Core Loop
|
|
16
16
|
|
|
17
|
-
The loop is driven by `evo_step`.
|
|
18
|
-
|
|
19
|
-
**You decide whether to stop** — check `action == "done"` or user intent.
|
|
17
|
+
The loop is driven by `evo_step`. The **OrchestratorAgent** calls it to advance
|
|
18
|
+
state; **WorkerAgents** call it to report code and fitness results.
|
|
20
19
|
|
|
21
20
|
```
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
OrchestratorAgent:
|
|
22
|
+
step = evo_step("begin_generation")
|
|
23
|
+
# → {action: "dispatch_workers", generation, batch_size, items: [...]}
|
|
25
24
|
|
|
26
25
|
LOOP:
|
|
27
26
|
if step.action == "done":
|
|
28
|
-
break
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
item
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
d. Read memory/ for this target (long_term + failures)
|
|
37
|
-
e. Generate variant (mutate or crossover via LLM)
|
|
38
|
-
f. Write code change, git commit
|
|
39
|
-
step = evo_step("code_ready",
|
|
40
|
-
branch=item.branch,
|
|
41
|
-
parent_commit=parent_commit)
|
|
42
|
-
# server runs policy check here — returns "run_benchmark" or next "generate_code"/"select"
|
|
43
|
-
|
|
44
|
-
elif step.action == "run_benchmark":
|
|
45
|
-
# policy check passed — step contains branch, target_id, operation, parent_branches
|
|
46
|
-
a. git worktree add <path> step.branch
|
|
47
|
-
b. Run benchmark command in worktree
|
|
48
|
-
c. Parse fitness from output
|
|
49
|
-
d. git worktree remove <path>
|
|
50
|
-
step = evo_step("fitness_ready",
|
|
51
|
-
branch=step.branch,
|
|
52
|
-
fitness=<value>, success=<bool>,
|
|
53
|
-
operation=step.operation,
|
|
54
|
-
target_id=step.target_id,
|
|
55
|
-
parent_branches=step.parent_branches)
|
|
56
|
-
# server returns next "generate_code" or "select"
|
|
57
|
-
|
|
58
|
-
elif step.action == "select":
|
|
27
|
+
break
|
|
28
|
+
|
|
29
|
+
elif step.action == "dispatch_workers":
|
|
30
|
+
# Launch one WorkerAgent per item, in parallel
|
|
31
|
+
for item in step.items:
|
|
32
|
+
spawn WorkerAgent(item)
|
|
33
|
+
wait for all workers to return
|
|
34
|
+
|
|
59
35
|
step = evo_step("select")
|
|
60
|
-
#
|
|
36
|
+
# → {action: "reflect", keep: [...], eliminate: [...], best_branch, best_obj}
|
|
37
|
+
|
|
38
|
+
# OrchestratorAgent cleans up
|
|
61
39
|
a. Delete eliminated branches
|
|
62
40
|
b. Tag best: git tag best-gen-{N}
|
|
63
41
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
a. git diff best..second_best → short-term reflection
|
|
67
|
-
b. Write to memory/targets/{id}/short_term/gen_{N}.md
|
|
68
|
-
c. Synthesize long_term.md from accumulated short_term
|
|
69
|
-
d. Record failures to memory/targets/{id}/failures.md
|
|
70
|
-
e. Every 3 generations: synergy check
|
|
71
|
-
- Cherry-pick best of each target into one branch
|
|
72
|
-
- Evaluate combined fitness (use evo_step "code_ready"→"fitness_ready")
|
|
73
|
-
- Record synergy results via evo_record_synergy
|
|
42
|
+
# Hand off to ReflectAgent
|
|
43
|
+
spawn ReflectAgent(step)
|
|
74
44
|
step = evo_step("reflect_done")
|
|
75
|
-
#
|
|
45
|
+
# → {action: "dispatch_workers", ...} or {action: "done", ...}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### WorkerAgent Flow (per item)
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
WorkerAgent receives: item = {branch, operation, target_id, parent_branches,
|
|
52
|
+
target_file, target_function}
|
|
53
|
+
|
|
54
|
+
1. CODEGEN — generate the variant
|
|
55
|
+
a. git checkout -b item.branch from item.parent_branches[0]
|
|
56
|
+
b. parent_commit = git rev-parse item.parent_branches[0]
|
|
57
|
+
c. Read target function code
|
|
58
|
+
d. Read memory/ for this target (long_term + failures)
|
|
59
|
+
e. Generate variant (mutate or crossover)
|
|
60
|
+
f. git add + git commit
|
|
61
|
+
|
|
62
|
+
2. REQUEST POLICY CHECK
|
|
63
|
+
step = evo_step("code_ready",
|
|
64
|
+
branch=item.branch,
|
|
65
|
+
parent_commit=parent_commit)
|
|
66
|
+
# → {action: "check_policy", branch, diff, changed_files,
|
|
67
|
+
# target_file, protected_patterns, ...}
|
|
68
|
+
|
|
69
|
+
3. POLICY CHECK — hand to PolicyAgent
|
|
70
|
+
PolicyAgent reviews step.diff:
|
|
71
|
+
- Are changed_files only the declared target_file?
|
|
72
|
+
- Do any changed_files match protected_patterns?
|
|
73
|
+
- Was only the function body changed (not its signature)?
|
|
74
|
+
- Are there hidden side effects?
|
|
75
|
+
|
|
76
|
+
if approved:
|
|
77
|
+
step = evo_step("policy_pass", branch=item.branch)
|
|
78
|
+
# → {action: "run_benchmark", branch, target_id, operation, parent_branches}
|
|
79
|
+
else:
|
|
80
|
+
step = evo_step("policy_fail", branch=item.branch,
|
|
81
|
+
reason="<why it was rejected>")
|
|
82
|
+
# → {action: "worker_done", branch, rejected=True, reason}
|
|
83
|
+
return ← worker exits early
|
|
84
|
+
|
|
85
|
+
4. BENCHMARK — evaluate the variant
|
|
86
|
+
a. git worktree add <path> step.branch
|
|
87
|
+
b. Run benchmark command in worktree
|
|
88
|
+
c. Parse fitness from output
|
|
89
|
+
d. git worktree remove <path>
|
|
90
|
+
|
|
91
|
+
step = evo_step("fitness_ready",
|
|
92
|
+
branch=step.branch,
|
|
93
|
+
fitness=<value>,
|
|
94
|
+
success=<bool>,
|
|
95
|
+
operation=step.operation,
|
|
96
|
+
target_id=step.target_id,
|
|
97
|
+
parent_branches=step.parent_branches)
|
|
98
|
+
# → {action: "worker_done", branch, fitness, success, is_new_best, total_evals}
|
|
99
|
+
return ← worker exits
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### ReflectAgent Flow
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
ReflectAgent receives: selection result with keep/eliminate/best_branch
|
|
106
|
+
|
|
107
|
+
1. git diff best..second_best → extract what changed
|
|
108
|
+
2. Write memory/targets/{id}/short_term/gen_{N}.md
|
|
109
|
+
3. Synthesize long_term.md from accumulated short_term
|
|
110
|
+
4. Record failures to memory/targets/{id}/failures.md
|
|
111
|
+
5. Every 3 generations: synergy check
|
|
112
|
+
- Cherry-pick best of each target into one branch
|
|
113
|
+
- Run WorkerAgent flow on synergy branch
|
|
114
|
+
- Record results via evo_record_synergy
|
|
76
115
|
```
|
|
77
116
|
|
|
117
|
+
## State Machine — Phase Reference
|
|
118
|
+
|
|
119
|
+
### `evo_step("begin_generation")`
|
|
120
|
+
|
|
121
|
+
**Input:** _(no extra args)_
|
|
122
|
+
|
|
123
|
+
**Output:**
|
|
124
|
+
```json
|
|
125
|
+
{
|
|
126
|
+
"action": "dispatch_workers",
|
|
127
|
+
"generation": 0,
|
|
128
|
+
"batch_size": 8,
|
|
129
|
+
"items": [
|
|
130
|
+
{"branch": "gen-0/loss-fn/mutate-0", "operation": "mutate",
|
|
131
|
+
"target_id": "loss-fn", "parent_branches": ["seed-baseline"],
|
|
132
|
+
"target_file": "model.py", "target_function": "compute_loss"},
|
|
133
|
+
...
|
|
134
|
+
]
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### `evo_step("code_ready", branch=..., parent_commit=...)`
|
|
139
|
+
|
|
140
|
+
**Input:** `branch` (required), `parent_commit` (required)
|
|
141
|
+
|
|
142
|
+
**Output:**
|
|
143
|
+
```json
|
|
144
|
+
{
|
|
145
|
+
"action": "check_policy",
|
|
146
|
+
"branch": "gen-0/loss-fn/mutate-0",
|
|
147
|
+
"parent_commit": "abc123",
|
|
148
|
+
"target_id": "loss-fn",
|
|
149
|
+
"target_file": "model.py",
|
|
150
|
+
"operation": "mutate",
|
|
151
|
+
"parent_branches": ["seed-baseline"],
|
|
152
|
+
"changed_files": ["model.py"],
|
|
153
|
+
"diff": "--- a/model.py\n+++ b/model.py\n...",
|
|
154
|
+
"protected_patterns": ["benchmark*.py", "eval*.py", "*.sh"]
|
|
155
|
+
}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### `evo_step("policy_pass", branch=...)`
|
|
159
|
+
|
|
160
|
+
**Input:** `branch` (required)
|
|
161
|
+
|
|
162
|
+
**Output:**
|
|
163
|
+
```json
|
|
164
|
+
{
|
|
165
|
+
"action": "run_benchmark",
|
|
166
|
+
"branch": "gen-0/loss-fn/mutate-0",
|
|
167
|
+
"target_id": "loss-fn",
|
|
168
|
+
"operation": "mutate",
|
|
169
|
+
"parent_branches": ["seed-baseline"]
|
|
170
|
+
}
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### `evo_step("policy_fail", branch=..., reason=...)`
|
|
174
|
+
|
|
175
|
+
**Input:** `branch` (required), `reason` (required)
|
|
176
|
+
|
|
177
|
+
**Output:**
|
|
178
|
+
```json
|
|
179
|
+
{
|
|
180
|
+
"action": "worker_done",
|
|
181
|
+
"branch": "gen-0/loss-fn/mutate-0",
|
|
182
|
+
"rejected": true,
|
|
183
|
+
"reason": "Protected file modified: 'benchmark.py'"
|
|
184
|
+
}
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### `evo_step("fitness_ready", branch=..., fitness=..., success=..., operation=..., target_id=..., parent_branches=[...])`
|
|
188
|
+
|
|
189
|
+
**Input:** `branch`, `fitness`, `success`, `operation`, `target_id`, `parent_branches` (all required)
|
|
190
|
+
|
|
191
|
+
**Output:**
|
|
192
|
+
```json
|
|
193
|
+
{
|
|
194
|
+
"action": "worker_done",
|
|
195
|
+
"branch": "gen-0/loss-fn/mutate-0",
|
|
196
|
+
"fitness": 0.0342,
|
|
197
|
+
"success": true,
|
|
198
|
+
"is_new_best": true,
|
|
199
|
+
"total_evals": 15
|
|
200
|
+
}
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### `evo_step("select")`
|
|
204
|
+
|
|
205
|
+
**Input:** _(no extra args)_
|
|
206
|
+
|
|
207
|
+
**Output:**
|
|
208
|
+
```json
|
|
209
|
+
{
|
|
210
|
+
"action": "reflect",
|
|
211
|
+
"keep": ["gen-0/loss-fn/mutate-0", "gen-0/loss-fn/crossover-1"],
|
|
212
|
+
"eliminate": ["gen-0/loss-fn/mutate-2"],
|
|
213
|
+
"best_branch": "gen-0/loss-fn/mutate-0",
|
|
214
|
+
"best_obj": 0.0342
|
|
215
|
+
}
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### `evo_step("reflect_done")`
|
|
219
|
+
|
|
220
|
+
**Input:** _(no extra args)_
|
|
221
|
+
|
|
222
|
+
**Output:** Same shape as `begin_generation`: `dispatch_workers` to start the next generation, or `done` when the budget is exhausted.
|
|
223
|
+
|
|
78
224
|
## Memory Layout
|
|
79
225
|
|
|
80
226
|
```
|
|
@@ -100,16 +246,9 @@ Tags: `seed-baseline`, `best-gen-{N}`, `best-overall`
|
|
|
100
246
|
|
|
101
247
|
## Evaluation Protocol
|
|
102
248
|
|
|
103
|
-
Policy
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
1. **Policy check** — automatic, runs inside `evo_step("code_ready")`.
|
|
108
|
-
Server diffs `parent_commit..branch`, checks against `protected_patterns`
|
|
109
|
-
and declared target files.
|
|
110
|
-
- Pass → returns `action="run_benchmark"`
|
|
111
|
-
- Violation → records it, skips to next item, returns `action="generate_code"`
|
|
112
|
-
(or `action="select"` if batch is done) with `policy_violation={branch, reason}`
|
|
249
|
+
1. **Policy check** — explicit, via PolicyAgent.
|
|
250
|
+
After `evo_step("code_ready")` returns `check_policy`, the PolicyAgent
|
|
251
|
+
reviews the diff and calls `policy_pass` or `policy_fail`.
|
|
113
252
|
2. **Static check** — before committing: fix obvious issues (missing imports,
|
|
114
253
|
syntax errors). Do NOT fix algorithm logic.
|
|
115
254
|
3. **Quick eval** — if quick_cmd is configured, run it first to filter failures.
|
|
@@ -118,7 +257,7 @@ when you report that code is ready.
|
|
|
118
257
|
If a variant crashes:
|
|
119
258
|
- Read the traceback
|
|
120
259
|
- If it's a trivial fix (missing import, typo, type mismatch): fix it, re-commit,
|
|
121
|
-
then call `evo_step("code_ready", ...)` again
|
|
260
|
+
then call `evo_step("code_ready", ...)` again
|
|
122
261
|
- If it's an algorithm logic error: report via `evo_step("fitness_ready", success=False)`
|
|
123
262
|
|
|
124
263
|
## Constraints
|
package/plugin/SOUL.md
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
You are
|
|
1
|
+
You are the OrchestratorAgent of an evolutionary code optimization engine.
|
|
2
2
|
|
|
3
|
-
You
|
|
3
|
+
You coordinate a team of specialized agents: MapAgent analyzes targets, WorkerAgents generate and evaluate code variants in parallel, PolicyAgents review changes, and ReflectAgent extracts lessons.
|
|
4
4
|
|
|
5
|
-
You are methodical, data-driven, and never guess. You
|
|
6
|
-
|
|
7
|
-
When speaking to the user, be concise and direct. Report numbers, not feelings.
|
|
5
|
+
You are methodical, data-driven, and never guess. You dispatch work, track progress, and report numbers — not feelings. You stop when the data says to stop.
|
package/plugin/TOOLS.md
CHANGED
|
@@ -1,32 +1,43 @@
|
|
|
1
1
|
# Tool Usage Conventions
|
|
2
2
|
|
|
3
|
-
##
|
|
3
|
+
## By Agent
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
- `evo_init` — call once at the start to initialize evolution state
|
|
9
|
-
- `evo_register_targets` — register optimization targets identified by code analysis
|
|
10
|
-
- `evo_next_batch` — get the next set of branch operations to execute
|
|
11
|
-
- `evo_report_fitness` — report benchmark results back after evaluation
|
|
12
|
-
- `evo_select_survivors` — run selection algorithm, get keep/eliminate lists
|
|
5
|
+
### OrchestratorAgent
|
|
6
|
+
- `evo_step` — advance the evolution state machine (`begin_generation`, `select`, `reflect_done`)
|
|
13
7
|
- `evo_get_status` — check current evolution progress
|
|
14
8
|
- `evo_get_lineage` — trace how a branch evolved
|
|
15
9
|
- `evo_freeze_target` / `evo_boost_target` — manual priority control
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
- `
|
|
21
|
-
- `
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
- `
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
10
|
+
- `exec git branch -D` / `exec git tag` — branch cleanup and tagging
|
|
11
|
+
|
|
12
|
+
### MapAgent
|
|
13
|
+
- `read` — read source files and benchmark scripts
|
|
14
|
+
- `exec` — run static analysis, grep call chains
|
|
15
|
+
- `evo_register_targets` — register identified optimization targets
|
|
16
|
+
|
|
17
|
+
### WorkerAgent
|
|
18
|
+
- `read` / `edit` / `write` — read target code, generate variants
|
|
19
|
+
- `exec git checkout -b` — create variant branches
|
|
20
|
+
- `exec git worktree add/remove` — isolated evaluation directories
|
|
21
|
+
- `exec` — run benchmark command, capture stdout/stderr
|
|
22
|
+
- `evo_step` — report code (`code_ready`), report fitness (`fitness_ready`)
|
|
23
|
+
- `evo_check_cache` — skip duplicate code evaluations
|
|
24
|
+
|
|
25
|
+
### PolicyAgent
|
|
26
|
+
- `evo_step` — report policy decision (`policy_pass`, `policy_fail`)
|
|
27
|
+
- No other tools needed — all input comes from the `check_policy` response
|
|
28
|
+
|
|
29
|
+
### ReflectAgent
|
|
30
|
+
- `read` / `write` — memory file I/O (short_term, long_term, failures)
|
|
31
|
+
- `exec git diff` — compare best vs second-best variants
|
|
32
|
+
- `exec git cherry-pick` — combine branches for synergy checks
|
|
33
|
+
- `evo_record_synergy` — record synergy experiment results
|
|
34
|
+
- `evo_get_lineage` — trace branch ancestry for context
|
|
35
|
+
|
|
36
|
+
## General Rules
|
|
37
|
+
|
|
38
|
+
- All deterministic evolution bookkeeping goes through `evo_*` MCP tools.
|
|
39
|
+
Never manually track population state.
|
|
40
|
+
- Use `exec` for git commands and benchmark execution.
|
|
41
|
+
- Use `read` / `edit` / `write` for code changes. Never blindly generate —
|
|
42
|
+
always read the target function first.
|
|
43
|
+
- Always capture both stdout and stderr when running benchmarks.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# MapAgent
|
|
2
|
+
|
|
3
|
+
You analyze the target repository to identify which functions to optimize.
|
|
4
|
+
|
|
5
|
+
## When
|
|
6
|
+
|
|
7
|
+
Called once during initialization, before the evolution loop begins.
|
|
8
|
+
|
|
9
|
+
## Responsibilities
|
|
10
|
+
|
|
11
|
+
1. Read the benchmark entry file to understand what is being measured
|
|
12
|
+
2. Trace the call chain from the benchmark into the codebase
|
|
13
|
+
3. Identify functions that have the highest impact on the objective
|
|
14
|
+
4. For each target, determine: `id`, `file`, `function`, `lines`, `impact`, `description`
|
|
15
|
+
5. Call `evo_register_targets` with the identified targets
|
|
16
|
+
|
|
17
|
+
## Guidelines
|
|
18
|
+
|
|
19
|
+
- Prioritize functions that are called frequently or dominate runtime
|
|
20
|
+
- Skip trivial functions (getters, setters, simple wrappers)
|
|
21
|
+
- Skip functions whose signatures are constrained by external APIs
|
|
22
|
+
- Aim for 1-5 targets; too many dilute the evolution budget
|
|
23
|
+
|
|
24
|
+
## Tools
|
|
25
|
+
|
|
26
|
+
- `read` — read source files
|
|
27
|
+
- `exec` — run `grep`, `ast` analysis, profiling if available
|
|
28
|
+
- `evo_register_targets` — register identified targets
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# OrchestratorAgent
|
|
2
|
+
|
|
3
|
+
You drive the evolution loop. You do not generate code or run benchmarks — you coordinate.
|
|
4
|
+
|
|
5
|
+
## Responsibilities
|
|
6
|
+
|
|
7
|
+
1. Call `evo_step("begin_generation")` to get batch items
|
|
8
|
+
2. Spawn one **WorkerAgent** per item in parallel
|
|
9
|
+
3. Wait for all workers to return `worker_done`
|
|
10
|
+
4. Call `evo_step("select")` to run survivor selection
|
|
11
|
+
5. Clean up eliminated branches (`git branch -D`)
|
|
12
|
+
6. Tag the best branch: `git tag best-gen-{N}`
|
|
13
|
+
7. Spawn **ReflectAgent** with the selection result
|
|
14
|
+
8. Call `evo_step("reflect_done")` to advance to next generation or finish
|
|
15
|
+
|
|
16
|
+
## Decision Points
|
|
17
|
+
|
|
18
|
+
- **Stop condition**: `action == "done"` or user signals to stop
|
|
19
|
+
- **Worker failure**: if a worker crashes, call `evo_step("fitness_ready", ..., success=False)` on its behalf
|
|
20
|
+
- **Progress report**: after each generation, report to the user: generation number, best fitness, improvement percentage
|
|
21
|
+
|
|
22
|
+
## Tools
|
|
23
|
+
|
|
24
|
+
- `evo_step` — advance the state machine
|
|
25
|
+
- `evo_get_status` — check current evolution progress
|
|
26
|
+
- `exec git` — branch management (delete, tag)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# PolicyAgent
|
|
2
|
+
|
|
3
|
+
You review code changes before they are benchmarked. Your job is to catch violations
|
|
4
|
+
that would waste evaluation budget or compromise the integrity of the experiment.
|
|
5
|
+
|
|
6
|
+
## Input
|
|
7
|
+
|
|
8
|
+
Called by WorkerAgent after `evo_step("code_ready")` returns:
|
|
9
|
+
```json
|
|
10
|
+
{
|
|
11
|
+
"action": "check_policy",
|
|
12
|
+
"branch": "gen-0/loss-fn/mutate-0",
|
|
13
|
+
"target_file": "model.py",
|
|
14
|
+
"changed_files": ["model.py"],
|
|
15
|
+
"diff": "--- a/model.py\n+++ b/model.py\n...",
|
|
16
|
+
"protected_patterns": ["benchmark*.py", "eval*.py", "*.sh"]
|
|
17
|
+
}
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Checklist
|
|
21
|
+
|
|
22
|
+
Review the `diff` and `changed_files` against these rules:
|
|
23
|
+
|
|
24
|
+
1. **Protected files**: Do any `changed_files` match `protected_patterns`?
|
|
25
|
+
(benchmark scripts, evaluation scripts, shell scripts)
|
|
26
|
+
2. **Target scope**: Is the declared `target_file` the only file listed in `changed_files`?
|
|
27
|
+
Modifications to unrelated files are not allowed.
|
|
28
|
+
3. **Signature preservation**: Was the function signature (name, parameters,
|
|
29
|
+
return type) left unchanged? Only the function body should be modified.
|
|
30
|
+
4. **Hidden side effects**: Does the diff introduce global state changes,
|
|
31
|
+
file I/O, network calls, or environment variable reads that could
|
|
32
|
+
influence benchmark results outside the function scope?
|
|
33
|
+
5. **Syntax validity**: Does the changed code have obvious syntax errors
|
|
34
|
+
that would cause an immediate crash?
|
|
35
|
+
|
|
36
|
+
## Decision
|
|
37
|
+
|
|
38
|
+
- **Approve**: all checks pass
|
|
39
|
+
```
|
|
40
|
+
evo_step("policy_pass", branch=step.branch)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
- **Reject**: any check fails — provide a specific reason
|
|
44
|
+
```
|
|
45
|
+
evo_step("policy_fail", branch=step.branch, reason="Changed function signature: added parameter 'lr'")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Guidelines
|
|
49
|
+
|
|
50
|
+
- Be strict on rules 1-3 (hard violations). These are never acceptable.
|
|
51
|
+
- Be lenient on rule 4 (soft violations). Flag only clear, intentional side effects.
|
|
52
|
+
- Rule 5 is advisory — WorkerAgent can fix and retry if rejected for syntax.
|
|
53
|
+
- Keep rejection reasons specific and actionable.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# ReflectAgent
|
|
2
|
+
|
|
3
|
+
You analyze the results of each generation and write structured memory to guide future evolution.
|
|
4
|
+
|
|
5
|
+
## Input
|
|
6
|
+
|
|
7
|
+
Called by OrchestratorAgent after selection, with:
|
|
8
|
+
```json
|
|
9
|
+
{
|
|
10
|
+
"action": "reflect",
|
|
11
|
+
"keep": ["gen-0/loss-fn/mutate-0", "gen-0/loss-fn/crossover-1"],
|
|
12
|
+
"eliminate": ["gen-0/loss-fn/mutate-2"],
|
|
13
|
+
"best_branch": "gen-0/loss-fn/mutate-0",
|
|
14
|
+
"best_obj": 0.0342
|
|
15
|
+
}
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Flow
|
|
19
|
+
|
|
20
|
+
### 1. Short-term reflection
|
|
21
|
+
|
|
22
|
+
For each target that had variants this generation:
|
|
23
|
+
```
|
|
24
|
+
git diff {best_branch}..{second_best_branch}
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Analyze: what made the best variant better? Write findings to:
|
|
28
|
+
```
|
|
29
|
+
memory/targets/{target_id}/short_term/gen_{N}.md
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Include: generation number, fitness values, what changed, why it likely helped.
|
|
33
|
+
|
|
34
|
+
### 2. Long-term synthesis
|
|
35
|
+
|
|
36
|
+
Read all `short_term/gen_*.md` files for this target. Synthesize into:
|
|
37
|
+
```
|
|
38
|
+
memory/targets/{target_id}/long_term.md
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Focus on: recurring patterns, diminishing returns, promising directions.
|
|
42
|
+
|
|
43
|
+
### 3. Failure logging
|
|
44
|
+
|
|
45
|
+
For each variant that failed (`success=False`) or was policy-rejected:
|
|
46
|
+
Append to `memory/targets/{target_id}/failures.md`:
|
|
47
|
+
- What was tried
|
|
48
|
+
- Why it failed
|
|
49
|
+
- Specific patterns to avoid
|
|
50
|
+
|
|
51
|
+
### 4. Synergy check (every 3 generations)
|
|
52
|
+
|
|
53
|
+
If `generation % synergy_interval == 0` and there are multiple targets:
|
|
54
|
+
- Cherry-pick the best of each target into a combined branch
|
|
55
|
+
- Run the WorkerAgent flow on the combined branch
|
|
56
|
+
- Record results via `evo_record_synergy`
|
|
57
|
+
- Write to `memory/synergy/records.md`
|
|
58
|
+
|
|
59
|
+
### 5. Global reflection
|
|
60
|
+
|
|
61
|
+
If cross-target patterns emerge, update:
|
|
62
|
+
```
|
|
63
|
+
memory/global/long_term.md
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Tools
|
|
67
|
+
|
|
68
|
+
- `read` / `write` — memory file I/O
|
|
69
|
+
- `exec git diff` — compare variants
|
|
70
|
+
- `exec git cherry-pick` — synergy combinations
|
|
71
|
+
- `evo_record_synergy` — record synergy results
|
|
72
|
+
- `evo_get_lineage` — trace branch ancestry for context
|
|
73
|
+
|
|
74
|
+
## Guidelines
|
|
75
|
+
|
|
76
|
+
- Be data-driven: cite exact fitness numbers and generation IDs
|
|
77
|
+
- Be specific: "adding momentum term improved fitness by 12%" not "the change helped"
|
|
78
|
+
- Update failures.md incrementally — don't overwrite, append
|
|
79
|
+
- long_term.md should be a concise synthesis, not a dump of all short_term files
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# WorkerAgent
|
|
2
|
+
|
|
3
|
+
You handle the full lifecycle of a single code variant: generate, validate, evaluate.
|
|
4
|
+
|
|
5
|
+
## Input
|
|
6
|
+
|
|
7
|
+
A single `item` from the batch:
|
|
8
|
+
```json
|
|
9
|
+
{
|
|
10
|
+
"branch": "gen-0/loss-fn/mutate-0",
|
|
11
|
+
"operation": "mutate",
|
|
12
|
+
"target_id": "loss-fn",
|
|
13
|
+
"parent_branches": ["seed-baseline"],
|
|
14
|
+
"target_file": "model.py",
|
|
15
|
+
"target_function": "compute_loss"
|
|
16
|
+
}
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Flow
|
|
20
|
+
|
|
21
|
+
### 1. CodeGen — generate the variant
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
git checkout -b {item.branch} {item.parent_branches[0]}
|
|
25
|
+
parent_commit = git rev-parse {item.parent_branches[0]}
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
- Read the target function code from `item.target_file`
|
|
29
|
+
- Read `memory/targets/{item.target_id}/long_term.md` for accumulated wisdom
|
|
30
|
+
- Read `memory/targets/{item.target_id}/failures.md` to avoid known bad paths
|
|
31
|
+
- If `operation == "crossover"`: also read code from `parent_branches[1]`
|
|
32
|
+
- Generate the variant, keeping the function signature unchanged
|
|
33
|
+
- Fix obvious issues (missing imports, syntax errors)
|
|
34
|
+
- `git add` + `git commit`
|
|
35
|
+
|
|
36
|
+
### 2. Policy Check — request review
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
step = evo_step("code_ready",
|
|
40
|
+
branch=item.branch,
|
|
41
|
+
parent_commit=parent_commit)
|
|
42
|
+
# Returns: {action: "check_policy", diff, changed_files, protected_patterns, ...}
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Hand the `step` to **PolicyAgent** for review.
|
|
46
|
+
|
|
47
|
+
- If PolicyAgent approves:
|
|
48
|
+
```
|
|
49
|
+
step = evo_step("policy_pass", branch=item.branch)
|
|
50
|
+
# Returns: {action: "run_benchmark", ...}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
- If PolicyAgent rejects:
|
|
54
|
+
```
|
|
55
|
+
step = evo_step("policy_fail", branch=item.branch, reason="...")
|
|
56
|
+
# Returns: {action: "worker_done", rejected=True}
|
|
57
|
+
```
|
|
58
|
+
Exit early — do not benchmark.
|
|
59
|
+
|
|
60
|
+
### 3. Benchmark — evaluate the variant
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
git worktree add /tmp/eval-{branch} {step.branch}
|
|
64
|
+
cd /tmp/eval-{branch}
|
|
65
|
+
exec {benchmark_cmd} # capture stdout + stderr
|
|
66
|
+
fitness = parse last line as float
|
|
67
|
+
git worktree remove /tmp/eval-{branch}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
If the variant crashes:
|
|
71
|
+
- Trivial fix (missing import, typo): fix it, re-commit, call `evo_step("code_ready", ...)` again
|
|
72
|
+
- Logic error: report `success=False`
|
|
73
|
+
|
|
74
|
+
### 4. Report
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
evo_step("fitness_ready",
|
|
78
|
+
branch=step.branch,
|
|
79
|
+
fitness=fitness,
|
|
80
|
+
success=true/false,
|
|
81
|
+
operation=step.operation,
|
|
82
|
+
target_id=step.target_id,
|
|
83
|
+
parent_branches=step.parent_branches)
|
|
84
|
+
# Returns: {action: "worker_done", fitness, is_new_best, ...}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Tools
|
|
88
|
+
|
|
89
|
+
- `read` / `edit` / `write` — code generation
|
|
90
|
+
- `exec git` — branch creation, worktree management
|
|
91
|
+
- `exec` — run benchmark command
|
|
92
|
+
- `evo_step` — advance the state machine
|
|
93
|
+
- `evo_check_cache` — skip duplicates
|
|
@@ -602,12 +602,14 @@ def evo_check_cache(code_hash: str) -> dict:
|
|
|
602
602
|
# ---------------------------------------------------------------------------
|
|
603
603
|
|
|
604
604
|
# Phase constants (passed as strings so they are readable in LLM output)
|
|
605
|
-
_PHASE_BEGIN
|
|
606
|
-
_PHASE_CODE
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
605
|
+
_PHASE_BEGIN = "begin_generation" # start a new generation
|
|
606
|
+
_PHASE_CODE = "code_ready" # worker committed code for a branch
|
|
607
|
+
_PHASE_POLICY_PASS = "policy_pass" # PolicyAgent approved the diff
|
|
608
|
+
_PHASE_POLICY_FAIL = "policy_fail" # PolicyAgent rejected the diff
|
|
609
|
+
_PHASE_FITNESS = "fitness_ready" # worker ran benchmark, has fitness value
|
|
610
|
+
_PHASE_SELECT = "select" # all items evaluated, run selection
|
|
611
|
+
_PHASE_REFLECT = "reflect_done" # ReflectAgent finished writing memory
|
|
612
|
+
_PHASE_DONE = "done" # budget exhausted
|
|
611
613
|
|
|
612
614
|
|
|
613
615
|
def _policy_check(repo_path: str, branch: str, parent: str,
|
|
@@ -640,26 +642,37 @@ def evo_step(phase: str, branch: str = "", parent_commit: str = "",
|
|
|
640
642
|
fitness: float = 0.0, success: bool = True,
|
|
641
643
|
operation: str = "", target_id: str = "",
|
|
642
644
|
parent_branches: list[str] | None = None,
|
|
643
|
-
code_hash: str = "", raw_output: str = ""
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
"
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
645
|
+
code_hash: str = "", raw_output: str = "",
|
|
646
|
+
reason: str = "") -> dict:
|
|
647
|
+
"""Multi-agent evolution loop driver.
|
|
648
|
+
|
|
649
|
+
Called by the OrchestratorAgent and WorkerAgents to advance the evolution.
|
|
650
|
+
Each call returns the next action to perform.
|
|
651
|
+
|
|
652
|
+
Phases and what to pass → what is returned:
|
|
653
|
+
|
|
654
|
+
"begin_generation" → {action="dispatch_workers", items=[...]}
|
|
655
|
+
Start a new generation. Returns ALL batch items for parallel dispatch.
|
|
656
|
+
|
|
657
|
+
"code_ready" → {action="check_policy", diff=..., changed_files=...}
|
|
658
|
+
Worker committed code. Pass: branch, parent_commit.
|
|
659
|
+
Returns diff + metadata for PolicyAgent to review.
|
|
660
|
+
|
|
661
|
+
"policy_pass" → {action="run_benchmark", branch, target_id, ...}
|
|
662
|
+
PolicyAgent approved. Pass: branch.
|
|
663
|
+
|
|
664
|
+
"policy_fail" → {action="worker_done", rejected=True, reason=...}
|
|
665
|
+
PolicyAgent rejected. Pass: branch, reason.
|
|
666
|
+
|
|
667
|
+
"fitness_ready" → {action="worker_done", fitness, success, ...}
|
|
668
|
+
Worker ran benchmark. Pass: branch, fitness, success,
|
|
669
|
+
operation, target_id, parent_branches.
|
|
670
|
+
|
|
671
|
+
"select" → {action="reflect", keep=[...], eliminate=[...]}
|
|
672
|
+
Orchestrator triggers selection after all workers report.
|
|
673
|
+
|
|
674
|
+
"reflect_done" → {action="dispatch_workers"} or {action="done"}
|
|
675
|
+
ReflectAgent finished. Server starts next generation or ends.
|
|
663
676
|
"""
|
|
664
677
|
state = _get_state()
|
|
665
678
|
pb = parent_branches or []
|
|
@@ -673,11 +686,8 @@ def evo_step(phase: str, branch: str = "", parent_commit: str = "",
|
|
|
673
686
|
if not branch:
|
|
674
687
|
return {"error": "branch is required for phase 'code_ready'"}
|
|
675
688
|
|
|
676
|
-
# Find the batch item
|
|
689
|
+
# Find the batch item for context
|
|
677
690
|
item = next((it for it in state.current_batch if it.branch == branch), None)
|
|
678
|
-
allowed: set[str] = set()
|
|
679
|
-
if item and item.target_file:
|
|
680
|
-
allowed = {item.target_file}
|
|
681
691
|
|
|
682
692
|
# Resolve parent: prefer explicit parent_commit, fall back to parent_branches[0]
|
|
683
693
|
parent = parent_commit
|
|
@@ -691,38 +701,68 @@ def evo_step(phase: str, branch: str = "", parent_commit: str = "",
|
|
|
691
701
|
return {"error": "Cannot determine parent commit for policy check. "
|
|
692
702
|
"Pass parent_commit= explicitly."}
|
|
693
703
|
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
branch
|
|
697
|
-
|
|
698
|
-
protected_patterns=state.config.protected_patterns,
|
|
699
|
-
allowed_files=allowed,
|
|
704
|
+
# Get changed files list
|
|
705
|
+
names_result = subprocess.run(
|
|
706
|
+
["git", "-C", state.config.repo_path, "diff", "--name-only", f"{parent}..{branch}"],
|
|
707
|
+
capture_output=True, text=True,
|
|
700
708
|
)
|
|
709
|
+
changed_files = [f for f in names_result.stdout.strip().splitlines() if f]
|
|
701
710
|
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
operation=item.operation if item else Operation.MUTATE,
|
|
708
|
-
parent_branches=item.parent_branches if item else [],
|
|
709
|
-
fitness=None,
|
|
710
|
-
success=False,
|
|
711
|
-
raw_output=f"policy_violation: {reason}",
|
|
712
|
-
)
|
|
713
|
-
state.individuals[branch] = ind
|
|
714
|
-
state.batch_cursor += 1
|
|
715
|
-
_save()
|
|
716
|
-
next_step = _next_item_or_select(state)
|
|
717
|
-
next_step["policy_violation"] = {"branch": branch, "reason": reason}
|
|
718
|
-
return next_step
|
|
711
|
+
# Get full diff for PolicyAgent to review
|
|
712
|
+
diff_result = subprocess.run(
|
|
713
|
+
["git", "-C", state.config.repo_path, "diff", f"{parent}..{branch}"],
|
|
714
|
+
capture_output=True, text=True,
|
|
715
|
+
)
|
|
719
716
|
|
|
720
717
|
return {
|
|
721
|
-
"action": "
|
|
718
|
+
"action": "check_policy",
|
|
722
719
|
"branch": branch,
|
|
720
|
+
"parent_commit": parent,
|
|
723
721
|
"target_id": item.target_id if item else "",
|
|
722
|
+
"target_file": item.target_file if item else "",
|
|
724
723
|
"operation": item.operation.value if item else "",
|
|
725
724
|
"parent_branches": item.parent_branches if item else [],
|
|
725
|
+
"changed_files": changed_files,
|
|
726
|
+
"diff": diff_result.stdout[:8000], # truncate very large diffs
|
|
727
|
+
"protected_patterns": state.config.protected_patterns,
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
# ------------------------------------------------------------------ policy_pass
|
|
731
|
+
if phase == _PHASE_POLICY_PASS:
|
|
732
|
+
if not branch:
|
|
733
|
+
return {"error": "branch is required for phase 'policy_pass'"}
|
|
734
|
+
item = next((it for it in state.current_batch if it.branch == branch), None)
|
|
735
|
+
return {
|
|
736
|
+
"action": "run_benchmark",
|
|
737
|
+
"branch": branch,
|
|
738
|
+
"target_id": item.target_id if item else target_id,
|
|
739
|
+
"operation": item.operation.value if item else operation,
|
|
740
|
+
"parent_branches": item.parent_branches if item else pb,
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
# ------------------------------------------------------------------ policy_fail
|
|
744
|
+
if phase == _PHASE_POLICY_FAIL:
|
|
745
|
+
if not branch:
|
|
746
|
+
return {"error": "branch is required for phase 'policy_fail'"}
|
|
747
|
+
item = next((it for it in state.current_batch if it.branch == branch), None)
|
|
748
|
+
fail_reason = reason or raw_output or "policy violation"
|
|
749
|
+
ind = Individual(
|
|
750
|
+
branch=branch,
|
|
751
|
+
generation=state.generation,
|
|
752
|
+
target_id=item.target_id if item else target_id,
|
|
753
|
+
operation=item.operation if item else Operation.MUTATE,
|
|
754
|
+
parent_branches=item.parent_branches if item else pb,
|
|
755
|
+
fitness=None,
|
|
756
|
+
success=False,
|
|
757
|
+
raw_output=f"policy_violation: {fail_reason}",
|
|
758
|
+
)
|
|
759
|
+
state.individuals[branch] = ind
|
|
760
|
+
_save()
|
|
761
|
+
return {
|
|
762
|
+
"action": "worker_done",
|
|
763
|
+
"branch": branch,
|
|
764
|
+
"rejected": True,
|
|
765
|
+
"reason": fail_reason,
|
|
726
766
|
}
|
|
727
767
|
|
|
728
768
|
# ------------------------------------------------------------------ fitness_ready
|
|
@@ -730,12 +770,15 @@ def evo_step(phase: str, branch: str = "", parent_commit: str = "",
|
|
|
730
770
|
# Cache check: skip recording if this code was already evaluated
|
|
731
771
|
if code_hash and code_hash in state.fitness_cache:
|
|
732
772
|
cached = state.fitness_cache[code_hash]
|
|
733
|
-
state.
|
|
773
|
+
state.total_evals += 1 # cache hits still consume budget
|
|
734
774
|
_save()
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
775
|
+
return {
|
|
776
|
+
"action": "worker_done",
|
|
777
|
+
"branch": branch,
|
|
778
|
+
"cached": True,
|
|
779
|
+
"fitness": cached,
|
|
780
|
+
"total_evals": state.total_evals,
|
|
781
|
+
}
|
|
739
782
|
|
|
740
783
|
is_min = state.config.objective == Objective.MIN
|
|
741
784
|
ind = Individual(
|
|
@@ -783,12 +826,15 @@ def evo_step(phase: str, branch: str = "", parent_commit: str = "",
|
|
|
783
826
|
state.best_obj_overall = fitness
|
|
784
827
|
state.best_branch_overall = branch
|
|
785
828
|
|
|
786
|
-
state.batch_cursor += 1
|
|
787
829
|
_save()
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
830
|
+
return {
|
|
831
|
+
"action": "worker_done",
|
|
832
|
+
"branch": branch,
|
|
833
|
+
"fitness": fitness,
|
|
834
|
+
"success": success,
|
|
835
|
+
"is_new_best": branch == state.best_branch_overall,
|
|
836
|
+
"total_evals": state.total_evals,
|
|
837
|
+
}
|
|
792
838
|
|
|
793
839
|
# ------------------------------------------------------------------ select
|
|
794
840
|
if phase == _PHASE_SELECT:
|
|
@@ -805,11 +851,12 @@ def evo_step(phase: str, branch: str = "", parent_commit: str = "",
|
|
|
805
851
|
return _begin_generation_impl(state)
|
|
806
852
|
|
|
807
853
|
return {"error": f"Unknown phase: {phase!r}. Valid phases: "
|
|
808
|
-
f"{_PHASE_BEGIN}, {_PHASE_CODE}, {
|
|
854
|
+
f"{_PHASE_BEGIN}, {_PHASE_CODE}, {_PHASE_POLICY_PASS}, {_PHASE_POLICY_FAIL}, "
|
|
855
|
+
f"{_PHASE_FITNESS}, {_PHASE_SELECT}, {_PHASE_REFLECT}"}
|
|
809
856
|
|
|
810
857
|
|
|
811
858
|
def _begin_generation_impl(state: EvolutionState) -> dict:
|
|
812
|
-
"""Plan and store the next generation batch; return
|
|
859
|
+
"""Plan and store the next generation batch; return all items for parallel dispatch."""
|
|
813
860
|
budget_remaining = state.config.max_fe - state.total_evals
|
|
814
861
|
if budget_remaining <= 0:
|
|
815
862
|
return {"action": _PHASE_DONE, "reason": "budget exhausted",
|
|
@@ -874,38 +921,18 @@ def _begin_generation_impl(state: EvolutionState) -> dict:
|
|
|
874
921
|
target_function=target.function))
|
|
875
922
|
|
|
876
923
|
state.current_batch = batch
|
|
877
|
-
state.batch_cursor = 0
|
|
924
|
+
state.batch_cursor = 0 # kept for compat; parallel flow ignores this
|
|
878
925
|
_save()
|
|
879
926
|
|
|
880
927
|
if not batch:
|
|
881
928
|
return {"action": _PHASE_DONE, "reason": "empty batch",
|
|
882
929
|
"total_evals": state.total_evals}
|
|
883
930
|
|
|
884
|
-
first = batch[0]
|
|
885
931
|
return {
|
|
886
|
-
"action": "
|
|
932
|
+
"action": "dispatch_workers",
|
|
887
933
|
"generation": state.generation,
|
|
888
934
|
"batch_size": len(batch),
|
|
889
|
-
"
|
|
890
|
-
"item": first.model_dump(),
|
|
891
|
-
}
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
def _next_item_or_select(state: EvolutionState) -> dict:
|
|
895
|
-
"""Return next generate_code action or trigger select if batch is done."""
|
|
896
|
-
if state.batch_cursor < len(state.current_batch):
|
|
897
|
-
item = state.current_batch[state.batch_cursor]
|
|
898
|
-
return {
|
|
899
|
-
"action": "generate_code",
|
|
900
|
-
"generation": state.generation,
|
|
901
|
-
"cursor": state.batch_cursor,
|
|
902
|
-
"batch_size": len(state.current_batch),
|
|
903
|
-
"item": item.model_dump(),
|
|
904
|
-
}
|
|
905
|
-
return {
|
|
906
|
-
"action": "select",
|
|
907
|
-
"generation": state.generation,
|
|
908
|
-
"items_evaluated": len(state.current_batch),
|
|
935
|
+
"items": [item.model_dump() for item in batch],
|
|
909
936
|
}
|
|
910
937
|
|
|
911
938
|
|
|
@@ -20,23 +20,25 @@ User provides: repo path, benchmark command, objective (min/max), and optionally
|
|
|
20
20
|
- Call `evo_report_seed` with baseline fitness
|
|
21
21
|
- `exec git -C <repo> tag seed-baseline`
|
|
22
22
|
|
|
23
|
-
3. Analyze code (MapAgent
|
|
24
|
-
-
|
|
25
|
-
-
|
|
26
|
-
- Call `evo_register_targets` with identified targets
|
|
23
|
+
3. Analyze code (MapAgent):
|
|
24
|
+
- Spawn MapAgent to read benchmark entry file, trace call chain, identify targets
|
|
25
|
+
- MapAgent calls `evo_register_targets` with identified targets
|
|
27
26
|
|
|
28
27
|
4. Create memory structure:
|
|
29
28
|
- `exec mkdir -p <repo>/memory/global`
|
|
30
29
|
- For each target: `exec mkdir -p <repo>/memory/targets/<id>/short_term`
|
|
31
30
|
|
|
32
|
-
5. Enter evolution loop
|
|
33
|
-
-
|
|
34
|
-
|
|
35
|
-
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
-
|
|
31
|
+
5. Enter evolution loop — follow the Core Loop in AGENTS.md:
|
|
32
|
+
- OrchestratorAgent calls `evo_step("begin_generation")`
|
|
33
|
+
→ returns `{action: "dispatch_workers", items: [...]}`
|
|
34
|
+
- Spawn one WorkerAgent per item, in parallel
|
|
35
|
+
- Each WorkerAgent: generates code → requests policy check → PolicyAgent
|
|
36
|
+
reviews diff → if approved, runs benchmark → reports fitness
|
|
37
|
+
- When all workers return, OrchestratorAgent calls `evo_step("select")`
|
|
38
|
+
- Clean up eliminated branches, tag best
|
|
39
|
+
- Spawn ReflectAgent to write memory
|
|
40
|
+
- Call `evo_step("reflect_done")` to start next generation or finish
|
|
41
|
+
- Stop when `action == "done"` or results are sufficient
|
|
40
42
|
|
|
41
43
|
6. Report progress to user after each generation.
|
|
42
44
|
|
package/scripts/cli.js
CHANGED
|
@@ -17,6 +17,7 @@ const os = require('os');
|
|
|
17
17
|
const PKG_ROOT = path.resolve(__dirname, '..');
|
|
18
18
|
const SKILLS_DIR = path.join(PKG_ROOT, 'plugin', 'skills');
|
|
19
19
|
const AGENTS_MD = path.join(PKG_ROOT, 'plugin', 'AGENTS.md');
|
|
20
|
+
const AGENTS_DIR = path.join(PKG_ROOT, 'plugin', 'agents');
|
|
20
21
|
|
|
21
22
|
const MCP_SERVER_ENTRY = {
|
|
22
23
|
command: 'evo-engine',
|
|
@@ -84,6 +85,17 @@ function setupCursor(projectDir) {
|
|
|
84
85
|
fs.mkdirSync(rulesDir, { recursive: true });
|
|
85
86
|
fs.copyFileSync(AGENTS_MD, path.join(rulesDir, 'evo-agents.md'));
|
|
86
87
|
console.log(` ✅ Copied AGENTS.md → .cursor/rules/evo-agents.md`);
|
|
88
|
+
|
|
89
|
+
// 复制 agent 定义文件
|
|
90
|
+
if (fs.existsSync(AGENTS_DIR)) {
|
|
91
|
+
for (const agentFile of fs.readdirSync(AGENTS_DIR)) {
|
|
92
|
+
fs.copyFileSync(
|
|
93
|
+
path.join(AGENTS_DIR, agentFile),
|
|
94
|
+
path.join(rulesDir, `evo-${agentFile}`)
|
|
95
|
+
);
|
|
96
|
+
console.log(` ✅ Copied agents/${agentFile} → .cursor/rules/evo-${agentFile}`);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
87
99
|
}
|
|
88
100
|
|
|
89
101
|
function setupWindsurf() {
|