@evo-hq/pi-evo 0.4.3 → 0.4.4-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/extensions/evo/index.js +633 -49
- package/package.json +1 -1
- package/skills/discover/SKILL.md +35 -15
- package/skills/discover/references/constructing-benchmark.md +7 -3
- package/skills/infra-setup/SKILL.md +1 -0
- package/skills/optimize/SKILL.md +9 -6
- package/skills/report/SKILL.md +43 -0
- package/skills/subagent/SKILL.md +7 -5
package/extensions/evo/index.js
CHANGED
|
@@ -98,8 +98,18 @@ function writeOffset(runDir, sid, opts) {
|
|
|
98
98
|
function formatDirectiveText(events) {
|
|
99
99
|
const lines = [];
|
|
100
100
|
for (const ev of events) {
|
|
101
|
-
if (ev.text)
|
|
102
|
-
|
|
101
|
+
if (!ev.text)
|
|
102
|
+
continue;
|
|
103
|
+
const id = ev.id || "";
|
|
104
|
+
if (id) {
|
|
105
|
+
lines.push(`[EVO DIRECTIVE id=${id}]`);
|
|
106
|
+
lines.push(ev.text);
|
|
107
|
+
lines.push(`[END EVO DIRECTIVE — when done, run: evo ack ${id}]`);
|
|
108
|
+
} else {
|
|
109
|
+
lines.push("[EVO DIRECTIVE]");
|
|
110
|
+
lines.push(ev.text);
|
|
111
|
+
lines.push("[END EVO DIRECTIVE]");
|
|
112
|
+
}
|
|
103
113
|
}
|
|
104
114
|
return lines.join(`
|
|
105
115
|
`);
|
|
@@ -117,6 +127,12 @@ function registerSession(runDir, sid, host, expId = null) {
|
|
|
117
127
|
const existing = readJsonOrNull(p);
|
|
118
128
|
if (existing) {
|
|
119
129
|
existing.last_seen_at = now;
|
|
130
|
+
if (expId && !existing.exp_id)
|
|
131
|
+
existing.exp_id = expId;
|
|
132
|
+
if (existing.has_evo_engaged === undefined)
|
|
133
|
+
existing.has_evo_engaged = false;
|
|
134
|
+
if (existing.engaged_at === undefined)
|
|
135
|
+
existing.engaged_at = null;
|
|
120
136
|
atomicWriteJson(p, existing);
|
|
121
137
|
return;
|
|
122
138
|
}
|
|
@@ -128,9 +144,40 @@ function registerSession(runDir, sid, host, expId = null) {
|
|
|
128
144
|
registered_at: now,
|
|
129
145
|
last_seen_at: now,
|
|
130
146
|
exp_id: expId,
|
|
131
|
-
parent_session_id: null
|
|
147
|
+
parent_session_id: null,
|
|
148
|
+
has_evo_engaged: false,
|
|
149
|
+
engaged_at: null
|
|
132
150
|
};
|
|
133
151
|
atomicWriteJson(p, rec);
|
|
152
|
+
initOffsetToLatest(runDir, sid);
|
|
153
|
+
}
|
|
154
|
+
/**
 * Flip a session record's `has_evo_engaged` flag on, stamping `engaged_at`.
 *
 * @param {string} runDir - evo run directory that holds the session records.
 * @param {string} sid - session id whose record should be updated.
 * @returns {boolean} `true` only when this call performed the transition;
 *   `false` when no record could be read or the flag was already set.
 */
function markEngaged(runDir, sid) {
  const recordPath = sessionFile(runDir, sid);
  const session = readJsonOrNull(recordPath);
  if (!session || session.has_evo_engaged) {
    return false;
  }
  // Same key order as two sequential assignments: flag first, timestamp second.
  Object.assign(session, { has_evo_engaged: true, engaged_at: nowIso() });
  atomicWriteJson(recordPath, session);
  return true;
}
|
|
166
|
+
/**
 * Point the session's workspace offset at the newest event currently in the
 * workspace event log, or at `null` when the log is missing or empty.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sid - session id whose offset is written.
 * @returns {void}
 */
function initOffsetToLatest(runDir, sid) {
  const eventsPath = workspaceEventsPath(runDir);
  // NOTE(review): presumably a `null` cursor makes readEventsAfter return the
  // whole log — confirm against its definition.
  const backlog = fs.existsSync(eventsPath) ? readEventsAfter(eventsPath, null) : [];
  const latestId = backlog.length > 0 ? backlog[backlog.length - 1].id : null;
  writeOffset(runDir, sid, { workspaceId: latestId });
}
|
|
176
|
+
// Optional leading whitespace, the literal word `evo`, then whitespace or end
// of string (so `evolve` does not match).
var EVO_CMD_RE = /^\s*evo(\s|$)/;

/**
 * Tell whether a command line invokes the `evo` CLI as its first word.
 *
 * @param {unknown} command - candidate command line.
 * @returns {boolean} `true` for a non-empty string whose first word is `evo`;
 *   `false` for anything else, including non-string values.
 */
function isEvoCommand(command) {
  return typeof command === "string" && command !== "" && EVO_CMD_RE.test(command);
}
|
|
135
182
|
function findEvoRunDir(cwd) {
|
|
136
183
|
const envRunDir = process.env.EVO_RUN_DIR;
|
|
@@ -156,6 +203,40 @@ function findEvoRunDir(cwd) {
|
|
|
156
203
|
}
|
|
157
204
|
return null;
|
|
158
205
|
}
|
|
206
|
+
/**
 * Read the events a session has not consumed yet WITHOUT advancing its offset.
 *
 * A session with an `exp_id` reads that experiment's event log using the
 * "exp" offset; any other session reads the workspace log using the
 * "workspace" offset. Nothing is written here — this function itself never
 * calls `writeOffset`.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sessionId - session to peek for.
 * @returns {{text: (string|null), newWorkspaceOffset: *, newExpOffset: *}}
 *   `text` is the formatted directive text (or `null` when there is nothing
 *   new or the session is unknown); exactly one offset field carries the id of
 *   the last pending event when events exist, the other stays `null`.
 */
function peekDrainSession(runDir, sessionId) {
  const session = getSession(runDir, sessionId);
  if (!session) {
    return { text: null, newWorkspaceOffset: null, newExpOffset: null };
  }

  const experimentId = session.exp_id;
  const scopedToExperiment = Boolean(experimentId);
  const cursor = readOffset(runDir, sessionId, scopedToExperiment ? "exp" : "workspace");
  const logPath = scopedToExperiment
    ? expEventsPath(runDir, experimentId)
    : workspaceEventsPath(runDir);
  const pending = readEventsAfter(logPath, cursor);

  if (pending.length === 0) {
    return { text: null, newWorkspaceOffset: null, newExpOffset: null };
  }

  const lastId = pending[pending.length - 1].id;
  return {
    text: formatDirectiveText(pending),
    newWorkspaceOffset: scopedToExperiment ? null : lastId,
    newExpOffset: scopedToExperiment ? lastId : null,
  };
}
|
|
231
|
+
/**
 * Persist the offsets produced by a peek and clear the session's marker file.
 *
 * The offset write is skipped when the peek carries no new offset at all; the
 * marker file is removed unconditionally.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sessionId - session whose offsets are committed.
 * @param {{newWorkspaceOffset: *, newExpOffset: *}} peek - result of a peek.
 * @returns {void}
 */
function commitDrainPeek(runDir, sessionId, peek) {
  const { newWorkspaceOffset: workspaceId, newExpOffset: expId } = peek;
  const hasNewOffset = Boolean(workspaceId || expId);
  if (hasNewOffset) {
    writeOffset(runDir, sessionId, { workspaceId, expId });
  }
  unlinkIfExists(markerFile(runDir, sessionId));
}
|
|
159
240
|
function drainSession(runDir, sessionId) {
|
|
160
241
|
const sess = getSession(runDir, sessionId);
|
|
161
242
|
if (!sess) {
|
|
@@ -189,56 +270,559 @@ function drainSession(runDir, sessionId) {
|
|
|
189
270
|
unlinkIfExists(markerFile(runDir, sessionId));
|
|
190
271
|
return { text, newWorkspaceOffset, newExpOffset };
|
|
191
272
|
}
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
273
|
+
// Text returned as the `reason` when a tool call is blocked in optimize mode.
// One array entry per output line; the trailing empty entry yields the final
// newline after the closing banner.
var POLICY_NUDGE_TEMPLATE = [
  "[EVO POLICY]",
  "Preventative block. You may have strayed from /evo:optimize protocol.",
  "",
  "When /optimize is active, the orchestrator should NOT edit files directly",
  "or run experiment commands by hand. The protocol is: write briefs, spawn",
  "subagents, let them do the edits and runs, then read the results.",
  "",
  "If you need to wait for subagents to finish, use `evo wait` — it blocks",
  "until any experiment concludes, max 1h.",
  "",
  "Manual edits and shell commands that mutate files are discouraged unless",
  "evo doesn't already expose the operation. If this block was actually",
  "warranted (rare), run `evo exit-optimize-mode` to disable the safety",
  "nudges, then retry the tool.",
  "[END EVO POLICY]",
  "",
].join("\n");
|
|
287
|
+
// Follow-up message sent at turn end while optimize mode is active, telling
// the orchestrator to keep the loop going. One array entry per output line;
// the trailing empty entry yields the final newline.
var STOP_NUDGE_TEMPLATE = [
  "[EVO LOOP]",
  "You are driving /evo:optimize autonomously. Don't stop, and don't ask",
  "the user for direction — they explicitly invoked /optimize for hands-off",
  "operation. They'll intervene via `evo direct` if they need to.",
  "",
  "If subagents are still running: run `evo wait` to block until the next",
  "experiment concludes (or up to 1h). Then resume planning.",
  "",
  "If subagents are done and you have unread results: read `evo scratchpad`,",
  "update annotations as needed, and plan + spawn the next round.",
  "",
  "Stop only if `evo status` shows the budget exhausted or you've hit the",
  "stall limit. If so, print a final summary first. To suppress this",
  "continuation loop for a legitimate one-off task, run",
  "`evo exit-optimize-mode`.",
  "[END EVO LOOP]",
  "",
].join("\n");
|
|
299
|
+
// Lower-cased tool names that isDeniedInOptimizeMode rejects outright, without
// inspecting the tool input.
var DENY_TOOL_NAMES = new Set([
  "edit", "write",
  "notebookedit", "notebook_edit",
  "multiedit", "multi_edit",
  "edit_file", "create_file",
  "search_replace", "str_replace",
  "applypatch", "apply_patch",
  "delete_file",
  "file_write", "file_edit",
  "patch",
]);
|
|
317
|
+
// Lower-cased names of shell-execution tools; isDeniedInOptimizeMode inspects
// the `command` input of these tools instead of rejecting them by name.
var BASH_TOOL_NAMES = new Set([
  "bash", "shell", "exec",
  "run_terminal_cmd", "runterminalcmd", "run_command",
  "terminal",
  "execute_code", "execute",
]);
|
|
328
|
+
// Matches a single command segment that starts with a file-mutating command.
// Anchored at the segment start; allows an optional `nohup ` and an optional
// path prefix ending in `/` before the command word. Alternatives, in order:
// `tee <file>`; `sed`/`perl` with a short flag containing `i`, or `sed
// --in-place`; `awk -i inplace`; mv/cp/rm/mkdir/rmdir/touch/chmod/chown/chgrp/
// ln/rsync; `dd ... of=`; `curl` with -o/-O, --output or --remote-name; wget;
// patch; install; truncate; `git` (after optional global flags) followed by a
// subcommand from the list below — `stash` is exempt when followed by `list`
// or `show`; and the editors vim/vi/nano/emacs.
var SEGMENT_DENY_RE = /^\s*(?:nohup\s+)?(?:\S*\/)?(?:tee\b(?:\s+-[aiu]+)*\s+[^\s|&<>]+|sed\b[^|&;]*?\s-[a-zA-Z]*i[a-zA-Z]*\b|sed\b[^|&;]*?\s--in-place\b|perl\b[^|&;]*?\s-[a-zA-Z]*i[a-zA-Z]*\b|awk\b[^|&;]*?\s-i\s+inplace\b|(?:mv|cp|rm|mkdir|rmdir|touch|chmod|chown|chgrp|ln|rsync)(?:\s|$)|dd\b[^|&;]*?\bof=|curl\b[^|&;]*?\s-[a-zA-Z]*[oO][a-zA-Z=]*(?:\s|$)|curl\b[^|&;]*?\s--output(?:=|\s)|curl\b[^|&;]*?\s--remote-name\b|wget(?:\s|$)|patch(?:\s|$)|install(?:\s|$)|truncate(?:\s|$)|git\b(?:\s+(?:-[a-zA-Z]\S*|--[a-z][a-z-]*(?:=\S+)?)(?:\s+\S+)?)*?\s+(?:apply|checkout|restore|reset|clean|switch|merge|rebase|am|stash(?!\s+(?:list|show)\b)|cherry-pick|pull|clone|revert|worktree)\b|(?:vim|vi|nano|emacs)(?:\s|$))/;
|
|
329
|
+
// Matches an output redirection to a named target anywhere in a segment:
// `>`/`>>` not directly preceded by `<`, a digit or `&`; `N>`/`N>>` with a
// numeric fd; `&>`/`&>>`; and `>|`. The fd and `&>` forms refuse a target that
// starts with `&`, so fd duplication such as `2>&1` is not matched.
var REDIRECT_DENY_RE = /(?:(?<![<\d&])>>?\s*[^\s|&<>;]+|\b\d+>>?\s*(?!&)[^\s|&<>;]+|&>>?\s*(?!&)[^\s|&<>;]+|>\|\s*[^\s|&<>;]+)/;
|
|
330
|
+
// Matches a segment whose first word (after an optional `nohup `) is one of
// the agent host CLIs listed here. isDeniedInOptimizeMode skips the redirect
// check for such segments.
var HOST_SPAWN_PREFIX_RE = /^\s*(?:nohup\s+)?(?:claude(?:\s|$)|codex(?:\s|$)|cursor-agent(?:\s|$)|opencode(?:\s|$)|hermes(?:\s|$)|openclaw(?:\s|$)|pi(?:\s|$)|pi-coding-agent(?:\s|$))/;
|
|
331
|
+
// Command separators: `;`, newline, `&&`, `||`, a single `|`, and a
// backgrounding `&` (not part of `&&`, `>&`, `&>`, and not the trailing `&`
// that ends the command). The pattern itself is not quote-aware.
var UNQUOTED_SEPARATOR_RE = /[;\n]|&&|\|\||\|(?!\|)|(?<![>&])&(?![&>])(?!\s*$)/;

/**
 * Break a shell command line into its individual command segments.
 *
 * @param {string} cmd - command line to split.
 * @returns {string[]} the pieces between separators, untrimmed and in order.
 */
function splitSegments(cmd) {
  const segments = cmd.split(UNQUOTED_SEPARATOR_RE);
  return segments;
}
|
|
335
|
+
/**
 * Collect the inner text of every top-level command/process substitution in a
 * shell string: `$( ... )`, `<( ... )`, `>( ... )` and backtick spans.
 *
 * Quote tracking: nothing is extracted inside single quotes; `$( ... )` and
 * backticks are still extracted inside double quotes, while `<(`/`>(` are only
 * recognised outside quotes. Arithmetic `$((` is stepped over. Nested
 * substitutions are returned as part of their parent body, not separately.
 * Unterminated constructs are ignored.
 *
 * @param {string} seg - shell text to scan.
 * @returns {string[]} substitution bodies in order of appearance.
 */
function extractSubstitutionBodies(seg) {
  const found = [];
  const len = seg.length;

  // Index just past the ")" balancing an already-open "(", scanning from
  // `from`; -1 when the text ends first. Single/double quotes and backslash
  // escapes are tracked; parentheses inside double quotes still count.
  function closeOfParen(from) {
    let open = 1;
    let pos = from;
    let quote = "default";
    while (pos < len && open > 0) {
      const ch = seg[pos];
      if (quote === "sq") {
        if (ch === "'") {
          quote = "default";
        }
        pos += 1;
        continue;
      }
      if (quote === "dq") {
        if (ch === "\\" && pos + 1 < len) {
          pos += 2;
          continue;
        }
        if (ch === '"') {
          quote = "default";
          pos += 1;
          continue;
        }
      }
      if (ch === "\\" && pos + 1 < len) {
        pos += 2;
        continue;
      }
      if (quote === "default" && ch === "'") {
        quote = "sq";
      } else if (quote === "default" && ch === '"') {
        quote = "dq";
      } else if (ch === "(") {
        open += 1;
      } else if (ch === ")") {
        open -= 1;
      }
      pos += 1;
    }
    return open === 0 ? pos : -1;
  }

  // Index of the closing backtick at or after `from`, honouring backslash
  // escapes; -1 when there is none.
  function closeOfBacktick(from) {
    let pos = from;
    while (pos < len && seg[pos] !== "`") {
      pos += seg[pos] === "\\" && pos + 1 < len ? 2 : 1;
    }
    return pos < len ? pos : -1;
  }

  let quote = "default";
  let at = 0;
  while (at < len) {
    const ch = seg[at];

    if (quote === "sq") {
      if (ch === "'") {
        quote = "default";
      }
      at += 1;
      continue;
    }
    if (quote === "dq") {
      if (ch === "\\" && at + 1 < len) {
        at += 2;
        continue;
      }
      if (ch === '"') {
        quote = "default";
        at += 1;
        continue;
      }
      // Any other character inside double quotes falls through so that
      // `$(` and backticks are still detected there.
    }
    if (ch === "\\" && at + 1 < len) {
      at += 2;
      continue;
    }
    if (quote === "default" && (ch === "'" || ch === '"')) {
      quote = ch === "'" ? "sq" : "dq";
      at += 1;
      continue;
    }

    const parenFollows = at + 1 < len && seg[at + 1] === "(";

    if (ch === "$" && parenFollows) {
      if (at + 2 < len && seg[at + 2] === "(") {
        // Arithmetic expansion opener `$((` — step over it.
        at += 3;
        continue;
      }
      const stop = closeOfParen(at + 2);
      if (stop !== -1) {
        found.push(seg.slice(at + 2, stop - 1));
        at = stop;
        continue;
      }
    }

    if ((ch === "<" || ch === ">") && parenFollows && quote === "default") {
      const stop = closeOfParen(at + 2);
      if (stop !== -1) {
        found.push(seg.slice(at + 2, stop - 1));
        at = stop;
        continue;
      }
    }

    if (ch === "`" && quote !== "sq") {
      const tick = closeOfBacktick(at + 1);
      if (tick !== -1) {
        found.push(seg.slice(at + 1, tick));
        at = tick + 1;
        continue;
      }
    }

    at += 1;
  }
  return found;
}
|
|
452
|
+
/**
 * Blank out the parts of a shell command that cannot run anything, so that
 * later pattern checks do not trip over quoted or arithmetic text.
 *
 * Steps, in order:
 *  1. every single-quoted span becomes `''`;
 *  2. every double-quoted span becomes `""`, unless it contains `$(` or a
 *     backtick, in which case it is kept verbatim;
 *  3. every balanced arithmetic expansion `$(( ... ))` is removed entirely
 *     (an unbalanced one is left as is).
 *
 * @param {string} cmd - raw command line.
 * @returns {string} the sanitised command line.
 */
function stripInertQuoted(cmd) {
  const withoutSingles = cmd.replace(/'[^']*'/g, "''");
  const text = withoutSingles.replace(/"(?:[^"\\]|\\.)*"/g, (quoted) => {
    const mayExecute = quoted.indexOf("$(") >= 0 || quoted.indexOf("`") >= 0;
    return mayExecute ? quoted : '""';
  });

  const total = text.length;
  let result = "";
  let pos = 0;
  while (pos < total) {
    if (text.startsWith("$((", pos)) {
      // Two parentheses are already open; walk to the point where both close.
      let depth = 2;
      let end = pos + 3;
      while (end < total && depth > 0) {
        if (text[end] === "(") {
          depth += 1;
        } else if (text[end] === ")") {
          depth -= 1;
        }
        end += 1;
      }
      if (depth === 0) {
        pos = end;
        continue;
      }
    }
    result += text[pos];
    pos += 1;
  }
  return result;
}
|
|
483
|
+
// Base names of shell executables; unwrapShellCArguments looks for a
// command-string flag after any token whose base name is in this set.
var SHELL_INTERPRETERS = new Set(["bash", "sh", "zsh", "dash", "ash"]);
|
|
484
|
+
/**
 * Split a command line into words using a small subset of shell quoting rules.
 *
 * - Single quotes keep everything literal up to the closing quote.
 * - Double quotes keep text literal; a backslash inside them takes the next
 *   character verbatim.
 * - Outside quotes a backslash takes the next character verbatim and
 *   whitespace ends the current word.
 * - Quote characters themselves are dropped; an empty quoted string still
 *   yields an empty word.
 *
 * @param {string} cmd - command line to tokenize.
 * @returns {string[]|null} the words, or `null` when a quote is left open.
 */
function tokenize(cmd) {
  const words = [];
  let current = "";
  let mode = "default";
  let wordOpen = false;

  // Emit the word in progress (if any) and reset the accumulator.
  const flush = () => {
    if (wordOpen) {
      words.push(current);
      current = "";
      wordOpen = false;
    }
  };

  let idx = 0;
  while (idx < cmd.length) {
    const ch = cmd[idx];
    const escapesNext = ch === "\\" && idx + 1 < cmd.length;

    if (mode === "sq") {
      if (ch === "'") {
        mode = "default";
      } else {
        current += ch;
        wordOpen = true;
      }
    } else if (mode === "dq") {
      if (escapesNext) {
        idx += 1;
        current += cmd[idx];
      } else if (ch === '"') {
        mode = "default";
      } else {
        current += ch;
        wordOpen = true;
      }
    } else if (ch === "'") {
      mode = "sq";
      wordOpen = true;
    } else if (ch === '"') {
      mode = "dq";
      wordOpen = true;
    } else if (escapesNext) {
      idx += 1;
      current += cmd[idx];
      wordOpen = true;
    } else if (/\s/.test(ch)) {
      flush();
    } else {
      current += ch;
      wordOpen = true;
    }
    idx += 1;
  }

  if (mode !== "default") {
    return null;
  }
  flush();
  return words;
}
|
|
545
|
+
/**
 * Expose the script passed to `sh -c '<script>'`-style invocations.
 *
 * For every token whose base name is a known shell interpreter, the first
 * later token that is `-c` or a short-flag cluster containing `c` (for
 * example `-lc`) is located, and the token right after it is taken as an
 * inline script. All scripts found are appended to the original command,
 * separated by ` ; `.
 *
 * @param {string} cmd - raw command line.
 * @returns {string} `cmd` unchanged when it cannot be tokenized, is empty, or
 *   carries no inline script; otherwise `cmd` followed by the scripts.
 */
function unwrapShellCArguments(cmd) {
  const words = tokenize(cmd);
  if (!words || words.length === 0) {
    return cmd;
  }

  // `-c` itself, or a single-dash flag cluster that includes the letter c.
  const carriesCommandFlag = (word) =>
    word === "-c" ||
    (word.startsWith("-") && !word.startsWith("--") && word.length > 1 && word.slice(1).indexOf("c") >= 0);

  const scripts = [];
  words.forEach((word, at) => {
    // Base name of the token: trailing slashes dropped, last path component.
    const baseName = word.replace(/\/+$/, "").split("/").pop() || "";
    if (!SHELL_INTERPRETERS.has(baseName)) {
      return;
    }
    const flagAt = words.findIndex((candidate, k) => k > at && carriesCommandFlag(candidate));
    if (flagAt !== -1 && flagAt + 1 < words.length) {
      scripts.push(words[flagAt + 1]);
    }
  });

  return scripts.length === 0 ? cmd : `${cmd} ; ${scripts.join(" ; ")}`;
}
|
|
575
|
+
/**
 * Decide whether a tool call counts as a policy violation in optimize mode.
 *
 * - Tools named in DENY_TOOL_NAMES are always denied.
 * - Tools not named in BASH_TOOL_NAMES are always allowed.
 * - For shell tools, the `command` string is examined: inline `-c` scripts
 *   are unwrapped, each command/process substitution body is checked
 *   recursively, and then every segment of the sanitised command is denied
 *   when it matches SEGMENT_DENY_RE, or when it matches REDIRECT_DENY_RE and
 *   is not a host-agent spawn (HOST_SPAWN_PREFIX_RE).
 *
 * @param {string} toolName - tool name, matched case-insensitively.
 * @param {{command?: string}} [toolInput] - tool input; only `command` is read.
 * @returns {boolean} `true` when the call should be treated as denied.
 */
function isDeniedInOptimizeMode(toolName, toolInput) {
  if (!toolName) {
    return false;
  }
  const tool = toolName.toLowerCase();
  if (DENY_TOOL_NAMES.has(tool)) {
    return true;
  }
  if (!BASH_TOOL_NAMES.has(tool)) {
    return false;
  }

  const rawCommand = (toolInput || {}).command;
  const command = typeof rawCommand === "string" ? rawCommand : "";
  if (!command) {
    return false;
  }

  const prepared = unwrapShellCArguments(command);

  // Anything run inside $( ), <( ), >( ) or backticks is judged as its own command.
  const nestedDenied = extractSubstitutionBodies(prepared).some((body) =>
    isDeniedInOptimizeMode("Bash", { command: body })
  );
  if (nestedDenied) {
    return true;
  }

  return splitSegments(stripInertQuoted(prepared))
    .map((piece) => piece.trim())
    .filter((segment) => segment !== "")
    .some(
      (segment) =>
        SEGMENT_DENY_RE.test(segment) ||
        (!HOST_SPAWN_PREFIX_RE.test(segment) && REDIRECT_DENY_RE.test(segment))
    );
}
|
|
606
|
+
/**
 * Switch a session record into optimize mode, stamping `optimize_mode_at`.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sid - session id whose record should be updated.
 * @returns {boolean} `true` when this call turned the mode on; `false` when
 *   the record is missing, has an `exp_id`, or is already in optimize mode.
 */
function markOptimizeMode(runDir, sid) {
  const recordPath = sessionFile(runDir, sid);
  const session = readJsonOrNull(recordPath);
  const eligible = Boolean(session) && !session.exp_id && !session.optimize_mode;
  if (!eligible) {
    return false;
  }
  // Same key order as two sequential assignments: flag first, timestamp second.
  Object.assign(session, { optimize_mode: true, optimize_mode_at: nowIso() });
  atomicWriteJson(recordPath, session);
  return true;
}
|
|
620
|
+
// Per-host patterns that recognise an optimize slash-command in prompt text.
// Every pattern requires the command's leading `/` to sit at the start of the
// text or after a character outside `[A-Za-z0-9_/:-]`, and matches
// case-insensitively up to a word boundary.
var OPTIMIZE_PROMPT_RES = {
  opencode: [
    /(?:^|[^A-Za-z0-9_/:-])\/optimize\b/i,
  ],
  openclaw: [
    /(?:^|[^A-Za-z0-9_/:-])\/optimize\b/i,
    /(?:^|[^A-Za-z0-9_/:-])\/skill\s+optimize\b/i,
  ],
  pi: [
    /(?:^|[^A-Za-z0-9_/:-])\/skill:optimize\b/i,
    /(?:^|[^A-Za-z0-9_/:-])\/optimize\b/i,
  ],
};
|
|
631
|
+
/**
 * Turn optimize mode on for a session when the prompt text contains the
 * host's optimize slash-command.
 *
 * Does nothing for an empty prompt, for a host without entries in
 * OPTIMIZE_PROMPT_RES, or when none of the host's patterns match.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sid - session id.
 * @param {string} host - host key used to select the pattern list.
 * @param {string} promptText - latest user prompt text.
 * @returns {void}
 */
function maybeMarkOptimizeFromPrompt(runDir, sid, host, promptText) {
  const matchers = promptText ? OPTIMIZE_PROMPT_RES[host] : undefined;
  if (matchers && matchers.some((pattern) => pattern.test(promptText))) {
    markOptimizeMode(runDir, sid);
  }
}
|
|
641
|
+
/**
 * Build the path of a session's policy-state JSON file:
 * `<injectRoot(runDir)>/policy_state/<sid>.json`.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sid - session id.
 * @returns {string} path to the policy-state file.
 */
function policyStateFile(runDir, sid) {
  const fileName = `${sid}.json`;
  return path.join(injectRoot(runDir), "policy_state", fileName);
}
|
|
644
|
+
/**
 * Load a session's policy state.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sid - session id.
 * @returns {object} the stored state, or a fresh empty object when
 *   `readJsonOrNull` yields a falsy value.
 */
function readPolicyState(runDir, sid) {
  const stored = readJsonOrNull(policyStateFile(runDir, sid));
  return stored || {};
}
|
|
647
|
+
/**
 * Store a session's policy state via `atomicWriteJson`.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sid - session id.
 * @param {object} data - state object to persist.
 * @returns {void}
 */
function writePolicyState(runDir, sid, data) {
  const target = policyStateFile(runDir, sid);
  atomicWriteJson(target, data);
}
|
|
650
|
+
/**
 * Record one more policy violation for a session and report whether this
 * occurrence should be blocked.
 *
 * The stored state gets the incremented `violation_count`, the offending tool
 * name (empty string when none) and `nudge_pending: true`. Odd-numbered
 * violations (1st, 3rd, ...) are reported as blocking.
 *
 * @param {string} runDir - evo run directory.
 * @param {string} sid - session id.
 * @param {string} [toolName] - name of the tool that triggered the violation.
 * @returns {boolean} `true` when the updated count is odd.
 */
function incrementAndShouldBlock(runDir, sid, toolName) {
  const policy = readPolicyState(runDir, sid);
  const violations = (policy.violation_count || 0) + 1;
  Object.assign(policy, {
    violation_count: violations,
    last_violation_tool: toolName || "",
    nudge_pending: true,
  });
  writePolicyState(runDir, sid, policy);
  return violations % 2 === 1;
}
|
|
659
|
+
|
|
660
|
+
// factory.ts
|
|
661
|
+
import * as crypto from "crypto";
|
|
662
|
+
/**
 * Build the extension entry point for one agent host.
 *
 * The returned `register(api)` subscribes four handlers on `api`:
 *  - `session_start`: register the session, mark it engaged and, on the first
 *    engagement, move its offset to the newest workspace event;
 *  - `before_provider_request`: detect the optimize slash-command in the
 *    latest user text, drain pending directives and append everything drained
 *    so far to the outgoing payload;
 *  - `tool_call`: in optimize mode, count denied tool calls and block every
 *    odd-numbered one with POLICY_NUDGE_TEMPLATE;
 *  - `turn_end`: in optimize mode, send a follow-up user message carrying any
 *    pending directives plus STOP_NUDGE_TEMPLATE.
 *
 * @param {string} host - host key; prefixes session ids and selects prompt patterns.
 * @returns {function(object): void} the `register` function for that host.
 */
function makeRegister(host) {
  // The id is a hash of the working directory (plus EVO_EXP_ID when set), so
  // the same process context always maps to the same session id.
  const deriveSessionId = () => {
    const experiment = process.env.EVO_EXP_ID || "";
    const cwd = process.cwd();
    const seed = experiment ? `${cwd}|${experiment}` : cwd;
    const digest = crypto.createHash("sha256").update(seed).digest("hex");
    return `${host}-${digest.slice(0, 12)}`;
  };

  // Resolve the run directory and session id, registering the session on
  // first sight. Returns null when no run directory is found.
  const ensureRegistered = () => {
    const runDir = findEvoRunDir();
    if (!runDir) {
      return null;
    }
    const sid = deriveSessionId();
    if (!isRegistered(runDir, sid)) {
      registerSession(runDir, sid, host, process.env.EVO_EXP_ID || null);
    }
    return { sid, runDir };
  };

  // Append `text` as a trailing user message, in whichever of the two payload
  // shapes (`input` array or `messages` array) is present; otherwise no-op.
  const appendToPayload = (event, text) => {
    if (Array.isArray(event.payload?.input)) {
      event.payload.input.push({
        role: "user",
        content: [{ type: "input_text", text }],
      });
      return;
    }
    if (Array.isArray(event.payload?.messages)) {
      event.payload.messages.push({
        role: "user",
        content: [{ type: "text", text }],
      });
    }
  };

  // True when any tool invocation recorded in the payload runs an `evo` command.
  // Any error while walking the payload is treated as "not found".
  const scanForEvoCommands = (payload) => {
    try {
      const inputItems = Array.isArray(payload?.input) ? payload.input : [];
      const inputHit = inputItems.some((item) => {
        const args = item?.arguments;
        if (typeof args === "string") {
          return isEvoCommand(args);
        }
        if (args && typeof args === "object") {
          return isEvoCommand(args.command ?? args.cmd ?? args.shell);
        }
        return false;
      });
      if (inputHit) {
        return true;
      }
      const messages = Array.isArray(payload?.messages) ? payload.messages : [];
      const messageHit = messages.some((message) => {
        const parts = Array.isArray(message?.content) ? message.content : [];
        return parts.some(
          (part) => part?.type === "tool_use" && isEvoCommand(part?.input?.command ?? part?.input?.cmd)
        );
      });
      if (messageHit) {
        return true;
      }
    } catch {}
    return false;
  };

  // Newest-first search for user text in one list of entries. Returns
  // undefined when nothing qualifies. `acceptEmptyString` controls whether an
  // empty string `content` ends the search (messages) or is skipped (input).
  const latestUserTextIn = (entries, acceptEmptyString) => {
    for (let i = entries.length - 1; i >= 0; i--) {
      const entry = entries[i];
      if (entry?.role !== "user") {
        continue;
      }
      const content = entry.content;
      if (typeof content === "string" && (acceptEmptyString || content)) {
        return content;
      }
      if (Array.isArray(content)) {
        for (const part of content) {
          if (typeof part?.text === "string" && part.text) {
            return part.text;
          }
        }
      }
    }
    return undefined;
  };

  // Text of the most recent user entry, looking at `payload.input` first and
  // `payload.messages` second; "" when none is found or the walk throws.
  const extractLatestUserText = (payload) => {
    try {
      const inputItems = Array.isArray(payload?.input) ? payload.input : [];
      const fromInput = latestUserTextIn(inputItems, false);
      if (fromInput !== undefined) {
        return fromInput;
      }
      const messages = Array.isArray(payload?.messages) ? payload.messages : [];
      const fromMessages = latestUserTextIn(messages, true);
      if (fromMessages !== undefined) {
        return fromMessages;
      }
    } catch {}
    return "";
  };

  // Session record for `ctx` when it is an orchestrator session (no exp_id)
  // with optimize mode on; null otherwise.
  const optimizeModeSession = (ctx) => {
    const session = getSession(ctx.runDir, ctx.sid);
    if (!session || session.exp_id || !session.optimize_mode) {
      return null;
    }
    return session;
  };

  return function register(api) {
    // Every directive text drained during this registration's lifetime; the
    // whole list is re-appended to each outgoing provider request.
    const drainedTexts = [];

    api.on("session_start", () => {
      const ctx = ensureRegistered();
      if (!ctx) {
        return;
      }
      if (markEngaged(ctx.runDir, ctx.sid)) {
        initOffsetToLatest(ctx.runDir, ctx.sid);
      }
    });

    api.on("before_provider_request", (event, _ctx) => {
      const ctx = ensureRegistered();
      if (!ctx) {
        return;
      }
      const promptText = extractLatestUserText(event.payload);
      maybeMarkOptimizeFromPrompt(ctx.runDir, ctx.sid, host, promptText);
      // NOTE(review): the scan result is discarded here — confirm whether it
      // was meant to drive markEngaged or can be dropped.
      scanForEvoCommands(event.payload);
      const drained = drainSession(ctx.runDir, ctx.sid);
      if (drained.text) {
        drainedTexts.push(drained.text);
      }
      if (drainedTexts.length === 0) {
        return;
      }
      appendToPayload(event, drainedTexts.join("\n"));
      return event.payload;
    });

    api.on("tool_call", (event, _ctx) => {
      const ctx = ensureRegistered();
      if (!ctx) {
        return;
      }
      if (!optimizeModeSession(ctx)) {
        return;
      }
      const toolName = event?.toolName ?? event?.tool_name;
      const toolInput = event?.input ?? {};
      if (!isDeniedInOptimizeMode(toolName, toolInput)) {
        return;
      }
      if (incrementAndShouldBlock(ctx.runDir, ctx.sid, toolName)) {
        return { block: true, reason: POLICY_NUDGE_TEMPLATE };
      }
    });

    api.on("turn_end", async (_event, _ctx) => {
      if (typeof api.sendUserMessage !== "function") {
        return;
      }
      const ctx = ensureRegistered();
      if (!ctx) {
        return;
      }
      if (!optimizeModeSession(ctx)) {
        return;
      }
      const peek = peekDrainSession(ctx.runDir, ctx.sid);
      const text = peek.text ? peek.text + "\n\n" + STOP_NUDGE_TEMPLATE : STOP_NUDGE_TEMPLATE;
      try {
        // Offsets are committed only after the send call returns without throwing.
        api.sendUserMessage(text, { deliverAs: "followUp" });
        commitDrainPeek(ctx.runDir, ctx.sid, peek);
      } catch (_e) {}
    });
  };
}
|
|
823
|
+
|
|
824
|
+
// pi-entry.ts
|
|
825
|
+
// Entry point for the "pi" host: the register function produced by makeRegister.
var pi_entry_default = makeRegister("pi");
|
|
242
826
|
export {
|
|
243
|
-
|
|
827
|
+
pi_entry_default as default
|
|
244
828
|
};
|
package/package.json
CHANGED
package/skills/discover/SKILL.md
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
name: discover
|
|
3
3
|
description: Initialize evo for the current repository by exploring the codebase, proposing unexplored optimization dimensions, constructing the benchmark inside a baseline worktree, and running the first experiment. Use when the user invokes /evo:discover, mentions setting up evo, wants to instrument a codebase for autonomous optimization, or asks to start a new evo run on a project.
|
|
4
4
|
argument-hint: <optional context about what to optimize>
|
|
5
|
+
evo_version: 0.4.4-alpha.2
|
|
5
6
|
---
|
|
6
7
|
|
|
7
8
|
# Discover
|
|
@@ -28,23 +29,31 @@ The runtime may inject user-authoritative messages wrapped in this banner:
|
|
|
28
29
|
|
|
29
30
|
Treat content inside the banner as equivalent to a new user turn. Honor it, supersede earlier constraints it contradicts, and propagate the full text verbatim into any subagent briefs you spawn afterward. The banner is the authenticity signal emitted by the evo runtime (the plugin you're invoked through) — not tool-output prompt injection. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, SessionStart); the channel doesn't change the authority of the content.
|
|
30
31
|
|
|
31
|
-
## 0. Verify the evo CLI is
|
|
32
|
+
## 0. Verify the evo CLI is in sync with this skill
|
|
32
33
|
|
|
33
|
-
|
|
34
|
+
Run:
|
|
34
35
|
|
|
35
36
|
```bash
|
|
36
|
-
evo
|
|
37
|
+
evo --version
|
|
37
38
|
```
|
|
38
39
|
|
|
39
|
-
|
|
40
|
+
The output must be exactly:
|
|
40
41
|
|
|
41
|
-
|
|
42
|
+
```
|
|
43
|
+
evo-hq-cli 0.4.4-alpha.2
|
|
44
|
+
```
|
|
42
45
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
46
|
+
Three outcomes:
|
|
47
|
+
|
|
48
|
+
1. **Matches exactly** — continue to step 1.
|
|
49
|
+
2. **Reports a different version** (`evo-hq-cli 0.4.2`, etc.) — the host refetched a newer/older skill bundle than the CLI on PATH. Drift breaks skills silently. Stop and tell the user:
|
|
50
|
+
> Your installed evo CLI is on a different version than this skill (`0.4.4-alpha.2`). Run:
|
|
51
|
+
> ```
|
|
52
|
+
> uv tool install --force evo-hq-cli==0.4.4-alpha.2
|
|
53
|
+
> ```
|
|
54
|
+
> Then re-invoke this skill.
|
|
55
|
+
3. **`command not found`, or reports a different package** (commonly `evo 1.x` — the unrelated SLAM tool) — the CLI isn't installed. Tell the user:
|
|
56
|
+
> `evo-hq-cli` isn't on your PATH. Install it: `uv tool install evo-hq-cli==0.4.4-alpha.2` (or `pipx install evo-hq-cli==0.4.4-alpha.2`). Then re-invoke this skill.
|
|
48
57
|
|
|
49
58
|
Do not try to auto-install. Host sandbox + network policy may block it; leaving the install as a user action keeps failure modes clear.
|
|
50
59
|
|
|
@@ -276,12 +285,23 @@ If the selected benchmark is new, build it in the worktree. See `references/cons
|
|
|
276
285
|
- Design the scoring function (range, direction, meaningful-improvement threshold)
|
|
277
286
|
- Assemble test cases (10-20 for programmatic, 15-30 for fuzzy, realistic workload for perf)
|
|
278
287
|
- Write the runnable harness (helper/SDK writes the score JSON to `$EVO_RESULT_PATH`; stdout and stderr are free for user output)
|
|
279
|
-
- Goodhart check (document gaming strategies,
|
|
288
|
+
- Goodhart check (document concrete gaming strategies and mitigation). Include validation/gold-answer leakage explicitly: assume subagents can see benchmark traces and gold answers, so detection is the defense, not concealment. Prefer a crisp deterministic cheat-check gate, such as a workspace-specific script that greps the target/worktree for exact validation strings and exits non-zero on a match; register it with `evo gate add ... --phase pre` only after the user explicitly opts in. Mention expected cost for any LLM-judge variant and reserve it for paraphrase cases because it is flakier than exact-string checks.
|
|
280
289
|
- Held-out validation slice (60–70% training, 30–40% held-out) if the benchmark is hand-written
|
|
281
290
|
|
|
282
291
|
Do not run separate determinism checks during setup. Note the benchmark's determinism property in `project.md` (step 12) and move on. Variance surfaces during optimization itself, where it can be handled with real evidence rather than guessed at during setup.
|
|
283
292
|
|
|
284
|
-
### 10b.
|
|
293
|
+
### 10b. Audit the harness for amortizable wins
|
|
294
|
+
|
|
295
|
+
Apply any change that preserves what we measure -- descendants inherit it. Changes that could move the score (including for a different target) belong in `/evo:optimize`, not here.
|
|
296
|
+
|
|
297
|
+
Patterns to scan for:
|
|
298
|
+
|
|
299
|
+
- Serial loop over independent tasks -> thread/process pool
|
|
300
|
+
- Constant prefix across tasks -> prompt cache
|
|
301
|
+
- Per-task setup that could be one-time -> hoist out of the loop
|
|
302
|
+
- Transport errors (429/5xx) counted as task failures -> retry
|
|
303
|
+
|
|
304
|
+
### 10c. Apply instrumentation
|
|
285
305
|
|
|
286
306
|
Based on the instrumentation mode passed to `evo init`:
|
|
287
307
|
|
|
@@ -292,7 +312,7 @@ Paths below are relative to this `SKILL.md` file (resolve them against the skill
|
|
|
292
312
|
|
|
293
313
|
The wire protocol is the same either way: `task_<id>.json` written to `$EVO_TRACES_DIR`, score JSON written to `$EVO_RESULT_PATH`. Stdout is free for user output.
|
|
294
314
|
|
|
295
|
-
###
|
|
315
|
+
### 10d. Cheap validation run
|
|
296
316
|
|
|
297
317
|
Before the full baseline, validate the toolchain with the cheapest possible end-to-end run (single task, smallest split, dry-run flag -- whatever is fastest). Run the check from the main repo root:
|
|
298
318
|
|
|
@@ -313,7 +333,7 @@ The check asserts `result.json` exists, is non-empty, and is a JSON object with
|
|
|
313
333
|
|
|
314
334
|
Fix any issues and re-validate before proceeding.
|
|
315
335
|
|
|
316
|
-
###
|
|
336
|
+
### 10e. Commit inside the worktree
|
|
317
337
|
|
|
318
338
|
Logical commits are ideal but not required. Minimal acceptable:
|
|
319
339
|
|
|
@@ -334,7 +354,7 @@ dist/
|
|
|
334
354
|
build/
|
|
335
355
|
```
|
|
336
356
|
|
|
337
|
-
Otherwise, running the benchmark once before committing will drag bytecode caches, `.pytest_cache/`, or stray `.evo/` writes into the experiment's tree and pollute every descendant branch. Belt-and-suspenders with step
|
|
357
|
+
Otherwise, running the benchmark once before committing will drag bytecode caches, `.pytest_cache/`, or stray `.evo/` writes into the experiment's tree and pollute every descendant branch. This is belt-and-suspenders with step 10d's "run from main repo root" rule: even if cwd slips, the ignore rules catch the stray files.
|
|
338
358
|
|
|
339
359
|
## 11. Run the baseline
|
|
340
360
|
|
|
@@ -93,12 +93,16 @@ Common pairings:
|
|
|
93
93
|
|
|
94
94
|
| Benchmark style | Minimum paired gate |
|
|
95
95
|
|---|---|
|
|
96
|
-
| Hand-written task pass rate |
|
|
96
|
+
| Hand-written task pass rate | Validation-slice score threshold; add an exact-leakage pre-gate when validation strings or gold answers could be copied into the target |
|
|
97
97
|
| Latency / performance | Correctness test (the optimized code must still produce the same outputs) |
|
|
98
|
-
| LLM-as-judge rating | Structural validity check
|
|
98
|
+
| LLM-as-judge rating | Structural validity check; optional LLM-judge cheat gate only for paraphrase leakage risks |
|
|
99
99
|
| Quality-of-output score | Sanity assertion that catches degenerate outputs (empty, constant, out-of-range) |
|
|
100
100
|
|
|
101
|
-
Add
|
|
101
|
+
Add gates with an explicit phase. Use `--phase pre` for gates that detect invalid edits before benchmark spend, including cheat-detection checks for leaked validation strings; use the default/post phase for benchmark-derived score-threshold gates that need scoring. For any gate that costs money, especially LLM-judge cheat checks, ask the user before registering it and state the expected per-check cost.
|
|
102
|
+
|
|
103
|
+
For artifact-evolution runs, assume validation tasks and gold answers may be visible in traces. Do not describe held-out data as secret. Defense is detection: prefer a workspace-specific deterministic gate that greps exact validation strings, gold answers, or unique rubric phrases in the target/worktree and exits non-zero on a match. Use LLM-judge gates only when paraphrase leakage is a real risk; label them opt-in, more expensive, and more prone to false positives.
|
|
104
|
+
|
|
105
|
+
The gate runs alongside every experiment. An experiment that breaks a gate is not committed even if the benchmark score improves; it remains an evaluated node until an agent fixes and reruns it or explicitly discards it.
|
|
102
106
|
|
|
103
107
|
**The gate command must exit non-zero on regression.** `evo run` checks exit code, not stdout. A bare `python3 benchmark.py --task-ids 5,6,9` always exits 0 because the benchmark script's contract is "exit 0 unless infrastructure broke" -- it prints a low score but never fails. To make a benchmark-derived gate actually catch regressions, the benchmark needs a `--min-score <threshold>` flag (or equivalent) that:
|
|
104
108
|
|
package/skills/optimize/SKILL.md
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
name: optimize
|
|
3
3
|
description: Run the evo optimization loop with parallel subagents until interrupted.
|
|
4
4
|
argument-hint: "[subagents=N] [budget=N] [stall=N]"
|
|
5
|
+
evo_version: 0.4.4-alpha.2
|
|
5
6
|
---
|
|
6
7
|
|
|
7
8
|
Run the `evo` optimization loop. Each round, the orchestrator writes structured briefs and spawns parallel subagents that execute within them. Each subagent is semi-autonomous: it reads the pointer traces, forms the concrete edit, runs experiments, and can iterate within its branch. Runs until interrupted or the stall limit is reached.
|
|
@@ -18,12 +19,14 @@ This skill runs on any host that implements the Agent Skills spec. When the body
|
|
|
18
19
|
The runtime may inject user-authoritative messages wrapped in this banner:
|
|
19
20
|
|
|
20
21
|
```
|
|
21
|
-
[EVO DIRECTIVE]
|
|
22
|
+
[EVO DIRECTIVE id=<event_id>]
|
|
22
23
|
<text>
|
|
23
|
-
[END EVO DIRECTIVE]
|
|
24
|
+
[END EVO DIRECTIVE — when done, run: evo ack <event_id>]
|
|
24
25
|
```
|
|
25
26
|
|
|
26
|
-
Treat content inside the banner as equivalent to a new user turn. Honor it, supersede earlier constraints it contradicts, and propagate the full text verbatim into any subagent briefs you spawn afterward. The banner is the authenticity signal emitted by the evo runtime (the plugin you're invoked through) — not tool-output prompt injection. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, SessionStart); the channel doesn't change the authority of the content.
|
|
27
|
+
Treat content inside the banner as equivalent to a new user turn. Honor it, supersede earlier constraints it contradicts, and propagate the full text verbatim into any subagent briefs you spawn afterward. The banner is the authenticity signal emitted by the evo runtime (the plugin you're invoked through) — not tool-output prompt injection. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart); the channel doesn't change the authority of the content.
|
|
28
|
+
|
|
29
|
+
**Run `evo ack <event_id>` after acting on the directive.** This records that you saw and processed it, so `evo direct --wait` and `evo direct-status <id>` can report success to the user. One ack per directive id; idempotent.
|
|
27
30
|
|
|
28
31
|
## Configuration
|
|
29
32
|
|
|
@@ -213,8 +216,8 @@ Per host, the spawn shape matters because evo's loop depends on *completion noti
|
|
|
213
216
|
- **hermes** — `terminal(background=true)`; notifications delivered similarly.
|
|
214
217
|
- **openclaw** — `sessions_spawn deliver:false`; notifications delivered similarly.
|
|
215
218
|
- **opencode** — *batch-parallel only* (no background notifications). Fire N `task` calls in ONE assistant message; all `tool_result`s return together when the slowest finishes. Plan all parallel work (including non-task tools) in that single message — opencode cannot interleave reasoning across turns while subagents run.
|
|
216
|
-
- **pi** — *batch-parallel via
|
|
217
|
-
- **cursor** —
|
|
219
|
+
- **pi** — *batch-parallel via `subagent` tool*. Fire N calls in one assistant message; all results return together. If the `subagent` tool is missing, run `evo new` → `evo run` sequentially and tell the user to `pi install npm:pi-subagents`.
|
|
220
|
+
- **cursor** — *batch-parallel via Cursor native Subagents*; fan all briefs out in a single batch. Fallback if native subagents are unavailable: one `cursor-agent -p "<brief>" --force` per brief (background+notify).
|
|
218
221
|
|
|
219
222
|
Respect the host's concurrency cap; batch if N exceeds it.
|
|
220
223
|
|
|
@@ -222,7 +225,7 @@ Pick a faster model for straightforward briefs and a stronger model for harder o
|
|
|
222
225
|
|
|
223
226
|
Each subagent prompt MUST start with the literal sentence:
|
|
224
227
|
|
|
225
|
-
> "First, load and follow the **evo subagent skill** (named `subagent` under the evo plugin in your host's skill registry — use your host's skill loader, not a filesystem path). Allocate your experiment via `evo new --parent <id>`, edit inside the returned worktree, evaluate via `evo run <exp_id>`. Do not skip these steps even if the brief looks simple."
|
|
228
|
+
> "First, load and follow the **evo subagent skill** (named `subagent` under the evo plugin in your host's skill registry — use your host's skill loader, not a filesystem path). Allocate your experiment via `evo new --parent <id>`, edit inside the returned worktree, evaluate via `evo run <exp_id>`. Do not skip these steps even if the brief looks simple. If `evo run` exits `GATE_FAILED`, fix the edit so it satisfies the inherited gate; do not weaken, bypass, delete, or argue with the gate unless the orchestrator explicitly changes the brief."
|
|
226
229
|
|
|
227
230
|
Then append:
|
|
228
231
|
- The four-field brief verbatim (objective, parent, boundaries/anti-patterns, pointer traces)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: report
|
|
3
|
+
description: Print the dashboard's dot chart (score over experiment order, status colors, best-path stair) inline in the terminal for every run in the workspace. Use when the user invokes /evo:report, asks for a quick score chart without opening the dashboard, or wants the scatter plot in chat output.
|
|
4
|
+
evo_version: 0.4.4-alpha.2
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Report
|
|
8
|
+
|
|
9
|
+
Render the dashboard's scatter plot as a colored terminal block, one chart per run, sized to the current terminal.
|
|
10
|
+
|
|
11
|
+
## What it shows
|
|
12
|
+
|
|
13
|
+
Mirrors the web dashboard's score scatter (left rail of `evo dashboard`):
|
|
14
|
+
|
|
15
|
+
- X = experiment creation order, Y = score
|
|
16
|
+
- Dot color by status: green = committed, red = failed, purple = active, grey = pending / evaluated / discarded / pruned
|
|
17
|
+
- ★ marks the current best committed experiment
|
|
18
|
+
- Yellow ring on dots that sit on the best-path spine (root → best)
|
|
19
|
+
- Yellow stair line traces cumulative-best across committed experiments
|
|
20
|
+
- ○ at the baseline for experiments that have no score yet (active / pending)
|
|
21
|
+
|
|
22
|
+
Every run in the workspace is rendered, stacked top-to-bottom, with a header line showing `run_id · target · metric`.
|
|
23
|
+
|
|
24
|
+
## How to invoke
|
|
25
|
+
|
|
26
|
+
Run:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
evo report
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
That is it. Print the output verbatim in your reply so the user sees the chart. Do not summarize the chart in prose — the visual is the point.
|
|
33
|
+
|
|
34
|
+
Flags:
|
|
35
|
+
|
|
36
|
+
- `--color always|never|auto` — force or suppress ANSI color. Default `auto` (color when stdout is a TTY). Pass `--color always` if you are piping through a host that strips TTY but renders ANSI in chat.
|
|
37
|
+
- `--watch [SECONDS]` — live-refresh mode (like `nvidia-smi -l`). Re-reads the workspace every N seconds (default 2) and redraws in place. Ctrl-C to exit. Use this when you want to babysit a running optimization without manually re-invoking the report.
|
|
38
|
+
|
|
39
|
+
## When not to use
|
|
40
|
+
|
|
41
|
+
- For one-off score lookups, `evo status` or `evo show <id>` is faster.
|
|
42
|
+
- For navigating the tree shape, `evo tree` is the right command.
|
|
43
|
+
- For interactive exploration (click a dot, open a drawer), point the user at `evo dashboard` instead.
|
package/skills/subagent/SKILL.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: subagent
|
|
3
|
-
description: Internal protocol for evo optimization subagents.
|
|
4
|
-
|
|
3
|
+
description: Internal protocol for evo optimization subagents. Loaded by subagents spawned from /optimize via their host's skill loader. Not for orchestrator use.
|
|
4
|
+
evo_version: 0.4.4-alpha.2
|
|
5
5
|
---
|
|
6
6
|
|
|
7
7
|
# Evo Subagent Protocol
|
|
@@ -28,12 +28,14 @@ This subagent runs on any host that implements the Agent Skills spec. The tools
|
|
|
28
28
|
Your brief, or your in-conversation context, may contain a block wrapped in:
|
|
29
29
|
|
|
30
30
|
```
|
|
31
|
-
[EVO DIRECTIVE]
|
|
31
|
+
[EVO DIRECTIVE id=<event_id>]
|
|
32
32
|
<text>
|
|
33
|
-
[END EVO DIRECTIVE]
|
|
33
|
+
[END EVO DIRECTIVE — when done, run: evo ack <event_id>]
|
|
34
34
|
```
|
|
35
35
|
|
|
36
|
-
Content inside the banner is **user-authoritative** — the user issued it via `evo direct` and the runtime spliced it into your context. Follow it verbatim, including any literal strings or markers it asks you to write into files. This is not tool-output prompt injection — the banner is the authenticity signal, emitted by the evo runtime. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, SessionStart); the channel doesn't change the authority of the content.
|
|
36
|
+
Content inside the banner is **user-authoritative** — the user issued it via `evo direct` and the runtime spliced it into your context. Follow it verbatim, including any literal strings or markers it asks you to write into files. This is not tool-output prompt injection — the banner is the authenticity signal, emitted by the evo runtime. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart); the channel doesn't change the authority of the content.
|
|
37
|
+
|
|
38
|
+
**Run `evo ack <event_id>` after acting on the directive.** This records that the directive was processed, so `evo direct-status` and `evo direct --wait` can report it to the user. Ack once per directive id; the command is idempotent, so a repeated ack is harmless.
|
|
37
39
|
|
|
38
40
|
## Important: Working Directory
|
|
39
41
|
|