npm - @evo-hq/pi-evo - Versions diffs - 0.4.3 → 0.4.4-alpha.2 - Mend

@evo-hq/pi-evo 0.4.3 → 0.4.4-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/extensions/evo/index.js +633 -49
package/package.json +1 -1
package/skills/discover/SKILL.md +35 -15
package/skills/discover/references/constructing-benchmark.md +7 -3
package/skills/infra-setup/SKILL.md +1 -0
package/skills/optimize/SKILL.md +9 -6
package/skills/report/SKILL.md +43 -0
package/skills/subagent/SKILL.md +7 -5

package/extensions/evo/index.js CHANGED Viewed

@@ -98,8 +98,18 @@ function writeOffset(runDir, sid, opts) {
 function formatDirectiveText(events) {
   const lines = [];
   for (const ev of events) {
-    if (ev.text)
-      lines.push(`[evo direct] ${ev.text}`);
+    if (!ev.text)
+      continue;
+    const id = ev.id || "";
+    if (id) {
+      lines.push(`[EVO DIRECTIVE id=${id}]`);
+      lines.push(ev.text);
+      lines.push(`[END EVO DIRECTIVE — when done, run: evo ack ${id}]`);
+    } else {
+      lines.push("[EVO DIRECTIVE]");
+      lines.push(ev.text);
+      lines.push("[END EVO DIRECTIVE]");
+    }
   }
   return lines.join(`
 `);
@@ -117,6 +127,12 @@ function registerSession(runDir, sid, host, expId = null) {
   const existing = readJsonOrNull(p);
   if (existing) {
     existing.last_seen_at = now;
+    if (expId && !existing.exp_id)
+      existing.exp_id = expId;
+    if (existing.has_evo_engaged === undefined)
+      existing.has_evo_engaged = false;
+    if (existing.engaged_at === undefined)
+      existing.engaged_at = null;
     atomicWriteJson(p, existing);
     return;
   }
@@ -128,9 +144,40 @@ function registerSession(runDir, sid, host, expId = null) {
     registered_at: now,
     last_seen_at: now,
     exp_id: expId,
-    parent_session_id: null
+    parent_session_id: null,
+    has_evo_engaged: false,
+    engaged_at: null
   };
   atomicWriteJson(p, rec);
+  initOffsetToLatest(runDir, sid);
+}
+function markEngaged(runDir, sid) {
+  const p = sessionFile(runDir, sid);
+  const rec = readJsonOrNull(p);
+  if (!rec)
+    return false;
+  if (rec.has_evo_engaged)
+    return false;
+  rec.has_evo_engaged = true;
+  rec.engaged_at = nowIso();
+  atomicWriteJson(p, rec);
+  return true;
+}
+function initOffsetToLatest(runDir, sid) {
+  const wsPath = workspaceEventsPath(runDir);
+  let latest = null;
+  if (fs.existsSync(wsPath)) {
+    const events = readEventsAfter(wsPath, null);
+    if (events.length > 0)
+      latest = events[events.length - 1].id;
+  }
+  writeOffset(runDir, sid, { workspaceId: latest });
+}
+var EVO_CMD_RE = /^\s*evo(\s|$)/;
+function isEvoCommand(command) {
+  if (!command || typeof command !== "string")
+    return false;
+  return EVO_CMD_RE.test(command);
 }
 function findEvoRunDir(cwd) {
   const envRunDir = process.env.EVO_RUN_DIR;
@@ -156,6 +203,40 @@ function findEvoRunDir(cwd) {
   }
   return null;
 }
+function peekDrainSession(runDir, sessionId) {
+  const sess = getSession(runDir, sessionId);
+  if (!sess) {
+    return { text: null, newWorkspaceOffset: null, newExpOffset: null };
+  }
+  const expId = sess.exp_id;
+  let events = [];
+  let newWorkspaceOffset = null;
+  let newExpOffset = null;
+  if (expId) {
+    const lastId = readOffset(runDir, sessionId, "exp");
+    const newEvents = readEventsAfter(expEventsPath(runDir, expId), lastId);
+    events = newEvents;
+    if (newEvents.length > 0)
+      newExpOffset = newEvents[newEvents.length - 1].id;
+  } else {
+    const lastId = readOffset(runDir, sessionId, "workspace");
+    const newEvents = readEventsAfter(workspaceEventsPath(runDir), lastId);
+    events = newEvents;
+    if (newEvents.length > 0)
+      newWorkspaceOffset = newEvents[newEvents.length - 1].id;
+  }
+  const text = events.length > 0 ? formatDirectiveText(events) : null;
+  return { text, newWorkspaceOffset, newExpOffset };
+}
+function commitDrainPeek(runDir, sessionId, peek) {
+  if (peek.newWorkspaceOffset || peek.newExpOffset) {
+    writeOffset(runDir, sessionId, {
+      workspaceId: peek.newWorkspaceOffset,
+      expId: peek.newExpOffset
+    });
+  }
+  unlinkIfExists(markerFile(runDir, sessionId));
+}
 function drainSession(runDir, sessionId) {
   const sess = getSession(runDir, sessionId);
   if (!sess) {
@@ -189,56 +270,559 @@ function drainSession(runDir, sessionId) {
   unlinkIfExists(markerFile(runDir, sessionId));
   return { text, newWorkspaceOffset, newExpOffset };
 }
-// index.ts
-import * as crypto from "crypto";
-function deriveSessionId() {
-  const hash = crypto.createHash("sha256").update(process.cwd()).digest("hex").slice(0, 12);
-  return `openclaw-${hash}`;
-}
-function register(api) {
-  const drainedTexts = [];
-  const ensureRegistered = () => {
-    const runDir = findEvoRunDir();
-    if (!runDir)
-      return null;
-    const sid = deriveSessionId();
-    if (!isRegistered(runDir, sid)) {
-      registerSession(runDir, sid, "openclaw");
-    }
-    return { sid, runDir };
-  };
-  const appendToPayload = (event, text) => {
-    if (Array.isArray(event.payload?.input)) {
-      event.payload.input.push({
-        role: "user",
-        content: [{ type: "input_text", text }]
-      });
-    } else if (Array.isArray(event.payload?.messages)) {
-      event.payload.messages.push({
-        role: "user",
-        content: [{ type: "text", text }]
-      });
+var POLICY_NUDGE_TEMPLATE = `[EVO POLICY]
+` + `Preventative block. You may have strayed from /evo:optimize protocol.
+` + `
+` + `When /optimize is active, the orchestrator should NOT edit files directly
+` + `or run experiment commands by hand. The protocol is: write briefs, spawn
+` + `subagents, let them do the edits and runs, then read the results.
+` + `
+` + "If you need to wait for subagents to finish, use `evo wait` — it blocks\n" + `until any experiment concludes, max 1h.
+` + `
+` + `Manual edits and shell commands that mutate files are discouraged unless
+` + `evo doesn't already expose the operation. If this block was actually
+` + "warranted (rare), run `evo exit-optimize-mode` to disable the safety\n" + `nudges, then retry the tool.
+` + `[END EVO POLICY]
+`;
+var STOP_NUDGE_TEMPLATE = `[EVO LOOP]
+` + `You are driving /evo:optimize autonomously. Don't stop, and don't ask
+` + `the user for direction — they explicitly invoked /optimize for hands-off
+` + "operation. They'll intervene via `evo direct` if they need to.\n" + `
+` + "If subagents are still running: run `evo wait` to block until the next\n" + `experiment concludes (or up to 1h). Then resume planning.
+` + `
+` + "If subagents are done and you have unread results: read `evo scratchpad`,\n" + `update annotations as needed, and plan + spawn the next round.
+` + `
+` + "Stop only if `evo status` shows the budget exhausted or you've hit the\n" + `stall limit. If so, print a final summary first. To suppress this
+` + `continuation loop for a legitimate one-off task, run
+` + "`evo exit-optimize-mode`.\n" + `[END EVO LOOP]
+`;
+var DENY_TOOL_NAMES = new Set([
+  "edit",
+  "write",
+  "notebookedit",
+  "notebook_edit",
+  "multiedit",
+  "multi_edit",
+  "edit_file",
+  "create_file",
+  "search_replace",
+  "str_replace",
+  "applypatch",
+  "apply_patch",
+  "delete_file",
+  "file_write",
+  "file_edit",
+  "patch"
+]);
+var BASH_TOOL_NAMES = new Set([
+  "bash",
+  "shell",
+  "exec",
+  "run_terminal_cmd",
+  "runterminalcmd",
+  "run_command",
+  "terminal",
+  "execute_code",
+  "execute"
+]);
+var SEGMENT_DENY_RE = /^\s*(?:nohup\s+)?(?:\S*\/)?(?:tee\b(?:\s+-[aiu]+)*\s+[^\s|&<>]+|sed\b[^|&;]*?\s-[a-zA-Z]*i[a-zA-Z]*\b|sed\b[^|&;]*?\s--in-place\b|perl\b[^|&;]*?\s-[a-zA-Z]*i[a-zA-Z]*\b|awk\b[^|&;]*?\s-i\s+inplace\b|(?:mv|cp|rm|mkdir|rmdir|touch|chmod|chown|chgrp|ln|rsync)(?:\s|$)|dd\b[^|&;]*?\bof=|curl\b[^|&;]*?\s-[a-zA-Z]*[oO][a-zA-Z=]*(?:\s|$)|curl\b[^|&;]*?\s--output(?:=|\s)|curl\b[^|&;]*?\s--remote-name\b|wget(?:\s|$)|patch(?:\s|$)|install(?:\s|$)|truncate(?:\s|$)|git\b(?:\s+(?:-[a-zA-Z]\S*|--[a-z][a-z-]*(?:=\S+)?)(?:\s+\S+)?)*?\s+(?:apply|checkout|restore|reset|clean|switch|merge|rebase|am|stash(?!\s+(?:list|show)\b)|cherry-pick|pull|clone|revert|worktree)\b|(?:vim|vi|nano|emacs)(?:\s|$))/;
+var REDIRECT_DENY_RE = /(?:(?<![<\d&])>>?\s*[^\s|&<>;]+|\b\d+>>?\s*(?!&)[^\s|&<>;]+|&>>?\s*(?!&)[^\s|&<>;]+|>\|\s*[^\s|&<>;]+)/;
+var HOST_SPAWN_PREFIX_RE = /^\s*(?:nohup\s+)?(?:claude(?:\s|$)|codex(?:\s|$)|cursor-agent(?:\s|$)|opencode(?:\s|$)|hermes(?:\s|$)|openclaw(?:\s|$)|pi(?:\s|$)|pi-coding-agent(?:\s|$))/;
+var UNQUOTED_SEPARATOR_RE = /[;\n]|&&|\|\||\|(?!\|)|(?<![>&])&(?![&>])(?!\s*$)/;
+function splitSegments(cmd) {
+  return cmd.split(UNQUOTED_SEPARATOR_RE);
+}
+function extractSubstitutionBodies(seg) {
+  const bodies = [];
+  let i = 0;
+  const n = seg.length;
+  let state = "default";
+  const findBalancedParenClose = (start) => {
+    let depth = 1;
+    let k = start;
+    let inner = "default";
+    while (k < n && depth > 0) {
+      const cc = seg[k];
+      if (inner === "sq") {
+        if (cc === "'")
+          inner = "default";
+        k++;
+        continue;
+      }
+      if (inner === "dq") {
+        if (cc === "\\" && k + 1 < n) {
+          k += 2;
+          continue;
+        }
+        if (cc === '"') {
+          inner = "default";
+          k++;
+          continue;
+        }
+      }
+      if (cc === "\\" && k + 1 < n) {
+        k += 2;
+        continue;
+      }
+      if (cc === "'" && inner === "default") {
+        inner = "sq";
+      } else if (cc === '"' && inner === "default") {
+        inner = "dq";
+      } else if (cc === "(") {
+        depth++;
+      } else if (cc === ")") {
+        depth--;
+      }
+      k++;
     }
+    return depth === 0 ? k : -1;
   };
-  api.on("session_start", () => {
-    ensureRegistered();
+  while (i < n) {
+    const c = seg[i];
+    if (state === "sq") {
+      if (c === "'")
+        state = "default";
+      i++;
+      continue;
+    }
+    if (state === "dq") {
+      if (c === "\\" && i + 1 < n) {
+        i += 2;
+        continue;
+      }
+      if (c === '"') {
+        state = "default";
+        i++;
+        continue;
+      }
+    }
+    if (c === "\\" && i + 1 < n) {
+      i += 2;
+      continue;
+    }
+    if (c === "'" && state === "default") {
+      state = "sq";
+      i++;
+      continue;
+    }
+    if (c === '"' && state === "default") {
+      state = "dq";
+      i++;
+      continue;
+    }
+    if (c === "$" && i + 1 < n && seg[i + 1] === "(") {
+      if (i + 2 < n && seg[i + 2] === "(") {
+        i += 3;
+        continue;
+      }
+      const end = findBalancedParenClose(i + 2);
+      if (end !== -1) {
+        bodies.push(seg.slice(i + 2, end - 1));
+        i = end;
+        continue;
+      }
+    }
+    if ((c === "<" || c === ">") && i + 1 < n && seg[i + 1] === "(" && state === "default") {
+      const end = findBalancedParenClose(i + 2);
+      if (end !== -1) {
+        bodies.push(seg.slice(i + 2, end - 1));
+        i = end;
+        continue;
+      }
+    }
+    if (c === "`" && state !== "sq") {
+      let j = i + 1;
+      while (j < n && seg[j] !== "`") {
+        if (seg[j] === "\\" && j + 1 < n) {
+          j += 2;
+          continue;
+        }
+        j++;
+      }
+      if (j < n) {
+        bodies.push(seg.slice(i + 1, j));
+        i = j + 1;
+        continue;
+      }
+    }
+    i++;
+  }
+  return bodies;
+}
+function stripInertQuoted(cmd) {
+  let out = cmd.replace(/'[^']*'/g, "''");
+  out = out.replace(/"(?:[^"\\]|\\.)*"/g, (match) => {
+    if (match.indexOf("$(") >= 0 || match.indexOf("`") >= 0)
+      return match;
+    return '""';
   });
-  api.on("before_provider_request", (event, _ctx) => {
-    const ctx = ensureRegistered();
-    if (!ctx)
-      return;
-    const result = drainSession(ctx.runDir, ctx.sid);
-    if (result.text)
-      drainedTexts.push(result.text);
-    if (drainedTexts.length === 0)
-      return;
-    const combined = drainedTexts.join(`
+  const buf = [];
+  let i = 0;
+  const n = out.length;
+  while (i < n) {
+    if (out[i] === "$" && i + 2 < n && out[i + 1] === "(" && out[i + 2] === "(") {
+      let depth = 2;
+      let j = i + 3;
+      while (j < n && depth > 0) {
+        if (out[j] === "(")
+          depth++;
+        else if (out[j] === ")")
+          depth--;
+        j++;
+      }
+      if (depth === 0) {
+        i = j;
+        continue;
+      }
+    }
+    buf.push(out[i]);
+    i++;
+  }
+  return buf.join("");
+}
+var SHELL_INTERPRETERS = new Set(["bash", "sh", "zsh", "dash", "ash"]);
+function tokenize(cmd) {
+  const out = [];
+  let buf = "";
+  let state = "default";
+  let inToken = false;
+  for (let i = 0;i < cmd.length; i++) {
+    const c = cmd[i];
+    if (state === "sq") {
+      if (c === "'") {
+        state = "default";
+        continue;
+      }
+      buf += c;
+      inToken = true;
+      continue;
+    }
+    if (state === "dq") {
+      if (c === "\\" && i + 1 < cmd.length) {
+        buf += cmd[++i];
+        continue;
+      }
+      if (c === '"') {
+        state = "default";
+        continue;
+      }
+      buf += c;
+      inToken = true;
+      continue;
+    }
+    if (c === "'") {
+      state = "sq";
+      inToken = true;
+      continue;
+    }
+    if (c === '"') {
+      state = "dq";
+      inToken = true;
+      continue;
+    }
+    if (c === "\\" && i + 1 < cmd.length) {
+      buf += cmd[++i];
+      inToken = true;
+      continue;
+    }
+    if (/\s/.test(c)) {
+      if (inToken) {
+        out.push(buf);
+        buf = "";
+        inToken = false;
+      }
+      continue;
+    }
+    buf += c;
+    inToken = true;
+  }
+  if (state !== "default")
+    return null;
+  if (inToken)
+    out.push(buf);
+  return out;
+}
+function unwrapShellCArguments(cmd) {
+  const tokens = tokenize(cmd);
+  if (!tokens || tokens.length === 0)
+    return cmd;
+  const appended = [];
+  for (let i = 0;i < tokens.length; i++) {
+    const tok = tokens[i];
+    const name = tok.replace(/\/+$/, "").split("/").pop() || "";
+    if (!SHELL_INTERPRETERS.has(name))
+      continue;
+    let j = i + 1;
+    while (j < tokens.length) {
+      const t = tokens[j];
+      if (t === "-c") {
+        if (j + 1 < tokens.length)
+          appended.push(tokens[j + 1]);
+        break;
+      }
+      if (t.startsWith("-") && !t.startsWith("--") && t.length > 1 && t.slice(1).indexOf("c") >= 0) {
+        if (j + 1 < tokens.length)
+          appended.push(tokens[j + 1]);
+        break;
+      }
+      j++;
+    }
+  }
+  if (appended.length === 0)
+    return cmd;
+  return cmd + " ; " + appended.join(" ; ");
+}
+function isDeniedInOptimizeMode(toolName, toolInput) {
+  if (!toolName)
+    return false;
+  const t = toolName.toLowerCase();
+  if (DENY_TOOL_NAMES.has(t))
+    return true;
+  if (!BASH_TOOL_NAMES.has(t))
+    return false;
+  const input = toolInput || {};
+  const cmd = typeof input.command === "string" ? input.command : "";
+  if (!cmd)
+    return false;
+  const prepared = unwrapShellCArguments(cmd);
+  for (const body of extractSubstitutionBodies(prepared)) {
+    if (isDeniedInOptimizeMode("Bash", { command: body }))
+      return true;
+  }
+  const sanitized = stripInertQuoted(prepared);
+  for (const rawSeg of splitSegments(sanitized)) {
+    const seg = rawSeg.trim();
+    if (!seg)
+      continue;
+    if (SEGMENT_DENY_RE.test(seg))
+      return true;
+    if (HOST_SPAWN_PREFIX_RE.test(seg))
+      continue;
+    if (REDIRECT_DENY_RE.test(seg))
+      return true;
+  }
+  return false;
+}
+function markOptimizeMode(runDir, sid) {
+  const p = sessionFile(runDir, sid);
+  const rec = readJsonOrNull(p);
+  if (!rec)
+    return false;
+  if (rec.exp_id)
+    return false;
+  if (rec.optimize_mode)
+    return false;
+  rec.optimize_mode = true;
+  rec.optimize_mode_at = nowIso();
+  atomicWriteJson(p, rec);
+  return true;
+}
+var OPTIMIZE_PROMPT_RES = {
+  opencode: [/(?:^|[^A-Za-z0-9_/:-])\/optimize\b/i],
+  openclaw: [
+    /(?:^|[^A-Za-z0-9_/:-])\/optimize\b/i,
+    /(?:^|[^A-Za-z0-9_/:-])\/skill\s+optimize\b/i
+  ],
+  pi: [
+    /(?:^|[^A-Za-z0-9_/:-])\/skill:optimize\b/i,
+    /(?:^|[^A-Za-z0-9_/:-])\/optimize\b/i
+  ]
+};
+function maybeMarkOptimizeFromPrompt(runDir, sid, host, promptText) {
+  if (!promptText)
+    return;
+  const patterns = OPTIMIZE_PROMPT_RES[host];
+  if (!patterns)
+    return;
+  if (!patterns.some((re) => re.test(promptText)))
+    return;
+  markOptimizeMode(runDir, sid);
+}
+function policyStateFile(runDir, sid) {
+  return path.join(injectRoot(runDir), "policy_state", `${sid}.json`);
+}
+function readPolicyState(runDir, sid) {
+  return readJsonOrNull(policyStateFile(runDir, sid)) || {};
+}
+function writePolicyState(runDir, sid, data) {
+  atomicWriteJson(policyStateFile(runDir, sid), data);
+}
+function incrementAndShouldBlock(runDir, sid, toolName) {
+  const state = readPolicyState(runDir, sid);
+  const count = (state.violation_count || 0) + 1;
+  state.violation_count = count;
+  state.last_violation_tool = toolName || "";
+  state.nudge_pending = true;
+  writePolicyState(runDir, sid, state);
+  return count % 2 === 1;
+}
+// factory.ts
+import * as crypto from "crypto";
+function makeRegister(host) {
+  function deriveSessionId() {
+    const expId = process.env.EVO_EXP_ID || "";
+    const seed = expId ? `${process.cwd()}|${expId}` : process.cwd();
+    const hash = crypto.createHash("sha256").update(seed).digest("hex").slice(0, 12);
+    return `${host}-${hash}`;
+  }
+  return function register(api) {
+    const drainedTexts = [];
+    const ensureRegistered = () => {
+      const runDir = findEvoRunDir();
+      if (!runDir)
+        return null;
+      const sid = deriveSessionId();
+      if (!isRegistered(runDir, sid)) {
+        const expId = process.env.EVO_EXP_ID || null;
+        registerSession(runDir, sid, host, expId);
+      }
+      return { sid, runDir };
+    };
+    const appendToPayload = (event, text) => {
+      if (Array.isArray(event.payload?.input)) {
+        event.payload.input.push({
+          role: "user",
+          content: [{ type: "input_text", text }]
+        });
+      } else if (Array.isArray(event.payload?.messages)) {
+        event.payload.messages.push({
+          role: "user",
+          content: [{ type: "text", text }]
+        });
+      }
+    };
+    api.on("session_start", () => {
+      const ctx = ensureRegistered();
+      if (!ctx)
+        return;
+      if (markEngaged(ctx.runDir, ctx.sid)) {
+        initOffsetToLatest(ctx.runDir, ctx.sid);
+      }
+    });
+    const scanForEvoCommands = (payload) => {
+      try {
+        const items = Array.isArray(payload?.input) ? payload.input : [];
+        for (const it of items) {
+          const args = it?.arguments;
+          if (typeof args === "string" && isEvoCommand(args))
+            return true;
+          if (typeof args === "object" && args) {
+            const cmd = args.command ?? args.cmd ?? args.shell;
+            if (typeof cmd === "string" && isEvoCommand(cmd))
+              return true;
+          }
+        }
+        const msgs = Array.isArray(payload?.messages) ? payload.messages : [];
+        for (const m of msgs) {
+          const content = Array.isArray(m?.content) ? m.content : [];
+          for (const c of content) {
+            if (c?.type === "tool_use") {
+              const cmd = c?.input?.command ?? c?.input?.cmd;
+              if (typeof cmd === "string" && isEvoCommand(cmd))
+                return true;
+            }
+          }
+        }
+      } catch {}
+      return false;
+    };
+    const extractLatestUserText = (payload) => {
+      try {
+        const items = Array.isArray(payload?.input) ? payload.input : [];
+        for (let i = items.length - 1;i >= 0; i--) {
+          const it = items[i];
+          if (it?.role !== "user")
+            continue;
+          if (typeof it.content === "string" && it.content)
+            return it.content;
+          if (Array.isArray(it.content)) {
+            for (const c of it.content) {
+              if (typeof c?.text === "string" && c.text)
+                return c.text;
+            }
+          }
+        }
+        const msgs = Array.isArray(payload?.messages) ? payload.messages : [];
+        for (let i = msgs.length - 1;i >= 0; i--) {
+          const m = msgs[i];
+          if (m?.role !== "user")
+            continue;
+          if (typeof m.content === "string")
+            return m.content;
+          if (Array.isArray(m.content)) {
+            for (const c of m.content) {
+              if (typeof c?.text === "string" && c.text)
+                return c.text;
+            }
+          }
+        }
+      } catch {}
+      return "";
+    };
+    api.on("before_provider_request", (event, _ctx) => {
+      const ctx = ensureRegistered();
+      if (!ctx)
+        return;
+      const promptText = extractLatestUserText(event.payload);
+      maybeMarkOptimizeFromPrompt(ctx.runDir, ctx.sid, host, promptText);
+      scanForEvoCommands(event.payload);
+      const result = drainSession(ctx.runDir, ctx.sid);
+      if (result.text)
+        drainedTexts.push(result.text);
+      if (drainedTexts.length === 0)
+        return;
+      const combined = drainedTexts.join(`
 `);
-    appendToPayload(event, combined);
-    return event.payload;
-  });
+      appendToPayload(event, combined);
+      return event.payload;
+    });
+    api.on("tool_call", (event, _ctx) => {
+      const ctx = ensureRegistered();
+      if (!ctx)
+        return;
+      const sess = getSession(ctx.runDir, ctx.sid);
+      if (!sess)
+        return;
+      if (sess.exp_id)
+        return;
+      if (!sess.optimize_mode)
+        return;
+      const toolName = event?.toolName ?? event?.tool_name;
+      const toolInput = event?.input ?? {};
+      if (!isDeniedInOptimizeMode(toolName, toolInput))
+        return;
+      if (incrementAndShouldBlock(ctx.runDir, ctx.sid, toolName)) {
+        return { block: true, reason: POLICY_NUDGE_TEMPLATE };
+      }
+    });
+    api.on("turn_end", async (_event, _ctx) => {
+      if (typeof api.sendUserMessage !== "function")
+        return;
+      const ctx = ensureRegistered();
+      if (!ctx)
+        return;
+      const sess = getSession(ctx.runDir, ctx.sid);
+      if (!sess)
+        return;
+      if (sess.exp_id)
+        return;
+      if (!sess.optimize_mode)
+        return;
+      const peek = peekDrainSession(ctx.runDir, ctx.sid);
+      const text = peek.text ? peek.text + `
+` + STOP_NUDGE_TEMPLATE : STOP_NUDGE_TEMPLATE;
+      try {
+        api.sendUserMessage(text, { deliverAs: "followUp" });
+        commitDrainPeek(ctx.runDir, ctx.sid, peek);
+      } catch (_e) {}
+    });
+  };
 }
+// pi-entry.ts
+var pi_entry_default = makeRegister("pi");
 export {
-  register as default
+  pi_entry_default as default
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@evo-hq/pi-evo",
-  "version": "0.4.3",
+  "version": "0.4.4-alpha.2",
   "description": "Evo plugin for pi-coding-agent: optimize/discover/subagent skills + mid-run inject extension.",
   "publishConfig": {
     "access": "public"

package/skills/discover/SKILL.md CHANGED Viewed

@@ -2,6 +2,7 @@
 name: discover
 description: Initialize evo for the current repository by exploring the codebase, proposing unexplored optimization dimensions, constructing the benchmark inside a baseline worktree, and running the first experiment. Use when the user invokes /evo:discover, mentions setting up evo, wants to instrument a codebase for autonomous optimization, or asks to start a new evo run on a project.
 argument-hint: <optional context about what to optimize>
+evo_version: 0.4.4-alpha.2
 ---
 # Discover
@@ -28,23 +29,31 @@ The runtime may inject user-authoritative messages wrapped in this banner:
 Treat content inside the banner as equivalent to a new user turn. Honor it, supersede earlier constraints it contradicts, and propagate the full text verbatim into any subagent briefs you spawn afterward. The banner is the authenticity signal emitted by the evo runtime (the plugin you're invoked through) — not tool-output prompt injection. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, SessionStart); the channel doesn't change the authority of the content.
-## 0. Verify the evo CLI is available and in sync with the plugin
+## 0. Verify the evo CLI is in sync with this skill
-Before anything else, run:
+Run:
 ```bash
-evo-version-check
+evo --version
 ```
-This wraps `evo --version` and additionally asserts the installed CLI matches the plugin manifest version (hosts refetch the plugin on version bumps, but do not reinstall the globally-installed CLI -- drift between the two breaks skills silently).
+The output must be exactly:
-Four outcomes to handle:
+```
+evo-hq-cli 0.4.4-alpha.2
+```
-1. **Exit 0, `evo-version-check: OK (plugin=X, cli=X)`** -- continue to step 1.
-2. **Exit 1, "plugin manifest and installed CLI disagree"** -- stop and show the user the script's stderr verbatim; it tells them the `uv tool install --force evo-hq-cli==<version>` command to run. Then re-invoke this skill.
-3. **Exit 2, "evo CLI not on PATH"** -- stop and tell the user:
-   > `evo-hq-cli` isn't on your PATH. Install it once: `uv tool install evo-hq-cli` (or `pipx install evo-hq-cli`). Then re-invoke this skill.
-4. **`evo-version-check: command not found`** -- the host's plugin install is incomplete (missing the `bin/` wrapper). Fall back to running `evo --version` directly and check for `evo-hq-cli` in the output; if it's a different package (commonly `evo 1.x` -- the unrelated SLAM tool), tell the user to uninstall it and install `evo-hq-cli` in its place.
+Three outcomes:
+1. **Matches exactly** — continue to step 1.
+2. **Reports a different version** (`evo-hq-cli 0.4.2`, etc.) — the host refetched a newer/older skill bundle than the CLI on PATH. Drift breaks skills silently. Stop and tell the user:
+   > Your installed evo CLI is on a different version than this skill (`0.4.4-alpha.2`). Run:
+   > ```
+   > uv tool install --force evo-hq-cli==0.4.4-alpha.2
+   > ```
+   > Then re-invoke this skill.
+3. **`command not found`, or reports a different package** (commonly `evo 1.x` — the unrelated SLAM tool) — the CLI isn't installed. Tell the user:
+   > `evo-hq-cli` isn't on your PATH. Install it: `uv tool install evo-hq-cli==0.4.4-alpha.2` (or `pipx install evo-hq-cli==0.4.4-alpha.2`). Then re-invoke this skill.
 Do not try to auto-install. Host sandbox + network policy may block it; leaving the install as a user action keeps failure modes clear.
@@ -276,12 +285,23 @@ If the selected benchmark is new, build it in the worktree. See `references/cons
 - Design the scoring function (range, direction, meaningful-improvement threshold)
 - Assemble test cases (10-20 for programmatic, 15-30 for fuzzy, realistic workload for perf)
 - Write the runnable harness (helper/SDK writes the score JSON to `$EVO_RESULT_PATH`; stdout and stderr are free for user output)
-- Goodhart check (document gaming strategies, mitigate each with a gate or held-out slice)
+- Goodhart check (document concrete gaming strategies and mitigation). Include validation/gold-answer leakage explicitly: assume subagents can see benchmark traces and gold answers, so detection is the defense, not concealment. Prefer a crisp deterministic cheat-check gate, such as a workspace-specific script that greps the target/worktree for exact validation strings and exits non-zero on a match; register it with `evo gate add ... --phase pre` only after the user explicitly opts in. Mention expected cost for any LLM-judge variant and reserve it for paraphrase cases because it is flakier than exact-string checks.
 - Held-out validation slice (60/70 training, 30/40 held-out) if the benchmark is hand-written
 Do not run separate determinism checks during setup. Note the benchmark's determinism property in `project.md` (step 12) and move on. Variance surfaces during optimization itself, where it can be handled with real evidence rather than guessed at during setup.
-### 10b. Apply instrumentation
+### 10b. Audit the harness for amortizable wins
+Apply any change that preserves what we measure -- descendants inherit it. Changes that could move the score (including for a different target) belong in `/evo:optimize`, not here.
+Patterns to scan for:
+- Serial loop over independent tasks -> thread/process pool
+- Constant prefix across tasks -> prompt cache
+- Per-task setup that could be one-time -> hoist out of the loop
+- Transport errors (429/5xx) counted as task failures -> retry
+### 10c. Apply instrumentation
 Based on the instrumentation mode passed to `evo init`:
@@ -292,7 +312,7 @@ Paths below are relative to this `SKILL.md` file (resolve them against the skill
 The wire protocol is the same either way: `task_<id>.json` written to `$EVO_TRACES_DIR`, score JSON written to `$EVO_RESULT_PATH`. Stdout is free for user output.
-### 10c. Cheap validation run
+### 10d. Cheap validation run
 Before the full baseline, validate the toolchain with the cheapest possible end-to-end run (single task, smallest split, dry-run flag -- whatever is fastest). Run the check from the main repo root:
@@ -313,7 +333,7 @@ The check asserts `result.json` exists, is non-empty, and is a JSON object with
 Fix any issues and re-validate before proceeding.
-### 10d. Commit inside the worktree
+### 10e. Commit inside the worktree
 Logical commits are ideal but not required. Minimal acceptable:
@@ -334,7 +354,7 @@ dist/
 build/
 ```
-Otherwise, running the benchmark once before committing will drag bytecode caches, `.pytest_cache/`, or stray `.evo/` writes into the experiment's tree and pollute every descendant branch. Belt-and-suspenders with step 10c's "run from main repo root" rule: even if cwd slips, the ignore catches it.
+Otherwise, running the benchmark once before committing will drag bytecode caches, `.pytest_cache/`, or stray `.evo/` writes into the experiment's tree and pollute every descendant branch. Belt-and-suspenders with step 10d's "run from main repo root" rule: even if cwd slips, the ignore catches it.
 ## 11. Run the baseline

package/skills/discover/references/constructing-benchmark.md CHANGED Viewed

@@ -93,12 +93,16 @@ Common pairings:
 | Benchmark style | Minimum paired gate |
 |---|---|
-| Hand-written task pass rate | Held-out slice (other tasks, not visible during optimization) |
+| Hand-written task pass rate | Validation-slice score threshold; add an exact-leakage pre-gate when validation strings or gold answers could be copied into the target |
 | Latency / performance | Correctness test (the optimized code must still produce the same outputs) |
-| LLM-as-judge rating | Structural validity check (output parses / is well-formed) |
+| LLM-as-judge rating | Structural validity check; optional LLM-judge cheat gate only for paraphrase leakage risks |
 | Quality-of-output score | Sanity assertion that catches degenerate outputs (empty, constant, out-of-range) |
-Add the gate via `evo gate add root --name <name> --command <command>` during the discover flow. The gate runs alongside every experiment. An experiment that breaks a gate is not committed even if the benchmark score improves; it remains an evaluated node until an agent fixes and reruns it or explicitly discards it.
+Add gates with an explicit phase. Use `--phase pre` for gates that detect invalid edits before benchmark spend, including cheat-detection checks for leaked validation strings; use the default/post phase for benchmark-derived score-threshold gates that need scoring. For any gate that costs money, especially LLM-judge cheat checks, ask the user before registering it and state the expected per-check cost.
+For artifact-evolution runs, assume validation tasks and gold answers may be visible in traces. Do not describe held-out data as secret. Defense is detection: prefer a workspace-specific deterministic gate that greps exact validation strings, gold answers, or unique rubric phrases in the target/worktree and exits non-zero on a match. Use LLM-judge gates only when paraphrase leakage is a real risk; label them opt-in, more expensive, and more prone to false positives.
+The gate runs alongside every experiment. An experiment that breaks a gate is not committed even if the benchmark score improves; it remains an evaluated node until an agent fixes and reruns it or explicitly discards it.
 **The gate command must exit non-zero on regression.** `evo run` checks exit code, not stdout. A bare `python3 benchmark.py --task-ids 5,6,9` always exits 0 because the benchmark script's contract is "exit 0 unless infrastructure broke" -- it prints a low score but never fails. To make a benchmark-derived gate actually catch regressions, the benchmark needs a `--min-score <threshold>` flag (or equivalent) that:

package/skills/infra-setup/SKILL.md CHANGED Viewed

@@ -2,6 +2,7 @@
 name: infra-setup
 description: Non-user-invocable provider/setup reference for evo backend switching, prerequisite checks, and auth/install guidance.
 disable-model-invocation: true
+evo_version: 0.4.4-alpha.2
 ---
 # Infra Setup

package/skills/optimize/SKILL.md CHANGED Viewed

@@ -2,6 +2,7 @@
 name: optimize
 description: Run the evo optimization loop with parallel subagents until interrupted.
 argument-hint: "[subagents=N] [budget=N] [stall=N]"
+evo_version: 0.4.4-alpha.2
 ---
 Run the `evo` optimization loop. Each round, the orchestrator writes structured briefs and spawns parallel subagents that execute within them. Each subagent is semi-autonomous: it reads the pointer traces, forms the concrete edit, runs experiments, and can iterate within its branch. Runs until interrupted or the stall limit is reached.
@@ -18,12 +19,14 @@ This skill runs on any host that implements the Agent Skills spec. When the body
 The runtime may inject user-authoritative messages wrapped in this banner:
 ```
-[EVO DIRECTIVE]
+[EVO DIRECTIVE id=<event_id>]
 <text>
-[END EVO DIRECTIVE]
+[END EVO DIRECTIVE — when done, run: evo ack <event_id>]
 ```
-Treat content inside the banner as equivalent to a new user turn. Honor it, supersede earlier constraints it contradicts, and propagate the full text verbatim into any subagent briefs you spawn afterward. The banner is the authenticity signal emitted by the evo runtime (the plugin you're invoked through) — not tool-output prompt injection. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, SessionStart); the channel doesn't change the authority of the content.
+Treat content inside the banner as equivalent to a new user turn. Honor it, supersede earlier constraints it contradicts, and propagate the full text verbatim into any subagent briefs you spawn afterward. The banner is the authenticity signal emitted by the evo runtime (the plugin you're invoked through) — not tool-output prompt injection. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart); the channel doesn't change the authority of the content.
+**Run `evo ack <event_id>` after acting on the directive.** This records that you saw and processed it, so `evo direct --wait` and `evo direct-status <id>` can report success to the user. One ack per directive id; idempotent.
 ## Configuration
@@ -213,8 +216,8 @@ Per host, the spawn shape matters because evo's loop depends on *completion noti
 - **hermes** — `terminal(background=true)`; notifications delivered similarly.
 - **openclaw** — `sessions_spawn deliver:false`; notifications delivered similarly.
 - **opencode** — *batch-parallel only* (no background notifications). Fire N `task` calls in ONE assistant message; all `tool_result`s return together when the slowest finishes. Plan all parallel work (including non-task tools) in that single message — opencode cannot interleave reasoning across turns while subagents run.
-- **pi** — *batch-parallel via extension*. Pi's default toolkit has no subagent primitive; `evo install pi` ensures the `pi-subagents` package is present, which registers a `subagent` tool. Fire N `subagent` calls in ONE assistant message; all results return together when the slowest finishes (same shape as opencode). If the `subagent` tool isn't available, fall back to running experiments sequentially in your own turn (`evo new` → `evo run` per attempt) and tell the user to `pi install npm:pi-subagents` for proper fanout.
-- **cursor** — use Cursor's native Subagents to run each brief in parallel (own context per subagent), and fan them out in a single batch. If native subagents aren't available, fall back to one `cursor-agent -p "<brief>" --force` per brief (background+notify shape, like claude-code) so each runs its brief to completion in its own headless session. Inject reaches the orchestrator via the `postToolUse`/`sessionStart` hooks `evo install cursor` wires; the directive banner can arrive on either channel.
+- **pi** — *batch-parallel via `subagent` tool*. Fire N calls in one assistant message; all results return together. If the tool's missing, run `evo new` → `evo run` sequentially and tell the user to `pi install npm:pi-subagents`.
+- **cursor** — *batch-parallel via Cursor native Subagents*; fan all briefs out in a single batch. Fallback if native subagents are unavailable: one `cursor-agent -p "<brief>" --force` per brief (background+notify).
 Respect the host's concurrency cap; batch if N exceeds it.
@@ -222,7 +225,7 @@ Pick a faster model for straightforward briefs and a stronger model for harder o
 Each subagent prompt MUST start with the literal sentence:
-> "First, load and follow the **evo subagent skill** (named `subagent` under the evo plugin in your host's skill registry — use your host's skill loader, not a filesystem path). Allocate your experiment via `evo new --parent <id>`, edit inside the returned worktree, evaluate via `evo run <exp_id>`. Do not skip these steps even if the brief looks simple."
+> "First, load and follow the **evo subagent skill** (named `subagent` under the evo plugin in your host's skill registry — use your host's skill loader, not a filesystem path). Allocate your experiment via `evo new --parent <id>`, edit inside the returned worktree, evaluate via `evo run <exp_id>`. Do not skip these steps even if the brief looks simple. If `evo run` exits `GATE_FAILED`, fix the edit so it satisfies the inherited gate; do not weaken, bypass, delete, or argue with the gate unless the orchestrator explicitly changes the brief."
 Then append:
 - The four-field brief verbatim (objective, parent, boundaries/anti-patterns, pointer traces)

package/skills/report/SKILL.md ADDED Viewed

@@ -0,0 +1,43 @@
+---
+name: report
+description: Print the dashboard's dot chart (score over experiment order, status colors, best-path stair) inline in the terminal for every run in the workspace. Use when the user invokes /evo:report, asks for a quick score chart without opening the dashboard, or wants the scatter plot in chat output.
+evo_version: 0.4.4-alpha.2
+---
+# Report
+Render the dashboard's scatter plot as a colored terminal block, one chart per run, sized to the current terminal.
+## What it shows
+Mirrors the web dashboard's score scatter (left rail of `evo dashboard`):
+- X = experiment creation order, Y = score
+- Dot color by status: green = committed, red = failed, purple = active, grey = pending / evaluated / discarded / pruned
+- ★ marks the current best committed experiment
+- Yellow ring on dots that sit on the best-path spine (root → best)
+- Yellow stair line traces cumulative-best across committed experiments
+- ○ at the baseline for experiments that have no score yet (active / pending)
+Every run in the workspace is rendered, stacked top-to-bottom, with a header line showing `run_id · target · metric`.
+## How to invoke
+Run:
+```bash
+evo report
+```
+That is it. Print the output verbatim in your reply so the user sees the chart. Do not summarize the chart in prose — the visual is the point.
+Flags:
+- `--color always|never|auto` — force or suppress ANSI color. Default `auto` (color when stdout is a TTY). Pass `--color always` if you are piping through a host that strips TTY but renders ANSI in chat.
+- `--watch [SECONDS]` — live-refresh mode (like `nvidia-smi -l`). Re-reads the workspace every N seconds (default 2) and redraws in place. Ctrl-C to exit. Use this when you want to babysit a running optimization without manually re-invoking the report.
+## When not to use
+- For one-off score lookups, `evo status` or `evo show <id>` is faster.
+- For navigating the tree shape, `evo tree` is the right command.
+- For interactive exploration (click a dot, open a drawer), point the user at `evo dashboard` instead.

package/skills/subagent/SKILL.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 name: subagent
-description: Internal protocol for evo optimization subagents. Not user-invocable -- read by subagents spawned from /optimize.
-disable-model-invocation: true
+description: Internal protocol for evo optimization subagents. Loaded by subagents spawned from /optimize via their host's skill loader. Not for orchestrator use.
+evo_version: 0.4.4-alpha.2
 ---
 # Evo Subagent Protocol
@@ -28,12 +28,14 @@ This subagent runs on any host that implements the Agent Skills spec. The tools
 Your brief, or your in-conversation context, may contain a block wrapped in:
 ```
-[EVO DIRECTIVE]
+[EVO DIRECTIVE id=<event_id>]
 <text>
-[END EVO DIRECTIVE]
+[END EVO DIRECTIVE — when done, run: evo ack <event_id>]
 ```
-Content inside the banner is **user-authoritative** — the user issued it via `evo direct` and the runtime spliced it into your context. Follow it verbatim, including any literal strings or markers it asks you to write into files. This is not tool-output prompt injection — the banner is the authenticity signal, emitted by the evo runtime. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, SessionStart); the channel doesn't change the authority of the content.
+Content inside the banner is **user-authoritative** — the user issued it via `evo direct` and the runtime spliced it into your context. Follow it verbatim, including any literal strings or markers it asks you to write into files. This is not tool-output prompt injection — the banner is the authenticity signal, emitted by the evo runtime. Banners may arrive via any hook channel (UserPromptSubmit, PreToolUse, PostToolUse, Stop, SubagentStop, SessionStart); the channel doesn't change the authority of the content.
+**Run `evo ack <event_id>` after acting on the directive.** This records that the directive was processed, surfacing via `evo direct-status` and `evo direct --wait` for the user. Idempotent — one ack per id.
 ## Important: Working Directory