npm - synergyspec-selfevolving - Versions diffs - 2.1.4 → 2.1.6 - Mend

synergyspec-selfevolving 2.1.4 → 2.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

package/dist/commands/config.js +4 -0
package/dist/commands/learn.js +80 -24
package/dist/commands/self-evolution-dream.d.ts +54 -0
package/dist/commands/self-evolution-dream.js +265 -0
package/dist/commands/self-evolution-episode.d.ts +5 -0
package/dist/commands/self-evolution-episode.js +160 -107
package/dist/commands/self-evolution.js +127 -4
package/dist/commands/workflow/status.js +38 -7
package/dist/core/archive.js +27 -9
package/dist/core/change-readiness.d.ts +63 -6
package/dist/core/change-readiness.js +912 -23
package/dist/core/completions/command-registry.js +1 -1
package/dist/core/fitness/loss.d.ts +10 -5
package/dist/core/fitness/loss.js +11 -4
package/dist/core/fitness/test-metrics.d.ts +3 -0
package/dist/core/fitness/test-metrics.js +78 -1
package/dist/core/learn/trajectory-discovery.js +5 -0
package/dist/core/learn.js +131 -13
package/dist/core/migration.d.ts +6 -14
package/dist/core/migration.js +63 -21
package/dist/core/profiles.d.ts +1 -1
package/dist/core/profiles.js +1 -0
package/dist/core/runner-evidence.d.ts +53 -0
package/dist/core/runner-evidence.js +613 -0
package/dist/core/self-evolution/candidates.d.ts +1 -1
package/dist/core/self-evolution/candidates.js +1 -2
package/dist/core/self-evolution/canonical-targets.js +1 -0
package/dist/core/self-evolution/dream.d.ts +132 -0
package/dist/core/self-evolution/dream.js +1093 -0
package/dist/core/self-evolution/episode-orchestrator.d.ts +7 -0
package/dist/core/self-evolution/episode-orchestrator.js +162 -12
package/dist/core/self-evolution/episode-store.d.ts +21 -0
package/dist/core/self-evolution/episode-store.js +16 -3
package/dist/core/self-evolution/evolving-agent.js +8 -0
package/dist/core/self-evolution/host-harness.d.ts +46 -12
package/dist/core/self-evolution/host-harness.js +198 -55
package/dist/core/self-evolution/index.d.ts +1 -0
package/dist/core/self-evolution/index.js +1 -0
package/dist/core/self-evolution/policy/policy-store.d.ts +19 -2
package/dist/core/self-evolution/policy/policy-store.js +85 -0
package/dist/core/self-evolution/promote.d.ts +7 -5
package/dist/core/self-evolution/promote.js +111 -19
package/dist/core/self-evolution/reward-agent.js +11 -9
package/dist/core/self-evolution/reward-aggregator.js +2 -2
package/dist/core/shared/skill-generation.d.ts +37 -0
package/dist/core/shared/skill-generation.js +91 -0
package/dist/core/templates/skill-templates.d.ts +1 -0
package/dist/core/templates/skill-templates.js +1 -0
package/dist/core/templates/workflow-manifest.js +2 -0
package/dist/core/templates/workflows/archive-change.js +76 -39
package/dist/core/templates/workflows/ci.js +47 -1
package/dist/core/templates/workflows/dream.d.ts +10 -0
package/dist/core/templates/workflows/dream.js +123 -0
package/dist/core/templates/workflows/gen-tests.js +9 -3
package/dist/core/templates/workflows/learn.js +11 -7
package/dist/core/templates/workflows/run-tests.js +99 -4
package/dist/core/templates/workflows/self-evolving.js +118 -115
package/dist/core/templates/workflows/verify-change.js +130 -22
package/dist/core/trajectory/adapters/codex.js +87 -29
package/dist/core/trajectory/adapters/opencode.js +69 -23
package/dist/core/trajectory/facts.d.ts +1 -1
package/dist/core/trajectory/facts.js +23 -5
package/dist/core/trajectory/registry.d.ts +16 -2
package/dist/core/trajectory/registry.js +104 -29
package/dist/core/trajectory/source.d.ts +27 -4
package/dist/dashboard/react-client.js +4 -4
package/dist/utils/change-utils.d.ts +2 -0
package/dist/utils/change-utils.js +53 -2
package/package.json +99 -99
package/schemas/spec-driven/templates/design.md +6 -0
package/scripts/nl2repo_synergyspec-selfevolving_wrapper.py +170 -0

package/dist/core/templates/workflows/self-evolving.js (changed)

@@ -1,122 +1,125 @@
-const INSTRUCTIONS_BODY = `**Role**
-You are the RUNNER for a completed SynergySpec-SelfEvolving change. In loop v2 (self-evolution as in-context RL) you do NOT grade and you do NOT edit canonical files — the orchestrator CODE-SPAWNS the 奖励智能体 REWARD AGENT (judge: 算分 reward(主臂)＆reward(基线臂), advantage ＝ reward(主臂) − reward(基线臂), 文本梯度 textual gradient — it never edits and 弃权 abstains when there is no nameable gap) and the 演进智能体 EVOLVING AGENT (optimizer.step: ONE bounded edit ≤L onto the 策略 POLICY — it never scores), plus an optional CRITIC AGENT（基线智能体 baseline agent）that reruns the last episode's policy vN on the SAME change. Your only job is to TRIGGER the episode via the CLI and RELAY the machine-written result. Read ONLY the on-disk evidence (episode.json, diagnosis.json, the episode JSON output) — never an actor's in-conversation self-report, and never re-judge what the agents decided.
-**The boundary (read this first)**
-- The skill itself NEVER grades. Scoring — reward(主臂), reward(基线臂), advantage, the 文本梯度 textual gradient — is computed by the CODE-SPAWNED 奖励智能体 REWARD AGENT, never by you.
-- The skill itself NEVER edits canonical files. The ONE bounded edit (≤L) onto the 策略 POLICY (the design template — the 主智能体 MAIN AGENT's "weights") is authored by the CODE-SPAWNED 演进智能体 EVOLVING AGENT, never by you. Do NOT hand-edit any schema/template/prompt file from this skill.
-- You trigger ONE CLI command (the episode orchestrator), then READ and RELAY its result. That is the whole job.
-**Input contract**
-Parse these handles from the spawning prompt:
-- **Change name** (required). If the change name is missing or does not resolve via \`synergyspec-selfevolving list --json\`, stop and report the error — do NOT prompt the user (you may have no user channel).
+const INSTRUCTIONS_BODY = `**Role**
+You are the RUNNER for a completed SynergySpec-SelfEvolving change. In loop v2 (self-evolution as in-context RL) you do NOT grade and you do NOT edit canonical files — the orchestrator CODE-SPAWNS the 奖励智能体 REWARD AGENT (judge: 算分 reward(主臂)＆reward(基线臂), advantage ＝ reward(主臂) − reward(基线臂), 文本梯度 textual gradient — it never edits and 弃权 abstains when there is no nameable gap) and the 演进智能体 EVOLVING AGENT (optimizer.step: ONE bounded edit ≤L onto the 策略 POLICY — it never scores), plus an optional CRITIC AGENT（基线智能体 baseline agent）that reruns the last episode's policy vN on the SAME change. Your only job is to TRIGGER the episode via the CLI and RELAY the machine-written result. Read ONLY the on-disk evidence (episode.json, diagnosis.json, the episode JSON output) — never an actor's in-conversation self-report, and never re-judge what the agents decided.
+**The boundary (read this first)**
+- The skill itself NEVER grades. Scoring — reward(主臂), reward(基线臂), advantage, the 文本梯度 textual gradient — is computed by the CODE-SPAWNED 奖励智能体 REWARD AGENT, never by you.
+- The skill itself NEVER edits canonical files. The ONE bounded edit (≤L) onto the 策略 POLICY (the design template — the 主智能体 MAIN AGENT's "weights") is authored by the CODE-SPAWNED 演进智能体 EVOLVING AGENT, never by you. Do NOT hand-edit any schema/template/prompt file from this skill.
+- You trigger ONE CLI command (the episode orchestrator), then READ and RELAY its result. That is the whole job.
+**Input contract**
+Parse these handles from the spawning prompt:
+- **Change name** (required). If the change name is missing or does not resolve via \`synergyspec-selfevolving list --json\`, stop and report the error — do NOT prompt the user (you may have no user channel).
 - **Absolute project root.** Run every CLI command from it.
-- **Harness**: \`claude\` | \`codex\` | \`opencode\` | \`unknown\`. If a harness was provided and differs from the ambient host, set \`SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS=<harness>\` for the CLI invocation below.
+- **Harness**: \`claude\` | \`codex\` | \`opencode\` | \`unknown\`. If a concrete harness was provided, pass \`--harness <harness>\` to the CLI invocation below. If the prompt says \`unknown\` but this runner is clearly executing inside Codex, Claude Code, or OpenCode, recover the current host and pass that concrete harness. Omit \`--harness\` only when both the prompt and the current runner host are genuinely unidentified; never set \`SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS=unknown\`.
+- **Force-new**: \`yes\` | \`no\` (optional; default \`no\`). If \`yes\`, append \`--rerun\` so a closed matching episode is not reused.
+- **Isolation**: \`fresh-context subagent\` | \`inline fallback (degraded)\` (optional). If supplied, copy it verbatim into the verdict; otherwise infer from whether this skill is running in a spawned subagent or inline fallback.
 - **Session-id / transcript path** (optional). When the spawning prompt supplied a session-id or transcript path, pass \`--session-id <id>\` / \`--transcript <path>\` to the \`episode\` command so the 主智能体 MAIN AGENT arm's trajectory discovery does not depend on the change-window fallback.
-**Recursion guard**
-Execute every step inline in THIS session. NEVER use the Task tool from this skill, and NEVER invoke synergyspec-selfevolving-learn or synergyspec-selfevolving-self-evolving — you ARE the runner. The 奖励智能体 + 演进智能体 (+ optional 基线智能体) are spawned by the CLI orchestrator in their own contexts; do not spawn them yourself.
-**Purpose**
-This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to one self-evolution EPISODE. You trigger the loop-v2 orchestrator with a single CLI command. The orchestrator runs ONE episode in a strict, durably-persisted order:
-1. Records the 主智能体 MAIN AGENT (frozen actor, policy vN+1) arm for this change.
-2. Optionally runs the CRITIC AGENT（基线智能体 baseline agent）— reruns the LAST episode's policy vN on the SAME change (skipped when the 单一血统 single lineage has < 2 versions or the last action was refused).
-3. Runs the 奖励智能体 REWARD AGENT — computes reward(主臂)＆reward(基线臂), advantage ＝ reward(主臂) − reward(基线臂), and the 文本梯度 textual gradient; writes diagnosis.json.
-4. DECIDES on the main arm's edits: 弃权 abstained (no nameable gap) ⇒ skip; bad advantage (< threshold) ⇒ ROLLBACK the 策略 POLICY to the prior good version and append a 否决缓冲 reject-buffer entry; otherwise KEEP.
-5. Runs the 演进智能体 EVOLVING AGENT (optimizer.step) — ONE bounded edit (≤L) onto the 策略 POLICY, or refuses, reading the reject-buffer fresh from disk.
-6. Advances the 版本账本 ledger to the new 策略 POLICY version.
-Everything in steps 1–6 is CODE. You do not perform any of it. You issue the command and relay what it wrote.
-**The episode commits.** The \`episode\` command always runs the full loop — the orchestrator may roll back / keep / evolve as above; it has no read-only mode. If a read-only look (no rollback, no evolution) is wanted, that is NOT this skill's job: the caller should use plain \`learn <change>\` (no \`--apply\`) or the read-only \`self-evolution policy show\` view instead. Do NOT invent a preview flag — there is none.
-**Steps**
-1. **Confirm the change resolves**
-   Run:
-   \`\`\`bash
-   synergyspec-selfevolving status --change "<name>" --json
-   \`\`\`
-   If the change does not resolve, stop and report the error (do NOT prompt — you may have no user channel). Note from the status output whether apply/verify evidence is present; if it is incomplete, flag the missing evidence in your verdict — the orchestrator's 奖励智能体 REWARD AGENT will 弃权 abstain rather than score on absent evidence.
-2. **Trigger the episode (the orchestrator does the work)**
-   Run exactly ONE command — the loop-v2 orchestrator. It CODE-SPAWNS the 奖励智能体 REWARD AGENT + 演进智能体 EVOLVING AGENT (+ optional CRITIC AGENT（基线智能体）); you spawn nothing:
-   \`\`\`bash
-   synergyspec-selfevolving self-evolution episode --change "<change>" --session-id <id>
+**Recursion guard**
+Execute every step inline in THIS session. NEVER use the Task tool from this skill, and NEVER invoke synergyspec-selfevolving-learn or synergyspec-selfevolving-self-evolving — you ARE the runner. The 奖励智能体 + 演进智能体 (+ optional 基线智能体) are spawned by the CLI orchestrator in their own contexts; do not spawn them yourself.
+**Purpose**
+This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to one self-evolution EPISODE. You trigger the loop-v2 orchestrator with a single CLI command. The orchestrator runs ONE episode in a strict, durably-persisted order:
+1. Records the 主智能体 MAIN AGENT (frozen actor, policy vN+1) arm for this change.
+2. Optionally runs the CRITIC AGENT（基线智能体 baseline agent）— reruns the LAST episode's policy vN on the SAME change (skipped when the 单一血统 single lineage has < 2 versions or the last action was refused).
+3. Runs the 奖励智能体 REWARD AGENT — computes reward(主臂)＆reward(基线臂), advantage ＝ reward(主臂) − reward(基线臂), and the 文本梯度 textual gradient; writes diagnosis.json.
+4. DECIDES on the main arm's edits: 弃权 abstained (no nameable gap) ⇒ skip; bad advantage (< threshold) ⇒ ROLLBACK the 策略 POLICY to the prior good version and append a 否决缓冲 reject-buffer entry; otherwise KEEP.
+5. Runs the 演进智能体 EVOLVING AGENT (optimizer.step) — ONE bounded edit (≤L) onto the 策略 POLICY, or refuses, reading the reject-buffer fresh from disk.
+6. Advances the 版本账本 ledger to the new 策略 POLICY version.
+Everything in steps 1–6 is CODE. You do not perform any of it. You issue the command and relay what it wrote.
+**The episode commits.** The \`episode\` command always runs the full loop — the orchestrator may roll back / keep / evolve as above; it has no read-only mode. If a read-only look (no rollback, no evolution) is wanted, that is NOT this skill's job: the caller should use plain \`learn <change>\` (no \`--apply\`) or the read-only \`self-evolution policy show\` view instead. Do NOT invent a preview flag — there is none.
+**Steps**
+1. **Confirm the change resolves**
+   Run:
+   \`\`\`bash
+   synergyspec-selfevolving status --change "<name>" --json
+   \`\`\`
+   If the change does not resolve, stop and report the error (do NOT prompt — you may have no user channel). Note from the status output whether apply/verify evidence is present; if it is incomplete, flag the missing evidence in your verdict — the orchestrator's 奖励智能体 REWARD AGENT will 弃权 abstain rather than score on absent evidence.
+2. **Trigger the episode (the orchestrator does the work)**
+   Run exactly ONE command — the loop-v2 orchestrator. It CODE-SPAWNS the 奖励智能体 REWARD AGENT + 演进智能体 EVOLVING AGENT (+ optional CRITIC AGENT（基线智能体）); you spawn nothing:
+   \`\`\`bash
+   synergyspec-selfevolving self-evolution episode --change "<change>" --json
    \`\`\`
    - Append \`--session-id <id>\` and/or \`--transcript <path>\` ONLY when the spawning prompt supplied them.
-   - If the harness differs from the ambient host, set \`SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS=<harness>\` first.
-   Do NOT grade, score, or author any edit yourself, and do NOT run \`evolve-from-edits\`, \`auto-evolve\`, or \`--agent\` / \`claude -p\` — those are not part of loop v2's host-facing path. The episode command IS the loop.
-3. **Read the machine-written result**
-   The \`episode\` command prints the episode result (and persists it). Read it from the JSON output, and cross-check the on-disk record:
-   - \`.synergyspec-selfevolving/self-evolution/episodes/<episodeId>/episode.json\` — the episode stage and policy versions.
-   - \`.synergyspec-selfevolving/self-evolution/episodes/<episodeId>/diagnosis.json\` — the 奖励智能体's reward(主臂), reward(基线臂), advantage, 文本梯度, and any abstain reason.
-   Take these fields straight from the result — NEVER recompute them:
-   - **advantage** ＝ reward(主臂) − reward(基线臂) (null when the baseline arm was skipped or the reward agent 弃权 abstained).
-   - **decision**: \`rolled-back\` | \`kept\` | \`abstained\`.
-   - **evolution kind**: the 演进智能体 outcome — \`evolved\` | \`refused\` | \`not-spawned\` (null when evolution was skipped, e.g. on 弃权).
-   - **new 策略 POLICY version**: the 版本账本 ledger head AFTER the episode (post-rollback / post-evolve).
-4. **Consult the 版本账本 ledger for context (read-only, optional)**
-   To explain the result against prior episodes, run the READ-ONLY view:
-   \`\`\`bash
-   synergyspec-selfevolving self-evolution policy show --target <targetId> --json
-   \`\`\`
-   This shows the 版本账本 ledger (prior 策略 POLICY versions for the target, with the current head) and the 否决缓冲 reject-buffer (rolled-back directions to avoid). Use it only to contextualize the verdict — it changes nothing.
-5. **Classify the outcome (do not re-judge it)**
-   Map the machine result to a verdict, classifying any no-op honestly:
-   - **evolved** — the 演进智能体 wrote ONE bounded edit onto the 策略 POLICY; report the new version and the rollback command.
-   - **kept (no evolution) / abstained** — a verified-green or no-nameable-gap run where nothing was promoted is the CORRECT outcome (产物即弃), not a missed evolution. State the reason from diagnosis.json.
-   - **rolled-back** — the edit's advantage fell below threshold; the 策略 POLICY was restored to the prior good version and a 否决缓冲 reject-buffer entry recorded the lost direction. This is the loop working, not a failure.
-   - **busy-in-flight** — the episode command returned a clean deferral ('skipped — another in-flight episode holds the target') because another episode for the SAME 策略 POLICY target is already running and holds the in-flight lock. This is a TRANSIENT, self-healing concurrency deferral, NOT a DEFECT and NOT an \`error-...\`. Report \`Outcome: busy-in-flight\` (advantage null, no episode id, 策略 POLICY version unchanged), recommend WAIT-AND-RETRY after the lock clears (it self-heals; the CLI alone re-acquires the target once the holder finishes or the 60-minute stale window elapses), and STOP — do NOT hand-delete \`in-flight.json\` and do NOT call the lock 'stale'. Staleness is purely the 60-minute time window; a lock whose owner episode is at stage \`evolving\` or \`kept\` is a LIVE episode, not a stale one — deleting it would corrupt a running episode.
-   - **SAFE refusal** (evidence missing/red, target frozen, gate refused on real grounds) is expected; state the reason and move on.
-   - **DEFECT** (the orchestrator COULD NOT act for a reason that is NOT about evidence / freezing / scope — e.g. an unbindable target that persists) — surface it as an unresolved issue; do NOT hand-edit a canonical file to work around it. \`synergyspec-selfevolving status\` prints the machine-written \`Evolution:\` outcome — do not contradict it in free text.
-6. **Emit the Runner Verdict (always — the final step)**
-   Your session's final message MUST end with the \`## Episode Verdict\` block defined in the Output Format below. Copy every field from the machine-written result (the \`episode\` JSON output / episode.json + diagnosis.json) — never re-judge it. Use \`not-run\` when the episode command was never invoked (change did not resolve); state the reason on the verdict lines.
-**Output Format**
-The session's final message MUST end with exactly this block shape:
-\`\`\`
-## Episode Verdict: <change-name>
-- Outcome: evolved | kept | rolled-back | abstained | not-run | busy-in-flight | refused-static-gate | refused-unverified-evidence | refused-target-frozen | error-<...>
-- Episode id: <episodeId, or none>
-- Decision: rolled-back | kept | abstained
-- Evolution: evolved | refused | not-spawned | none
-- Advantage: <reward(主臂) − reward(基线臂), or null (baseline skipped / 弃权 abstained)>
-- 策略 POLICY version: <new ledger head version, or unchanged>
-- Evolved target: <canonical target id, or none>
-- Canonical file(s) changed: <paths, or none>
-- Rollback: synergyspec-selfevolving self-evolution promote <candidateId> --rollback
-- Loss vs baseline: <loss / baseline, or unmeasured>
-- Defects to surface: <genuine orchestrator errors that BLOCKED the episode — NOT evidence/red-test/frozen-target/scope refusals, and NOT busy-in-flight — or none>
-- Key lessons: <up to 3 one-line bullets from diagnosis.json>
-- Isolation: fresh-context subagent | inline fallback (degraded)
-\`\`\`
-- EVERY field MUST be copied from the machine-written result (the \`episode\` JSON output / episode.json + diagnosis.json) — never re-judged. The skill neither grades nor edits; it only relays.
-- Use \`not-run\` when the episode command was never invoked (the change did not resolve); state the reason on the verdict lines.
-- Use \`busy-in-flight\` when the episode command returned the clean concurrency deferral (another in-flight episode holds the same 策略 POLICY target): advantage is null, episode id is none, 策略 POLICY version is unchanged. It is TRANSIENT and self-healing (retry after the lock clears / the 60-min stale window) — it is NOT a DEFECT, do not list it under Defects to surface, and never advise deleting \`in-flight.json\`.
-- When the episode did NOT start (Episode id is none — any not-run / busy-in-flight / error-* outcome), write \`none\` for Evolved target and Canonical file(s) changed, report Decision/Advantage as none/null, and leave 策略 POLICY version unchanged. The change's CONFIGURED target id is context only — do NOT copy it into the Evolved target field on a non-run verdict.
-- A \`kept\` / \`abstained\` outcome on a verified-green run is the CORRECT no-op, not a missed evolution — say so plainly rather than hedging.
-- Report \`Isolation: fresh-context subagent\` when you were spawned as a subagent; report \`Isolation: inline fallback (degraded)\` when this skill is running inline in the spawning session.`;
+   - Append \`--harness <harness>\` when the spawning prompt supplied \`claude\`, \`codex\`, or \`opencode\`, or when the prompt supplied \`unknown\` but this runner can identify the current host as Codex, Claude Code, or OpenCode. Never append \`--harness unknown\`.
+   - Append \`--rerun\` ONLY when the spawning prompt supplied \`Force-new: yes\`.
+   Do NOT grade, score, or author any edit yourself, and do NOT run \`evolve-from-edits\`, \`auto-evolve\`, or \`--agent\` / \`claude -p\` — those are not part of loop v2's host-facing path. The episode command IS the loop.
+3. **Read the machine-written result**
+   The \`episode\` command prints the episode result as JSON (and persists it). Read it from the JSON output, and cross-check the on-disk record:
+   - \`.synergyspec-selfevolving/self-evolution/episodes/<episodeId>/episode.json\` — the episode stage and policy versions.
+   - \`.synergyspec-selfevolving/self-evolution/episodes/<episodeId>/diagnosis.json\` — the 奖励智能体's reward(主臂), reward(基线臂), advantage, 文本梯度, and any abstain reason.
+   Take these fields straight from the result — NEVER recompute them:
+   - **advantage** ＝ reward(主臂) − reward(基线臂) (null when the baseline arm was skipped or the reward agent 弃权 abstained).
+   - **decision**: \`rolled-back\` | \`kept\` | \`abstained\`.
+   - **evolution kind**: the 演进智能体 outcome — \`evolved\` | \`refused\` | \`not-spawned\` (null when evolution was skipped, e.g. on 弃权).
+   - **new 策略 POLICY version**: the 版本账本 ledger head AFTER the episode (post-rollback / post-evolve).
+4. **Consult the 版本账本 ledger for context (read-only, optional)**
+   To explain the result against prior episodes, run the READ-ONLY view:
+   \`\`\`bash
+   synergyspec-selfevolving self-evolution policy show --target <targetId> --json
+   \`\`\`
+   This shows the 版本账本 ledger (prior 策略 POLICY versions for the target, with the current head) and the 否决缓冲 reject-buffer (rolled-back directions to avoid). Use it only to contextualize the verdict — it changes nothing.
+5. **Classify the outcome (do not re-judge it)**
+   Map the machine result to a verdict, classifying any no-op honestly:
+   - **evolved** — the 演进智能体 wrote ONE bounded edit onto the 策略 POLICY; report the new version and the rollback command.
+   - **kept (no evolution) / abstained** — a verified-green or no-nameable-gap run where nothing was promoted is the CORRECT outcome (产物即弃), not a missed evolution. State the reason from diagnosis.json.
+   - **rolled-back** — the edit's advantage fell below threshold; the 策略 POLICY was restored to the prior good version and a 否决缓冲 reject-buffer entry recorded the lost direction. This is the loop working, not a failure.
+   - **busy-in-flight** — the episode command returned a clean deferral ('skipped — another in-flight episode holds the target') because another episode for the SAME 策略 POLICY target is already running and holds the in-flight lock. This is a TRANSIENT, self-healing concurrency deferral, NOT a DEFECT and NOT an \`error-...\`. Report \`Outcome: busy-in-flight\` (advantage null, no episode id, 策略 POLICY version unchanged), recommend WAIT-AND-RETRY after the lock clears (it self-heals; the CLI alone re-acquires the target once the holder finishes or the 60-minute stale window elapses), and STOP — do NOT hand-delete \`in-flight.json\` and do NOT call the lock 'stale'. Staleness is purely the 60-minute time window; a lock whose owner episode is at stage \`evolving\` or \`kept\` is a LIVE episode, not a stale one — deleting it would corrupt a running episode.
+   - **SAFE refusal** (evidence missing/red, target frozen, gate refused on real grounds) is expected; state the reason and move on.
+   - **DEFECT** (the orchestrator COULD NOT act for a reason that is NOT about evidence / freezing / scope — e.g. an unbindable target that persists) — surface it as an unresolved issue; do NOT hand-edit a canonical file to work around it. \`synergyspec-selfevolving status\` prints the machine-written \`Evolution:\` outcome — do not contradict it in free text.
+6. **Emit the Runner Verdict (always — the final step)**
+   Your session's final message MUST end with the \`## Episode Verdict\` block defined in the Output Format below. Copy every field from the machine-written result (the \`episode\` JSON output / episode.json + diagnosis.json) — never re-judge it. Use \`not-run\` when the episode command was never invoked (change did not resolve); state the reason on the verdict lines.
+**Output Format**
+The session's final message MUST end with exactly this block shape:
+\`\`\`
+## Episode Verdict: <change-name>
+- Outcome: evolved | kept | rolled-back | abstained | not-run | busy-in-flight | refused-static-gate | refused-unverified-evidence | refused-target-frozen | error-<...>
+- Episode id: <episodeId, or none>
+- Decision: rolled-back | kept | abstained
+- Evolution: evolved | refused | not-spawned | none
+- Advantage: <reward(主臂) − reward(基线臂), or null (baseline skipped / 弃权 abstained)>
+- 策略 POLICY version: <new ledger head version, or unchanged>
+- Evolved target: <canonical target id, or none>
+- Canonical file(s) changed: <paths, or none>
+- Rollback: synergyspec-selfevolving self-evolution promote <candidateId> --rollback
+- Loss vs baseline: <loss / baseline, or unmeasured>
+- Defects to surface: <genuine orchestrator errors that BLOCKED the episode — NOT evidence/red-test/frozen-target/scope refusals, and NOT busy-in-flight — or none>
+- Key lessons: <up to 3 one-line bullets from diagnosis.json>
+- Isolation: fresh-context subagent | inline fallback (degraded)
+\`\`\`
+- EVERY field MUST be copied from the machine-written result (the \`episode\` JSON output / episode.json + diagnosis.json) — never re-judged. The skill neither grades nor edits; it only relays.
+- Use \`not-run\` when the episode command was never invoked (the change did not resolve); state the reason on the verdict lines.
+- Use \`busy-in-flight\` when the episode command returned the clean concurrency deferral (another in-flight episode holds the same 策略 POLICY target): advantage is null, episode id is none, 策略 POLICY version is unchanged. It is TRANSIENT and self-healing (retry after the lock clears / the 60-min stale window) — it is NOT a DEFECT, do not list it under Defects to surface, and never advise deleting \`in-flight.json\`.
+- When the episode did NOT start (Episode id is none — any not-run / busy-in-flight / error-* outcome), write \`none\` for Evolved target and Canonical file(s) changed, report Decision/Advantage as none/null, and leave 策略 POLICY version unchanged. The change's CONFIGURED target id is context only — do NOT copy it into the Evolved target field on a non-run verdict.
+- A \`kept\` / \`abstained\` outcome on a verified-green run is the CORRECT no-op, not a missed evolution — say so plainly rather than hedging.
+- Copy the supplied \`Isolation:\` value verbatim when present. If it was not supplied, report \`Isolation: fresh-context subagent\` when you were spawned as a subagent, or \`Isolation: inline fallback (degraded)\` when this skill is running inline in the spawning session.`;
 export function getSelfEvolvingSkillTemplate() {
     return {
         name: 'synergyspec-selfevolving-self-evolving',

package/dist/core/templates/workflows/verify-change.js (changed)

@@ -43,6 +43,45 @@ export function getVerifyChangeSkillTemplate() {
    Each dimension can have CRITICAL, WARNING, or SUGGESTION issues.
+4a. **Load and validate durable runner evidence**
+   Treat \`test-report.md\`, \`ci-report.md\`, and any chat-written summaries as
+   claims until their runner evidence is validated. Do not mark a requirement,
+   test suite, CI run, or PBT result as verified from a self-authored markdown
+   summary alone.
+   Check these files if they exist:
+   - \`synergyspec-selfevolving/changes/<name>/test-report.md\`
+   - \`synergyspec-selfevolving/ci-report.md\`
+   - \`synergyspec-selfevolving/changes/<name>/pbt-regressions.md\`
+   For each report that contains test or CI claims:
+   - Locate its \`### Runner Evidence\` section.
+   - Extract raw stdout/stderr log paths and the \`*-exit.json\` path.
+   - Verify every referenced evidence path exists on disk and is inside the project.
+   - Parse each exit JSON and require: \`command\`, \`cwd\`, \`startedAt\` or \`timestamp\`,
+     \`exitCode\`, and raw log paths.
+   - If optional JUnit or coverage paths are listed, verify they exist unless the
+     value is explicitly \`null\`, \`N/A\`, or empty.
+   - Cross-check the markdown verdict against \`exitCode\`: non-zero exit means
+     the run failed even if the markdown says PASS.
+   - Compare \`runner-exit.json.workspaceIdentity\` to the current root before
+     trusting the report: \`cwd\` must still be this project, recorded
+     \`pyproject.toml [project].name\` and hash must match the current
+     \`pyproject.toml\`, and recorded \`package.json\` name/hash must match the
+     current \`package.json\` when those files exist. A mismatch means the report
+     proves an older or different workspace, not the current change.
+   Evidence verdicts:
+   - **verified**: raw logs exist, exit JSON parses, required provenance fields exist, and verdict matches exit code.
+   - **unverified**: report exists but lacks runner evidence, has missing files, malformed JSON, or mismatched verdicts.
+   - **absent**: no report or evidence file exists.
+   Missing or unverified runner evidence is at least a WARNING. If the change is
+   otherwise claiming "all tests passed", "all requirements covered", or "ready
+   to archive" based on that report, promote it to CRITICAL until durable
+   evidence is available.
 5. **Verify Completeness**
    **Task Completion**:
@@ -114,6 +153,18 @@ export function getVerifyChangeSkillTemplate() {
    | Coherence    | Followed/Issues  |
    \`\`\`
+   **Evidence Provenance**:
+   \`\`\`markdown
+   ### Evidence Provenance
+   | Source | Status | Exit JSON | Raw Logs | Notes |
+   |--------|--------|-----------|----------|-------|
+   | test-report.md | verified / unverified / absent | \`...\` | stdout/stderr paths | <reason> |
+   | ci-report.md | verified / unverified / absent | \`...\` | stdout/stderr paths | <reason> |
+   \`\`\`
+   Only count test and CI claims as verification evidence when this table marks
+   the corresponding source \`verified\`.
    **Issues by Priority**:
    1. **CRITICAL** (Must fix before archive):
@@ -131,17 +182,18 @@ export function getVerifyChangeSkillTemplate() {
       - Minor improvements
       - Each with specific recommendation
-   **Final Assessment**:
-   - If CRITICAL issues: "X critical issue(s) found. Fix before archiving."
-   - If only warnings: "No critical issues. Y warning(s) to consider. Ready for learn, then archive."
-   - If all clear: "All checks passed. Ready for learn, then archive."
-   Write the verification report to \`synergyspec-selfevolving/changes/<name>/verification-report.md\` so \`/synspec:learn\` can read concrete verification evidence. If you cannot write it, state that explicitly and include the report in the chat response.
+   **Final Assessment**:
+   - If CRITICAL issues: "X critical issue(s) found. Fix before archiving."
+   - If only warnings: "No critical issues. Y warning(s) to consider. Ready for learn, then archive."
+   - If all clear: "All checks passed. Ready for learn, then archive."
+   Write the verification report to \`synergyspec-selfevolving/changes/<name>/verification-report.md\` so \`/synspec:learn\` can read concrete verification evidence. If you cannot write it, state that explicitly and include the report in the chat response.
 **Verification Heuristics**
 - **Completeness**: Focus on objective checklist items (checkboxes, requirements list)
 - **Correctness**: Use keyword search, file path analysis, reasonable inference - don't require perfect certainty
+- **Evidence provenance**: Prefer raw runner logs, exit JSON, JUnit XML, and coverage artifacts over report prose. Self-authored markdown summaries without durable evidence are not proof.
 - **Coherence**: Look for glaring inconsistencies, don't nitpick style
 - **False Positives**: When uncertain, prefer SUGGESTION over WARNING, WARNING over CRITICAL
 - **Actionability**: Every issue must have a specific recommendation with file/line references where applicable
@@ -257,6 +309,7 @@ export function getVerifyChangeSkillTemplate() {
 - If only tasks.md exists: verify task completion only, skip spec/design checks
 - If tasks + specs exist: verify completeness and correctness, skip design
 - If full artifacts: verify all three dimensions
+- If test/CI reports exist but runner evidence is missing or invalid: keep the reports as context only, mark evidence provenance unverified, and add a WARNING or CRITICAL per step 4a
 - Always note which checks were skipped and why
 - If git diff unavailable or \`synergyspec-selfevolving/specs/\` is empty: skip blast radius gracefully
@@ -264,11 +317,12 @@ export function getVerifyChangeSkillTemplate() {
 Use clear markdown with:
 - Table for summary scorecard
-- Grouped lists for issues (CRITICAL/WARNING/SUGGESTION)
-- Code references in format: \`file.ts:123\`
-- Specific, actionable recommendations
-- Confirmation that \`synergyspec-selfevolving/changes/<name>/verification-report.md\` was written, or why it could not be written
-- If no critical issues remain: suggest \`/synspec:learn <name>\` next, then \`/synspec:archive <name>\`
+- Evidence Provenance table with verified / unverified / absent status for test-report.md and ci-report.md
+- Grouped lists for issues (CRITICAL/WARNING/SUGGESTION)
+- Code references in format: \`file.ts:123\`
+- Specific, actionable recommendations
+- Confirmation that \`synergyspec-selfevolving/changes/<name>/verification-report.md\` was written, or why it could not be written
+- If no critical issues remain: suggest \`/synspec:learn <name>\` next, then \`/synspec:archive <name>\`
 - No vague suggestions like "consider reviewing"`,
         license: 'MIT',
         compatibility: 'Requires synergyspec-selfevolving CLI.',
@@ -322,6 +376,45 @@ export function getOpsxVerifyCommandTemplate() {
    Each dimension can have CRITICAL, WARNING, or SUGGESTION issues.
+4a. **Load and validate durable runner evidence**
+   Treat \`test-report.md\`, \`ci-report.md\`, and any chat-written summaries as
+   claims until their runner evidence is validated. Do not mark a requirement,
+   test suite, CI run, or PBT result as verified from a self-authored markdown
+   summary alone.
+   Check these files if they exist:
+   - \`synergyspec-selfevolving/changes/<name>/test-report.md\`
+   - \`synergyspec-selfevolving/ci-report.md\`
+   - \`synergyspec-selfevolving/changes/<name>/pbt-regressions.md\`
+   For each report that contains test or CI claims:
+   - Locate its \`### Runner Evidence\` section.
+   - Extract raw stdout/stderr log paths and the \`*-exit.json\` path.
+   - Verify every referenced evidence path exists on disk and is inside the project.
+   - Parse each exit JSON and require: \`command\`, \`cwd\`, \`startedAt\` or \`timestamp\`,
+     \`exitCode\`, and raw log paths.
+   - If optional JUnit or coverage paths are listed, verify they exist unless the
+     value is explicitly \`null\`, \`N/A\`, or empty.
+   - Cross-check the markdown verdict against \`exitCode\`: non-zero exit means
+     the run failed even if the markdown says PASS.
+   - Compare \`runner-exit.json.workspaceIdentity\` to the current root before
+     trusting the report: \`cwd\` must still be this project, recorded
+     \`pyproject.toml [project].name\` and hash must match the current
+     \`pyproject.toml\`, and recorded \`package.json\` name/hash must match the
+     current \`package.json\` when those files exist. A mismatch means the report
+     proves an older or different workspace, not the current change.
+   Evidence verdicts:
+   - **verified**: raw logs exist, exit JSON parses, required provenance fields exist, workspace identity matches the current project, and verdict matches exit code.
+   - **unverified**: report exists but lacks runner evidence, has missing files, malformed JSON, a workspace identity mismatch, or mismatched verdicts.
+   - **absent**: no report or evidence file exists.
+   Missing or unverified runner evidence is at least a WARNING. If the change is
+   otherwise claiming "all tests passed", "all requirements covered", or "ready
+   to archive" based on that report, promote it to CRITICAL until durable
+   evidence is available.
 5. **Verify Completeness**
    **Task Completion**:
@@ -393,6 +486,18 @@ export function getOpsxVerifyCommandTemplate() {
    | Coherence    | Followed/Issues  |
    \`\`\`
+   **Evidence Provenance**:
+   \`\`\`markdown
+   ### Evidence Provenance
+   | Source | Status | Exit JSON | Raw Logs | Notes |
+   |--------|--------|-----------|----------|-------|
+   | test-report.md | verified / unverified / absent | \`...\` | stdout/stderr paths | <reason> |
+   | ci-report.md | verified / unverified / absent | \`...\` | stdout/stderr paths | <reason> |
+   \`\`\`
+   Only count test and CI claims as verification evidence when this table marks
+   the corresponding source \`verified\`.
    **Issues by Priority**:
    1. **CRITICAL** (Must fix before archive):
@@ -410,17 +515,18 @@ export function getOpsxVerifyCommandTemplate() {
       - Minor improvements
       - Each with specific recommendation
-   **Final Assessment**:
-   - If CRITICAL issues: "X critical issue(s) found. Fix before archiving."
-   - If only warnings: "No critical issues. Y warning(s) to consider. Ready for learn, then archive."
-   - If all clear: "All checks passed. Ready for learn, then archive."
-   Write the verification report to \`synergyspec-selfevolving/changes/<name>/verification-report.md\` so \`/synspec:learn\` can read concrete verification evidence. If you cannot write it, state that explicitly and include the report in the chat response.
+   **Final Assessment**:
+   - If CRITICAL issues: "X critical issue(s) found. Fix before archiving."
+   - If only warnings: "No critical issues. Y warning(s) to consider. Ready for learn, then archive."
+   - If all clear: "All checks passed. Ready for learn, then archive."
+   Write the verification report to \`synergyspec-selfevolving/changes/<name>/verification-report.md\` so \`/synspec:learn\` can read concrete verification evidence. If you cannot write it, state that explicitly and include the report in the chat response.
 **Verification Heuristics**
 - **Completeness**: Focus on objective checklist items (checkboxes, requirements list)
 - **Correctness**: Use keyword search, file path analysis, reasonable inference - don't require perfect certainty
+- **Evidence provenance**: Prefer raw runner logs, exit JSON, JUnit XML, and coverage artifacts over report prose. Self-authored markdown summaries without durable evidence are not proof.
 - **Coherence**: Look for glaring inconsistencies, don't nitpick style
 - **False Positives**: When uncertain, prefer SUGGESTION over WARNING, WARNING over CRITICAL
 - **Actionability**: Every issue must have a specific recommendation with file/line references where applicable
@@ -536,6 +642,7 @@ export function getOpsxVerifyCommandTemplate() {
 - If only tasks.md exists: verify task completion only, skip spec/design checks
 - If tasks + specs exist: verify completeness and correctness, skip design
 - If full artifacts: verify all three dimensions
+- If test/CI reports exist but runner evidence is missing or invalid: keep the reports as context only, mark evidence provenance unverified, and add a WARNING or CRITICAL per step 4a
 - Always note which checks were skipped and why
 - If git diff unavailable or \`synergyspec-selfevolving/specs/\` is empty: skip blast radius gracefully
@@ -543,11 +650,12 @@ export function getOpsxVerifyCommandTemplate() {
 Use clear markdown with:
 - Table for summary scorecard
-- Grouped lists for issues (CRITICAL/WARNING/SUGGESTION)
-- Code references in format: \`file.ts:123\`
-- Specific, actionable recommendations
-- Confirmation that \`synergyspec-selfevolving/changes/<name>/verification-report.md\` was written, or why it could not be written
-- If no critical issues remain: suggest \`/synspec:learn <name>\` next, then \`/synspec:archive <name>\`
+- Evidence Provenance table with verified / unverified / absent status for test-report.md and ci-report.md
+- Grouped lists for issues (CRITICAL/WARNING/SUGGESTION)
+- Code references in format: \`file.ts:123\`
+- Specific, actionable recommendations
+- Confirmation that \`synergyspec-selfevolving/changes/<name>/verification-report.md\` was written, or why it could not be written
+- If no critical issues remain: suggest \`/synspec:learn <name>\` next, then \`/synspec:archive <name>\`
 - No vague suggestions like "consider reviewing"`
     };
 }