npm - synergyspec-selfevolving - Versions diffs - 2.1.4 → 2.1.6 - Mend

synergyspec-selfevolving 2.1.4 → 2.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

package/dist/commands/config.js +4 -0
package/dist/commands/learn.js +80 -24
package/dist/commands/self-evolution-dream.d.ts +54 -0
package/dist/commands/self-evolution-dream.js +265 -0
package/dist/commands/self-evolution-episode.d.ts +5 -0
package/dist/commands/self-evolution-episode.js +160 -107
package/dist/commands/self-evolution.js +127 -4
package/dist/commands/workflow/status.js +38 -7
package/dist/core/archive.js +27 -9
package/dist/core/change-readiness.d.ts +63 -6
package/dist/core/change-readiness.js +912 -23
package/dist/core/completions/command-registry.js +1 -1
package/dist/core/fitness/loss.d.ts +10 -5
package/dist/core/fitness/loss.js +11 -4
package/dist/core/fitness/test-metrics.d.ts +3 -0
package/dist/core/fitness/test-metrics.js +78 -1
package/dist/core/learn/trajectory-discovery.js +5 -0
package/dist/core/learn.js +131 -13
package/dist/core/migration.d.ts +6 -14
package/dist/core/migration.js +63 -21
package/dist/core/profiles.d.ts +1 -1
package/dist/core/profiles.js +1 -0
package/dist/core/runner-evidence.d.ts +53 -0
package/dist/core/runner-evidence.js +613 -0
package/dist/core/self-evolution/candidates.d.ts +1 -1
package/dist/core/self-evolution/candidates.js +1 -2
package/dist/core/self-evolution/canonical-targets.js +1 -0
package/dist/core/self-evolution/dream.d.ts +132 -0
package/dist/core/self-evolution/dream.js +1093 -0
package/dist/core/self-evolution/episode-orchestrator.d.ts +7 -0
package/dist/core/self-evolution/episode-orchestrator.js +162 -12
package/dist/core/self-evolution/episode-store.d.ts +21 -0
package/dist/core/self-evolution/episode-store.js +16 -3
package/dist/core/self-evolution/evolving-agent.js +8 -0
package/dist/core/self-evolution/host-harness.d.ts +46 -12
package/dist/core/self-evolution/host-harness.js +198 -55
package/dist/core/self-evolution/index.d.ts +1 -0
package/dist/core/self-evolution/index.js +1 -0
package/dist/core/self-evolution/policy/policy-store.d.ts +19 -2
package/dist/core/self-evolution/policy/policy-store.js +85 -0
package/dist/core/self-evolution/promote.d.ts +7 -5
package/dist/core/self-evolution/promote.js +111 -19
package/dist/core/self-evolution/reward-agent.js +11 -9
package/dist/core/self-evolution/reward-aggregator.js +2 -2
package/dist/core/shared/skill-generation.d.ts +37 -0
package/dist/core/shared/skill-generation.js +91 -0
package/dist/core/templates/skill-templates.d.ts +1 -0
package/dist/core/templates/skill-templates.js +1 -0
package/dist/core/templates/workflow-manifest.js +2 -0
package/dist/core/templates/workflows/archive-change.js +76 -39
package/dist/core/templates/workflows/ci.js +47 -1
package/dist/core/templates/workflows/dream.d.ts +10 -0
package/dist/core/templates/workflows/dream.js +123 -0
package/dist/core/templates/workflows/gen-tests.js +9 -3
package/dist/core/templates/workflows/learn.js +11 -7
package/dist/core/templates/workflows/run-tests.js +99 -4
package/dist/core/templates/workflows/self-evolving.js +118 -115
package/dist/core/templates/workflows/verify-change.js +130 -22
package/dist/core/trajectory/adapters/codex.js +87 -29
package/dist/core/trajectory/adapters/opencode.js +69 -23
package/dist/core/trajectory/facts.d.ts +1 -1
package/dist/core/trajectory/facts.js +23 -5
package/dist/core/trajectory/registry.d.ts +16 -2
package/dist/core/trajectory/registry.js +104 -29
package/dist/core/trajectory/source.d.ts +27 -4
package/dist/dashboard/react-client.js +4 -4
package/dist/utils/change-utils.d.ts +2 -0
package/dist/utils/change-utils.js +53 -2
package/package.json +99 -99
package/schemas/spec-driven/templates/design.md +6 -0
package/scripts/nl2repo_synergyspec-selfevolving_wrapper.py +170 -0

package/dist/core/templates/workflows/ci.js CHANGED

@@ -21,6 +21,35 @@ const INSTRUCTIONS_BODY = `**Input**: No change name required — CI runs across
    - coverage metrics: lines, branches, functions, statements
      (parse \`coverage-summary.json\` if present, or the runner's table output)
+   Persist durable runner evidence before writing the CI summary. Create:
+   \`synergyspec-selfevolving/ci-evidence/<YYYYMMDDTHHMMSSZ>/\`.
+   For the unit/integration coverage run, save:
+   - \`unit.stdout.log\` — raw stdout, unedited
+   - \`unit.stderr.log\` — raw stderr, unedited (empty if none)
+   - \`unit-exit.json\` — machine-readable execution metadata:
+   \`\`\`json
+   {
+     "command": "<exact command that was run>",
+     "cwd": "<absolute working directory>",
+     "startedAt": "<ISO timestamp>",
+     "finishedAt": "<ISO timestamp>",
+     "exitCode": 0,
+     "signal": null,
+     "stdoutLog": "synergyspec-selfevolving/ci-evidence/<timestamp>/unit.stdout.log",
+     "stderrLog": "synergyspec-selfevolving/ci-evidence/<timestamp>/unit.stderr.log",
+     "junitXml": null,
+     "coverageSummary": "coverage/coverage-summary.json",
+     "coverageLcov": "coverage/lcov.info",
+     "coverageHtml": "coverage/lcov-report/index.html"
+   }
+   \`\`\`
+   If a listed JUnit or coverage path does not exist, set it to \`null\`. The
+   CI verdict must be derived from this exit JSON plus parsed runner outputs,
+   not from a hand-written summary alone.
    This single run covers all changes since the project test suite is shared.
 2b. **Spec Blast Radius Coverage** (if any blast radius files exist)
@@ -88,7 +117,13 @@ const INSTRUCTIONS_BODY = `**Input**: No change name required — CI runs across
        browsers/binaries as needed.
    - After ensuring the tool is installed, run the e2e suite using the appropriate command
      (e.g. \`npx playwright test --reporter=json\`, \`npx cypress run --reporter json\`) and
-     save the output to \`e2e-results/latest/<tool>-results.json\`.
+     save the output to \`e2e-results/latest/<tool>-results.json\`. Also save raw
+     \`<tool>.stdout.log\`, \`<tool>.stderr.log\`, and \`<tool>-exit.json\` under
+     \`synergyspec-selfevolving/ci-evidence/<timestamp>/\`, using the same
+     command/cwd/startedAt/finishedAt/exitCode/stdoutLog/stderrLog fields as the
+     unit runner evidence. Include the JSON reporter path in \`junitXml\` only if
+     it is actually JUnit XML; otherwise record it as a separate report path in
+     the CI markdown.
    - Map each TP entry result (pass/fail) from the tool's output.
    **Never skip e2e tests because a dependency is missing — always install it first.**
@@ -145,6 +180,15 @@ const INSTRUCTIONS_BODY = `**Input**: No change name required — CI runs across
    | Functions | X% (A/B) |
    | Statements | X% (A/B) |
+   ### Runner Evidence
+   | Runner | Command | CWD | Started | Finished | Exit Code | stdout | stderr | Exit JSON | Optional Reports |
+   |--------|---------|-----|---------|----------|-----------|--------|--------|-----------|------------------|
+   | unit/coverage | \`<exact command>\` | \`<absolute cwd>\` | \`<ISO>\` | \`<ISO>\` | \`0\` | \`synergyspec-selfevolving/ci-evidence/<timestamp>/unit.stdout.log\` | \`synergyspec-selfevolving/ci-evidence/<timestamp>/unit.stderr.log\` | \`synergyspec-selfevolving/ci-evidence/<timestamp>/unit-exit.json\` | coverage/JUnit paths or N/A |
+   | e2e:<tool> | \`<exact command>\` | \`<absolute cwd>\` | \`<ISO>\` | \`<ISO>\` | \`0\` | \`synergyspec-selfevolving/ci-evidence/<timestamp>/<tool>.stdout.log\` | \`synergyspec-selfevolving/ci-evidence/<timestamp>/<tool>.stderr.log\` | \`synergyspec-selfevolving/ci-evidence/<timestamp>/<tool>-exit.json\` | reporter output path or N/A |
+   The overall verdict must agree with every listed \`*-exit.json\`. A non-zero
+   exit code is a CI failure even if the markdown summary claims otherwise.
    ### E2E Test Plan Results
    | Change | ID | Description | Verdict |
    |--------|----|-------------|---------|
@@ -166,6 +210,7 @@ const INSTRUCTIONS_BODY = `**Input**: No change name required — CI runs across
    ### Artifacts
    - Coverage: \`coverage/lcov-report/index.html\`
+   - Runner evidence: \`synergyspec-selfevolving/ci-evidence/<timestamp>/\`
    - Screenshots: \`e2e-results/latest/artifacts/\`
    - Archived to: \`e2e-results/<timestamp>/\`
    \`\`\`
@@ -189,6 +234,7 @@ const INSTRUCTIONS_BODY = `**Input**: No change name required — CI runs across
 - No pbt-regressions.md for a change: note "PBT not yet run for \`<name>\` — suggest \`/synspec:run-tests\`" and treat that change as PARTIAL, not FAIL.
 - Test runner detection fails: ask the user rather than failing silently.
 - Coverage tooling not configured: skip coverage metrics, note the gap.
+- Durable runner evidence cannot be written: mark CI PARTIAL, list the missing evidence paths, and do not claim PASS from markdown summaries alone.
 **Output**

package/dist/core/templates/workflows/dream.d.ts ADDED

@@ -0,0 +1,10 @@
+/**
+ * Skill Template Workflow Modules
+ *
+ * Dream workflow: chat-native entrance for the offline Supervised Learning
+ * Dream lane. The CLI remains the engine; this wrapper is the agent harness UX.
+ */
+import type { CommandTemplate, SkillTemplate } from '../types.js';
+export declare function getDreamSkillTemplate(): SkillTemplate;
+export declare function getOpsxDreamCommandTemplate(): CommandTemplate;
+//# sourceMappingURL=dream.d.ts.map

package/dist/core/templates/workflows/dream.js ADDED

@@ -0,0 +1,123 @@
+const INSTRUCTIONS_BODY = `**Input**: Optionally specify a Dream mode and flags after \`/synspec:dream\`.
+Accepted forms:
+\`\`\`text
+/synspec:dream
+/synspec:dream preview [--target <id>] [--limit <n>] [--json]
+/synspec:dream run [--target <id>] [--limit <n>] [--apply --yes] [--json]
+/synspec:dream show [runId] [--json]
+/synspec:dream policy-update <candidateId> --accepted-by <name> --yes [--json]
+\`\`\`
+Bare \`/synspec:dream\` means \`preview\`. Preview is read-only. Plain \`run\` writes only Dream artifacts. \`run --apply --yes\` and \`policy-update ... --yes\` are explicit policy-update entrances for already accepted Dream candidates.
+**Purpose**
+This is the SS agent-harness entrance for offline Supervised Learning Dream. The user should trigger Dream from the code-agent chat, not by opening a separate terminal. Your job is to call the existing CLI engine, parse the JSON result, and relay a short Dream Verdict.
+Dream is not the loop-v2 episode runner. It batch-reads completed evidence and proposes optimizer briefs for existing skill/workflow/template targets. It never creates new skills, never edits POLICY directly, and never runs the episode/reward/evolving agents. By default Dream is proposal-only; policy changes require an explicit accepted-candidate update with \`--yes\`, synthesize bounded edits into the candidate package, pass the static gate, and promote through the existing rollback/ledger path.
+**Mode parsing**
+1. If the first argument is missing, use \`preview\`.
+2. If the first argument is one of \`preview\`, \`run\`, \`show\`, or \`policy-update\`, use that mode.
+3. If the first argument starts with \`--\`, treat it as a \`preview\` flag.
+4. If the mode is unknown, stop and show the accepted forms above.
+Pass only these user options through:
+- \`--target <id>\`
+- \`--limit <n>\`
+- \`--apply\` and \`--yes\` for \`run\`
+- \`candidateId\`, \`--accepted-by <name>\`, and \`--yes\` for \`policy-update\`
+- \`--json\`
+- \`runId\` for \`show\`
+Always add \`--json\` to the CLI command you run so the result is machine-readable. If the user explicitly asked for \`--json\`, include the compact raw JSON after the Dream Verdict; otherwise provide the human summary only.
+**Steps**
+1. **Run the CLI engine from the project root**
+   For preview:
+   \`\`\`bash
+   synergyspec-selfevolving self-evolution dream preview --json
+   \`\`\`
+   For run:
+   \`\`\`bash
+   synergyspec-selfevolving self-evolution dream run --json
+   \`\`\`
+   For show:
+   \`\`\`bash
+   synergyspec-selfevolving self-evolution dream show --json
+   \`\`\`
+   For accepted candidate policy update:
+   \`\`\`bash
+   synergyspec-selfevolving self-evolution dream policy-update <candidateId> --accepted-by <name> --yes --json
+   \`\`\`
+   Append \`--target <id>\`, \`--limit <n>\`, \`--apply\`, \`--yes\`, \`--accepted-by <name>\`, or \`runId\` only when the user supplied them. Never add \`--yes\` on the user's behalf.
+2. **Interpret the result without re-judging it**
+   Read candidate ids, target ids, evidence summary, run id, update outcome, gate result, promoted files, policy version, and write paths from the CLI JSON when present. Do not invent candidate ids or claim a policy change.
+3. **Classify writes**
+   - \`preview\`: Writes are \`none\`.
+   - plain \`run\`: Writes are \`dream-run + draft candidates\`.
+   - \`run --apply --yes\`: Writes are \`dream-run + candidates + gated policy update\` when the update is promoted; otherwise report the refusal outcome.
+   - \`show\`: Writes are \`none\`.
+   - \`policy-update --yes\`: Writes are \`gated policy update\` when promoted; otherwise report the refusal outcome.
+4. **Report the next step**
+   Plain Dream candidates are proposal-only optimizer briefs. To turn an accepted candidate into policy, use \`/synspec:dream policy-update <candidateId> --accepted-by <name> --yes\`. The update path must author bounded edits, pass the static gate, and promote through the existing rollback/ledger channel; if any gate refuses, report the refusal and leave the policy unchanged.
+**Output Format**
+End with this block:
+\`\`\`text
+## Dream Verdict
+- Mode: preview | run | show | policy-update
+- Run id: <id or none>
+- Candidates: <ids or none>
+- Targets: <target ids or all eligible>
+- Evidence read: <short summary>
+- Writes: none | dream-run + draft candidates | dream-run + candidates + gated policy update | gated policy update
+- Policy changed: yes | no
+- New skills created: no
+- Next step: review candidate(s), run accepted policy-update, or inspect gate refusal
+\`\`\`
+If the CLI command fails, still end with \`## Dream Verdict\` and set fields to \`none\` where unknown. Put the command failure under \`Evidence read\` or \`Next step\`; do not retry with a different self-evolution command.`;
+export function getDreamSkillTemplate() {
+    return {
+        name: 'synergyspec-selfevolving-dream',
+        description: 'SS Dream entrance: preview, run, inspect, or apply accepted offline Supervised Learning Dream updates from the code-agent chat.',
+        instructions: `Run the SS offline Supervised Learning Dream lane from the code-agent harness.
+${INSTRUCTIONS_BODY}`,
+        license: 'MIT',
+        compatibility: 'Requires synergyspec-selfevolving CLI.',
+        metadata: { author: 'synergyspec-selfevolving', version: '1.0' },
+    };
+}
+export function getOpsxDreamCommandTemplate() {
+    return {
+        name: 'SS: Dream',
+        description: 'Preview, run, inspect, or apply accepted offline Supervised Learning Dream updates from the code-agent chat',
+        category: 'Workflow',
+        tags: ['workflow', 'dream', 'self-evolution', 'offline-learning'],
+        content: `Run the SS offline Supervised Learning Dream lane from the code-agent harness.
+**Input**: Optionally specify a mode after \`/synspec:dream\` (for example \`/synspec:dream preview\`, \`/synspec:dream run --limit 5\`, \`/synspec:dream show\`, or \`/synspec:dream policy-update <candidateId> --accepted-by <name> --yes\`). Bare \`/synspec:dream\` means read-only \`preview\`.
+${INSTRUCTIONS_BODY}`,
+    };
+}
+//# sourceMappingURL=dream.js.map

package/dist/core/templates/workflows/gen-tests.js CHANGED

@@ -72,7 +72,7 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
    Detection order: scan existing test files for PBT imports first; if none found, infer from project language; if ambiguous, use AskUserQuestion.
-   **Every WHEN/THEN scenario extracted in step 4 must have exactly one PBT test**, with ONE sanctioned exception — a scenario whose behaviour is already exhaustively covered by an existing example/benchmark test AND that cannot be expressed as a meaningful property may instead be recorded as a \`➖ N/A\` row in the PBT Coverage table (step 8), citing that covering test. Otherwise, no exceptions:
+   **Every WHEN/THEN scenario extracted in step 4 must be represented in PBT Coverage**, preferably by exactly one PBT test. There is ONE sanctioned \`➖ N/A\` exception: a scenario whose behaviour is already exhaustively covered by an existing example/benchmark test AND cannot be expressed as a meaningful property may be recorded as \`➖ N/A\` in step 8. The \`➖ N/A\` row must cite the covering test path (and line when available) in the \`PBT Test\` column and explain why a property would be vacuous or duplicate coverage. Otherwise, no exceptions:
    - **WHEN** clause → generator expression + precondition guard (filter/assume)
    - **THEN** clause → invariant (property assertion that must hold for all generated inputs)
    - When the WHEN clause has no parameterisable variable (e.g. "WHEN the app loads"), generate arbitrary system/environment state as the input and use the THEN clause alone as the invariant.
@@ -136,9 +136,15 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
    |---------|----------|----------|-----------|--------|
    | UC1-S1 | <scenario description> | \`test/uc1-s1.property.test.ts:5\` | fast-check | ✅ |
    | UC1-S2 | <scenario description> | \`test/uc1-s2.property.test.ts:12\` | fast-check | ✅ |
+   | UC1-S3 | <scenario description> | \`test/exhaustive-example.test.ts:44\` | N/A | ➖ N/A — exhaustively covered; property would duplicate fixed finite cases |
    | UC1-E2a | <scenario description> | \`test/uc1-e2a.property.test.ts:8\` | fast-check | ❌ missing |
    ...
+   A \`➖ N/A\` row is valid only when it cites an existing covering test path in
+   \`PBT Test\` and the status text states the reason. A \`➖ N/A\` row with no
+   covering test path, or with only a prose justification, is invalid and must be
+   treated as \`❌ missing\`.
    ## Use Case Details: <name> (ID: UC1)
    ### Main Scenario
@@ -161,7 +167,7 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
 9. **Decision point (Re-generate or output)**
-   - Report missing or incomplete **example-based tests** AND any **PBT Coverage** rows marked \`❌ missing\`.
+   - Report missing or incomplete **example-based tests**, any **PBT Coverage** rows marked \`❌ missing\`, and any invalid \`➖ N/A\` rows that lack a concrete covering test path.
    - **Ask if they want to generate / update all missing and incomplete tests (both kinds).**
    - If the user confirms, go back to steps 6–7 and generate / update tests.
    - If the user does not confirm, proceed to the output step.
@@ -176,7 +182,7 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
 - Classify by requirement boundary, not by code layer. A test that calls a low-level function but verifies a single spec step is still a Unit test. A test that exercises the UI but covers an entire use case flow is an Integration test.
 - Map requirements to tests via: exact name match, keyword match, file path match
 - When uncertain about test implementation status, mark as ⚠️ (partial) not ✅
-- Every WHEN/THEN scenario must have a PBT test. When the WHEN clause has no parameterisable input, generate arbitrary system/environment state and use the THEN clause as the invariant — do not skip the scenario.
+- Every WHEN/THEN scenario must have either one PBT test or one valid \`➖ N/A\` row with a concrete covering test path. When the WHEN clause has no parameterisable input, generate arbitrary system/environment state and use the THEN clause as the invariant unless the strict \`➖ N/A\` criteria above apply.
 **Graceful Degradation**

package/dist/core/templates/workflows/learn.js CHANGED

@@ -19,17 +19,19 @@ This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify
    The runner starts with NO conversation context, so collect every handle it needs:
    - **Project root**: the absolute path of the current working directory.
    - **Change name**: from step 1.
-   - **Harness**: read the \`harness:\` key from \`synergyspec-selfevolving/changes/<name>/.synergyspec-selfevolving.yaml\`; if absent, use \`unknown\`.
-   - **Mode**: always \`apply\` — the episode runs the full loop (score, decide, and the 演进智能体's ONE bounded edit) autonomously, with no confirmation prompt. There is NO read-only episode and NO \`--preview\` flag. If the user wants a read-only look (no rollback, no evolution), do NOT run an episode: use the read-only view \`synergyspec-selfevolving self-evolution policy show\` (or a plain \`synergyspec-selfevolving learn <name>\` without \`--apply\`) instead.
-   - **Session handle (optional)**: if your harness exposes this session's id or transcript path, capture it; otherwise omit it (the 主智能体 MAIN AGENT arm's trajectory discovery then uses the change window).
+   - **Harness**: resolve the CURRENT host runtime, not the change metadata. If this skill is running in Codex, use \`codex\`; in Claude Code, use \`claude\`; in OpenCode, use \`opencode\`. Use \`unknown\` only when the host is genuinely unidentified after checking the active session/tooling. Do NOT read \`harness:\` from the per-change YAML for this field: that metadata is historical provenance, not the runtime that will spawn the loop-v2 agents.
+   - **Mode**: always \`apply\` — the episode runs the full loop (score, decide, and the 演进智能体's ONE bounded edit) autonomously, with no confirmation prompt. There is NO read-only episode and NO \`--preview\` flag. If the user wants a read-only look (no rollback, no evolution), do NOT run an episode: use the read-only view \`synergyspec-selfevolving self-evolution policy show\` (or a plain \`synergyspec-selfevolving learn <name>\` without \`--apply\`) instead.
+   - **Force-new episode**: \`yes\` only when the user explicitly asked to rerun / force a fresh episode; otherwise \`no\`. A normal learn run must not invent a rerun.
+   - **Isolation**: \`fresh-context subagent\` for the spawned runner.
+   - **Session handle (optional)**: if your harness exposes this session's id or transcript path, capture it; otherwise omit it (the 主智能体 MAIN AGENT arm's trajectory discovery then uses the change window).
 3. **Spawn the runner**
-   Use the host's available general-purpose Task/subagent runner (for example \`general-purpose\` on Claude or \`general\` on hosts that expose that type), prompt: "Use Skill tool to invoke synergyspec-selfevolving-self-evolving for change '<name>'. Project root: <root>. Harness: <harness>. Mode: apply. Session-id: <id>. Transcript: <path>. Trigger the loop-v2 self-evolution episode autonomously, do not ask the user questions, and end with the '## Episode Verdict' block."
+   Use the host's available general-purpose Task/subagent runner (for example \`general-purpose\` on Claude or \`general\` on hosts that expose that type), prompt: "Use Skill tool to invoke synergyspec-selfevolving-self-evolving for change '<name>'. Project root: <root>. Harness: <harness>. Mode: apply. Force-new: <yes|no>. Isolation: fresh-context subagent. Session-id: <id>. Transcript: <path>. Trigger the loop-v2 self-evolution episode autonomously, do not ask the user questions, and end with the '## Episode Verdict' block."
    Include the \`Session-id: <id>.\` / \`Transcript: <path>.\` segment only when the session handle from step 2 is known — omit it entirely when unknown.
-   The runner triggers exactly one CLI command — \`synergyspec-selfevolving self-evolution episode --change "<name>" --session-id <id>\` — and the orchestrator CODE-SPAWNS the 奖励智能体 REWARD AGENT + 演进智能体 EVOLVING AGENT (+ optional CRITIC AGENT（基线智能体）). Neither you nor the runner grades or edits canonical files.
+   The runner triggers exactly one CLI command — \`synergyspec-selfevolving self-evolution episode --change "<name>" --harness <harness> --session-id <id> --rerun\` when force-new is \`yes\`; omit \`--rerun\` when force-new is \`no\`; omit \`--harness\` when it is \`unknown\` — and the orchestrator CODE-SPAWNS the 奖励智能体 REWARD AGENT + 演进智能体 EVOLVING AGENT (+ optional CRITIC AGENT（基线智能体）). Neither you nor the runner grades or edits canonical files.
    Guardrails:
    - Do NOT trigger the episode yourself in this session — it must run from a fresh context.
@@ -42,6 +44,7 @@ This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify
    Read the runner's \`## Episode Verdict\` block from its final message, then:
    - Cross-check it against \`synergyspec-selfevolving status --change "<name>" --json\` and the episode's \`episode.json\` / \`diagnosis.json\`. NEVER contradict the machine-written outcome.
+   - Write durable learn evidence to \`synergyspec-selfevolving/changes/<name>/learn-report.md\` before the final reply. The report MUST be derived from the runner's machine verdict and the cross-checks, not from a new judgment. Include: the verbatim \`## Episode Verdict\` fields, status cross-check result, episode artifact paths checked, policy/canonical files changed, rollback command, defect/safe-no-op classification, isolation mode, and next step. Create the file for \`evolved\`, \`kept\`, \`abstained\`, \`rolled-back\`, \`busy-in-flight\`, safe refusals, and real \`error-...\` defects alike; archive uses this file as evidence that learn actually ran. If the write fails, say so explicitly and include the full report text in the chat instead of pretending learn evidence exists.
    - Relay the outcome, the decision (rolled-back / kept / abstained), the evolution kind, the new 策略 POLICY version, the evolved target, and the rollback command verbatim.
    - Classify the outcome before moving on: a \`kept\` / \`abstained\` no-op on a verified-green or no-nameable-gap run is the CORRECT outcome (产物即弃), not a missed evolution; a \`rolled-back\` decision is the loop working (the 否决缓冲 reject-buffer recorded the lost direction). A SAFE refusal (missing/red evidence, frozen target, gate refused on real grounds) is expected, not a bug; a DEFECT the runner flagged (an unbindable target, an orchestrator failure that is NOT about evidence / freezing / scope) must be surfaced to the user, not archived over.
@@ -50,12 +53,13 @@ This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify
 - Lead with the runner's verdict, not the spawn mechanics.
 - Relay the \`## Episode Verdict\` fields verbatim: outcome, decision, evolution kind, advantage, new 策略 POLICY version, evolved target, canonical file(s) changed, and the rollback command.
 - State clearly whether the 策略 POLICY changed (evolved / rolled-back / unchanged) and the isolation mode (fresh-context subagent, or inline fallback (degraded)).
+- State whether \`synergyspec-selfevolving/changes/<name>/learn-report.md\` was written, and do not call the change archive-ready if that write failed.
 - Separate safe no-ops and refusals from DEFECTs to surface.
 - End with the normal next step: \`/synspec:archive\` once the user is satisfied with the review.`;
 export function getLearnSkillTemplate() {
     return {
         name: 'synergyspec-selfevolving-learn',
-        description: 'Review a completed SynergySpec-SelfEvolving change after apply/verify and preview reusable lessons, avoidable issues, and memory/template observations.',
+        description: 'Review a completed SynergySpec-SelfEvolving change after apply/verify, capture durable learn evidence, and relay the loop-v2 episode verdict.',
         instructions: `Review and learn from a completed SynergySpec-SelfEvolving change after apply and verify.
 ${INSTRUCTIONS_BODY}`,
@@ -67,7 +71,7 @@ ${INSTRUCTIONS_BODY}`,
 export function getOpsxLearnCommandTemplate() {
     return {
         name: 'SynergySpec-SelfEvolving: Learn',
-        description: 'Review a completed change and preview reusable lessons, avoidable issues, and memory/template observations',
+        description: 'Review a completed change, capture durable learn evidence, and relay the loop-v2 episode verdict',
         category: 'Workflow',
         tags: ['workflow', 'learn', 'review', 'memory'],
         content: `Review and learn from a completed SynergySpec-SelfEvolving change after \`/synspec:apply\` and \`/synspec:verify\`.

package/dist/core/templates/workflows/run-tests.js CHANGED

@@ -21,6 +21,71 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
    Run: \`<detected-runner>\`
    Capture stdout/stderr output.
+3a. **Persist durable runner evidence**
+   Before summarizing results, create an evidence directory:
+   \`synergyspec-selfevolving/changes/<name>/test-evidence/<YYYYMMDDTHHMMSSZ>/\`.
+   Save the raw runner outputs and exit metadata:
+   - \`runner.stdout.log\` — raw stdout, unedited
+   - \`runner.stderr.log\` — raw stderr, unedited (create an empty file if the runner emitted none)
+   - \`runner-exit.json\` — machine-readable execution metadata:
+   \`\`\`json
+   {
+     "command": "<exact command that was run>",
+     "cwd": "<absolute working directory>",
+     "startedAt": "<ISO timestamp>",
+     "finishedAt": "<ISO timestamp>",
+     "exitCode": 0,
+     "signal": null,
+     "stdoutLog": "synergyspec-selfevolving/changes/<name>/test-evidence/<timestamp>/runner.stdout.log",
+     "stderrLog": "synergyspec-selfevolving/changes/<name>/test-evidence/<timestamp>/runner.stderr.log",
+     "stdoutLogSha256": "<sha256 of runner.stdout.log>",
+     "stderrLogSha256": "<sha256 of runner.stderr.log>",
+      "workspaceIdentity": {
+        "changeName": "<name>",
+        "taskId": "<benchmark task id, if any>",
+        "cwd": "<absolute working directory>",
+        "pyproject": null,
+        "packageJson": null
+      },
+     "testMetrics": {
+       "total": 29,
+       "passed": 29,
+       "failed": 0,
+       "passRate": 1
+     },
+     "junitXml": null,
+     "coverageSummary": null,
+     "coverageLcov": null,
+     "coverageHtml": null
+   }
+   \`\`\`
+   Set each workspace identity file entry to an object ONLY when that file
+   exists at the project root. If \`pyproject.toml\` or \`package.json\` is absent,
+   leave that field \`null\` (or omit it); do not emit a \`path\` for an absent file.
+   Object shape for a present file:
+   \`\`\`json
+   {
+     "path": "pyproject.toml",
+     "name": "<project/package name, or null>",
+     "sha256": "<sha256 of the file>"
+   }
+   \`\`\`
+   If the runner summary exposes pass/fail counts, record them in
+   \`testMetrics\`; otherwise set \`testMetrics\` to \`null\` and preserve the raw
+   stdout/stderr logs. The \`stdoutLogSha256\` and \`stderrLogSha256\` fields MUST
+   be the SHA-256 hashes of the exact saved log files, computed after writing the
+   files and before writing \`runner-exit.json\`; do not hand-edit logs after
+   hashing. If the runner produces JUnit XML or coverage artifacts,
+   record their paths in \`runner-exit.json\`. If it does not, keep those fields
+   \`null\`. The markdown report may summarize results, but the raw logs and exit
+   JSON are the durable evidence that later verification must inspect.
 3b. **Promote PBT counterexamples to regression tests**
    Scan the captured stdout/stderr for PBT failure markers. Each major framework prints a minimal (shrunk) counterexample when a property fails:
@@ -52,6 +117,11 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
    **If no PBT tests exist** (no \`.property.test.*\` files found anywhere): note "No PBT tests found — run \`/synspec:gen-tests\` to generate them."
+   **PBT N/A rows**: If \`spec-tests.md\` contains \`PBT Coverage\` rows marked
+   \`➖ N/A\`, do not count those rows as missing PBT tests only when each row cites
+   a covering example/benchmark test path. A \`➖ N/A\` row without a concrete
+   covering test is invalid and should be reported as missing PBT coverage.
 4. **Generate Test Coverage Report**
    Save this file to \`synergyspec-selfevolving/changes/<name>/test-report.md\`.
@@ -82,9 +152,33 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
    | UC1-E4a1 | Error when no grid space | ❌ failed | \`gridSize=0, widgetCount=1\` | \`test/pbt-regression-uc1-e4a1-1.test.ts\` |
    ...
-   ### Test Run Results
-   <summary from test runner output: passed/failed/skipped counts>
-   If failures: list failing test names and errors.
+   ### Test Run Results
+   Summary: <N collected>, <N passed>, <N failed>, <N skipped>, <N collection errors>
+   <raw summary from test runner output: passed/failed/skipped counts>
+   If failures: list failing test names and errors.
+   ### Runner Evidence
+   | Evidence | Path / Value |
+   |----------|--------------|
+   | Command | \`<exact command>\` |
+   | CWD | \`<absolute working directory>\` |
+   | Started | \`<ISO timestamp>\` |
+   | Finished | \`<ISO timestamp>\` |
+   | Exit Code | \`<numeric exit code>\` |
+   | stdout | \`synergyspec-selfevolving/changes/<name>/test-evidence/<timestamp>/runner.stdout.log\` |
+   | stderr | \`synergyspec-selfevolving/changes/<name>/test-evidence/<timestamp>/runner.stderr.log\` |
+   | Exit JSON | \`synergyspec-selfevolving/changes/<name>/test-evidence/<timestamp>/runner-exit.json\` |
+   | Workspace Identity | \`pyproject.toml [project].name=<name>, sha256=<hash>; package.json name=<name>, sha256=<hash>\` |
+   | JUnit XML | \`<path or N/A>\` |
+   | Coverage Summary | \`<path or N/A>\` |
+   The pass/fail verdict must be consistent with \`runner-exit.json\`. If the
+   runner exited non-zero, do not report the suite as passed even if a textual
+   summary appears optimistic.
+   The current root package identity must also match
+   \`runner-exit.json.workspaceIdentity\` before the report can prove the current
+   workspace. If \`pyproject.toml\` or \`package.json\` changes after the run,
+   rerun tests and refresh this evidence instead of archiving stale results.
    \`\`\`
 5. **Generate Test Plan**
@@ -172,6 +266,7 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
 **Graceful Degradation**
 - If tests fail: still show the coverage report; highlight failures separately
+- If durable runner evidence cannot be written: mark the report PARTIAL, explain why, and do not claim the tests passed solely from a self-authored summary
 - If no spec-tests.md: note "Run /synspec:gen-tests first for accurate coverage mapping"
 - If no PBT tests found: note "No PBT tests found — run /synspec:gen-tests to generate them" and skip step 3b
 - If pbt-regressions.md already exists: update it in place (append new entries, update status of previously open regressions that now pass)
@@ -183,7 +278,7 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
 - File:line references for existing tests
 - Specific, actionable recommendations for missing coverage
 - If test plan was generated: "Test plan saved to \`synergyspec-selfevolving/changes/<name>/test-plan.md\` — follow it to manually verify N uncovered steps."
-- If coverage is complete: suggest \`/synspec:verify\` next, then \`/synspec:learn\`, then \`/synspec:archive\`
+- If coverage is complete: suggest \`/synspec:verify\` next, then \`/synspec:learn\`, then \`/synspec:archive\`
 - For full CI pipeline (all tests + e2e + coverage + screenshot comparison in one step): run \`/synspec:ci\``;
 export function getRunTestsSkillTemplate() {
     return {