npm - synergyspec-selfevolving - Versions diffs - 1.1.9 → 1.1.10 - Mend

synergyspec-selfevolving 1.1.9 → 1.1.10

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (9)

package/dist/commands/learn.js +38 -1
package/dist/core/learn.d.ts +19 -0
package/dist/core/learn.js +53 -4
package/dist/core/templates/workflows/learn.js +2 -2
package/dist/core/trajectory/adapters/claude.js +2 -1
package/dist/core/trajectory/adapters/opencode.js +66 -7
package/dist/core/trajectory/facts.d.ts +20 -0
package/dist/core/trajectory/facts.js +146 -33
package/package.json +95 -95

package/dist/commands/learn.js CHANGED Viewed

@@ -4,6 +4,9 @@ import { generateEvolutionHints, lookupCanonicalTarget, persistLearnHints, resol
 import { readProjectConfig } from '../core/project-config.js';
 import { assembleTrajectoryContext, } from '../core/learn/trajectory-assembler.js';
 import { findTranscriptsForChange, resolveChangeDir, } from '../core/learn/trajectory-discovery.js';
+import { getTrajectoryForChange } from '../core/trajectory/registry.js';
+import { toTrajectoryFacts, describeRunnerResults } from '../core/trajectory/facts.js';
+import { resolveHostHarness } from '../core/self-evolution/host-harness.js';
 import { buildLLMSummaryCandidates, ingestLearnHandoff, } from '../core/learn/llm-summary.js';
 function collect(value, previous) {
     previous.push(value);
@@ -14,7 +17,7 @@ export function registerLearnCommand(program) {
         .command('learn [change]')
         .description('Review a completed change and extract reusable learning candidates')
         .option('--preview', 'Preview lessons and memory candidates without writing (default)')
-        .option('--apply', 'Write memory candidates to the local SynergySpec-SelfEvolving memory store; requires --yes')
+        .option('--apply', 'Write memory candidates to the local SynergySpec-SelfEvolving memory store; requires --yes. The learn SKILL runs with --apply --yes to evolve autonomously; the bare CLI previews by default.')
         .option('--only <candidate-id>', 'When applying, write only this keep candidate id (repeatable)', collect, [])
         .option('--exclude <candidate-id>', 'When applying, skip this candidate id (repeatable)', collect, [])
         .option('-y, --yes', 'Confirm --apply and skip confirmation prompts')
@@ -95,6 +98,7 @@ export function registerLearnCommand(program) {
         .command('debug-trajectory <change>')
         .description('Print the assembled TrajectoryContext for a change as JSON. Read-only; runs no LLM handoff and writes nothing.')
         .option('--preview', 'Truncate the trajectory text field to 4000 chars in the output')
+        .option('--harness <name>', 'Force the observed-run trajectory adapter (claude|codex|opencode); defaults to the resolved host harness')
         .action(async (change, opts) => {
         const projectRoot = process.cwd();
         try {
@@ -106,12 +110,45 @@ export function registerLearnCommand(program) {
             });
             const payload = {
                 changeName: change,
+                // Claude-transcript discovery (this is what the `text` assembly below
+                // uses). On a non-Claude host this is EXPECTED to be no-transcript —
+                // the observed-run facts the fitness pipeline grades come from the
+                // `adapter` block instead.
                 discovery: {
                     paths: discovered.paths,
                     sessionIds: discovered.sessionIds,
                     selectionRule: discovered.selectionRule,
                 },
             };
+            // Adapter-aware introspection: the LIVE learn fitness pipeline grades the
+            // OBSERVED run via the host-harness trajectory adapter (registry →
+            // opencode/codex/claude), NOT the Claude-transcript path above. Surface
+            // its facts + per-runner-result breakdown so a misgrade is visible in one
+            // command. `--harness` forces a specific adapter for cross-host debugging.
+            const prevHarnessEnv = process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS;
+            if (opts.harness)
+                process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS = opts.harness;
+            try {
+                const adapterTrajectory = await getTrajectoryForChange(projectRoot, change);
+                payload.adapter = {
+                    resolvedHarness: resolveHostHarness(),
+                    sessionId: adapterTrajectory?.sessionId ?? null,
+                    turns: adapterTrajectory?.turns.length ?? 0,
+                    sourcePaths: adapterTrajectory ? [...new Set(adapterTrajectory.sourcePaths)] : [],
+                    facts: toTrajectoryFacts(adapterTrajectory, change),
+                    runnerResults: describeRunnerResults(adapterTrajectory),
+                };
+            }
+            finally {
+                if (opts.harness) {
+                    if (prevHarnessEnv === undefined) {
+                        delete process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS;
+                    }
+                    else {
+                        process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS = prevHarnessEnv;
+                    }
+                }
+            }
             if (assembled.kind === 'ok') {
                 const t = assembled.trajectory;
                 const truncatedForPreview = opts.preview === true && t.text.length > 4000

package/dist/core/learn.d.ts CHANGED Viewed

@@ -137,6 +137,10 @@ export interface LearnMemoryRetrievalCheck {
     rank?: number;
     matchedMemoryId?: string;
 }
+interface ArtifactFile {
+    relativePath: string;
+    content: string;
+}
 export declare function generateLearnReport(args?: {
     projectRoot?: string;
     changeName?: string;
@@ -179,5 +183,20 @@ export declare function structuredMemoryBody(args: {
 }): string;
 export declare function defaultRetrievalQueries(title: string, tags: string[] | undefined): string[];
 export declare function classifyCandidate(candidate: LearnMemoryCandidate): LearnMemoryCandidate;
+/**
+ * Find lines in a verification artifact that look like UNRESOLVED failure
+ * evidence. The hazard (the same prose-keyword trap as the trajectory runner
+ * detector) is mistaking a PASSED negative-path scenario for a failure: a result
+ * row `| UC3-E7a | Cleanup failure propagates | PASS | … |`, a table header
+ * naming a `Counterexample` / `Regression Test` column, or a list item
+ * `- PASS UC1-E1a: Open fails because …` all merely MENTION failure words while
+ * reporting success. We therefore decide pass-ness structurally (table outcome
+ * cell / PASS-prefixed list item / header row) before keyword-scanning the rest.
+ */
+export declare function extractFailureEvidence(file: ArtifactFile): Array<{
+    file: string;
+    line: string;
+}>;
 export declare function limitText(value: string, maxLength: number): string;
+export {};
 //# sourceMappingURL=learn.d.ts.map

package/dist/core/learn.js CHANGED Viewed

@@ -1381,14 +1381,63 @@ function collectArtifactFiles(artifacts) {
         ...artifacts.evidence,
     ]);
 }
-function extractFailureEvidence(file) {
+/** A markdown table separator row, e.g. `| --- | :---: | ---- |`. */
+function isTableSeparator(line) {
+    return /^\|?\s*:?-{3,}:?\s*(?:\|\s*:?-{3,}:?\s*)*\|?$/.test(line);
+}
+/** Split a markdown table row into trimmed, emphasis-stripped cells; null if not a row. */
+function splitTableCells(line) {
+    if (!line.includes('|'))
+        return null;
+    return line
+        .replace(/^\|/, '')
+        .replace(/\|$/, '')
+        .split('|')
+        .map((c) => c.trim().replace(/\*\*/g, '').replace(/`/g, ''));
+}
+/** A single cell whose whole value is a passing/neutral verdict. */
+const PASS_CELL_RE = /^(?:pass(?:ed|es)?|covered|ok|✓|✔|n\/?a|none|-|—)$/i;
+/**
+ * Find lines in a verification artifact that look like UNRESOLVED failure
+ * evidence. The hazard (the same prose-keyword trap as the trajectory runner
+ * detector) is mistaking a PASSED negative-path scenario for a failure: a result
+ * row `| UC3-E7a | Cleanup failure propagates | PASS | … |`, a table header
+ * naming a `Counterexample` / `Regression Test` column, or a list item
+ * `- PASS UC1-E1a: Open fails because …` all merely MENTION failure words while
+ * reporting success. We therefore decide pass-ness structurally (table outcome
+ * cell / PASS-prefixed list item / header row) before keyword-scanning the rest.
+ */
+export function extractFailureEvidence(file) {
     const lines = file.content.split(/\r?\n/);
+    const nextNonEmpty = (from) => {
+        for (let j = from + 1; j < lines.length; j++) {
+            const t = lines[j].trim();
+            if (t)
+                return t;
+        }
+        return null;
+    };
     const matches = [];
-    for (const line of lines) {
-        const trimmed = line.trim();
+    for (let i = 0; i < lines.length; i++) {
+        const trimmed = lines[i].trim();
         if (!trimmed)
             continue;
-        if (/\|\s*(covered|pass(?:ed|es)?)\s*\|?\s*$/i.test(trimmed)) {
+        if (isTableSeparator(trimmed))
+            continue;
+        const cells = splitTableCells(trimmed);
+        if (cells) {
+            // A header row (its successor is a separator) lists column LABELS such as
+            // "Counterexample" or "Regression Test" — not failure evidence.
+            const successor = nextNonEmpty(i);
+            if (successor !== null && isTableSeparator(successor))
+                continue;
+            // A data row whose outcome/status cell reads PASS/✓/covered is a PASSING
+            // result, even when another cell names the failure scenario it exercises.
+            if (cells.some((c) => PASS_CELL_RE.test(c)))
+                continue;
+        }
+        else if (/^[-*\s]*(?:\*\*)?\s*pass(?:ed|es)?\b/i.test(trimmed)) {
+            // A list item explicitly marked PASS (a passed negative-path scenario).
             continue;
         }
         if (/\b(no|none|zero|0)\s+(failures?|failed|errors?|critical issues)\b/i.test(trimmed)) {

package/dist/core/templates/workflows/learn.js CHANGED Viewed

@@ -2,9 +2,9 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
 **Purpose**
-This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to autonomous self-evolution. By DEFAULT it does two things: (1) summarize reusable lessons and consolidate memory, and (2) evolve the tool itself from those lessons — authoring and promoting a concrete improvement to a canonical prompt/template — WITHOUT asking for confirmation. You (the agent running this skill) are the proposer: you already hold the full change context, so you author the improved file yourself and the CLI validates → gates → promotes it onto the local installed file (no rebuild, no republish, no second agent, no \`claude -p\`).
+This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to autonomous self-evolution. When you run this skill you invoke learn with \`--apply --yes\` (the bare \`learn\` CLI previews by default — writing nothing — for safety; the skill opts into evolution explicitly). That does two things: (1) summarize reusable lessons and consolidate memory, and (2) evolve the tool itself from those lessons — authoring and promoting a concrete improvement to a canonical prompt/template — WITHOUT asking for confirmation. You (the agent running this skill) are the proposer: you already hold the full change context, so you author the improved file yourself and the CLI validates → gates → promotes it onto the local installed file (no rebuild, no republish, no second agent, no \`claude -p\`).
-Preview-only is the explicit opt-out: run \`synergyspec-selfevolving learn <change> --preview\` (or pass \`--preview\`) to analyze without evolving.
+Preview-only is the bare-CLI default and the explicit opt-out: run \`synergyspec-selfevolving learn <change> --preview\` (or simply omit \`--apply\`) to analyze without evolving.
 **Default Mode: Autonomous self-evolution**

package/dist/core/trajectory/adapters/claude.js CHANGED Viewed

@@ -323,7 +323,8 @@ class ClaudeTrajectorySource {
                 sessionId: mainSessionId,
                 turns,
                 subagentSessionIds,
-                sourcePaths,
+                // Distinct source files only (one transcript can yield many turns).
+                sourcePaths: [...new Set(sourcePaths)],
             };
         }
         catch {

package/dist/core/trajectory/adapters/opencode.js CHANGED Viewed

@@ -46,8 +46,10 @@
  *     ToolPart{type:'tool', state}     -> pending|running => ToolCallPart{tool, callId, input}
  *                                         completed       => ToolResultPart{tool, callId, output}
  *                                         error           => ToolResultPart{tool, callId, isError, output:error}
- *   exitCode is left null (opencode does not expose one); facts.ts re-sources
- *   the pass rate from `output`.
+ *   exitCode is sourced from `state.metadata.exit` when present (opencode exposes
+ *   the real shell exit code there), else null; facts.ts uses it directly and
+ *   also re-sources the pass rate from `output`. For truncated outputs the full
+ *   text is recovered from `state.metadata.outputPath`.
  *
  * Pure + no throw: every I/O or parse failure degrades to null / an empty list.
  */
@@ -58,9 +60,20 @@ import { readChangeWindow, resolveChangeDir } from '../../learn/trajectory-disco
 import { samePath } from '../path-attribution.js';
 /** Cap individual JSON files we read to avoid blowing up on a giant transcript. */
 const MAX_JSON_BYTES = 8 * 1024 * 1024;
-/** Resolve the opencode data root (env override wins, then ~/.local/share/opencode). */
+/**
+ * Resolve the opencode data root. Precedence: explicit `OPENCODE_DATA_DIR`, then
+ * the XDG base dir (`XDG_DATA_HOME/opencode`) when set, then the cross-platform
+ * default `~/.local/share/opencode`. Honoring XDG_DATA_HOME keeps discovery (and
+ * thus observed-run grading) working where opencode stores its data off the
+ * default path; absent that env it is byte-identical to the prior behavior.
+ */
 function dataRoot(homeDir) {
-    return process.env.OPENCODE_DATA_DIR ?? path.join(homeDir, '.local', 'share', 'opencode');
+    if (process.env.OPENCODE_DATA_DIR)
+        return process.env.OPENCODE_DATA_DIR;
+    const xdg = process.env.XDG_DATA_HOME;
+    if (xdg && xdg.trim().length > 0)
+        return path.join(xdg, 'opencode');
+    return path.join(homeDir, '.local', 'share', 'opencode');
 }
 /** Read + JSON.parse a file, returning null on any miss/oversize/parse error. */
 async function readJsonCapped(file) {
@@ -75,6 +88,33 @@ async function readJsonCapped(file) {
         return null;
     }
 }
+/** Read a UTF-8 text file, returning null on any miss/oversize/read error. */
+async function readTextCapped(file) {
+    try {
+        const stat = await fs.stat(file);
+        if (!stat.isFile() || stat.size > MAX_JSON_BYTES)
+            return null;
+        return await fs.readFile(file, 'utf-8');
+    }
+    catch {
+        return null;
+    }
+}
+/**
+ * When opencode truncates a tool's inline `output` (large logs), the full text is
+ * written to `state.metadata.outputPath`. Recover it (bounded) so a test summary
+ * that fell past the retained inline preview is still parseable. Best-effort:
+ * any miss (no path, not truncated, ENOENT, oversize, read error) leaves the
+ * inline output untouched. Mutates the raw part in place before mapping.
+ */
+async function recoverTruncatedOutput(part) {
+    const meta = part.state?.metadata;
+    if (!meta || meta.truncated !== true || typeof meta.outputPath !== 'string')
+        return;
+    const full = await readTextCapped(meta.outputPath);
+    if (full !== null && part.state)
+        part.state.output = full;
+}
 /** List directory entry names of a given kind; [] on any error. */
 async function listDir(dir, kind) {
     try {
@@ -539,11 +579,20 @@ class SqliteStore {
                 statusStr === 'error'
                 ? statusStr
                 : undefined;
+            const metaRec = asRecord(stateRec.metadata);
             part.state = {
                 status,
                 input: asRecord(stateRec.input) ?? undefined,
                 output: typeof stateRec.output === 'string' ? stateRec.output : undefined,
                 error: typeof stateRec.error === 'string' ? stateRec.error : undefined,
+                metadata: metaRec
+                    ? {
+                        exit: typeof metaRec.exit === 'number' ? metaRec.exit : undefined,
+                        output: typeof metaRec.output === 'string' ? metaRec.output : undefined,
+                        truncated: typeof metaRec.truncated === 'boolean' ? metaRec.truncated : undefined,
+                        outputPath: typeof metaRec.outputPath === 'string' ? metaRec.outputPath : undefined,
+                    }
+                    : undefined,
             };
         }
         return part;
@@ -600,6 +649,13 @@ function mapPart(part) {
             const tool = typeof part.tool === 'string' ? part.tool : 'tool';
             const callId = typeof part.callID === 'string' ? part.callID : undefined;
             const state = part.state ?? {};
+            // opencode DOES expose a real shell exit code via `state.metadata.exit`
+            // (e.g. 0 for a green `pytest` run). Earlier this was hardcoded null, which
+            // forced verification to rely solely on parsing the output summary and made
+            // observed-grading fail closed when the summary was absent/truncated.
+            const exitCode = typeof state.metadata?.exit === 'number' && Number.isFinite(state.metadata.exit)
+                ? state.metadata.exit
+                : null;
             const call = { kind: 'tool_call', tool, callId, input: state.input };
             switch (state.status) {
                 case 'pending':
@@ -613,7 +669,7 @@ function mapPart(part) {
                             tool,
                             callId,
                             output: typeof state.output === 'string' ? state.output : undefined,
-                            exitCode: null,
+                            exitCode,
                         },
                     ];
                 case 'error':
@@ -625,7 +681,7 @@ function mapPart(part) {
                             callId,
                             isError: true,
                             output: typeof state.error === 'string' ? state.error : undefined,
-                            exitCode: null,
+                            exitCode,
                         },
                     ];
                 default:
@@ -663,6 +719,7 @@ async function readSessionTurns(store, sessionId, startIndex) {
         rawParts.sort(byId);
         const parts = [];
         for (const p of rawParts) {
+            await recoverTruncatedOutput(p);
             for (const mapped of mapPart(p))
                 parts.push(mapped);
         }
@@ -795,7 +852,9 @@ class OpencodeSource {
             sessionId: main.id,
             turns,
             subagentSessionIds,
-            sourcePaths,
+            // De-dupe: one label is pushed per message/turn, so the same db file (or
+            // dir) repeats once per turn. Distinct sources only (audit-trail clarity).
+            sourcePaths: [...new Set(sourcePaths)],
         };
     }
 }

package/dist/core/trajectory/facts.d.ts CHANGED Viewed

@@ -36,4 +36,24 @@ export interface TrajectoryFacts {
  * test-report and preserve byte-identical baseline behaviour.
  */
 export declare function toTrajectoryFacts(trajectory: NormalizedTrajectory | null, changeName: string): TrajectoryFacts | null;
+/** One observed runner result, as the grader sees it — for `debug-trajectory`. */
+export interface RunnerResultDetail {
+    tool: string | null;
+    callId: string | null;
+    command: string | null;
+    outputLength: number;
+    outputLastLine: string;
+    parsedPassRate: number | null;
+    exitCode: number | null;
+    isError: boolean;
+    /** True for the single result that decided the verdict (see gradedRunnerIndex). */
+    graded: boolean;
+}
+/**
+ * Explain WHICH runner results the grader saw and which one it graded. This is
+ * the introspection the observed-grading path otherwise hides — surfaced by
+ * `learn debug-trajectory` so a misgrade (e.g. a file-write quoting a test
+ * command shadowing a real run) is visible in one command. Pure; [] when null.
+ */
+export declare function describeRunnerResults(trajectory: NormalizedTrajectory | null): RunnerResultDetail[];
 //# sourceMappingURL=facts.d.ts.map

package/dist/core/trajectory/facts.js CHANGED Viewed

@@ -13,70 +13,156 @@
  * Pure + no throw.
  */
 import { parseTestMetrics } from '../fitness/test-metrics.js';
+/**
+ * Matches the NAME of a shell/command-executing tool across harnesses — Claude
+ * `Bash`; opencode `bash`; Codex `shell`/`local_shell`/`shell_command`/`exec`;
+ * cursor `run_terminal_cmd`; etc. Runner detection is GATED on this (deny by
+ * default): a file-mutating tool (`apply_patch`/`write`/`edit`/`multiedit`) whose
+ * payload merely QUOTES a test command must never be mistaken for a test run —
+ * that false positive, paired with the "last runner result wins" rule below,
+ * silently blanked observed-grading on a real run (a verification-report write
+ * whose patch text quoted `pytest tests`). A NAME we don't recognize as a shell
+ * tool degrades to `testRunObserved = false` (soft penalty), never to a false
+ * green. We pattern-match (not an exact set) so harness naming drift
+ * (`shell_command` vs `local_shell`) doesn't silently disable grading; the shell
+ * tokens are word-boundaried on `._-` so `apply_patch`/`multiedit`/`str_replace`
+ * never match.
+ */
+const EXEC_TOOL_RE = /(?:^|[._-])(?:bash|sh|zsh|fish|pwsh|powershell|shell|cmd|exec|command|terminal|run)(?:[._-]|$)/i;
+function isExecTool(tool) {
+    return tool !== undefined && EXEC_TOOL_RE.test(tool);
+}
+/**
+ * Input fields that carry the executed command line, in preference order. When
+ * one is present we match the runner regex against IT ALONE, so a sibling prose
+ * field (e.g. a Bash call's `description`) that merely mentions a test command
+ * cannot trigger a match. Falls back to scanning every string value when no
+ * command field is present — keeps detection working for unknown input shapes,
+ * and a false negative there is the safe direction.
+ */
+const COMMAND_FIELDS = ['command', 'cmd', 'script', 'argv', 'args'];
 /**
  * Recognizes a test-runner invocation from a tool call's decoded arguments.
- * Best-effort and harness-neutral: we stringify the input values and match the
- * canonical runner commands. A false negative degrades to `testRunObserved =
- * false` (soft penalty), never to a crash.
+ * Best-effort and harness-neutral. The CALLER must gate on {@link isExecTool}
+ * first; this only inspects the command-bearing field(s) of the input.
  */
 const RUNNER_RE = /\b(?:vitest|jest|mocha|playwright\s+test|cypress\s+run|pytest|py\.test|python\s+-m\s+(?:pytest|unittest)|unittest|go\s+test|cargo\s+test|cargo\s+nextest|rspec|gradle(?:w)?\s+test|mvn\s+test|dotnet\s+test|ctest|npm\s+(?:run\s+)?test|yarn\s+(?:run\s+)?test|pnpm\s+(?:run\s+)?test|bun\s+test)\b/i;
+function matchesRunner(value) {
+    if (typeof value === 'string')
+        return RUNNER_RE.test(value);
+    // Codex/opencode shell args often arrive as an argv array.
+    if (Array.isArray(value)) {
+        return RUNNER_RE.test(value.filter((v) => typeof v === 'string').join(' '));
+    }
+    return false;
+}
 function inputLooksLikeRunner(input) {
     if (!input)
         return false;
-    for (const value of Object.values(input)) {
-        if (typeof value === 'string') {
-            if (RUNNER_RE.test(value))
-                return true;
-        }
-        else if (Array.isArray(value)) {
-            // Codex/opencode shell args often arrive as an argv array.
-            if (RUNNER_RE.test(value.filter((v) => typeof v === 'string').join(' ')))
-                return true;
-        }
+    const preferred = [];
+    for (const f of COMMAND_FIELDS) {
+        if (Object.prototype.hasOwnProperty.call(input, f))
+            preferred.push(input[f]);
     }
-    return false;
+    // Codex wraps a raw (non-JSON) command line as `{ input: "<cmd>" }`.
+    if (preferred.length === 0 && typeof input.input === 'string')
+        preferred.push(input.input);
+    const values = preferred.length > 0 ? preferred : Object.values(input);
+    return values.some(matchesRunner);
+}
+/** Best-effort command text from a tool_call input, for debug display. */
+function commandText(input) {
+    if (!input)
+        return undefined;
+    for (const f of COMMAND_FIELDS) {
+        const v = input[f];
+        if (typeof v === 'string')
+            return v;
+        if (Array.isArray(v))
+            return v.filter((x) => typeof x === 'string').join(' ');
+    }
+    if (typeof input.input === 'string')
+        return input.input;
+    return undefined;
 }
 /**
- * Compute the {@link TrajectoryFacts} for a change. Returns `null` when there is
- * no trajectory at all, so callers can cleanly fall back to the authored
- * test-report and preserve byte-identical baseline behaviour.
+ * Walk the trajectory once, pairing runner results to runner CALLS by callId
+ * (with a positional fallback for records that omit one), and return the runner
+ * results plus the total tool-call count. Single-sourced so {@link toTrajectoryFacts}
+ * and {@link describeRunnerResults} can never drift on what counts as a run.
  */
-export function toTrajectoryFacts(trajectory, changeName) {
-    if (!trajectory)
-        return null;
-    // Walk parts in order, pairing runner results to runner calls by callId,
-    // with a positional fallback for records that omit one.
-    const runnerByCallId = new Map();
-    let lastCallWasRunner = false;
+function collectRunnerResults(trajectory) {
+    const callMetaById = new Map();
+    let lastCall = null;
     let toolCallCount = 0;
     const runnerResults = [];
     for (const turn of trajectory.turns) {
         for (const part of turn.parts) {
             if (part.kind === 'tool_call') {
                 toolCallCount++;
-                const isRunner = inputLooksLikeRunner(part.input);
-                lastCallWasRunner = isRunner;
+                const meta = {
+                    isRunner: isExecTool(part.tool) && inputLooksLikeRunner(part.input),
+                    tool: part.tool,
+                    command: commandText(part.input),
+                };
+                lastCall = meta;
                 if (part.callId)
-                    runnerByCallId.set(part.callId, isRunner);
+                    callMetaById.set(part.callId, meta);
             }
             else if (part.kind === 'tool_result') {
-                const matched = part.callId && runnerByCallId.has(part.callId)
-                    ? runnerByCallId.get(part.callId) === true
-                    : !part.callId && lastCallWasRunner;
-                if (matched) {
+                const meta = part.callId && callMetaById.has(part.callId)
+                    ? callMetaById.get(part.callId)
+                    : !part.callId && lastCall
+                        ? lastCall
+                        : undefined;
+                if (meta?.isRunner) {
                     runnerResults.push({
                         output: part.output,
                         isError: part.isError,
                         exitCode: part.exitCode ?? null,
+                        tool: meta.tool ?? part.tool,
+                        callId: part.callId,
+                        command: meta.command,
                     });
                 }
                 // A result consumes the positional pairing slot.
-                lastCallWasRunner = false;
+                lastCall = null;
             }
         }
     }
+    return { runnerResults, toolCallCount };
+}
+/** A runner result carries a usable signal: metrics, a numeric exit, or an error. */
+function hasSignal(r) {
+    return (typeof r.exitCode === 'number' ||
+        r.isError === true ||
+        (r.output != null && parseTestMetrics(r.output) !== null));
+}
+/**
+ * Index of the runner result that decides the verdict: the LAST one carrying a
+ * signal, so a trailing exec call that merely matched a runner keyword but
+ * produced no test summary (e.g. `echo "pytest"`) can't blank a real prior run.
+ * Falls back to the final result when none carries a signal; -1 when empty.
+ */
+function gradedRunnerIndex(runnerResults) {
+    for (let i = runnerResults.length - 1; i >= 0; i--) {
+        if (hasSignal(runnerResults[i]))
+            return i;
+    }
+    return runnerResults.length - 1;
+}
+/**
+ * Compute the {@link TrajectoryFacts} for a change. Returns `null` when there is
+ * no trajectory at all, so callers can cleanly fall back to the authored
+ * test-report and preserve byte-identical baseline behaviour.
+ */
+export function toTrajectoryFacts(trajectory, changeName) {
+    if (!trajectory)
+        return null;
+    const { runnerResults, toolCallCount } = collectRunnerResults(trajectory);
     const testRunObserved = runnerResults.length > 0;
-    const last = testRunObserved ? runnerResults[runnerResults.length - 1] : null;
+    const gradedIdx = gradedRunnerIndex(runnerResults);
+    const last = gradedIdx >= 0 ? runnerResults[gradedIdx] : null;
     const runnerExitCode = last && typeof last.exitCode === 'number' ? last.exitCode : null;
     const observedPassRate = last && last.output ? (parseTestMetrics(last.output)?.passRate ?? null) : null;
     let observedStatus = null;
@@ -100,4 +186,31 @@ export function toTrajectoryFacts(trajectory, changeName) {
         sourcePaths: trajectory.sourcePaths,
     };
 }
+/**
+ * Explain WHICH runner results the grader saw and which one it graded. This is
+ * the introspection the observed-grading path otherwise hides — surfaced by
+ * `learn debug-trajectory` so a misgrade (e.g. a file-write quoting a test
+ * command shadowing a real run) is visible in one command. Pure; [] when null.
+ */
+export function describeRunnerResults(trajectory) {
+    if (!trajectory)
+        return [];
+    const { runnerResults } = collectRunnerResults(trajectory);
+    const gradedIdx = gradedRunnerIndex(runnerResults);
+    return runnerResults.map((r, i) => {
+        const output = typeof r.output === 'string' ? r.output : '';
+        const lastLine = output.trim().split(/\r?\n/).filter(Boolean).pop() ?? '';
+        return {
+            tool: r.tool ?? null,
+            callId: r.callId ?? null,
+            command: r.command ?? null,
+            outputLength: output.length,
+            outputLastLine: lastLine.slice(0, 200),
+            parsedPassRate: output ? (parseTestMetrics(output)?.passRate ?? null) : null,
+            exitCode: r.exitCode ?? null,
+            isError: r.isError === true,
+            graded: i === gradedIdx,
+        };
+    });
+}
 //# sourceMappingURL=facts.js.map

package/package.json CHANGED Viewed

@@ -1,95 +1,95 @@
-{
-  "name": "synergyspec-selfevolving",
-  "version": "1.1.9",
-  "description": "AI-native system for spec-driven development",
-  "keywords": [
-    "synergyspec-selfevolving",
-    "openspec",
-    "spec-driven",
-    "specs",
-    "cli",
-    "ai",
-    "development"
-  ],
-  "homepage": "https://github.com/ZhifeiDou/SynergySpec-SelfEvolving",
-  "repository": {
-    "type": "git",
-    "url": "git+https://github.com/ZhifeiDou/SynergySpec-SelfEvolving.git"
-  },
-  "license": "MIT",
-  "author": "Zhifei Dou",
-  "type": "module",
-  "publishConfig": {
-    "access": "public"
-  },
-  "exports": {
-    ".": {
-      "types": "./dist/index.d.ts",
-      "default": "./dist/index.js"
-    }
-  },
-  "bin": {
-    "synergyspec-selfevolving": "bin/synergyspec-selfevolving.js"
-  },
-  "files": [
-    "dist",
-    "bin",
-    "schemas",
-    "scripts/postinstall.js",
-    "scripts/nl2repo_synergyspec-selfevolving_wrapper.py",
-    "!dist/**/*.test.js",
-    "!dist/**/__tests__",
-    "!dist/**/*.map"
-  ],
-  "scripts": {
-    "lint": "eslint src/",
-    "build": "node build.js",
-    "dev": "tsc --watch",
-    "dev:cli": "pnpm build && node bin/synergyspec-selfevolving.js",
-    "test": "vitest run",
-    "test:watch": "vitest",
-    "test:ui": "vitest --ui",
-    "test:coverage": "vitest run --coverage",
-    "test:postinstall": "node scripts/postinstall.js",
-    "test:e2e:real-agent": "node smoking-test/run-real-agent-self-evolution-e2e.mjs --local-pack",
-    "prepare": "pnpm run build",
-    "prepublishOnly": "pnpm run build && pnpm run check:pack-version && pnpm run check:pack-contents",
-    "postinstall": "node scripts/postinstall.js",
-    "check:docs": "node scripts/docs-check.mjs",
-    "check:pack-version": "node scripts/pack-version-check.mjs",
-    "check:pack-contents": "node scripts/pack-contents-check.mjs",
-    "release": "pnpm run release:ci",
-    "release:ci": "pnpm run check:pack-version && pnpm run check:pack-contents && pnpm exec changeset publish",
-    "changeset": "changeset"
-  },
-  "engines": {
-    "node": ">=20.19.0"
-  },
-  "devDependencies": {
-    "@changesets/changelog-github": "^0.5.2",
-    "@changesets/cli": "^2.27.7",
-    "@types/node": "^24.2.0",
-    "@vitest/coverage-v8": "^3.2.4",
-    "@vitest/ui": "^3.2.4",
-    "eslint": "^9.39.2",
-    "fast-check": "^4.8.0",
-    "typescript": "^5.9.3",
-    "typescript-eslint": "^8.50.1",
-    "vitest": "^3.2.4"
-  },
-  "dependencies": {
-    "@inquirer/core": "^10.2.2",
-    "@inquirer/prompts": "^7.8.0",
-    "ansi-regex": "^5.0.1",
-    "chalk": "^5.5.0",
-    "commander": "^14.0.0",
-    "fast-glob": "^3.3.3",
-    "ora": "^8.2.0",
-    "posthog-node": "^5.20.0",
-    "react": "^18.3.1",
-    "react-dom": "^18.3.1",
-    "tsx": "^4.20.6",
-    "yaml": "^2.8.2",
-    "zod": "^4.0.17"
-  }
-}
+{
+  "name": "synergyspec-selfevolving",
+  "version": "1.1.10",
+  "description": "AI-native system for spec-driven development",
+  "keywords": [
+    "synergyspec-selfevolving",
+    "openspec",
+    "spec-driven",
+    "specs",
+    "cli",
+    "ai",
+    "development"
+  ],
+  "homepage": "https://github.com/ZhifeiDou/SynergySpec-SelfEvolving",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/ZhifeiDou/SynergySpec-SelfEvolving.git"
+  },
+  "license": "MIT",
+  "author": "Zhifei Dou",
+  "type": "module",
+  "publishConfig": {
+    "access": "public"
+  },
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "default": "./dist/index.js"
+    }
+  },
+  "bin": {
+    "synergyspec-selfevolving": "bin/synergyspec-selfevolving.js"
+  },
+  "files": [
+    "dist",
+    "bin",
+    "schemas",
+    "scripts/postinstall.js",
+    "scripts/nl2repo_synergyspec-selfevolving_wrapper.py",
+    "!dist/**/*.test.js",
+    "!dist/**/__tests__",
+    "!dist/**/*.map"
+  ],
+  "scripts": {
+    "lint": "eslint src/",
+    "build": "node build.js",
+    "dev": "tsc --watch",
+    "dev:cli": "pnpm build && node bin/synergyspec-selfevolving.js",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "test:ui": "vitest --ui",
+    "test:coverage": "vitest run --coverage",
+    "test:postinstall": "node scripts/postinstall.js",
+    "test:e2e:real-agent": "node smoking-test/run-real-agent-self-evolution-e2e.mjs --local-pack",
+    "prepare": "pnpm run build",
+    "prepublishOnly": "pnpm run build && pnpm run check:pack-version && pnpm run check:pack-contents",
+    "postinstall": "node scripts/postinstall.js",
+    "check:docs": "node scripts/docs-check.mjs",
+    "check:pack-version": "node scripts/pack-version-check.mjs",
+    "check:pack-contents": "node scripts/pack-contents-check.mjs",
+    "release": "pnpm run release:ci",
+    "release:ci": "pnpm run check:pack-version && pnpm run check:pack-contents && pnpm exec changeset publish",
+    "changeset": "changeset"
+  },
+  "engines": {
+    "node": ">=20.19.0"
+  },
+  "devDependencies": {
+    "@changesets/changelog-github": "^0.5.2",
+    "@changesets/cli": "^2.27.7",
+    "@types/node": "^24.2.0",
+    "@vitest/coverage-v8": "^3.2.4",
+    "@vitest/ui": "^3.2.4",
+    "eslint": "^9.39.2",
+    "fast-check": "^4.8.0",
+    "typescript": "^5.9.3",
+    "typescript-eslint": "^8.50.1",
+    "vitest": "^3.2.4"
+  },
+  "dependencies": {
+    "@inquirer/core": "^10.2.2",
+    "@inquirer/prompts": "^7.8.0",
+    "ansi-regex": "^5.0.1",
+    "chalk": "^5.5.0",
+    "commander": "^14.0.0",
+    "fast-glob": "^3.3.3",
+    "ora": "^8.2.0",
+    "posthog-node": "^5.20.0",
+    "react": "^18.3.1",
+    "react-dom": "^18.3.1",
+    "tsx": "^4.20.6",
+    "yaml": "^2.8.2",
+    "zod": "^4.0.17"
+  }
+}