synergyspec-selfevolving 1.1.9 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,9 @@ import { generateEvolutionHints, lookupCanonicalTarget, persistLearnHints, resol
4
4
  import { readProjectConfig } from '../core/project-config.js';
5
5
  import { assembleTrajectoryContext, } from '../core/learn/trajectory-assembler.js';
6
6
  import { findTranscriptsForChange, resolveChangeDir, } from '../core/learn/trajectory-discovery.js';
7
+ import { getTrajectoryForChange } from '../core/trajectory/registry.js';
8
+ import { toTrajectoryFacts, describeRunnerResults } from '../core/trajectory/facts.js';
9
+ import { resolveHostHarness } from '../core/self-evolution/host-harness.js';
7
10
  import { buildLLMSummaryCandidates, ingestLearnHandoff, } from '../core/learn/llm-summary.js';
8
11
  function collect(value, previous) {
9
12
  previous.push(value);
@@ -14,7 +17,7 @@ export function registerLearnCommand(program) {
14
17
  .command('learn [change]')
15
18
  .description('Review a completed change and extract reusable learning candidates')
16
19
  .option('--preview', 'Preview lessons and memory candidates without writing (default)')
17
- .option('--apply', 'Write memory candidates to the local SynergySpec-SelfEvolving memory store; requires --yes')
20
+ .option('--apply', 'Write memory candidates to the local SynergySpec-SelfEvolving memory store; requires --yes. The learn SKILL runs with --apply --yes to evolve autonomously; the bare CLI previews by default.')
18
21
  .option('--only <candidate-id>', 'When applying, write only this keep candidate id (repeatable)', collect, [])
19
22
  .option('--exclude <candidate-id>', 'When applying, skip this candidate id (repeatable)', collect, [])
20
23
  .option('-y, --yes', 'Confirm --apply and skip confirmation prompts')
@@ -95,6 +98,7 @@ export function registerLearnCommand(program) {
95
98
  .command('debug-trajectory <change>')
96
99
  .description('Print the assembled TrajectoryContext for a change as JSON. Read-only; runs no LLM handoff and writes nothing.')
97
100
  .option('--preview', 'Truncate the trajectory text field to 4000 chars in the output')
101
+ .option('--harness <name>', 'Force the observed-run trajectory adapter (claude|codex|opencode); defaults to the resolved host harness')
98
102
  .action(async (change, opts) => {
99
103
  const projectRoot = process.cwd();
100
104
  try {
@@ -106,12 +110,45 @@ export function registerLearnCommand(program) {
106
110
  });
107
111
  const payload = {
108
112
  changeName: change,
113
+ // Claude-transcript discovery (this is what the `text` assembly below
114
+ // uses). On a non-Claude host this is EXPECTED to be no-transcript —
115
+ // the observed-run facts the fitness pipeline grades come from the
116
+ // `adapter` block instead.
109
117
  discovery: {
110
118
  paths: discovered.paths,
111
119
  sessionIds: discovered.sessionIds,
112
120
  selectionRule: discovered.selectionRule,
113
121
  },
114
122
  };
123
+ // Adapter-aware introspection: the LIVE learn fitness pipeline grades the
124
+ // OBSERVED run via the host-harness trajectory adapter (registry →
125
+ // opencode/codex/claude), NOT the Claude-transcript path above. Surface
126
+ // its facts + per-runner-result breakdown so a misgrade is visible in one
127
+ // command. `--harness` forces a specific adapter for cross-host debugging.
128
+ const prevHarnessEnv = process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS;
129
+ if (opts.harness)
130
+ process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS = opts.harness;
131
+ try {
132
+ const adapterTrajectory = await getTrajectoryForChange(projectRoot, change);
133
+ payload.adapter = {
134
+ resolvedHarness: resolveHostHarness(),
135
+ sessionId: adapterTrajectory?.sessionId ?? null,
136
+ turns: adapterTrajectory?.turns.length ?? 0,
137
+ sourcePaths: adapterTrajectory ? [...new Set(adapterTrajectory.sourcePaths)] : [],
138
+ facts: toTrajectoryFacts(adapterTrajectory, change),
139
+ runnerResults: describeRunnerResults(adapterTrajectory),
140
+ };
141
+ }
142
+ finally {
143
+ if (opts.harness) {
144
+ if (prevHarnessEnv === undefined) {
145
+ delete process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS;
146
+ }
147
+ else {
148
+ process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS = prevHarnessEnv;
149
+ }
150
+ }
151
+ }
115
152
  if (assembled.kind === 'ok') {
116
153
  const t = assembled.trajectory;
117
154
  const truncatedForPreview = opts.preview === true && t.text.length > 4000
@@ -137,6 +137,10 @@ export interface LearnMemoryRetrievalCheck {
137
137
  rank?: number;
138
138
  matchedMemoryId?: string;
139
139
  }
140
+ interface ArtifactFile {
141
+ relativePath: string;
142
+ content: string;
143
+ }
140
144
  export declare function generateLearnReport(args?: {
141
145
  projectRoot?: string;
142
146
  changeName?: string;
@@ -179,5 +183,20 @@ export declare function structuredMemoryBody(args: {
179
183
  }): string;
180
184
  export declare function defaultRetrievalQueries(title: string, tags: string[] | undefined): string[];
181
185
  export declare function classifyCandidate(candidate: LearnMemoryCandidate): LearnMemoryCandidate;
186
+ /**
187
+ * Find lines in a verification artifact that look like UNRESOLVED failure
188
+ * evidence. The hazard (the same prose-keyword trap as the trajectory runner
189
+ * detector) is mistaking a PASSED negative-path scenario for a failure: a result
190
+ * row `| UC3-E7a | Cleanup failure propagates | PASS | … |`, a table header
191
+ * naming a `Counterexample` / `Regression Test` column, or a list item
192
+ * `- PASS UC1-E1a: Open fails because …` all merely MENTION failure words while
193
+ * reporting success. We therefore decide pass-ness structurally (table outcome
194
+ * cell / PASS-prefixed list item / header row) before keyword-scanning the rest.
195
+ */
196
+ export declare function extractFailureEvidence(file: ArtifactFile): Array<{
197
+ file: string;
198
+ line: string;
199
+ }>;
182
200
  export declare function limitText(value: string, maxLength: number): string;
201
+ export {};
183
202
  //# sourceMappingURL=learn.d.ts.map
@@ -1381,14 +1381,63 @@ function collectArtifactFiles(artifacts) {
1381
1381
  ...artifacts.evidence,
1382
1382
  ]);
1383
1383
  }
1384
- function extractFailureEvidence(file) {
1384
/**
 * Report whether `line` is a markdown table delimiter row, e.g.
 * `| --- | :---: | ---- |`.
 *
 * GFM only requires ONE hyphen per delimiter cell, so compact rows such as
 * `|:-:|-:|` or `| - | -- |` are accepted as long as the line contains a pipe.
 * A pipe-less line still needs three or more hyphens (`---`, `:---:`), which
 * keeps a lone `-` or `--` from being classified as a separator.
 *
 * @param line A single line of text; surrounding whitespace is ignored.
 * @returns `true` when every cell consists solely of hyphens with optional
 *   alignment colons.
 */
function isTableSeparator(line) {
    const row = line.trim();
    if (!row.includes('|'))
        return /^:?-{3,}:?$/.test(row);
    return /^\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?$/.test(row);
}
1388
/**
 * Split a markdown table row into normalized cell texts.
 *
 * The line is trimmed, one leading and one trailing pipe are dropped, and the
 * remainder is split on `|`. Each cell is then trimmed and stripped of bold
 * markers (`**`) and backticks. A cell that is still wrapped in a matching
 * `*` / `_` emphasis run (`*PASS*`, `_passed_`, `__ok__`) is unwrapped as well,
 * so a verdict written with emphasis compares equal to the bare word.
 *
 * @param line A single line of text.
 * @returns The list of cell texts, or `null` when the line contains no pipe
 *   and therefore cannot be a table row.
 */
function splitTableCells(line) {
    const row = line.trim();
    if (!row.includes('|'))
        return null;
    return row
        .replace(/^\|/, '')
        .replace(/\|$/, '')
        .split('|')
        .map((cell) => {
            // Re-trim after stripping so `** ok **` does not keep inner padding.
            const plain = cell.trim().replace(/\*\*/g, '').replace(/`/g, '').trim();
            const wrapped = /^([*_]{1,3})(.+)\1$/.exec(plain);
            return wrapped ? wrapped[2].trim() : plain;
        });
}
1398
/**
 * Matches a single table cell whose WHOLE value is a passing or neutral
 * verdict: a pass word (`pass`/`passed`/`passes`/`covered`/`ok`), a check mark
 * (`✓`, `✔`, `✅`, `☑`, optionally followed by the U+FE0F emoji selector), a
 * pass word decorated with such a mark on either side (`✅ PASS`, `passed ✔`),
 * or a neutral placeholder (`n/a`, `na`, `none`, `-`, `—`). Case-insensitive
 * and anchored, so a cell that merely contains one of these words in a longer
 * phrase does not match.
 */
const PASS_CELL_RE = /^(?:(?:[✓✔✅☑]\uFE0F?\s*)?(?:pass(?:ed|es)?|covered|ok)(?:\s*[✓✔✅☑]\uFE0F?)?|[✓✔✅☑]\uFE0F?|n\/?a|none|-|—)$/i;
1400
+ /**
1401
+ * Find lines in a verification artifact that look like UNRESOLVED failure
1402
+ * evidence. The hazard (the same prose-keyword trap as the trajectory runner
1403
+ * detector) is mistaking a PASSED negative-path scenario for a failure: a result
1404
+ * row `| UC3-E7a | Cleanup failure propagates | PASS | … |`, a table header
1405
+ * naming a `Counterexample` / `Regression Test` column, or a list item
1406
+ * `- PASS UC1-E1a: Open fails because …` all merely MENTION failure words while
1407
+ * reporting success. We therefore decide pass-ness structurally (table outcome
1408
+ * cell / PASS-prefixed list item / header row) before keyword-scanning the rest.
1409
+ */
1410
+ export function extractFailureEvidence(file) {
1385
1411
  const lines = file.content.split(/\r?\n/);
1412
+ const nextNonEmpty = (from) => {
1413
+ for (let j = from + 1; j < lines.length; j++) {
1414
+ const t = lines[j].trim();
1415
+ if (t)
1416
+ return t;
1417
+ }
1418
+ return null;
1419
+ };
1386
1420
  const matches = [];
1387
- for (const line of lines) {
1388
- const trimmed = line.trim();
1421
+ for (let i = 0; i < lines.length; i++) {
1422
+ const trimmed = lines[i].trim();
1389
1423
  if (!trimmed)
1390
1424
  continue;
1391
- if (/\|\s*(covered|pass(?:ed|es)?)\s*\|?\s*$/i.test(trimmed)) {
1425
+ if (isTableSeparator(trimmed))
1426
+ continue;
1427
+ const cells = splitTableCells(trimmed);
1428
+ if (cells) {
1429
+ // A header row (its successor is a separator) lists column LABELS such as
1430
+ // "Counterexample" or "Regression Test" — not failure evidence.
1431
+ const successor = nextNonEmpty(i);
1432
+ if (successor !== null && isTableSeparator(successor))
1433
+ continue;
1434
+ // A data row whose outcome/status cell reads PASS/✓/covered is a PASSING
1435
+ // result, even when another cell names the failure scenario it exercises.
1436
+ if (cells.some((c) => PASS_CELL_RE.test(c)))
1437
+ continue;
1438
+ }
1439
+ else if (/^[-*\s]*(?:\*\*)?\s*pass(?:ed|es)?\b/i.test(trimmed)) {
1440
+ // A list item explicitly marked PASS (a passed negative-path scenario).
1392
1441
  continue;
1393
1442
  }
1394
1443
  if (/\b(no|none|zero|0)\s+(failures?|failed|errors?|critical issues)\b/i.test(trimmed)) {
@@ -2,9 +2,9 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
2
2
 
3
3
  **Purpose**
4
4
 
5
- This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to autonomous self-evolution. By DEFAULT it does two things: (1) summarize reusable lessons and consolidate memory, and (2) evolve the tool itself from those lessons — authoring and promoting a concrete improvement to a canonical prompt/template — WITHOUT asking for confirmation. You (the agent running this skill) are the proposer: you already hold the full change context, so you author the improved file yourself and the CLI validates → gates → promotes it onto the local installed file (no rebuild, no republish, no second agent, no \`claude -p\`).
5
+ This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to autonomous self-evolution. When you run this skill you invoke learn with \`--apply --yes\` (the bare \`learn\` CLI previews by default — writing nothing — for safety; the skill opts into evolution explicitly). That does two things: (1) summarize reusable lessons and consolidate memory, and (2) evolve the tool itself from those lessons — authoring and promoting a concrete improvement to a canonical prompt/template — WITHOUT asking for confirmation. You (the agent running this skill) are the proposer: you already hold the full change context, so you author the improved file yourself and the CLI validates → gates → promotes it onto the local installed file (no rebuild, no republish, no second agent, no \`claude -p\`).
6
6
 
7
- Preview-only is the explicit opt-out: run \`synergyspec-selfevolving learn <change> --preview\` (or pass \`--preview\`) to analyze without evolving.
7
+ Preview-only is the bare-CLI default and the explicit opt-out: run \`synergyspec-selfevolving learn <change> --preview\` (or simply omit \`--apply\`) to analyze without evolving.
8
8
 
9
9
  **Default Mode: Autonomous self-evolution**
10
10
 
@@ -323,7 +323,8 @@ class ClaudeTrajectorySource {
323
323
  sessionId: mainSessionId,
324
324
  turns,
325
325
  subagentSessionIds,
326
- sourcePaths,
326
+ // Distinct source files only (one transcript can yield many turns).
327
+ sourcePaths: [...new Set(sourcePaths)],
327
328
  };
328
329
  }
329
330
  catch {
@@ -46,8 +46,10 @@
46
46
  * ToolPart{type:'tool', state} -> pending|running => ToolCallPart{tool, callId, input}
47
47
  * completed => ToolResultPart{tool, callId, output}
48
48
  * error => ToolResultPart{tool, callId, isError, output:error}
49
- * exitCode is left null (opencode does not expose one); facts.ts re-sources
50
- * the pass rate from `output`.
49
+ * exitCode is sourced from `state.metadata.exit` when present (opencode exposes
50
+ * the real shell exit code there), else null; facts.ts uses it directly and
51
+ * also re-sources the pass rate from `output`. For truncated outputs the full
52
+ * text is recovered from `state.metadata.outputPath`.
51
53
  *
52
54
  * Pure + no throw: every I/O or parse failure degrades to null / an empty list.
53
55
  */
@@ -58,9 +60,20 @@ import { readChangeWindow, resolveChangeDir } from '../../learn/trajectory-disco
58
60
  import { samePath } from '../path-attribution.js';
59
61
  /** Cap individual JSON files we read to avoid blowing up on a giant transcript. */
60
62
  const MAX_JSON_BYTES = 8 * 1024 * 1024;
61
- /** Resolve the opencode data root (env override wins, then ~/.local/share/opencode). */
63
/**
 * Resolve the opencode data root.
 *
 * Precedence:
 *   1. `OPENCODE_DATA_DIR`, returned verbatim;
 *   2. `XDG_DATA_HOME` joined with `opencode`;
 *   3. `<homeDir>/.local/share/opencode`.
 *
 * An environment variable that is unset, empty, or whitespace-only is treated
 * as absent at BOTH levels, so a blank override can never become the data
 * root.
 *
 * @param homeDir Home directory used for the default location.
 * @returns The directory path to use as the opencode data root.
 */
function dataRoot(homeDir) {
    const isSet = (value) => typeof value === 'string' && value.trim().length > 0;
    const override = process.env.OPENCODE_DATA_DIR;
    if (isSet(override))
        return override;
    const xdg = process.env.XDG_DATA_HOME;
    if (isSet(xdg))
        return path.join(xdg, 'opencode');
    return path.join(homeDir, '.local', 'share', 'opencode');
}
65
78
  /** Read + JSON.parse a file, returning null on any miss/oversize/parse error. */
66
79
  async function readJsonCapped(file) {
@@ -75,6 +88,33 @@ async function readJsonCapped(file) {
75
88
  return null;
76
89
  }
77
90
  }
91
/**
 * Read a file as UTF-8 text with a size cap.
 *
 * @param file Path of the file to read.
 * @returns The file's text, or `null` when the path is not a regular file, is
 *   larger than `MAX_JSON_BYTES`, or any stat/read call throws.
 */
async function readTextCapped(file) {
    try {
        const info = await fs.stat(file);
        const withinCap = info.isFile() && !(info.size > MAX_JSON_BYTES);
        if (withinCap) {
            const text = await fs.readFile(file, 'utf-8');
            return text;
        }
    }
    catch {
        // Best-effort read: any failure is reported as "no text" below.
    }
    return null;
}
103
/**
 * Replace a truncated inline tool output with the full text stored on disk.
 *
 * Acts only when `part.state.metadata.truncated` is exactly `true` and
 * `metadata.outputPath` is a string; the file is then read through
 * `readTextCapped`. When that read yields text, `part.state.output` is
 * overwritten in place. In every other case the part is left untouched.
 *
 * @param part Raw tool part; mutated in place on success.
 */
async function recoverTruncatedOutput(part) {
    const details = part.state?.metadata;
    const recoverable = Boolean(details)
        && details.truncated === true
        && typeof details.outputPath === 'string';
    if (!recoverable)
        return;
    const recovered = await readTextCapped(details.outputPath);
    if (recovered === null)
        return;
    // Re-check the state after the await before writing into it.
    if (part.state)
        part.state.output = recovered;
}
78
118
  /** List directory entry names of a given kind; [] on any error. */
79
119
  async function listDir(dir, kind) {
80
120
  try {
@@ -539,11 +579,20 @@ class SqliteStore {
539
579
  statusStr === 'error'
540
580
  ? statusStr
541
581
  : undefined;
582
+ const metaRec = asRecord(stateRec.metadata);
542
583
  part.state = {
543
584
  status,
544
585
  input: asRecord(stateRec.input) ?? undefined,
545
586
  output: typeof stateRec.output === 'string' ? stateRec.output : undefined,
546
587
  error: typeof stateRec.error === 'string' ? stateRec.error : undefined,
588
+ metadata: metaRec
589
+ ? {
590
+ exit: typeof metaRec.exit === 'number' ? metaRec.exit : undefined,
591
+ output: typeof metaRec.output === 'string' ? metaRec.output : undefined,
592
+ truncated: typeof metaRec.truncated === 'boolean' ? metaRec.truncated : undefined,
593
+ outputPath: typeof metaRec.outputPath === 'string' ? metaRec.outputPath : undefined,
594
+ }
595
+ : undefined,
547
596
  };
548
597
  }
549
598
  return part;
@@ -600,6 +649,13 @@ function mapPart(part) {
600
649
  const tool = typeof part.tool === 'string' ? part.tool : 'tool';
601
650
  const callId = typeof part.callID === 'string' ? part.callID : undefined;
602
651
  const state = part.state ?? {};
652
+ // opencode DOES expose a real shell exit code via `state.metadata.exit`
653
+ // (e.g. 0 for a green `pytest` run). Earlier this was hardcoded null, which
654
+ // forced verification to rely solely on parsing the output summary and made
655
+ // observed-grading fail closed when the summary was absent/truncated.
656
+ const exitCode = typeof state.metadata?.exit === 'number' && Number.isFinite(state.metadata.exit)
657
+ ? state.metadata.exit
658
+ : null;
603
659
  const call = { kind: 'tool_call', tool, callId, input: state.input };
604
660
  switch (state.status) {
605
661
  case 'pending':
@@ -613,7 +669,7 @@ function mapPart(part) {
613
669
  tool,
614
670
  callId,
615
671
  output: typeof state.output === 'string' ? state.output : undefined,
616
- exitCode: null,
672
+ exitCode,
617
673
  },
618
674
  ];
619
675
  case 'error':
@@ -625,7 +681,7 @@ function mapPart(part) {
625
681
  callId,
626
682
  isError: true,
627
683
  output: typeof state.error === 'string' ? state.error : undefined,
628
- exitCode: null,
684
+ exitCode,
629
685
  },
630
686
  ];
631
687
  default:
@@ -663,6 +719,7 @@ async function readSessionTurns(store, sessionId, startIndex) {
663
719
  rawParts.sort(byId);
664
720
  const parts = [];
665
721
  for (const p of rawParts) {
722
+ await recoverTruncatedOutput(p);
666
723
  for (const mapped of mapPart(p))
667
724
  parts.push(mapped);
668
725
  }
@@ -795,7 +852,9 @@ class OpencodeSource {
795
852
  sessionId: main.id,
796
853
  turns,
797
854
  subagentSessionIds,
798
- sourcePaths,
855
+ // De-dupe: one label is pushed per message/turn, so the same db file (or
856
+ // dir) repeats once per turn. Distinct sources only (audit-trail clarity).
857
+ sourcePaths: [...new Set(sourcePaths)],
799
858
  };
800
859
  }
801
860
  }
@@ -36,4 +36,24 @@ export interface TrajectoryFacts {
36
36
  * test-report and preserve byte-identical baseline behaviour.
37
37
  */
38
38
  export declare function toTrajectoryFacts(trajectory: NormalizedTrajectory | null, changeName: string): TrajectoryFacts | null;
39
+ /** One observed runner result, as the grader sees it — for `debug-trajectory`. */
40
+ export interface RunnerResultDetail {
41
+ tool: string | null;
42
+ callId: string | null;
43
+ command: string | null;
44
+ outputLength: number;
45
+ outputLastLine: string;
46
+ parsedPassRate: number | null;
47
+ exitCode: number | null;
48
+ isError: boolean;
49
+ /** True for the single result that decided the verdict (see gradedRunnerIndex). */
50
+ graded: boolean;
51
+ }
52
+ /**
53
+ * Explain WHICH runner results the grader saw and which one it graded. This is
54
+ * the introspection the observed-grading path otherwise hides — surfaced by
55
+ * `learn debug-trajectory` so a misgrade (e.g. a file-write quoting a test
56
+ * command shadowing a real run) is visible in one command. Pure; [] when null.
57
+ */
58
+ export declare function describeRunnerResults(trajectory: NormalizedTrajectory | null): RunnerResultDetail[];
39
59
  //# sourceMappingURL=facts.d.ts.map
@@ -13,70 +13,156 @@
13
13
  * Pure + no throw.
14
14
  */
15
15
  import { parseTestMetrics } from '../fitness/test-metrics.js';
16
/**
 * Pattern for the NAME of a shell/command-executing tool. It matches one of the
 * tokens bash, sh, zsh, fish, pwsh, powershell, shell, cmd, exec, command,
 * terminal or run when the token is delimited by the start/end of the name or
 * by one of `.`, `_`, `-` (case-insensitive). Examples that match: `Bash`,
 * `local_shell`, `shell_command`, `run_terminal_cmd`. File-mutating tool names
 * such as `apply_patch`, `multiedit` or `str_replace` contain none of the
 * tokens as a delimited segment and therefore do not match.
 */
const EXEC_TOOL_RE = /(?:^|[._-])(?:bash|sh|zsh|fish|pwsh|powershell|shell|cmd|exec|command|terminal|run)(?:[._-]|$)/i;
/**
 * Decide whether a tool name denotes a command-executing tool.
 *
 * @param tool Tool name, or `undefined` when the record carries none.
 * @returns `true` only when a name is present and matches `EXEC_TOOL_RE`.
 */
function isExecTool(tool) {
    if (tool === undefined)
        return false;
    return EXEC_TOOL_RE.test(tool);
}
35
/**
 * Input fields that carry the executed command line, in preference order. When
 * at least one is present, runner detection looks ONLY at these fields, so a
 * sibling prose field (e.g. a `description`) that merely mentions a test
 * command cannot trigger a match.
 */
const COMMAND_FIELDS = ['command', 'cmd', 'script', 'argv', 'args'];
/**
 * Recognizes a test-runner invocation inside command text (vitest, jest,
 * pytest, `go test`, `cargo test`, `npm test`, ...). Case-insensitive.
 */
const RUNNER_RE = /\b(?:vitest|jest|mocha|playwright\s+test|cypress\s+run|pytest|py\.test|python\s+-m\s+(?:pytest|unittest)|unittest|go\s+test|cargo\s+test|cargo\s+nextest|rspec|gradle(?:w)?\s+test|mvn\s+test|dotnet\s+test|ctest|npm\s+(?:run\s+)?test|yarn\s+(?:run\s+)?test|pnpm\s+(?:run\s+)?test|bun\s+test)\b/i;
/**
 * Flatten one input value to command text: a string is returned as-is, an
 * array contributes its string elements joined by single spaces (argv form),
 * and anything else yields `undefined`.
 */
function commandFragment(value) {
    if (typeof value === 'string')
        return value;
    if (Array.isArray(value))
        return value.filter((v) => typeof v === 'string').join(' ');
    return undefined;
}
/** True when a single input value (string or argv array) names a test runner. */
function matchesRunner(value) {
    const text = commandFragment(value);
    return text !== undefined && RUNNER_RE.test(text);
}
/**
 * Decide whether a tool call's decoded input invokes a test runner.
 *
 * Resolution order:
 *   1. every own property listed in `COMMAND_FIELDS`;
 *   2. otherwise a string `input.input` (raw command line wrapper);
 *   3. otherwise every value of the input object.
 *
 * For cases 1 and 2 each candidate is tested on its own AND the candidates are
 * tested once more joined in field order. The joined pass is what recognizes an
 * invocation split across fields, such as `{ command: 'npm', args: ['test'] }`
 * or `{ cmd: 'go', argv: ['test', './...'] }`, where neither field matches
 * alone.
 *
 * @param input Decoded tool-call arguments; may be null/undefined.
 * @returns `true` when the examined text matches `RUNNER_RE`.
 */
function inputLooksLikeRunner(input) {
    if (!input)
        return false;
    const preferred = [];
    for (const f of COMMAND_FIELDS) {
        if (Object.prototype.hasOwnProperty.call(input, f))
            preferred.push(input[f]);
    }
    if (preferred.length === 0 && typeof input.input === 'string')
        preferred.push(input.input);
    // Unknown input shape: fall back to scanning every value individually.
    if (preferred.length === 0)
        return Object.values(input).some(matchesRunner);
    if (preferred.some(matchesRunner))
        return true;
    const joined = preferred
        .map(commandFragment)
        .filter((fragment) => fragment !== undefined)
        .join(' ');
    return RUNNER_RE.test(joined);
}
/**
 * Best-effort command text from a tool-call input, for debug display.
 *
 * Joins the text of every `COMMAND_FIELDS` entry that holds a string or an
 * array (in field order) so a split invocation is shown in full, e.g.
 * `{ command: 'npm', args: ['test'] }` yields `npm test`. With a single
 * command field the result is that field's text unchanged. Falls back to a
 * string `input.input`, and to `undefined` when nothing usable is present.
 *
 * @param input Decoded tool-call arguments; may be null/undefined.
 * @returns The command text, or `undefined`.
 */
function commandText(input) {
    if (!input)
        return undefined;
    const fragments = [];
    for (const f of COMMAND_FIELDS) {
        const fragment = commandFragment(input[f]);
        if (fragment !== undefined)
            fragments.push(fragment);
    }
    if (fragments.length > 0)
        return fragments.join(' ');
    return typeof input.input === 'string' ? input.input : undefined;
}
39
88
/**
 * Scan every part of every turn once and gather the results of test-runner
 * calls, together with the total number of tool calls seen.
 *
 * A `tool_call` is a runner when its tool name passes `isExecTool` and its
 * input passes `inputLooksLikeRunner`. A `tool_result` is attributed to the
 * call with the same `callId`; a result WITHOUT a `callId` is attributed to the
 * immediately preceding call, if no other result has claimed it yet. A result
 * whose `callId` is unknown is attributed to nothing. Both `toTrajectoryFacts`
 * and `describeRunnerResults` obtain their runner list from this function.
 *
 * @param trajectory Trajectory whose `turns[].parts[]` are scanned.
 * @returns `{ runnerResults, toolCallCount }`.
 */
function collectRunnerResults(trajectory) {
    const callsById = new Map();
    const runnerResults = [];
    let pendingCall = null;
    let toolCallCount = 0;
    for (const turn of trajectory.turns) {
        for (const part of turn.parts) {
            switch (part.kind) {
                case 'tool_call': {
                    toolCallCount += 1;
                    const isRunner = isExecTool(part.tool) && inputLooksLikeRunner(part.input);
                    const info = { isRunner, tool: part.tool, command: commandText(part.input) };
                    pendingCall = info;
                    if (part.callId) {
                        callsById.set(part.callId, info);
                    }
                    break;
                }
                case 'tool_result': {
                    let owner;
                    if (part.callId) {
                        // Identified result: only an exact callId match counts.
                        owner = callsById.get(part.callId);
                    }
                    else {
                        // Anonymous result: pair with the preceding unclaimed call.
                        owner = pendingCall ?? undefined;
                    }
                    if (owner?.isRunner) {
                        const { output, isError, callId } = part;
                        runnerResults.push({
                            output,
                            isError,
                            exitCode: part.exitCode ?? null,
                            tool: owner.tool ?? part.tool,
                            callId,
                            command: owner.command,
                        });
                    }
                    // Any result uses up the positional pairing slot.
                    pendingCall = null;
                    break;
                }
                default:
                    break;
            }
        }
    }
    return { runnerResults, toolCallCount };
}
135
/**
 * Tell whether a runner result carries something gradable: a numeric exit
 * code, an explicit error flag, or output from which `parseTestMetrics`
 * extracts metrics.
 *
 * @param r One collected runner result.
 * @returns `true` when any of the three signals is present.
 */
function hasSignal(r) {
    if (typeof r.exitCode === 'number')
        return true;
    if (r.isError === true)
        return true;
    if (r.output == null)
        return false;
    return parseTestMetrics(r.output) !== null;
}
141
/**
 * Pick the index of the runner result that decides the verdict.
 *
 * Scans from the end and returns the first result for which `hasSignal` is
 * true, so a trailing signal-less result does not displace an earlier one
 * that has a signal. When no result has a signal the final index is returned,
 * which is -1 for an empty list.
 *
 * @param runnerResults Runner results in observation order.
 * @returns Index into `runnerResults`, or -1 when the list is empty.
 */
function gradedRunnerIndex(runnerResults) {
    let idx = runnerResults.length;
    while (idx-- > 0) {
        if (hasSignal(runnerResults[idx]))
            return idx;
    }
    return runnerResults.length - 1;
}
154
+ /**
155
+ * Compute the {@link TrajectoryFacts} for a change. Returns `null` when there is
156
+ * no trajectory at all, so callers can cleanly fall back to the authored
157
+ * test-report and preserve byte-identical baseline behaviour.
158
+ */
159
+ export function toTrajectoryFacts(trajectory, changeName) {
160
+ if (!trajectory)
161
+ return null;
162
+ const { runnerResults, toolCallCount } = collectRunnerResults(trajectory);
78
163
  const testRunObserved = runnerResults.length > 0;
79
- const last = testRunObserved ? runnerResults[runnerResults.length - 1] : null;
164
+ const gradedIdx = gradedRunnerIndex(runnerResults);
165
+ const last = gradedIdx >= 0 ? runnerResults[gradedIdx] : null;
80
166
  const runnerExitCode = last && typeof last.exitCode === 'number' ? last.exitCode : null;
81
167
  const observedPassRate = last && last.output ? (parseTestMetrics(last.output)?.passRate ?? null) : null;
82
168
  let observedStatus = null;
@@ -100,4 +186,31 @@ export function toTrajectoryFacts(trajectory, changeName) {
100
186
  sourcePaths: trajectory.sourcePaths,
101
187
  };
102
188
  }
189
/**
 * Describe every runner result found in a trajectory and mark the one that
 * `gradedRunnerIndex` selects.
 *
 * Each entry reports the tool, call id and command (or `null` when absent),
 * the output length, the last non-empty output line cut to 200 characters,
 * the pass rate parsed from the output (or `null`), the exit code, the error
 * flag, and whether this entry is the graded one.
 *
 * @param trajectory Trajectory to inspect, or `null`.
 * @returns One detail record per runner result; `[]` when `trajectory` is
 *   null/undefined.
 */
export function describeRunnerResults(trajectory) {
    if (!trajectory)
        return [];
    const { runnerResults } = collectRunnerResults(trajectory);
    const gradedIdx = gradedRunnerIndex(runnerResults);
    const details = [];
    for (let i = 0; i < runnerResults.length; i++) {
        const r = runnerResults[i];
        const output = typeof r.output === 'string' ? r.output : '';
        const nonEmptyLines = output.trim().split(/\r?\n/).filter(Boolean);
        const lastLine = nonEmptyLines.pop() ?? '';
        let parsedPassRate = null;
        if (output) {
            parsedPassRate = parseTestMetrics(output)?.passRate ?? null;
        }
        details.push({
            tool: r.tool ?? null,
            callId: r.callId ?? null,
            command: r.command ?? null,
            outputLength: output.length,
            outputLastLine: lastLine.slice(0, 200),
            parsedPassRate,
            exitCode: r.exitCode ?? null,
            isError: r.isError === true,
            graded: i === gradedIdx,
        });
    }
    return details;
}
103
216
  //# sourceMappingURL=facts.js.map
package/package.json CHANGED
@@ -1,95 +1,95 @@
1
- {
2
- "name": "synergyspec-selfevolving",
3
- "version": "1.1.9",
4
- "description": "AI-native system for spec-driven development",
5
- "keywords": [
6
- "synergyspec-selfevolving",
7
- "openspec",
8
- "spec-driven",
9
- "specs",
10
- "cli",
11
- "ai",
12
- "development"
13
- ],
14
- "homepage": "https://github.com/ZhifeiDou/SynergySpec-SelfEvolving",
15
- "repository": {
16
- "type": "git",
17
- "url": "git+https://github.com/ZhifeiDou/SynergySpec-SelfEvolving.git"
18
- },
19
- "license": "MIT",
20
- "author": "Zhifei Dou",
21
- "type": "module",
22
- "publishConfig": {
23
- "access": "public"
24
- },
25
- "exports": {
26
- ".": {
27
- "types": "./dist/index.d.ts",
28
- "default": "./dist/index.js"
29
- }
30
- },
31
- "bin": {
32
- "synergyspec-selfevolving": "bin/synergyspec-selfevolving.js"
33
- },
34
- "files": [
35
- "dist",
36
- "bin",
37
- "schemas",
38
- "scripts/postinstall.js",
39
- "scripts/nl2repo_synergyspec-selfevolving_wrapper.py",
40
- "!dist/**/*.test.js",
41
- "!dist/**/__tests__",
42
- "!dist/**/*.map"
43
- ],
44
- "scripts": {
45
- "lint": "eslint src/",
46
- "build": "node build.js",
47
- "dev": "tsc --watch",
48
- "dev:cli": "pnpm build && node bin/synergyspec-selfevolving.js",
49
- "test": "vitest run",
50
- "test:watch": "vitest",
51
- "test:ui": "vitest --ui",
52
- "test:coverage": "vitest run --coverage",
53
- "test:postinstall": "node scripts/postinstall.js",
54
- "test:e2e:real-agent": "node smoking-test/run-real-agent-self-evolution-e2e.mjs --local-pack",
55
- "prepare": "pnpm run build",
56
- "prepublishOnly": "pnpm run build && pnpm run check:pack-version && pnpm run check:pack-contents",
57
- "postinstall": "node scripts/postinstall.js",
58
- "check:docs": "node scripts/docs-check.mjs",
59
- "check:pack-version": "node scripts/pack-version-check.mjs",
60
- "check:pack-contents": "node scripts/pack-contents-check.mjs",
61
- "release": "pnpm run release:ci",
62
- "release:ci": "pnpm run check:pack-version && pnpm run check:pack-contents && pnpm exec changeset publish",
63
- "changeset": "changeset"
64
- },
65
- "engines": {
66
- "node": ">=20.19.0"
67
- },
68
- "devDependencies": {
69
- "@changesets/changelog-github": "^0.5.2",
70
- "@changesets/cli": "^2.27.7",
71
- "@types/node": "^24.2.0",
72
- "@vitest/coverage-v8": "^3.2.4",
73
- "@vitest/ui": "^3.2.4",
74
- "eslint": "^9.39.2",
75
- "fast-check": "^4.8.0",
76
- "typescript": "^5.9.3",
77
- "typescript-eslint": "^8.50.1",
78
- "vitest": "^3.2.4"
79
- },
80
- "dependencies": {
81
- "@inquirer/core": "^10.2.2",
82
- "@inquirer/prompts": "^7.8.0",
83
- "ansi-regex": "^5.0.1",
84
- "chalk": "^5.5.0",
85
- "commander": "^14.0.0",
86
- "fast-glob": "^3.3.3",
87
- "ora": "^8.2.0",
88
- "posthog-node": "^5.20.0",
89
- "react": "^18.3.1",
90
- "react-dom": "^18.3.1",
91
- "tsx": "^4.20.6",
92
- "yaml": "^2.8.2",
93
- "zod": "^4.0.17"
94
- }
95
- }
1
+ {
2
+ "name": "synergyspec-selfevolving",
3
+ "version": "1.1.10",
4
+ "description": "AI-native system for spec-driven development",
5
+ "keywords": [
6
+ "synergyspec-selfevolving",
7
+ "openspec",
8
+ "spec-driven",
9
+ "specs",
10
+ "cli",
11
+ "ai",
12
+ "development"
13
+ ],
14
+ "homepage": "https://github.com/ZhifeiDou/SynergySpec-SelfEvolving",
15
+ "repository": {
16
+ "type": "git",
17
+ "url": "git+https://github.com/ZhifeiDou/SynergySpec-SelfEvolving.git"
18
+ },
19
+ "license": "MIT",
20
+ "author": "Zhifei Dou",
21
+ "type": "module",
22
+ "publishConfig": {
23
+ "access": "public"
24
+ },
25
+ "exports": {
26
+ ".": {
27
+ "types": "./dist/index.d.ts",
28
+ "default": "./dist/index.js"
29
+ }
30
+ },
31
+ "bin": {
32
+ "synergyspec-selfevolving": "bin/synergyspec-selfevolving.js"
33
+ },
34
+ "files": [
35
+ "dist",
36
+ "bin",
37
+ "schemas",
38
+ "scripts/postinstall.js",
39
+ "scripts/nl2repo_synergyspec-selfevolving_wrapper.py",
40
+ "!dist/**/*.test.js",
41
+ "!dist/**/__tests__",
42
+ "!dist/**/*.map"
43
+ ],
44
+ "scripts": {
45
+ "lint": "eslint src/",
46
+ "build": "node build.js",
47
+ "dev": "tsc --watch",
48
+ "dev:cli": "pnpm build && node bin/synergyspec-selfevolving.js",
49
+ "test": "vitest run",
50
+ "test:watch": "vitest",
51
+ "test:ui": "vitest --ui",
52
+ "test:coverage": "vitest run --coverage",
53
+ "test:postinstall": "node scripts/postinstall.js",
54
+ "test:e2e:real-agent": "node smoking-test/run-real-agent-self-evolution-e2e.mjs --local-pack",
55
+ "prepare": "pnpm run build",
56
+ "prepublishOnly": "pnpm run build && pnpm run check:pack-version && pnpm run check:pack-contents",
57
+ "postinstall": "node scripts/postinstall.js",
58
+ "check:docs": "node scripts/docs-check.mjs",
59
+ "check:pack-version": "node scripts/pack-version-check.mjs",
60
+ "check:pack-contents": "node scripts/pack-contents-check.mjs",
61
+ "release": "pnpm run release:ci",
62
+ "release:ci": "pnpm run check:pack-version && pnpm run check:pack-contents && pnpm exec changeset publish",
63
+ "changeset": "changeset"
64
+ },
65
+ "engines": {
66
+ "node": ">=20.19.0"
67
+ },
68
+ "devDependencies": {
69
+ "@changesets/changelog-github": "^0.5.2",
70
+ "@changesets/cli": "^2.27.7",
71
+ "@types/node": "^24.2.0",
72
+ "@vitest/coverage-v8": "^3.2.4",
73
+ "@vitest/ui": "^3.2.4",
74
+ "eslint": "^9.39.2",
75
+ "fast-check": "^4.8.0",
76
+ "typescript": "^5.9.3",
77
+ "typescript-eslint": "^8.50.1",
78
+ "vitest": "^3.2.4"
79
+ },
80
+ "dependencies": {
81
+ "@inquirer/core": "^10.2.2",
82
+ "@inquirer/prompts": "^7.8.0",
83
+ "ansi-regex": "^5.0.1",
84
+ "chalk": "^5.5.0",
85
+ "commander": "^14.0.0",
86
+ "fast-glob": "^3.3.3",
87
+ "ora": "^8.2.0",
88
+ "posthog-node": "^5.20.0",
89
+ "react": "^18.3.1",
90
+ "react-dom": "^18.3.1",
91
+ "tsx": "^4.20.6",
92
+ "yaml": "^2.8.2",
93
+ "zod": "^4.0.17"
94
+ }
95
+ }