synergyspec-selfevolving 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/README.md +31 -18
  2. package/dist/commands/learn.d.ts +12 -1
  3. package/dist/commands/learn.js +158 -11
  4. package/dist/commands/self-evolution-episode.d.ts +177 -0
  5. package/dist/commands/self-evolution-episode.js +431 -0
  6. package/dist/commands/self-evolution.d.ts +12 -190
  7. package/dist/commands/self-evolution.js +114 -866
  8. package/dist/core/archive.d.ts +0 -1
  9. package/dist/core/archive.js +0 -58
  10. package/dist/core/artifact-graph/instruction-loader.d.ts +2 -4
  11. package/dist/core/artifact-graph/instruction-loader.js +3 -31
  12. package/dist/core/fitness/loss.d.ts +5 -5
  13. package/dist/core/fitness/loss.js +4 -4
  14. package/dist/core/fitness/test-failures.js +10 -2
  15. package/dist/core/project-config.d.ts +19 -0
  16. package/dist/core/project-config.js +96 -0
  17. package/dist/core/self-evolution/candidate-fitness.d.ts +23 -1
  18. package/dist/core/self-evolution/candidate-fitness.js +31 -5
  19. package/dist/core/self-evolution/candidates.d.ts +0 -9
  20. package/dist/core/self-evolution/critic-agent.d.ts +192 -0
  21. package/dist/core/self-evolution/critic-agent.js +568 -0
  22. package/dist/core/self-evolution/edits-contract.d.ts +53 -0
  23. package/dist/core/self-evolution/edits-contract.js +89 -0
  24. package/dist/core/self-evolution/episode-orchestrator.d.ts +234 -0
  25. package/dist/core/self-evolution/episode-orchestrator.js +681 -0
  26. package/dist/core/self-evolution/episode-store.d.ts +266 -0
  27. package/dist/core/self-evolution/episode-store.js +573 -0
  28. package/dist/core/self-evolution/evolution-switches.d.ts +1 -1
  29. package/dist/core/self-evolution/evolution-switches.js +5 -10
  30. package/dist/core/self-evolution/evolving-agent.d.ts +208 -0
  31. package/dist/core/self-evolution/evolving-agent.js +535 -0
  32. package/dist/core/self-evolution/host-harness.d.ts +14 -15
  33. package/dist/core/self-evolution/host-harness.js +48 -23
  34. package/dist/core/self-evolution/index.d.ts +11 -6
  35. package/dist/core/self-evolution/index.js +20 -6
  36. package/dist/core/self-evolution/line-diff.d.ts +60 -0
  37. package/dist/core/self-evolution/line-diff.js +130 -0
  38. package/dist/core/self-evolution/policy/fs-safe.d.ts +19 -0
  39. package/dist/core/self-evolution/policy/fs-safe.js +89 -0
  40. package/dist/core/self-evolution/policy/index.d.ts +13 -0
  41. package/dist/core/self-evolution/policy/index.js +13 -0
  42. package/dist/core/self-evolution/policy/policy-store.d.ts +217 -0
  43. package/dist/core/self-evolution/policy/policy-store.js +774 -0
  44. package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
  45. package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
  46. package/dist/core/self-evolution/policy/reject-buffer.d.ts +55 -0
  47. package/dist/core/self-evolution/policy/reject-buffer.js +170 -0
  48. package/dist/core/self-evolution/promote.d.ts +1 -1
  49. package/dist/core/self-evolution/promote.js +6 -33
  50. package/dist/core/self-evolution/promotion.js +1 -2
  51. package/dist/core/self-evolution/reward-agent.d.ts +379 -0
  52. package/dist/core/self-evolution/reward-agent.js +940 -0
  53. package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
  54. package/dist/core/self-evolution/reward-aggregator.js +262 -0
  55. package/dist/core/self-evolution/scope-gate.d.ts +66 -0
  56. package/dist/core/self-evolution/scope-gate.js +107 -0
  57. package/dist/core/self-evolution/success-channel.js +2 -2
  58. package/dist/core/self-evolution/tamper-check.d.ts +24 -0
  59. package/dist/core/self-evolution/tamper-check.js +236 -0
  60. package/dist/core/self-evolution/tool-evolution.js +2 -13
  61. package/dist/core/self-evolution/verdict.d.ts +8 -5
  62. package/dist/core/self-evolution/verdict.js +4 -7
  63. package/dist/core/templates/workflows/gen-tests.js +1 -1
  64. package/dist/core/templates/workflows/learn.d.ts +3 -2
  65. package/dist/core/templates/workflows/learn.js +21 -18
  66. package/dist/core/templates/workflows/self-evolving.d.ts +6 -4
  67. package/dist/core/templates/workflows/self-evolving.js +62 -172
  68. package/dist/core/trajectory/scrub.d.ts +27 -0
  69. package/dist/core/trajectory/scrub.js +79 -0
  70. package/dist/core/trajectory/skeleton.d.ts +27 -1
  71. package/dist/core/trajectory/skeleton.js +152 -8
  72. package/dist/dashboard/data.d.ts +25 -51
  73. package/dist/dashboard/data.js +68 -180
  74. package/dist/dashboard/react-client.js +458 -503
  75. package/dist/dashboard/react-styles.js +3 -3
  76. package/dist/dashboard/server.js +23 -17
  77. package/dist/ui/ascii-patterns.d.ts +7 -15
  78. package/dist/ui/ascii-patterns.js +123 -54
  79. package/dist/ui/welcome-screen.d.ts +0 -14
  80. package/dist/ui/welcome-screen.js +16 -35
  81. package/package.json +1 -1
  82. package/dist/core/self-evolution/ga-selection.d.ts +0 -94
  83. package/dist/core/self-evolution/ga-selection.js +0 -153
  84. package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
  85. package/dist/core/self-evolution/proposer-agent.js +0 -326
  86. package/dist/core/self-evolution/replay-runner.d.ts +0 -100
  87. package/dist/core/self-evolution/replay-runner.js +0 -170
  88. package/dist/core/self-evolution/replay.d.ts +0 -45
  89. package/dist/core/self-evolution/replay.js +0 -56
  90. package/dist/core/self-evolution/template-variants.d.ts +0 -62
  91. package/dist/core/self-evolution/template-variants.js +0 -171
  92. package/dist/core/self-evolution/trajectory.d.ts +0 -65
  93. package/dist/core/self-evolution/trajectory.js +0 -185
@@ -15,8 +15,17 @@
15
15
  */
16
16
  import { parseTestMetrics } from '../fitness/test-metrics.js';
17
17
  import { commandText, inputLooksLikeRunner, isExecTool } from './facts.js';
18
+ import { scrub } from './scrub.js';
18
19
  const MAX_SKELETON_EVENTS = 40;
19
20
  const MAX_COMMAND_CHARS = 120;
21
+ /** Failed command/test-run events keep more of their command line so flags survive (P7). */
22
+ const FAILED_COMMAND_CHARS = 240;
23
+ /** Per-event cap on a kept error tail. */
24
+ const MAX_ERROR_TAIL_CHARS = 400;
25
+ /** Global budget across all kept error tails — bounds a flailing run's failures. */
26
+ const MAX_TOTAL_ERROR_BODY_CHARS = 2000;
27
+ /** A reasoning/text block at least this long counts as a "stated plan" (⑥). */
28
+ const STATED_PLAN_MIN_CHARS = 120;
20
29
  /**
21
30
  * Matches the NAME of a file-mutating tool across harnesses — Claude
22
31
  * `Write`/`Edit`/`MultiEdit`/`NotebookEdit`; opencode `write`/`edit`/`patch`;
@@ -65,11 +74,16 @@ function editedFiles(input) {
65
74
  }
66
75
  return patchFilesFromPayload(input);
67
76
  }
68
- function capCommand(command) {
77
+ function capCommand(command, max = MAX_COMMAND_CHARS) {
69
78
  if (!command)
70
79
  return undefined;
71
80
  const c = command.trim().replace(/\s+/g, ' ');
72
- return c.length > MAX_COMMAND_CHARS ? `${c.slice(0, MAX_COMMAND_CHARS - 1)}…` : c;
81
+ return c.length > max ? `${c.slice(0, max - 1)}…` : c;
82
+ }
83
+ /** Last `max` chars of a (single-spaced) output — error summaries live at the end. */
84
+ function tailExcerpt(text, max) {
85
+ const t = text.trimEnd();
86
+ return t.length > max ? `…${t.slice(t.length - (max - 1))}` : t;
73
87
  }
74
88
  /**
75
89
  * Project the bounded action skeleton from a normalized trajectory. One walk,
@@ -83,6 +97,14 @@ export function toActionSkeleton(trajectory) {
83
97
  let lastPending = null;
84
98
  let totalToolCalls = 0;
85
99
  const events = [];
100
+ // Raw (uncapped) command + raw failed-output, keyed by event reference, so the
101
+ // failed-command higher cap (P7) and error tails (P4) can be finalized AFTER
102
+ // salience truncation against only the KEPT events.
103
+ const rawCommand = new Map();
104
+ const rawErrorOutput = new Map();
105
+ // ⑥ Non-narrative plan signal: did the agent ever state a substantial plan?
106
+ let statedPlanPresent = false;
107
+ let sawNarration = false;
86
108
  const append = (event) => {
87
109
  // Per-file rollup: consecutive edits to the same file collapse.
88
110
  const prev = events[events.length - 1];
@@ -102,7 +124,8 @@ export function toActionSkeleton(trajectory) {
102
124
  totalToolCalls++;
103
125
  const exec = isExecTool(part.tool);
104
126
  if (exec) {
105
- const command = capCommand(commandText(part.input));
127
+ const rawCmd = commandText(part.input);
128
+ const command = capCommand(rawCmd);
106
129
  // A shell-driven apply_patch (codex heredoc style) carries
107
130
  // `*** Update File: <path>` lines in its payload — that's a file
108
131
  // edit, not a command. Markers only: an exec input's `path`-like
@@ -126,9 +149,14 @@ export function toActionSkeleton(trajectory) {
126
149
  kind: inputLooksLikeRunner(part.input) ? 'test-run' : 'command',
127
150
  ordinal: 0,
128
151
  tool: part.tool,
129
- ...(command ? { command } : {}),
152
+ // Scrub the command BEFORE it lands in the skeleton JSON the judge
153
+ // reads — a `curl …?key=…` / `git clone https://user:pass@…` would
154
+ // otherwise leak a credential verbatim into the judge's prompt.
155
+ ...(command ? { command: scrub(command) } : {}),
130
156
  ...(turn.sessionId ? { sessionId: turn.sessionId } : {}),
131
157
  });
158
+ if (rawCmd)
159
+ rawCommand.set(event, rawCmd);
132
160
  const pending = { event };
133
161
  lastPending = pending;
134
162
  if (part.callId)
@@ -166,30 +194,146 @@ export function toActionSkeleton(trajectory) {
166
194
  e.failedCount = metrics.failed;
167
195
  }
168
196
  }
197
+ // Failure flag (isError / nonzero exit / failed tests) — drives both
198
+ // salience ranking and which events keep an error tail. Stash the raw
199
+ // output so the tail is finalized later, against only kept events.
200
+ const failed = part.isError === true ||
201
+ (typeof e.exitCode === 'number' && e.exitCode > 0) ||
202
+ (typeof e.failedCount === 'number' && e.failedCount > 0);
203
+ if (failed) {
204
+ e.isError = true;
205
+ if (typeof part.output === 'string' && part.output.trim().length > 0) {
206
+ rawErrorOutput.set(e, part.output);
207
+ }
208
+ }
169
209
  }
170
210
  lastPending = null;
171
211
  }
212
+ else if (part.kind === 'text' || part.kind === 'reasoning') {
213
+ // ⑥ A substantial assistant plan/reasoning block — presence only.
214
+ sawNarration = true;
215
+ if (part.text.trim().length >= STATED_PLAN_MIN_CHARS)
216
+ statedPlanPresent = true;
217
+ }
172
218
  }
173
219
  }
174
- // Stamp ordinals on the full (rolled-up) sequence, then middle-out truncate.
220
+ // Stamp ordinals on the full (rolled-up) sequence.
175
221
  events.forEach((e, i) => {
176
222
  e.ordinal = i;
177
223
  });
224
+ const preTruncationEventCount = events.length;
225
+ // ── P3: salience-ranked retention (replaces middle-out) ────────────────────
226
+ // Keep the most diagnostic events within the SAME hard cap, not the events in
227
+ // the most convenient POSITIONS. Position-based loss routinely discarded the
228
+ // failing-then-recovering MIDDLE; salience keeps failures, pass-rate
229
+ // transitions, and session endpoints, and degrades gracefully when a harness
230
+ // never parses a pass rate (failures are still ranked by isError/exit code).
178
231
  let bounded = events;
179
232
  let truncated = false;
180
233
  if (events.length > MAX_SKELETON_EVENTS) {
181
- const head = Math.ceil(MAX_SKELETON_EVENTS / 2);
182
- const tail = MAX_SKELETON_EVENTS - head;
183
- bounded = [...events.slice(0, head), ...events.slice(events.length - tail)];
234
+ const transitions = transitionOrdinals(events);
235
+ const forced = forcedOrdinals(events);
236
+ const priority = (e) => (forced.has(e.ordinal) ? 1000 : 0) + salience(e, transitions.has(e.ordinal));
237
+ const kept = [...events]
238
+ .sort((a, b) => priority(b) - priority(a) || a.ordinal - b.ordinal)
239
+ .slice(0, MAX_SKELETON_EVENTS)
240
+ .sort((a, b) => a.ordinal - b.ordinal);
241
+ // Honest per-gap elision marker: how many ORIGINAL events were dropped in the
242
+ // contiguous run immediately before each kept event (so a non-contiguous
243
+ // record is never read as a continuous causal narrative).
244
+ let prevOrdinal = -1;
245
+ for (const e of kept) {
246
+ const gap = e.ordinal - prevOrdinal - 1;
247
+ if (gap > 0)
248
+ e.elidedBefore = gap;
249
+ prevOrdinal = e.ordinal;
250
+ }
251
+ bounded = kept;
184
252
  truncated = true;
185
253
  }
254
+ // ── P7 + P4: failed KEPT events keep a longer command line (flags survive)
255
+ // and a bounded, scrubbed error tail (the actionable message), within a global
256
+ // budget so a flailing run's failures cannot flood the prompt.
257
+ let errorBudget = MAX_TOTAL_ERROR_BODY_CHARS;
258
+ let errorBodiesElided = 0;
259
+ for (const e of bounded) {
260
+ if (!e.isError || (e.kind !== 'command' && e.kind !== 'test-run'))
261
+ continue;
262
+ const raw = rawCommand.get(e);
263
+ if (raw) {
264
+ const capped = capCommand(raw, FAILED_COMMAND_CHARS);
265
+ if (capped)
266
+ e.command = scrub(capped); // scrub the longer failed-command form too
267
+ }
268
+ const output = rawErrorOutput.get(e);
269
+ if (!output)
270
+ continue;
271
+ if (errorBudget <= 0) {
272
+ errorBodiesElided++;
273
+ continue;
274
+ }
275
+ // Hard cap AFTER scrub (a redaction marker could be marginally longer than
276
+ // what it replaced) so a kept tail never exceeds the per-event bound.
277
+ const tail = scrub(tailExcerpt(output, Math.min(MAX_ERROR_TAIL_CHARS, errorBudget))).slice(0, MAX_ERROR_TAIL_CHARS);
278
+ if (tail.length > 0) {
279
+ e.errorTail = tail;
280
+ errorBudget -= tail.length;
281
+ }
282
+ }
186
283
  return {
187
284
  harness: trajectory.harness,
188
285
  events: bounded,
189
286
  totalToolCalls,
190
287
  truncated,
288
+ preTruncationEventCount,
289
+ ...(errorBodiesElided > 0 ? { errorBodiesElided } : {}),
290
+ ...(sawNarration ? { statedPlanPresent } : {}),
191
291
  };
192
292
  }
293
+ /**
294
+ * Ordinals of test-runs whose signal CHANGED from the previous test-run (or the
295
+ * first test-run, which establishes signal) — the moments credit/blame lives.
296
+ */
297
+ function transitionOrdinals(events) {
298
+ const set = new Set();
299
+ let prev = null;
300
+ for (const e of events) {
301
+ if (e.kind !== 'test-run')
302
+ continue;
303
+ const cur = { passRate: e.passRate ?? null, failedCount: e.failedCount ?? null };
304
+ if (prev === null || cur.passRate !== prev.passRate || cur.failedCount !== prev.failedCount) {
305
+ set.add(e.ordinal);
306
+ }
307
+ prev = cur;
308
+ }
309
+ return set;
310
+ }
311
+ /** Always-retained ordinals: the global endpoints + the last test-run per session. */
312
+ function forcedOrdinals(events) {
313
+ const set = new Set();
314
+ if (events.length > 0) {
315
+ set.add(events[0].ordinal);
316
+ set.add(events[events.length - 1].ordinal);
317
+ }
318
+ const lastTestRunBySession = new Map();
319
+ for (const e of events) {
320
+ if (e.kind === 'test-run')
321
+ lastTestRunBySession.set(e.sessionId ?? '', e.ordinal);
322
+ }
323
+ for (const ord of lastTestRunBySession.values())
324
+ set.add(ord);
325
+ return set;
326
+ }
327
+ /** Deterministic, code-derived salience: failures > transitions > runs > commands > edits. */
328
+ function salience(e, isTransition) {
329
+ if (e.isError)
330
+ return 100;
331
+ if (e.kind === 'test-run')
332
+ return isTransition ? 90 : 70;
333
+ if (e.kind === 'command')
334
+ return 30;
335
+ return 20;
336
+ }
193
337
  function basename(p) {
194
338
  const parts = p.split('/');
195
339
  return parts[parts.length - 1] || p;
@@ -1,3 +1,4 @@
1
+ import { type EpisodeRecord, type PolicyLedgerEntry, type RejectBufferEntry } from '../core/self-evolution/index.js';
1
2
  export interface ProjectInfo {
2
3
  name: string;
3
4
  version: string;
@@ -32,39 +33,6 @@ export interface CliHistoryEvent {
32
33
  durationMs?: number;
33
34
  metadata?: Record<string, unknown>;
34
35
  }
35
- export type EvolveRunStatus = 'completed' | 'errored' | 'pending' | 'empty';
36
- export interface EvolveRunSummary {
37
- schemaVersion?: number;
38
- runId: string;
39
- benchmarkId?: string;
40
- harnessVariant?: string;
41
- startedAt?: string;
42
- finishedAt?: string;
43
- taskCount?: number;
44
- verdictCounts?: Record<string, number>;
45
- passRate?: number;
46
- totalCostUsd?: number;
47
- totalWallTimeMs?: number;
48
- interrupted?: boolean;
49
- isolationMode?: string;
50
- budget?: Record<string, unknown>;
51
- status?: EvolveRunStatus;
52
- failureReasonSummary?: string;
53
- }
54
- export interface EvolveArchive {
55
- schemaVersion?: number;
56
- createdAt?: string;
57
- entries: Array<{
58
- id: string;
59
- parentId: string | null;
60
- generation: number;
61
- createdAt?: string;
62
- snapshotPath?: string;
63
- runs?: unknown[];
64
- childCount?: number;
65
- }>;
66
- generations?: unknown[];
67
- }
68
36
  export interface ProjectOverview {
69
37
  project: ProjectInfo;
70
38
  changes: {
@@ -73,15 +41,25 @@ export interface ProjectOverview {
73
41
  inProgress: number;
74
42
  };
75
43
  evolve: {
76
- runs: number;
77
- lastRunAt: string | null;
78
- lastVerdict: string | null;
44
+ episodes: number;
45
+ lastEpisodeAt: string | null;
46
+ lastStage: string | null;
47
+ headVersion: number | null;
79
48
  };
80
49
  cli: {
81
50
  totalEvents: number;
82
51
  recentFailures: number;
83
52
  };
84
53
  }
54
+ export interface PolicyLineage {
55
+ targetId: string;
56
+ headVersion: number | null;
57
+ entries: PolicyLedgerEntry[];
58
+ evolveCount: number;
59
+ rollbackCount: number;
60
+ refusedCount: number;
61
+ lastAt: string | null;
62
+ }
85
63
  export interface AgentInterfacePlan {
86
64
  schemaVersion: 1;
87
65
  generatedAt: string;
@@ -193,21 +171,17 @@ export declare function readProjectInfo(root: string): Promise<ProjectInfo>;
193
171
  export declare function readChange(root: string, id: string): Promise<ChangeSummary | null>;
194
172
  export declare function listChanges(root: string): Promise<ChangeSummary[]>;
195
173
  export declare function readCliHistory(root: string, limit?: number): Promise<CliHistoryEvent[]>;
196
- export interface EvolveRunDetail extends EvolveRunSummary {
197
- tasks: Array<{
198
- taskId: string;
199
- verdict?: string;
200
- wallTimeMs?: number;
201
- totalCostUsd?: number;
202
- reason?: string;
203
- }>;
204
- wrapperStderrTail?: string;
205
- wrapperStdoutTail?: string;
206
- fileListing?: string[];
207
- }
208
- export declare function listEvolveRuns(root: string): Promise<EvolveRunSummary[]>;
209
- export declare function readEvolveRun(root: string, runId: string): Promise<EvolveRunDetail | null>;
210
- export declare function readEvolveArchive(root: string): Promise<EvolveArchive | null>;
174
+ /**
175
+ * Read the loop-v2 self-evolution surface: per-episode two-arm forward records,
176
+ * the policy version ledger grouped into per-target lineages, and the
177
+ * reject-buffer of rolled-back episodes. Each reader is independently guarded so
178
+ * a single missing/unreadable store yields an empty slice rather than throwing.
179
+ */
180
+ export declare function readSelfEvolution(root: string): Promise<{
181
+ episodes: EpisodeRecord[];
182
+ policyLineages: PolicyLineage[];
183
+ rejectBuffer: RejectBufferEntry[];
184
+ }>;
211
185
  export declare function readAgentInterfacePlan(root: string): Promise<AgentInterfacePlan>;
212
186
  export declare function readOverview(root: string): Promise<ProjectOverview>;
213
187
  /**
@@ -2,6 +2,7 @@ import { promises as fs } from 'fs';
2
2
  import { join, resolve, dirname } from 'path';
3
3
  import { readAllJsonLines } from './tail.js';
4
4
  import { readAgentCognitiveEvents, summarizeAgentCognitiveTrace, } from '../history/cognitive.js';
5
+ import { listEpisodes, readPolicyLedgerAll, readRejectBufferAll, } from '../core/self-evolution/index.js';
5
6
  async function tryReadJson(path) {
6
7
  try {
7
8
  const raw = await fs.readFile(path, 'utf8');
@@ -137,156 +138,42 @@ export async function readCliHistory(root, limit = 200) {
137
138
  const events = await readAllJsonLines(path);
138
139
  return events.slice(-limit).reverse();
139
140
  }
140
- const LOG_TAIL_BYTES = 16 * 1024;
141
- async function tryReadFileTail(path, maxBytes = LOG_TAIL_BYTES) {
142
- try {
143
- const stat = await fs.stat(path);
144
- if (stat.size === 0)
145
- return null;
146
- if (stat.size <= maxBytes)
147
- return await fs.readFile(path, 'utf8');
148
- const handle = await fs.open(path, 'r');
149
- try {
150
- const buf = Buffer.alloc(maxBytes);
151
- await handle.read(buf, 0, maxBytes, stat.size - maxBytes);
152
- return '…\n' + buf.toString('utf8');
153
- }
154
- finally {
155
- await handle.close();
156
- }
157
- }
158
- catch {
159
- return null;
160
- }
161
- }
162
- async function classifyRunStatus(dir, summary) {
163
- if (summary && summary.verdictCounts)
164
- return 'completed';
165
- let hasAnyFiles = false;
166
- let hasWrapperStderr = false;
167
- try {
168
- const items = await fs.readdir(dir, { withFileTypes: true });
169
- hasAnyFiles = items.length > 0;
170
- for (const item of items) {
171
- if (item.isFile() && item.name === 'wrapper.stderr.log') {
172
- const stat = await fs.stat(join(dir, item.name));
173
- if (stat.size > 0)
174
- hasWrapperStderr = true;
175
- }
176
- }
177
- }
178
- catch {
179
- return 'empty';
180
- }
181
- if (hasWrapperStderr)
182
- return 'errored';
183
- if (hasAnyFiles)
184
- return 'pending';
185
- return 'empty';
186
- }
187
- function summarizeFailureReason(reason) {
188
- if (!reason)
189
- return undefined;
190
- const lines = reason
191
- .split(/\r?\n/)
192
- .map((line) => line.trim())
193
- .filter(Boolean);
194
- if (!lines.length)
195
- return undefined;
196
- const mismatchCount = lines.filter((line) => /sha256 mismatch/i.test(line)).length;
197
- if (mismatchCount > 1)
198
- return `${mismatchCount} genome file hash mismatches`;
199
- return lines[0].replace(/\s+/g, ' ').slice(0, 180);
200
- }
201
- async function readRunFailureReasonSummary(runDir) {
202
- const tasksDir = join(runDir, 'tasks');
203
- let taskNames = [];
204
- try {
205
- const items = await fs.readdir(tasksDir, { withFileTypes: true });
206
- taskNames = items.filter((d) => d.isDirectory()).map((d) => d.name);
207
- }
208
- catch {
209
- return undefined;
210
- }
211
- for (const taskName of taskNames) {
212
- const result = await tryReadJson(join(tasksDir, taskName, 'result.json'));
213
- const summary = summarizeFailureReason(result?.reason);
214
- if (summary)
215
- return summary;
216
- }
217
- return undefined;
218
- }
219
- export async function listEvolveRuns(root) {
220
- const dir = join(root, 'evolve', 'runs');
221
- let entries = [];
222
- try {
223
- const items = await fs.readdir(dir, { withFileTypes: true });
224
- entries = items.filter((d) => d.isDirectory()).map((d) => d.name);
225
- }
226
- catch {
227
- return [];
228
- }
229
- const summaries = await Promise.all(entries.map(async (id) => {
230
- const runDir = join(dir, id);
231
- const summary = await tryReadJson(join(runDir, 'summary.json'));
232
- const status = await classifyRunStatus(runDir, summary);
233
- const failureReasonSummary = await readRunFailureReasonSummary(runDir);
234
- if (!summary)
235
- return { runId: id, status, failureReasonSummary };
236
- return { ...summary, runId: summary.runId ?? id, status, failureReasonSummary };
237
- }));
238
- return summaries.sort((a, b) => (b.startedAt ?? '').localeCompare(a.startedAt ?? ''));
239
- }
240
- export async function readEvolveRun(root, runId) {
241
- const dir = join(root, 'evolve', 'runs', runId);
242
- const dirStat = await tryStat(dir);
243
- if (!dirStat)
244
- return null;
245
- const summary = await tryReadJson(join(dir, 'summary.json'));
246
- const status = await classifyRunStatus(dir, summary);
247
- const tasksDir = join(dir, 'tasks');
248
- let taskNames = [];
249
- try {
250
- const items = await fs.readdir(tasksDir, { withFileTypes: true });
251
- taskNames = items.filter((d) => d.isDirectory()).map((d) => d.name);
252
- }
253
- catch {
254
- // No tasks dir.
255
- }
256
- const tasks = await Promise.all(taskNames.map(async (taskId) => {
257
- const result = await tryReadJson(join(tasksDir, taskId, 'result.json'));
258
- return {
259
- taskId,
260
- verdict: result?.verdict,
261
- wallTimeMs: result?.telemetry?.wallTimeMs,
262
- totalCostUsd: result?.telemetry?.totalCostUsd,
263
- reason: result?.reason,
264
- };
265
- }));
266
- const needsDiagnostics = !summary || status !== 'completed';
267
- const wrapperStderrTail = needsDiagnostics
268
- ? (await tryReadFileTail(join(dir, 'wrapper.stderr.log'))) ?? undefined
269
- : undefined;
270
- const wrapperStdoutTail = needsDiagnostics
271
- ? (await tryReadFileTail(join(dir, 'wrapper.stdout.log'))) ?? undefined
272
- : undefined;
273
- let fileListing;
274
- if (needsDiagnostics) {
275
- try {
276
- const items = await fs.readdir(dir, { withFileTypes: true });
277
- fileListing = items.map((d) => d.name + (d.isDirectory() ? '/' : ''));
278
- }
279
- catch {
280
- fileListing = undefined;
281
- }
141
+ /**
142
+ * Read the loop-v2 self-evolution surface: per-episode two-arm forward records,
143
+ * the policy version ledger grouped into per-target lineages, and the
144
+ * reject-buffer of rolled-back episodes. Each reader is independently guarded so
145
+ * a single missing/unreadable store yields an empty slice rather than throwing.
146
+ */
147
+ export async function readSelfEvolution(root) {
148
+ const [episodes, ledger, rejectBuffer] = await Promise.all([
149
+ listEpisodes(root).catch(() => []),
150
+ readPolicyLedgerAll(root).catch(() => []),
151
+ readRejectBufferAll(root).catch(() => []),
152
+ ]);
153
+ // Group ledger entries by targetId, preserving append order within each group.
154
+ const byTarget = new Map();
155
+ for (const entry of ledger) {
156
+ const list = byTarget.get(entry.targetId);
157
+ if (list)
158
+ list.push(entry);
159
+ else
160
+ byTarget.set(entry.targetId, [entry]);
161
+ }
162
+ const policyLineages = [];
163
+ for (const [targetId, entries] of byTarget) {
164
+ const last = entries[entries.length - 1];
165
+ policyLineages.push({
166
+ targetId,
167
+ headVersion: last ? last.version : null,
168
+ entries,
169
+ evolveCount: entries.filter((e) => e.action === 'evolve').length,
170
+ rollbackCount: entries.filter((e) => e.action === 'rollback').length,
171
+ refusedCount: entries.filter((e) => e.action === 'refused').length,
172
+ lastAt: last ? last.at : null,
173
+ });
282
174
  }
283
- const base = summary
284
- ? { ...summary, runId: summary.runId ?? runId, status }
285
- : { runId, status };
286
- return { ...base, tasks, wrapperStderrTail, wrapperStdoutTail, fileListing };
287
- }
288
- export async function readEvolveArchive(root) {
289
- return tryReadJson(join(root, 'evolve', 'archive', 'archive.json'));
175
+ policyLineages.sort((a, b) => (b.lastAt ?? '').localeCompare(a.lastAt ?? ''));
176
+ return { episodes, policyLineages, rejectBuffer };
290
177
  }
291
178
  export async function readAgentInterfacePlan(root) {
292
179
  const { events, skippedRecords } = await readAgentCognitiveEvents({
@@ -366,10 +253,10 @@ export async function readAgentInterfacePlan(root) {
366
253
  boundary: 'Same workflow, tool-native syntax.',
367
254
  },
368
255
  {
369
- id: 'lab',
256
+ id: 'self-evolution',
370
257
  label: 'Self-evolution',
371
- command: 'synergyspec-selfevolving self-evolution evolve',
372
- purpose: 'Score and rank candidate template variants into a human-gated promotion report.',
258
+ command: 'synergyspec-selfevolving self-evolution episode',
259
+ purpose: 'Run one self-evolution episode (main baseline arms reward → bounded policy edit).',
373
260
  boundary: 'Maintainer/research surface, not the normal user workflow.',
374
261
  },
375
262
  ],
@@ -384,15 +271,14 @@ export async function readAgentInterfacePlan(root) {
384
271
  };
385
272
  }
386
273
  export async function readOverview(root) {
387
- const [project, changes, runs, cli] = await Promise.all([
274
+ const [project, changes, se, cli] = await Promise.all([
388
275
  readProjectInfo(root),
389
276
  listChanges(root),
390
- listEvolveRuns(root),
277
+ readSelfEvolution(root),
391
278
  readCliHistory(root, 500),
392
279
  ]);
393
280
  const completedChanges = changes.filter((c) => c.status === 'completed').length;
394
281
  const inProgressChanges = changes.filter((c) => c.status === 'in-progress').length;
395
- const lastRun = runs[0];
396
282
  const recentFailures = cli.filter((e) => e.outcome === 'failure').length;
397
283
  return {
398
284
  project,
@@ -402,14 +288,10 @@ export async function readOverview(root) {
402
288
  inProgress: inProgressChanges,
403
289
  },
404
290
  evolve: {
405
- runs: runs.length,
406
- lastRunAt: lastRun?.startedAt ?? null,
407
- lastVerdict: lastRun
408
- ? Object.entries(lastRun.verdictCounts ?? {})
409
- .filter(([, n]) => n > 0)
410
- .map(([k]) => k)
411
- .join(', ') || null
412
- : null,
291
+ episodes: se.episodes.length,
292
+ lastEpisodeAt: se.episodes[0]?.updatedAt ?? null,
293
+ lastStage: se.episodes[0]?.stage ?? null,
294
+ headVersion: se.policyLineages[0]?.headVersion ?? null,
413
295
  },
414
296
  cli: { totalEvents: cli.length, recentFailures },
415
297
  };
@@ -604,8 +486,8 @@ export async function readArchitecture(root) {
604
486
  const domain = [
605
487
  { id: 'change', label: 'Change', role: 'A unit of planned work with its artifacts' },
606
488
  { id: 'memory', label: 'Memory', role: 'Durable attributed lessons across runs' },
607
- { id: 'evolveRun', label: 'Evolve run', role: 'A benchmark-backed self-evolution attempt' },
608
- { id: 'archive', label: 'Archive', role: 'Finalized changes and evolution lineage' },
489
+ { id: 'episode', label: 'Episode', role: 'One two-arm forward self-evolution episode (main ∥ baseline arms -> graded advantage)' },
490
+ { id: 'archive', label: 'Archive', role: 'Finalized changes' },
609
491
  ];
610
492
  logical = { entities: [...artifacts, ...domain] };
611
493
  }
@@ -624,20 +506,16 @@ export async function readArchitecture(root) {
624
506
  };
625
507
  try {
626
508
  const plan = await readAgentInterfacePlan(root);
627
- const runs = await listEvolveRuns(root);
509
+ const se = await readSelfEvolution(root);
628
510
  const cli = await readCliHistory(root, 500);
629
- const lastRun = runs[0];
630
- const lastVerdict = lastRun
631
- ? Object.entries(lastRun.verdictCounts ?? {})
632
- .filter(([, n]) => n > 0)
633
- .map(([k]) => k)
634
- .join(', ') || null
635
- : null;
636
- const passRates = runs
637
- .map((r) => r.passRate)
511
+ const lastVerdict = se.episodes[0]?.stage ?? null;
512
+ // passRate = fraction of GRADED episodes (advantage measured) that were
513
+ // positive (advantage > 0), over all graded episodes; null when none graded.
514
+ const gradedAdvantages = se.episodes
515
+ .map((e) => e.advantage)
638
516
  .filter((v) => typeof v === 'number' && Number.isFinite(v));
639
- const passRate = passRates.length > 0
640
- ? passRates.reduce((sum, v) => sum + v, 0) / passRates.length
517
+ const passRate = gradedAdvantages.length > 0
518
+ ? gradedAdvantages.filter((v) => v > 0).length / gradedAdvantages.length
641
519
  : null;
642
520
  const toolCounts = new Map();
643
521
  for (const event of cli) {
@@ -653,7 +531,7 @@ export async function readArchitecture(root) {
653
531
  toolDistribution,
654
532
  traceEvents: plan.summary.traces,
655
533
  decisions: plan.summary.decisions,
656
- runs: runs.length,
534
+ runs: se.episodes.length,
657
535
  lastVerdict,
658
536
  passRate,
659
537
  };
@@ -701,8 +579,18 @@ export async function readArchitecture(root) {
701
579
  rel: '.synergyspec-selfevolving/history/events.ndjson',
702
580
  detail: 'Append-only CLI history event log',
703
581
  },
704
- { rel: 'evolve/runs', detail: 'Per-run self-evolution outputs' },
705
- { rel: 'evolve/archive/archive.json', detail: 'Self-evolution lineage archive' },
582
+ {
583
+ rel: '.synergyspec-selfevolving/self-evolution/episodes',
584
+ detail: 'Per-episode two-arm forward records',
585
+ },
586
+ {
587
+ rel: '.synergyspec-selfevolving/self-evolution/policy/ledger.ndjson',
588
+ detail: 'Append-only policy version ledger',
589
+ },
590
+ {
591
+ rel: '.synergyspec-selfevolving/self-evolution/policy/reject-buffer.ndjson',
592
+ detail: 'Rolled-back episodes (reject-buffer)',
593
+ },
706
594
  ];
707
595
  const storeSurfaces = [];
708
596
  for (const store of storePaths) {