synergyspec-selfevolving 2.1.5 → 2.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/commands/learn.js +80 -24
  2. package/dist/commands/self-evolution-dream.d.ts +15 -1
  3. package/dist/commands/self-evolution-dream.js +111 -6
  4. package/dist/commands/self-evolution-episode.d.ts +3 -0
  5. package/dist/commands/self-evolution-episode.js +157 -108
  6. package/dist/commands/workflow/status.js +4 -0
  7. package/dist/core/archive.js +17 -9
  8. package/dist/core/change-readiness.d.ts +16 -1
  9. package/dist/core/change-readiness.js +441 -15
  10. package/dist/core/fitness/loss.d.ts +3 -5
  11. package/dist/core/fitness/loss.js +2 -2
  12. package/dist/core/fitness/test-metrics.d.ts +1 -0
  13. package/dist/core/fitness/test-metrics.js +49 -0
  14. package/dist/core/learn.js +129 -11
  15. package/dist/core/migration.d.ts +6 -14
  16. package/dist/core/migration.js +63 -21
  17. package/dist/core/runner-evidence.d.ts +53 -0
  18. package/dist/core/runner-evidence.js +613 -0
  19. package/dist/core/self-evolution/candidates.js +0 -2
  20. package/dist/core/self-evolution/dream.d.ts +57 -3
  21. package/dist/core/self-evolution/dream.js +480 -9
  22. package/dist/core/self-evolution/episode-orchestrator.d.ts +2 -0
  23. package/dist/core/self-evolution/episode-orchestrator.js +17 -5
  24. package/dist/core/self-evolution/episode-store.d.ts +5 -0
  25. package/dist/core/self-evolution/episode-store.js +6 -2
  26. package/dist/core/self-evolution/evolving-agent.d.ts +33 -4
  27. package/dist/core/self-evolution/evolving-agent.js +138 -11
  28. package/dist/core/self-evolution/host-harness.d.ts +35 -12
  29. package/dist/core/self-evolution/host-harness.js +188 -49
  30. package/dist/core/self-evolution/reward-aggregator.js +2 -2
  31. package/dist/core/templates/workflows/archive-change.js +18 -18
  32. package/dist/core/templates/workflows/dream.js +57 -47
  33. package/dist/core/templates/workflows/learn.js +7 -5
  34. package/dist/core/templates/workflows/run-tests.js +48 -29
  35. package/dist/core/templates/workflows/self-evolving.js +11 -8
  36. package/dist/core/trajectory/facts.d.ts +1 -1
  37. package/dist/core/trajectory/registry.js +39 -8
  38. package/package.json +1 -1
@@ -34,17 +34,18 @@ const HARNESSES = ['claude', 'codex', 'opencode'];
34
34
  export const DEFAULT_AGENT_TIMEOUT_MS = 600_000;
35
35
  /**
36
36
  * Per-host absolute-timeout defaults. claude/codex keep the 10-min
37
- * {@link DEFAULT_AGENT_TIMEOUT_MS}; opencode is given a lower wall because — in
38
- * the v2.1.2 smoke run an opencode/GPT-5.5 print-mode spawn emitted ZERO
39
- * output and burned the full 10 minutes before the wall fired (the host CLI is
40
- * empirically slow-to-emit / occasionally non-terminating in `run` print mode).
37
+ * {@link DEFAULT_AGENT_TIMEOUT_MS}; opencode gets a longer wall because the
38
+ * v2.1.5 Windows/OpenCode smoke run reached reward/scoring, then killed the
39
+ * evolving agent at the previous 5-min wall while it was still producing a
40
+ * bounded candidate. The idle watchdog remains the earlier trip wire for silent
41
+ * wedges, so the absolute wall should be large enough for a live edit attempt.
41
42
  * The wall is still overridable per-host via
42
43
  * `SYNERGYSPEC_SELFEVOLVING_AGENT_TIMEOUT_MS` ({@link resolveAgentTimeoutMs}).
43
44
  */
44
45
  const HARNESS_TIMEOUT_DEFAULTS_MS = {
45
46
  claude: DEFAULT_AGENT_TIMEOUT_MS,
46
47
  codex: DEFAULT_AGENT_TIMEOUT_MS,
47
- opencode: 300_000,
48
+ opencode: 900_000,
48
49
  };
49
50
  /**
50
51
  * Default STDOUT/STDERR-idle watchdog window (2 min). If a spawned host CLI
@@ -62,11 +63,13 @@ export const DEFAULT_AGENT_IDLE_TIMEOUT_MS = 120_000;
62
63
  * emit ZERO bytes for well over 2 min while it reasons, so claude/codex get a
63
64
  * 5-min idle leash. opencode keeps the tighter 2-min window — it is the
64
65
  * empirically-wedging host (the v2.1.2 hang emitted no output at all) and a
65
- * faster idle kill is what we want there.
66
+ * faster idle kill is what we want there. opencode's absolute wall is longer
67
+ * than claude/codex because its live edit attempts can be slower even when they
68
+ * are not silent.
66
69
  *
67
70
  * INVARIANT: every harness's idle default is strictly LESS than its absolute
68
71
  * default ({@link HARNESS_TIMEOUT_DEFAULTS_MS}) so the idle watchdog stays the
69
- * earlier trip wire (claude 300<600, codex 300<600, opencode 120<300).
72
+ * earlier trip wire (claude 300<600, codex 300<600, opencode 120<900).
70
73
  * Overridable per host via `SYNERGYSPEC_SELFEVOLVING_AGENT_IDLE_TIMEOUT_MS`
71
74
  * ({@link resolveIdleTimeoutMs}).
72
75
  */
@@ -125,8 +128,8 @@ const AGENT_TIMEOUT_ENV = 'SYNERGYSPEC_SELFEVOLVING_AGENT_TIMEOUT_MS';
125
128
  * (1) `SYNERGYSPEC_SELFEVOLVING_AGENT_TIMEOUT_MS` when it parses to a positive
126
129
  * finite integer — a host-wide tunable that overrides every harness.
127
130
  * (2) the per-harness default ({@link HARNESS_TIMEOUT_DEFAULTS_MS}): the 10-min
128
- * {@link DEFAULT_AGENT_TIMEOUT_MS} for claude/codex, a lower wall for the
129
- * empirically slow-to-emit opencode.
131
+ * {@link DEFAULT_AGENT_TIMEOUT_MS} for claude/codex, and a longer wall for
132
+ * opencode live edit attempts.
130
133
  *
131
134
  * `harness` omitted ⇒ {@link resolveHostHarness} is consulted so the default is
132
135
  * host-appropriate.
@@ -150,25 +153,25 @@ function isAgentHarness(value) {
150
153
  * Precedence:
151
154
  * (a) `SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS` when it equals claude|codex|opencode.
152
155
  * (b) Heuristic on the ambient environment:
153
- * - `CODEX_HOME` or any `CODEX_*` var set → 'codex'.
154
156
  * - `OPENCODE_DATA_DIR` or any `OPENCODE_*` var set → 'opencode'.
157
+ * - `CODEX_HOME` or any `CODEX_*` var set → 'codex'.
155
158
  * (c) Default 'claude'.
156
159
  *
157
- * Codex is checked before opencode so that, in the unlikely event both families
158
- * of env vars are present, the explicit override remains the only way to force a
159
- * choice; the heuristic is best-effort.
160
+ * OpenCode is checked before Codex because Codex can be the meta-runner that is
161
+ * invoking an OpenCode smoke test; in that mixed environment OPENCODE_* is the
162
+ * stronger signal for the observed run whose trajectory we must grade.
160
163
  */
161
164
  export function resolveHostHarness() {
162
165
  const override = process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS;
163
166
  if (isAgentHarness(override))
164
167
  return override;
165
168
  const envKeys = Object.keys(process.env);
166
- const hasCodex = process.env.CODEX_HOME !== undefined || envKeys.some((k) => k.startsWith('CODEX_'));
167
- if (hasCodex)
168
- return 'codex';
169
169
  const hasOpencode = process.env.OPENCODE_DATA_DIR !== undefined || envKeys.some((k) => k.startsWith('OPENCODE_'));
170
170
  if (hasOpencode)
171
171
  return 'opencode';
172
+ const hasCodex = process.env.CODEX_HOME !== undefined || envKeys.some((k) => k.startsWith('CODEX_'));
173
+ if (hasCodex)
174
+ return 'codex';
172
175
  return 'claude';
173
176
  }
174
177
  // ---------------------------------------------------------------------------
@@ -194,14 +197,23 @@ function hostHarnessPath(repoRoot) {
194
197
  * spawns, never a precondition for the current run.
195
198
  */
196
199
  export async function persistHostHarness(repoRoot, harness) {
200
+ let tmpFile = null;
197
201
  try {
198
202
  const file = hostHarnessPath(repoRoot);
199
203
  await fs.mkdir(path.dirname(file), { recursive: true });
200
- await fs.writeFile(file, `${JSON.stringify({ harness }, null, 2)}\n`, 'utf8');
204
+ tmpFile = path.join(path.dirname(file), `${HOST_HARNESS_FILE}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}.tmp`);
205
+ await fs.writeFile(tmpFile, `${JSON.stringify({ harness }, null, 2)}\n`, 'utf8');
206
+ await fs.rename(tmpFile, file);
207
+ tmpFile = null;
201
208
  }
202
209
  catch {
203
210
  // Swallow: a read-only or transient FS must not break the loop.
204
211
  }
212
+ finally {
213
+ if (tmpFile) {
214
+ await fs.unlink(tmpFile).catch(() => undefined);
215
+ }
216
+ }
205
217
  }
206
218
  /**
207
219
  * Read + parse + validate the persisted-harness sidecar. Returns the
@@ -253,18 +265,16 @@ function binaryResolvable(binary) {
253
265
  if (binary.trim().length === 0)
254
266
  return false;
255
267
  const isWindows = process.platform === 'win32';
256
- // Windows PATHEXT (e.g. `.COM;.EXE;.BAT;.CMD`); also try the bare name (a
257
- // binary may already carry its extension).
258
- const exts = isWindows
259
- ? ['', ...(process.env.PATHEXT ?? '.COM;.EXE;.BAT;.CMD').split(';').filter(Boolean)]
260
- : [''];
268
+ // Windows PATHEXT (e.g. `.COM;.EXE;.BAT;.CMD`). A bare extensionless npm
269
+ // shim is not a CreateProcess target; prefer the PATHEXT-resolved .cmd/.exe.
270
+ const exts = executableExtensions(binary, isWindows, process.env.PATHEXT);
261
271
  const isExecutableFile = (candidate) => {
262
272
  try {
263
273
  const st = statSync(candidate);
264
274
  if (!st.isFile())
265
275
  return false;
266
276
  if (isWindows)
267
- return true; // Windows has no executable bit; existence + ext suffices.
277
+ return isWindowsSpawnCompatibleExecutable(candidate);
268
278
  // POSIX: any execute bit (owner/group/other) marks it runnable.
269
279
  return (st.mode & 0o111) !== 0;
270
280
  }
@@ -314,8 +324,11 @@ function persistedBinary(harness) {
314
324
  * wrong binary,
315
325
  * (4) 'claude'.
316
326
  *
317
- * When (1) or (2) resolve CONFIDENTLY from a real env signal, the result is
318
- * persisted best-effort (fire-and-forget) so a later env-less call recovers it.
327
+ * This resolver is read-only. Command entry points that need to seed an
328
+ * env-less subagent call `seedHostHarnessForRepo`; keeping this function pure
329
+ * matters because learn preview/report generation uses it during trajectory
330
+ * lookup and must not write sidecar files.
331
+ *
319
332
  * The env checks are replicated inline (rather than only calling the sync
320
333
  * {@link resolveHostHarness}) precisely so we can tell "env gave a real signal"
321
334
  * apart from "defaulted to claude with no signal" — the sync resolver collapses
@@ -325,24 +338,21 @@ export async function resolveHostHarnessDetailsForRepo(repoRoot) {
325
338
  // (1) explicit override.
326
339
  const override = process.env.SYNERGYSPEC_SELFEVOLVING_HOST_HARNESS;
327
340
  if (isAgentHarness(override)) {
328
- void persistHostHarness(repoRoot, override);
329
341
  return { harness: override, source: 'override' };
330
342
  }
331
343
  // (2) env heuristic — only a POSITIVE hit counts (mirrors resolveHostHarness'
332
- // CODEX_-before-OPENCODE_ ordering, but distinguishes a real signal from
344
+ // OPENCODE_-before-CODEX_ ordering, but distinguishes a real signal from
333
345
  // the 'claude' fall-through).
334
346
  const envKeys = Object.keys(process.env);
335
- const hasCodex = process.env.CODEX_HOME !== undefined || envKeys.some((k) => k.startsWith('CODEX_'));
336
- if (hasCodex) {
337
- void persistHostHarness(repoRoot, 'codex');
338
- return { harness: 'codex', source: 'env' };
339
- }
340
347
  const hasOpencode = process.env.OPENCODE_DATA_DIR !== undefined ||
341
348
  envKeys.some((k) => k.startsWith('OPENCODE_'));
342
349
  if (hasOpencode) {
343
- void persistHostHarness(repoRoot, 'opencode');
344
350
  return { harness: 'opencode', source: 'env' };
345
351
  }
352
+ const hasCodex = process.env.CODEX_HOME !== undefined || envKeys.some((k) => k.startsWith('CODEX_'));
353
+ if (hasCodex) {
354
+ return { harness: 'codex', source: 'env' };
355
+ }
346
356
  // (3) persisted sidecar (the env-less-subagent recovery path) — honored ONLY
347
357
  // when its binary is resolvable here. The persisted value for codex /
348
358
  // opencode IS the binary name; probing it on PATH skips a wrong/stale
@@ -358,14 +368,27 @@ export async function resolveHostHarnessDetailsForRepo(repoRoot) {
358
368
  export async function resolveHostHarnessForRepo(repoRoot) {
359
369
  return (await resolveHostHarnessDetailsForRepo(repoRoot)).harness;
360
370
  }
371
+ /**
372
+ * Resolve the host harness and persist only a confident host signal (explicit
373
+ * override or CODEX_/OPENCODE_ env). This is the side-effecting entry point for
374
+ * command handlers that are about to spawn env-less subagents; core report and
375
+ * trajectory readers should use the read-only resolver above.
376
+ */
377
+ export async function seedHostHarnessForRepo(repoRoot) {
378
+ const resolution = await resolveHostHarnessDetailsForRepo(repoRoot);
379
+ if (resolution.source === 'override' || resolution.source === 'env') {
380
+ await persistHostHarness(repoRoot, resolution.harness);
381
+ }
382
+ return resolution;
383
+ }
361
384
  /**
362
385
  * Build the concrete `{binary, args, useStdin}` invocation for a headless run.
363
386
  *
364
387
  * Full escape hatch: if `SYNERGYSPEC_CODE_AGENT_COMMAND` is set, it is parsed as a
365
- * JSON `string[]` template. The literal tokens `{prompt}` and `{cwd}` are
366
- * substituted in each element; `binary = template[0]`, `args = template.slice(1)`.
367
- * `useStdin` is inferred true iff the template does NOT contain a `{prompt}`
368
- * token anywhere (so the caller streams the prompt to stdin instead).
388
+ * JSON `string[]` template. The literal token `{cwd}` is substituted in each
389
+ * element; `binary = template[0]`, `args = template.slice(1)`. `{prompt}` is
390
+ * deliberately rejected: loop-v2 prompts are too large for argv and must flow
391
+ * through stdin for every harness and override.
369
392
  *
370
393
  * Otherwise the command is derived from the harness (default
371
394
  * {@link resolveHostHarness}). Every harness streams the prompt over stdin
@@ -385,12 +408,14 @@ export function buildHeadlessCommand(prompt, opts) {
385
408
  throw new Error('SYNERGYSPEC_CODE_AGENT_COMMAND must be a non-empty JSON array of strings');
386
409
  }
387
410
  const rawTemplate = parsed;
388
- const useStdin = !rawTemplate.some((e) => e.includes('{prompt}'));
389
- const substituted = rawTemplate.map((e) => e.split('{prompt}').join(prompt).split('{cwd}').join(opts.cwd));
411
+ if (rawTemplate.some((e) => e.includes('{prompt}'))) {
412
+ throw new Error('SYNERGYSPEC_CODE_AGENT_COMMAND must not contain {prompt}; prompts are always streamed over stdin');
413
+ }
414
+ const substituted = rawTemplate.map((e) => e.split('{cwd}').join(opts.cwd));
390
415
  return {
391
416
  binary: substituted[0],
392
417
  args: substituted.slice(1),
393
- useStdin,
418
+ useStdin: true,
394
419
  };
395
420
  }
396
421
  const harness = opts.harness ?? resolveHostHarness();
@@ -422,6 +447,110 @@ export function buildHeadlessCommand(prompt, opts) {
422
447
  }
423
448
  }
424
449
  }
450
+ export function resolveHeadlessCommandForSpawn(command, opts = {}) {
451
+ const platform = opts.platform ?? process.platform;
452
+ if (platform !== 'win32') {
453
+ return { ...command, shell: false };
454
+ }
455
+ const resolved = resolveWindowsExecutable(command.binary, {
456
+ env: opts.env ?? process.env,
457
+ isExecutableFile: opts.isExecutableFile ??
458
+ ((candidate) => {
459
+ try {
460
+ return statSync(candidate).isFile();
461
+ }
462
+ catch {
463
+ return false;
464
+ }
465
+ }),
466
+ });
467
+ const binary = resolved ?? command.binary;
468
+ if (isUnsupportedWindowsExecutable(binary)) {
469
+ throw new Error(`Windows headless agent binary '${binary}' has unsupported extension '${path.win32
470
+ .extname(binary)
471
+ .toLowerCase()}'; use a .cmd, .bat, .exe, or .com shim, or invoke the interpreter explicitly via SYNERGYSPEC_CODE_AGENT_COMMAND.`);
472
+ }
473
+ if (isWindowsShellScript(binary)) {
474
+ const wrapper = wrapWindowsShellScript(binary, command.args, opts.env ?? process.env);
475
+ return {
476
+ ...command,
477
+ binary: wrapper.binary,
478
+ args: wrapper.args,
479
+ shell: false,
480
+ };
481
+ }
482
+ return {
483
+ ...command,
484
+ binary,
485
+ shell: false,
486
+ };
487
+ }
488
+ function executableExtensions(binary, isWindows, pathext) {
489
+ if (!isWindows)
490
+ return [''];
491
+ if (path.win32.extname(binary))
492
+ return [''];
493
+ return (pathext ?? '.COM;.EXE;.BAT;.CMD')
494
+ .split(';')
495
+ .map((ext) => ext.trim())
496
+ .filter(Boolean);
497
+ }
498
+ function resolveWindowsExecutable(binary, opts) {
499
+ if (!binary || binary.trim().length === 0)
500
+ return null;
501
+ const exts = executableExtensions(binary, true, opts.env.PATHEXT);
502
+ const candidates = [];
503
+ const hasPathSeparator = binary.includes('/') || binary.includes('\\');
504
+ if (hasPathSeparator) {
505
+ candidates.push(...exts.map((ext) => binary + ext));
506
+ }
507
+ else {
508
+ const entries = (opts.env.PATH ?? '').split(';').filter(Boolean);
509
+ for (const dir of entries) {
510
+ for (const ext of exts)
511
+ candidates.push(path.win32.join(dir, binary + ext));
512
+ }
513
+ }
514
+ let firstUnsupported = null;
515
+ for (const candidate of candidates) {
516
+ if (!opts.isExecutableFile(candidate, true))
517
+ continue;
518
+ if (isWindowsSpawnCompatibleExecutable(candidate))
519
+ return candidate;
520
+ firstUnsupported ??= candidate;
521
+ }
522
+ if (firstUnsupported) {
523
+ throw new Error(`Windows headless agent binary resolved to '${firstUnsupported}', but that extension cannot be spawned with shell:false; use a .cmd, .bat, .exe, or .com shim, or invoke the interpreter explicitly via SYNERGYSPEC_CODE_AGENT_COMMAND.`);
524
+ }
525
+ return null;
526
+ }
527
+ function isWindowsShellScript(binary) {
528
+ const ext = path.win32.extname(binary).toLowerCase();
529
+ return ext === '.cmd' || ext === '.bat';
530
+ }
531
+ function isWindowsSpawnCompatibleExecutable(binary) {
532
+ const ext = path.win32.extname(binary).toLowerCase();
533
+ return ext === '' || ext === '.com' || ext === '.exe' || ext === '.bat' || ext === '.cmd';
534
+ }
535
+ function isUnsupportedWindowsExecutable(binary) {
536
+ const ext = path.win32.extname(binary).toLowerCase();
537
+ return ext.length > 0 && !isWindowsSpawnCompatibleExecutable(binary);
538
+ }
539
+ function wrapWindowsShellScript(binary, args, env) {
540
+ const comspec = firstNonBlankEnv(env, 'ComSpec', 'COMSPEC') ?? 'cmd.exe';
541
+ return {
542
+ binary: comspec,
543
+ args: ['/d', '/s', '/c', 'call', binary, ...args],
544
+ };
545
+ }
546
+ function firstNonBlankEnv(env, ...keys) {
547
+ for (const key of keys) {
548
+ const value = env[key];
549
+ if (typeof value === 'string' && value.trim().length > 0)
550
+ return value;
551
+ }
552
+ return undefined;
553
+ }
425
554
  /**
426
555
  * The claude-default binary fallback: `SYNERGYSPEC_SELFEVOLVING_CLAUDE_BIN` when
427
556
  * non-empty, else `'claude'`. Kept here so {@link buildHeadlessCommand} is the
@@ -457,16 +586,27 @@ function claudeDefaultBinary() {
457
586
  */
458
587
  export async function runHeadlessAgent(prompt, opts) {
459
588
  const spawnImpl = opts.spawn ?? nodeSpawn;
460
- const command = buildHeadlessCommand(prompt, {
461
- cwd: opts.cwd,
462
- harness: opts.harness,
463
- binaryOverride: opts.binaryOverride,
464
- });
589
+ let spawnCommand;
590
+ try {
591
+ const command = buildHeadlessCommand(prompt, {
592
+ cwd: opts.cwd,
593
+ harness: opts.harness,
594
+ binaryOverride: opts.binaryOverride,
595
+ });
596
+ spawnCommand = resolveHeadlessCommandForSpawn(command);
597
+ }
598
+ catch (e) {
599
+ return {
600
+ exitCode: -1,
601
+ stdout: '',
602
+ stderr: e instanceof Error ? e.message : String(e),
603
+ };
604
+ }
465
605
  return await new Promise((resolve) => {
466
606
  let child;
467
607
  try {
468
- child = spawnImpl(command.binary, command.args, {
469
- shell: false,
608
+ child = spawnImpl(spawnCommand.binary, spawnCommand.args, {
609
+ shell: spawnCommand.shell,
470
610
  cwd: opts.cwd,
471
611
  });
472
612
  }
@@ -524,7 +664,7 @@ export async function runHeadlessAgent(prompt, opts) {
524
664
  // ignore
525
665
  }
526
666
  };
527
- if (command.useStdin) {
667
+ if (spawnCommand.useStdin) {
528
668
  // Swallow stdin stream errors (e.g. EPIPE when the child exits before it
529
669
  // has read the whole — possibly 100KB+ — prompt). The real failure is
530
670
  // reported via the child's own 'error'/'close' handlers below; an
@@ -585,7 +725,6 @@ export async function runHeadlessAgent(prompt, opts) {
585
725
  if (settled)
586
726
  return;
587
727
  const elapsedS = Math.round((Date.now() - startedAt) / 1000);
588
- // eslint-disable-next-line no-console
589
728
  console.error(`[self-evolution] headless agent running: ${elapsedS}s elapsed, ${bytesReceived} bytes received`);
590
729
  }, HEARTBEAT_INTERVAL_MS);
591
730
  heartbeatTimer.unref?.();
@@ -1,4 +1,4 @@
1
- import { DEFAULT_AGENT_TIMEOUT_MS, resolveAgentTimeoutMs, runHeadlessAgent, } from './host-harness.js';
1
+ import { resolveAgentTimeoutMs, runHeadlessAgent, } from './host-harness.js';
2
2
  import { enrichGradientWithDeepRead, DEFAULT_DEEP_READ_CONFIG, } from './reward-deepread.js';
3
3
  import { loadRewardScoringContext, scoreOnce, deriveSingleSampleVerdict, buildAnchors, computeJudgeVerifierDivergence, formatJudgeVerifierDivergenceFlag, JUDGE_VERIFIER_DIVERGENCE_FLAG_PREFIX, } from './reward-agent.js';
4
4
  import { writeDiagnosis, advanceEpisodeStage, } from './episode-store.js';
@@ -102,7 +102,7 @@ export async function runRewardAgentEnsemble(opts) {
102
102
  cwd: repoRoot,
103
103
  spawn: opts.spawn,
104
104
  binaryOverride: opts.binary,
105
- timeoutMs: opts.timeoutMs ?? DEFAULT_AGENT_TIMEOUT_MS,
105
+ timeoutMs: opts.timeoutMs ?? resolveAgentTimeoutMs(opts.harness),
106
106
  harness: opts.harness,
107
107
  });
108
108
  if (r.exitCode === 0 && r.stdout.length > 0)
@@ -121,12 +121,12 @@ export function getArchiveChangeSkillTemplate() {
121
121
  If \`evolution.status\` is \`error\`, surface the defect from status and still warn that the durable report is missing.
122
122
  If \`evolution.status\` is \`not-run\`, learn has not run or left evidence.
123
123
 
124
- If either is missing, display a non-blocking warning before archiving:
125
- - Missing verification evidence: suggest \`/synspec:verify <name>\`
126
- - Missing learn evidence: suggest \`/synspec:learn <name>\`
127
- - Ask the user to confirm before archiving without those files
128
-
129
- 4d. **Final workspace/package identity check**
124
+ If either is missing, stop before archiving:
125
+ - Missing verification evidence: suggest \`/synspec:verify <name>\`
126
+ - Missing learn evidence: suggest \`/synspec:learn <name>\`
127
+ - Do not archive until the missing evidence exists and readiness passes
128
+
129
+ 4d. **Final workspace/package identity check**
130
130
 
131
131
  Before archiving, validate that verification evidence still describes the
132
132
  current workspace:
@@ -169,7 +169,7 @@ export function getArchiveChangeSkillTemplate() {
169
169
  - Whether specs were synced (if applicable)
170
170
  - Blast radius triage results (e.g. "2 specs synced, 1 marked for review" or "No blast radius")
171
171
  - Verify/learn evidence status
172
- - Note about any warnings (incomplete artifacts/tasks or missing verify/learn evidence)
172
+ - Note about any warnings (incomplete artifacts/tasks) and any hard evidence blockers found
173
173
 
174
174
  **Output On Success**
175
175
 
@@ -188,12 +188,12 @@ All artifacts complete. All tasks complete.
188
188
  **Guardrails**
189
189
  - Always prompt for change selection if not provided
190
190
  - Use artifact graph (synergyspec-selfevolving status --json) for completion checking
191
- - Don't block archive on warnings - just inform and confirm
191
+ - Do not block archive on force-bypassable warnings, such as incomplete artifacts/tasks when the user explicitly forces them; do block on missing verification evidence, missing learn evidence, invalid workspace identity, or incomplete evolution.
192
192
  - Preserve .synergyspec-selfevolving.yaml when moving to archive (it moves with the directory)
193
193
  - Show clear summary of what happened
194
194
  - If sync is requested, use synergyspec-selfevolving-sync-specs approach (agent-driven)
195
195
  - If delta specs exist, always run the sync assessment and show the combined summary before prompting
196
- - If verification or learn evidence is missing, warn and confirm before archiving; do not silently skip those workflow stages
196
+ - If verification or learn evidence is missing, stop before archiving; do not silently skip those workflow stages
197
197
  - If spec-blast-radius.md does not exist, skip step 4b silently (no warning needed)`,
198
198
  license: 'MIT',
199
199
  compatibility: 'Requires synergyspec-selfevolving CLI.',
@@ -325,12 +325,12 @@ export function getOpsxArchiveCommandTemplate() {
325
325
  If \`evolution.status\` is \`error\`, surface the defect from status and still warn that the durable report is missing.
326
326
  If \`evolution.status\` is \`not-run\`, learn has not run or left evidence.
327
327
 
328
- If either is missing, display a non-blocking warning before archiving:
329
- - Missing verification evidence: suggest \`/synspec:verify <name>\`
330
- - Missing learn evidence: suggest \`/synspec:learn <name>\`
331
- - Ask the user to confirm before archiving without those files
332
-
333
- 5. **Perform the archive**
328
+ If either is missing, stop before archiving:
329
+ - Missing verification evidence: suggest \`/synspec:verify <name>\`
330
+ - Missing learn evidence: suggest \`/synspec:learn <name>\`
331
+ - Do not archive until the missing evidence exists and readiness passes
332
+
333
+ 5. **Perform the archive**
334
334
 
335
335
  Create the archive directory if it doesn't exist:
336
336
  \`\`\`bash
@@ -356,7 +356,7 @@ export function getOpsxArchiveCommandTemplate() {
356
356
  - Spec sync status (synced / sync skipped / no delta specs)
357
357
  - Blast radius triage results (e.g. "2 specs synced, 1 marked for review" or "No blast radius")
358
358
  - Verify/learn evidence status
359
- - Note about any warnings (incomplete artifacts/tasks or missing verify/learn evidence)
359
+ - Note about any warnings (incomplete artifacts/tasks) and any hard evidence blockers found
360
360
 
361
361
  **Output On Success**
362
362
 
@@ -424,12 +424,12 @@ Target archive directory already exists.
424
424
  **Guardrails**
425
425
  - Always prompt for change selection if not provided
426
426
  - Use artifact graph (synergyspec-selfevolving status --json) for completion checking
427
- - Don't block archive on warnings - just inform and confirm
427
+ - Do not block archive on force-bypassable warnings, such as incomplete artifacts/tasks when the user explicitly forces them; do block on missing verification evidence, missing learn evidence, invalid workspace identity, or incomplete evolution.
428
428
  - Preserve .synergyspec-selfevolving.yaml when moving to archive (it moves with the directory)
429
429
  - Show clear summary of what happened
430
430
  - If sync is requested, use the Skill tool to invoke \`synergyspec-selfevolving-sync-specs\` (agent-driven)
431
431
  - If delta specs exist, always run the sync assessment and show the combined summary before prompting
432
- - If verification or learn evidence is missing, warn and confirm before archiving; do not silently skip those workflow stages
432
+ - If verification or learn evidence is missing, stop before archiving; do not silently skip those workflow stages
433
433
  - If spec-blast-radius.md does not exist, skip step 4b silently (no warning needed)`
434
434
  };
435
435
  }
@@ -3,32 +3,35 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a Dream mode and flags
3
3
  Accepted forms:
4
4
 
5
5
  \`\`\`text
6
- /synspec:dream
7
- /synspec:dream preview [--target <id>] [--limit <n>] [--json]
8
- /synspec:dream run [--target <id>] [--limit <n>] [--json]
9
- /synspec:dream show [runId] [--json]
10
- \`\`\`
11
-
12
- Bare \`/synspec:dream\` means \`preview\`. Preview is read-only.
6
+ /synspec:dream
7
+ /synspec:dream preview [--target <id>] [--limit <n>] [--json]
8
+ /synspec:dream run [--target <id>] [--limit <n>] [--apply --yes] [--json]
9
+ /synspec:dream show [runId] [--json]
10
+ /synspec:dream policy-update <candidateId> --accepted-by <name> --yes [--json]
11
+ \`\`\`
12
+
13
+ Bare \`/synspec:dream\` means \`preview\`. Preview is read-only. Plain \`run\` writes only Dream artifacts. \`run --apply --yes\` and \`policy-update ... --yes\` are explicit policy-update entrances for already accepted Dream candidates.
13
14
 
14
15
  **Purpose**
15
16
 
16
- This is the SS agent-harness entrance for offline Supervised Learning Dream. The user should trigger Dream from the code-agent chat, not by opening a separate terminal. Your job is to call the existing CLI engine, parse the JSON result, and relay a short Dream Verdict.
17
-
18
- Dream is not the loop-v2 episode runner. It batch-reads completed evidence and proposes optimizer briefs for existing skill/workflow/template targets. It never creates new skills, never edits POLICY directly, never promotes candidates, and never runs the episode/reward/evolving agents.
17
+ This is the SS agent-harness entrance for offline Supervised Learning Dream. The user should trigger Dream from the code-agent chat, not by opening a separate terminal. Your job is to call the existing CLI engine, parse the JSON result, and relay a short Dream Verdict.
18
+
19
+ Dream is not the loop-v2 episode runner. It batch-reads completed evidence and proposes optimizer briefs for existing skill/workflow/template targets. It never creates new skills, never edits POLICY directly, and never runs the episode/reward/evolving agents. By default Dream is proposal-only; policy changes require an explicit accepted-candidate update with \`--yes\`, synthesize bounded edits into the candidate package, pass the static gate, and promote through the existing rollback/ledger path.
19
20
 
20
21
  **Mode parsing**
21
22
 
22
23
  1. If the first argument is missing, use \`preview\`.
23
- 2. If the first argument is one of \`preview\`, \`run\`, or \`show\`, use that mode.
24
- 3. If the first argument starts with \`--\`, treat it as a \`preview\` flag.
25
- 4. If the mode is unknown, stop and show the accepted forms above.
26
-
27
- Pass only these user options through:
28
- - \`--target <id>\`
29
- - \`--limit <n>\`
30
- - \`--json\`
31
- - \`runId\` for \`show\`
24
+ 2. If the first argument is one of \`preview\`, \`run\`, \`show\`, or \`policy-update\`, use that mode.
25
+ 3. If the first argument starts with \`--\`, treat it as a \`preview\` flag.
26
+ 4. If the mode is unknown, stop and show the accepted forms above.
27
+
28
+ Pass only these user options through:
29
+ - \`--target <id>\`
30
+ - \`--limit <n>\`
31
+ - \`--apply\` and \`--yes\` for \`run\`
32
+ - \`candidateId\`, \`--accepted-by <name>\`, and \`--yes\` for \`policy-update\`
33
+ - \`--json\`
34
+ - \`runId\` for \`show\`
32
35
 
33
36
  Always add \`--json\` to the CLI command you run so the result is machine-readable. If the user explicitly asked for \`--json\`, include the compact raw JSON after the Dream Verdict; otherwise provide the human summary only.
34
37
 
@@ -46,49 +49,56 @@ Always add \`--json\` to the CLI command you run so the result is machine-readab
46
49
  synergyspec-selfevolving self-evolution dream run --json
47
50
  \`\`\`
48
51
 
49
- For show:
50
- \`\`\`bash
51
- synergyspec-selfevolving self-evolution dream show --json
52
- \`\`\`
53
-
54
- Append \`--target <id>\`, \`--limit <n>\`, or \`runId\` only when the user supplied them.
52
+ For show:
53
+ \`\`\`bash
54
+ synergyspec-selfevolving self-evolution dream show --json
55
+ \`\`\`
56
+
57
+ For accepted candidate policy update:
58
+ \`\`\`bash
59
+ synergyspec-selfevolving self-evolution dream policy-update <candidateId> --accepted-by <name> --yes --json
60
+ \`\`\`
61
+
62
+ Append \`--target <id>\`, \`--limit <n>\`, \`--apply\`, \`--yes\`, \`--accepted-by <name>\`, or \`runId\` only when the user supplied them. Never add \`--yes\` on the user's behalf.
55
63
 
56
64
  2. **Interpret the result without re-judging it**
57
65
 
58
- Read candidate ids, target ids, evidence summary, run id, and write paths from the CLI JSON when present. Do not invent candidate ids or claim a policy change.
66
+ Read candidate ids, target ids, evidence summary, run id, update outcome, gate result, promoted files, policy version, and write paths from the CLI JSON when present. Do not invent candidate ids, and do not claim a policy change unless the CLI JSON reports a promoted update.
59
67
 
60
68
  3. **Classify writes**
61
69
 
62
- - \`preview\`: Writes are \`none\`.
63
- - \`run\`: Writes are \`dream-run + draft candidates\`.
64
- - \`show\`: Writes are \`none\`.
65
-
66
- 4. **Report the next step**
67
-
68
- Dream candidates are proposal-only optimizer briefs. If the user wants to turn one into a real SS change, use the existing review/promotion channel after a reviewer or optimizer backend authors bounded edits. Do not promote from this skill.
70
+ - \`preview\`: Writes are \`none\`.
71
+ - plain \`run\`: Writes are \`dream-run + draft candidates\`.
72
+ - \`run --apply --yes\`: Writes are \`dream-run + candidates + gated policy update\` when the update is promoted; otherwise report the refusal outcome.
73
+ - \`show\`: Writes are \`none\`.
74
+ - \`policy-update --yes\`: Writes are \`gated policy update\` when promoted; otherwise report the refusal outcome.
75
+
76
+ 4. **Report the next step**
77
+
78
+ Plain Dream candidates are proposal-only optimizer briefs. To turn an accepted candidate into policy, use \`/synspec:dream policy-update <candidateId> --accepted-by <name> --yes\`. The update path must author bounded edits, pass the static gate, and promote through the existing rollback/ledger channel; if any gate refuses, report the refusal and leave the policy unchanged.
69
79
 
70
80
  **Output Format**
71
81
 
72
82
  End with this block:
73
83
 
74
84
  \`\`\`text
75
- ## Dream Verdict
76
- - Mode: preview | run | show
77
- - Run id: <id or none>
78
- - Candidates: <ids or none>
79
- - Targets: <target ids or all eligible>
80
- - Evidence read: <short summary>
81
- - Writes: none | dream-run + draft candidates
82
- - Policy changed: no
83
- - New skills created: no
84
- - Next step: review candidate(s), then use the existing review/promotion flow if wanted
85
- \`\`\`
85
+ ## Dream Verdict
86
+ - Mode: preview | run | show | policy-update
87
+ - Run id: <id or none>
88
+ - Candidates: <ids or none>
89
+ - Targets: <target ids or all eligible>
90
+ - Evidence read: <short summary>
91
+ - Writes: none | dream-run + draft candidates | dream-run + candidates + gated policy update | gated policy update
92
+ - Policy changed: yes | no
93
+ - New skills created: no
94
+ - Next step: review candidate(s), run accepted policy-update, or inspect gate refusal
95
+ \`\`\`
86
96
 
87
97
  If the CLI command fails, still end with \`## Dream Verdict\` and set fields to \`none\` where unknown. Put the command failure under \`Evidence read\` or \`Next step\`; do not retry with a different self-evolution command.`;
88
98
  export function getDreamSkillTemplate() {
89
99
  return {
90
100
  name: 'synergyspec-selfevolving-dream',
91
- description: 'SS Dream entrance: preview, run, or inspect offline Supervised Learning Dream proposals from the code-agent chat.',
101
+ description: 'SS Dream entrance: preview, run, inspect, or apply accepted offline Supervised Learning Dream updates from the code-agent chat.',
92
102
  instructions: `Run the SS offline Supervised Learning Dream lane from the code-agent harness.
93
103
 
94
104
  ${INSTRUCTIONS_BODY}`,
@@ -100,12 +110,12 @@ ${INSTRUCTIONS_BODY}`,
100
110
  export function getOpsxDreamCommandTemplate() {
101
111
  return {
102
112
  name: 'SS: Dream',
103
- description: 'Preview, run, or inspect offline Supervised Learning Dream proposals from the code-agent chat',
113
+ description: 'Preview, run, inspect, or apply accepted offline Supervised Learning Dream updates from the code-agent chat',
104
114
  category: 'Workflow',
105
115
  tags: ['workflow', 'dream', 'self-evolution', 'offline-learning'],
106
116
  content: `Run the SS offline Supervised Learning Dream lane from the code-agent harness.
107
117
 
108
- **Input**: Optionally specify a mode after \`/synspec:dream\` (for example \`/synspec:dream preview\`, \`/synspec:dream run --limit 5\`, or \`/synspec:dream show\`). Bare \`/synspec:dream\` means read-only \`preview\`.
118
+ **Input**: Optionally specify a mode after \`/synspec:dream\` (for example \`/synspec:dream preview\`, \`/synspec:dream run --limit 5\`, \`/synspec:dream show\`, or \`/synspec:dream policy-update <candidateId> --accepted-by <name> --yes\`). Bare \`/synspec:dream\` means read-only \`preview\`.
109
119
 
110
120
  ${INSTRUCTIONS_BODY}`,
111
121
  };