synergyspec-selfevolving 1.4.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/README.md +31 -18
  2. package/dist/commands/learn.d.ts +12 -1
  3. package/dist/commands/learn.js +158 -11
  4. package/dist/commands/self-evolution-episode.d.ts +177 -0
  5. package/dist/commands/self-evolution-episode.js +431 -0
  6. package/dist/commands/self-evolution.d.ts +12 -190
  7. package/dist/commands/self-evolution.js +114 -866
  8. package/dist/core/archive.d.ts +0 -1
  9. package/dist/core/archive.js +0 -58
  10. package/dist/core/artifact-graph/instruction-loader.d.ts +2 -4
  11. package/dist/core/artifact-graph/instruction-loader.js +3 -31
  12. package/dist/core/fitness/loss.d.ts +5 -5
  13. package/dist/core/fitness/loss.js +4 -4
  14. package/dist/core/fitness/test-failures.js +10 -2
  15. package/dist/core/project-config.d.ts +19 -0
  16. package/dist/core/project-config.js +96 -0
  17. package/dist/core/self-evolution/candidate-fitness.d.ts +23 -1
  18. package/dist/core/self-evolution/candidate-fitness.js +31 -5
  19. package/dist/core/self-evolution/candidates.d.ts +0 -9
  20. package/dist/core/self-evolution/critic-agent.d.ts +192 -0
  21. package/dist/core/self-evolution/critic-agent.js +568 -0
  22. package/dist/core/self-evolution/edits-contract.d.ts +53 -0
  23. package/dist/core/self-evolution/edits-contract.js +89 -0
  24. package/dist/core/self-evolution/episode-orchestrator.d.ts +234 -0
  25. package/dist/core/self-evolution/episode-orchestrator.js +681 -0
  26. package/dist/core/self-evolution/episode-store.d.ts +266 -0
  27. package/dist/core/self-evolution/episode-store.js +573 -0
  28. package/dist/core/self-evolution/evolution-switches.d.ts +1 -1
  29. package/dist/core/self-evolution/evolution-switches.js +5 -10
  30. package/dist/core/self-evolution/evolving-agent.d.ts +208 -0
  31. package/dist/core/self-evolution/evolving-agent.js +535 -0
  32. package/dist/core/self-evolution/host-harness.d.ts +14 -15
  33. package/dist/core/self-evolution/host-harness.js +48 -23
  34. package/dist/core/self-evolution/index.d.ts +11 -6
  35. package/dist/core/self-evolution/index.js +20 -6
  36. package/dist/core/self-evolution/line-diff.d.ts +60 -0
  37. package/dist/core/self-evolution/line-diff.js +130 -0
  38. package/dist/core/self-evolution/policy/fs-safe.d.ts +19 -0
  39. package/dist/core/self-evolution/policy/fs-safe.js +89 -0
  40. package/dist/core/self-evolution/policy/index.d.ts +13 -0
  41. package/dist/core/self-evolution/policy/index.js +13 -0
  42. package/dist/core/self-evolution/policy/policy-store.d.ts +217 -0
  43. package/dist/core/self-evolution/policy/policy-store.js +774 -0
  44. package/dist/core/self-evolution/policy/prediction-reconcile.d.ts +54 -0
  45. package/dist/core/self-evolution/policy/prediction-reconcile.js +191 -0
  46. package/dist/core/self-evolution/policy/reject-buffer.d.ts +55 -0
  47. package/dist/core/self-evolution/policy/reject-buffer.js +170 -0
  48. package/dist/core/self-evolution/promote.d.ts +1 -1
  49. package/dist/core/self-evolution/promote.js +6 -33
  50. package/dist/core/self-evolution/promotion.js +1 -2
  51. package/dist/core/self-evolution/reward-agent.d.ts +379 -0
  52. package/dist/core/self-evolution/reward-agent.js +940 -0
  53. package/dist/core/self-evolution/reward-aggregator.d.ts +59 -0
  54. package/dist/core/self-evolution/reward-aggregator.js +262 -0
  55. package/dist/core/self-evolution/scope-gate.d.ts +66 -0
  56. package/dist/core/self-evolution/scope-gate.js +107 -0
  57. package/dist/core/self-evolution/success-channel.js +2 -2
  58. package/dist/core/self-evolution/tamper-check.d.ts +24 -0
  59. package/dist/core/self-evolution/tamper-check.js +236 -0
  60. package/dist/core/self-evolution/tool-evolution.js +2 -13
  61. package/dist/core/self-evolution/verdict.d.ts +8 -5
  62. package/dist/core/self-evolution/verdict.js +4 -7
  63. package/dist/core/templates/workflows/gen-tests.js +1 -1
  64. package/dist/core/templates/workflows/learn.d.ts +3 -2
  65. package/dist/core/templates/workflows/learn.js +21 -18
  66. package/dist/core/templates/workflows/self-evolving.d.ts +6 -4
  67. package/dist/core/templates/workflows/self-evolving.js +62 -172
  68. package/dist/core/trajectory/scrub.d.ts +27 -0
  69. package/dist/core/trajectory/scrub.js +79 -0
  70. package/dist/core/trajectory/skeleton.d.ts +27 -1
  71. package/dist/core/trajectory/skeleton.js +152 -8
  72. package/dist/dashboard/data.d.ts +25 -51
  73. package/dist/dashboard/data.js +68 -180
  74. package/dist/dashboard/react-client.js +458 -503
  75. package/dist/dashboard/react-styles.js +3 -3
  76. package/dist/dashboard/server.js +23 -17
  77. package/dist/ui/ascii-patterns.d.ts +7 -15
  78. package/dist/ui/ascii-patterns.js +123 -54
  79. package/dist/ui/welcome-screen.d.ts +0 -14
  80. package/dist/ui/welcome-screen.js +16 -35
  81. package/package.json +1 -1
  82. package/dist/core/self-evolution/ga-selection.d.ts +0 -94
  83. package/dist/core/self-evolution/ga-selection.js +0 -153
  84. package/dist/core/self-evolution/proposer-agent.d.ts +0 -182
  85. package/dist/core/self-evolution/proposer-agent.js +0 -326
  86. package/dist/core/self-evolution/replay-runner.d.ts +0 -100
  87. package/dist/core/self-evolution/replay-runner.js +0 -170
  88. package/dist/core/self-evolution/replay.d.ts +0 -45
  89. package/dist/core/self-evolution/replay.js +0 -56
  90. package/dist/core/self-evolution/template-variants.d.ts +0 -62
  91. package/dist/core/self-evolution/template-variants.js +0 -171
  92. package/dist/core/self-evolution/trajectory.d.ts +0 -65
  93. package/dist/core/self-evolution/trajectory.js +0 -185
@@ -0,0 +1,236 @@
1
+ /**
2
+ * 防作弊 ANTI-HACKING test-tamper detection — a deterministic, dependency-free
3
+ * detector that flags a 主智能体 MAIN agent which "passed" by weakening its own
4
+ * tests rather than by making the code correct. The 奖励智能体 REWARD AGENT calls
5
+ * {@link detectTestTamper} on a change's directory and, when a STRONG heuristic
6
+ * fires, treats the episode as 作弊嫌疑 tamper-suspected (a hard gate, never a
7
+ * smooth penalty).
8
+ *
9
+ * Three families of heuristic, all read directly off the change dir's files:
10
+ * 1. Count mismatch — `spec-tests.md` declares N scenarios but `test-report.md`
11
+ * collected/ran far fewer (the agent deleted/skipped most of the suite).
12
+ * 2. Neutered assertions — `assert True`, `assert(true)`, `expect(true).toBe(true)`
13
+ * and friends in any discovered test file.
14
+ * 3. Disabled tests — `pytest.skip`/`@pytest.mark.xfail`/`it.skip`/`t.Skip(` …,
15
+ * or an empty test body left as `pass`/`return` with a `TODO`.
16
+ *
17
+ * Conservative by construction: every heuristic tolerates missing files (returns
18
+ * no flag, never throws on ENOENT), the count heuristic only fires on a LARGE
19
+ * drop, and `suspected` is true iff at least one strong heuristic fired. Output
20
+ * is bounded so a pathological change cannot flood the caller.
21
+ */
22
+ import { promises as fs } from 'node:fs';
23
+ import * as path from 'node:path';
24
+ import { parseTestMetrics } from '../fitness/test-metrics.js';
25
// --- Output bounds -------------------------------------------------------
/** Upper bound on how many flags one detection run returns. */
const MAX_FLAGS = 6;
/** Upper bound on the character length of a single flag message. */
const MAX_FLAG_CHARS = 200;

// --- Count-mismatch heuristic --------------------------------------------
/** The report must cover at least this fraction of the declared scenarios; anything below is a drop. */
const MIN_REPORT_FRACTION = 0.5;
/** The count heuristic stays silent for suites that declare fewer scenarios than this. */
const MIN_DECLARED_FOR_DROP = 4;

// --- Filesystem walk bounds ----------------------------------------------
/** Defensive ceiling on the number of test files scanned. */
const MAX_TEST_FILES = 400;
/** Defensive ceiling on directory recursion depth. */
const MAX_DEPTH = 8;

// Matches ANSI SGR colour sequences (the leading ESC is optional). ESC is built
// from its char code so that no literal control character appears in the source.
const ANSI_SGR = new RegExp(`${String.fromCharCode(27)}?\\[[0-9;]*m`, 'g');
40
/**
 * Trim a flag message and cap its length at MAX_FLAG_CHARS.
 * A message that is too long is cut and terminated with a single "…", so the
 * returned string is exactly MAX_FLAG_CHARS characters long in that case.
 */
function clampFlag(text) {
    const trimmed = text.trim();
    if (trimmed.length <= MAX_FLAG_CHARS) {
        return trimmed;
    }
    // Reserve one character for the ellipsis.
    const head = trimmed.slice(0, MAX_FLAG_CHARS - 1);
    return `${head}…`;
}
44
/**
 * Read a file as UTF-8 text.
 *
 * Resolves to the file contents, or to null when the read fails for ANY reason
 * (missing file, the path is a directory, permission denied, ...). A failed
 * read means "no signal" for the heuristics; this function never rejects.
 */
async function readFileOrNull(filePath) {
    try {
        const text = await fs.readFile(filePath, 'utf8');
        return text;
    }
    catch {
        // ENOENT and every other failure are treated alike: no content to inspect.
        return null;
    }
}
56
/**
 * Count the scenarios/tests a `spec-tests.md` declares.
 *
 * The text is scanned for markdown table rows (a line that starts and ends with
 * a pipe and has at least three cells). A row is NOT counted when it is:
 *   - a separator row (every cell is dashes/colons, or blank);
 *   - a header row — either the row directly above a delimiter row, or a row
 *     whose first cell is "ID" / "Use Case ID" / "UC Step";
 *   - a row with no applicable test: when the table's header names one or more
 *     "test" columns and every such cell in the row is blank or just a dash
 *     ("—", "–", "-").
 *
 * Over-counting makes the count-mismatch heuristic fire on honest changes, so
 * headers and test-less rows must stay out of the total.
 *
 * @param {string} specTestsText raw contents of spec-tests.md
 * @returns {number|null} number of countable rows, or null when there are none
 */
export function countDeclaredScenarios(specTestsText) {
    if (!specTestsText)
        return null;
    const HEADER_FIRST_CELLS = new Set(['id', 'use case id', 'uc step']);
    // Split one line into trimmed cells; null when it is not a table row with >= 3 cells.
    const toCells = (raw) => {
        const line = raw.trim();
        if (!line.startsWith('|') || !line.endsWith('|'))
            return null;
        const cells = line
            .slice(1, -1)
            .split('|')
            .map((c) => c.trim());
        return cells.length < 3 ? null : cells;
    };
    // Rows that never count: dashes/colons or blank in every cell.
    const isSeparator = (cells) => cells.every((c) => c === '' || /^:?-{2,}:?$/.test(c));
    // A real delimiter row (no blank cells) marks the row above it as the header.
    const isDelimiter = (cells) => cells.every((c) => /^:?-+:?$/.test(c));
    const isPlaceholder = (cell) => /^[—–-]*$/.test(cell);
    const rows = specTestsText.replace(ANSI_SGR, '').split(/\r?\n/).map(toCells);
    let count = 0;
    // Column indexes whose header mentions "test"; reset whenever a table ends.
    let testColumns = [];
    for (let i = 0; i < rows.length; i += 1) {
        const cells = rows[i];
        if (cells === null) {
            testColumns = [];
            continue;
        }
        if (isSeparator(cells))
            continue;
        const next = rows[i + 1];
        const isHeader = HEADER_FIRST_CELLS.has(cells[0].toLowerCase()) ||
            (Array.isArray(next) && isDelimiter(next));
        if (isHeader) {
            testColumns = [];
            cells.forEach((cell, index) => {
                if (/\btests?\b/i.test(cell))
                    testColumns.push(index);
            });
            continue;
        }
        // Only skip when the header told us where the test cell(s) live.
        const hasNoTest = testColumns.length > 0 &&
            testColumns.every((index) => isPlaceholder(cells[index] ?? ''));
        if (hasNoTest)
            continue;
        count += 1;
    }
    return count > 0 ? count : null;
}
94
/**
 * Count heuristic: `spec-tests.md` declares N scenarios but `test-report.md`
 * shows far fewer collected.
 *
 * Resolves to a clamped flag message when the report's total is below
 * MIN_REPORT_FRACTION of the declared count, and to null in every other case:
 * either file unreadable, no countable scenarios, a suite smaller than
 * MIN_DECLARED_FOR_DROP, or `parseTestMetrics` returning null.
 */
async function checkCountMismatch(changeDirPath) {
    const inDir = (name) => path.join(changeDirPath, name);
    const [specText, reportText] = await Promise.all([
        readFileOrNull(inDir('spec-tests.md')),
        readFileOrNull(inDir('test-report.md')),
    ]);
    if (specText === null || reportText === null) {
        return null;
    }
    const declared = countDeclaredScenarios(specText);
    const suiteTooSmall = declared === null || declared < MIN_DECLARED_FOR_DROP;
    if (suiteTooSmall) {
        return null;
    }
    const metrics = parseTestMetrics(reportText);
    if (metrics === null) {
        return null;
    }
    const { total: collected } = metrics;
    const requiredAtLeast = declared * MIN_REPORT_FRACTION;
    if (collected >= requiredAtLeast) {
        return null;
    }
    const percent = Math.round(MIN_REPORT_FRACTION * 100);
    return clampFlag(`spec-tests.md declares ${declared} scenarios but test-report shows ${collected} collected ` +
        `(< ${percent}% of declared) — most of the suite appears dropped or skipped`);
}
112
/**
 * Patterns for neutered / tautological assertions in python, js/ts and go-style
 * test code. A file is flagged when ANY pattern matches anywhere in its text.
 *
 * None of the patterns carry the `g` flag, so `.test()` keeps no state between
 * files.
 */
const NEUTERED_ASSERTION_RES = [
    /\bassert\s+True\b/, // python: assert True
    /\bassert\s+1\s*==\s*1\b/, // python: assert 1 == 1
    /\bassert\s*\(\s*true\s*\)/i, // js/go-ish: assert(true)
    // python: assert "msg" — the whole condition is a non-empty string literal
    // (always truthy), optionally followed by `, message` or a `# comment`.
    // Anchored to the end of the line so a real check such as
    // `assert "abc" in out, "missing"` is NOT mistaken for a tautology.
    /\bassert\s+(['"])(?:\\.|(?!\1)[^\\\n])+\1[ \t]*(?:[,#].*)?$/m,
    /\bexpect\s*\(\s*true\s*\)\s*\.\s*toBe\s*\(\s*true\s*\)/i, // jest/vitest
    /\bexpect\s*\(\s*1\s*\)\s*\.\s*toBe\s*\(\s*1\s*\)/i, // jest/vitest: expect(1).toBe(1)
    /\bassert\.\w+\s*\(\s*true\s*,?\s*true\s*\)/i, // node assert.equal(true,true)
];
122
/**
 * Markers that disable or skip tests, grouped by ecosystem and flattened into
 * one list in declaration order. A file is flagged when any pattern matches.
 */
const DISABLED_TEST_RES = Object.values({
    python: [
        /\bpytest\s*\.\s*skip\b/, // pytest.skip(...)
        /@\s*pytest\s*\.\s*mark\s*\.\s*skip\b/, // @pytest.mark.skip
        /@\s*pytest\s*\.\s*mark\s*\.\s*xfail\b/, // @pytest.mark.xfail
        /@\s*unittest\s*\.\s*skip\b/, // @unittest.skip
    ],
    javascript: [
        /\b(?:it|describe|test)\s*\.\s*skip\b/, // it.skip / describe.skip / test.skip
        /\b(?:xit|xdescribe)\s*\(/, // xit( / xdescribe(
    ],
    go: [
        /\bt\s*\.\s*Skip\s*\(/, // go: t.Skip(
    ],
}).flat();
132
/** Basename shapes treated as test files: `test` at the start or after `.`/`_`/`-`, `test` followed by one of those, `.spec.` or `.test.`. */
const TEST_FILE_RE = /(?:^|[._-])test|test[._-]|\.spec\.|\.test\./i;
/** File extensions (lower-case) whose contents are scanned. */
const SCANNABLE_EXT = new Set(['.py', '.ts', '.tsx', '.js', '.jsx', '.go']);
/**
 * Decide whether a path should be scanned for tampered test code: its
 * extension (case-insensitive) must be scannable AND its basename must look
 * like a test file.
 */
function looksLikeTestFile(filePath) {
    const hasScannableExt = SCANNABLE_EXT.has(path.extname(filePath).toLowerCase());
    return hasScannableExt && TEST_FILE_RE.test(path.basename(filePath));
}
142
/**
 * Walk `root` (bounded by MAX_DEPTH and MAX_TEST_FILES) and collect the paths
 * that `looksLikeTestFile` accepts.
 *
 * Directory entries are visited in name order. `readdir` gives no ordering
 * guarantee, so without the sort both the set of files that survives the
 * MAX_TEST_FILES cap and the "first offending file" reported to the caller
 * could differ between platforms for the same tree.
 *
 * `node_modules` and `.git` are never entered; an unreadable directory is
 * skipped silently. Only entries reported as directories or regular files by
 * the dirent are considered.
 *
 * @param {string} root directory to scan
 * @returns {Promise<string[]>} matching file paths, in traversal order
 */
async function collectTestFiles(root) {
    const found = [];
    // Plain code-unit comparison: locale-independent, so the order is stable everywhere.
    const byName = (a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0);
    async function walk(dir, depth) {
        if (depth > MAX_DEPTH || found.length >= MAX_TEST_FILES)
            return;
        let entries;
        try {
            entries = await fs.readdir(dir, { withFileTypes: true });
        }
        catch {
            return; // unreadable dir — skip, never throw
        }
        entries.sort(byName);
        for (const entry of entries) {
            if (found.length >= MAX_TEST_FILES)
                return;
            const name = entry.name;
            const full = path.join(dir, name);
            if (entry.isDirectory()) {
                if (name === 'node_modules' || name === '.git')
                    continue;
                await walk(full, depth + 1);
            }
            else if (entry.isFile() && looksLikeTestFile(full)) {
                found.push(full);
            }
        }
    }
    await walk(root, 0);
    return found;
}
173
/**
 * A test body that is just `pass`/`return` paired with a TODO comment — an
 * empty stub masquerading as a test.
 *
 * Both patterns require the `pass`/`return` to be LONE on its line (nothing
 * but an optional `;` after the keyword). A `return <value>` that happens to
 * sit next to a TODO comment is ordinary code and must not trip the hard gate.
 * Matching is case-insensitive and per-line (`im`).
 */
const EMPTY_BODY_TODO_RES = [
    // pass  # TODO   /   return; // TODO   — TODO comment trailing the bare keyword
    /^[ \t]*(?:pass|return)[ \t]*;?[ \t]*(?:#|\/\/)\s*TODO/im,
    // # TODO ...\n    pass   — TODO comment on the line before the bare keyword
    /(?:#|\/\/)\s*TODO[^\n]*\n\s*(?:pass|return)[ \t]*;?[ \t]*$/im,
];
182
/**
 * Scan every discovered test file under `root` and tally, per heuristic, how
 * many files match at least one pattern.
 *
 * Resolves to `{ neutered, disabled, emptyTodo }` counts, plus
 * `firstNeuteredFile` / `firstDisabledFile` (path relative to `root`) for the
 * first file that matched the respective family. Files that cannot be read
 * are skipped.
 */
async function scanTestFiles(root) {
    const tally = { neutered: 0, disabled: 0, emptyTodo: 0 };
    const matchesAny = (patterns, text) => patterns.some((re) => re.test(text));
    for (const file of await collectTestFiles(root)) {
        const content = await readFileOrNull(file);
        if (content === null) {
            continue;
        }
        // Report paths relative to the change dir; fall back to the bare name if that is empty.
        const rel = path.relative(root, file) || path.basename(file);
        if (matchesAny(NEUTERED_ASSERTION_RES, content)) {
            tally.neutered += 1;
            if (tally.firstNeuteredFile === undefined) {
                tally.firstNeuteredFile = rel;
            }
        }
        if (matchesAny(DISABLED_TEST_RES, content)) {
            tally.disabled += 1;
            if (tally.firstDisabledFile === undefined) {
                tally.firstDisabledFile = rel;
            }
        }
        if (matchesAny(EMPTY_BODY_TODO_RES, content)) {
            tally.emptyTodo += 1;
        }
    }
    return tally;
}
207
/**
 * Detect whether a change's tests were tampered with to fake a pass.
 *
 * Runs the count-mismatch check and the test-file scan against
 * `input.changeDirPath`, turns each heuristic that fired into one clamped
 * flag message, and caps the list at MAX_FLAGS.
 *
 * @param {{ changeDirPath: string }} input
 * @returns {Promise<{ suspected: boolean, flags: string[] }>} `suspected` is
 *   true exactly when at least one flag was produced.
 */
export async function detectTestTamper(input) {
    const { changeDirPath: root } = input;
    const countFlag = await checkCountMismatch(root);
    const scan = await scanTestFiles(root);
    // " (e.g. <file>)" when an example file is known, otherwise nothing.
    const example = (file) => (file ? ` (e.g. ${file})` : '');
    const candidates = [
        countFlag || null,
        scan.neutered > 0
            ? clampFlag(`${scan.neutered} test file(s) contain trivially-true / neutered assertions` +
                `${example(scan.firstNeuteredFile)} — e.g. \`assert True\`, \`expect(true).toBe(true)\``)
            : null,
        scan.disabled > 0
            ? clampFlag(`${scan.disabled} test file(s) disable tests via skip/xfail/xit${example(scan.firstDisabledFile)} — ` +
                `the suite may pass because its checks no longer run`)
            : null,
        scan.emptyTodo > 0
            ? clampFlag(`${scan.emptyTodo} test file(s) have empty test bodies (\`pass\`/\`return\` with a TODO) ` +
                `standing in for real assertions`)
            : null,
    ];
    const flags = candidates.filter((flag) => flag !== null).slice(0, MAX_FLAGS);
    return { suspected: flags.length > 0, flags };
}
236
+ //# sourceMappingURL=tamper-check.js.map
@@ -189,12 +189,10 @@ export function classifyEvolvablePart(filePath) {
189
189
  if (!file)
190
190
  return [];
191
191
  const parts = [];
192
- if (file === 'src/core/self-evolution/template-variants.ts' ||
193
- file.startsWith('.synergyspec-selfevolving/self-evolution/templates/') ||
194
- file.startsWith('.synergyspec-selfevolving/self-evolution/template-variants') ||
192
+ if (file.startsWith('.synergyspec-selfevolving/self-evolution/templates/') ||
195
193
  file.startsWith('schemas/') ||
196
194
  /\/templates\//.test(file)) {
197
- parts.push('template-variants');
195
+ parts.push('artifact-templates');
198
196
  }
199
197
  if (file === 'src/core/self-evolution/archive-memory.ts') {
200
198
  parts.push('archive-memory');
@@ -212,15 +210,6 @@ export function classifyEvolvablePart(filePath) {
212
210
  file.startsWith('src/memory/extraction/')) {
213
211
  parts.push('runtime-memory');
214
212
  }
215
- if (file.startsWith('evolve/src/evolution/') ||
216
- file.startsWith('evolve/src/proposer/') ||
217
- file.startsWith('evolve/src/mutators/') ||
218
- file.startsWith('evolve/src/variant/') ||
219
- file.startsWith('evolve/src/materialize/') ||
220
- file.startsWith('evolve/src/benchmark/') ||
221
- file === 'src/commands/evolve.ts') {
222
- parts.push('dgm-harness');
223
- }
224
213
  if (file === 'src/core/self-evolution/tool-evolution.ts' ||
225
214
  file.startsWith('evolve/src/safety/') ||
226
215
  file.startsWith('evolve/test/safety/')) {
@@ -9,12 +9,15 @@ export declare const VERDICT_FILE = "verdict.json";
9
9
  * - `gate-failed` — the static gate refused it.
10
10
  * - `declined` — an auto-promote predicate said no (non-terminal: the
11
11
  * candidate may still be retried later).
12
- * - `outcompeted` — lost the GA ranking to a sibling variant for the same
13
- * target (advisory; the candidate's status is untouched).
14
12
  */
15
- export type CandidateVerdictKind = 'promoted' | 'rolled-back' | 'rejected' | 'gate-failed' | 'declined' | 'outcompeted';
16
- /** Who/what reached the verdict — for provenance in the trajectory block. */
17
- export type CandidateVerdictDecider = 'human' | 'auto-evolve' | 'evolve-from-edits' | 'evolve-outer-loop' | 'static-gate';
13
+ export type CandidateVerdictKind = 'promoted' | 'rolled-back' | 'rejected' | 'gate-failed' | 'declined';
14
+ /**
15
+ * Who/what reached the verdict — for provenance in the trajectory block.
16
+ * Internal enum, not a user-facing capability. (Legacy verdict.json files may
17
+ * carry a now-removed `'auto-evolve'` decider; the unchecked cast on read keeps
18
+ * those readable.)
19
+ */
20
+ export type CandidateVerdictDecider = 'human' | 'evolve-from-edits' | 'static-gate';
18
21
  export interface CandidateVerdictRecord {
19
22
  verdict: CandidateVerdictKind;
20
23
  /** ISO-8601 UTC timestamp the verdict was reached (caller-supplied). */
@@ -2,11 +2,9 @@
2
2
  * Per-candidate promotion VERDICT (the backward-pass label the proposer reads).
3
3
  *
4
4
  * `fitness-record.jsonl` records HOW a candidate scored; this sidecar records
5
- * what the loop DECIDED about it — promoted, rejected, outcompeted, etc. — plus
6
- * the loss it carried at decision time and the baseline it was judged against.
7
- * The optimization-trajectory block ({@link import('./trajectory.js')}) renders
8
- * these labels so the proposer can learn from prior accepted/rejected attempts
9
- * (OPRO/AlphaEvolve: scored history of past solutions in the meta-prompt).
5
+ * what the loop DECIDED about it — promoted, rejected, etc. — plus the loss it
6
+ * carried at decision time and the baseline it was judged against, so the
7
+ * manual promote / evolve-from-edits path can learn from prior decisions.
10
8
  *
11
9
  * A candidate has exactly ONE current verdict, so this is a single JSON object
12
10
  * (last-write-wins), NOT an append-only log — the full transition history
@@ -37,8 +35,7 @@ function isValidVerdictKind(value) {
37
35
  value === 'rolled-back' ||
38
36
  value === 'rejected' ||
39
37
  value === 'gate-failed' ||
40
- value === 'declined' ||
41
- value === 'outcompeted');
38
+ value === 'declined');
42
39
  }
43
40
  /**
44
41
  * Write (or overwrite) a candidate's current verdict. Atomic tmp+rename inside
@@ -72,7 +72,7 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
72
72
 
73
73
  Detection order: scan existing test files for PBT imports first; if none found, infer from project language; if ambiguous, use AskUserQuestion.
74
74
 
75
- **Every WHEN/THEN scenario extracted in step 4 must have exactly one PBT test** — no exceptions:
75
+ **Every WHEN/THEN scenario extracted in step 4 must have exactly one PBT test**, with ONE sanctioned exception a scenario whose behaviour is already exhaustively covered by an existing example/benchmark test AND that cannot be expressed as a meaningful property may instead be recorded as a \`➖ N/A\` row in the PBT Coverage table (step 8), citing that covering test. Otherwise, no exceptions:
76
76
  - **WHEN** clause → generator expression + precondition guard (filter/assume)
77
77
  - **THEN** clause → invariant (property assertion that must hold for all generated inputs)
78
78
  - When the WHEN clause has no parameterisable variable (e.g. "WHEN the app loads"), generate arbitrary system/environment state as the input and use the THEN clause alone as the invariant.
@@ -2,8 +2,9 @@
2
2
  * Skill Template Workflow Modules
3
3
  *
4
4
  * Learn workflow: thin entrance to self-evolution — selects the change,
5
- * spawns the fresh-context critic skill
6
- * (synergyspec-selfevolving-self-evolving), and relays its verdict.
5
+ * spawns the fresh-context runner skill
6
+ * (synergyspec-selfevolving-self-evolving), which triggers ONE loop-v2
7
+ * self-evolution episode, and relays its verdict.
7
8
  */
8
9
  import type { SkillTemplate, CommandTemplate } from '../types.js';
9
10
  export declare function getLearnSkillTemplate(): SkillTemplate;
@@ -2,7 +2,7 @@ const INSTRUCTIONS_BODY = `**Input**: Optionally specify a change name. If omitt
2
2
 
3
3
  **Purpose**
4
4
 
5
- This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to autonomous self-evolution — but the critique itself runs in a FRESH-CONTEXT critic subagent (\`synergyspec-selfevolving-self-evolving\`). The rollout actor does not grade its own work: end-of-cycle sessions are context-heavy, and the system grades the on-disk trajectory, not the actor's intentions. Your job here is to select the change, hand the critic explicit handles, and relay its verdict.
5
+ This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify\`, and it is the ENTRANCE to one self-evolution EPISODE (loop v2 self-evolution as in-context RL) — but the episode itself runs through a FRESH-CONTEXT runner subagent (\`synergyspec-selfevolving-self-evolving\`). The rollout actor does not grade its own work: end-of-cycle sessions are context-heavy, and the system grades the on-disk trajectory, not the actor's intentions. The runner does not grade or edit either — it triggers the CLI episode orchestrator, which CODE-SPAWNS the 奖励智能体 REWARD AGENT (scoring) and 演进智能体 EVOLVING AGENT (the ONE bounded edit onto the 策略 POLICY), then relays the result. Your job here is to select the change, hand the runner explicit handles, and relay its verdict.
6
6
 
7
7
  **Steps**
8
8
 
@@ -12,41 +12,44 @@ This is the review-and-learn step after \`/synspec:apply\` and \`/synspec:verify
12
12
  - Infer from conversation context only if the change is unambiguous
13
13
  - If ambiguous, run \`synergyspec-selfevolving list --json\` and ask the user to select
14
14
 
15
- Prefer changes that have completed apply/verify evidence. If apply or verify appears incomplete, continue in preview mode and clearly mark the missing evidence.
15
+ Prefer changes that have completed apply/verify evidence. If apply or verify appears incomplete, continue and clearly mark the missing evidence — the orchestrator's 奖励智能体 REWARD AGENT will 弃权 abstain rather than score on absent evidence.
16
16
 
17
17
  2. **Gather explicit handles**
18
18
 
19
- The critic starts with NO conversation context, so collect every handle it needs:
19
+ The runner starts with NO conversation context, so collect every handle it needs:
20
20
  - **Project root**: the absolute path of the current working directory.
21
21
  - **Change name**: from step 1.
22
22
  - **Harness**: read the \`harness:\` key from \`synergyspec-selfevolving/changes/<name>/.synergyspec-selfevolving.yaml\`; if absent, use \`unknown\`.
23
- - **Mode**: \`preview\` only if the user asked for a preview; otherwise \`apply\` (the critic consolidates memory and runs the gated \`evolve-from-edits --yes\` promote autonomously no confirmation prompt).
24
- - **Session handle (optional)**: if your harness exposes this session's id or transcript path, capture it; otherwise omit it (trajectory discovery then uses the change window).
23
+ - **Mode**: always \`apply\` the episode runs the full loop (score, decide, and the 演进智能体's ONE bounded edit) autonomously, with no confirmation prompt. There is NO read-only episode and NO \`--preview\` flag. If the user wants a read-only look (no rollback, no evolution), do NOT run an episode: use the read-only view \`synergyspec-selfevolving self-evolution policy show\` (or a plain \`synergyspec-selfevolving learn <name>\` without \`--apply\`) instead.
24
+ - **Session handle (optional)**: if your harness exposes this session's id or transcript path, capture it; otherwise omit it (the 主智能体 MAIN AGENT arm's trajectory discovery then uses the change window).
25
25
 
26
- 3. **Spawn the critic**
26
+ 3. **Spawn the runner**
27
27
 
28
- Use Task tool (subagent_type: "general-purpose", prompt: "Use Skill tool to invoke synergyspec-selfevolving-self-evolving for change '<name>'. Project root: <root>. Harness: <harness>. Mode: <apply|preview>. Session-id: <id>. Transcript: <path>. Run the full critique autonomously, do not ask the user questions, and end with the '## Critic Verdict' block.")
28
+ Use Task tool (subagent_type: "general-purpose", prompt: "Use Skill tool to invoke synergyspec-selfevolving-self-evolving for change '<name>'. Project root: <root>. Harness: <harness>. Mode: apply. Session-id: <id>. Transcript: <path>. Trigger the loop-v2 self-evolution episode autonomously, do not ask the user questions, and end with the '## Episode Verdict' block.")
29
29
 
30
30
  Include the \`Session-id: <id>.\` / \`Transcript: <path>.\` segment only when the session handle from step 2 is known — omit it entirely when unknown.
31
31
 
32
+ The runner triggers exactly one CLI command — \`synergyspec-selfevolving self-evolution episode --change "<name>" --session-id <id>\` — and the orchestrator CODE-SPAWNS the 奖励智能体 REWARD AGENT + 演进智能体 EVOLVING AGENT (+ optional CRITIC AGENT(基线智能体)). Neither you nor the runner grades or edits canonical files.
33
+
32
34
  Guardrails:
33
- - Do NOT perform the critique steps yourself in this session — the critique must run in a fresh context.
34
- - **Fallback (no Task tool):** if this host has no Task tool (or spawning fails), invoke the synergyspec-selfevolving-self-evolving skill INLINE instead — use the Skill tool, or read \`<skillsDir>/skills/synergyspec-selfevolving-self-evolving/SKILL.md\` and follow it in this session — and note \`Isolation: inline fallback (degraded)\` in the final output.
35
- - **Last resort (critic skill not installed):** run \`synergyspec-selfevolving learn "<name>" --preview\`, report that the full critique could not run, and suggest re-running \`synergyspec-selfevolving init\`.
35
+ - Do NOT trigger the episode yourself in this session — it must run from a fresh context.
36
+ - **Fallback (no Task tool / runner never started):** ONLY when this host has no Task tool, or the spawn never produced a \`## Episode Verdict\` block at all, invoke the synergyspec-selfevolving-self-evolving skill INLINE instead — use the Skill tool, or read \`<skillsDir>/skills/synergyspec-selfevolving-self-evolving/SKILL.md\` and follow it in this session — and note \`Isolation: inline fallback (degraded)\` in the final output.
37
+ - **STOP do NOT retry inline — when the runner DID return a verdict whose Outcome is \`error-...\`** (the episode command itself ran and failed — e.g. \`error-ENAMETOOLONG\`). An inline re-run invokes the SAME failing CLI and fails identically. Surface it as a DEFECT to the user, do NOT archive over it, and stop.
38
+ - **Last resort (runner skill not installed):** run \`synergyspec-selfevolving self-evolution episode --change "<name>"\`, report that the full episode could not run via the runner, and suggest re-running \`synergyspec-selfevolving init\`.
36
39
 
37
40
  4. **Verify and relay the verdict**
38
41
 
39
- Read the critic's \`## Critic Verdict\` block from its final message, then:
40
- - Cross-check it against \`synergyspec-selfevolving status --change "<name>" --json\` and \`synergyspec-selfevolving/changes/<name>/evolution-result.json\`. NEVER contradict the machine-written outcome.
41
- - Relay the outcome, the evolved target, and the rollback command verbatim.
42
- - Classify any refusal before moving on: a SAFE refusal (missing/red evidence, frozen target, static gate failed on real grounds) is expected, not a bug; a DEFECT the critic flagged (an unbindable target, a promotion failure that is NOT about evidence / freezing / scope) must be surfaced to the user, not archived over. \`Outcome: not-run\` is a safe no-op (no hints survived, or no bindable target) — not a DEFECT, unless the critic flagged case-(b). On a verified-GREEN run, learn also records success-channel protections/exemplars as side-writes — proposing nothing on a green run is the CORRECT outcome, not a missed evolution.
42
+ Read the runner's \`## Episode Verdict\` block from its final message, then:
43
+ - Cross-check it against \`synergyspec-selfevolving status --change "<name>" --json\` and the episode's \`episode.json\` / \`diagnosis.json\`. NEVER contradict the machine-written outcome.
44
+ - Relay the outcome, the decision (rolled-back / kept / abstained), the evolution kind, the new 策略 POLICY version, the evolved target, and the rollback command verbatim.
45
+ - Classify the outcome before moving on: a \`kept\` / \`abstained\` no-op on a verified-green or no-nameable-gap run is the CORRECT outcome (产物即弃), not a missed evolution; a \`rolled-back\` decision is the loop working (the 否决缓冲 reject-buffer recorded the lost direction). A SAFE refusal (missing/red evidence, frozen target, gate refused on real grounds) is expected, not a bug; a DEFECT the runner flagged (an unbindable target, an orchestrator failure that is NOT about evidence / freezing / scope) must be surfaced to the user, not archived over.
43
46
 
44
47
  **Output Format**
45
48
 
46
- - Lead with the critic's verdict, not the spawn mechanics.
47
- - Relay the \`## Critic Verdict\` fields verbatim: outcome, evolved target, canonical file(s) changed, and the rollback command.
48
- - State clearly whether anything was written (memory, learn report, canonical promote) and the isolation mode (fresh-context subagent, or inline fallback (degraded)).
49
- - Separate safe refusals from DEFECTs to surface.
49
+ - Lead with the runner's verdict, not the spawn mechanics.
50
+ - Relay the \`## Episode Verdict\` fields verbatim: outcome, decision, evolution kind, advantage, new 策略 POLICY version, evolved target, canonical file(s) changed, and the rollback command.
51
+ - State clearly whether the 策略 POLICY changed (evolved / rolled-back / unchanged) and the isolation mode (fresh-context subagent, or inline fallback (degraded)).
52
+ - Separate safe no-ops and refusals from DEFECTs to surface.
50
53
  - End with the normal next step: \`/synspec:archive\` once the user is satisfied with the review.`;
51
54
  export function getLearnSkillTemplate() {
52
55
  return {
@@ -1,10 +1,12 @@
1
1
  /**
2
2
  * Skill Template Workflow Modules
3
3
  *
4
- * Self-evolving critic: utility skill (no command counterpart) — always
5
- * installed regardless of profile. Fresh-context critic for a completed
6
- * change; normally spawned by synergyspec-selfevolving-learn (the rollout
7
- * actor) so the critique never runs in the actor's own context.
4
+ * Self-evolving runner: utility skill (no command counterpart) — always
5
+ * installed regardless of profile. In loop v2 (self-evolution as in-context
6
+ * RL) the 奖励智能体 REWARD AGENT (scoring) and 演进智能体 EVOLVING AGENT
7
+ * (editing) are CODE-SPAWNED by the episode orchestrator so this host-facing
8
+ * skill is a THIN RUNNER: it triggers the CLI episode and relays the result.
9
+ * It NEVER grades and NEVER edits canonical files itself.
8
10
  */
9
11
  import type { SkillTemplate } from '../types.js';
10
12
  export declare function getSelfEvolvingSkillTemplate(): SkillTemplate;