@evomap/evolver 1.89.2 → 1.89.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/.cursor/BUGBOT.md +182 -0
  2. package/.env.example +68 -0
  3. package/.git-commit-guard-token +1 -0
  4. package/.github/CODEOWNERS +63 -0
  5. package/.github/ISSUE_TEMPLATE/good_first_issue.md +23 -0
  6. package/.github/pull_request_template.md +45 -0
  7. package/.github/workflows/test.yml +75 -0
  8. package/CHANGELOG.md +1237 -0
  9. package/README.ja-JP.md +1 -3
  10. package/README.ko-KR.md +1 -3
  11. package/README.md +86 -530
  12. package/README.public.md +569 -0
  13. package/README.zh-CN.md +1 -3
  14. package/SECURITY.md +108 -0
  15. package/assets/gep/events.jsonl +3 -0
  16. package/assets/gep/genes.json +496 -0
  17. package/examples/atp-consumer-quickstart.md +100 -0
  18. package/examples/hello-world.md +38 -0
  19. package/index.js +44 -48
  20. package/package.json +6 -17
  21. package/proxy-package.json +39 -0
  22. package/public.manifest.json +143 -0
  23. package/src/adapters/hookAdapter.js +2 -0
  24. package/src/adapters/scripts/_lockPaths.js +74 -0
  25. package/src/adapters/scripts/evolver-session-start.js +19 -27
  26. package/src/config.js +23 -0
  27. package/src/evolve/guards.js +721 -1
  28. package/src/evolve/pipeline/collect.js +1283 -1
  29. package/src/evolve/pipeline/dispatch.js +421 -1
  30. package/src/evolve/pipeline/enrich.js +440 -1
  31. package/src/evolve/pipeline/hub.js +319 -1
  32. package/src/evolve/pipeline/select.js +274 -1
  33. package/src/evolve/pipeline/signals.js +206 -1
  34. package/src/evolve/utils.js +264 -1
  35. package/src/evolve.js +350 -1
  36. package/src/experiment/agentRunner.js +229 -0
  37. package/src/experiment/cli.js +159 -0
  38. package/src/experiment/comparison.js +233 -0
  39. package/src/experiment/metrics.js +75 -0
  40. package/src/forceUpdate.js +311 -30
  41. package/src/gep/a2aProtocol.js +4455 -1
  42. package/src/gep/antiAbuseTelemetry.js +233 -0
  43. package/src/gep/autoDistillConv.js +205 -1
  44. package/src/gep/autoDistillLlm.js +315 -1
  45. package/src/gep/candidateEval.js +92 -1
  46. package/src/gep/candidates.js +198 -1
  47. package/src/gep/contentHash.js +30 -1
  48. package/src/gep/conversationSniffer.js +266 -1
  49. package/src/gep/crypto.js +89 -1
  50. package/src/gep/curriculum.js +163 -1
  51. package/src/gep/deviceId.js +218 -1
  52. package/src/gep/envFingerprint.js +118 -1
  53. package/src/gep/epigenetics.js +31 -1
  54. package/src/gep/execBridge.js +711 -1
  55. package/src/gep/explore.js +289 -1
  56. package/src/gep/hash.js +15 -1
  57. package/src/gep/hubFetch.js +359 -1
  58. package/src/gep/hubReview.js +207 -1
  59. package/src/gep/hubSearch.js +526 -1
  60. package/src/gep/hubVerify.js +306 -1
  61. package/src/gep/learningSignals.js +89 -1
  62. package/src/gep/memoryGraph.js +1374 -1
  63. package/src/gep/memoryGraphAdapter.js +203 -1
  64. package/src/gep/mutation.js +203 -1
  65. package/src/gep/narrativeMemory.js +108 -1
  66. package/src/gep/openPRRegistry.js +205 -1
  67. package/src/gep/personality.js +423 -1
  68. package/src/gep/policyCheck.js +599 -1
  69. package/src/gep/prompt.js +836 -1
  70. package/src/gep/recallInject.js +409 -1
  71. package/src/gep/recallVerifier.js +318 -1
  72. package/src/gep/reflection.js +177 -1
  73. package/src/gep/sanitize.js +9 -0
  74. package/src/gep/selector.js +602 -1
  75. package/src/gep/skillDistiller.js +1294 -1
  76. package/src/gep/solidify.js +1699 -1
  77. package/src/gep/strategy.js +136 -1
  78. package/src/gep/tokenSavings.js +88 -1
  79. package/src/gep/validator/sandboxExecutor.js +29 -1
  80. package/src/gep/workspaceKeychain.js +174 -1
  81. package/src/proxy/extensions/traceControl.js +99 -1
  82. package/src/proxy/index.js +14 -5
  83. package/src/proxy/inject.js +52 -1
  84. package/src/proxy/lifecycle/manager.js +30 -0
  85. package/src/proxy/mailbox/store.js +2 -1
  86. package/src/proxy/router/messages_route.js +13 -2
  87. package/src/proxy/trace/extractor.js +646 -1
  88. package/src/proxy/trace/usage.js +105 -1
  89. package/CONTRIBUTING.md +0 -19
  90. package/assets/cover.png +0 -0
  91. package/assets/gep/genes.seed.json +0 -245
  92. package/scripts/a2a_export.js +0 -63
  93. package/scripts/a2a_ingest.js +0 -79
  94. package/scripts/a2a_promote.js +0 -118
  95. package/scripts/analyze_by_skill.js +0 -121
  96. package/scripts/build_binaries.js +0 -479
  97. package/scripts/check-changelog.js +0 -166
  98. package/scripts/extract_log.js +0 -85
  99. package/scripts/generate_history.js +0 -75
  100. package/scripts/gep_append_event.js +0 -96
  101. package/scripts/gep_personality_report.js +0 -234
  102. package/scripts/human_report.js +0 -147
  103. package/scripts/recall-verify-report.js +0 -234
  104. package/scripts/recover_loop.js +0 -61
  105. package/scripts/seed-merchants.js +0 -91
  106. package/scripts/suggest_version.js +0 -89
  107. package/scripts/validate-modules.js +0 -38
  108. package/scripts/validate-suite.js +0 -78
  109. package/skills/index.json +0 -14
  110. /package/{skills → bundled-skills}/_meta/SKILL.md +0 -0
@@ -0,0 +1,229 @@
1
+ // src/experiment/agentRunner.js
2
+ //
3
+ // The ONLY module in src/experiment that touches a subprocess. It runs a single
4
+ // arm of a comparison by invoking a headless coding-agent CLI -- by default
5
+ // `claude -p "<prompt>" --output-format json` -- and maps its JSON envelope
6
+ // onto the canonical AgentResult shape (duration / rounds / tokens / cost).
7
+ //
8
+ // Security: the prompt is passed as an argv element with shell:false, so quotes,
9
+ // `$`, backticks etc. in the task can never be interpreted by a shell (same
10
+ // injection posture as src/gep/validator/sandboxExecutor.js). Unlike the
11
+ // sandbox, the real user env IS passed through -- the agent CLI needs PATH and
12
+ // its own auth/credentials to run.
13
+ 'use strict';
14
+
15
+ const { spawn } = require('child_process');
16
+
17
+ const DEFAULT_CMD = 'claude';
18
+ const DEFAULT_TIMEOUT_MS = 300000; // 5 min
19
+ const MIN_TIMEOUT_MS = 1000;
20
+ const MAX_TIMEOUT_MS = 1800000; // 30 min hard cap
21
+ const RESULT_CAP = 4000;
22
+ const MAX_STDOUT_BYTES = 10 * 1024 * 1024; // 10 MB — a JSON envelope is tiny; cap a chatty/malformed CLI before it OOMs.
23
+
24
/**
 * Coerce a value to a finite number, or return a fallback when it is not one.
 *
 * @param {*} v value to coerce (numbers and numeric strings are accepted)
 * @param {number} [fallback=0] returned when `v` is missing or not finite
 * @returns {number}
 */
function num(v, fallback) {
  const dflt = fallback === undefined ? 0 : fallback;
  // Number(null) and Number('') are both 0, so a MISSING value would otherwise
  // masquerade as a real zero and the fallback would never be used.
  if (v === null || v === undefined) return dflt;
  if (typeof v === 'string' && v.trim() === '') return dflt;
  const n = Number(v);
  return Number.isFinite(n) ? n : dflt;
}
28
+
29
/**
 * Decide which agent CLI executable to launch.
 *
 * Precedence: `opts.command` (stringified as-is), then the trimmed
 * EVOLVER_EXPERIMENT_AGENT_CMD environment variable, then DEFAULT_CMD.
 *
 * @param {object} [opts] { command }
 * @returns {string}
 */
function resolveAgentCommand(opts) {
  const explicit = opts ? opts.command : undefined;
  if (explicit) {
    return String(explicit);
  }
  const fromEnv = process.env.EVOLVER_EXPERIMENT_AGENT_CMD;
  if (fromEnv && fromEnv.trim()) {
    return fromEnv.trim();
  }
  return DEFAULT_CMD;
}
34
+
35
/**
 * Collect extra argv elements to place before the prompt flags.
 *
 * An `opts.extraArgs` array wins (each element stringified). Otherwise the
 * EVOLVER_EXPERIMENT_AGENT_ARGS environment variable is split on whitespace;
 * note that this split does not understand quoting.
 *
 * @param {object} [opts] { extraArgs }
 * @returns {string[]}
 */
function resolveExtraArgs(opts) {
  const explicit = opts ? opts.extraArgs : undefined;
  if (Array.isArray(explicit)) {
    return explicit.map((arg) => String(arg));
  }
  const fromEnv = (process.env.EVOLVER_EXPERIMENT_AGENT_ARGS || '').trim();
  return fromEnv === '' ? [] : fromEnv.split(/\s+/);
}
41
+
42
/**
 * Resolve the per-arm timeout in milliseconds.
 *
 * Precedence: a usable `opts.timeoutMs`, then a positive
 * EVOLVER_EXPERIMENT_TIMEOUT_MS, then DEFAULT_TIMEOUT_MS. The result is always
 * clamped into [MIN_TIMEOUT_MS, MAX_TIMEOUT_MS].
 *
 * @param {object} [opts] { timeoutMs }
 * @returns {number}
 */
function resolveTimeout(opts) {
  const explicit = opts ? opts.timeoutMs : undefined;
  // Only real numbers and non-blank numeric strings count as "provided".
  // Number(null), Number('') and Number(false) are all 0, which would silently
  // clamp an UNSET timeout to the 1 s minimum instead of using the default.
  const provided = typeof explicit === 'number'
    || (typeof explicit === 'string' && explicit.trim() !== '');
  let t = provided && Number.isFinite(Number(explicit)) ? Number(explicit) : null;
  if (t === null) {
    const fromEnv = Number(process.env.EVOLVER_EXPERIMENT_TIMEOUT_MS);
    t = Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : DEFAULT_TIMEOUT_MS;
  }
  return Math.min(Math.max(MIN_TIMEOUT_MS, t), MAX_TIMEOUT_MS);
}
50
+
51
/**
 * Parse the `--output-format json` envelope printed by the agent CLI.
 *
 * The whole (trimmed) text is tried first; if that does not yield a JSON
 * object, the slice from the first `{` to the last `}` is tried, which
 * tolerates leading/trailing non-JSON noise.
 *
 * Only plain objects are accepted. Valid-but-wrong JSON such as `42`, `"ok"`,
 * `true` or an array is not an envelope; returning it would let the caller map
 * it to a "successful" result with all-zero metrics.
 *
 * @param {*} stdout raw stdout text (anything falsy is treated as empty)
 * @returns {object|null} the envelope object, or null when none can be parsed
 */
function parseAgentJson(stdout) {
  const text = String(stdout || '').trim();
  if (!text) return null;

  const parseEnvelope = (candidate) => {
    try {
      const value = JSON.parse(candidate);
      const isPlainObject = value !== null && typeof value === 'object' && !Array.isArray(value);
      return isPlainObject ? value : null;
    } catch (_) {
      return null; // not JSON at all: let the caller try the next candidate
    }
  };

  const whole = parseEnvelope(text);
  if (whole) return whole;

  const start = text.indexOf('{');
  const end = text.lastIndexOf('}');
  if (start >= 0 && end > start) {
    return parseEnvelope(text.slice(start, end + 1));
  }
  return null;
}
68
+
69
/**
 * Build a failed AgentResult: every metric zeroed, `ok:false`, `isError:true`.
 *
 * @param {string} error failure code / message stored in `error`
 * @param {string} command agent command recorded as `agentCommand`
 * @param {object} [extra] fields that override the defaults (e.g. durationMs, timedOut)
 * @returns {object}
 */
function makeFailure(error, command, extra) {
  const defaults = {
    ok: false,
    isError: true,
    error,
    durationMs: 0,
    rounds: 0,
    tokensIn: 0,
    tokensOut: 0,
    tokensTotal: 0,
    costUsd: 0,
    resultText: '',
    exitCode: -1,
    timedOut: false,
    runnerName: 'claude-cli',
    agentCommand: command,
  };
  const overrides = extra || {};
  return { ...defaults, ...overrides };
}
90
+
91
/**
 * Map a parsed `claude -p --output-format json` envelope onto AgentResult.
 *
 * @param {object|null} json parsed envelope; the fields read are usage.input_tokens,
 *   usage.output_tokens, is_error, duration_ms, num_turns, total_cost_usd, result
 * @param {object} [ctx] { command, exitCode, durationMs, timedOut } from the process run
 * @returns {object} AgentResult; `ok` is true only for exit code 0, no
 *   reported error and no timeout
 */
function mapAgentResult(json, ctx) {
  const context = ctx || {};
  const envelope = json || {};
  // Number(null) and Number('') are 0, so absent values must be detected
  // BEFORE numeric coercion or they read as a genuine zero.
  const missing = (v) => v === null || v === undefined || v === '';

  // A signal-terminated child reports code=null; mapping that to 0 would let
  // a killed arm score ok:true. Absent exit codes are always -1.
  const exitCode = missing(context.exitCode) ? -1 : num(context.exitCode, -1);
  const timedOut = !!context.timedOut;
  const usage = envelope.usage || {};
  const tokensIn = num(usage.input_tokens);
  const tokensOut = num(usage.output_tokens);
  const isError = !!envelope.is_error;
  const ok = exitCode === 0 && !isError && !timedOut;

  let error = null;
  if (timedOut) error = 'agent_timeout';
  else if (isError) error = 'agent_reported_error';
  else if (exitCode !== 0) error = 'agent_exit_' + exitCode;

  // Prefer the agent's self-reported duration; fall back to wall-clock when
  // the envelope does not carry a usable one (including an explicit null).
  const reported = envelope.duration_ms;
  const durationMs = !missing(reported) && Number.isFinite(Number(reported))
    ? num(reported)
    : num(context.durationMs);

  return {
    ok,
    isError,
    error,
    durationMs,
    rounds: num(envelope.num_turns),
    tokensIn,
    tokensOut,
    tokensTotal: tokensIn + tokensOut,
    costUsd: num(envelope.total_cost_usd),
    resultText: typeof envelope.result === 'string' ? envelope.result.slice(0, RESULT_CAP) : '',
    exitCode,
    timedOut,
    runnerName: 'claude-cli',
    agentCommand: context.command,
  };
}
128
+
129
/**
 * Run one task prompt through a headless agent CLI.
 *
 * The prompt is passed as a single argv element with `shell: false`, so it is
 * never interpreted by a shell. The current process environment is passed
 * through unchanged.
 *
 * @param {string} prompt task text (null/undefined becomes an empty string)
 * @param {object} [opts] { command, extraArgs, timeoutMs, cwd }
 * @returns {Promise<object>} AgentResult; never rejects, failures resolve as { ok:false, error }
 */
function runAgentTask(prompt, opts) {
  opts = opts || {};
  const command = resolveAgentCommand(opts);
  const extraArgs = resolveExtraArgs(opts);
  const timeoutMs = resolveTimeout(opts);
  const cwd = opts.cwd || process.cwd();
  const argv = [...extraArgs, '-p', String(prompt == null ? '' : prompt), '--output-format', 'json'];

  return new Promise((resolve) => {
    let child;
    let settled = false;
    let stdout = '';
    let stdoutBytes = 0;
    let stderr = '';
    const startedAt = Date.now();

    // Resolve exactly once; later events from the same child are ignored.
    const done = (result) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(result);
    };

    const killChild = () => {
      if (!child) return;
      try { child.kill('SIGKILL'); } catch (_) { /* noop: already gone */ }
    };

    const timer = setTimeout(() => {
      if (child && !child.killed) killChild();
      done(makeFailure('agent_timeout', command, {
        durationMs: Date.now() - startedAt,
        timedOut: true,
      }));
    }, timeoutMs);

    try {
      child = spawn(command, argv, {
        shell: false,
        cwd,
        env: process.env,
        stdio: ['ignore', 'pipe', 'pipe'],
      });
    } catch (e) {
      done(makeFailure('agent_spawn_failed: ' + (e && e.message ? e.message : String(e)), command));
      return;
    }

    // Decode at the stream level: converting each Buffer chunk separately
    // corrupts any multi-byte UTF-8 character that straddles a chunk boundary.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');

    child.stdout.on('data', (chunk) => {
      if (settled) return; // stop buffering once the result is decided
      stdout += chunk;
      // The cap is expressed in bytes, so count bytes rather than UTF-16 units.
      stdoutBytes += Buffer.byteLength(chunk, 'utf8');
      if (stdoutBytes > MAX_STDOUT_BYTES) {
        // A well-behaved `--output-format json` envelope is tiny; this much
        // stdout means a runaway/malformed CLI, so kill it rather than OOM.
        killChild();
        done(makeFailure('agent_output_too_large', command, { durationMs: Date.now() - startedAt }));
      }
    });
    child.stderr.on('data', (chunk) => {
      if (settled) return;
      stderr += chunk;
      if (stderr.length > MAX_STDOUT_BYTES) stderr = stderr.slice(-RESULT_CAP);
    });

    child.on('error', (err) => {
      const msg = err && err.code === 'ENOENT'
        ? 'agent_cli_not_found: ' + command
        : 'agent_spawn_error: ' + (err && err.message ? err.message : String(err));
      done(makeFailure(msg, command, { durationMs: Date.now() - startedAt }));
    });

    // Use 'close', not 'exit': 'exit' can fire while stdout still holds
    // unread data, which would truncate the JSON envelope and turn a good run
    // into agent_output_parse_failed. 'close' fires after the stdio streams
    // have ended. If a descendant keeps the pipe open, the timeout above
    // still settles the promise.
    child.on('close', (code, signal) => {
      const durationMs = Date.now() - startedAt;
      // A signal-terminated child reports code=null, and Number(null) is 0,
      // so map null/undefined explicitly before the numeric path.
      const exitCode = (code === null || code === undefined) ? -1 : num(code, -1);
      const json = parseAgentJson(stdout);
      if (!json) {
        done(makeFailure(
          signal ? 'agent_killed_' + signal : 'agent_output_parse_failed', command, {
            durationMs,
            exitCode,
            resultText: String(stdout || stderr || '').slice(0, RESULT_CAP),
          }));
        return;
      }
      done(mapAgentResult(json, { exitCode, durationMs, timedOut: false, command }));
    });
  });
}
221
+
222
+ module.exports = {
223
+ runAgentTask,
224
+ parseAgentJson,
225
+ mapAgentResult,
226
+ resolveAgentCommand,
227
+ resolveTimeout,
228
+ DEFAULT_CMD,
229
+ };
@@ -0,0 +1,159 @@
1
+ // src/experiment/cli.js
2
+ //
3
+ // CLI surface for the `experiment` subcommand. Mirrors src/atp/cli.js:
4
+ // parseExperimentArgs(args) -> { ok, opts? , error? }
5
+ // runExperiment(opts, deps) -> { ok, data?, error?, exitCode }
6
+ // Dependency-injected (comparison / agentRunner / geneLoader / sandbox)
7
+ // so the runner is unit-testable without a real agent CLI.
8
+ 'use strict';
9
+
10
+ const fs = require('fs');
11
+ const path = require('path');
12
+
13
+ const MAX_REQUEST_FILE_BYTES = 1024 * 1024; // 1 MB — a request JSON is tiny
14
+
15
/**
 * Minimal `--key=value` / `--flag` parser (no shell, no globbing).
 *
 * Non-string entries and entries that do not start with `--` are ignored.
 * `--key=value` stores the text after the first `=`; any other `--name`
 * entry is stored as `true`.
 *
 * @param {Iterable<*>} [args]
 * @returns {object} map of flag name to string value or `true`
 */
function parseFlags(args) {
  const flags = {};
  for (const token of args || []) {
    if (typeof token !== 'string' || !token.startsWith('--')) {
      continue;
    }
    const eqAt = token.indexOf('=');
    if (eqAt > 2) {
      flags[token.slice(2, eqAt)] = token.slice(eqAt + 1);
    } else {
      flags[token.slice(2)] = true;
    }
  }
  return flags;
}
29
+
30
/**
 * Parse experiment subcommand args.
 *
 * Flags: --task= --metric= [--baseline=] [--variant=] [--gene=]
 *        [--validation="cmd1;;cmd2"] [--timeout=ms] [--request-file=<json>]
 *
 * --request-file supplies a JSON object base ({task, baseline, variant, metric,
 * geneId, validationCommands, timeoutMs}); explicit flags override it. It is
 * the only filesystem read performed here.
 *
 * @param {Iterable<*>} [args] raw argv elements for the subcommand
 * @returns {{ ok: true, opts: object } | { ok: false, error: string }}
 */
function parseExperimentArgs(args) {
  const f = parseFlags(args);

  let base = {};
  if (f['request-file']) {
    try {
      // Resolve + stat the path and bound its size before reading: don't open
      // an arbitrary non-file (device/FIFO) or slurp an unbounded file.
      const rfPath = path.resolve(String(f['request-file']));
      const st = fs.statSync(rfPath);
      if (!st.isFile()) return { ok: false, error: '--request-file must be a regular file' };
      if (st.size > MAX_REQUEST_FILE_BYTES) return { ok: false, error: '--request-file too large (> 1 MB)' };
      base = JSON.parse(fs.readFileSync(rfPath, 'utf8'));
    } catch (e) {
      return { ok: false, error: 'failed to read --request-file: ' + (e && e.message ? e.message : String(e)) };
    }
    // typeof [] is 'object' too; an array is not a request object.
    if (!base || typeof base !== 'object' || Array.isArray(base)) {
      return { ok: false, error: '--request-file must contain a JSON object' };
    }
  }

  // Explicit flag value first, then the request-file value, then the default.
  const pick = (flagVal, baseVal, dflt) => {
    if (flagVal !== undefined && flagVal !== true) return String(flagVal);
    if (baseVal !== undefined && baseVal !== null) return String(baseVal);
    return dflt;
  };

  const opts = {
    // trim() so a whitespace-only value fails the required-field check below
    // at parse time instead of failing later inside the comparison run.
    task: pick(f.task, base.task, '').trim(),
    baseline: pick(f.baseline, base.baseline, 'baseline'),
    variant: pick(f.variant, base.variant, 'variant'),
    metric: pick(f.metric, base.metric, '').trim(),
    geneId: pick(f.gene, base.geneId !== undefined ? base.geneId : base.gene, '') || null,
    validationCommands: null,
    timeoutMs: undefined,
  };

  if (f.validation !== undefined && f.validation !== true) {
    opts.validationCommands = String(f.validation).split(';;').map((s) => s.trim()).filter(Boolean);
  } else if (Array.isArray(base.validationCommands)) {
    opts.validationCommands = base.validationCommands.map(String);
  }

  // Number(''), Number(null) and Number(false) are 0, so `--timeout=` or
  // `"timeoutMs": null` would otherwise become a real 0 ms timeout (clamped
  // to the 1 s minimum downstream) instead of "not set".
  const toTimeout = (v) => {
    if (v === undefined || v === null || typeof v === 'boolean') return NaN;
    if (typeof v === 'string' && v.trim() === '') return NaN;
    return Number(v);
  };
  const timeoutRaw = f.timeout !== undefined && f.timeout !== true
    ? toTimeout(f.timeout)
    : toTimeout(base.timeoutMs);
  if (Number.isFinite(timeoutRaw)) opts.timeoutMs = timeoutRaw;

  if (!opts.task) return { ok: false, error: 'missing required --task (or "task" in --request-file)' };
  if (!opts.metric) return { ok: false, error: 'missing required --metric (or "metric" in --request-file)' };

  return { ok: true, opts };
}
98
+
99
/**
 * Run the comparison and translate its outcome into a result plus exit code.
 *
 * Exit codes: 0 for a decided comparison, 3 when the winner is
 * 'inconclusive' (a structured outcome, not a crash), 1 when the comparison
 * or sanitisation throws.
 *
 * @param {object} opts from parseExperimentArgs
 * @param {object} [deps] { comparison, agentRunner, geneLoader, sandbox, err }
 * @returns {Promise<object>} { ok, data?, error?, exitCode }
 */
async function runExperiment(opts, deps) {
  const injected = deps || {};
  const comparison = injected.comparison || require('./comparison');
  const err = typeof injected.err === 'function'
    ? injected.err
    : (...a) => console.error(...a);

  // Forward only the collaborators that were actually injected.
  const params = { ...opts };
  for (const key of ['agentRunner', 'geneLoader', 'sandbox']) {
    if (injected[key]) params[key] = injected[key];
  }

  try {
    const raw = await comparison.runComparison(params);
    // The result is passed through sanitizePayload before it is returned, so
    // that secrets an agent's resultText may carry are redacted.
    const data = require('../gep/sanitize').sanitizePayload(raw);
    if (!data || data.winner !== 'inconclusive') {
      return { ok: true, data, exitCode: 0 };
    }

    // Two distinct causes share the inconclusive verdict; report which one
    // happened (both arms can be ok when pass-rate was simply unmeasured).
    const arms = data.arms || {};
    const armOk = (arm) => Boolean(arm && arm.ok);
    if (armOk(arms.baseline) && armOk(arms.variant)) {
      err('[Experiment] inconclusive: metric resolved to pass-rate but no validation commands were given, so pass-rate was never measured — pass --validation (or pick another --metric)');
    } else {
      err('[Experiment] inconclusive: one or both arms failed — see arms.*.error');
    }
    return { ok: true, data, exitCode: 3 };
  } catch (e) {
    const message = e && e.message ? e.message : String(e);
    err('[Experiment] error: ' + message);
    return { ok: false, error: message, exitCode: 1 };
  }
}
141
+
142
/**
 * Build the usage/help text for the `experiment` subcommand.
 *
 * Despite the name, nothing is printed: the text is returned as a single
 * newline-joined string.
 *
 * @returns {string}
 */
function printExperimentUsage() {
  const synopsis = 'Usage: node index.js experiment --task="..." --metric="..." [flags]';
  const flagHelp = [
    ' --baseline="..." 对照臂标签 (default: baseline)',
    ' --variant="..." 实验臂标签 (default: variant)',
    ' --gene=<geneId> 变体臂复用的基因 id (注入其 strategy)',
    ' --validation="c1;;c2" 自包含 node 校验命令 (通过率评分; ;; 分隔)',
    ' --timeout=<ms> 单臂超时',
    ' --request-file=<path> JSON 基底 (显式 flag 覆盖之)',
  ];
  const notes = [
    'Runs the same task twice (baseline vs variant-with-gene) via a headless',
    'agent CLI and prints a comparison JSON to stdout. Logs go to stderr.',
    'Env: EVOLVER_EXPERIMENT_AGENT_CMD (default claude),',
    ' EVOLVER_EXPERIMENT_AGENT_ARGS, EVOLVER_EXPERIMENT_TIMEOUT_MS (300000).',
  ];
  return [synopsis, ...flagHelp, '', ...notes].join('\n');
}
158
+
159
+ module.exports = { parseExperimentArgs, runExperiment, printExperimentUsage, parseFlags };
@@ -0,0 +1,233 @@
1
+ // src/experiment/comparison.js
2
+ //
3
+ // Thin orchestrator for a comparative experiment: run the SAME task twice --
4
+ // a baseline arm (plain task) and a variant arm (task + the reused gene's
5
+ // strategy injected) -- through a pluggable agent runner, collect real
6
+ // metrics (duration / rounds / tokens / pass-rate), and emit a versioned
7
+ // comparison result.
8
+ //
9
+ // Design notes:
10
+ // - This module NEVER requires child_process. The agent runner, gene loader,
11
+ // and sandbox runner are all injectable, so unit tests stay deterministic
12
+ // (no LLM, no network, no subprocess). Production defaults are lazy-loaded.
13
+ // - A failed arm never fabricates a score: if either arm is !ok the winner is
14
+ // 'inconclusive' and improvement is null, while still recording whatever
15
+ // partial metrics were captured.
16
+ 'use strict';
17
+
18
+ const { deriveMetric, scoreArm, num, round } = require('./metrics');
19
+
20
+ const SCHEMA = 'evolver.experiment.comparison.v1';
21
+ const RESULT_TEXT_CAP = 2000;
22
+ const EPS = 1e-9;
23
+
24
/**
 * Build the variant-arm prompt: the task followed by the reused gene's
 * strategy as a numbered list (`1. step`, `2. step`, ...).
 *
 * When there is no gene, or its `strategy` is not a non-empty array, the task
 * is returned unchanged.
 *
 * @param {string} task
 * @param {object} [gene] { strategy: string[] }
 * @returns {string}
 */
function buildVariantPrompt(task, gene) {
  const strategy = gene ? gene.strategy : undefined;
  if (!Array.isArray(strategy) || strategy.length === 0) {
    return task;
  }
  const numbered = strategy.map((step, idx) => `${idx + 1}. ${step}`).join('\n');
  const heading = '\n\n## Reuse the following proven strategy\n';
  const closing = '\n\nApply the strategy above while completing the task.';
  return `${task}${heading}${numbered}${closing}`;
}
36
+
37
/**
 * Coerce whatever the agent runner returned into the canonical arm shape.
 *
 * @param {*} label arm label (stringified; null/undefined becomes '')
 * @param {object} [raw] runner result; a missing result yields a zeroed, !ok arm
 * @returns {object} { label, ok, error, durationMs, rounds, tokensIn, tokensOut,
 *   tokensTotal, costUsd, passRate, resultText, exitCode, timedOut }
 */
function normalizeArm(label, raw) {
  const src = raw || {};
  // Number(null) and Number('') are 0, so presence must be checked BEFORE the
  // numeric test. Otherwise a missing exit code becomes a clean 0, a missing
  // pass-rate becomes 0 and a missing token total becomes 0.
  const hasNumber = (v) => v !== null && v !== undefined && v !== ''
    && Number.isFinite(Number(v));

  const tokensIn = num(src.tokensIn);
  const tokensOut = num(src.tokensOut);
  return {
    label: String(label == null ? '' : label),
    ok: !!src.ok,
    error: src.error != null ? String(src.error) : null,
    durationMs: num(src.durationMs),
    rounds: num(src.rounds),
    tokensIn,
    tokensOut,
    // Fall back to the sum when the runner did not report a total.
    tokensTotal: hasNumber(src.tokensTotal) ? num(src.tokensTotal) : tokensIn + tokensOut,
    costUsd: num(src.costUsd),
    // Without a reported pass-rate, use the synthetic ok ? 1 : 0.
    passRate: hasNumber(src.passRate) ? num(src.passRate) : (src.ok ? 1 : 0),
    resultText: typeof src.resultText === 'string' ? src.resultText.slice(0, RESULT_TEXT_CAP) : '',
    exitCode: hasNumber(src.exitCode) ? num(src.exitCode) : null,
    timedOut: !!src.timedOut,
  };
}
59
+
60
/**
 * Pass-rate for ONE arm: run each validation command with `cwd` set to that
 * arm's workspace and report the fraction that succeeded, so the metric is
 * tied to what that arm's agent produced.
 *
 * Commands run one after another. A command counts as passed only when the
 * runner resolves to an object with a truthy `ok`; a runner that throws is
 * counted as a failure and recorded in `warnings`.
 *
 * @param {string[]} commands validation commands
 * @param {string} cwd directory the commands run in
 * @param {function} runSingleCommand (cmd, { cwd, timeoutMs }) => Promise of { ok }
 * @param {number} [timeoutMs] forwarded to the runner
 * @param {string[]} warnings mutated: receives `passrate_command_error: ...` entries
 * @returns {Promise<number>} passed/total rounded to 4 places, or 0 with no commands
 */
async function passRateInDir(commands, cwd, runSingleCommand, timeoutMs, warnings) {
  const tally = { passed: 0, total: 0 };
  for (const command of commands) {
    tally.total += 1;
    let outcome = null;
    try {
      outcome = await runSingleCommand(command, { cwd, timeoutMs });
    } catch (err) {
      warnings.push('passrate_command_error: ' + (err && err.message ? err.message : String(err)));
      continue;
    }
    if (outcome && outcome.ok) {
      tally.passed += 1;
    }
  }
  if (tally.total === 0) {
    return 0;
  }
  return round(tally.passed / tally.total, 4);
}
79
+
80
/**
 * Run a two-arm comparison: the same task once as-is (baseline) and once with
 * the reused gene's strategy appended (variant), then score both arms on the
 * requested metric.
 *
 * @param {object} params
 * @param {string}   params.task                 natural-language task (required)
 * @param {string}   [params.baseline='baseline'] label of the control arm
 * @param {string}   [params.variant='variant']   label of the experimental arm
 * @param {string}   params.metric               evaluation metric (required)
 * @param {string}   [params.geneId]             id of the gene reused by the variant arm
 * @param {string[]} [params.validationCommands] self-contained `node SCRIPT` validation commands
 * @param {number}   [params.timeoutMs]          per-arm timeout
 * @param {function} [params.agentRunner]        (prompt, opts) => Promise of AgentResult
 * @param {function} [params.geneLoader]         () => Gene[]
 * @param {object}   [params.sandbox]            { createSandboxDir, cleanupDir, runSingleCommand } (default: sandboxExecutor)
 * @returns {Promise<object>} versioned ComparisonResult (see SCHEMA)
 * @throws {Error} when `task` or `metric` is empty
 */
async function runComparison(params) {
  const p = params || {};
  const task = String(p.task == null ? '' : p.task).trim();
  const baseline = p.baseline ? String(p.baseline) : 'baseline';
  const variant = p.variant ? String(p.variant) : 'variant';
  const metric = String(p.metric == null ? '' : p.metric);
  const geneId = p.geneId ? String(p.geneId) : null;
  const validationCommands = Array.isArray(p.validationCommands)
    ? p.validationCommands.filter((c) => typeof c === 'string' && c.trim())
    : null;
  // Number(null), Number('') and Number(false) are 0: treat those as "no
  // timeout given" rather than forwarding a 0 ms timeout to the runner.
  const timeoutGiven = p.timeoutMs !== null && p.timeoutMs !== undefined
    && p.timeoutMs !== '' && typeof p.timeoutMs !== 'boolean'
    && Number.isFinite(Number(p.timeoutMs));
  const timeoutMs = timeoutGiven ? Number(p.timeoutMs) : undefined;

  if (!task) throw new Error('task is required');
  if (!metric) throw new Error('metric is required');

  // Production collaborators are loaded lazily so injected ones never pull
  // in the real modules.
  const agentRunner = typeof p.agentRunner === 'function'
    ? p.agentRunner
    : require('./agentRunner').runAgentTask;
  const geneLoader = typeof p.geneLoader === 'function'
    ? p.geneLoader
    : require('../gep/assetStore').loadGenes;
  const sandbox = p.sandbox && typeof p.sandbox === 'object'
    ? p.sandbox
    : require('../gep/validator/sandboxExecutor');

  const startedAt = new Date().toISOString();
  const t0 = Date.now();
  const warnings = [];

  const metricInfo = deriveMetric(metric);
  if (!metricInfo.recognized) warnings.push('metric_unrecognized: ' + metric);

  // Look up the reused gene (variant arm). Without a resolved gene the variant
  // prompt is identical to the baseline task, so the two arms are NOT a
  // strategy comparison; record an explicit warning so identical arms are not
  // mistaken for one.
  let gene = null;
  if (geneId) {
    let loaded;
    try {
      loaded = geneLoader();
    } catch (e) {
      warnings.push('gene_load_error: ' + (e && e.message ? e.message : String(e)));
    }
    // A loader that returns a non-array must degrade to "gene not found"
    // instead of crashing the whole comparison on `.find`.
    if (loaded != null && !Array.isArray(loaded)) {
      warnings.push('gene_load_error: gene loader did not return an array');
    }
    const genes = Array.isArray(loaded) ? loaded : [];
    gene = genes.find((g) => g && String(g.id) === geneId) || null;
    if (!gene) warnings.push('gene_not_found: ' + geneId + ' (variant arm equals baseline)');
  } else {
    warnings.push('no_gene: variant arm equals baseline (no strategy injected)');
  }

  const hasValidation = !!(validationCommands && validationCommands.length);
  if (!hasValidation) warnings.push('passrate_degraded_no_validation');

  let metaRunner = null;
  let metaCommand = null;

  const runArm = async (label, prompt) => {
    // Each arm runs in its OWN fresh sandbox dir, so its pass-rate validation
    // reads that arm's own output rather than a shared directory.
    const workdir = sandbox.createSandboxDir();
    try {
      let raw;
      try {
        raw = await agentRunner(prompt, { timeoutMs, cwd: workdir });
      } catch (e) {
        raw = { ok: false, error: 'agent_runner_threw: ' + (e && e.message ? e.message : String(e)) };
      }
      if (raw) {
        if (metaRunner == null && raw.runnerName) metaRunner = String(raw.runnerName);
        if (metaCommand == null && raw.agentCommand) metaCommand = String(raw.agentCommand);
      }
      const arm = normalizeArm(label, raw);
      if (hasValidation) {
        arm.passRate = await passRateInDir(validationCommands, workdir, sandbox.runSingleCommand, timeoutMs, warnings);
      }
      return arm;
    } finally {
      // Always release the sandbox dir, even when something above throws.
      try { sandbox.cleanupDir(workdir); } catch (_) { /* best-effort cleanup */ }
    }
  };

  // Arms run sequentially: two real agent CLIs in parallel would contend for
  // local resources / provider rate limits and muddy the duration metric.
  const armBaseline = await runArm(baseline, task);
  const armVariant = await runArm(variant, buildVariantPrompt(task, gene));

  const baselineScore = scoreArm(armBaseline, metricInfo.metricField);
  const variantScore = scoreArm(armVariant, metricInfo.metricField);

  // Pass-rate is only a real measurement when validation commands ran. Without
  // them it is a synthetic ok ? 1 : 0, so a pass-rate comparison would falsely
  // tie; report it as inconclusive instead.
  const passRateNotMeasured = metricInfo.metricField === 'passRate' && !hasValidation;
  let winner;
  let improvement;
  if (!armBaseline.ok || !armVariant.ok || passRateNotMeasured) {
    winner = 'inconclusive';
    improvement = null;
  } else if (Math.abs(baselineScore - variantScore) <= EPS) {
    winner = 'tie';
    improvement = 0;
  } else {
    const variantBetter = metricInfo.lowerIsBetter
      ? variantScore < baselineScore
      : variantScore > baselineScore;
    winner = variantBetter ? 'variant' : 'baseline';
    if (baselineScore === 0) {
      // A relative improvement against a zero baseline is undefined.
      improvement = null;
    } else {
      const ratio = metricInfo.lowerIsBetter
        ? (baselineScore - variantScore) / Math.abs(baselineScore)
        : (variantScore - baselineScore) / Math.abs(baselineScore);
      improvement = round(ratio, 4);
    }
  }

  return {
    schema: SCHEMA,
    task,
    metric,
    metricField: metricInfo.metricField,
    lowerIsBetter: metricInfo.lowerIsBetter,
    scoreUnit: metricInfo.scoreUnit,
    geneId,
    baselineScore,
    variantScore,
    winner,
    improvement,
    arms: { baseline: armBaseline, variant: armVariant },
    meta: {
      runner: metaRunner || 'unknown',
      agentCommand: metaCommand || null,
      startedAt,
      durationMs: Date.now() - t0,
      warnings,
    },
  };
}
232
+
233
+ module.exports = { runComparison, buildVariantPrompt, normalizeArm, SCHEMA };