@evomap/evolver 1.89.2 → 1.89.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cursor/BUGBOT.md +182 -0
- package/.env.example +68 -0
- package/.git-commit-guard-token +1 -0
- package/.github/CODEOWNERS +63 -0
- package/.github/ISSUE_TEMPLATE/good_first_issue.md +23 -0
- package/.github/pull_request_template.md +45 -0
- package/.github/workflows/test.yml +75 -0
- package/CHANGELOG.md +1237 -0
- package/README.ja-JP.md +1 -3
- package/README.ko-KR.md +1 -3
- package/README.md +86 -530
- package/README.public.md +569 -0
- package/README.zh-CN.md +1 -3
- package/SECURITY.md +108 -0
- package/assets/gep/events.jsonl +3 -0
- package/assets/gep/genes.json +496 -0
- package/examples/atp-consumer-quickstart.md +100 -0
- package/examples/hello-world.md +38 -0
- package/index.js +44 -48
- package/package.json +6 -17
- package/proxy-package.json +39 -0
- package/public.manifest.json +143 -0
- package/src/adapters/hookAdapter.js +2 -0
- package/src/adapters/scripts/_lockPaths.js +74 -0
- package/src/adapters/scripts/evolver-session-start.js +19 -27
- package/src/config.js +23 -0
- package/src/evolve/guards.js +721 -1
- package/src/evolve/pipeline/collect.js +1283 -1
- package/src/evolve/pipeline/dispatch.js +421 -1
- package/src/evolve/pipeline/enrich.js +440 -1
- package/src/evolve/pipeline/hub.js +319 -1
- package/src/evolve/pipeline/select.js +274 -1
- package/src/evolve/pipeline/signals.js +206 -1
- package/src/evolve/utils.js +264 -1
- package/src/evolve.js +350 -1
- package/src/experiment/agentRunner.js +229 -0
- package/src/experiment/cli.js +159 -0
- package/src/experiment/comparison.js +233 -0
- package/src/experiment/metrics.js +75 -0
- package/src/forceUpdate.js +311 -30
- package/src/gep/a2aProtocol.js +4455 -1
- package/src/gep/antiAbuseTelemetry.js +233 -0
- package/src/gep/autoDistillConv.js +205 -1
- package/src/gep/autoDistillLlm.js +315 -1
- package/src/gep/candidateEval.js +92 -1
- package/src/gep/candidates.js +198 -1
- package/src/gep/contentHash.js +30 -1
- package/src/gep/conversationSniffer.js +266 -1
- package/src/gep/crypto.js +89 -1
- package/src/gep/curriculum.js +163 -1
- package/src/gep/deviceId.js +218 -1
- package/src/gep/envFingerprint.js +118 -1
- package/src/gep/epigenetics.js +31 -1
- package/src/gep/execBridge.js +711 -1
- package/src/gep/explore.js +289 -1
- package/src/gep/hash.js +15 -1
- package/src/gep/hubFetch.js +359 -1
- package/src/gep/hubReview.js +207 -1
- package/src/gep/hubSearch.js +526 -1
- package/src/gep/hubVerify.js +306 -1
- package/src/gep/learningSignals.js +89 -1
- package/src/gep/memoryGraph.js +1374 -1
- package/src/gep/memoryGraphAdapter.js +203 -1
- package/src/gep/mutation.js +203 -1
- package/src/gep/narrativeMemory.js +108 -1
- package/src/gep/openPRRegistry.js +205 -1
- package/src/gep/personality.js +423 -1
- package/src/gep/policyCheck.js +599 -1
- package/src/gep/prompt.js +836 -1
- package/src/gep/recallInject.js +409 -1
- package/src/gep/recallVerifier.js +318 -1
- package/src/gep/reflection.js +177 -1
- package/src/gep/sanitize.js +9 -0
- package/src/gep/selector.js +602 -1
- package/src/gep/skillDistiller.js +1294 -1
- package/src/gep/solidify.js +1699 -1
- package/src/gep/strategy.js +136 -1
- package/src/gep/tokenSavings.js +88 -1
- package/src/gep/validator/sandboxExecutor.js +29 -1
- package/src/gep/workspaceKeychain.js +174 -1
- package/src/proxy/extensions/traceControl.js +99 -1
- package/src/proxy/index.js +14 -5
- package/src/proxy/inject.js +52 -1
- package/src/proxy/lifecycle/manager.js +30 -0
- package/src/proxy/mailbox/store.js +2 -1
- package/src/proxy/router/messages_route.js +13 -2
- package/src/proxy/trace/extractor.js +646 -1
- package/src/proxy/trace/usage.js +105 -1
- package/CONTRIBUTING.md +0 -19
- package/assets/cover.png +0 -0
- package/assets/gep/genes.seed.json +0 -245
- package/scripts/a2a_export.js +0 -63
- package/scripts/a2a_ingest.js +0 -79
- package/scripts/a2a_promote.js +0 -118
- package/scripts/analyze_by_skill.js +0 -121
- package/scripts/build_binaries.js +0 -479
- package/scripts/check-changelog.js +0 -166
- package/scripts/extract_log.js +0 -85
- package/scripts/generate_history.js +0 -75
- package/scripts/gep_append_event.js +0 -96
- package/scripts/gep_personality_report.js +0 -234
- package/scripts/human_report.js +0 -147
- package/scripts/recall-verify-report.js +0 -234
- package/scripts/recover_loop.js +0 -61
- package/scripts/seed-merchants.js +0 -91
- package/scripts/suggest_version.js +0 -89
- package/scripts/validate-modules.js +0 -38
- package/scripts/validate-suite.js +0 -78
- package/skills/index.json +0 -14
- /package/{skills → bundled-skills}/_meta/SKILL.md +0 -0
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
// src/experiment/agentRunner.js
|
|
2
|
+
//
|
|
3
|
+
// The ONLY module in src/experiment that touches a subprocess. It runs a single
|
|
4
|
+
// arm of a comparison by invoking a headless coding-agent CLI -- by default
|
|
5
|
+
// `claude -p "<prompt>" --output-format json` -- and maps its JSON envelope
|
|
6
|
+
// onto the canonical AgentResult shape (duration / rounds / tokens / cost).
|
|
7
|
+
//
|
|
8
|
+
// Security: the prompt is passed as an argv element with shell:false, so quotes,
|
|
9
|
+
// `$`, backticks etc. in the task can never be interpreted by a shell (same
|
|
10
|
+
// injection posture as src/gep/validator/sandboxExecutor.js). Unlike the
|
|
11
|
+
// sandbox, the real user env IS passed through -- the agent CLI needs PATH and
|
|
12
|
+
// its own auth/credentials to run.
|
|
13
|
+
'use strict';
|
|
14
|
+
|
|
15
|
+
const { spawn } = require('child_process');
|
|
16
|
+
|
|
17
|
+
const DEFAULT_CMD = 'claude'; // agent executable used when neither opts.command nor EVOLVER_EXPERIMENT_AGENT_CMD is set
|
|
18
|
+
const DEFAULT_TIMEOUT_MS = 300000; // 5 min
|
|
19
|
+
const MIN_TIMEOUT_MS = 1000; // lower clamp applied by resolveTimeout
|
|
20
|
+
const MAX_TIMEOUT_MS = 1800000; // 30 min hard cap
|
|
21
|
+
const RESULT_CAP = 4000; // max characters of result / stdout / stderr text kept in a returned result
|
|
22
|
+
const MAX_STDOUT_BYTES = 10 * 1024 * 1024; // 10 MB — a JSON envelope is tiny; cap a chatty/malformed CLI before it OOMs.
|
|
23
|
+
|
|
24
|
+
/**
 * Coerce a value to a finite number.
 *
 * @param {*} v value run through Number()
 * @param {*} [fallback] returned when the coercion is not finite; 0 when omitted
 * @returns {*} the finite number, otherwise `fallback` (or 0)
 *
 * Note: Number(null) and Number('') are both 0, so those inputs come back as 0
 * rather than as the fallback.
 */
function num(v, fallback) {
  const coerced = Number(v);
  if (Number.isFinite(coerced)) {
    return coerced;
  }
  if (fallback === undefined) {
    return 0;
  }
  return fallback;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Pick the agent executable to spawn.
 *
 * Precedence: a truthy `opts.command`, then a non-blank
 * EVOLVER_EXPERIMENT_AGENT_CMD (trimmed), then DEFAULT_CMD.
 *
 * @param {object} [opts] may carry `command`
 * @returns {string} the command name or path
 */
function resolveAgentCommand(opts) {
  const explicit = opts ? opts.command : undefined;
  if (explicit) {
    return String(explicit);
  }
  const fromEnv = (process.env.EVOLVER_EXPERIMENT_AGENT_CMD || '').trim();
  return fromEnv === '' ? DEFAULT_CMD : fromEnv;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Extra argv entries placed before the `-p <prompt>` pair.
 *
 * An array in `opts.extraArgs` wins (each element stringified). Otherwise
 * EVOLVER_EXPERIMENT_AGENT_ARGS is trimmed and split on runs of whitespace --
 * there is no quoting, so an argument cannot contain a space when it comes
 * from the environment.
 *
 * @param {object} [opts] may carry `extraArgs`
 * @returns {string[]} possibly empty list of arguments
 */
function resolveExtraArgs(opts) {
  const explicit = opts ? opts.extraArgs : undefined;
  if (Array.isArray(explicit)) {
    return explicit.map((arg) => String(arg));
  }
  const trimmed = (process.env.EVOLVER_EXPERIMENT_AGENT_ARGS || '').trim();
  return trimmed ? trimmed.split(/\s+/) : [];
}
|
|
41
|
+
|
|
42
|
+
/**
 * Resolve the per-arm timeout in milliseconds.
 *
 * Precedence: a usable `opts.timeoutMs`, then a positive
 * EVOLVER_EXPERIMENT_TIMEOUT_MS, then DEFAULT_TIMEOUT_MS. The chosen value is
 * clamped into [MIN_TIMEOUT_MS, MAX_TIMEOUT_MS].
 *
 * A null / undefined / blank `opts.timeoutMs` counts as "not supplied".
 * Number(null) and Number('') are both 0, so without this guard a caller
 * forwarding an unset timeout as null would get the 1 s floor instead of the
 * env/default value.
 *
 * @param {object} [opts] may carry `timeoutMs` (number or numeric string)
 * @returns {number} timeout in ms, always within the clamp range
 */
function resolveTimeout(opts) {
  const raw = opts ? opts.timeoutMs : undefined;
  const supplied = raw != null && String(raw).trim() !== '';
  let t = supplied && Number.isFinite(Number(raw)) ? Number(raw) : null;
  if (t == null) {
    const e = Number(process.env.EVOLVER_EXPERIMENT_TIMEOUT_MS);
    t = Number.isFinite(e) && e > 0 ? e : DEFAULT_TIMEOUT_MS;
  }
  return Math.min(Math.max(MIN_TIMEOUT_MS, t), MAX_TIMEOUT_MS);
}
|
|
50
|
+
|
|
51
|
+
/**
 * Parse the `--output-format json` envelope printed by the agent CLI.
 *
 * The whole (trimmed) text is tried first. If that is not valid JSON, the
 * slice from the first `{` to the last `}` is tried, which tolerates
 * leading/trailing non-JSON noise such as log lines.
 *
 * Only a JSON object counts as an envelope. Valid JSON of another kind (a
 * number, boolean, string or array) yields null, so the caller reports a parse
 * failure rather than mapping a non-envelope to an all-zero result.
 *
 * @param {*} stdout raw stdout (stringified; falsy values are treated as '')
 * @returns {object|null} the parsed envelope, or null when none was found
 */
function parseAgentJson(stdout) {
  const text = String(stdout || '').trim();
  if (!text) return null;

  const asEnvelope = (value) => (
    value !== null && typeof value === 'object' && !Array.isArray(value) ? value : null
  );

  try {
    return asEnvelope(JSON.parse(text));
  } catch (_) { /* not pure JSON -- try the outermost {...} slice */ }

  const start = text.indexOf('{');
  const end = text.lastIndexOf('}');
  if (start >= 0 && end > start) {
    try {
      return asEnvelope(JSON.parse(text.slice(start, end + 1)));
    } catch (_) { /* fall through */ }
  }
  return null;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Build a failed result with zeroed metrics.
 *
 * @param {string} error machine-readable failure reason
 * @param {string} command agent command the failure belongs to
 * @param {object} [extra] fields that override the defaults (e.g. durationMs,
 *   exitCode, timedOut, resultText)
 * @returns {object} result object with ok:false and isError:true unless overridden
 */
function makeFailure(error, command, extra) {
  const defaults = {
    ok: false,
    isError: true,
    error,
    durationMs: 0,
    rounds: 0,
    tokensIn: 0,
    tokensOut: 0,
    tokensTotal: 0,
    costUsd: 0,
    resultText: '',
    exitCode: -1,
    timedOut: false,
    runnerName: 'claude-cli',
    agentCommand: command,
  };
  const overrides = extra || {};
  return { ...defaults, ...overrides };
}
|
|
90
|
+
|
|
91
|
+
/**
 * Map a parsed `claude -p --output-format json` envelope onto the result shape
 * returned by runAgentTask.
 *
 * @param {object|null} json parsed envelope; the fields read are `usage.input_tokens`,
 *   `usage.output_tokens`, `is_error`, `duration_ms`, `num_turns`,
 *   `total_cost_usd` and `result`
 * @param {object} ctx { command, exitCode, durationMs, timedOut } from the runner
 * @returns {object} result; `ok` is true only for exit code 0, no `is_error`
 *   and no timeout
 */
function mapAgentResult(json, ctx) {
  const command = ctx.command;
  // A signal-terminated child has exit code null, and Number(null) is 0, so a
  // bare num(null, -1) would report a clean exit. Map null/undefined to -1
  // here as well, so this exported function is safe for any caller.
  const exitCode = ctx.exitCode == null ? -1 : num(ctx.exitCode, -1);
  const timedOut = !!ctx.timedOut;
  const usage = (json && json.usage) || {};
  // Only these two usage counters are read; other usage fields are ignored.
  const tokensIn = num(usage.input_tokens);
  const tokensOut = num(usage.output_tokens);
  const isError = !!(json && json.is_error);
  const ok = exitCode === 0 && !isError && !timedOut;

  let error = null;
  if (!ok) {
    if (timedOut) error = 'agent_timeout';
    else if (isError) error = 'agent_reported_error';
    else if (exitCode !== 0) error = 'agent_exit_' + exitCode;
  }

  // Prefer the agent's self-reported duration; fall back to wall-clock. A null
  // duration_ms means "not reported" (Number(null) would otherwise read as 0 ms).
  const reportedDuration = json ? json.duration_ms : undefined;
  const durationMs = reportedDuration != null && Number.isFinite(Number(reportedDuration))
    ? num(reportedDuration)
    : num(ctx.durationMs);

  return {
    ok,
    isError,
    error,
    durationMs,
    rounds: num(json && json.num_turns),
    tokensIn,
    tokensOut,
    tokensTotal: tokensIn + tokensOut,
    costUsd: num(json && json.total_cost_usd),
    resultText: typeof (json && json.result) === 'string' ? json.result.slice(0, RESULT_CAP) : '',
    exitCode,
    timedOut,
    runnerName: 'claude-cli',
    agentCommand: command,
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Run one task prompt through a headless agent CLI.
 *
 * The prompt is passed as a single argv element with `shell: false`, and the
 * current process environment is handed to the child unchanged.
 *
 * @param {string} prompt task text; null/undefined is sent as ''
 * @param {object} [opts] { command, extraArgs, timeoutMs, cwd }
 * @returns {Promise<object>} child-process failures resolve as
 *   { ok: false, error } rather than rejecting
 */
function runAgentTask(prompt, opts) {
  opts = opts || {};
  const command = resolveAgentCommand(opts);
  const extraArgs = resolveExtraArgs(opts);
  const timeoutMs = resolveTimeout(opts);
  const cwd = opts.cwd || process.cwd();
  const argv = [...extraArgs, '-p', String(prompt == null ? '' : prompt), '--output-format', 'json'];

  return new Promise((resolve) => {
    let child;
    let settled = false;
    let stdout = '';
    let stdoutBytes = 0;
    let stderr = '';
    const startedAt = Date.now();

    // Single exit path: the first caller wins, later calls are ignored.
    const done = (result) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(result);
    };

    const timer = setTimeout(() => {
      if (child && !child.killed) {
        try { child.kill('SIGKILL'); } catch (_) { /* noop */ }
      }
      done(makeFailure('agent_timeout', command, {
        durationMs: Date.now() - startedAt,
        timedOut: true,
      }));
    }, timeoutMs);

    try {
      child = spawn(command, argv, {
        shell: false,
        cwd,
        env: process.env,
        stdio: ['ignore', 'pipe', 'pipe'],
      });
    } catch (e) {
      done(makeFailure('agent_spawn_failed: ' + (e && e.message ? e.message : String(e)), command));
      return;
    }

    // Decode in the stream so a multi-byte UTF-8 character split across two
    // chunks is reassembled. Calling toString('utf8') on each chunk would turn
    // the halves into replacement characters and corrupt the JSON envelope.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');

    child.stdout.on('data', (chunk) => {
      if (settled) return; // result already decided; stop buffering
      stdout += chunk;
      stdoutBytes += Buffer.byteLength(chunk, 'utf8');
      if (stdoutBytes > MAX_STDOUT_BYTES) {
        // A well-behaved `--output-format json` envelope is tiny; this much
        // stdout means a runaway/malformed CLI — kill it rather than OOM.
        try { child.kill('SIGKILL'); } catch (_) { /* noop */ }
        done(makeFailure('agent_output_too_large', command, { durationMs: Date.now() - startedAt }));
      }
    });
    child.stderr.on('data', (chunk) => {
      if (settled) return;
      stderr += chunk;
      // stderr is only used as a diagnostic snippet: keep just its tail.
      if (stderr.length > MAX_STDOUT_BYTES) stderr = stderr.slice(-RESULT_CAP);
    });

    child.on('error', (err) => {
      const msg = err && err.code === 'ENOENT'
        ? 'agent_cli_not_found: ' + command
        : 'agent_spawn_error: ' + (err && err.message ? err.message : String(err));
      done(makeFailure(msg, command, { durationMs: Date.now() - startedAt }));
    });

    // Finalise on 'close', not 'exit': 'exit' can fire while the stdout pipe
    // still holds unread data, which would truncate the envelope and produce a
    // spurious parse failure. 'close' fires once the stdio streams have ended.
    // If a descendant keeps the pipe open, the timeout above still settles.
    child.on('close', (code, signal) => {
      const durationMs = Date.now() - startedAt;
      // A signal-terminated child reports code=null, and Number(null) is 0 —
      // num(null, -1) would read it as a CLEAN exit and let a killed arm
      // score ok:true. Map null/undefined explicitly before the numeric path.
      const exitCode = (code === null || code === undefined) ? -1 : num(code, -1);
      const json = parseAgentJson(stdout);
      if (!json) {
        done(makeFailure(
          signal ? 'agent_killed_' + signal : 'agent_output_parse_failed', command, {
            durationMs,
            exitCode,
            resultText: String(stdout || stderr || '').slice(0, RESULT_CAP),
          }));
        return;
      }
      done(mapAgentResult(json, { exitCode, durationMs, timedOut: false, command }));
    });
  });
}
|
|
221
|
+
|
|
222
|
+
module.exports = {
|
|
223
|
+
runAgentTask,
|
|
224
|
+
parseAgentJson,
|
|
225
|
+
mapAgentResult,
|
|
226
|
+
resolveAgentCommand,
|
|
227
|
+
resolveTimeout,
|
|
228
|
+
DEFAULT_CMD,
|
|
229
|
+
};
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
// src/experiment/cli.js
|
|
2
|
+
//
|
|
3
|
+
// CLI surface for the `experiment` subcommand. Mirrors src/atp/cli.js:
|
|
4
|
+
// parseExperimentArgs(args) -> { ok, opts? , error? }
|
|
5
|
+
// runExperiment(opts, deps) -> { ok, data?, error?, exitCode }
|
|
6
|
+
// Dependency-injected (comparison / agentRunner / geneLoader / sandbox)
|
|
7
|
+
// so the runner is unit-testable without a real agent CLI.
|
|
8
|
+
'use strict';
|
|
9
|
+
|
|
10
|
+
const fs = require('fs');
|
|
11
|
+
const path = require('path');
|
|
12
|
+
|
|
13
|
+
const MAX_REQUEST_FILE_BYTES = 1024 * 1024; // 1 MB — a request JSON is tiny
|
|
14
|
+
|
|
15
|
+
/**
 * Minimal `--key=value` / `--flag` parser (no shell, no globbing).
 *
 * Non-string entries and tokens that do not start with `--` are ignored. The
 * first `=` splits key from value, and the key must be non-empty for the
 * token to count as key/value. Any other `--` token is stored as a boolean
 * flag under everything after the dashes.
 *
 * @param {Iterable<*>} [args] raw argument tokens
 * @returns {object} map of flag name to string value or `true`
 */
function parseFlags(args) {
  const flags = {};
  for (const token of args || []) {
    if (typeof token !== 'string' || !token.startsWith('--')) {
      continue;
    }
    const eqIndex = token.indexOf('=');
    const hasValue = eqIndex > 2;
    const key = hasValue ? token.slice(2, eqIndex) : token.slice(2);
    flags[key] = hasValue ? token.slice(eqIndex + 1) : true;
  }
  return flags;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Parse experiment subcommand args.
 *
 * Flags: --task= --metric= [--baseline=] [--variant=] [--gene=]
 *        [--validation="cmd1;;cmd2"] [--timeout=ms] [--request-file=<json>]
 *
 * --request-file supplies a JSON object as a base ({task, baseline, variant,
 * metric, geneId (or gene), validationCommands, timeoutMs}); explicit flags
 * override it. It is the only filesystem read in this function.
 *
 * A blank timeout (`--timeout=` or `"timeoutMs": null` / `""`) is treated as
 * "not supplied" and leaves `opts.timeoutMs` undefined, instead of being
 * coerced to 0 by Number().
 *
 * @param {string[]} args raw CLI tokens
 * @returns {{ ok: true, opts: object } | { ok: false, error: string }}
 */
function parseExperimentArgs(args) {
  const f = parseFlags(args);

  let base = {};
  if (f['request-file']) {
    try {
      // Resolve + stat the path and bound its size before reading: don't open
      // an arbitrary non-file (device/FIFO) or slurp an unbounded file.
      const rfPath = path.resolve(String(f['request-file']));
      const st = fs.statSync(rfPath);
      if (!st.isFile()) return { ok: false, error: '--request-file must be a regular file' };
      if (st.size > MAX_REQUEST_FILE_BYTES) return { ok: false, error: '--request-file too large (> 1 MB)' };
      base = JSON.parse(fs.readFileSync(rfPath, 'utf8'));
    } catch (e) {
      return { ok: false, error: 'failed to read --request-file: ' + (e && e.message ? e.message : String(e)) };
    }
    // typeof [] is 'object' too, so arrays need their own rejection.
    if (!base || typeof base !== 'object' || Array.isArray(base)) {
      return { ok: false, error: '--request-file must contain a JSON object' };
    }
  }

  // Flag value (when it carries a value) > request-file value > default.
  const pick = (flagVal, baseVal, dflt) => {
    if (flagVal !== undefined && flagVal !== true) return String(flagVal);
    if (baseVal !== undefined && baseVal !== null) return String(baseVal);
    return dflt;
  };

  const opts = {
    // trim() so a whitespace-only value fails the required-field check below
    // (parse-time exit 2 + usage) instead of crashing later in runComparison
    // (exit 1 with no JSON on stdout for the Go caller).
    task: pick(f.task, base.task, '').trim(),
    baseline: pick(f.baseline, base.baseline, 'baseline'),
    variant: pick(f.variant, base.variant, 'variant'),
    metric: pick(f.metric, base.metric, '').trim(),
    geneId: pick(f.gene, base.geneId !== undefined ? base.geneId : base.gene, '') || null,
    validationCommands: null,
    timeoutMs: undefined,
  };

  if (f.validation !== undefined && f.validation !== true) {
    opts.validationCommands = String(f.validation).split(';;').map((s) => s.trim()).filter(Boolean);
  } else if (Array.isArray(base.validationCommands)) {
    opts.validationCommands = base.validationCommands.map(String);
  }

  // A --timeout flag carrying a value wins outright; otherwise use the base.
  const timeoutSource = f.timeout !== undefined && f.timeout !== true ? f.timeout : base.timeoutMs;
  const timeoutBlank = timeoutSource == null
    || (typeof timeoutSource === 'string' && timeoutSource.trim() === '');
  const timeoutRaw = timeoutBlank ? NaN : Number(timeoutSource);
  if (Number.isFinite(timeoutRaw)) opts.timeoutMs = timeoutRaw;

  if (!opts.task) return { ok: false, error: 'missing required --task (or "task" in --request-file)' };
  if (!opts.metric) return { ok: false, error: 'missing required --metric (or "metric" in --request-file)' };

  return { ok: true, opts };
}
|
|
98
|
+
|
|
99
|
+
/**
 * Run the comparison and translate its outcome into a CLI-style result.
 *
 * The comparison result is passed through `sanitizePayload` from
 * ../gep/sanitize before it is returned (intended to redact secrets an agent's
 * output may carry -- see that module for the exact rules).
 *
 * Exit codes: 0 for a decided comparison, 3 for `winner === 'inconclusive'`
 * (a structured outcome, not a crash), 1 when anything throws.
 *
 * @param {object} opts from parseExperimentArgs
 * @param {object} [deps] { comparison, agentRunner, geneLoader, sandbox, err }
 * @returns {Promise<object>} { ok: true, data, exitCode } or
 *   { ok: false, error, exitCode: 1 }
 */
async function runExperiment(opts, deps) {
  const injected = deps || {};
  const comparison = injected.comparison || require('./comparison');
  const err = typeof injected.err === 'function'
    ? injected.err
    : (...parts) => console.error(...parts);

  // Forward only the injectable collaborators that were actually supplied.
  const params = { ...opts };
  if (injected.agentRunner) params.agentRunner = injected.agentRunner;
  if (injected.geneLoader) params.geneLoader = injected.geneLoader;
  if (injected.sandbox) params.sandbox = injected.sandbox;

  try {
    const raw = await comparison.runComparison(params);
    const data = require('../gep/sanitize').sanitizePayload(raw);

    if (!data || data.winner !== 'inconclusive') {
      return { ok: true, data, exitCode: 0 };
    }

    // Two distinct causes share the inconclusive verdict. Report which one
    // applies: both arms can be ok when pass-rate was simply never measured.
    const arms = data.arms;
    const bothArmsOk = Boolean(
      arms && arms.baseline && arms.baseline.ok && arms.variant && arms.variant.ok,
    );
    if (bothArmsOk) {
      err('[Experiment] inconclusive: metric resolved to pass-rate but no validation commands were given, so pass-rate was never measured — pass --validation (or pick another --metric)');
    } else {
      err('[Experiment] inconclusive: one or both arms failed — see arms.*.error');
    }
    return { ok: true, data, exitCode: 3 };
  } catch (e) {
    const message = e && e.message ? e.message : String(e);
    err('[Experiment] error: ' + message);
    return { ok: false, error: message, exitCode: 1 };
  }
}
|
|
141
|
+
|
|
142
|
+
/**
 * Build the usage/help text for the `experiment` subcommand.
 *
 * Despite the name, nothing is printed here: the text is returned and the
 * caller decides where to write it.
 *
 * @returns {string} multi-line usage text joined with '\n'
 */
function printExperimentUsage() {
  const synopsis = 'Usage: node index.js experiment --task="..." --metric="..." [flags]';
  const flagHelp = [
    '  --baseline="..."        对照臂标签 (default: baseline)',
    '  --variant="..."         实验臂标签 (default: variant)',
    '  --gene=<geneId>         变体臂复用的基因 id (注入其 strategy)',
    '  --validation="c1;;c2"   自包含 node 校验命令 (通过率评分; ;; 分隔)',
    '  --timeout=<ms>          单臂超时',
    '  --request-file=<path>   JSON 基底 (显式 flag 覆盖之)',
  ];
  const notes = [
    'Runs the same task twice (baseline vs variant-with-gene) via a headless',
    'agent CLI and prints a comparison JSON to stdout. Logs go to stderr.',
    'Env: EVOLVER_EXPERIMENT_AGENT_CMD (default claude),',
    '     EVOLVER_EXPERIMENT_AGENT_ARGS, EVOLVER_EXPERIMENT_TIMEOUT_MS (300000).',
  ];
  return [synopsis, ...flagHelp, '', ...notes].join('\n');
}
|
|
158
|
+
|
|
159
|
+
module.exports = { parseExperimentArgs, runExperiment, printExperimentUsage, parseFlags };
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
// src/experiment/comparison.js
|
|
2
|
+
//
|
|
3
|
+
// Thin orchestrator for a comparative experiment: run the SAME task twice --
|
|
4
|
+
// a baseline arm (plain task) and a variant arm (task + the reused gene's
|
|
5
|
+
// strategy injected) -- through a pluggable agent runner, collect real
|
|
6
|
+
// metrics (duration / rounds / tokens / pass-rate), and emit a versioned
|
|
7
|
+
// comparison result.
|
|
8
|
+
//
|
|
9
|
+
// Design notes:
|
|
10
|
+
// - This module NEVER requires child_process. The agent runner, gene loader,
|
|
11
|
+
// and sandbox runner are all injectable, so unit tests stay deterministic
|
|
12
|
+
// (no LLM, no network, no subprocess). Production defaults are lazy-loaded.
|
|
13
|
+
// - A failed arm never fabricates a score: if either arm is !ok the winner is
|
|
14
|
+
// 'inconclusive' and improvement is null, while still recording whatever
|
|
15
|
+
// partial metrics were captured.
|
|
16
|
+
'use strict';
|
|
17
|
+
|
|
18
|
+
const { deriveMetric, scoreArm, num, round } = require('./metrics');
|
|
19
|
+
|
|
20
|
+
const SCHEMA = 'evolver.experiment.comparison.v1'; // emitted as the `schema` field of every comparison result
|
|
21
|
+
const RESULT_TEXT_CAP = 2000; // max characters of an arm's resultText kept by normalizeArm
|
|
22
|
+
const EPS = 1e-9; // scores differing by no more than this are reported as a tie
|
|
23
|
+
|
|
24
|
+
/**
 * Build the variant-arm prompt: the task followed by the gene's strategy as a
 * numbered list (`1. step`, `2. step`, ...) and a closing instruction.
 *
 * @param {string} task the baseline task text
 * @param {object} [gene] gene whose `strategy` array is injected
 * @returns {string} `task` unchanged when the gene has no non-empty strategy
 *   array; otherwise the augmented prompt
 */
function buildVariantPrompt(task, gene) {
  const strategy = gene ? gene.strategy : undefined;
  if (!Array.isArray(strategy) || strategy.length === 0) {
    return task;
  }

  const header = '\n\n## Reuse the following proven strategy\n';
  const footer = '\n\nApply the strategy above while completing the task.';
  const numberedSteps = strategy
    .map((step, index) => `${index + 1}. ${step}`)
    .join('\n');

  return task + header + numberedSteps + footer;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Coerce whatever the agent runner returned into the canonical arm shape.
 *
 * Optional numeric fields (`tokensTotal`, `passRate`, `exitCode`) count as
 * absent when they are null, undefined or ''. Number(null) and Number('') are
 * both 0, so a plain finiteness check would turn an unknown exit code into a
 * clean 0 and an unmeasured pass-rate into 0.
 *
 * @param {*} label arm label (stringified; null/undefined becomes '')
 * @param {object} [raw] runner result; a missing value yields a failed arm
 *   with zeroed metrics
 * @returns {object} arm record; `passRate` defaults to ok ? 1 : 0,
 *   `tokensTotal` to tokensIn + tokensOut, `exitCode` to null
 */
function normalizeArm(label, raw) {
  const src = raw || {};
  // True only for a value that was supplied AND coerces to a finite number.
  const hasNumber = (v) => v != null && v !== '' && Number.isFinite(Number(v));

  const tokensIn = num(src.tokensIn);
  const tokensOut = num(src.tokensOut);
  const tokensTotal = hasNumber(src.tokensTotal) ? num(src.tokensTotal) : tokensIn + tokensOut;

  return {
    label: String(label == null ? '' : label),
    ok: !!src.ok,
    error: src.error != null ? String(src.error) : null,
    durationMs: num(src.durationMs),
    rounds: num(src.rounds),
    tokensIn,
    tokensOut,
    tokensTotal,
    costUsd: num(src.costUsd),
    passRate: hasNumber(src.passRate) ? num(src.passRate) : (src.ok ? 1 : 0),
    resultText: typeof src.resultText === 'string' ? src.resultText.slice(0, RESULT_TEXT_CAP) : '',
    exitCode: hasNumber(src.exitCode) ? num(src.exitCode) : null,
    timedOut: !!src.timedOut,
  };
}
|
|
59
|
+
|
|
60
|
+
/**
 * Pass-rate for ONE arm: run each validation command with `cwd` set to that
 * arm's workspace and report the fraction whose result has a truthy `ok`.
 *
 * Commands run one at a time. A command that throws counts as a failure and
 * adds a `passrate_command_error: ...` entry to `warnings`.
 *
 * @param {Iterable<string>} commands validation commands
 * @param {string} cwd directory handed to the runner
 * @param {function} runSingleCommand (cmd, { cwd, timeoutMs }) => result with `ok`
 * @param {number} [timeoutMs] per-command timeout forwarded to the runner
 * @param {string[]} warnings collector, mutated in place
 * @returns {Promise<number>} passed / total rounded via round(x, 4); 0 when
 *   there are no commands
 */
async function passRateInDir(commands, cwd, runSingleCommand, timeoutMs, warnings) {
  let okCount = 0;
  let attempted = 0;

  for (const command of commands) {
    attempted += 1;
    try {
      const outcome = await runSingleCommand(command, { cwd, timeoutMs });
      if (outcome && outcome.ok) {
        okCount += 1;
      }
    } catch (e) {
      const reason = e && e.message ? e.message : String(e);
      warnings.push('passrate_command_error: ' + reason);
    }
  }

  if (attempted === 0) {
    return 0;
  }
  return round(okCount / attempted, 4);
}
|
|
79
|
+
|
|
80
|
+
/**
 * Run a two-arm comparison.
 *
 * The baseline arm gets the plain task. The variant arm gets the task plus
 * the strategy of the gene named by `geneId` (see buildVariantPrompt). Arms
 * run sequentially, each in its own sandbox directory, which is removed
 * afterwards even if the arm throws. If either arm is not ok, or the metric
 * is pass-rate and no validation commands were given, the winner is
 * 'inconclusive' and improvement is null.
 *
 * @param {object}   params
 * @param {string}   params.task                  natural-language task (required)
 * @param {string}   [params.baseline='baseline'] label of the control arm
 * @param {string}   [params.variant='variant']   label of the experiment arm
 * @param {string}   params.metric                evaluation metric (required, non-blank)
 * @param {string}   [params.geneId]              id of the gene reused by the variant arm
 * @param {string[]} [params.validationCommands]  self-contained `node <script>` checks
 * @param {number}   [params.timeoutMs]           per-arm timeout; null/blank means "unset"
 * @param {function} [params.agentRunner]         (prompt, opts) => Promise<AgentResult>
 * @param {function} [params.geneLoader]          () => Gene[]
 * @param {object}   [params.sandbox]             { createSandboxDir, cleanupDir, runSingleCommand } (default: sandboxExecutor)
 * @returns {Promise<object>} versioned ComparisonResult (see SCHEMA)
 * @throws {Error} when task or metric is missing/blank
 */
async function runComparison(params) {
  const p = params || {};
  const task = String(p.task == null ? '' : p.task).trim();
  const baseline = p.baseline ? String(p.baseline) : 'baseline';
  const variant = p.variant ? String(p.variant) : 'variant';
  const metric = String(p.metric == null ? '' : p.metric);
  const geneId = p.geneId ? String(p.geneId) : null;
  const validationCommands = Array.isArray(p.validationCommands)
    ? p.validationCommands.filter((c) => typeof c === 'string' && c.trim())
    : null;
  // Number(null) and Number('') are 0: without this guard an unset timeout
  // would be forwarded to the agent runner as an explicit 0 ms.
  const timeoutSupplied = p.timeoutMs != null && String(p.timeoutMs).trim() !== '';
  const timeoutMs = timeoutSupplied && Number.isFinite(Number(p.timeoutMs))
    ? Number(p.timeoutMs)
    : undefined;

  if (!task) throw new Error('task is required');
  // The metric value is passed on verbatim, but whitespace alone is not a metric.
  if (!metric.trim()) throw new Error('metric is required');

  // Production collaborators are loaded lazily so tests can inject fakes.
  const agentRunner = typeof p.agentRunner === 'function'
    ? p.agentRunner
    : require('./agentRunner').runAgentTask;
  const geneLoader = typeof p.geneLoader === 'function'
    ? p.geneLoader
    : require('../gep/assetStore').loadGenes;
  const sandbox = p.sandbox && typeof p.sandbox === 'object'
    ? p.sandbox
    : require('../gep/validator/sandboxExecutor');

  const startedAt = new Date().toISOString();
  const t0 = Date.now();
  const warnings = [];

  const metricInfo = deriveMetric(metric);
  if (!metricInfo.recognized) warnings.push('metric_unrecognized: ' + metric);

  // Look up the reused gene (variant arm). Without a resolved gene the variant
  // prompt is identical to the baseline task, so the two arms are NOT a strategy
  // comparison -- record an explicit warning so identical arms aren't mistaken
  // for one.
  let gene = null;
  if (geneId) {
    let genes = [];
    try {
      genes = geneLoader() || [];
    } catch (e) {
      warnings.push('gene_load_error: ' + (e && e.message ? e.message : String(e)));
    }
    gene = genes.find((g) => g && String(g.id) === geneId) || null;
    if (!gene) warnings.push('gene_not_found: ' + geneId + ' (variant arm equals baseline)');
  } else {
    warnings.push('no_gene: variant arm equals baseline (no strategy injected)');
  }

  const hasValidation = !!(validationCommands && validationCommands.length);
  if (!hasValidation) warnings.push('passrate_degraded_no_validation');

  // Runner identity for `meta`, taken from the first arm that reports it.
  let metaRunner = null;
  let metaCommand = null;

  const runArm = async (label, prompt) => {
    // Each arm runs in its OWN fresh sandbox dir, so the agent works in
    // isolation (never the evolver repo / process.cwd()) and its pass-rate
    // validation reads that arm's own output, not a shared empty directory.
    const workdir = sandbox.createSandboxDir();
    try {
      let raw;
      try {
        raw = await agentRunner(prompt, { timeoutMs, cwd: workdir });
      } catch (e) {
        raw = { ok: false, error: 'agent_runner_threw: ' + (e && e.message ? e.message : String(e)) };
      }
      if (raw) {
        if (metaRunner == null && raw.runnerName) metaRunner = String(raw.runnerName);
        if (metaCommand == null && raw.agentCommand) metaCommand = String(raw.agentCommand);
      }
      const arm = normalizeArm(label, raw);
      if (hasValidation) {
        arm.passRate = await passRateInDir(validationCommands, workdir, sandbox.runSingleCommand, timeoutMs, warnings);
      }
      return arm;
    } finally {
      // Always remove the sandbox, even if something above throws.
      try { sandbox.cleanupDir(workdir); } catch (_) { /* best-effort cleanup */ }
    }
  };

  // Arms run sequentially: two real agent CLIs in parallel would contend for
  // local resources / provider rate limits and muddy the duration metric.
  const armBaseline = await runArm(baseline, task);
  const armVariant = await runArm(variant, buildVariantPrompt(task, gene));

  const baselineScore = scoreArm(armBaseline, metricInfo.metricField);
  const variantScore = scoreArm(armVariant, metricInfo.metricField);

  // Pass-rate is only a real measurement when validation commands ran. Without
  // them it's a synthetic ok?1:0, so a pass-rate comparison would falsely tie
  // (both arms 1.0) — report it as inconclusive instead of a fake tie.
  const passRateNotMeasured = metricInfo.metricField === 'passRate' && !hasValidation;
  let winner;
  let improvement;
  if (!armBaseline.ok || !armVariant.ok || passRateNotMeasured) {
    winner = 'inconclusive';
    improvement = null;
  } else if (Math.abs(baselineScore - variantScore) <= EPS) {
    winner = 'tie';
    improvement = 0;
  } else {
    const variantBetter = metricInfo.lowerIsBetter
      ? variantScore < baselineScore
      : variantScore > baselineScore;
    winner = variantBetter ? 'variant' : 'baseline';
    if (baselineScore === 0) {
      // A relative improvement over a zero baseline is undefined.
      improvement = null;
    } else {
      const ratio = metricInfo.lowerIsBetter
        ? (baselineScore - variantScore) / Math.abs(baselineScore)
        : (variantScore - baselineScore) / Math.abs(baselineScore);
      improvement = round(ratio, 4);
    }
  }

  return {
    schema: SCHEMA,
    task,
    metric,
    metricField: metricInfo.metricField,
    lowerIsBetter: metricInfo.lowerIsBetter,
    scoreUnit: metricInfo.scoreUnit,
    geneId,
    baselineScore,
    variantScore,
    winner,
    improvement,
    arms: { baseline: armBaseline, variant: armVariant },
    meta: {
      runner: metaRunner || 'unknown',
      agentCommand: metaCommand || null,
      startedAt,
      durationMs: Date.now() - t0,
      warnings,
    },
  };
}
|
|
232
|
+
|
|
233
|
+
module.exports = { runComparison, buildVariantPrompt, normalizeArm, SCHEMA };
|