@tangle-network/agent-runtime 0.50.0 → 0.52.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/dist/agent.js +1 -1
  2. package/dist/{chunk-CM2IK7VS.js → chunk-2OU7ZQPD.js} +38 -8
  3. package/dist/chunk-2OU7ZQPD.js.map +1 -0
  4. package/dist/{chunk-OM3YNZIW.js → chunk-4JI4BCBI.js} +5 -360
  5. package/dist/chunk-4JI4BCBI.js.map +1 -0
  6. package/dist/{chunk-NDM5VXZW.js → chunk-7SP2OVYZ.js} +7 -5
  7. package/dist/{chunk-NDM5VXZW.js.map → chunk-7SP2OVYZ.js.map} +1 -1
  8. package/dist/{chunk-RHW75JW5.js → chunk-BERLUBAP.js} +2 -2
  9. package/dist/{chunk-BKAIVNFA.js → chunk-COAVO6QB.js} +3 -3
  10. package/dist/chunk-G3RGMA7C.js +361 -0
  11. package/dist/chunk-G3RGMA7C.js.map +1 -0
  12. package/dist/{chunk-ML4IXGTV.js → chunk-V2K35HF2.js} +2 -2
  13. package/dist/improvement.d.ts +96 -8
  14. package/dist/improvement.js +191 -9
  15. package/dist/improvement.js.map +1 -1
  16. package/dist/index.d.ts +114 -4
  17. package/dist/index.js +144 -18
  18. package/dist/index.js.map +1 -1
  19. package/dist/intelligence.d.ts +423 -0
  20. package/dist/intelligence.js +427 -0
  21. package/dist/intelligence.js.map +1 -0
  22. package/dist/loop-runner-bin.js +4 -3
  23. package/dist/loops.d.ts +2 -1
  24. package/dist/loops.js +3 -1
  25. package/dist/mcp/bin.js +5 -4
  26. package/dist/mcp/bin.js.map +1 -1
  27. package/dist/mcp/index.js +6 -5
  28. package/dist/mcp/index.js.map +1 -1
  29. package/dist/platform.d.ts +120 -62
  30. package/dist/platform.js +68 -26
  31. package/dist/platform.js.map +1 -1
  32. package/dist/runtime.d.ts +47 -8
  33. package/dist/runtime.js +3 -1
  34. package/dist/workflow.js +1 -1
  35. package/package.json +6 -1
  36. package/skills/agent-runtime-adoption/SKILL.md +41 -26
  37. package/skills/build-with-agent-runtime/SKILL.md +143 -0
  38. package/skills/loop-writer/SKILL.md +6 -7
  39. package/dist/chunk-CM2IK7VS.js.map +0 -1
  40. package/dist/chunk-OM3YNZIW.js.map +0 -1
  41. /package/dist/{chunk-RHW75JW5.js.map → chunk-BERLUBAP.js.map} +0 -0
  42. /package/dist/{chunk-BKAIVNFA.js.map → chunk-COAVO6QB.js.map} +0 -0
  43. /package/dist/{chunk-ML4IXGTV.js.map → chunk-V2K35HF2.js.map} +0 -0
@@ -10,24 +10,36 @@ function agenticGenerator(opts = {}) {
10
10
  const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt;
11
11
  const run = opts.runHarness ?? runLocalHarness;
12
12
  const dirty = opts.isDirty ?? worktreeDirty;
13
+ const verify = opts.verify;
13
14
  return {
14
15
  kind: `agentic:${harness}`,
15
16
  async generate({ worktreePath, report, findings, maxShots, signal }) {
16
- let prompt = buildPrompt({ report, findings });
17
+ const basePrompt = buildPrompt({ report, findings });
17
18
  const shots = Math.max(1, maxShots);
19
+ let attemptNote = "";
18
20
  for (let shot = 0; shot < shots; shot++) {
19
21
  if (signal.aborted) break;
20
22
  await run({
21
23
  harness,
22
24
  cwd: worktreePath,
23
- taskPrompt: prompt,
25
+ taskPrompt: attemptNote ? `${basePrompt}
26
+
27
+ ${attemptNote}` : basePrompt,
24
28
  timeoutMs: opts.timeoutMs,
25
29
  signal
26
30
  });
27
- if (dirty(worktreePath)) {
31
+ if (!dirty(worktreePath)) {
32
+ attemptNote = EMPTY_TREE_NOTE;
33
+ continue;
34
+ }
35
+ if (!verify) {
36
+ return { applied: true, summary: summarize(findings) };
37
+ }
38
+ const result = await verify(worktreePath);
39
+ if (result.ok) {
28
40
  return { applied: true, summary: summarize(findings) };
29
41
  }
30
- prompt = refine(prompt);
42
+ attemptNote = failureNote(result.feedback);
31
43
  }
32
44
  return { applied: false, summary: "" };
33
45
  }
@@ -48,10 +60,42 @@ function defaultBuildPrompt(args) {
48
60
  }
49
61
  return lines.join("\n");
50
62
  }
51
- function refine(prompt) {
52
- return `${prompt}
53
-
54
- NOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`;
63
+ var EMPTY_TREE_NOTE = "NOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.";
64
+ function failureNote(feedback) {
65
+ const detail = feedback?.trim();
66
+ return [
67
+ "NOTE: your edits are in the working tree but verification FAILED.",
68
+ "Fix the problem in place \u2014 build on your existing edits, do not revert them.",
69
+ detail ? `Verifier output:
70
+ ${truncate(detail, 4e3)}` : "No verifier detail was captured."
71
+ ].join("\n");
72
+ }
73
+ function commandVerifier(command, args = [], timeoutMs = 3e5) {
74
+ return (worktreePath) => {
75
+ const result = spawnSync(command, args, {
76
+ cwd: worktreePath,
77
+ encoding: "utf-8",
78
+ timeout: timeoutMs
79
+ });
80
+ if (result.signal) {
81
+ return {
82
+ ok: false,
83
+ feedback: `verifier '${command}' killed by ${result.signal} (likely timeout after ${timeoutMs}ms)`
84
+ };
85
+ }
86
+ if (result.error) {
87
+ const code = result.error.code;
88
+ if (code === "ENOENT") {
89
+ throw new Error(
90
+ `commandVerifier: '${command}' not found in PATH (setup bug, not a failed candidate)`
91
+ );
92
+ }
93
+ throw new Error(`commandVerifier: '${command}' failed to spawn: ${result.error.message}`);
94
+ }
95
+ if (result.status === 0) return { ok: true };
96
+ const out = `${result.stdout ?? ""}${result.stderr ?? ""}`.trim();
97
+ return { ok: false, feedback: out.length > 0 ? out : `exit ${result.status}` };
98
+ };
55
99
  }
56
100
  function summarize(findings) {
57
101
  if (findings.length === 0) return "agentic improvement";
@@ -79,6 +123,43 @@ function worktreeDirty(worktreePath) {
79
123
  return result.stdout.trim().length > 0;
80
124
  }
81
125
 
126
+ // src/improvement/build-prompts.ts
127
+ function findingLines(findings) {
128
+ return findings.map((f) => {
129
+ const where = f.subject ? ` [${f.subject}]` : "";
130
+ const action = f.recommended_action ? ` \u2192 ${f.recommended_action}` : "";
131
+ return `- (${f.severity})${where} ${f.claim}${action}`;
132
+ });
133
+ }
134
+ function toolBuildPrompt(args) {
135
+ return [
136
+ "You are building a new TOOL for this codebase to address the gaps below.",
137
+ "Write the tool as a small, self-contained module PLUS tests that exercise it.",
138
+ "The tool must compile and its tests must pass \u2014 they will be run automatically;",
139
+ "if verification fails you will get the error and another attempt. Do not commit;",
140
+ "leave the changes in the working tree.",
141
+ "",
142
+ "Gaps the tool should close:",
143
+ ...findingLines(args.findings)
144
+ ].join("\n");
145
+ }
146
+ function mcpBuildPrompt(args) {
147
+ return [
148
+ "You are building a new MCP SERVER (Model Context Protocol) that exposes",
149
+ "tool(s) addressing the gaps below, so any harness can mount it.",
150
+ "Requirements that WILL be checked by booting the server:",
151
+ "- it starts over stdio and answers the MCP `initialize` handshake,",
152
+ "- `tools/list` returns at least one tool with a valid input schema.",
153
+ "Newline-delimited JSON-RPC 2.0, protocol version 2024-11-05. Include a start",
154
+ "command (e.g. a package.json `start` script or a clear entrypoint). If the",
155
+ "boot-and-probe fails you will get the error and another attempt. Do not",
156
+ "commit; leave the changes in the working tree.",
157
+ "",
158
+ "Capabilities the server should provide:",
159
+ ...findingLines(args.findings)
160
+ ].join("\n");
161
+ }
162
+
82
163
  // src/improvement/improvement-driver.ts
83
164
  function improvementDriver(opts) {
84
165
  const baseRef = opts.baseRef ?? "main";
@@ -127,6 +208,103 @@ function resolveFindings(ctx) {
127
208
  return ctx.findings;
128
209
  }
129
210
 
211
+ // src/improvement/mcp-serve-verifier.ts
212
+ import { spawn } from "child_process";
213
+ import { createInterface } from "readline";
214
+ var PROTOCOL_VERSION = "2024-11-05";
215
+ function mcpServeVerifier(spec) {
216
+ const timeoutMs = spec.timeoutMs ?? 3e4;
217
+ const minTools = spec.minTools ?? 1;
218
+ return (worktreePath) => new Promise((resolve, reject) => {
219
+ const child = spawn(spec.command, spec.args ?? [], {
220
+ cwd: worktreePath,
221
+ stdio: ["pipe", "pipe", "pipe"],
222
+ env: { ...process.env, ...spec.env }
223
+ });
224
+ const stderr = [];
225
+ let settled = false;
226
+ let nextId = 1;
227
+ const initId = nextId++;
228
+ let listId = -1;
229
+ const settle = (fn) => {
230
+ if (settled) return;
231
+ settled = true;
232
+ clearTimeout(timer);
233
+ rl.close();
234
+ child.kill("SIGKILL");
235
+ fn();
236
+ };
237
+ const withStderr = (msg) => stderr.length > 0 ? `${msg}
238
+ stderr:
239
+ ${stderr.join("").slice(-2e3)}` : msg;
240
+ const pass = () => settle(() => resolve({ ok: true }));
241
+ const failCandidate = (msg) => settle(() => resolve({ ok: false, feedback: withStderr(msg) }));
242
+ const setupFault = (err) => settle(() => reject(err));
243
+ const send = (msg) => {
244
+ try {
245
+ child.stdin.write(`${JSON.stringify(msg)}
246
+ `);
247
+ return true;
248
+ } catch (err) {
249
+ failCandidate(`writing to MCP server stdin failed: ${err.message}`);
250
+ return false;
251
+ }
252
+ };
253
+ child.on("error", (err) => {
254
+ const code = err.code;
255
+ setupFault(
256
+ code === "ENOENT" ? new Error(
257
+ `mcpServeVerifier: '${spec.command}' not found in PATH (setup bug, not a failed candidate)`
258
+ ) : new Error(`mcpServeVerifier: '${spec.command}' failed to spawn: ${err.message}`)
259
+ );
260
+ });
261
+ child.on("exit", (code, signal) => {
262
+ failCandidate(`MCP server exited (code ${code}, signal ${signal}) before serving`);
263
+ });
264
+ child.stderr.on("data", (d) => stderr.push(String(d)));
265
+ const rl = createInterface({ input: child.stdout });
266
+ rl.on("line", (line) => {
267
+ let msg;
268
+ try {
269
+ msg = JSON.parse(line);
270
+ } catch {
271
+ return;
272
+ }
273
+ if (!msg || typeof msg !== "object") return;
274
+ if (msg.id === initId) {
275
+ if (msg.error) return failCandidate(`initialize errored: ${JSON.stringify(msg.error)}`);
276
+ if (!send({ jsonrpc: "2.0", method: "notifications/initialized" })) return;
277
+ listId = nextId++;
278
+ send({ jsonrpc: "2.0", id: listId, method: "tools/list" });
279
+ return;
280
+ }
281
+ if (msg.id === listId) {
282
+ if (msg.error) return failCandidate(`tools/list errored: ${JSON.stringify(msg.error)}`);
283
+ const tools = msg.result?.tools;
284
+ if (!Array.isArray(tools)) return failCandidate("tools/list result has no tools array");
285
+ if (tools.length < minTools) {
286
+ return failCandidate(`tools/list returned ${tools.length} tool(s), need >= ${minTools}`);
287
+ }
288
+ return pass();
289
+ }
290
+ });
291
+ const timer = setTimeout(
292
+ () => failCandidate(`MCP server did not complete the handshake within ${timeoutMs}ms`),
293
+ timeoutMs
294
+ );
295
+ send({
296
+ jsonrpc: "2.0",
297
+ id: initId,
298
+ method: "initialize",
299
+ params: {
300
+ protocolVersion: PROTOCOL_VERSION,
301
+ capabilities: {},
302
+ clientInfo: { name: "agent-runtime-mcp-verify", version: "0" }
303
+ }
304
+ });
305
+ });
306
+ }
307
+
130
308
  // src/improvement/reflective-generator.ts
131
309
  import { spawnSync as spawnSync2 } from "child_process";
132
310
  function reflectiveGenerator(opts) {
@@ -155,7 +333,11 @@ function applyPatch(patch, cwd) {
155
333
  }
156
334
  export {
157
335
  agenticGenerator,
336
+ commandVerifier,
158
337
  improvementDriver,
159
- reflectiveGenerator
338
+ mcpBuildPrompt,
339
+ mcpServeVerifier,
340
+ reflectiveGenerator,
341
+ toolBuildPrompt
160
342
  };
161
343
  //# sourceMappingURL=improvement.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/improvement-driver.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change\n * (the worktree stays clean), the generator refines the prompt and retries, up\n * to `maxShots` times. A harness that already changed files returns on shot 1.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n let prompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: prompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n // The worktree IS the signal: if the harness touched files, we have a\n // candidate. We don't trust the harness's stdout — we trust the diff.\n if (dirty(worktreePath)) {\n return { applied: true, summary: summarize(findings) }\n }\n // No change this shot — give the next attempt explicit feedback.\n prompt = refine(prompt)\n }\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nfunction refine(prompt: string): string {\n return `${prompt}\\n\\nNOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, sandbox runLoop;\n * an agent reads code + report and edits)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). */\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. */\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`\n * setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? '' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;AAuBA,SAAS,iBAAiB;AAmBnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAE9B,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,UAAI,SAAS,YAAY,EAAE,QAAQ,SAAS,CAAC;AAC7C,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY;AAAA,UACZ,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,MAAM,YAAY,GAAG;AACvB,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,iBAAS,OAAO,MAAM;AAAA,MACxB;AACA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,OAAO,QAAwB;AACtC,SAAO,GAAG,MAAM;AAAA;AAAA;AAClB;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACvEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;ACrGA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAAU,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}
1
+ {"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/build-prompts.ts","../src/improvement/improvement-driver.ts","../src/improvement/mcp-serve-verifier.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial — a multi-shot verify-in-session loop, NOT the\n * kernel `runLoop`. Each shot runs one full harness session in the (persistent)\n * worktree; between shots the loop refines based on what the last shot produced:\n * - empty tree → \"you changed nothing, make the edits\" → retry\n * - dirty + `verify` fails → feed the verifier's failure into the next shot\n * (the worktree persists, so the harness RESUMES atop its own failing\n * edits with the error in hand — no `--resume` session plumbing needed,\n * and harness-agnostic across claude/codex/opencode)\n * - dirty + `verify` ok (or no verifier configured) → return the candidate\n * A candidate that never verifies within `maxShots` is discarded (`applied:\n * false`), never shipped — if you configured a verifier, a non-passing tree is\n * not a candidate. With no verifier the legacy behavior holds: first dirty shot\n * is the candidate.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\n/** Outcome of verifying a candidate worktree. `feedback` (compiler errors,\n * failing test output) is fed into the next shot when `ok` is false. */\nexport interface VerifyResult {\n ok: boolean\n feedback?: string\n}\n\n/** Verifies the edited worktree. Sync or async; throws only on a setup fault\n * (a candidate that fails verification returns `{ok:false}`, it does not\n * throw). */\nexport type Verifier = (worktreePath: string) => Promise<VerifyResult> | VerifyResult\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Verify the worktree after each dirtying shot. When set, a candidate that\n * fails verification is NOT returned — the failure feeds the next shot\n * (verify-in-session), up to `maxShots`; a candidate that never verifies is\n * discarded (`applied:false`), never shipped. Omitted ⇒ legacy behavior:\n * the first dirty shot is the candidate. See `commandVerifier`. */\n verify?: Verifier\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n const verify = opts.verify\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n const basePrompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n // Feedback appended to the base prompt for the NEXT shot — empty on shot 0.\n let attemptNote = ''\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: attemptNote ? `${basePrompt}\\n\\n${attemptNote}` : basePrompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n\n // The worktree IS the signal: no edits ⇒ tell the next shot to act.\n if (!dirty(worktreePath)) {\n attemptNote = EMPTY_TREE_NOTE\n continue\n }\n\n // Dirty: with no verifier the diff IS the candidate (we trust the diff,\n // not the harness's stdout). With a verifier the candidate must pass it.\n if (!verify) {\n return { applied: true, summary: summarize(findings) }\n }\n const result = await verify(worktreePath)\n if (result.ok) {\n return { applied: true, summary: summarize(findings) }\n }\n // Dirty but failing — resume next shot atop these edits with the error.\n attemptNote = failureNote(result.feedback)\n }\n\n // Shots exhausted: no verified candidate (or, sans verifier, no edits).\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nconst EMPTY_TREE_NOTE =\n 'NOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.'\n\n/** Next-shot feedback when the worktree is dirty but failed verification. The\n * edits persist on disk, so the harness resumes atop them — tell it to fix in\n * place, not start over. Verifier detail is truncated to keep the prompt bounded. */\nfunction failureNote(feedback?: string): string {\n const detail = feedback?.trim()\n return [\n 'NOTE: your edits are in the working tree but verification FAILED.',\n 'Fix the problem in place — build on your existing edits, do not revert them.',\n detail ? `Verifier output:\\n${truncate(detail, 4000)}` : 'No verifier detail was captured.',\n ].join('\\n')\n}\n\n/** A `Verifier` that runs a command in the worktree: exit 0 ⇒ ok, any other\n * exit ⇒ failed with stdout+stderr as feedback. The common case — verify by\n * `tsc --noEmit`, `pnpm build`, or a test command. A timeout is treated as a\n * FAILED candidate (a change that hangs the build is a bad change); a missing\n * binary or spawn fault throws (a setup bug, not a failed candidate — no\n * silent fallback). */\nexport function commandVerifier(\n command: string,\n args: string[] = [],\n timeoutMs = 300_000,\n): Verifier {\n return (worktreePath: string): VerifyResult => {\n const result = spawnSync(command, args, {\n cwd: worktreePath,\n encoding: 'utf-8',\n timeout: timeoutMs,\n })\n if (result.signal) {\n return {\n ok: false,\n feedback: `verifier '${command}' killed by ${result.signal} (likely timeout after ${timeoutMs}ms)`,\n }\n }\n if (result.error) {\n const code = (result.error as NodeJS.ErrnoException).code\n if (code === 'ENOENT') {\n throw new Error(\n `commandVerifier: '${command}' not found in PATH (setup bug, not a failed candidate)`,\n )\n }\n throw new Error(`commandVerifier: '${command}' failed to spawn: ${result.error.message}`)\n }\n if (result.status === 0) return { ok: true }\n const out = `${result.stdout ?? ''}${result.stderr ?? ''}`.trim()\n return { ok: false, feedback: out.length > 0 ? out : `exit ${result.status}` }\n }\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * Build-prompt starting points for the two buildable artifact types. There is\n * NO `toolGenerator`/`mcpGenerator` wrapper — the factory is `agenticGenerator`\n * + a verifier (docs/artifact-lifecycle-frontier.md), so a tool or an MCP\n * server is built by composing the pieces directly:\n *\n * // a tool:\n * agenticGenerator({ buildPrompt: toolBuildPrompt, verify: commandVerifier('pnpm', ['test']) })\n * // an MCP server:\n * agenticGenerator({ buildPrompt: mcpBuildPrompt, verify: mcpServeVerifier({ command: 'node', args: ['server.mjs'] }) })\n *\n * These are the only type-specific bit (the phrasing that points the agent at a\n * tool vs. an MCP); the worktree, resume-on-failure loop, and improvement-loop\n * wrapper are shared. MCP is the load-bearing target — it is how a harness\n * acquires tools; raw tools matter where we control the loader.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\n\ntype FindingsArg = { report: unknown; findings: AnalystFinding[] }\n\nfunction findingLines(findings: AnalystFinding[]): string[] {\n return findings.map((f) => {\n const where = f.subject ? ` [${f.subject}]` : ''\n const action = f.recommended_action ? ` → ${f.recommended_action}` : ''\n return `- (${f.severity})${where} ${f.claim}${action}`\n })\n}\n\nexport function toolBuildPrompt(args: FindingsArg): string {\n return [\n 'You are building a new TOOL for this codebase to address the gaps below.',\n 'Write the tool as a small, self-contained module PLUS tests that exercise it.',\n 'The tool must compile and its tests must pass — they will be run automatically;',\n 'if verification fails you will get the error and another attempt. Do not commit;',\n 'leave the changes in the working tree.',\n '',\n 'Gaps the tool should close:',\n ...findingLines(args.findings),\n ].join('\\n')\n}\n\nexport function mcpBuildPrompt(args: FindingsArg): string {\n return [\n 'You are building a new MCP SERVER (Model Context Protocol) that exposes',\n 'tool(s) addressing the gaps below, so any harness can mount it.',\n 'Requirements that WILL be checked by booting the server:',\n '- it starts over stdio and answers the MCP `initialize` handshake,',\n '- `tools/list` returns at least one tool with a valid input schema.',\n 'Newline-delimited JSON-RPC 2.0, protocol version 2024-11-05. Include a start',\n 'command (e.g. a package.json `start` script or a clear entrypoint). If the',\n 'boot-and-probe fails you will get the error and another attempt. Do not',\n 'commit; leave the changes in the working tree.',\n '',\n 'Capabilities the server should provide:',\n ...findingLines(args.findings),\n ].join('\\n')\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, multi-shot\n * verify-in-session loop; an agent reads code +\n * report, edits, and re-tries on verifier failure)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). */\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. */\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * `mcpServeVerifier` — the intrinsic verifier for a built MCP server: the\n * boot-and-probe checker named in docs/artifact-lifecycle-frontier.md. A\n * generated MCP server is only a candidate if it actually *serves* — so this\n * boots it over stdio (the default local MCP transport) and runs the real\n * handshake: `initialize` → `notifications/initialized` → `tools/list`, and\n * asserts the server answers with at least `minTools` tools.\n *\n * Outcomes follow the `Verifier` contract: a server that fails to start, exits\n * early, errors the handshake, times out, or exposes no tools is a FAILED\n * candidate (`{ok:false}`, fed back into the next generation shot); a missing\n * start binary or spawn fault THROWS (a setup bug, never a silent fallback).\n *\n * Protocol matches the runtime's own stdio MCP server (src/mcp/server.ts):\n * newline-delimited JSON-RPC 2.0, protocol version 2024-11-05.\n */\n\nimport { spawn } from 'node:child_process'\nimport { createInterface } from 'node:readline'\nimport type { Verifier, VerifyResult } from './agentic-generator'\n\nconst PROTOCOL_VERSION = '2024-11-05'\n\nexport interface McpServeSpec {\n /** Command that starts the built MCP server in the worktree (stdio transport). */\n command: string\n args?: string[]\n /** Extra env for the server process (merged over `process.env`). */\n env?: Record<string, string>\n /** Handshake timeout (ms). Default 30s. */\n timeoutMs?: number\n /** Minimum tools the server must expose to pass. Default 1. */\n minTools?: number\n}\n\ninterface JsonRpcResponse {\n jsonrpc?: string\n id?: number | string | null\n result?: unknown\n error?: { code: number; message: string }\n}\n\nexport function mcpServeVerifier(spec: McpServeSpec): Verifier {\n const timeoutMs = spec.timeoutMs ?? 30_000\n const minTools = spec.minTools ?? 1\n\n return (worktreePath: string): Promise<VerifyResult> =>\n new Promise<VerifyResult>((resolve, reject) => {\n const child = spawn(spec.command, spec.args ?? [], {\n cwd: worktreePath,\n stdio: ['pipe', 'pipe', 'pipe'],\n env: { ...process.env, ...spec.env },\n })\n\n const stderr: string[] = []\n let settled = false\n let nextId = 1\n const initId = nextId++\n let listId = -1\n\n const settle = (fn: () => void) => {\n if (settled) return\n settled = true\n clearTimeout(timer)\n rl.close()\n child.kill('SIGKILL')\n fn()\n }\n const withStderr = (msg: string) =>\n stderr.length > 0 ? `${msg}\\nstderr:\\n${stderr.join('').slice(-2000)}` : msg\n const pass = () => settle(() => resolve({ ok: true }))\n const failCandidate = (msg: string) =>\n settle(() => resolve({ ok: false, feedback: withStderr(msg) }))\n const setupFault = (err: Error) => settle(() => reject(err))\n\n const send = (msg: Record<string, unknown>): boolean => {\n try {\n child.stdin.write(`${JSON.stringify(msg)}\\n`)\n return true\n } catch (err) {\n // EPIPE: the server died mid-handshake — a failed candidate, not a fault.\n failCandidate(`writing to MCP server stdin failed: ${(err as Error).message}`)\n return false\n }\n }\n\n child.on('error', (err) => {\n const code = (err as NodeJS.ErrnoException).code\n setupFault(\n code === 'ENOENT'\n ? new Error(\n `mcpServeVerifier: '${spec.command}' not found in PATH (setup bug, not a failed candidate)`,\n )\n : new Error(`mcpServeVerifier: '${spec.command}' failed to spawn: ${err.message}`),\n )\n })\n child.on('exit', (code, signal) => {\n // An exit before the handshake completes is a failed candidate (the\n // server crashed on boot); after we settle, our own SIGKILL fires here.\n failCandidate(`MCP server exited (code ${code}, signal ${signal}) before serving`)\n })\n child.stderr.on('data', (d) => stderr.push(String(d)))\n\n const rl = createInterface({ input: child.stdout })\n rl.on('line', (line) => {\n let msg: JsonRpcResponse | undefined\n try {\n msg = JSON.parse(line) as JsonRpcResponse\n } catch {\n return // servers log to stdout too; skip non-JSON lines\n }\n if (!msg || typeof msg !== 'object') return\n\n if (msg.id === initId) {\n if (msg.error) return failCandidate(`initialize errored: ${JSON.stringify(msg.error)}`)\n if (!send({ jsonrpc: '2.0', method: 'notifications/initialized' })) return\n listId = nextId++\n send({ jsonrpc: '2.0', id: listId, method: 'tools/list' })\n return\n }\n if (msg.id === listId) {\n if (msg.error) return failCandidate(`tools/list errored: ${JSON.stringify(msg.error)}`)\n const tools = (msg.result as { tools?: unknown[] } | undefined)?.tools\n if (!Array.isArray(tools)) return failCandidate('tools/list result has no tools array')\n if (tools.length < minTools) {\n return failCandidate(`tools/list returned ${tools.length} tool(s), need >= ${minTools}`)\n }\n return pass()\n }\n })\n\n const timer = setTimeout(\n () => failCandidate(`MCP server did not complete the handshake within ${timeoutMs}ms`),\n timeoutMs,\n )\n\n send({\n jsonrpc: '2.0',\n id: initId,\n method: 'initialize',\n params: {\n protocolVersion: PROTOCOL_VERSION,\n capabilities: {},\n clientInfo: { name: 'agent-runtime-mcp-verify', version: '0' },\n },\n })\n })\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (a multi-shot verify-in-session loop) is the\n * `shots=N` setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? '' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;AAiCA,SAAS,iBAAiB;AAqCnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAC9B,QAAM,SAAS,KAAK;AAEpB,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,YAAM,aAAa,YAAY,EAAE,QAAQ,SAAS,CAAC;AACnD,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,UAAI,cAAc;AAElB,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY,cAAc,GAAG,UAAU;AAAA;AAAA,EAAO,WAAW,KAAK;AAAA,UAC9D,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,CAAC,MAAM,YAAY,GAAG;AACxB,wBAAc;AACd;AAAA,QACF;AAIA,YAAI,CAAC,QAAQ;AACX,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AACA,cAAM,SAAS,MAAM,OAAO,YAAY;AACxC,YAAI,OAAO,IAAI;AACb,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,sBAAc,YAAY,OAAO,QAAQ;AAAA,MAC3C;AAGA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,IAAM,kBACJ;AAKF,SAAS,YAAY,UAA2B;AAC9C,QAAM,SAAS,UAAU,KAAK;AAC9B,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,SAAS;AAAA,EAAqB,SAAS,QAAQ,GAAI,CAAC,KAAK;AAAA,EAC3D,EAAE,KAAK,IAAI;AACb;AAQO,SAAS,gBACd,SACA,OAAiB,CAAC,GAClB,YAAY,KACF;AACV,SAAO,CAAC,iBAAuC;AAC7C,UAAM,SAAS,UAAU,SAAS,MAAM;AAAA,MACtC,KAAK;AAAA,MACL,UAAU;AAAA,MACV,SAAS;AAAA,IACX,CAAC;AACD,QAAI,OAAO,QAAQ;AACjB,aAAO;AAAA,QACL,IAAI;AAAA,QACJ,UAAU,aAAa,OAAO,eAAe,OAAO,MAAM,0BAA0B,SAAS;AAAA,MAC/F;AAAA,IACF;AACA,QAAI,OAAO,OAAO;AAChB,YAAM,OAAQ,OAAO,MAAgC;AACrD,UAAI,SAAS,UAAU;AACrB,cAAM,IAAI;AAAA,UACR,qBAAqB,OAAO;AAAA,QAC9B;AAAA,MACF;AACA,YAAM,IAAI,MAAM,qBAAqB,OAAO,sBAAsB,OAAO,MAAM,OAAO,EAAE;AAAA,IAC1F;AACA,QAAI,OAAO,WAAW,EAAG,QAAO,EAAE,IAAI,KAAK;AAC3C,UAAM,MAAM,GAAG,OAAO,UAAU,EAAE,GAAG,OAAO,UAAU,EAAE,GAAG,KAAK;AAChE,WAAO,EAAE,IAAI,OAAO,UAAU,IAAI,SAAS,IAAI,MAAM,QAAQ,OAAO,MAAM,GAAG;AAAA,EAC/E;AACF;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACzMA,SAAS,aAAa,UAAsC;AAC1D,SAAO,SAAS,IAAI,CAAC,MAAM;AACzB,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,SAAS,EAAE,qBAAqB,WAAM,EAAE,kBAAkB,KAAK;AACrE,WAAO,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,GAAG,MAAM;AAAA,EACtD,CAAC;AACH;AAEO,SAAS,gBAAgB,MAA2B;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,aAAa,KAAK,QAAQ;AAAA,EAC/B,EAAE,KAAK,IAAI;AACb;AAEO,SAAS,eAAe,MAA2B;AACxD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,aAAa,KAAK,QAAQ;AAAA,EAC/B,EAAE,KAAK,IAAI;AACb;;;ACEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;ACnGA,SAAS,aAAa;AACtB,SAAS,uBAAuB;AAGhC,IAAM,mBAAmB;AAqBlB,SAAS,iBAAiB,MAA8B;AAC7D,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,WAAW,KAAK,YAAY;AAElC,SAAO,CAAC,iBACN,IAAI,QAAsB,CAAC,SAAS,WAAW;AAC7C,UAAM,QAAQ,MAAM,KAAK,SAAS,KAAK,QAAQ,CAAC,GAAG;AAAA,MACjD,KAAK;AAAA,MACL,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,MAC9B,KAAK,EAAE,GAAG,QAAQ,KAAK,GAAG,KAAK,IAAI;AAAA,IACrC,CAAC;AAED,UAAM,SAAmB,CAAC;AAC1B,QAAI,UAAU;AACd,QAAI,SAAS;AACb,UAAM,SAAS;AACf,QAAI,SAAS;AAEb,UAAM,SAAS,CAAC,OAAmB;AACjC,UAAI,QAAS;AACb,gBAAU;AACV,mBAAa,KAAK;AAClB,SAAG,MAAM;AACT,YAAM,KAAK,SAAS;AACpB,SAAG;AAAA,IACL;AACA,UAAM,aAAa,CAAC,QAClB,OAAO,SAAS,IAAI,GAAG,GAAG;AAAA;AAAA,EAAc,OAAO,KAAK,EAAE,EAAE,MAAM,IAAK,CAAC,KAAK;AAC3E,UAAM,OAAO,MAAM,OAAO,MAAM,QAAQ,EAAE,IAAI,KAAK,CAAC,CAAC;AACrD,UAAM,gBAAgB,CAAC,QACrB,OAAO,MAAM,QAAQ,EAAE,IAAI,OAAO,UAAU,WAAW,GAAG,EAAE,CAAC,CAAC;AAChE,UAAM,aAAa,CAAC,QAAe,OAAO,MAAM,OAAO,GAAG,CAAC;AAE3D,UAAM,OAAO,CAAC,QAA0C;AACtD,UAAI;AACF,cAAM,MAAM,MAAM,GAAG,KAAK,UAAU,GAAG,CAAC;AAAA,CAAI;AAC5C,eAAO;AAAA,MACT,SAAS,KAAK;AAEZ,sBAAc,uCAAwC,IAAc,OAAO,EAAE;AAC7E,eAAO;AAAA,MACT;AAAA,IACF;AAEA,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,YAAM,OAAQ,IAA8B;AAC5C;AAAA,QACE,SAAS,WACL,IAAI;AAAA,UACF,sBAAsB,KAAK,OAAO;AAAA,QACpC,IACA,IAAI,MAAM,sBAAsB,KAAK,OAAO,sBAAsB,IAAI,OAAO,EAAE;AAAA,MACrF;AAAA,IACF,CAAC;AACD,UAAM,GAAG,QAAQ,CAAC,MAAM,WAAW;AAGjC,oBAAc,2BAA2B,IAAI,YAAY,MAAM,kBAAkB;AAAA,IACnF,CAAC;AACD,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,OAAO,KAAK,OAAO,CAAC,CAAC,CAAC;AAErD,UAAM,KAAK,gBAAgB,EAAE,OAAO,MAAM,OAAO,CAAC;AAClD,OAAG,GAAG,QAAQ,CAAC,SAAS;AACtB,UAAI;AACJ,UAAI;AACF,cAAM,KAAK,MAAM,IAAI;AAAA,MACvB,QAAQ;AACN;AAAA,MACF;AACA,UAAI,CAAC,OAAO,OAAO,QAAQ,SAAU;AAErC,UAAI,IAAI,OAAO,QAAQ;AACrB,YAAI,IAAI,MAAO,QAAO,cAAc,uBAAuB,KAAK,UAAU,IAAI,KAAK,CAAC,EAAE;AACtF,YAAI,CAAC,KAAK,EAAE,SAAS,OAAO,QAAQ,4BAA4B,CAAC,EAAG;AACpE,iBAAS;AACT,aAAK,EAAE,SAAS,OAAO,IAAI,QAAQ,QAAQ,aAAa,CAAC;AACzD;AAAA,MACF;AACA,UAAI,IAAI,OAAO,QAAQ;AACrB,YAAI,IAAI,MAAO,QAAO,cAAc,uBAAuB,KAAK,UAAU,IAAI,KAAK,CAAC,EAAE;AACtF,cAAM,QAAS,IAAI,QAA8C;AACjE,YAAI,CAAC,MAAM,QAAQ,KAAK,EAAG,QAAO,cAAc,sCAAsC;AACtF,YAAI,MAAM,SAAS,UAAU;AAC3B,iBAAO,cAAc,uBAAuB,MAAM,MAAM,qBAAqB,QAAQ,EAAE;AAAA,QACzF;AACA,eAAO,KAAK;AAAA,MACd;AAAA,IACF,CAAC;AAED,UAAM,QAAQ;AAAA,MACZ,MAAM,cAAc,oDAAoD,SAAS,IAAI;AAAA,MACrF;AAAA,IACF;AAEA,SAAK;AAAA,MACH,SAAS;AAAA,MACT,IAAI;AAAA,MACJ,QAAQ;AAAA,MACR,QAAQ;AAAA,QACN,iBAAiB;AAAA,QACjB,cAAc,CAAC;AAAA,QACf,YAAY,EAAE,MAAM,4BAA4B,SAAS,IAAI;AAAA,MAC/D;AAAA,IACF,CAAC;AAAA,EACH,CAAC;AACL;;;ACrIA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAAU,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}
package/dist/index.d.ts CHANGED
@@ -1,14 +1,14 @@
1
- import { AgentEvalError, KnowledgeReadinessReport, RunRecord, ControlEvalResult, KnowledgeRequirement } from '@tangle-network/agent-eval';
1
+ import { AgentProfile, AgentEvalError, KnowledgeReadinessReport, RunRecord, ControlEvalResult, KnowledgeRequirement } from '@tangle-network/agent-eval';
2
2
  export { AgentEvalError, AgentEvalErrorCode, ConfigError, ControlBudget, ControlDecision, ControlEvalResult, ControlRunResult, ControlStep, DataAcquisitionPlan, JudgeError, KnowledgeReadinessReport, KnowledgeRequirement, NotFoundError, RunRecord, ValidationError } from '@tangle-network/agent-eval';
3
3
  import { h as AgentBackendInput, i as AgentExecutionBackend, c as OpenAIChatTool, j as OpenAIChatToolChoice, k as AgentBackendContext, R as RuntimeStreamEvent, K as KnowledgeReadinessDecision, l as RunAgentTaskOptions, m as AgentTaskRunResult, n as RunAgentTaskStreamOptions, o as AgentRuntimeEvent, p as AgentTaskStatus, q as RuntimeSessionStore, r as RuntimeSession } from './types-BEQsBhOE.js';
4
4
  export { s as AgentAdapter, t as AgentKnowledgeProvider, u as AgentRuntimeEventSink, v as AgentTaskContext, w as AgentTaskSpec, B as BackendErrorDetail, x as RuntimeRunHandle, y as RuntimeRunPersistenceAdapter, z as RuntimeRunRow, C as startRuntimeRun } from './types-BEQsBhOE.js';
5
+ import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
5
6
  export { C as CoderLoopRunnerOptions, D as DELEGATED_LOOP_MODES, a as DelegatedLoopMode, b as DelegatedLoopRegistry, c as DelegatedLoopResult, d as DelegatedLoopRunner, e as DynamicLoopRunnerOptions, L as LoopRunnerCliArgs, f as LoopRunnerCliResult, R as ResearchLoopResult, g as ResearchLoopRunnerOptions, h as RunDelegatedLoopOptions, V as VetoedFact, i as auditLoopRunner, j as coderLoopRunner, k as dynamicLoopRunner, l as isDelegatedLoopMode, p as parseLoopRunnerArgv, r as researchLoopRunner, m as reviewLoopRunner, n as runDelegatedLoop, o as runLoopRunnerCli, s as selfImproveLoopRunner } from './loop-runner-bin-DFUNgpeK.js';
6
7
  export { m as mcpToolsForRuntimeMcp, a as mcpToolsForRuntimeMcpSubset } from './openai-tools-D4HLDWgw.js';
7
8
  export { aD as EvalRunEvent, aE as EvalRunGeneration, aF as EvalRunsExportConfig, aG as EvalRunsExportResult, aH as INTELLIGENCE_WIRE_VERSION, aI as LoopSpanNode, aJ as OtelAttribute, aK as OtelExportConfig, aL as OtelExporter, aM as OtelSpan, aN as buildLoopOtelSpans, aO as buildLoopSpanNodes, aP as createOtelExporter, aQ as exportEvalRuns, aR as loopEventToOtelSpan } from './kb-gate-CHAyt4aI.js';
8
9
  import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
9
10
  export { b as RuntimeDecisionEvidenceRef, c as RuntimeDecisionKind, d as RuntimeDecisionPoint, e as RuntimeHookContext, f as RuntimeHookErrorContext, a as RuntimeHookEvent, g as RuntimeHookPhase, h as RuntimeHookTarget, i as composeRuntimeHooks, j as defineRuntimeHooks, n as notifyRuntimeDecisionPoint, k as notifyRuntimeHookEvent } from './runtime-hooks-C7JwKb9E.js';
10
11
  import '@tangle-network/sandbox';
11
- import '@tangle-network/agent-eval/campaign';
12
12
  import '@tangle-network/agent-eval/contract';
13
13
  import './types-p8dWBIXL.js';
14
14
  import './coder-_YCf3BAK.js';
@@ -801,6 +801,88 @@ declare class SqlConversationJournal implements ConversationJournal {
801
801
  declare function runConversation(conversation: Conversation, options: RunConversationOptions): Promise<ConversationResult>;
802
802
  declare function runConversationStream(conversation: Conversation, options: RunConversationOptions): AsyncIterable<ConversationStreamEvent>;
803
803
 
804
+ /**
805
+ * `runPersonaConversation` — the persona loop runner: run a WORKER `AgentProfile`
806
+ * (the agent under test) as a multi-round conversation driven by a PERSONA (the
807
+ * simulated user), over the persistent conversation transcript.
808
+ *
809
+ * It is profiles-vs-profiles: the persona is itself a driver `AgentProfile` (an
810
+ * LLM role-playing the user from its facts) — `runConversation` runs the two
811
+ * against each other. Scripted persona turns are kept as a deterministic
812
+ * fast-path. Only the WORKER is metered (it is the side under test); the
813
+ * persona-driver is the test harness, not billed against the agent.
814
+ *
815
+ * `runPersonaDispatch` wraps the runner as a `ProfileDispatchFn` so it drops
816
+ * straight into `runProfileMatrix({ dispatch })` — the same loop serves a single
817
+ * cell and the whole matrix, replacing the per-agent hand-rolled
818
+ * `dispatchWithSurface` bridges.
819
+ */
820
+
821
+ /** A persona that drives the conversation: either a full driver `AgentProfile`
822
+ * (an LLM user-sim) or a deterministic script of user turns (the fast-path). */
823
+ type PersonaDriver = {
824
+ kind: 'profile';
825
+ profile: AgentProfile;
826
+ } | {
827
+ kind: 'scripted';
828
+ turns: string[];
829
+ };
830
+ interface RunPersonaConversationOptions {
831
+ /** The agent under test. Metered; its rendered prompt leads its turns. */
832
+ worker: AgentProfile;
833
+ /** The simulated user driving the dialogue. */
834
+ persona: PersonaDriver;
835
+ /** Turn an `AgentProfile` into a runnable backend (router / sandbox / fake).
836
+ * Applied to the worker and to a `profile`-kind persona. */
837
+ backendFor: (profile: AgentProfile, role: 'worker' | 'persona') => AgentExecutionBackend;
838
+ /** Render a profile's system prompt — prepended to that profile's messages. */
839
+ systemPromptOf: (profile: AgentProfile) => string;
840
+ /** Speaker-turn cap. Default for a scripted persona = `2 * turns.length`
841
+ * (worker answers each user turn). REQUIRED for a `profile` persona. */
842
+ maxTurns?: number;
843
+ /** Kickoff message routed to the first speaker (the persona). Default 'Begin.' */
844
+ seed?: string;
845
+ signal?: AbortSignal;
846
+ /** Worker participant / transcript speaker label. Default 'agent'. */
847
+ workerName?: string;
848
+ }
849
+ interface PersonaConversationResult {
850
+ transcript: ConversationTurn[];
851
+ turns: number;
852
+ halted: HaltReason;
853
+ /** Worker-only spend (the side under test). */
854
+ costUsd: number;
855
+ tokensIn: number;
856
+ tokensOut: number;
857
+ }
858
+ /**
859
+ * Run one worker profile against one persona as a multi-round conversation.
860
+ * The persona leads (participant 0): it speaks, the worker answers, repeat,
861
+ * until `maxTurns`. Returns the persistent transcript + worker-only usage.
862
+ */
863
+ declare function runPersonaConversation(opts: RunPersonaConversationOptions): Promise<PersonaConversationResult>;
864
+ interface RunPersonaConfig<TScenario extends Scenario, TArtifact> {
865
+ /** Turn an `AgentProfile` into a runnable backend (router / sandbox / fake). */
866
+ backendFor: (profile: AgentProfile, role: 'worker' | 'persona') => AgentExecutionBackend;
867
+ /** Render a profile's system prompt. */
868
+ systemPromptOf: (profile: AgentProfile) => string;
869
+ /** The persona driving each scenario — a driver profile or scripted turns. */
870
+ personaOf: (scenario: TScenario) => PersonaDriver;
871
+ /** Build the scored artifact from the finished transcript. */
872
+ artifactOf: (transcript: ConversationTurn[], scenario: TScenario) => TArtifact;
873
+ /** Speaker-turn cap (required when a persona is profile-driven). */
874
+ maxTurns?: (scenario: TScenario) => number;
875
+ seed?: (scenario: TScenario) => string;
876
+ workerName?: string;
877
+ }
878
+ /**
879
+ * Wrap {@link runPersonaConversation} as a `ProfileDispatchFn` for
880
+ * `runProfileMatrix`: the profile axis is the worker-under-test, the scenario
881
+ * axis is the persona, and the runner is the cell. Meters the worker through
882
+ * `ctx.cost` so the matrix's backend-integrity guard sees real usage.
883
+ */
884
+ declare function runPersonaDispatch<TScenario extends Scenario, TArtifact>(config: RunPersonaConfig<TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
885
+
804
886
  /**
805
887
  * @stable
806
888
  *
@@ -1332,9 +1414,37 @@ type ToolCallOutcome = {
1332
1414
  message: string;
1333
1415
  status?: number;
1334
1416
  };
1417
+ /** One OpenAI-shaped tool-call entry carried on an assistant message. */
1418
+ interface ToolLoopAssistantToolCall {
1419
+ id: string;
1420
+ type: 'function';
1421
+ function: {
1422
+ name: string;
1423
+ arguments: string;
1424
+ };
1425
+ }
1426
+ /**
1427
+ * A message in the running conversation the loop sends to `streamTurn`.
1428
+ *
1429
+ * The base `{ role, content }` covers `system` / `user` / plain `assistant`
1430
+ * turns. Two optional fields carry the OpenAI function-calling contract so a
1431
+ * strict model (Claude, and any OpenAI-compatible provider that validates tool
1432
+ * history) reads its own tool use back instead of re-issuing the same call:
1433
+ *
1434
+ * - an assistant turn that emitted tool calls carries `tool_calls`, and its
1435
+ * `content` is `null` when the turn was tool-only;
1436
+ * - each tool result is its own `{ role: 'tool', tool_call_id, content }`
1437
+ * message keyed to the call that produced it.
1438
+ *
1439
+ * Widening is additive: a `streamTurn` that reads only `role` + `content` still
1440
+ * works; one that forwards the whole message to an OpenAI-compatible endpoint
1441
+ * now sends correct tool history.
1442
+ */
1335
1443
  type ToolLoopMessage = {
1336
1444
  role: string;
1337
- content: string;
1445
+ content: string | null;
1446
+ tool_calls?: ToolLoopAssistantToolCall[];
1447
+ tool_call_id?: string;
1338
1448
  };
1339
1449
  type ToolLoopEvent = {
1340
1450
  type: 'text';
@@ -1433,4 +1543,4 @@ interface StreamToolLoopOptions<Raw> {
1433
1543
  * `capped` if it stops for any non-completed reason with calls still pending. */
1434
1544
  declare function streamToolLoop<Raw>(opts: StreamToolLoopOptions<Raw>): AsyncGenerator<StreamToolLoopYield<Raw>, void, unknown>;
1435
1545
 
1436
- export { AgentBackendContext, AgentBackendInput, AgentExecutionBackend, AgentRuntimeEvent, AgentTaskRunResult, AgentTaskStatus, type AuthSource, type BackendCallPolicy, BackendTransportError, type ChatStreamEvent, type ChatTurnHooks, type ChatTurnIdentity, type ChatTurnProducer, type ChatTurnResult, type CircuitBreakerConfig, CircuitBreakerState, CircuitOpenError, type Conversation, type ConversationDriveState, type ConversationJournal, type ConversationJournalEntry, type ConversationParticipant, type ConversationPolicy, type ConversationResult, type ConversationStreamEvent, type ConversationTurn, type D1DatabaseLike, type D1StmtLike, DEFAULT_MAX_DEPTH, DEFAULT_ROUTER_BASE_URL, DeadlineExceededError, FORWARD_HEADERS, FileConversationJournal, type ForwardHeaderName, type HaltContext, type HaltPredicate, type HaltReason, type HaltSignal, InMemoryConversationJournal, InMemoryRuntimeSessionStore, type ModelInfo, OpenAIChatTool, OpenAIChatToolChoice, PlannerError, type PropagatedHeaders, type ResolvedChatModel, type RetryBackoff, type RetryableErrorPredicate, type RouterEnv, type RunChatTurnInput, type RunConversationOptions, type RunToolLoopOptions, type RuntimeEventCollector, RuntimeHooks, RuntimeRunStateError, RuntimeSessionStore, RuntimeStreamEvent, type RuntimeStreamEventCollector, type RuntimeTelemetryOptions, type SanitizedKnowledgeReadinessReport, type SqlAdapter, SqlConversationJournal, type StreamToolLoopOptions, type StreamToolLoopYield, type ToolCallOutcome, type ToolLoopCall, type ToolLoopEvent, type ToolLoopMessage, type ToolLoopResult, type ToolLoopStopReason, type TurnOrder, applyRunRecordDefaults, buildForwardHeaders, cleanModelId, computeBackoff, createConversationBackend, createIterableBackend, createOpenAICompatibleBackend, createRuntimeEventCollector, createRuntimeStreamEventCollector, createSandboxPromptBackend, d1ToSqlAdapter, decideKnowledgeReadiness, defaultIsRetryable, defineConversation, deriveExecutionId, getModels, handleChatTurn, isDepthExceeded, makePerAttemptSignal, readDepth, readinessServerSentEvent, resolveChatModel, resolveRouterBaseUrl, runAgentTask, runAgentTaskStream, runConversation, runConversationStream, runToolLoop, runtimeStreamServerSentEvent, sanitizeAgentRuntimeEvent, sanitizeKnowledgeReadinessReport, sanitizeRuntimeStreamEvent, sleep, slugifySpeaker, streamToolLoop, turnId, validateChatModelId };
1546
+ export { AgentBackendContext, AgentBackendInput, AgentExecutionBackend, AgentRuntimeEvent, AgentTaskRunResult, AgentTaskStatus, type AuthSource, type BackendCallPolicy, BackendTransportError, type ChatStreamEvent, type ChatTurnHooks, type ChatTurnIdentity, type ChatTurnProducer, type ChatTurnResult, type CircuitBreakerConfig, CircuitBreakerState, CircuitOpenError, type Conversation, type ConversationDriveState, type ConversationJournal, type ConversationJournalEntry, type ConversationParticipant, type ConversationPolicy, type ConversationResult, type ConversationStreamEvent, type ConversationTurn, type D1DatabaseLike, type D1StmtLike, DEFAULT_MAX_DEPTH, DEFAULT_ROUTER_BASE_URL, DeadlineExceededError, FORWARD_HEADERS, FileConversationJournal, type ForwardHeaderName, type HaltContext, type HaltPredicate, type HaltReason, type HaltSignal, InMemoryConversationJournal, InMemoryRuntimeSessionStore, type ModelInfo, OpenAIChatTool, OpenAIChatToolChoice, type PersonaConversationResult, type PersonaDriver, PlannerError, type PropagatedHeaders, type ResolvedChatModel, type RetryBackoff, type RetryableErrorPredicate, type RouterEnv, type RunChatTurnInput, type RunConversationOptions, type RunPersonaConfig, type RunPersonaConversationOptions, type RunToolLoopOptions, type RuntimeEventCollector, RuntimeHooks, RuntimeRunStateError, RuntimeSessionStore, RuntimeStreamEvent, type RuntimeStreamEventCollector, type RuntimeTelemetryOptions, type SanitizedKnowledgeReadinessReport, type SqlAdapter, SqlConversationJournal, type StreamToolLoopOptions, type StreamToolLoopYield, type ToolCallOutcome, type ToolLoopAssistantToolCall, type ToolLoopCall, type ToolLoopEvent, type ToolLoopMessage, type ToolLoopResult, type ToolLoopStopReason, type TurnOrder, applyRunRecordDefaults, buildForwardHeaders, cleanModelId, computeBackoff, createConversationBackend, createIterableBackend, createOpenAICompatibleBackend, createRuntimeEventCollector, createRuntimeStreamEventCollector, createSandboxPromptBackend, d1ToSqlAdapter, decideKnowledgeReadiness, defaultIsRetryable, defineConversation, deriveExecutionId, getModels, handleChatTurn, isDepthExceeded, makePerAttemptSignal, readDepth, readinessServerSentEvent, resolveChatModel, resolveRouterBaseUrl, runAgentTask, runAgentTaskStream, runConversation, runConversationStream, runPersonaConversation, runPersonaDispatch, runToolLoop, runtimeStreamServerSentEvent, sanitizeAgentRuntimeEvent, sanitizeKnowledgeReadinessReport, sanitizeRuntimeStreamEvent, sleep, slugifySpeaker, streamToolLoop, turnId, validateChatModelId };