@tangle-network/agent-runtime 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,207 @@
1
+ import { FindingSubject, AnalystFinding } from '@tangle-network/agent-eval';
2
+ import { I as ImprovementAdapter } from './types-D_MXrmJP.js';
3
+
4
+ /**
5
+ * `AgentSurfaces` — declarative map of the mutable file/directory paths
6
+ * the self-improvement loop can edit on behalf of an agent.
7
+ *
8
+ * The substrate uses this map to resolve every parsed `FindingSubject`
9
+ * (from agent-eval) to a real on-disk path. No per-vertical glue;
10
+ * no fabricated paths; no silent `existsSync(...)` skips that hide
11
+ * misconfiguration from the operator.
12
+ *
13
+ * Surfaces are validated at `defineAgent` time — missing paths fail
14
+ * loud with a list of every offender. A surface that's not needed
15
+ * (e.g. an agent with no RAG corpora) is simply omitted; the loop
16
+ * refuses to route those subjects rather than fabricating a target.
17
+ */
18
+
19
+ /**
20
+ * Surface declarations. Every path is repo-relative (or absolute) at
21
+ * `defineAgent` time. At resolution time, paths are joined against the
22
+ * agent's `repoRoot`.
23
+ *
24
+ * `systemPrompt`, `tools`, `personas` are DIRECTORIES; the loop appends
25
+ * `<section>.md`, `<tool>/README.md`, `<persona-id>.yaml` etc.
26
+ * `rubric`, `outputSchema` are SINGLE FILES; the loop edits them in
27
+ * place.
28
+ *
29
+ * `knowledge` is the agent-knowledge root (typically `.agent-knowledge`);
30
+ * `applyKnowledgeWriteBlocks` writes pages relative to it.
31
+ *
32
+ * Optional surfaces (`scaffolding`, `memory`, `rag`, `outputSchema`)
33
+ * can be omitted — the loop will reject findings targeting them with a
34
+ * clear log message instead of fabricating a path.
35
+ */
36
interface AgentSurfaces {
  /** Directory holding the system prompt, split into one markdown file per section. */
  systemPrompt: string;
  /** Directory with one sub-directory per tool, each carrying a `README.md`. */
  tools: string;
  /** Path to the single TypeScript module that defines the rubric weights and dimensions. */
  rubric: string;
  /** Root of the agent knowledge base (typically `.agent-knowledge`). */
  knowledge: string;
  /** Directory holding one YAML or JSON file per persona. */
  personas: string;
  /** Optional directory of scaffolding rules (precondition checks, retry policies). */
  scaffolding?: string;
  /** Optional path to the memory store (JSONL, SQLite, or another DB). */
  memory?: string;
  /** Optional directory of RAG corpora laid out as `<corpus>/<doc-id>.md`. */
  rag?: string;
  /** Optional path to the single file that defines the output schema (Zod or JSON Schema). */
  outputSchema?: string;
}
56
interface ResolvedSurface {
  /** Absolute path on the filesystem, usable directly with `cat` / `vim`. */
  absolutePath: string;
  /** The same target expressed relative to the repo, for PR descriptions, diffs and audit logs. */
  repoRelativePath: string;
  /** True when something is currently present at the path on disk. */
  exists: boolean;
  /** What the substrate intends to do: modify a file that exists, or create a new one. */
  intent: 'edit-existing' | 'create-new';
}
66
+ /**
67
+ * Resolve a parsed `FindingSubject` to the file path the substrate
68
+ * should edit (or create) on disk.
69
+ *
70
+ * Returns `null` when:
71
+ * - the subject targets a surface the agent didn't declare
72
+ * (e.g. `rag:*` when `surfaces.rag` is undefined), OR
73
+ * - the subject is a `cluster` (failure-mode emits these as evidence,
74
+ * not actionable mutations — they don't route to a file).
75
+ *
76
+ * Returns a `ResolvedSurface` with `intent: 'create-new'` when the
77
+ * subject names a path that doesn't yet exist (e.g. a new wiki page).
78
+ * The caller chooses whether to honour the create — for tightly-managed
79
+ * surfaces like `systemPrompt` it's usually a contract violation
80
+ * (the analyst named a section that doesn't exist); for `knowledge`
81
+ * it's the whole point.
82
+ */
83
declare function resolveSubjectPath(
  subject: FindingSubject,
  surfaces: AgentSurfaces,
  repoRoot: string,
): ResolvedSurface | null;
84
/**
 * One problem reported by `validateSurfaces` for a declared surface.
 */
interface SurfaceValidationIssue {
  /** The `AgentSurfaces` key the offending path was declared under. */
  surface: keyof AgentSurfaces;
  /**
   * The offending path.
   * NOTE(review): whether this is absolute or as-declared is not visible in
   * this declaration file — confirm against the implementation.
   */
  path: string;
  /**
   * Why validation failed. `missing` is described by the `validateSurfaces`
   * docs (the surface does not exist on disk); `not-directory` / `not-file`
   * presumably mean the path exists but is the wrong kind for that surface.
   */
  reason: 'missing' | 'not-directory' | 'not-file';
}
/**
 * Validate that every declared surface exists on disk under `repoRoot`.
 *
 * Returns an array of `SurfaceValidationIssue` — empty when all required
 * surfaces resolve. `defineAgent` throws with the issues rendered, so
 * a misconfigured manifest fails at startup (not at the first finding
 * the loop produces 20 minutes later).
 */
declare function validateSurfaces(
  surfaces: AgentSurfaces,
  repoRoot: string,
): ReadonlyArray<SurfaceValidationIssue>;
/**
 * Render validation issues as text; per the `validateSurfaces` contract,
 * `defineAgent` throws with this rendering.
 * NOTE(review): the output format and how `repoRoot` is used are not visible
 * in this declaration file.
 */
declare function renderSurfaceIssues(
  issues: ReadonlyArray<SurfaceValidationIssue>,
  repoRoot: string,
): string;
99
+
100
+ /**
101
+ * Substrate-default `ImprovementAdapter` — surfaces-driven, LLM-drafted
102
+ * patches, optional auto-apply or PR-open.
103
+ *
104
+ * This is the one ImprovementAdapter every vertical agent uses. The
105
+ * substrate parses each finding's `subject` via
106
+ * `parseFindingSubject` (agent-eval), resolves it to a real file path
107
+ * via the agent's `AgentSurfaces`, reads the current content, and asks
108
+ * an LLM to draft a unified-diff patch given the finding + current
109
+ * content + per-kind editing-discipline rules.
110
+ *
111
+ * Auto-apply gates on the source-finding's confidence and the
112
+ * autoApply.improvement policy. Two modes:
113
+ * `write` — apply the patch in-place via `git apply -p0`. Operator
114
+ * reviews via `git diff`.
115
+ * `open-pr` — write to a branch, commit, push, open a PR via `gh`.
116
+ * Operator reviews via the PR UI.
117
+ *
118
+ * Fail-loud rules:
119
+ * - Findings whose subject doesn't parse → counted in `errors`.
120
+ * - Findings whose subject targets an undeclared surface → counted in
121
+ * `errors` with the offending kind in the message.
122
+ * - Findings whose target path doesn't exist AND the kind isn't a
123
+ * create-new variant (`new-tool`, `knowledge.wiki`) → counted in
124
+ * `errors` with the resolved path in the message.
125
+ * - LLM drafts that fail JSON-schema validation → counted in
126
+ * `errors` with the schema issue.
127
+ *
128
+ * No silent skips. Every dropped finding has a recorded reason the
129
+ * loop's report surfaces.
130
+ */
131
+
132
interface SurfaceImprovementEdit {
  /** Stable identifier derived from the source finding, which keeps re-proposals idempotent. */
  id: string;
  /** Id of the finding this edit came from; used for revert and the audit trail. */
  sourceFindingId: string;
  /** The already-parsed subject, carried along so the apply step need not parse it again. */
  subject: FindingSubject;
  /** The on-disk target the subject resolved to. */
  target: ResolvedSurface;
  /** SHA-256 of the file content the patch was drafted against. */
  baseSha256: string;
  /** The LLM-drafted unified diff (relative to `target.absolutePath`). */
  patch: string;
  /** Single-line summary shown to the operator in the report / PR title. */
  summary: string;
  /** Multi-line rationale for the PR body: finding context plus the LLM's reasoning. */
  rationale: string;
  /** Confidence carried over from the finding so the apply gate can compare it with the threshold. */
  confidence: number;
  /** Severity carried over from the finding, for prioritization. */
  severity: AnalystFinding['severity'];
}
154
interface CreateSurfaceImprovementAdapterOpts {
  surfaces: AgentSurfaces;
  repoRoot: string;
  /**
   * Callback that drafts the patch with an LLM. It receives the finding, the
   * current file content and the resolved target, and returns a unified-diff
   * patch together with a summary and a rationale.
   *
   * Required: the substrate ships no hardcoded prompt. The agent author
   * chooses the model through this callback (Haiku for cheap routine drafts,
   * Sonnet for substantive prompt rewrites, etc.).
   */
  draftPatch: (input: DraftPatchInput) => Promise<DraftPatchOutput>;
  /**
   * How accepted edits are applied:
   *   `write`   — `git apply` in-place; the operator reviews via `git diff`
   *   `open-pr` — branch + commit + push + `gh pr create`
   *   `none`    — never apply; proposals are collected for the report only
   *
   * The `apply` method honours this even when the loop calls it. The
   * effective behaviour is additionally gated on the per-finding confidence
   * threshold through `runAnalystLoop`'s `autoApply` policy.
   */
  mode?: 'write' | 'open-pr' | 'none';
  /** Base branch that new PRs target when `mode === 'open-pr'`. Default: `main`. */
  baseBranch?: string;
  /** GH owner/repo (`tangle-network/tax-agent`); required when `mode === 'open-pr'`. */
  ghRepo?: string;
  /**
   * Subject kinds for which the substrate may CREATE the file when the
   * resolved target does not exist (for `knowledge.wiki`, `new-tool`
   * subjects). Default true for those kinds, false for `system-prompt` /
   * `rubric` / etc. — a named section that does not exist is a contract
   * violation, not a scaffolding opportunity.
   */
  allowCreateForKinds?: ReadonlyArray<FindingSubject['kind']>;
}
190
interface DraftPatchInput {
  finding: AnalystFinding;
  subject: FindingSubject;
  target: ResolvedSurface;
  /** Content of the file as it currently stands; the empty string when `intent === 'create-new'`. */
  currentContent: string;
}
197
interface DraftPatchOutput {
  /** Unified diff against the current file content; returning an empty string skips this finding. */
  patch: string;
  /** Single-line summary for the operator. */
  summary: string;
  /** Multi-line rationale used as the PR body. */
  rationale: string;
}
205
/**
 * Build the substrate-default, surfaces-driven `ImprovementAdapter`.
 *
 * @param opts Declared surfaces, repo root, the required `draftPatch` callback
 *   and the optional apply-mode settings.
 * @returns An adapter whose edits are `SurfaceImprovementEdit` records.
 */
declare function createSurfaceImprovementAdapter(
  opts: CreateSurfaceImprovementAdapterOpts,
): ImprovementAdapter<SurfaceImprovementEdit>;
206
+
207
+ export { type AgentSurfaces as A, type CreateSurfaceImprovementAdapterOpts as C, type DraftPatchInput as D, type ResolvedSurface as R, type SurfaceImprovementEdit as S, type DraftPatchOutput as a, type SurfaceValidationIssue as b, createSurfaceImprovementAdapter as c, resolveSubjectPath as d, renderSurfaceIssues as r, validateSurfaces as v };
@@ -0,0 +1,120 @@
1
+ import { AnalystFinding } from '@tangle-network/agent-eval';
2
+ import { L as LocalHarness, r as runLocalHarness } from './local-harness-KrdFTY5R.js';
3
+ import { LabeledScenarioStore, WorktreeAdapter, ImprovementDriver } from '@tangle-network/agent-eval/campaign';
4
+ import { S as SurfaceImprovementEdit } from './improvement-adapter-CaZxFxTd.js';
5
+ import { I as ImprovementAdapter } from './types-D_MXrmJP.js';
6
+ import 'node:child_process';
7
+
8
+ /**
9
+ * @experimental
10
+ *
11
+ * `improvementDriver` — the ONE reflective/agentic improvement driver for
12
+ * agent-eval's improvement loop. It implements `ImprovementDriver` and owns
13
+ * the candidate lifecycle (worktree create → generate → finalize/discard,
14
+ * × populationSize); it delegates the only thing that genuinely varies — HOW
15
+ * a candidate change is produced — to a pluggable `CandidateGenerator`.
16
+ *
17
+ * There is no separate "analyst driver" vs "autoresearch driver": those are
18
+ * the SAME driver at two settings of a dial.
19
+ * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;
20
+ * applies pre-drafted patches)
21
+ * - full agentic path → `agenticGenerator` (shots=N, local coding harness;
22
+ * an agent reads code + report and edits)
23
+ * Both emit changes into a worktree the driver finalizes into a
24
+ * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See
25
+ * agent-eval's `docs/design/self-improvement-engine.md`.
26
+ */
27
+
28
+ /** The byte-producing seam — the ONE thing that differs between the cheap
29
+ * reflective path and the full agentic path. A generator makes (uncommitted)
30
+ * changes inside `worktreePath`; the driver commits them via the worktree
31
+ * adapter's `finalize`. */
32
interface CandidateGenerator {
  kind: string;
  generate(args: {
    /** Candidate worktree: a fresh checkout of baseRef, and the place changes must be written. */
    worktreePath: string;
    /** The Phase-2 research report (analyst findings + diff); opaque to the generator contract. */
    report: unknown;
    /** Findings taken from the report or, failing that, from the loop context. */
    findings: AnalystFinding[];
    /** Handle to all captured data, available to ground the change. */
    dataset?: LabeledScenarioStore;
    /** DEPTH: upper bound on iterations the generator may take. The agentic
     *  generator uses it; the reflective generator ignores it. */
    maxShots: number;
    signal: AbortSignal;
  }): Promise<{
    applied: boolean;
    summary: string;
  }>;
}
52
interface ImprovementDriverOptions {
  worktree: WorktreeAdapter;
  generator: CandidateGenerator;
  /** Ref that candidate worktrees are forked from. Default `main`. */
  baseRef?: string;
}
58
/**
 * Create the improvement driver: it owns the candidate worktree lifecycle and
 * delegates producing each change to `opts.generator`.
 *
 * @param opts Worktree adapter, candidate generator, and optional base ref.
 * @returns An `ImprovementDriver` over `AnalystFinding`s.
 */
declare function improvementDriver(
  opts: ImprovementDriverOptions,
): ImprovementDriver<AnalystFinding>;
59
+
60
+ /**
61
+ * @experimental
62
+ *
63
+ * `agenticGenerator` — the full-agentic `CandidateGenerator`: the
64
+ * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real
65
+ * coding harness (claude / codex / opencode) inside the candidate worktree the
66
+ * driver already created, letting the agent read the codebase + the research
67
+ * report and make the change in place. The driver then commits the worktree
68
+ * into a `CodeSurface`.
69
+ *
70
+ * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the
71
+ * harness as a subprocess with `cwd` = the worktree, on the same filesystem,
72
+ * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is
73
+ * the verified primitive. The OUTER sandbox is the improvement loop's own
74
+ * execution context; the generator does not nest a second sandbox per
75
+ * candidate (which would reintroduce a host↔sandbox worktree-transport
76
+ * problem that does not need solving here).
77
+ *
78
+ * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change
79
+ * (the worktree stays clean), the generator refines the prompt and retries, up
80
+ * to `maxShots` times. A harness that already changed files returns on shot 1.
81
+ */
82
+
83
interface AgenticGeneratorOptions {
  /** Which local coding harness runs in the worktree. Default `claude`. */
  harness?: LocalHarness;
  /** Wall-clock timeout per shot, in ms. Default = `runLocalHarness` default (5m). */
  timeoutMs?: number;
  /** Builds the harness task prompt from the report + findings. Override it
   *  for domain phrasing; the default turns findings into a concrete coder task. */
  buildPrompt?: (args: {
    report: unknown;
    findings: AnalystFinding[];
  }) => string;
  /** Test seam: injects the harness runner (defaults to `runLocalHarness`). */
  runHarness?: typeof runLocalHarness;
  /** Test seam: injects the worktree-dirty check (defaults to `git status`). */
  isDirty?: (worktreePath: string) => boolean;
}
99
/**
 * Create the full-agentic `CandidateGenerator`, which runs a local coding
 * harness inside the candidate worktree.
 *
 * @param opts Optional harness, timeout, prompt builder and test seams.
 */
declare function agenticGenerator(
  opts?: AgenticGeneratorOptions,
): CandidateGenerator;
100
+
101
+ /**
102
+ * @experimental
103
+ *
104
+ * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts
105
+ * surface edits via the existing improvement adapter (`proposeFromFindings`,
106
+ * one LLM patch per finding) and applies them as ONE coherent improvement into
107
+ * the candidate worktree. `maxShots` is ignored — reflection is single-shot by
108
+ * construction (the patches are already drafted).
109
+ *
110
+ * This is the `shots=1, sandbox=off` setting of the one improvement driver.
111
+ * The `agenticGenerator` (local coding harness in the worktree) is the `shots=N, sandbox=on`
112
+ * setting — both plug into the same `improvementDriver`.
113
+ */
114
+
115
interface ReflectiveGeneratorOptions {
  /** Adapter whose `proposeFromFindings` drafts the surface edits to apply. */
  improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>;
}
118
/**
 * Create the cheap, single-shot `CandidateGenerator` that applies patches
 * drafted by `opts.improvementAdapter` into the candidate worktree.
 *
 * @param opts The improvement adapter used to draft surface edits.
 */
declare function reflectiveGenerator(
  opts: ReflectiveGeneratorOptions,
): CandidateGenerator;
119
+
120
+ export { type AgenticGeneratorOptions, type CandidateGenerator, type ImprovementDriverOptions, type ReflectiveGeneratorOptions, agenticGenerator, improvementDriver, reflectiveGenerator };
@@ -0,0 +1,161 @@
1
+ import {
2
+ runLocalHarness
3
+ } from "./chunk-GLR25NG7.js";
4
+ import "./chunk-DGUM43GV.js";
5
+
6
+ // src/improvement/agentic-generator.ts
7
+ import { spawnSync } from "child_process";
8
+ function agenticGenerator(opts = {}) {
9
+ const harness = opts.harness ?? "claude";
10
+ const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt;
11
+ const run = opts.runHarness ?? runLocalHarness;
12
+ const dirty = opts.isDirty ?? worktreeDirty;
13
+ return {
14
+ kind: `agentic:${harness}`,
15
+ async generate({ worktreePath, report, findings, maxShots, signal }) {
16
+ let prompt = buildPrompt({ report, findings });
17
+ const shots = Math.max(1, maxShots);
18
+ for (let shot = 0; shot < shots; shot++) {
19
+ if (signal.aborted) break;
20
+ await run({
21
+ harness,
22
+ cwd: worktreePath,
23
+ taskPrompt: prompt,
24
+ timeoutMs: opts.timeoutMs,
25
+ signal
26
+ });
27
+ if (dirty(worktreePath)) {
28
+ return { applied: true, summary: summarize(findings) };
29
+ }
30
+ prompt = refine(prompt);
31
+ }
32
+ return { applied: false, summary: "" };
33
+ }
34
+ };
35
+ }
36
+ function defaultBuildPrompt(args) {
37
+ const lines = [
38
+ "You are improving this codebase based on an evaluation analysis.",
39
+ "Make the smallest set of edits that addresses the findings below, then stop.",
40
+ "Do not change unrelated code. Do not commit \u2014 leave changes in the working tree.",
41
+ "",
42
+ "Findings:"
43
+ ];
44
+ for (const f of args.findings) {
45
+ const where = f.subject ? ` [${f.subject}]` : "";
46
+ lines.push(`- (${f.severity})${where} ${f.claim}`);
47
+ if (f.recommended_action) lines.push(` \u2192 ${f.recommended_action}`);
48
+ }
49
+ return lines.join("\n");
50
+ }
51
/**
 * Append retry feedback to a prompt after a shot that changed nothing.
 *
 * @param prompt The prompt used for the previous shot.
 * @returns The same prompt followed by a blank line and the nudge note.
 */
function refine(prompt) {
  const nudge = "NOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.";
  return `${prompt}\n\n${nudge}`;
}
56
/**
 * One-line summary derived from the findings a candidate addressed.
 *
 * @param findings Findings handed to the generator.
 * @returns A generic label for no findings, the (truncated) claim for exactly
 *   one finding, or a count for several.
 */
function summarize(findings) {
  switch (findings.length) {
    case 0:
      return "agentic improvement";
    case 1:
      return `agentic: ${truncate(findings[0].claim, 64)}`;
    default:
      return `agentic: ${findings.length} findings addressed`;
  }
}
61
/**
 * Shorten `s` to at most `n` UTF-16 code units, ending with an ellipsis when
 * anything was cut.
 *
 * @param s Text to shorten.
 * @param n Maximum length of the result, in UTF-16 code units.
 * @returns `s` unchanged when it already fits; otherwise a prefix of `s`
 *   followed by `…`. Returns an empty string when `n <= 0` and `s` does not fit.
 */
function truncate(s, n) {
  if (s.length <= n) return s;
  // A non-positive budget cannot even hold the ellipsis; without this guard
  // `slice(0, n - 1)` would count from the END of the string.
  if (n <= 0) return "";
  let cut = n - 1;
  // If the last kept unit is a high surrogate, its low half is being cut off;
  // drop it too so the result never contains a lone surrogate.
  if (cut > 0) {
    const last = s.charCodeAt(cut - 1);
    if (last >= 0xd800 && last <= 0xdbff) cut -= 1;
  }
  return `${s.slice(0, cut)}\u2026`;
}
64
/**
 * Report whether the worktree has pending changes, by running
 * `git status --porcelain` in it.
 *
 * @param worktreePath Directory to run git in.
 * @returns `true` when git printed anything other than whitespace.
 * @throws Error when git cannot be spawned or exits with a non-zero status;
 *   a git failure is never folded into "clean".
 */
function worktreeDirty(worktreePath) {
  const { error, status, stdout, stderr } = spawnSync("git", ["status", "--porcelain"], {
    cwd: worktreePath,
    encoding: "utf-8"
  });
  if (error) {
    const reason = error.message;
    throw new Error(`agenticGenerator: git status failed to spawn in ${worktreePath}: ${reason}`);
  }
  if (status !== 0) {
    const detail = stderr.trim();
    throw new Error(`agenticGenerator: git status exited ${status} in ${worktreePath}: ${detail}`);
  }
  return stdout.trim() !== "";
}
81
+
82
+ // src/improvement/improvement-driver.ts
83
/**
 * Create the improvement driver. For each of `ctx.populationSize` candidates
 * it creates a worktree, asks the generator to produce a change in it, and
 * either finalizes the worktree into a surface or discards it.
 *
 * @param opts `worktree` adapter (`create` / `finalize` / `discard`), the
 *   candidate `generator`, and an optional `baseRef` (default `"main"`).
 * @returns A driver whose `kind` is `improvement:<generator kind>` and whose
 *   `propose(ctx)` resolves to the finalized surfaces. It resolves to `[]`
 *   when there are no findings and no report, and stops creating candidates
 *   once `ctx.signal` is aborted.
 * @throws Rethrows any error from `generate`, `finalize` or `discard`; on a
 *   `generate` / `finalize` failure the worktree is discarded best-effort first.
 */
function improvementDriver(opts) {
  const baseRef = opts.baseRef ?? "main";
  // Cleanup for failure paths only: a discard error must never mask the
  // original failure, so it is deliberately swallowed here.
  const discardQuietly = (wt) => opts.worktree.discard(wt).catch(() => {
  });
  return {
    kind: `improvement:${opts.generator.kind}`,
    async propose(ctx) {
      const findings = resolveFindings(ctx);
      // Nothing to act on: do not spin up worktrees.
      if (findings.length === 0 && ctx.report === void 0) return [];
      const surfaces = [];
      for (let i = 0; i < ctx.populationSize; i++) {
        if (ctx.signal.aborted) break;
        // NOTE(review): generator kinds such as `agentic:claude` contain ":",
        // which is not allowed in git ref names; confirm the WorktreeAdapter
        // sanitizes `label` before using it as a branch name.
        const wt = await opts.worktree.create({
          baseRef,
          label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`
        });
        let applied;
        let summary;
        try {
          ({ applied, summary } = await opts.generator.generate({
            worktreePath: wt.path,
            report: ctx.report,
            findings,
            dataset: ctx.dataset,
            maxShots: ctx.maxImprovementShots ?? 1,
            signal: ctx.signal
          }));
        } catch (err) {
          await discardQuietly(wt);
          throw err;
        }
        if (!applied) {
          // Kept outside the guarded regions so a failing discard surfaces
          // once instead of being retried on the same worktree.
          await opts.worktree.discard(wt);
          continue;
        }
        try {
          surfaces.push(await opts.worktree.finalize(wt, summary));
        } catch (err) {
          await discardQuietly(wt);
          throw err;
        }
      }
      return surfaces;
    }
  };
}
121
/**
 * Choose the findings to act on: a non-empty `findings` array on the report
 * wins; anything else falls back to `ctx.findings`.
 *
 * @param ctx Propose context carrying `report` and `findings`.
 * @returns The report's findings array itself when present and non-empty,
 *   otherwise `ctx.findings`.
 */
function resolveFindings(ctx) {
  const { report } = ctx;
  // The report is probed structurally; only a non-null object can carry findings.
  const isObject = typeof report === "object" && report !== null;
  const fromReport = isObject && "findings" in report ? report.findings : void 0;
  return Array.isArray(fromReport) && fromReport.length > 0 ? fromReport : ctx.findings;
}
129
+
130
+ // src/improvement/reflective-generator.ts
131
+ import { spawnSync as spawnSync2 } from "child_process";
132
+ function reflectiveGenerator(opts) {
133
+ return {
134
+ kind: "reflective",
135
+ async generate({ worktreePath, findings }) {
136
+ const batch = await opts.improvementAdapter.proposeFromFindings(findings);
137
+ if (batch.edits.length === 0) return { applied: false, summary: "" };
138
+ let applied = 0;
139
+ for (const edit of batch.edits) {
140
+ if (applyPatch(edit.patch, worktreePath)) applied++;
141
+ }
142
+ if (applied === 0) return { applied: false, summary: "" };
143
+ const summary = batch.edits.length === 1 ? batch.edits[0].summary : `analyst: ${applied} surface edit${applied === 1 ? "" : "s"}`;
144
+ return { applied: true, summary };
145
+ }
146
+ };
147
+ }
148
/**
 * Apply one unified-diff patch inside `cwd` with
 * `git apply --whitespace=fix -p0`, feeding the patch on stdin.
 *
 * @param patch Unified diff text. An empty or whitespace-only patch is treated
 *   as "nothing to apply" without spawning git.
 * @param cwd Directory to run git in (the candidate worktree).
 * @returns `true` when git exited 0; `false` when the patch was empty or git
 *   rejected it.
 * @throws Error when git cannot be spawned at all (git missing, unusable
 *   `cwd`). That is an environment failure, not a bad patch, and reporting it
 *   as `false` would silently discard every candidate.
 */
function applyPatch(patch, cwd) {
  // `git apply` rejects empty input unless `--allow-empty` is given, so the
  // outcome is the same; this just avoids a pointless subprocess.
  if (typeof patch !== "string" || patch.trim() === "") return false;
  const result = spawnSync2("git", ["apply", "--whitespace=fix", "-p0", "-"], {
    cwd,
    input: patch,
    encoding: "utf-8"
  });
  if (result.error) {
    throw new Error(
      `reflectiveGenerator: git apply failed to spawn in ${cwd}: ${result.error.message}`
    );
  }
  return result.status === 0;
}
156
+ export {
157
+ agenticGenerator,
158
+ improvementDriver,
159
+ reflectiveGenerator
160
+ };
161
+ //# sourceMappingURL=improvement.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/improvement-driver.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change\n * (the worktree stays clean), the generator refines the prompt and retries, up\n * to `maxShots` times. A harness that already changed files returns on shot 1.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. 
Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n let prompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: prompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n // The worktree IS the signal: if the harness touched files, we have a\n // candidate. We don't trust the harness's stdout — we trust the diff.\n if (dirty(worktreePath)) {\n return { applied: true, summary: summarize(findings) }\n }\n // No change this shot — give the next attempt explicit feedback.\n prompt = refine(prompt)\n }\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. 
Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nfunction refine(prompt: string): string {\n return `${prompt}\\n\\nNOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. 
It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, sandbox runLoop;\n * an agent reads code + report and edits)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). 
*/\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. 
*/\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`\n * setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? 
'' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;AAuBA,SAAS,iBAAiB;AAmBnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAE9B,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,UAAI,SAAS,YAAY,EAAE,QAAQ,SAAS,CAAC;AAC7C,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY;AAAA,UACZ,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,MAAM,YAAY,GAAG;AACvB,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,iBAAS,OAAO,MAAM;AAAA,MACxB;AACA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,OAAO,QAAwB;AACtC,SAAO,GAAG,MAAM;AAAA;AAAA;AAClB;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GAAG;
AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACvEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;ACrGA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAAU,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}
package/dist/index.js CHANGED
@@ -3,7 +3,7 @@ import {
3
3
  loopEventToOtelSpan,
4
4
  mcpToolsForRuntimeMcp,
5
5
  mcpToolsForRuntimeMcpSubset
6
- } from "./chunk-7HN72MF3.js";
6
+ } from "./chunk-QZEDHTT2.js";
7
7
  import "./chunk-UNQM6XQO.js";
8
8
  import {
9
9
  AgentEvalError,
@@ -395,6 +395,12 @@ async function* streamResponseEvents(response, context, requestedModel) {
395
395
  function* parseStreamChunk(chunk, context, usage, toolCalls) {
396
396
  const lines = chunk.split(/\r?\n/);
397
397
  const dataLines = lines.filter((line) => line.startsWith("data:"));
398
+ if (dataLines.length === 0 && lines.every((line) => {
399
+ const trimmed = line.trim();
400
+ return trimmed.length === 0 || trimmed.startsWith(":");
401
+ })) {
402
+ return;
403
+ }
398
404
  const data = dataLines.length > 0 ? dataLines.map((line) => line.slice(5).trimStart()).join("\n") : chunk.trim();
399
405
  if (!data || data === "[DONE]") return;
400
406
  let parsed;