@tangle-network/agent-runtime 0.37.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.d.ts +3 -3
- package/dist/analyst-loop.d.ts +2 -2
- package/dist/analyst-loop.js +3 -257
- package/dist/analyst-loop.js.map +1 -1
- package/dist/chunk-VOX6Z3II.js +90 -0
- package/dist/chunk-VOX6Z3II.js.map +1 -0
- package/dist/chunk-XBUG326M.js +261 -0
- package/dist/chunk-XBUG326M.js.map +1 -0
- package/dist/{chunk-T3GJBKHA.js → chunk-Z523NPJK.js} +58 -1
- package/dist/chunk-Z523NPJK.js.map +1 -0
- package/dist/dynamic-DeOPeeAw.d.ts +106 -0
- package/dist/{improvement-adapter-CaZxFxTd.d.ts → improvement-adapter-BC4HhuAR.d.ts} +1 -1
- package/dist/improvement.d.ts +6 -130
- package/dist/improvement.js +4 -85
- package/dist/improvement.js.map +1 -1
- package/dist/index.d.ts +67 -5
- package/dist/index.js +61 -2
- package/dist/index.js.map +1 -1
- package/dist/loops.d.ts +5 -106
- package/dist/mcp/index.d.ts +4 -79
- package/dist/mcp/index.js +2 -57
- package/dist/mcp/index.js.map +1 -1
- package/dist/optimize-prompt-cmH9wZdH.d.ts +129 -0
- package/dist/{otel-export-DgFMwsVy.d.ts → otel-export-CNmeg_7B.d.ts} +77 -2
- package/dist/profiles.d.ts +1 -1
- package/dist/{types-CmTjKLyB.d.ts → types-CmkQl8qE.d.ts} +1 -1
- package/dist/{types-D_MXrmJP.d.ts → types-p8dWBIXL.d.ts} +1 -1
- package/package.json +1 -1
- package/dist/chunk-T3GJBKHA.js.map +0 -1
package/dist/improvement.d.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import { AnalystFinding
|
|
1
|
+
import { AnalystFinding } from '@tangle-network/agent-eval';
|
|
2
2
|
import { L as LocalHarness, r as runLocalHarness } from './local-harness-KrdFTY5R.js';
|
|
3
|
-
import { LabeledScenarioStore, WorktreeAdapter, ImprovementDriver
|
|
4
|
-
|
|
5
|
-
import {
|
|
3
|
+
import { LabeledScenarioStore, WorktreeAdapter, ImprovementDriver } from '@tangle-network/agent-eval/campaign';
|
|
4
|
+
export { O as OptimizePromptOptions, a as OptimizePromptReflection, b as OptimizePromptResult, o as optimizePrompt } from './optimize-prompt-cmH9wZdH.js';
|
|
5
|
+
import { S as SurfaceImprovementEdit } from './improvement-adapter-BC4HhuAR.js';
|
|
6
|
+
import { I as ImprovementAdapter } from './types-p8dWBIXL.js';
|
|
6
7
|
import 'node:child_process';
|
|
7
8
|
|
|
8
9
|
/**
|
|
@@ -98,131 +99,6 @@ interface AgenticGeneratorOptions {
|
|
|
98
99
|
}
|
|
99
100
|
declare function agenticGenerator(opts?: AgenticGeneratorOptions): CandidateGenerator;
|
|
100
101
|
|
|
101
|
-
/**
|
|
102
|
-
* @experimental
|
|
103
|
-
*
|
|
104
|
-
* `optimizePrompt` — identity-gated optimization for any TEXT prompt surface
|
|
105
|
-
* (system prompt, planner prompt, judge rubric, skill doc).
|
|
106
|
-
*
|
|
107
|
-
* The text-surface sibling to this module's `improvementDriver` (the
|
|
108
|
-
* CODE-surface / worktree path). Both feed agent-eval's `runImprovementLoop`;
|
|
109
|
-
* this one defaults the driver to agent-eval's `gepaDriver` (reflective text
|
|
110
|
-
* mutator) and the gate to `heldOutGate`.
|
|
111
|
-
*
|
|
112
|
-
* IDENTITY-GATED BY CONSTRUCTION — the whole point. The loop runs evals,
|
|
113
|
-
* collects per-scenario signal, proposes candidates, and the gate compares
|
|
114
|
-
* candidate-vs-baseline ON THE HELDOUT. `result.prompt` is the baseline
|
|
115
|
-
* (identity) UNLESS the gate decided `'ship'`. So wiring a surface up is safe:
|
|
116
|
-
* a surface with no beneficial mutation simply keeps its baseline. You never
|
|
117
|
-
* regress by registering a prompt — you only ever improve when the held-out
|
|
118
|
-
* data earns it.
|
|
119
|
-
*
|
|
120
|
-
* Generic over the runtime: `runWithPrompt` is the only domain seam — given a
|
|
121
|
-
* candidate prompt + scenario, run it however the surface runs (sandbox
|
|
122
|
-
* `streamPrompt`, a `runLoop`, a direct model call) and return the artifact the
|
|
123
|
-
* judges score. The optimizer never assumes how a prompt is executed.
|
|
124
|
-
*/
|
|
125
|
-
|
|
126
|
-
/** Reflection config for the default `gepaDriver`. Omit when passing a custom
|
|
127
|
-
* `driver`. */
|
|
128
|
-
interface OptimizePromptReflection {
|
|
129
|
-
/** Router transport for the reflection model. */
|
|
130
|
-
llm: LlmClientOptions;
|
|
131
|
-
/** Model that performs the reflective rewrite. */
|
|
132
|
-
model: string;
|
|
133
|
-
/** What is being optimized — orients the reflection prompt. Default
|
|
134
|
-
* `'system prompt'`. */
|
|
135
|
-
target?: string;
|
|
136
|
-
/** Surface-specific mutation levers offered to the reflector. */
|
|
137
|
-
mutationPrimitives?: string[];
|
|
138
|
-
/** H2 (`## Foo`) headings that MUST survive every candidate. gepaDriver's
|
|
139
|
-
* only structural guard — load-bearing sections of the prompt should be
|
|
140
|
-
* `##` headings so a rewrite cannot drop them. */
|
|
141
|
-
preserveSections?: string[];
|
|
142
|
-
/** Max sentence-level edits per candidate vs the parent (a textual learning
|
|
143
|
-
* rate). Caps a rewrite from wiping prior rules in one generation. */
|
|
144
|
-
maxSentenceEdits?: number;
|
|
145
|
-
}
|
|
146
|
-
/** @experimental */
|
|
147
|
-
interface OptimizePromptOptions<TScenario extends Scenario, TArtifact> {
|
|
148
|
-
/** The prompt being optimized — the identity baseline the gate protects. */
|
|
149
|
-
baselinePrompt: string;
|
|
150
|
-
/** Domain seam: run a candidate prompt against a scenario → artifact the
|
|
151
|
-
* judges score. The optimizer is agnostic to HOW the prompt runs. */
|
|
152
|
-
runWithPrompt: (prompt: string, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
|
|
153
|
-
/** Training pool — scored each generation to rank candidates. */
|
|
154
|
-
scenarios: TScenario[];
|
|
155
|
-
/** Held out of training — scored ONLY for the gate's baseline-vs-winner
|
|
156
|
-
* delta. Disjoint from `scenarios`; this is what makes promotion measure
|
|
157
|
-
* generalization, not memorization. */
|
|
158
|
-
holdoutScenarios: TScenario[];
|
|
159
|
-
/** Scorers — deterministic checks or LLM judges. */
|
|
160
|
-
judges: JudgeConfig<TArtifact, TScenario>[];
|
|
161
|
-
/** Where artifacts + traces land (opaque key under in-memory storage). */
|
|
162
|
-
runDir: string;
|
|
163
|
-
/** Default driver = `gepaDriver` built from this. Required UNLESS `driver`
|
|
164
|
-
* is supplied. */
|
|
165
|
-
reflection?: OptimizePromptReflection;
|
|
166
|
-
/** Override the improvement strategy (custom driver / deterministic tests). */
|
|
167
|
-
driver?: ImprovementDriver;
|
|
168
|
-
/** Override the promotion gate. Default `heldOutGate` over `holdoutScenarios`
|
|
169
|
-
* — zero extra LLM. Wrap `defaultProductionGate` for red-team/reward-hacking
|
|
170
|
-
* hardening on production wiring. */
|
|
171
|
-
gate?: Gate<TArtifact, TScenario>;
|
|
172
|
-
/** Minimum held-out composite lift to ship, forwarded to the default
|
|
173
|
-
* `heldOutGate`. When omitted the gate uses its own default. */
|
|
174
|
-
deltaThreshold?: number;
|
|
175
|
-
/** Candidates proposed per generation. Default 4. */
|
|
176
|
-
populationSize?: number;
|
|
177
|
-
/** Generations to run. Default 3. */
|
|
178
|
-
maxGenerations?: number;
|
|
179
|
-
/** Candidates carried to the next generation. Default 2. */
|
|
180
|
-
promoteTopK?: number;
|
|
181
|
-
/** Storage backend. Pass `inMemoryCampaignStorage()` for filesystem-less /
|
|
182
|
-
* test runs. Default: Node filesystem. */
|
|
183
|
-
storage?: CampaignStorage;
|
|
184
|
-
/** Reproducibility seed. Default 42. */
|
|
185
|
-
seed?: number;
|
|
186
|
-
/** Per-scenario replicates for CI bands. Default 1. */
|
|
187
|
-
reps?: number;
|
|
188
|
-
/** Max concurrent cells. Default 2. */
|
|
189
|
-
maxConcurrency?: number;
|
|
190
|
-
/** Test seam — override the wall clock. */
|
|
191
|
-
now?: () => Date;
|
|
192
|
-
/** On a shipped gate: `'pr'` opens a PR, `'none'` just reports. Default
|
|
193
|
-
* `'none'`. */
|
|
194
|
-
autoOnPromote?: 'pr' | 'none';
|
|
195
|
-
ghOwner?: string;
|
|
196
|
-
ghRepo?: string;
|
|
197
|
-
}
|
|
198
|
-
/** @experimental */
|
|
199
|
-
interface OptimizePromptResult<TArtifact, TScenario extends Scenario> {
|
|
200
|
-
/** The prompt to USE. Identity (the baseline) unless the gate shipped a
|
|
201
|
-
* winner — so a caller can always assign `result.prompt` unconditionally. */
|
|
202
|
-
prompt: string;
|
|
203
|
-
/** True only when the gate promoted a candidate over baseline on holdout. */
|
|
204
|
-
improved: boolean;
|
|
205
|
-
/** The gate's verdict (`'ship' | 'hold' | 'need_more_work' | ...`). */
|
|
206
|
-
decision: GateResult['decision'];
|
|
207
|
-
/** Human-readable reasons the gate gave. */
|
|
208
|
-
reasons: string[];
|
|
209
|
-
/** Mean held-out composite of the baseline. */
|
|
210
|
-
baselineComposite: number;
|
|
211
|
-
/** Mean held-out composite of the winner candidate. */
|
|
212
|
-
winnerComposite: number;
|
|
213
|
-
/** Held-out lift (winner − baseline); the gate's `delta` when it reported one. */
|
|
214
|
-
delta: number;
|
|
215
|
-
/** Why the winner was proposed — present when a shipped winner carried a
|
|
216
|
-
* driver rationale. */
|
|
217
|
-
rationale?: string;
|
|
218
|
-
/** Unified baseline→winner diff (empty when the winner is the baseline). */
|
|
219
|
-
diff: string;
|
|
220
|
-
/** The full loop result for callers that need generations / campaigns. */
|
|
221
|
-
raw: RunImprovementLoopResult<TArtifact, TScenario>;
|
|
222
|
-
}
|
|
223
|
-
/** @experimental */
|
|
224
|
-
declare function optimizePrompt<TScenario extends Scenario, TArtifact>(opts: OptimizePromptOptions<TScenario, TArtifact>): Promise<OptimizePromptResult<TArtifact, TScenario>>;
|
|
225
|
-
|
|
226
102
|
/**
|
|
227
103
|
* @experimental
|
|
228
104
|
*
|
|
@@ -242,4 +118,4 @@ interface ReflectiveGeneratorOptions {
|
|
|
242
118
|
}
|
|
243
119
|
declare function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator;
|
|
244
120
|
|
|
245
|
-
export { type AgenticGeneratorOptions, type CandidateGenerator, type ImprovementDriverOptions, type
|
|
121
|
+
export { type AgenticGeneratorOptions, type CandidateGenerator, type ImprovementDriverOptions, type ReflectiveGeneratorOptions, agenticGenerator, improvementDriver, reflectiveGenerator };
|
package/dist/improvement.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
import {
|
|
2
|
+
optimizePrompt
|
|
3
|
+
} from "./chunk-VOX6Z3II.js";
|
|
1
4
|
import {
|
|
2
5
|
runLocalHarness
|
|
3
6
|
} from "./chunk-GLR25NG7.js";
|
|
4
|
-
import
|
|
5
|
-
ConfigError
|
|
6
|
-
} from "./chunk-SQSCRJ7U.js";
|
|
7
|
+
import "./chunk-SQSCRJ7U.js";
|
|
7
8
|
import "./chunk-DGUM43GV.js";
|
|
8
9
|
|
|
9
10
|
// src/improvement/agentic-generator.ts
|
|
@@ -130,88 +131,6 @@ function resolveFindings(ctx) {
|
|
|
130
131
|
return ctx.findings;
|
|
131
132
|
}
|
|
132
133
|
|
|
133
|
-
// src/improvement/optimize-prompt.ts
|
|
134
|
-
import { gepaDriver, heldOutGate, runImprovementLoop } from "@tangle-network/agent-eval/campaign";
|
|
135
|
-
async function optimizePrompt(opts) {
|
|
136
|
-
if (!opts.driver && !opts.reflection) {
|
|
137
|
-
throw new ConfigError(
|
|
138
|
-
"optimizePrompt: pass `reflection` (builds the default gepaDriver) or a custom `driver`"
|
|
139
|
-
);
|
|
140
|
-
}
|
|
141
|
-
if (opts.scenarios.length === 0) {
|
|
142
|
-
throw new ConfigError("optimizePrompt: `scenarios` must be non-empty");
|
|
143
|
-
}
|
|
144
|
-
if (opts.holdoutScenarios.length === 0) {
|
|
145
|
-
throw new ConfigError(
|
|
146
|
-
"optimizePrompt: `holdoutScenarios` must be non-empty (the gate needs it)"
|
|
147
|
-
);
|
|
148
|
-
}
|
|
149
|
-
const driver = opts.driver ?? gepaDriver({
|
|
150
|
-
llm: opts.reflection.llm,
|
|
151
|
-
model: opts.reflection.model,
|
|
152
|
-
target: opts.reflection.target ?? "system prompt",
|
|
153
|
-
mutationPrimitives: opts.reflection.mutationPrimitives,
|
|
154
|
-
constraints: opts.reflection.preserveSections || opts.reflection.maxSentenceEdits !== void 0 ? {
|
|
155
|
-
preserveSections: opts.reflection.preserveSections,
|
|
156
|
-
maxSentenceEdits: opts.reflection.maxSentenceEdits
|
|
157
|
-
} : void 0
|
|
158
|
-
});
|
|
159
|
-
const gate = opts.gate ?? heldOutGate({
|
|
160
|
-
scenarios: opts.holdoutScenarios,
|
|
161
|
-
...opts.deltaThreshold !== void 0 ? { deltaThreshold: opts.deltaThreshold } : {}
|
|
162
|
-
});
|
|
163
|
-
const result = await runImprovementLoop({
|
|
164
|
-
baselineSurface: opts.baselinePrompt,
|
|
165
|
-
dispatchWithSurface: (surface, scenario, ctx) => {
|
|
166
|
-
if (typeof surface !== "string") {
|
|
167
|
-
throw new ConfigError(
|
|
168
|
-
"optimizePrompt: received a CodeSurface \u2014 this entry point optimizes string prompts only"
|
|
169
|
-
);
|
|
170
|
-
}
|
|
171
|
-
return opts.runWithPrompt(surface, scenario, ctx);
|
|
172
|
-
},
|
|
173
|
-
driver,
|
|
174
|
-
populationSize: opts.populationSize ?? 4,
|
|
175
|
-
maxGenerations: opts.maxGenerations ?? 3,
|
|
176
|
-
...opts.promoteTopK !== void 0 ? { promoteTopK: opts.promoteTopK } : {},
|
|
177
|
-
scenarios: opts.scenarios,
|
|
178
|
-
holdoutScenarios: opts.holdoutScenarios,
|
|
179
|
-
judges: opts.judges,
|
|
180
|
-
gate,
|
|
181
|
-
autoOnPromote: opts.autoOnPromote ?? "none",
|
|
182
|
-
...opts.ghOwner !== void 0 ? { ghOwner: opts.ghOwner } : {},
|
|
183
|
-
...opts.ghRepo !== void 0 ? { ghRepo: opts.ghRepo } : {},
|
|
184
|
-
runDir: opts.runDir,
|
|
185
|
-
...opts.storage !== void 0 ? { storage: opts.storage } : {},
|
|
186
|
-
...opts.seed !== void 0 ? { seed: opts.seed } : {},
|
|
187
|
-
...opts.reps !== void 0 ? { reps: opts.reps } : {},
|
|
188
|
-
...opts.maxConcurrency !== void 0 ? { maxConcurrency: opts.maxConcurrency } : {},
|
|
189
|
-
...opts.now !== void 0 ? { now: opts.now } : {}
|
|
190
|
-
});
|
|
191
|
-
const improved = result.gateResult.decision === "ship";
|
|
192
|
-
const winnerSurface = typeof result.winnerSurface === "string" ? result.winnerSurface : opts.baselinePrompt;
|
|
193
|
-
const baselineComposite = meanComposite(result.baselineOnHoldout);
|
|
194
|
-
const winnerComposite = meanComposite(result.winnerOnHoldout);
|
|
195
|
-
return {
|
|
196
|
-
prompt: improved ? winnerSurface : opts.baselinePrompt,
|
|
197
|
-
improved,
|
|
198
|
-
decision: result.gateResult.decision,
|
|
199
|
-
reasons: result.gateResult.reasons,
|
|
200
|
-
baselineComposite,
|
|
201
|
-
winnerComposite,
|
|
202
|
-
delta: result.gateResult.delta ?? winnerComposite - baselineComposite,
|
|
203
|
-
...improved && result.winnerRationale ? { rationale: result.winnerRationale } : {},
|
|
204
|
-
diff: result.promotedDiff,
|
|
205
|
-
raw: result
|
|
206
|
-
};
|
|
207
|
-
}
|
|
208
|
-
function meanComposite(campaign) {
|
|
209
|
-
const scenarios = Object.values(campaign.aggregates.byScenario);
|
|
210
|
-
if (scenarios.length === 0) return 0;
|
|
211
|
-
const sum = scenarios.reduce((acc, s) => acc + s.meanComposite, 0);
|
|
212
|
-
return sum / scenarios.length;
|
|
213
|
-
}
|
|
214
|
-
|
|
215
134
|
// src/improvement/reflective-generator.ts
|
|
216
135
|
import { spawnSync as spawnSync2 } from "child_process";
|
|
217
136
|
function reflectiveGenerator(opts) {
|
package/dist/improvement.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/improvement-driver.ts","../src/improvement/optimize-prompt.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change\n * (the worktree stays clean), the generator refines the prompt and retries, up\n * to `maxShots` times. A harness that already changed files returns on shot 1.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. 
Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n let prompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: prompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n // The worktree IS the signal: if the harness touched files, we have a\n // candidate. We don't trust the harness's stdout — we trust the diff.\n if (dirty(worktreePath)) {\n return { applied: true, summary: summarize(findings) }\n }\n // No change this shot — give the next attempt explicit feedback.\n prompt = refine(prompt)\n }\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. 
Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nfunction refine(prompt: string): string {\n return `${prompt}\\n\\nNOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. 
It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, sandbox runLoop;\n * an agent reads code + report and edits)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). 
*/\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. 
*/\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * @experimental\n *\n * `optimizePrompt` — identity-gated optimization for any TEXT prompt surface\n * (system prompt, planner prompt, judge rubric, skill doc).\n *\n * The text-surface sibling to this module's `improvementDriver` (the\n * CODE-surface / worktree path). Both feed agent-eval's `runImprovementLoop`;\n * this one defaults the driver to agent-eval's `gepaDriver` (reflective text\n * mutator) and the gate to `heldOutGate`.\n *\n * IDENTITY-GATED BY CONSTRUCTION — the whole point. The loop runs evals,\n * collects per-scenario signal, proposes candidates, and the gate compares\n * candidate-vs-baseline ON THE HELDOUT. `result.prompt` is the baseline\n * (identity) UNLESS the gate decided `'ship'`. So wiring a surface up is safe:\n * a surface with no beneficial mutation simply keeps its baseline. You never\n * regress by registering a prompt — you only ever improve when the held-out\n * data earns it.\n *\n * Generic over the runtime: `runWithPrompt` is the only domain seam — given a\n * candidate prompt + scenario, run it however the surface runs (sandbox\n * `streamPrompt`, a `runLoop`, a direct model call) and return the artifact the\n * judges score. 
The optimizer never assumes how a prompt is executed.\n */\n\nimport type { LlmClientOptions } from '@tangle-network/agent-eval'\nimport type {\n CampaignResult,\n CampaignStorage,\n DispatchContext,\n Gate,\n GateResult,\n ImprovementDriver,\n JudgeConfig,\n RunImprovementLoopResult,\n Scenario,\n} from '@tangle-network/agent-eval/campaign'\nimport { gepaDriver, heldOutGate, runImprovementLoop } from '@tangle-network/agent-eval/campaign'\nimport { ConfigError } from '../errors'\n\n/** Reflection config for the default `gepaDriver`. Omit when passing a custom\n * `driver`. */\nexport interface OptimizePromptReflection {\n /** Router transport for the reflection model. */\n llm: LlmClientOptions\n /** Model that performs the reflective rewrite. */\n model: string\n /** What is being optimized — orients the reflection prompt. Default\n * `'system prompt'`. */\n target?: string\n /** Surface-specific mutation levers offered to the reflector. */\n mutationPrimitives?: string[]\n /** H2 (`## Foo`) headings that MUST survive every candidate. gepaDriver's\n * only structural guard — load-bearing sections of the prompt should be\n * `##` headings so a rewrite cannot drop them. */\n preserveSections?: string[]\n /** Max sentence-level edits per candidate vs the parent (a textual learning\n * rate). Caps a rewrite from wiping prior rules in one generation. */\n maxSentenceEdits?: number\n}\n\n/** @experimental */\nexport interface OptimizePromptOptions<TScenario extends Scenario, TArtifact> {\n /** The prompt being optimized — the identity baseline the gate protects. */\n baselinePrompt: string\n /** Domain seam: run a candidate prompt against a scenario → artifact the\n * judges score. The optimizer is agnostic to HOW the prompt runs. */\n runWithPrompt: (prompt: string, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>\n /** Training pool — scored each generation to rank candidates. 
*/\n scenarios: TScenario[]\n /** Held out of training — scored ONLY for the gate's baseline-vs-winner\n * delta. Disjoint from `scenarios`; this is what makes promotion measure\n * generalization, not memorization. */\n holdoutScenarios: TScenario[]\n /** Scorers — deterministic checks or LLM judges. */\n judges: JudgeConfig<TArtifact, TScenario>[]\n /** Where artifacts + traces land (opaque key under in-memory storage). */\n runDir: string\n /** Default driver = `gepaDriver` built from this. Required UNLESS `driver`\n * is supplied. */\n reflection?: OptimizePromptReflection\n /** Override the improvement strategy (custom driver / deterministic tests). */\n driver?: ImprovementDriver\n /** Override the promotion gate. Default `heldOutGate` over `holdoutScenarios`\n * — zero extra LLM. Wrap `defaultProductionGate` for red-team/reward-hacking\n * hardening on production wiring. */\n gate?: Gate<TArtifact, TScenario>\n /** Minimum held-out composite lift to ship, forwarded to the default\n * `heldOutGate`. When omitted the gate uses its own default. */\n deltaThreshold?: number\n /** Candidates proposed per generation. Default 4. */\n populationSize?: number\n /** Generations to run. Default 3. */\n maxGenerations?: number\n /** Candidates carried to the next generation. Default 2. */\n promoteTopK?: number\n /** Storage backend. Pass `inMemoryCampaignStorage()` for filesystem-less /\n * test runs. Default: Node filesystem. */\n storage?: CampaignStorage\n /** Reproducibility seed. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1. */\n reps?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Test seam — override the wall clock. */\n now?: () => Date\n /** On a shipped gate: `'pr'` opens a PR, `'none'` just reports. Default\n * `'none'`. 
*/\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n}\n\n/** @experimental */\nexport interface OptimizePromptResult<TArtifact, TScenario extends Scenario> {\n /** The prompt to USE. Identity (the baseline) unless the gate shipped a\n * winner — so a caller can always assign `result.prompt` unconditionally. */\n prompt: string\n /** True only when the gate promoted a candidate over baseline on holdout. */\n improved: boolean\n /** The gate's verdict (`'ship' | 'hold' | 'need_more_work' | ...`). */\n decision: GateResult['decision']\n /** Human-readable reasons the gate gave. */\n reasons: string[]\n /** Mean held-out composite of the baseline. */\n baselineComposite: number\n /** Mean held-out composite of the winner candidate. */\n winnerComposite: number\n /** Held-out lift (winner − baseline); the gate's `delta` when it reported one. */\n delta: number\n /** Why the winner was proposed — present when a shipped winner carried a\n * driver rationale. */\n rationale?: string\n /** Unified baseline→winner diff (empty when the winner is the baseline). */\n diff: string\n /** The full loop result for callers that need generations / campaigns. 
*/\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/** @experimental */\nexport async function optimizePrompt<TScenario extends Scenario, TArtifact>(\n opts: OptimizePromptOptions<TScenario, TArtifact>,\n): Promise<OptimizePromptResult<TArtifact, TScenario>> {\n if (!opts.driver && !opts.reflection) {\n throw new ConfigError(\n 'optimizePrompt: pass `reflection` (builds the default gepaDriver) or a custom `driver`',\n )\n }\n if (opts.scenarios.length === 0) {\n throw new ConfigError('optimizePrompt: `scenarios` must be non-empty')\n }\n if (opts.holdoutScenarios.length === 0) {\n throw new ConfigError(\n 'optimizePrompt: `holdoutScenarios` must be non-empty (the gate needs it)',\n )\n }\n\n const driver =\n opts.driver ??\n gepaDriver({\n llm: opts.reflection!.llm,\n model: opts.reflection!.model,\n target: opts.reflection!.target ?? 'system prompt',\n mutationPrimitives: opts.reflection!.mutationPrimitives,\n constraints:\n opts.reflection!.preserveSections || opts.reflection!.maxSentenceEdits !== undefined\n ? {\n preserveSections: opts.reflection!.preserveSections,\n maxSentenceEdits: opts.reflection!.maxSentenceEdits,\n }\n : undefined,\n })\n\n const gate =\n opts.gate ??\n heldOutGate<TArtifact, TScenario>({\n scenarios: opts.holdoutScenarios,\n ...(opts.deltaThreshold !== undefined ? { deltaThreshold: opts.deltaThreshold } : {}),\n })\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n baselineSurface: opts.baselinePrompt,\n dispatchWithSurface: (surface, scenario, ctx) => {\n if (typeof surface !== 'string') {\n // optimizePrompt is the TEXT-surface entry point; a CodeSurface means\n // the caller wired the wrong driver. Fail loud — don't silently run the\n // baseline and report a phantom score.\n throw new ConfigError(\n 'optimizePrompt: received a CodeSurface — this entry point optimizes string prompts only',\n )\n }\n return opts.runWithPrompt(surface, scenario, ctx)\n },\n driver,\n populationSize: opts.populationSize ?? 
4,\n maxGenerations: opts.maxGenerations ?? 3,\n ...(opts.promoteTopK !== undefined ? { promoteTopK: opts.promoteTopK } : {}),\n scenarios: opts.scenarios,\n holdoutScenarios: opts.holdoutScenarios,\n judges: opts.judges,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ...(opts.ghOwner !== undefined ? { ghOwner: opts.ghOwner } : {}),\n ...(opts.ghRepo !== undefined ? { ghRepo: opts.ghRepo } : {}),\n runDir: opts.runDir,\n ...(opts.storage !== undefined ? { storage: opts.storage } : {}),\n ...(opts.seed !== undefined ? { seed: opts.seed } : {}),\n ...(opts.reps !== undefined ? { reps: opts.reps } : {}),\n ...(opts.maxConcurrency !== undefined ? { maxConcurrency: opts.maxConcurrency } : {}),\n ...(opts.now !== undefined ? { now: opts.now } : {}),\n })\n\n const improved = result.gateResult.decision === 'ship'\n const winnerSurface =\n typeof result.winnerSurface === 'string' ? result.winnerSurface : opts.baselinePrompt\n const baselineComposite = meanComposite(result.baselineOnHoldout)\n const winnerComposite = meanComposite(result.winnerOnHoldout)\n\n return {\n prompt: improved ? winnerSurface : opts.baselinePrompt,\n improved,\n decision: result.gateResult.decision,\n reasons: result.gateResult.reasons,\n baselineComposite,\n winnerComposite,\n delta: result.gateResult.delta ?? winnerComposite - baselineComposite,\n ...(improved && result.winnerRationale ? { rationale: result.winnerRationale } : {}),\n diff: result.promotedDiff,\n raw: result,\n }\n}\n\n/** Mean composite over a campaign's per-scenario aggregates. The held-out\n * campaigns score one surface across `holdoutScenarios`; averaging the\n * per-scenario means gives the single number the gate's delta is built from. 
*/\nfunction meanComposite(campaign: CampaignResult<unknown, Scenario>): number {\n const scenarios = Object.values(campaign.aggregates.byScenario)\n if (scenarios.length === 0) return 0\n const sum = scenarios.reduce((acc, s) => acc + s.meanComposite, 0)\n return sum / scenarios.length\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`\n * setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? 
'' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;;;;AAuBA,SAAS,iBAAiB;AAmBnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAE9B,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,UAAI,SAAS,YAAY,EAAE,QAAQ,SAAS,CAAC;AAC7C,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY;AAAA,UACZ,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,MAAM,YAAY,GAAG;AACvB,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,iBAAS,OAAO,MAAM;AAAA,MACxB;AACA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,OAAO,QAAwB;AACtC,SAAO,GAAG,MAAM;AAAA;AAAA;AAClB;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,GA
AG;AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACvEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;AC9EA,SAAS,YAAY,aAAa,0BAA0B;AAwG5D,eAAsB,eACpB,MACqD;AACrD,MAAI,CAAC,KAAK,UAAU,CAAC,KAAK,YAAY;AACpC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,MAAI,KAAK,UAAU,WAAW,GAAG;AAC/B,UAAM,IAAI,YAAY,+CAA+C;AAAA,EACvE;AACA,MAAI,KAAK,iBAAiB,WAAW,GAAG;AACtC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK,KAAK,WAAY;AAAA,IACtB,OAAO,KAAK,WAAY;AAAA,IACxB,QAAQ,KAAK,WAAY,UAAU;AAAA,IACnC,oBAAoB,KAAK,WAAY;AAAA,IACrC,aACE,KAAK,WAAY,oBAAoB,KAAK,WAAY,qBAAqB,SACvE;AAAA,MACE,kBAAkB,KAAK,WAAY;AAAA,MACnC,kBAAkB,KAAK,WAAY;AAAA,IACrC,IACA;AAAA,EACR,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,YAAkC;AAAA,IAChC,WAAW,KAAK;AAAA,IAChB,GAAI,KAAK,mBAAmB,SAAY,EAAE,gBAAgB,KAAK,eAAe,IAAI,CAAC;AAAA,EACrF,CAAC;AAEH,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,CAAC,SAAS,UAAU,QA
AQ;AAC/C,UAAI,OAAO,YAAY,UAAU;AAI/B,cAAM,IAAI;AAAA,UACR;AAAA,QACF;AAAA,MACF;AACA,aAAO,KAAK,cAAc,SAAS,UAAU,GAAG;AAAA,IAClD;AAAA,IACA;AAAA,IACA,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,gBAAgB,KAAK,kBAAkB;AAAA,IACvC,GAAI,KAAK,gBAAgB,SAAY,EAAE,aAAa,KAAK,YAAY,IAAI,CAAC;AAAA,IAC1E,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK;AAAA,IACvB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,GAAI,KAAK,YAAY,SAAY,EAAE,SAAS,KAAK,QAAQ,IAAI,CAAC;AAAA,IAC9D,GAAI,KAAK,WAAW,SAAY,EAAE,QAAQ,KAAK,OAAO,IAAI,CAAC;AAAA,IAC3D,QAAQ,KAAK;AAAA,IACb,GAAI,KAAK,YAAY,SAAY,EAAE,SAAS,KAAK,QAAQ,IAAI,CAAC;AAAA,IAC9D,GAAI,KAAK,SAAS,SAAY,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,IACrD,GAAI,KAAK,SAAS,SAAY,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,IACrD,GAAI,KAAK,mBAAmB,SAAY,EAAE,gBAAgB,KAAK,eAAe,IAAI,CAAC;AAAA,IACnF,GAAI,KAAK,QAAQ,SAAY,EAAE,KAAK,KAAK,IAAI,IAAI,CAAC;AAAA,EACpD,CAAC;AAED,QAAM,WAAW,OAAO,WAAW,aAAa;AAChD,QAAM,gBACJ,OAAO,OAAO,kBAAkB,WAAW,OAAO,gBAAgB,KAAK;AACzE,QAAM,oBAAoB,cAAc,OAAO,iBAAiB;AAChE,QAAM,kBAAkB,cAAc,OAAO,eAAe;AAE5D,SAAO;AAAA,IACL,QAAQ,WAAW,gBAAgB,KAAK;AAAA,IACxC;AAAA,IACA,UAAU,OAAO,WAAW;AAAA,IAC5B,SAAS,OAAO,WAAW;AAAA,IAC3B;AAAA,IACA;AAAA,IACA,OAAO,OAAO,WAAW,SAAS,kBAAkB;AAAA,IACpD,GAAI,YAAY,OAAO,kBAAkB,EAAE,WAAW,OAAO,gBAAgB,IAAI,CAAC;AAAA,IAClF,MAAM,OAAO;AAAA,IACb,KAAK;AAAA,EACP;AACF;AAKA,SAAS,cAAc,UAAqD;AAC1E,QAAM,YAAY,OAAO,OAAO,SAAS,WAAW,UAAU;AAC9D,MAAI,UAAU,WAAW,EAAG,QAAO;AACnC,QAAM,MAAM,UAAU,OAAO,CAAC,KAAK,MAAM,MAAM,EAAE,eAAe,CAAC;AACjE,SAAO,MAAM,UAAU;AACzB;;;ACnOA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAA
U,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}
|
|
1
|
+
{"version":3,"sources":["../src/improvement/agentic-generator.ts","../src/improvement/improvement-driver.ts","../src/improvement/reflective-generator.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `agenticGenerator` — the full-agentic `CandidateGenerator`: the\n * `shots=N, sandbox=on` setting of the one `improvementDriver`. It runs a real\n * coding harness (claude / codex / opencode) inside the candidate worktree the\n * driver already created, letting the agent read the codebase + the research\n * report and make the change in place. The driver then commits the worktree\n * into a `CodeSurface`.\n *\n * Mechanism: identical to the proven Phase-2.8 in-process executor — spawn the\n * harness as a subprocess with `cwd` = the worktree, on the same filesystem,\n * so edits land in place (no sandbox-mount round-trip). `runLocalHarness` is\n * the verified primitive. The OUTER sandbox is the improvement loop's own\n * execution context; the generator does not nest a second sandbox per\n * candidate (which would reintroduce a host↔sandbox worktree-transport\n * problem that does not need solving here).\n *\n * `maxShots` is the DEPTH dial: the harness runs once; if it produced no change\n * (the worktree stays clean), the generator refines the prompt and retries, up\n * to `maxShots` times. A harness that already changed files returns on shot 1.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport { type LocalHarness, runLocalHarness } from '../mcp/local-harness'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface AgenticGeneratorOptions {\n /** Local coding harness to run in the worktree. Default `claude`. */\n harness?: LocalHarness\n /** Per-shot wall-clock timeout (ms). Default = `runLocalHarness` default (5m). */\n timeoutMs?: number\n /** Build the harness task prompt from the report + findings. 
Override for\n * domain phrasing; the default turns findings into a concrete coder task. */\n buildPrompt?: (args: { report: unknown; findings: AnalystFinding[] }) => string\n /** Test seam — inject the harness runner (defaults to `runLocalHarness`). */\n runHarness?: typeof runLocalHarness\n /** Test seam — inject the worktree-dirty check (defaults to `git status`). */\n isDirty?: (worktreePath: string) => boolean\n}\n\nexport function agenticGenerator(opts: AgenticGeneratorOptions = {}): CandidateGenerator {\n const harness = opts.harness ?? 'claude'\n const buildPrompt = opts.buildPrompt ?? defaultBuildPrompt\n const run = opts.runHarness ?? runLocalHarness\n const dirty = opts.isDirty ?? worktreeDirty\n\n return {\n kind: `agentic:${harness}`,\n async generate({ worktreePath, report, findings, maxShots, signal }) {\n let prompt = buildPrompt({ report, findings })\n const shots = Math.max(1, maxShots)\n\n for (let shot = 0; shot < shots; shot++) {\n if (signal.aborted) break\n await run({\n harness,\n cwd: worktreePath,\n taskPrompt: prompt,\n timeoutMs: opts.timeoutMs,\n signal,\n })\n // The worktree IS the signal: if the harness touched files, we have a\n // candidate. We don't trust the harness's stdout — we trust the diff.\n if (dirty(worktreePath)) {\n return { applied: true, summary: summarize(findings) }\n }\n // No change this shot — give the next attempt explicit feedback.\n prompt = refine(prompt)\n }\n return { applied: false, summary: '' }\n },\n }\n}\n\n/** Turn the analyst's findings (+ optional report) into a concrete coder task. */\nfunction defaultBuildPrompt(args: { report: unknown; findings: AnalystFinding[] }): string {\n const lines: string[] = [\n 'You are improving this codebase based on an evaluation analysis.',\n 'Make the smallest set of edits that addresses the findings below, then stop.',\n 'Do not change unrelated code. 
Do not commit — leave changes in the working tree.',\n '',\n 'Findings:',\n ]\n for (const f of args.findings) {\n const where = f.subject ? ` [${f.subject}]` : ''\n lines.push(`- (${f.severity})${where} ${f.claim}`)\n if (f.recommended_action) lines.push(` → ${f.recommended_action}`)\n }\n return lines.join('\\n')\n}\n\nfunction refine(prompt: string): string {\n return `${prompt}\\n\\nNOTE: your previous attempt left the working tree unchanged. Make the concrete file edits now.`\n}\n\n/** A one-line summary for the commit message, derived from the findings. */\nfunction summarize(findings: AnalystFinding[]): string {\n if (findings.length === 0) return 'agentic improvement'\n if (findings.length === 1) return `agentic: ${truncate(findings[0]!.claim, 64)}`\n return `agentic: ${findings.length} findings addressed`\n}\n\nfunction truncate(s: string, n: number): string {\n return s.length <= n ? s : `${s.slice(0, n - 1)}…`\n}\n\n/** Non-empty `git status --porcelain` ⇒ the harness changed the worktree.\n * Fails loud: the worktree is a fresh checkout, so a git error here means\n * something is genuinely broken (git missing, corrupt index, killed mid-run).\n * Folding that into `false` would silently discard a candidate and mask the\n * real failure — forbidden by the no-silent-fallbacks doctrine. */\nfunction worktreeDirty(worktreePath: string): boolean {\n const result = spawnSync('git', ['status', '--porcelain'], {\n cwd: worktreePath,\n encoding: 'utf-8',\n })\n if (result.error) {\n throw new Error(\n `agenticGenerator: git status failed to spawn in ${worktreePath}: ${result.error.message}`,\n )\n }\n if (result.status !== 0) {\n throw new Error(\n `agenticGenerator: git status exited ${result.status} in ${worktreePath}: ${result.stderr.trim()}`,\n )\n }\n return result.stdout.trim().length > 0\n}\n","/**\n * @experimental\n *\n * `improvementDriver` — the ONE reflective/agentic improvement driver for\n * agent-eval's improvement loop. 
It implements `ImprovementDriver` and owns\n * the candidate lifecycle (worktree create → generate → finalize/discard,\n * × populationSize); it delegates the only thing that genuinely varies — HOW\n * a candidate change is produced — to a pluggable `CandidateGenerator`.\n *\n * There is no separate \"analyst driver\" vs \"autoresearch driver\": those are\n * the SAME driver at two settings of a dial.\n * - cheap reflective path → `reflectiveGenerator` (shots=1, no sandbox;\n * applies pre-drafted patches)\n * - full agentic path → `agenticGenerator` (shots=N, sandbox runLoop;\n * an agent reads code + report and edits)\n * Both emit changes into a worktree the driver finalizes into a\n * `CodeSurface{ worktreeRef }` the loop measures on the holdout. See\n * agent-eval's `docs/design/self-improvement-engine.md`.\n */\n\nimport type { AnalystFinding } from '@tangle-network/agent-eval'\nimport type {\n CodeSurface,\n ImprovementDriver,\n LabeledScenarioStore,\n ProposeContext,\n WorktreeAdapter,\n} from '@tangle-network/agent-eval/campaign'\n\n/** The byte-producing seam — the ONE thing that differs between the cheap\n * reflective path and the full agentic path. A generator makes (uncommitted)\n * changes inside `worktreePath`; the driver commits them via the worktree\n * adapter's `finalize`. */\nexport interface CandidateGenerator {\n kind: string\n generate(args: {\n /** The candidate worktree — a fresh checkout of baseRef. Write changes here. */\n worktreePath: string\n /** Phase-2 research report (analyst findings + diff), opaque. */\n report: unknown\n /** Findings resolved from the report or the loop context. */\n findings: AnalystFinding[]\n /** Handle to all captured data, to ground the change. */\n dataset?: LabeledScenarioStore\n /** DEPTH: max iterations the generator may take (agentic uses this; the\n * reflective generator ignores it). 
*/\n maxShots: number\n signal: AbortSignal\n }): Promise<{ applied: boolean; summary: string }>\n}\n\nexport interface ImprovementDriverOptions {\n worktree: WorktreeAdapter\n generator: CandidateGenerator\n /** Base ref candidate worktrees fork from. Default `main`. */\n baseRef?: string\n}\n\nexport function improvementDriver(\n opts: ImprovementDriverOptions,\n): ImprovementDriver<AnalystFinding> {\n const baseRef = opts.baseRef ?? 'main'\n\n return {\n kind: `improvement:${opts.generator.kind}`,\n async propose(ctx) {\n const findings = resolveFindings(ctx)\n // No signal to act on — propose nothing rather than spin up worktrees.\n if (findings.length === 0 && ctx.report === undefined) return []\n\n const surfaces: CodeSurface[] = []\n for (let i = 0; i < ctx.populationSize; i++) {\n if (ctx.signal.aborted) break\n const wt = await opts.worktree.create({\n baseRef,\n label: `${opts.generator.kind}-gen${ctx.generation}-cand${i}`,\n })\n // Once a worktree exists it MUST be accounted for: finalized into a\n // surface, or discarded. A throw from generate()/finalize() must not\n // leak the worktree + branch — discard best-effort, then rethrow loud.\n try {\n const { applied, summary } = await opts.generator.generate({\n worktreePath: wt.path,\n report: ctx.report,\n findings,\n dataset: ctx.dataset,\n maxShots: ctx.maxImprovementShots ?? 1,\n signal: ctx.signal,\n })\n if (!applied) {\n await opts.worktree.discard(wt)\n continue\n }\n surfaces.push(await opts.worktree.finalize(wt, summary))\n } catch (err) {\n // Best-effort cleanup; never mask the original failure.\n await opts.worktree.discard(wt).catch(() => {})\n throw err\n }\n }\n return surfaces\n },\n }\n}\n\n/** Phase-2 report carries `findings` when present; else fall back to the\n * loop's `ctx.findings`. The report is opaque to the substrate, so probe it\n * structurally. 
*/\nfunction resolveFindings(ctx: ProposeContext<AnalystFinding>): AnalystFinding[] {\n const report = ctx.report\n if (report && typeof report === 'object' && 'findings' in report) {\n const f = (report as { findings: unknown }).findings\n if (Array.isArray(f) && f.length > 0) return f as AnalystFinding[]\n }\n return ctx.findings\n}\n","/**\n * @experimental\n *\n * `reflectiveGenerator` — the cheap, no-sandbox `CandidateGenerator`. It drafts\n * surface edits via the existing improvement adapter (`proposeFromFindings`,\n * one LLM patch per finding) and applies them as ONE coherent improvement into\n * the candidate worktree. `maxShots` is ignored — reflection is single-shot by\n * construction (the patches are already drafted).\n *\n * This is the `shots=1, sandbox=off` setting of the one improvement driver.\n * The `agenticGenerator` (sandbox runLoop) is the `shots=N, sandbox=on`\n * setting — both plug into the same `improvementDriver`.\n */\n\nimport { spawnSync } from 'node:child_process'\nimport type { SurfaceImprovementEdit } from '../agent/improvement-adapter'\nimport type { ImprovementAdapter } from '../analyst-loop/types'\nimport type { CandidateGenerator } from './improvement-driver'\n\nexport interface ReflectiveGeneratorOptions {\n improvementAdapter: ImprovementAdapter<SurfaceImprovementEdit>\n}\n\nexport function reflectiveGenerator(opts: ReflectiveGeneratorOptions): CandidateGenerator {\n return {\n kind: 'reflective',\n async generate({ worktreePath, findings }) {\n const batch = await opts.improvementAdapter.proposeFromFindings(findings)\n if (batch.edits.length === 0) return { applied: false, summary: '' }\n\n let applied = 0\n for (const edit of batch.edits) {\n if (applyPatch(edit.patch, worktreePath)) applied++\n }\n if (applied === 0) return { applied: false, summary: '' }\n\n const summary =\n batch.edits.length === 1\n ? batch.edits[0]!.summary\n : `analyst: ${applied} surface edit${applied === 1 ? 
'' : 's'}`\n return { applied: true, summary }\n },\n }\n}\n\n/** Mirror the improvement adapter's proven apply invocation, run inside the\n * candidate worktree (a fresh checkout of baseRef, so `-p0` paths match). */\nfunction applyPatch(patch: string, cwd: string): boolean {\n const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], {\n cwd,\n input: patch,\n encoding: 'utf-8',\n })\n return result.status === 0\n}\n"],"mappings":";;;;;;;;;;AAuBA,SAAS,iBAAiB;AAmBnB,SAAS,iBAAiB,OAAgC,CAAC,GAAuB;AACvF,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,cAAc,KAAK,eAAe;AACxC,QAAM,MAAM,KAAK,cAAc;AAC/B,QAAM,QAAQ,KAAK,WAAW;AAE9B,SAAO;AAAA,IACL,MAAM,WAAW,OAAO;AAAA,IACxB,MAAM,SAAS,EAAE,cAAc,QAAQ,UAAU,UAAU,OAAO,GAAG;AACnE,UAAI,SAAS,YAAY,EAAE,QAAQ,SAAS,CAAC;AAC7C,YAAM,QAAQ,KAAK,IAAI,GAAG,QAAQ;AAElC,eAAS,OAAO,GAAG,OAAO,OAAO,QAAQ;AACvC,YAAI,OAAO,QAAS;AACpB,cAAM,IAAI;AAAA,UACR;AAAA,UACA,KAAK;AAAA,UACL,YAAY;AAAA,UACZ,WAAW,KAAK;AAAA,UAChB;AAAA,QACF,CAAC;AAGD,YAAI,MAAM,YAAY,GAAG;AACvB,iBAAO,EAAE,SAAS,MAAM,SAAS,UAAU,QAAQ,EAAE;AAAA,QACvD;AAEA,iBAAS,OAAO,MAAM;AAAA,MACxB;AACA,aAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAAA,IACvC;AAAA,EACF;AACF;AAGA,SAAS,mBAAmB,MAA+D;AACzF,QAAM,QAAkB;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACA,aAAW,KAAK,KAAK,UAAU;AAC7B,UAAM,QAAQ,EAAE,UAAU,KAAK,EAAE,OAAO,MAAM;AAC9C,UAAM,KAAK,MAAM,EAAE,QAAQ,IAAI,KAAK,IAAI,EAAE,KAAK,EAAE;AACjD,QAAI,EAAE,mBAAoB,OAAM,KAAK,cAAS,EAAE,kBAAkB,EAAE;AAAA,EACtE;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,OAAO,QAAwB;AACtC,SAAO,GAAG,MAAM;AAAA;AAAA;AAClB;AAGA,SAAS,UAAU,UAAoC;AACrD,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,MAAI,SAAS,WAAW,EAAG,QAAO,YAAY,SAAS,SAAS,CAAC,EAAG,OAAO,EAAE,CAAC;AAC9E,SAAO,YAAY,SAAS,MAAM;AACpC;AAEA,SAAS,SAAS,GAAW,GAAmB;AAC9C,SAAO,EAAE,UAAU,IAAI,IAAI,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC,CAAC;AACjD;AAOA,SAAS,cAAc,cAA+B;AACpD,QAAM,SAAS,UAAU,OAAO,CAAC,UAAU,aAAa,GAAG;AAAA,IACzD,KAAK;AAAA,IACL,UAAU;AAAA,EACZ,CAAC;AACD,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI;AAAA,MACR,mDAAmD,YAAY,KAAK,OAAO,MAAM,OAAO;AAAA,IAC1F;AAAA,EACF;AACA,MAAI,OAAO,WAAW,G
AAG;AACvB,UAAM,IAAI;AAAA,MACR,uCAAuC,OAAO,MAAM,OAAO,YAAY,KAAK,OAAO,OAAO,KAAK,CAAC;AAAA,IAClG;AAAA,EACF;AACA,SAAO,OAAO,OAAO,KAAK,EAAE,SAAS;AACvC;;;ACvEO,SAAS,kBACd,MACmC;AACnC,QAAM,UAAU,KAAK,WAAW;AAEhC,SAAO;AAAA,IACL,MAAM,eAAe,KAAK,UAAU,IAAI;AAAA,IACxC,MAAM,QAAQ,KAAK;AACjB,YAAM,WAAW,gBAAgB,GAAG;AAEpC,UAAI,SAAS,WAAW,KAAK,IAAI,WAAW,OAAW,QAAO,CAAC;AAE/D,YAAM,WAA0B,CAAC;AACjC,eAAS,IAAI,GAAG,IAAI,IAAI,gBAAgB,KAAK;AAC3C,YAAI,IAAI,OAAO,QAAS;AACxB,cAAM,KAAK,MAAM,KAAK,SAAS,OAAO;AAAA,UACpC;AAAA,UACA,OAAO,GAAG,KAAK,UAAU,IAAI,OAAO,IAAI,UAAU,QAAQ,CAAC;AAAA,QAC7D,CAAC;AAID,YAAI;AACF,gBAAM,EAAE,SAAS,QAAQ,IAAI,MAAM,KAAK,UAAU,SAAS;AAAA,YACzD,cAAc,GAAG;AAAA,YACjB,QAAQ,IAAI;AAAA,YACZ;AAAA,YACA,SAAS,IAAI;AAAA,YACb,UAAU,IAAI,uBAAuB;AAAA,YACrC,QAAQ,IAAI;AAAA,UACd,CAAC;AACD,cAAI,CAAC,SAAS;AACZ,kBAAM,KAAK,SAAS,QAAQ,EAAE;AAC9B;AAAA,UACF;AACA,mBAAS,KAAK,MAAM,KAAK,SAAS,SAAS,IAAI,OAAO,CAAC;AAAA,QACzD,SAAS,KAAK;AAEZ,gBAAM,KAAK,SAAS,QAAQ,EAAE,EAAE,MAAM,MAAM;AAAA,UAAC,CAAC;AAC9C,gBAAM;AAAA,QACR;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAKA,SAAS,gBAAgB,KAAuD;AAC9E,QAAM,SAAS,IAAI;AACnB,MAAI,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAChE,UAAM,IAAK,OAAiC;AAC5C,QAAI,MAAM,QAAQ,CAAC,KAAK,EAAE,SAAS,EAAG,QAAO;AAAA,EAC/C;AACA,SAAO,IAAI;AACb;;;ACrGA,SAAS,aAAAA,kBAAiB;AASnB,SAAS,oBAAoB,MAAsD;AACxF,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,SAAS,EAAE,cAAc,SAAS,GAAG;AACzC,YAAM,QAAQ,MAAM,KAAK,mBAAmB,oBAAoB,QAAQ;AACxE,UAAI,MAAM,MAAM,WAAW,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAEnE,UAAI,UAAU;AACd,iBAAW,QAAQ,MAAM,OAAO;AAC9B,YAAI,WAAW,KAAK,OAAO,YAAY,EAAG;AAAA,MAC5C;AACA,UAAI,YAAY,EAAG,QAAO,EAAE,SAAS,OAAO,SAAS,GAAG;AAExD,YAAM,UACJ,MAAM,MAAM,WAAW,IACnB,MAAM,MAAM,CAAC,EAAG,UAChB,YAAY,OAAO,gBAAgB,YAAY,IAAI,KAAK,GAAG;AACjE,aAAO,EAAE,SAAS,MAAM,QAAQ;AAAA,IAClC;AAAA,EACF;AACF;AAIA,SAAS,WAAW,OAAe,KAAsB;AACvD,QAAM,SAASA,WAAU,OAAO,CAAC,SAAS,oBAAoB,OAAO,GAAG,GAAG;AAAA,IACzE;AAAA,IACA,OAAO;AAAA,IACP,UAAU;AAAA,EACZ,CAAC;AACD,SAAO,OAAO,WAAW;AAC3B;","names":["spawnSync"]}
|
package/dist/index.d.ts
CHANGED
|
@@ -2,10 +2,14 @@ import { AgentEvalError, KnowledgeReadinessReport, RunRecord, ControlEvalResult,
|
|
|
2
2
|
export { AgentEvalError, AgentEvalErrorCode, ConfigError, ControlBudget, ControlDecision, ControlEvalResult, ControlRunResult, ControlStep, DataAcquisitionPlan, JudgeError, KnowledgeReadinessReport, KnowledgeRequirement, NotFoundError, RunRecord, ValidationError } from '@tangle-network/agent-eval';
|
|
3
3
|
import { a as AgentBackendInput, b as AgentExecutionBackend, O as OpenAIChatTool, c as OpenAIChatToolChoice, d as AgentBackendContext, R as RuntimeStreamEvent, K as KnowledgeReadinessDecision, e as RunAgentTaskOptions, f as AgentTaskRunResult, g as RunAgentTaskStreamOptions, h as AgentRuntimeEvent, i as AgentTaskStatus, j as RuntimeSessionStore, k as RuntimeSession } from './types-CsCCryln.js';
|
|
4
4
|
export { l as AgentAdapter, m as AgentKnowledgeProvider, n as AgentRuntimeEventSink, o as AgentTaskContext, A as AgentTaskSpec, B as BackendErrorDetail } from './types-CsCCryln.js';
|
|
5
|
-
import {
|
|
6
|
-
|
|
7
|
-
import {
|
|
8
|
-
|
|
5
|
+
import { Scenario } from '@tangle-network/agent-eval/campaign';
|
|
6
|
+
import { R as RunAnalystLoopOpts, a as RunAnalystLoopResult } from './types-p8dWBIXL.js';
|
|
7
|
+
import { O as OptimizePromptOptions, b as OptimizePromptResult } from './optimize-prompt-cmH9wZdH.js';
|
|
8
|
+
import { T as TopologyPlanner, D as DynamicDecision } from './dynamic-DeOPeeAw.js';
|
|
9
|
+
import { L as LoopSandboxClient, O as OutputAdapter, V as Validator, A as AgentRunSpec, b as LoopResult } from './types-CmkQl8qE.js';
|
|
10
|
+
export { R as RuntimeRunHandle, p as RuntimeRunPersistenceAdapter, q as RuntimeRunRow, s as startRuntimeRun } from './types-CmkQl8qE.js';
|
|
11
|
+
import { d as DelegateCodeArgs, t as CoderReviewer, u as CoderWinnerSelection, A as FactCandidate, w as CreateKbGateOptions } from './otel-export-CNmeg_7B.js';
|
|
12
|
+
export { U as EvalRunEvent, V as EvalRunGeneration, W as EvalRunsExportConfig, X as EvalRunsExportResult, Y as INTELLIGENCE_WIRE_VERSION, Z as OtelAttribute, _ as OtelExportConfig, O as OtelExporter, $ as OtelSpan, a0 as buildLoopOtelSpans, a1 as createOtelExporter, a2 as exportEvalRuns, a3 as loopEventToOtelSpan, Q as mcpToolsForRuntimeMcp, T as mcpToolsForRuntimeMcpSubset } from './otel-export-CNmeg_7B.js';
|
|
9
13
|
import { CoderOutput } from './profiles.js';
|
|
10
14
|
import '@tangle-network/sandbox';
|
|
11
15
|
|
|
@@ -1089,6 +1093,64 @@ declare function coderLoopRunner(options: CoderLoopRunnerOptions): DelegatedLoop
|
|
|
1089
1093
|
declare function reviewLoopRunner(options: CoderLoopRunnerOptions & {
|
|
1090
1094
|
reviewer: CoderReviewer;
|
|
1091
1095
|
}): DelegatedLoopRunner<CoderOutput>;
|
|
1096
|
+
/** @experimental Options for the default `dynamic` runner; `dynamicLoopRunner` forwards them to `createDynamicDriver` and `runLoop`. */
|
|
1097
|
+
interface DynamicLoopRunnerOptions<Task, Output> {
|
|
1098
|
+
/** Handed to `runLoop` as `ctx.sandboxClient`, alongside the runner's abort signal. */
sandboxClient: LoopSandboxClient;
|
|
1099
|
+
/** The agent-authored topology planner (e.g. `createSandboxPlanner(...)`); passed to the dynamic driver. */
|
|
1100
|
+
planner: TopologyPlanner<Task, Output>;
|
|
1101
|
+
/** Passed through to `runLoop` unchanged. */
task: Task;
|
|
1102
|
+
/** Passed through to `runLoop` unchanged. */
output: OutputAdapter<Output>;
|
|
1103
|
+
/** Optional; forwarded to `runLoop` only when provided. */
validator?: Validator<Output>;
|
|
1104
|
+
/** Exactly one of `agentRun` / `agentRuns` (runLoop validates). Each is forwarded only when truthy. */
|
|
1105
|
+
agentRun?: AgentRunSpec<Task>;
|
|
1106
|
+
agentRuns?: AgentRunSpec<Task>[];
|
|
1107
|
+
/** When set, forwarded to both the dynamic driver and `runLoop`; omitted otherwise. */
maxIterations?: number;
|
|
1108
|
+
/** When set, forwarded to the dynamic driver; omitted otherwise. */
maxFanout?: number;
|
|
1109
|
+
}
|
|
1110
|
+
/** @experimental `dynamic` mode — agent-authored topology over `runLoop`. The returned runner takes the abort signal, builds a driver with `createDynamicDriver` from `o.planner`, and resolves to the `runLoop` result. */
|
|
1111
|
+
declare function dynamicLoopRunner<Task, Output>(o: DynamicLoopRunnerOptions<Task, Output>): DelegatedLoopRunner<LoopResult<Task, Output, DynamicDecision>>;
|
|
1112
|
+
/** @experimental A fact rejected at the KB gate — surfaced, never dropped. Built by the `research` runner from the rejected candidate and the gate's verdict. */
|
|
1113
|
+
interface VetoedFact {
|
|
1114
|
+
/** The rejected candidate, exactly as the `research` callback returned it. */
candidate: FactCandidate;
|
|
1115
|
+
/** Copied from the gate verdict's `vetoedBy` field. */
vetoedBy?: string;
|
|
1116
|
+
/** Copied from the gate verdict's `reason` field. */
reason?: string;
|
|
1117
|
+
}
|
|
1118
|
+
/** @experimental Value the `research` runner resolves to: accepted facts, the outstanding vetoes, and the number of rounds run. */
|
|
1119
|
+
interface ResearchLoopResult {
|
|
1120
|
+
/** Facts that passed the fail-closed gate — safe to write to the KB. Accumulated across all rounds. */
|
|
1121
|
+
accepted: FactCandidate[];
|
|
1122
|
+
/** Facts the gate vetoed in the final round — escalate, do not silently drop. If a later round returns no candidates, the previous round's vetoes are kept here. */
|
|
1123
|
+
vetoed: VetoedFact[];
|
|
1124
|
+
/** Research rounds actually run. Counts every call to `research`, including one that returns no candidates. */
|
|
1125
|
+
rounds: number;
|
|
1126
|
+
}
|
|
1127
|
+
/** @experimental Options for the default `research` runner (`researchLoopRunner`). */
|
|
1128
|
+
interface ResearchLoopRunnerOptions {
|
|
1129
|
+
/**
|
|
1130
|
+
* The research engine (the consumer's web/doc searcher + extractor). Called
|
|
1131
|
+
* each round with the prior round's vetoes so it can re-research the gaps.
|
|
1132
|
+
* Returns fact candidates carrying their grounding (`verbatimPassage` +
|
|
1133
|
+
* `sourceText`). `round` is zero-based; the first call receives an empty
* `vetoed` array. Returning an empty array ends the loop.
|
|
1134
|
+
*/
|
|
1135
|
+
research: (round: number, vetoed: VetoedFact[]) => Promise<FactCandidate[]>;
|
|
1136
|
+
/** Gate config (extra judges, self-artifact kinds, …). The floor is always on. Passed to `createKbGate`. */
|
|
1137
|
+
gate?: CreateKbGateOptions;
|
|
1138
|
+
/** Max research rounds (correct-on-veto remediation). Default 1. Truncated to an integer; values below 1 are raised to 1. */
|
|
1139
|
+
maxRounds?: number;
|
|
1140
|
+
}
|
|
1141
|
+
/**
|
|
1142
|
+
* @experimental `research` mode — research-in-a-loop with valid-only KB growth.
|
|
1143
|
+
*
|
|
1144
|
+
* Each round: research → gate every candidate (fail-closed; passage MUST be in
|
|
1145
|
+
* the source) → accept the clean ones → re-research the vetoed ones next round,
|
|
1146
|
+
* up to `maxRounds`. Vetoed facts in the final round are RETURNED (escalate,
|
|
1147
|
+
* never silently dropped) so the caller can audit or retry them.
*
* The loop also ends early when the abort signal is already set at the start
* of a round, when `research` returns no candidates, or when a round produces
* no vetoes.
|
|
1148
|
+
*/
|
|
1149
|
+
declare function researchLoopRunner(o: ResearchLoopRunnerOptions): DelegatedLoopRunner<ResearchLoopResult>;
|
|
1150
|
+
/** @experimental `self-improve` mode — identity-gated prompt optimization. The returned runner calls `optimizePrompt(options)`; it declares no parameters, so it does not read an abort signal. */
|
|
1151
|
+
declare function selfImproveLoopRunner<TScenario extends Scenario, TArtifact>(options: OptimizePromptOptions<TScenario, TArtifact>): DelegatedLoopRunner<OptimizePromptResult<TArtifact, TScenario>>;
|
|
1152
|
+
/** @experimental `audit` mode — analyst loop over captured trace/run data. The returned runner calls `runAnalystLoop(options)`; it declares no parameters, so it does not read an abort signal. */
|
|
1153
|
+
declare function auditLoopRunner<TProposal = unknown, TEdit = unknown>(options: RunAnalystLoopOpts): DelegatedLoopRunner<RunAnalystLoopResult<TProposal, TEdit>>;
|
|
1092
1154
|
|
|
1093
1155
|
/**
|
|
1094
1156
|
* @stable
|
|
@@ -1371,4 +1433,4 @@ declare function readinessServerSentEvent(report: KnowledgeReadinessReport, opti
|
|
|
1371
1433
|
/** @stable */
|
|
1372
1434
|
declare function runtimeStreamServerSentEvent(event: RuntimeStreamEvent, options?: RuntimeTelemetryOptions & ServerSentEventOptions): string;
|
|
1373
1435
|
|
|
1374
|
-
export { AgentBackendContext, AgentBackendInput, AgentExecutionBackend, AgentRuntimeEvent, AgentTaskRunResult, AgentTaskStatus, type AuthSource, type BackendCallPolicy, BackendTransportError, type ChatStreamEvent, type ChatTurnHooks, type ChatTurnIdentity, type ChatTurnProducer, type ChatTurnResult, type CircuitBreakerConfig, CircuitBreakerState, CircuitOpenError, type CoderLoopRunnerOptions, type Conversation, type ConversationDriveState, type ConversationJournal, type ConversationJournalEntry, type ConversationParticipant, type ConversationPolicy, type ConversationResult, type ConversationStreamEvent, type ConversationTurn, type D1DatabaseLike, type D1StmtLike, DEFAULT_MAX_DEPTH, DEFAULT_ROUTER_BASE_URL, DeadlineExceededError, type DelegatedLoopMode, type DelegatedLoopRegistry, type DelegatedLoopResult, type DelegatedLoopRunner, FORWARD_HEADERS, FileConversationJournal, type ForwardHeaderName, type HaltContext, type HaltPredicate, type HaltReason, type HaltSignal, InMemoryConversationJournal, InMemoryRuntimeSessionStore, type ModelInfo, OpenAIChatTool, OpenAIChatToolChoice, PlannerError, type PropagatedHeaders, type ResolvedChatModel, type RetryBackoff, type RetryableErrorPredicate, type RouterEnv, type RunChatTurnInput, type RunConversationOptions, type RunDelegatedLoopOptions, type RuntimeEventCollector, RuntimeRunStateError, RuntimeSessionStore, RuntimeStreamEvent, type RuntimeStreamEventCollector, type RuntimeTelemetryOptions, type SanitizedKnowledgeReadinessReport, type SqlAdapter, SqlConversationJournal, type TurnOrder, applyRunRecordDefaults, buildForwardHeaders, cleanModelId, coderLoopRunner, computeBackoff, createConversationBackend, createIterableBackend, createOpenAICompatibleBackend, createRuntimeEventCollector, createRuntimeStreamEventCollector, createSandboxPromptBackend, d1ToSqlAdapter, decideKnowledgeReadiness, defaultIsRetryable, defineConversation, deriveExecutionId, getModels, handleChatTurn, isDepthExceeded, makePerAttemptSignal, readDepth, 
readinessServerSentEvent, resolveChatModel, resolveRouterBaseUrl, reviewLoopRunner, runAgentTask, runAgentTaskStream, runConversation, runConversationStream, runDelegatedLoop, runtimeStreamServerSentEvent, sanitizeAgentRuntimeEvent, sanitizeKnowledgeReadinessReport, sanitizeRuntimeStreamEvent, sleep, slugifySpeaker, turnId, validateChatModelId };
|
|
1436
|
+
export { AgentBackendContext, AgentBackendInput, AgentExecutionBackend, AgentRuntimeEvent, AgentTaskRunResult, AgentTaskStatus, type AuthSource, type BackendCallPolicy, BackendTransportError, type ChatStreamEvent, type ChatTurnHooks, type ChatTurnIdentity, type ChatTurnProducer, type ChatTurnResult, type CircuitBreakerConfig, CircuitBreakerState, CircuitOpenError, type CoderLoopRunnerOptions, type Conversation, type ConversationDriveState, type ConversationJournal, type ConversationJournalEntry, type ConversationParticipant, type ConversationPolicy, type ConversationResult, type ConversationStreamEvent, type ConversationTurn, type D1DatabaseLike, type D1StmtLike, DEFAULT_MAX_DEPTH, DEFAULT_ROUTER_BASE_URL, DeadlineExceededError, type DelegatedLoopMode, type DelegatedLoopRegistry, type DelegatedLoopResult, type DelegatedLoopRunner, type DynamicLoopRunnerOptions, FORWARD_HEADERS, FileConversationJournal, type ForwardHeaderName, type HaltContext, type HaltPredicate, type HaltReason, type HaltSignal, InMemoryConversationJournal, InMemoryRuntimeSessionStore, type ModelInfo, OpenAIChatTool, OpenAIChatToolChoice, PlannerError, type PropagatedHeaders, type ResearchLoopResult, type ResearchLoopRunnerOptions, type ResolvedChatModel, type RetryBackoff, type RetryableErrorPredicate, type RouterEnv, type RunChatTurnInput, type RunConversationOptions, type RunDelegatedLoopOptions, type RuntimeEventCollector, RuntimeRunStateError, RuntimeSessionStore, RuntimeStreamEvent, type RuntimeStreamEventCollector, type RuntimeTelemetryOptions, type SanitizedKnowledgeReadinessReport, type SqlAdapter, SqlConversationJournal, type TurnOrder, type VetoedFact, applyRunRecordDefaults, auditLoopRunner, buildForwardHeaders, cleanModelId, coderLoopRunner, computeBackoff, createConversationBackend, createIterableBackend, createOpenAICompatibleBackend, createRuntimeEventCollector, createRuntimeStreamEventCollector, createSandboxPromptBackend, d1ToSqlAdapter, decideKnowledgeReadiness, 
defaultIsRetryable, defineConversation, deriveExecutionId, dynamicLoopRunner, getModels, handleChatTurn, isDepthExceeded, makePerAttemptSignal, readDepth, readinessServerSentEvent, researchLoopRunner, resolveChatModel, resolveRouterBaseUrl, reviewLoopRunner, runAgentTask, runAgentTaskStream, runConversation, runConversationStream, runDelegatedLoop, runtimeStreamServerSentEvent, sanitizeAgentRuntimeEvent, sanitizeKnowledgeReadinessReport, sanitizeRuntimeStreamEvent, selfImproveLoopRunner, sleep, slugifySpeaker, turnId, validateChatModelId };
|
package/dist/index.js
CHANGED
|
@@ -1,16 +1,26 @@
|
|
|
1
|
+
import {
|
|
2
|
+
runAnalystLoop
|
|
3
|
+
} from "./chunk-XBUG326M.js";
|
|
4
|
+
import {
|
|
5
|
+
optimizePrompt
|
|
6
|
+
} from "./chunk-VOX6Z3II.js";
|
|
1
7
|
import {
|
|
2
8
|
INTELLIGENCE_WIRE_VERSION,
|
|
3
9
|
buildLoopOtelSpans,
|
|
10
|
+
createKbGate,
|
|
4
11
|
createOtelExporter,
|
|
5
12
|
exportEvalRuns,
|
|
6
13
|
loopEventToOtelSpan,
|
|
7
14
|
mcpToolsForRuntimeMcp,
|
|
8
15
|
mcpToolsForRuntimeMcpSubset
|
|
9
|
-
} from "./chunk-
|
|
16
|
+
} from "./chunk-Z523NPJK.js";
|
|
10
17
|
import {
|
|
11
18
|
createDefaultCoderDelegate
|
|
12
19
|
} from "./chunk-V6GURW4W.js";
|
|
13
|
-
import
|
|
20
|
+
import {
|
|
21
|
+
createDynamicDriver,
|
|
22
|
+
runLoop
|
|
23
|
+
} from "./chunk-7JBDJQLO.js";
|
|
14
24
|
import "./chunk-3HMHSN22.js";
|
|
15
25
|
import "./chunk-PY6NMZYX.js";
|
|
16
26
|
import {
|
|
@@ -1768,6 +1778,51 @@ function coderLoopRunner(options) {
|
|
|
1768
1778
|
function reviewLoopRunner(options) {
|
|
1769
1779
|
return coderLoopRunner(options);
|
|
1770
1780
|
}
|
|
1781
|
+
/**
 * Build the `dynamic` mode runner.
 *
 * Every invocation of the returned runner creates its own dynamic driver from
 * `o.planner` and hands it to `runLoop`. Optional settings (`maxIterations`,
 * `maxFanout`, `agentRun`, `agentRuns`, `validator`) are only added to the
 * call when set, so the callee never receives an explicit `undefined` key.
 *
 * @param o Runner options: planner, task, output adapter, sandbox client and
 *   the optional knobs listed above.
 * @returns An async runner that takes the caller's abort signal and resolves
 *   to whatever `runLoop` resolves to.
 */
function dynamicLoopRunner(o) {
  return async (signal) => {
    // Built per invocation so each run gets a fresh driver instance.
    const driver = createDynamicDriver({
      planner: o.planner,
      ...(o.maxIterations === undefined ? {} : { maxIterations: o.maxIterations }),
      ...(o.maxFanout === undefined ? {} : { maxFanout: o.maxFanout }),
    });

    return runLoop({
      driver,
      // Spreading a falsy value adds no keys, matching the original `cond ? {...} : {}`.
      ...(o.agentRun && { agentRun: o.agentRun }),
      ...(o.agentRuns && { agentRuns: o.agentRuns }),
      output: o.output,
      ...(o.validator && { validator: o.validator }),
      task: o.task,
      ctx: { sandboxClient: o.sandboxClient, signal },
      // The iteration cap is given to the loop as well as to the driver.
      ...(o.maxIterations === undefined ? {} : { maxIterations: o.maxIterations }),
    });
  };
}
|
|
1797
|
+
/**
 * Build the `research` mode runner: research → gate → re-research the vetoes.
 *
 * The gate is created once from `o.gate`. Each round calls
 * `o.research(round, vetoed)` with a zero-based round index and the previous
 * round's vetoes, then runs every returned candidate through the gate.
 * Accepted candidates accumulate across rounds; vetoes are replaced each round.
 *
 * The loop stops when the round budget is used up, when the signal is already
 * aborted at the start of a round, when `research` returns no candidates (the
 * previous round's vetoes are kept so they are still reported), or when a
 * round produces no vetoes.
 *
 * @param o Runner options: `research` callback, optional `gate` config and
 *   optional `maxRounds` (default 1).
 * @returns An async runner taking the abort signal and resolving to
 *   `{ accepted, vetoed, rounds }`.
 */
function researchLoopRunner(o) {
  const gate = createKbGate(o.gate);
  // `Math.max(1, NaN)` is NaN, and `round < NaN` is always false, so a
  // non-numeric `maxRounds` used to run zero rounds. Fall back to the documented
  // default of one round instead. Infinity is left as-is (unbounded retries).
  const requestedRounds = Math.trunc(o.maxRounds ?? 1);
  const maxRounds = Number.isNaN(requestedRounds) ? 1 : Math.max(1, requestedRounds);
  return async (signal) => {
    const accepted = [];
    let vetoed = [];
    let rounds = 0;
    for (let round = 0; round < maxRounds; round += 1) {
      if (signal.aborted) break;
      rounds += 1;
      const candidates = await o.research(round, vetoed);
      // Nothing new to gate: stop, keeping the last vetoes for the caller.
      if (candidates.length === 0) break;
      vetoed = [];
      // Gate sequentially so `accepted` / `vetoed` keep candidate order.
      for (const c of candidates) {
        const v = await gate(c);
        if (v.accepted) accepted.push(c);
        else vetoed.push({ candidate: c, vetoedBy: v.vetoedBy, reason: v.reason });
      }
      // A clean round leaves nothing to remediate.
      if (vetoed.length === 0) break;
    }
    return { accepted, vetoed, rounds };
  };
}
|
|
1820
|
+
/**
 * Build the `self-improve` mode runner.
 *
 * @param options Passed to `optimizePrompt` as-is on every invocation.
 * @returns An async runner resolving to the `optimizePrompt` result. It
 *   declares no parameters, so no abort signal is forwarded.
 */
function selfImproveLoopRunner(options) {
  return async function () {
    return optimizePrompt(options);
  };
}
|
|
1823
|
+
/**
 * Build the `audit` mode runner.
 *
 * @param options Passed to `runAnalystLoop` as-is on every invocation.
 * @returns An async runner resolving to the `runAnalystLoop` result. It
 *   declares no parameters, so no abort signal is forwarded.
 */
function auditLoopRunner(options) {
  return async function () {
    return runAnalystLoop(options);
  };
}
|
|
1771
1826
|
|
|
1772
1827
|
// src/model-resolution.ts
|
|
1773
1828
|
var DEFAULT_ROUTER_BASE_URL = "https://router.tangle.tools";
|
|
@@ -2774,6 +2829,7 @@ export {
|
|
|
2774
2829
|
SqlConversationJournal,
|
|
2775
2830
|
ValidationError,
|
|
2776
2831
|
applyRunRecordDefaults,
|
|
2832
|
+
auditLoopRunner,
|
|
2777
2833
|
buildForwardHeaders,
|
|
2778
2834
|
buildLoopOtelSpans,
|
|
2779
2835
|
cleanModelId,
|
|
@@ -2791,6 +2847,7 @@ export {
|
|
|
2791
2847
|
defaultIsRetryable,
|
|
2792
2848
|
defineConversation,
|
|
2793
2849
|
deriveExecutionId,
|
|
2850
|
+
dynamicLoopRunner,
|
|
2794
2851
|
exportEvalRuns,
|
|
2795
2852
|
getModels,
|
|
2796
2853
|
handleChatTurn,
|
|
@@ -2801,6 +2858,7 @@ export {
|
|
|
2801
2858
|
mcpToolsForRuntimeMcpSubset,
|
|
2802
2859
|
readDepth,
|
|
2803
2860
|
readinessServerSentEvent,
|
|
2861
|
+
researchLoopRunner,
|
|
2804
2862
|
resolveChatModel,
|
|
2805
2863
|
resolveRouterBaseUrl,
|
|
2806
2864
|
reviewLoopRunner,
|
|
@@ -2813,6 +2871,7 @@ export {
|
|
|
2813
2871
|
sanitizeAgentRuntimeEvent,
|
|
2814
2872
|
sanitizeKnowledgeReadinessReport,
|
|
2815
2873
|
sanitizeRuntimeStreamEvent,
|
|
2874
|
+
selfImproveLoopRunner,
|
|
2816
2875
|
sleep2 as sleep,
|
|
2817
2876
|
slugifySpeaker,
|
|
2818
2877
|
startRuntimeRun,
|