@tangle-network/agent-runtime 0.47.0 → 0.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +79 -15
  2. package/dist/agent.js +1 -1
  3. package/dist/chunk-GHX7XOJ2.js +433 -0
  4. package/dist/chunk-GHX7XOJ2.js.map +1 -0
  5. package/dist/{chunk-T4OQQEE3.js → chunk-IQS4HI3F.js} +14 -5
  6. package/dist/chunk-IQS4HI3F.js.map +1 -0
  7. package/dist/{chunk-72JQCHOZ.js → chunk-PXUTIMGJ.js} +2318 -237
  8. package/dist/chunk-PXUTIMGJ.js.map +1 -0
  9. package/dist/{chunk-MGFEUYOH.js → chunk-U2VEWKKK.js} +3 -3
  10. package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
  11. package/dist/chunk-VIEDXELL.js.map +1 -0
  12. package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
  13. package/dist/index.d.ts +29 -4
  14. package/dist/index.js +109 -21
  15. package/dist/index.js.map +1 -1
  16. package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
  17. package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
  18. package/dist/loop-runner-bin.d.ts +2 -2
  19. package/dist/loop-runner-bin.js +3 -3
  20. package/dist/loops.d.ts +3 -3
  21. package/dist/loops.js +57 -1
  22. package/dist/mcp/bin.js +187 -24
  23. package/dist/mcp/bin.js.map +1 -1
  24. package/dist/mcp/index.d.ts +28 -125
  25. package/dist/mcp/index.js +28 -6
  26. package/dist/mcp/index.js.map +1 -1
  27. package/dist/platform.js +2 -2
  28. package/dist/platform.js.map +1 -1
  29. package/dist/runtime.d.ts +1100 -62
  30. package/dist/runtime.js +57 -1
  31. package/dist/{types-Cbx3dNK5.d.ts → types-BpDfCPUp.d.ts} +1 -1
  32. package/dist/workflow.js +1 -1
  33. package/package.json +7 -6
  34. package/dist/chunk-5YDS7BLC.js +0 -218
  35. package/dist/chunk-5YDS7BLC.js.map +0 -1
  36. package/dist/chunk-72JQCHOZ.js.map +0 -1
  37. package/dist/chunk-JNPK46YH.js.map +0 -1
  38. package/dist/chunk-T4OQQEE3.js.map +0 -1
  39. package/dist/kb-gate-51BlLlVM.d.ts +0 -529
  40. /package/dist/{chunk-MGFEUYOH.js.map → chunk-U2VEWKKK.js.map} +0 -0
  41. /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
package/README.md CHANGED
@@ -2,7 +2,10 @@
2
2
 
3
3
  The shared task-lifecycle skeleton for agents. It runs an agent (a chat turn, a one-shot task, or a multi-attempt loop), captures every run as a trace, and feeds those traces into eval-gated self-improvement.
4
4
 
5
- It owns the lifecycle and the loop kernel. It delegates domain behavior (models, tools, knowledge) to adapters, scoring and the ship gate to [`@tangle-network/agent-eval`](https://www.npmjs.com/package/@tangle-network/agent-eval), and sandboxed long-running execution to [`@tangle-network/sandbox`](https://www.npmjs.com/package/@tangle-network/sandbox).
5
+ It owns the lifecycle, the loop kernel, and the **optimization suite** — `Environment` + `Strategy` +
6
+ `runBenchmark` + `runStrategyEvolution`, the published surface for measuring and evolving how an agent
7
+ spends compute against a deployable check. It delegates domain behavior (models, tools, knowledge) to
8
+ adapters, scoring statistics and the ship gate to [`@tangle-network/agent-eval`](https://www.npmjs.com/package/@tangle-network/agent-eval), and sandboxed long-running execution to [`@tangle-network/sandbox`](https://www.npmjs.com/package/@tangle-network/sandbox).
6
9
 
7
10
  ```bash
8
11
  pnpm add @tangle-network/agent-runtime @tangle-network/agent-eval @tangle-network/sandbox
@@ -53,8 +56,9 @@ That is the common case. Everything below is for when one chat turn is not enoug
53
56
  | Run a production chat turn (most products) | `handleChatTurn` | root |
54
57
  | Declare an agent (profile, surfaces, adapters) | `defineAgent` | `/agent` |
55
58
  | Run a one-shot task with verification and eval | `runAgentTask` | root |
56
- | Run a multi-attempt loop (refine or fanout-vote) | `runLoop` plus a driver | `/loops` |
57
- | Let the agent choose the loop shape per round | `createDriver` plus `createSandboxPlanner` | `/loops` |
59
+ | Compare optimization strategies on YOUR domain (5 hooks) | `runBenchmark` + `defineStrategy` | `/loops` |
60
+ | Let the system author + evolve its own strategies, gated | `runStrategyEvolution` · `authorStrategy` · `promotionGate` | `/loops` |
61
+ | Run a multi-attempt loop with a custom driver | `runLoop` + `createDriver` | `/loops` |
58
62
  | Delegate a disciplined loop by mode (code, research, ...) | `runDelegatedLoop` or `agent-runtime-loop` | root |
59
63
  | Build code reliably (reviewed, gated) | `createDefaultCoderDelegate` | `/mcp` |
60
64
  | Grow a knowledge base with only grounded facts | `createKbGate` | `/mcp` |
@@ -64,15 +68,50 @@ That is the common case. Everything below is for when one chat turn is not enoug
64
68
  | Mutate surfaces from trace findings | `runAnalystLoop` | `/analyst-loop` |
65
69
  | Persist a run plus its cost ledger | `startRuntimeRun` | root |
66
70
 
71
+ ## The optimization suite
72
+
73
+ The canonical surface. A domain is an `Environment` (five hooks: `open`/`tools`/`call`/`score`/`close`);
74
+ a **strategy** is how a compute budget is spent to beat the domain's own deployable check. Two
75
+ built-ins (`sample` = best-of-N, `refine` = critique-and-continue) plus `defineStrategy` to compose
76
+ your own from two steps — and `authorStrategy`, where the system writes new strategies from its own
77
+ per-task losses:
78
+
79
+ ```ts
80
+ import { defineStrategy, runBenchmark, sample, refine } from '@tangle-network/agent-runtime/loops'
81
+
82
+ const doubleCheck = defineStrategy('double-check', async ({ shot, critique }) => {
83
+ const first = await shot()
84
+ const steer = first ? await critique(first.messages) : null
85
+ const second = steer ? await shot({ messages: first?.messages, steer }) : null
86
+ const score = Math.max(first?.score ?? 0, second?.score ?? 0)
87
+ return { score, resolved: score >= 1, completions: 2, progression: [first?.score ?? 0, score], shots: 2 }
88
+ })
89
+
90
+ const report = await runBenchmark({ environment, tasks, worker, strategies: [sample, refine, doubleCheck], budget: 3 })
91
+ report.perTask // the losses table an author/optimizer consumes
92
+ report.pareto // the (score, $) frontier
93
+ ```
94
+
95
+ The measurement invariants are structural, not advisory: every strategy spends through a conserved
96
+ budget pool (equal compute by construction), the deliverable score is **harness-verified** from the
97
+ shots actually brokered (a body cannot fabricate a win), and the critic is firewalled from the check
98
+ (selector ≠ judge). `runStrategyEvolution` runs the multi-generation search — populations of authored
99
+ candidates, cost-aware champion selection, a phase ledger with resume, and ONE promotion decision via
100
+ `promotionGate` (seeded paired bootstrap) on a holdout slice the search never touched.
101
+ `createVerifierEnvironment` adapts answer-shaped domains (one `check` function); `createMcpEnvironment`
102
+ adapts any MCP server. The consumer surface — loops as a service with a CLI, detached runner, and MCP
103
+ server — lives in the [`loops`](https://github.com/drewstone/loops) repo; the experiment harness and
104
+ evidence ledger live in [`bench/HARNESS.md`](./bench/HARNESS.md).
105
+
67
106
  ## The loop kernel
68
107
 
69
108
  `runLoop` is a topology-agnostic kernel. Each iteration spawns a sandbox on an `AgentRunSpec`, decodes the output, validates it, and asks a driver what to do next. The driver owns topology. The validator owns scoring. The kernel owns iteration accounting, concurrency, cost and token aggregation, and trace emission.
70
109
 
71
110
  ```ts
72
- import { runLoop, createFanoutVoteDriver } from '@tangle-network/agent-runtime/loops'
111
+ import { runLoop, createDriver } from '@tangle-network/agent-runtime/loops'
73
112
 
74
113
  const result = await runLoop({
75
- driver: createFanoutVoteDriver({ n: 3 }), // 3 parallel attempts, pick the best valid one
114
+ driver: createDriver({ planner }), // the planner emits one TopologyMove per round
76
115
  agentRuns: [claudeSpec, codexSpec, glmSpec], // heterogeneous: one harness per branch
77
116
  output, // events to typed Output
78
117
  validator, // Output to { valid, score }
@@ -82,9 +121,13 @@ const result = await runLoop({
82
121
  result.winner // highest-scoring valid attempt
83
122
  ```
84
123
 
85
- Shipped drivers (`/loops/drivers`): `createRefineDriver` (single task, iterate until valid), `createFanoutVoteDriver` (N parallel, vote), and `createDriver` (the agent authors the topology at runtime). The dynamic driver emits one `TopologyMove` per round (`refine`, `fanout`, or `stop`) from an injected planner; a malformed move throws `PlannerError`, so the loop never runs a topology nobody chose. Topology is orthogonal to harness: the planner never names a backend, and the kernel's `agentRuns` decide which harness runs each branch.
86
-
87
- `runProgram` (also in `/loops`) is the recursive op-set (`sample`, `steer`, `fork`, `parallel`, `select`, `seq`, `stop`) plus a tree executor, for programs that compose sub-loops.
124
+ `createDriver` lets a planner author the topology at runtime: one `TopologyMove` per round
125
+ (`refine`, `fanout`, `select`, or `stop`); a malformed move throws `PlannerError`, so the loop never
126
+ runs a topology nobody chose. Topology is orthogonal to harness: the planner never names a backend,
127
+ and the kernel's `agentRuns` decide which harness runs each branch. For fixed shapes, write a small
128
+ inline `Driver` (see `examples/coder-loop`) or use the `personify` combinators (`fanout`, `loopUntil`,
129
+ `panel`, `pipeline`) over the recursive `Scope`/`Supervisor` core — the newer canonical path for
130
+ recursive work.
88
131
 
89
132
  ## Self-improvement
90
133
 
@@ -106,7 +149,12 @@ const result = await selfImprove({
106
149
  // result.winner.surface is the safe one — the baseline unless gateDecision === 'ship'
107
150
  ```
108
151
 
109
- agent-runtime contributes the runtime-specific piece: the **CODE-surface `improvementDriver`** (`/improvement`) — a git-worktree mutator you pass to `selfImprove` as `driver` to optimize code instead of a string.
152
+ agent-runtime contributes the runtime-specific pieces: the **CODE-surface `improvementDriver`**
153
+ (`/improvement`) — a git-worktree mutator you pass to `selfImprove` as `driver` to optimize code
154
+ instead of a string — and **`runStrategyEvolution`** (`/loops`), the multi-generation search over
155
+ STRATEGY space: the system reads its own per-task losses, authors candidate strategies as code,
156
+ plays them against the incumbent at equal budget, and a seeded statistical gate decides promotion
157
+ on a never-touched holdout slice.
110
158
 
111
159
  `runAnalystLoop` (`/analyst-loop`) mines real run traces into findings; `createAnalystDriverHook` feeds those findings to a dynamic-driver planner via `PlannerContext.analyses`, with a firewall (`assertTraceDerivedFindings`) that rejects any finding derived from a judge verdict. Production intake — turning real run traces into the corpus `selfImprove` optimizes against — is agent-eval's `analyzeRuns` / `partitionRunsByAuthoringModel` (`/contract`).
112
160
 
@@ -156,9 +204,15 @@ const server = createMcpServer({ coderDelegate: createDefaultCoderDelegate({ san
156
204
 
157
205
  Or mount the `agent-runtime-mcp` stdio bin on a production `AgentProfile.mcp`.
158
206
 
207
+ Delegation state is in-memory by default — a server restart drops pending delegations and history. Set `AGENT_RUNTIME_DELEGATION_STATE_FILE=/path/state.json` on the bin (or construct via `DelegationTaskQueue.restore({ store: new FileDelegationStore({ filePath }) })`) to persist records across restarts: `delegation_status`/`delegation_history` keep answering for prior runs, idempotency keys dedupe resubmissions, and in-flight records either resume through the `resumeDelegate` seam (when submitted with a `detachedSessionRef`) or settle as failed with an explicit driver-restart error. A corrupt state file refuses to load (`DelegationStateCorruptError`); `AGENT_RUNTIME_DELEGATION_STATE_RECOVER=1` archives it and starts empty. `AGENT_RUNTIME_DELEGATION_RETAIN_TERMINAL=<n>` caps retained terminal records.
208
+
159
209
  ## The experiment harness (bench/)
160
210
 
161
- `bench/` is the internal harness that asks the binding empirical question: does any non-blind topology beat blind compute at equal k, under a deployable (non-oracle) selector, on a real benchmark? It runs through the same kernel, not a reimplementation.
211
+ `bench/` is the internal harness; [`bench/HARNESS.md`](./bench/HARNESS.md) is its map — read that
212
+ first. The canonical path is the optimization suite (`runBenchmark`/`flywheel-evolve` over real
213
+ domains: the EnterpriseOps gym, commit0, answer-shaped math); the older selection-gate paths
214
+ (`runExperiment`, corpus-replay) remain for the legacy evidence. The live evidence ledger is
215
+ `.evolve/current.json` — results never live in this README.
162
216
 
163
217
  One entrypoint, `runExperiment(adapter, { sandboxClient, agentRun, arms, ... })`: N instances times a set of arms, each arm a topology driven through `runLoop`, judged by the adapter, written to a durable canonical corpus. An arm is one steer function `f(rootPrompt, history) => nextPrompt`: `random` ignores history (the compute control), `refine` carries the prior answer plus a directive, `diverse` rotates a strategy lens. The cost dial is the backend type (`hermes` for a direct router call, `opencode` or `claude-code` or `codex` for agent CLIs). The deep statistics (paired bootstrap with Benjamini-Hochberg correction, selector replay) come from `corpus-report.mts` and `corpus-replay.mts` over the written corpus, computed once. See `bench/HARNESS.md` and `docs/learning-flywheel.md`.
164
218
 
@@ -170,8 +224,9 @@ One entrypoint, `runExperiment(adapter, { sandboxClient, agentRun, arms, ... })`
170
224
  | Backend provider | `openai-compat` when `TANGLE_API_KEY`, else `openai` if `OPENAI_API_KEY` | `MODEL_PROVIDER` env |
171
225
  | Router base URL | `https://router.tangle.tools/v1` | `TANGLE_ROUTER_BASE_URL` env |
172
226
  | Sandbox base URL | `https://sandbox.tangle.tools` | `SANDBOX_API_URL` env |
173
- | Loop iteration cap | 10 (`runLoop`), 8 (dynamic driver) | `runLoop({ maxIterations })` |
174
- | Driver | none, required by `runLoop` | `createRefineDriver`, `createFanoutVoteDriver`, `createDriver` |
227
+ | Loop iteration cap | 10 (`runLoop`) | `runLoop({ maxIterations })` |
228
+ | Driver | none, required by `runLoop` | `createDriver` or an inline `Driver` |
229
+ | Strategy budget (suite) | 3 rollouts/shots per strategy per task | `runBenchmark({ budget })` |
175
230
  | Winner selection (coder delegate) | `highest-score` | `winnerSelection` option |
176
231
  | KB gate min passage | 12 chars | `createKbGate({ minPassageChars })` |
177
232
  | `selfImprove` gate | held-out gate (default) | pass `gate: defaultProductionGate` for red-team hardening |
@@ -202,18 +257,27 @@ sandbox AgentProfile, Sandbox.create, streamPrompt, exportTraceBundle. T
202
257
  |---|---|
203
258
  | `@tangle-network/agent-runtime` | chat turns, delegated loop-runner, OTEL export, errors, model resolution |
204
259
  | `.../agent` | `defineAgent` plus surface and outcome adapters |
205
- | `.../loops` | the `runLoop` kernel, the `refine` / `fanout-vote` / `dynamic` drivers, `runProgram`, `loopDispatch` |
260
+ | `.../loops` | **the optimization suite** (`Environment`, `defineStrategy`, `runBenchmark`, `runStrategyEvolution`, `authorStrategy`, `promotionGate`) + the `runLoop` kernel, `createDriver`, `loopDispatch` |
206
261
  | `.../profiles` | `coderProfile`, `researcherProfile` presets |
207
262
  | `.../mcp` | `createMcpServer`, `createDefaultCoderDelegate`, `createKbGate`, the `agent-runtime-mcp` bin |
208
263
  | `.../improvement` | `improvementDriver` (code/worktree `CandidateGenerator`), `agenticGenerator`, `reflectiveGenerator` — the code-surface driver you pass to agent-eval's `selfImprove` |
209
264
  | `.../analyst-loop` | `runAnalystLoop`, the analyst registry driver |
210
265
  | `.../platform` | cross-site SSO and the integrations hub |
266
+ | `.../runtime` | the recursive core by its own name (same module as `/loops`) |
267
+ | `.../topology` | the live agent-tree viewer (folds spawn/settle events into a renderable tree) |
268
+ | `.../workflow` · `.../audit` | workflow orchestration helpers · audit utilities |
211
269
 
212
270
  Bins: `agent-runtime-mcp` (delegation MCP server), `agent-runtime-loop` (schedulable delegated loop-runner).
213
271
 
214
- ## Adoption skill
272
+ ## Teaching an agent to build on this
215
273
 
216
- This package ships a self-contained adoption skill at [`skills/agent-runtime-adoption/SKILL.md`](./skills/agent-runtime-adoption/SKILL.md): driven loops, topology drivers, the `loopDispatch` campaign bridge, MCP delegation, and the code-surface `improvementDriver` for agent-eval's `selfImprove`. It needs only this package plus `@tangle-network/agent-eval`. For the full self-improving pipeline (trace sink, analyst loop, scorecard, production loop, CI), see the `agent-eval-adoption` and `agent-stack-adoption` skills.
274
+ Two agent-consumable skills live in the [`loops`](https://github.com/drewstone/loops) repo:
275
+ **`skills/loop-builder`** (domain → `Environment` → loop → gate → operator surface, with the
276
+ measured foot-gun list) and **`skills/loop-author`** (authoring a strategy body from losses;
277
+ read the contract with `loops contract`). The runnable on-ramp is [`examples/`](./examples/README.md)
278
+ — a learning progression from the production chat turn through the strategy suite to the recursive
279
+ supervisor. For the broader pipeline (trace sink, analyst loop, scorecard, CI), see the
280
+ `agent-eval-adoption` and `agent-stack-adoption` skills.
217
281
 
218
282
  ## Stability, tests, docs
219
283
 
package/dist/agent.js CHANGED
@@ -3,7 +3,7 @@ import {
3
3
  } from "./chunk-7JITYN6T.js";
4
4
  import {
5
5
  createSandboxForSpec
6
- } from "./chunk-72JQCHOZ.js";
6
+ } from "./chunk-PXUTIMGJ.js";
7
7
  import {
8
8
  mapSandboxEvent
9
9
  } from "./chunk-GSUO5QS6.js";
@@ -0,0 +1,433 @@
1
+ import {
2
+ coderProfile,
3
+ multiHarnessCoderFanout
4
+ } from "./chunk-KADIJAD4.js";
5
+ import {
6
+ createSandboxForSpec,
7
+ deleteBoxSafe,
8
+ runLoop,
9
+ sleep,
10
+ throwAbort,
11
+ throwIfAborted
12
+ } from "./chunk-PXUTIMGJ.js";
13
+ import {
14
+ ValidationError
15
+ } from "./chunk-GSUO5QS6.js";
16
+
17
+ // src/mcp/detached-turn.ts
18
+ var DEFAULT_TICK_INTERVAL_MS = 5e3;
19
+ function formatDetachedSessionRef(parts) {
20
+ assertRefComponent("sessionId", parts.sessionId);
21
+ if (parts.sandboxId === void 0) return `session=${parts.sessionId}`;
22
+ assertRefComponent("sandboxId", parts.sandboxId);
23
+ return `sandbox=${parts.sandboxId};session=${parts.sessionId}`;
24
+ }
25
+ function parseDetachedSessionRef(raw) {
26
+ const fields = /* @__PURE__ */ new Map();
27
+ for (const pair of raw.split(";")) {
28
+ const eq = pair.indexOf("=");
29
+ const key = eq === -1 ? "" : pair.slice(0, eq);
30
+ const value = eq === -1 ? "" : pair.slice(eq + 1);
31
+ if (key !== "session" && key !== "sandbox" || value.length === 0 || fields.has(key)) {
32
+ throw new ValidationError(
33
+ `parseDetachedSessionRef: malformed detachedSessionRef ${JSON.stringify(raw)} \u2014 expected "session=<id>" or "sandbox=<id>;session=<id>"`
34
+ );
35
+ }
36
+ fields.set(key, value);
37
+ }
38
+ const sessionId = fields.get("session");
39
+ if (!sessionId) {
40
+ throw new ValidationError(
41
+ `parseDetachedSessionRef: detachedSessionRef ${JSON.stringify(raw)} carries no session id`
42
+ );
43
+ }
44
+ const sandboxId = fields.get("sandbox");
45
+ return { sessionId, ...sandboxId !== void 0 ? { sandboxId } : {} };
46
+ }
47
+ function assertRefComponent(name, value) {
48
+ if (value.length === 0 || value.includes(";") || value.includes("=")) {
49
+ throw new ValidationError(
50
+ `formatDetachedSessionRef: ${name} ${JSON.stringify(value)} must be non-empty and free of ";" / "="`
51
+ );
52
+ }
53
+ }
54
+ function detachedTurnEvents(sessionId, turn) {
55
+ return [
56
+ {
57
+ type: "result",
58
+ id: sessionId,
59
+ data: {
60
+ text: turn.text,
61
+ finalText: turn.text,
62
+ success: true,
63
+ result: turn.result
64
+ }
65
+ }
66
+ ];
67
+ }
68
+ async function runDetachedTurn(options) {
69
+ const intervalMs = options.tickIntervalMs ?? DEFAULT_TICK_INTERVAL_MS;
70
+ const box = await createSandboxForSpec(options.client, options.spec, options.signal);
71
+ const drive = box;
72
+ const onAbort = () => {
73
+ void drive._sessionCancel?.(options.sessionId).catch(() => {
74
+ });
75
+ };
76
+ try {
77
+ if (typeof drive.driveTurn !== "function") {
78
+ throw new ValidationError(
79
+ "runDetachedTurn: the acquired sandbox exposes no driveTurn(message, { sessionId }) \u2014 detached dispatch requires @tangle-network/sandbox >= 0.6 and a session-backed placement (sibling/fleet); disable detached dispatch for this executor."
80
+ );
81
+ }
82
+ const sandboxId = box.id;
83
+ if (typeof sandboxId !== "string" || sandboxId.length === 0) {
84
+ throw new ValidationError(
85
+ "runDetachedTurn: the acquired sandbox carries no id \u2014 without it the detached run cannot be resumed after a restart, so refusing to dispatch detached."
86
+ );
87
+ }
88
+ options.bindSandbox(sandboxId);
89
+ options.signal.addEventListener("abort", onAbort, { once: true });
90
+ for (; ; ) {
91
+ throwIfAborted(options.signal);
92
+ const tick = await drive.driveTurn(options.prompt, {
93
+ sessionId: options.sessionId,
94
+ turnId: options.sessionId,
95
+ ...options.wallCapMs !== void 0 ? { wallCapMs: options.wallCapMs } : {}
96
+ });
97
+ throwIfAborted(options.signal);
98
+ if (tick.state === "completed") return { text: tick.text, result: tick.result };
99
+ if (tick.state === "failed") {
100
+ throw new Error(`detached turn ${options.sessionId} failed: ${tick.error}`);
101
+ }
102
+ options.report({ iteration: 0, phase: detachedRunningPhase(tick.elapsedMs) });
103
+ await sleep(intervalMs, options.signal);
104
+ }
105
+ } finally {
106
+ options.signal.removeEventListener("abort", onAbort);
107
+ if (options.signal.aborted) onAbort();
108
+ await deleteBoxSafe(box);
109
+ }
110
+ }
111
+ function detachedRunningPhase(elapsedMs) {
112
+ return elapsedMs === void 0 ? "detached-running" : `detached-running ${Math.round(elapsedMs / 1e3)}s`;
113
+ }
114
+ function createDriveTurnResumeDriver(options) {
115
+ const cancelHooked = /* @__PURE__ */ new Set();
116
+ return {
117
+ intervalMs: options.intervalMs ?? DEFAULT_TICK_INTERVAL_MS,
118
+ async tick({ record, detachedSessionRef }, ctx) {
119
+ const ref = parseDetachedSessionRef(detachedSessionRef);
120
+ if (ref.sandboxId === void 0) {
121
+ return {
122
+ state: "failed",
123
+ error: {
124
+ message: `detached session "${ref.sessionId}" was never bound to a sandbox \u2014 the previous process died before the box was acquired, so the turn was never dispatched and cannot be resumed`,
125
+ kind: "DetachedSessionUnboundError"
126
+ }
127
+ };
128
+ }
129
+ const box = await options.resolveSandbox(ref.sandboxId);
130
+ if (!cancelHooked.has(record.taskId)) {
131
+ cancelHooked.add(record.taskId);
132
+ ctx.signal.addEventListener(
133
+ "abort",
134
+ () => {
135
+ void box._sessionCancel?.(ref.sessionId).catch(() => {
136
+ });
137
+ },
138
+ { once: true }
139
+ );
140
+ }
141
+ if (ctx.signal.aborted) throwAbort();
142
+ const tick = await box.driveTurn(options.buildMessage(record), {
143
+ sessionId: ref.sessionId,
144
+ turnId: ref.sessionId,
145
+ ...options.wallCapMs !== void 0 ? { wallCapMs: options.wallCapMs } : {}
146
+ });
147
+ if (tick.state === "completed") {
148
+ const output = await options.settleOutput(
149
+ { text: tick.text, result: tick.result },
150
+ record,
151
+ {
152
+ signal: ctx.signal
153
+ }
154
+ );
155
+ return { state: "completed", output };
156
+ }
157
+ if (tick.state === "failed") {
158
+ return {
159
+ state: "failed",
160
+ error: {
161
+ message: `detached turn ${ref.sessionId} failed: ${tick.error}`,
162
+ kind: "DetachedTurnFailedError"
163
+ }
164
+ };
165
+ }
166
+ ctx.report({ iteration: 0, phase: detachedRunningPhase(tick.elapsedMs) });
167
+ return { state: "running" };
168
+ }
169
+ };
170
+ }
171
+
172
+ // src/mcp/executor.ts
173
+ function createSiblingSandboxExecutor(options) {
174
+ const underlying = options.client;
175
+ const client = {
176
+ create(opts) {
177
+ return underlying.create(opts);
178
+ },
179
+ describePlacement(box) {
180
+ return { kind: "sibling", sandboxId: readId(box) };
181
+ }
182
+ };
183
+ return {
184
+ client,
185
+ placement: "sibling",
186
+ describe() {
187
+ return "sibling-sandbox (each delegation = fresh sandbox via client.create)";
188
+ }
189
+ };
190
+ }
191
+ function createFleetWorkspaceExecutor(options) {
192
+ const fleet = options.fleet;
193
+ const exclude = new Set(options.excludeMachineIds ?? []);
194
+ let callIndex = 0;
195
+ const placementBySandboxId = /* @__PURE__ */ new Map();
196
+ const client = {
197
+ async create() {
198
+ const ids = fleet.ids.filter((id) => !exclude.has(id));
199
+ if (ids.length === 0) {
200
+ throw new Error(
201
+ `agent-runtime: fleet ${fleet.fleetId} has no eligible worker machines (ids=[${fleet.ids.join(",")}], excluded=[${[...exclude].join(",")}])`
202
+ );
203
+ }
204
+ const selector = options.selectMachine;
205
+ const machineId = selector ? selector({ callIndex, ids }) : ids[callIndex % ids.length];
206
+ callIndex += 1;
207
+ if (typeof machineId !== "string" || machineId.length === 0) {
208
+ throw new Error("agent-runtime: fleet executor selectMachine returned an empty machine id");
209
+ }
210
+ const box = await fleet.sandbox(machineId);
211
+ const sandboxId = readId(box);
212
+ if (sandboxId) placementBySandboxId.set(sandboxId, { machineId });
213
+ return box;
214
+ },
215
+ describePlacement(box) {
216
+ const sandboxId = readId(box);
217
+ const recorded = sandboxId ? placementBySandboxId.get(sandboxId) : void 0;
218
+ return {
219
+ kind: "fleet",
220
+ sandboxId,
221
+ fleetId: fleet.fleetId,
222
+ machineId: recorded?.machineId
223
+ };
224
+ }
225
+ };
226
+ return {
227
+ client,
228
+ placement: "fleet",
229
+ describe() {
230
+ const excluded = exclude.size > 0 ? ` (excluded=[${[...exclude].join(",")}])` : "";
231
+ return `fleet-workspace (fleetId=${fleet.fleetId}, machines=[${fleet.ids.join(",")}]${excluded})`;
232
+ }
233
+ };
234
+ }
235
+ function readId(box) {
236
+ const raw = box.id;
237
+ return typeof raw === "string" && raw.length > 0 ? raw : void 0;
238
+ }
239
+
240
+ // src/mcp/delegates.ts
241
+ function createDefaultCoderDelegate(options) {
242
+ const executor = resolveExecutor(options);
243
+ const sandboxClient = executor.client;
244
+ const fanoutHarnesses = options.fanoutHarnesses;
245
+ const maxConcurrency = options.maxConcurrency ?? 4;
246
+ const traceEmitter = options.traceEmitter;
247
+ return async (args, ctx) => {
248
+ const task = coderTaskFromArgs(args);
249
+ const variants = Math.max(1, Math.trunc(args.variants ?? 1));
250
+ ctx.report({ iteration: 0, phase: "starting" });
251
+ if (variants <= 1) {
252
+ const { agentRunSpec, output, validator } = coderProfile({
253
+ task,
254
+ ...options.harness ? { harness: options.harness } : {},
255
+ ...options.model ? { model: options.model } : {}
256
+ });
257
+ if (ctx.detachedSessionRef !== void 0 && ctx.updateDetachedSessionRef) {
258
+ const { sessionId } = parseDetachedSessionRef(ctx.detachedSessionRef);
259
+ const rebind = ctx.updateDetachedSessionRef;
260
+ const turn = await runDetachedTurn({
261
+ client: sandboxClient,
262
+ spec: agentRunSpec,
263
+ prompt: agentRunSpec.taskToPrompt(task),
264
+ sessionId,
265
+ bindSandbox: (sandboxId) => rebind(formatDetachedSessionRef({ sandboxId, sessionId })),
266
+ signal: ctx.signal,
267
+ report: ctx.report,
268
+ ...options.detachedTickIntervalMs !== void 0 ? { tickIntervalMs: options.detachedTickIntervalMs } : {},
269
+ ...options.detachedWallCapMs !== void 0 ? { wallCapMs: options.detachedWallCapMs } : {}
270
+ });
271
+ const chosen3 = await settleDetachedCoderTurn(turn, {
272
+ task,
273
+ sessionId,
274
+ signal: ctx.signal,
275
+ ...options.harness ? { harness: options.harness } : {},
276
+ ...options.model ? { model: options.model } : {},
277
+ ...options.reviewer ? { reviewer: options.reviewer } : {}
278
+ });
279
+ ctx.report({ iteration: 1, phase: "completed" });
280
+ return chosen3;
281
+ }
282
+ const result2 = await runLoop({
283
+ driver: singleShotDriver,
284
+ agentRun: agentRunSpec,
285
+ output,
286
+ validator,
287
+ task,
288
+ ctx: { sandboxClient, signal: ctx.signal, ...traceEmitter ? { traceEmitter } : {} },
289
+ maxIterations: 1,
290
+ maxConcurrency
291
+ });
292
+ const chosen2 = await pickCoderWinner({
293
+ iterations: result2.iterations,
294
+ reviewer: options.reviewer,
295
+ selection: options.winnerSelection ?? "highest-score",
296
+ task,
297
+ signal: ctx.signal
298
+ });
299
+ if (!chosen2) throw new Error(noWinnerMessage(options.reviewer));
300
+ ctx.report({ iteration: 1, phase: "completed" });
301
+ return chosen2;
302
+ }
303
+ const fanout = multiHarnessCoderFanout({
304
+ ...fanoutHarnesses && fanoutHarnesses.length > 0 ? { harnesses: fanoutHarnesses.slice(0, variants) } : {},
305
+ ...options.fanoutModels ? { models: options.fanoutModels.slice(0, variants) } : {}
306
+ });
307
+ const agentRuns = fanout.agentRuns.slice(0, variants);
308
+ const result = await runLoop({
309
+ driver: fanout.driver,
310
+ agentRuns,
311
+ output: fanout.output,
312
+ validator: fanout.validator,
313
+ task,
314
+ ctx: { sandboxClient, signal: ctx.signal, ...traceEmitter ? { traceEmitter } : {} },
315
+ maxIterations: variants,
316
+ maxConcurrency: Math.min(maxConcurrency, variants)
317
+ });
318
+ const chosen = await pickCoderWinner({
319
+ iterations: result.iterations,
320
+ reviewer: options.reviewer,
321
+ selection: options.winnerSelection ?? "highest-score",
322
+ task,
323
+ signal: ctx.signal
324
+ });
325
+ if (!chosen) throw new Error(noWinnerMessage(options.reviewer));
326
+ ctx.report({ iteration: agentRuns.length, phase: "completed" });
327
+ return chosen;
328
+ };
329
+ }
330
+ async function pickCoderWinner(args) {
331
+ const valid = [];
332
+ for (const iter of args.iterations) {
333
+ if (iter.output === void 0 || iter.error || iter.verdict?.valid !== true) continue;
334
+ valid.push({
335
+ index: iter.index,
336
+ output: iter.output,
337
+ score: iter.verdict.score ?? 0,
338
+ readiness: iter.verdict.score ?? 0
339
+ });
340
+ }
341
+ if (valid.length === 0) return void 0;
342
+ let eligible = valid;
343
+ if (args.reviewer) {
344
+ eligible = [];
345
+ for (const c of valid) {
346
+ const review = await args.reviewer(c.output, args.task, { signal: args.signal });
347
+ if (review.approved) eligible.push({ ...c, readiness: review.readiness });
348
+ }
349
+ if (eligible.length === 0) return void 0;
350
+ }
351
+ return selectCoderCandidate(eligible, args.selection).output;
352
+ }
353
+ function selectCoderCandidate(candidates, selection) {
354
+ const diffLines = (c) => c.output.diffStats.insertions + c.output.diffStats.deletions;
355
+ const sorted = [...candidates].sort((a, b) => {
356
+ switch (selection) {
357
+ case "smallest-diff":
358
+ return diffLines(a) - diffLines(b) || a.index - b.index;
359
+ case "highest-readiness":
360
+ return b.readiness - a.readiness || a.index - b.index;
361
+ case "first-approved":
362
+ return a.index - b.index;
363
+ default:
364
+ return b.score - a.score || a.index - b.index;
365
+ }
366
+ });
367
+ return sorted[0];
368
+ }
369
+ function noWinnerMessage(reviewer) {
370
+ return reviewer ? "coder delegate: no candidate passed validation + review" : "coder delegate: no candidate passed validation";
371
+ }
372
+ function coderTaskFromArgs(args) {
373
+ return {
374
+ goal: buildCoderGoal(args),
375
+ repoRoot: args.repoRoot,
376
+ testCmd: args.config?.testCmd,
377
+ typecheckCmd: args.config?.typecheckCmd,
378
+ forbiddenPaths: args.config?.forbiddenPaths,
379
+ maxDiffLines: args.config?.maxDiffLines
380
+ };
381
+ }
382
+ async function settleDetachedCoderTurn(turn, options) {
383
+ const { output, validator } = coderProfile({
384
+ task: options.task,
385
+ ...options.harness ? { harness: options.harness } : {},
386
+ ...options.model ? { model: options.model } : {}
387
+ });
388
+ const parsed = output.parse(detachedTurnEvents(options.sessionId, turn));
389
+ const verdict = await validator.validate(parsed, { iteration: 0, signal: options.signal });
390
+ if (verdict.valid !== true) throw new Error(noWinnerMessage(options.reviewer));
391
+ if (options.reviewer) {
392
+ const review = await options.reviewer(parsed, options.task, { signal: options.signal });
393
+ if (!review.approved) throw new Error(noWinnerMessage(options.reviewer));
394
+ }
395
+ return parsed;
396
+ }
397
+ function buildCoderGoal(args) {
398
+ if (!args.contextHint) return args.goal;
399
+ return [args.goal, "", "## Context", args.contextHint].join("\n");
400
+ }
401
+ function resolveExecutor(options) {
402
+ if (options.executor && options.sandboxClient) {
403
+ throw new Error("createDefaultCoderDelegate: pass exactly one of `executor` or `sandboxClient`");
404
+ }
405
+ if (options.executor) return options.executor;
406
+ if (options.sandboxClient) {
407
+ return createSiblingSandboxExecutor({ client: options.sandboxClient });
408
+ }
409
+ throw new Error("createDefaultCoderDelegate: `executor` or `sandboxClient` is required");
410
+ }
411
+ var singleShotDriver = {
412
+ name: "mcp-single-shot",
413
+ async plan(task, history) {
414
+ return history.length === 0 ? [task] : [];
415
+ },
416
+ decide(history) {
417
+ return history.length > 0 ? "pick-winner" : "fail";
418
+ }
419
+ };
420
+
421
+ export {
422
+ formatDetachedSessionRef,
423
+ parseDetachedSessionRef,
424
+ detachedTurnEvents,
425
+ runDetachedTurn,
426
+ createDriveTurnResumeDriver,
427
+ createSiblingSandboxExecutor,
428
+ createFleetWorkspaceExecutor,
429
+ createDefaultCoderDelegate,
430
+ coderTaskFromArgs,
431
+ settleDetachedCoderTurn
432
+ };
433
+ //# sourceMappingURL=chunk-GHX7XOJ2.js.map