@tangle-network/agent-runtime 0.43.0 → 0.45.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/README.md +96 -202
  2. package/dist/agent.d.ts +5 -4
  3. package/dist/agent.js +5 -7
  4. package/dist/agent.js.map +1 -1
  5. package/dist/analyst-loop.d.ts +65 -4
  6. package/dist/analyst-loop.js +6 -1
  7. package/dist/audit.d.ts +93 -0
  8. package/dist/audit.js +312 -0
  9. package/dist/audit.js.map +1 -0
  10. package/dist/chunk-4B6U4CVQ.js +15 -0
  11. package/dist/chunk-4B6U4CVQ.js.map +1 -0
  12. package/dist/chunk-FK53TXOP.js +603 -0
  13. package/dist/chunk-FK53TXOP.js.map +1 -0
  14. package/dist/{chunk-MJDGCRAT.js → chunk-IJ6FGOPO.js} +5 -5
  15. package/dist/chunk-IJ6FGOPO.js.map +1 -0
  16. package/dist/{chunk-HVYOHJHK.js → chunk-IJGS6J7X.js} +2 -2
  17. package/dist/chunk-IJGS6J7X.js.map +1 -0
  18. package/dist/chunk-KEWO4KI6.js +3599 -0
  19. package/dist/chunk-KEWO4KI6.js.map +1 -0
  20. package/dist/{chunk-NRZOXCJK.js → chunk-KSMX62JF.js} +2 -2
  21. package/dist/{chunk-C5HMTTNY.js → chunk-NYN5RTLP.js} +13 -12
  22. package/dist/chunk-NYN5RTLP.js.map +1 -0
  23. package/dist/chunk-PRX45WE2.js +264 -0
  24. package/dist/chunk-PRX45WE2.js.map +1 -0
  25. package/dist/{chunk-3HMHSN22.js → chunk-QR4UUC5P.js} +6 -6
  26. package/dist/chunk-QR4UUC5P.js.map +1 -0
  27. package/dist/chunk-WIR4HOOJ.js +27 -0
  28. package/dist/chunk-WIR4HOOJ.js.map +1 -0
  29. package/dist/{chunk-MNCB4SJ5.js → chunk-Z2QXVBA6.js} +296 -8
  30. package/dist/chunk-Z2QXVBA6.js.map +1 -0
  31. package/dist/coder-CczgMqFx.d.ts +114 -0
  32. package/dist/dynamic-BvllHV6M.d.ts +221 -0
  33. package/dist/{improvement-adapter-BC4HhuAR.d.ts → improvement-adapter-CWegd3vw.d.ts} +1 -1
  34. package/dist/improvement.d.ts +2 -3
  35. package/dist/improvement.js +0 -5
  36. package/dist/improvement.js.map +1 -1
  37. package/dist/index.d.ts +123 -10
  38. package/dist/index.js +407 -19
  39. package/dist/index.js.map +1 -1
  40. package/dist/{kb-gate-DTBum3vH.d.ts → kb-gate-D9GBocLN.d.ts} +82 -5
  41. package/dist/{loop-runner-bin-CVoCBmYk.d.ts → loop-runner-bin-CPrCoKqC.d.ts} +14 -10
  42. package/dist/loop-runner-bin.d.ts +9 -7
  43. package/dist/loop-runner-bin.js +6 -8
  44. package/dist/loops.d.ts +7 -371
  45. package/dist/loops.js +96 -19
  46. package/dist/mcp/bin.js +7 -7
  47. package/dist/mcp/bin.js.map +1 -1
  48. package/dist/mcp/index.d.ts +284 -11
  49. package/dist/mcp/index.js +341 -9
  50. package/dist/mcp/index.js.map +1 -1
  51. package/dist/{otel-export-BzvF1Ela.d.ts → otel-export-Dy2DyUCU.d.ts} +1 -1
  52. package/dist/profiles.d.ts +385 -86
  53. package/dist/profiles.js +549 -4
  54. package/dist/profiles.js.map +1 -1
  55. package/dist/run-loop--hSoIknW.d.ts +112 -0
  56. package/dist/runtime-hooks-C7JwKb9E.d.ts +70 -0
  57. package/dist/runtime.d.ts +1860 -0
  58. package/dist/runtime.js +114 -0
  59. package/dist/runtime.js.map +1 -0
  60. package/dist/substrate-CUgk7F7s.d.ts +77 -0
  61. package/dist/topology.d.ts +73 -0
  62. package/dist/topology.js +111 -0
  63. package/dist/topology.js.map +1 -0
  64. package/dist/types-1HbsFa7H.d.ts +438 -0
  65. package/dist/{types-p8dWBIXL.d.ts → types-BtRLF2U3.d.ts} +1 -1
  66. package/dist/{types-Bcp071Jg.d.ts → types-DdzkffAm.d.ts} +95 -1
  67. package/dist/workflow.d.ts +551 -0
  68. package/dist/workflow.js +1778 -0
  69. package/dist/workflow.js.map +1 -0
  70. package/package.json +53 -16
  71. package/skills/agent-runtime-adoption/SKILL.md +29 -26
  72. package/dist/chunk-3HMHSN22.js.map +0 -1
  73. package/dist/chunk-C5HMTTNY.js.map +0 -1
  74. package/dist/chunk-EKBSQYZE.js +0 -813
  75. package/dist/chunk-EKBSQYZE.js.map +0 -1
  76. package/dist/chunk-HVYOHJHK.js.map +0 -1
  77. package/dist/chunk-MJDGCRAT.js.map +0 -1
  78. package/dist/chunk-MNCB4SJ5.js.map +0 -1
  79. package/dist/chunk-PY6NMZYX.js +0 -52
  80. package/dist/chunk-PY6NMZYX.js.map +0 -1
  81. package/dist/chunk-SQSCRJ7U.js +0 -65
  82. package/dist/chunk-SQSCRJ7U.js.map +0 -1
  83. package/dist/chunk-VOX6Z3II.js +0 -90
  84. package/dist/chunk-VOX6Z3II.js.map +0 -1
  85. package/dist/chunk-XBUG326M.js +0 -261
  86. package/dist/chunk-XBUG326M.js.map +0 -1
  87. package/dist/dynamic-B_7GgCwu.d.ts +0 -108
  88. package/dist/optimize-prompt-D-urF2wW.d.ts +0 -129
  89. /package/dist/{chunk-NRZOXCJK.js.map → chunk-KSMX62JF.js.map} +0 -0
package/README.md CHANGED
@@ -1,38 +1,25 @@
1
1
  # @tangle-network/agent-runtime
2
2
 
3
- The task-lifecycle substrate for domain agents. It owns the **chat-turn engine**, the **driven-loop kernel** (refine / fanout-vote / agent-authored *dynamic* topologies), **delegated loops** (build-in-a-loop, valid-only research, review, audit, self-improve), **identity-gated prompt optimization**, **OpenTelemetry GenAI tracing**, knowledge readiness, sanitized telemetry, and the declarative `defineAgent` manifest — and delegates domain behavior (models, tools, KB) to adapters. Long-running execution durability lives in [`@tangle-network/sandbox`](https://www.npmjs.com/package/@tangle-network/sandbox); evals + gates in [`@tangle-network/agent-eval`](https://www.npmjs.com/package/@tangle-network/agent-eval).
3
+ The shared task-lifecycle skeleton for agents. It runs an agent (a chat turn, a one-shot task, or a multi-attempt loop), captures every run as a trace, and feeds those traces into eval-gated self-improvement.
4
+
5
+ It owns the lifecycle and the loop kernel. It delegates domain behavior (models, tools, knowledge) to adapters, scoring and the ship gate to [`@tangle-network/agent-eval`](https://www.npmjs.com/package/@tangle-network/agent-eval), and sandboxed long-running execution to [`@tangle-network/sandbox`](https://www.npmjs.com/package/@tangle-network/sandbox).
4
6
 
5
7
  ```bash
6
8
  pnpm add @tangle-network/agent-runtime @tangle-network/agent-eval @tangle-network/sandbox
7
9
  ```
8
10
 
9
- ---
10
-
11
- ## Contents
12
-
13
- - [Getting started](#getting-started) — the 20-line production chat turn
14
- - [Which entry point do I reach for?](#which-entry-point-do-i-reach-for)
15
- - [Capabilities](#capabilities)
16
- - [1. Chat turns — `handleChatTurn`](#1-chat-turns--handlechatturn)
17
- - [2. Driven loops + topology drivers](#2-driven-loops--topology-drivers)
18
- - [3. Agent-authored topology — `createDynamicDriver`](#3-agent-authored-topology--createdynamicdriver)
19
- - [4. Delegated loop-runner — `runDelegatedLoop`](#4-delegated-loop-runner--rundelegatedloop)
20
- - [5. Reliable build-in-a-loop — the coder delegate](#5-reliable-build-in-a-loop--the-coder-delegate)
21
- - [6. Valid-only research — `createKbGate`](#6-valid-only-research--createkbgate)
22
- - [7. Identity-gated prompt optimization — `optimizePrompt`](#7-identity-gated-prompt-optimization--optimizeprompt)
23
- - [8. OpenTelemetry GenAI topology tracing](#8-opentelemetry-genai-topology-tracing)
24
- - [9. MCP delegation server — `agent-runtime-mcp`](#9-mcp-delegation-server--agent-runtime-mcp)
25
- - [Defaults](#defaults)
26
- - [Composition with the stack](#composition-with-the-stack)
27
- - [Subpath exports](#subpath-exports)
28
- - [Adoption skill](#adoption-skill)
29
- - [Stability · Tests · Docs](#stability--tests--docs)
30
-
31
- ---
11
+ ## The model
12
+
13
+ One recursive `Agent` atom, run at two timescales, over many tasks. `docs/architecture.md` is the canonical spine. The short version:
14
+
15
+ 1. **One atom.** `driver`, `worker`, `selector`, and `coordinator` are not separate types. They are what a single `Agent` returns from `act`. The recursion bottoms out at execution.
16
+ 2. **Two timescales, one machinery.** The same loop runs at inference time (steer a worker over k attempts) and at optimization time (search the steer or the prompt with GEPA, gated on a held-out split).
17
+ 3. **A benchmark is an adapter.** A new task is a loader plus a worker plus a judge. The loop, the drivers, the corpus, and the selector are the shared spine, written once.
18
+ 4. **The selector is not the judge.** At inference time the selector picks which answer to return without seeing the judge's verdict. The judge is write-only. A steer may read the trace but never the verdict (the firewall that keeps the loop from gaming its own score).
32
19
 
33
20
  ## Getting started
34
21
 
35
- Every product agent is a `handleChatTurn` call inside a route. This is what gtm / creative / legal / tax all run in production:
22
+ Every product agent is a `handleChatTurn` call inside a route. This is what the gtm, creative, legal, and tax products run in production:
36
23
 
37
24
  ```ts
38
25
  import { handleChatTurn } from '@tangle-network/agent-runtime'
@@ -57,181 +44,95 @@ export async function POST({ request, env, ctx }: { request: Request; env: Env;
57
44
  }
58
45
  ```
59
46
 
60
- That's the centerpiece. Everything below is *"when one chat turn isn't enough"* — multi-shot loops, delegation, optimization, and the telemetry that makes them auditable.
61
-
62
- ---
47
+ That is the common case. Everything below is for when one chat turn is not enough: multi-attempt loops, delegation, optimization, and the telemetry that makes them auditable.
63
48
 
64
49
  ## Which entry point do I reach for?
65
50
 
66
- | You want to | Reach for | Subpath |
51
+ | You want to | Reach for | Subpath |
67
52
  |---|---|---|
68
- | Run a production chat turn (90% of products) | `handleChatTurn` | root |
69
- | Declare an agent (profile + surfaces + adapters) | `defineAgent` | `/agent` |
70
- | One-shot task with verification + eval | `runAgentTask` | root |
71
- | Multi-shot loop (refine / fanout-vote) | `runLoop` + a driver | `/loops` |
72
- | Let the **agent choose** the loop shape per round | `createDynamicDriver` + `createSandboxPlanner` | `/loops` |
73
- | Delegate a disciplined loop by mode (code/research/…) | `runDelegatedLoop` / `agent-runtime-loop` | root |
53
+ | Run a production chat turn (most products) | `handleChatTurn` | root |
54
+ | Declare an agent (profile, surfaces, adapters) | `defineAgent` | `/agent` |
55
+ | Run a one-shot task with verification and eval | `runAgentTask` | root |
56
+ | Run a multi-attempt loop (refine or fanout-vote) | `runLoop` plus a driver | `/loops` |
57
+ | Let the agent choose the loop shape per round | `createDynamicDriver` plus `createSandboxPlanner` | `/loops` |
58
+ | Delegate a disciplined loop by mode (code, research, ...) | `runDelegatedLoop` or `agent-runtime-loop` | root |
74
59
  | Build code reliably (reviewed, gated) | `createDefaultCoderDelegate` | `/mcp` |
75
- | Grow a KB with only grounded facts | `createKbGate` | `/mcp` |
76
- | Improve a prompt safely (identity-gated) | `optimizePrompt` | `/improvement` |
77
- | Ship loop traces to a GenAI viewer | `buildLoopOtelSpans` + `createOtelExporter` | root |
78
- | Expose delegation as MCP tools to a sandbox agent | `createMcpServer` / `agent-runtime-mcp` | `/mcp` |
60
+ | Grow a knowledge base with only grounded facts | `createKbGate` | `/mcp` |
61
+ | Improve a prompt safely (identity-gated) | `selfImprove` | `@tangle-network/agent-eval/contract` |
62
+ | Ship loop traces to a GenAI viewer | `buildLoopOtelSpans` plus `createOtelExporter` | root |
63
+ | Expose delegation as MCP tools to a sandbox agent | `createMcpServer` or `agent-runtime-mcp` | `/mcp` |
79
64
  | Mutate surfaces from trace findings | `runAnalystLoop` | `/analyst-loop` |
80
- | Persist a run + cost ledger | `startRuntimeRun` | root |
81
-
82
- ---
83
-
84
- ## Capabilities
85
-
86
- ### 1. Chat turns — `handleChatTurn`
65
+ | Persist a run plus its cost ledger | `startRuntimeRun` | root |
87
66
 
88
- The production turn envelope: frames a producer with the `session.run.*` NDJSON protocol, the persist → post-process → trace-flush hook order, and a stable execution id for client-retry replay. See [Getting started](#getting-started) and [`examples/chat-handler/`](./examples/chat-handler/).
67
+ ## The loop kernel
89
68
 
90
- ### 2. Driven loops + topology drivers
91
-
92
- `runLoop` is a topology-agnostic kernel: each iteration spawns a sandbox on an `AgentRunSpec`, decodes the output, validates it, and asks a **driver** what to do next. The driver owns topology; the validator owns scoring; the kernel owns iteration accounting, concurrency, cost/token aggregation, and trace emission.
69
+ `runLoop` is a topology-agnostic kernel. Each iteration spawns a sandbox on an `AgentRunSpec`, decodes the output, validates it, and asks a driver what to do next. The driver owns topology. The validator owns scoring. The kernel owns iteration accounting, concurrency, cost and token aggregation, and trace emission.
93
70
 
94
71
  ```ts
95
72
  import { runLoop, createFanoutVoteDriver } from '@tangle-network/agent-runtime/loops'
96
73
 
97
74
  const result = await runLoop({
98
- driver: createFanoutVoteDriver({ n: 3 }), // 3 parallel attempts, pick the best valid one
99
- agentRuns: [claudeSpec, codexSpec, glmSpec], // heterogeneous: one harness per branch
100
- output, // events → typed Output
101
- validator, // Output → { valid, score }
75
+ driver: createFanoutVoteDriver({ n: 3 }), // 3 parallel attempts, pick the best valid one
76
+ agentRuns: [claudeSpec, codexSpec, glmSpec], // heterogeneous: one harness per branch
77
+ output, // events to typed Output
78
+ validator, // Output to { valid, score }
102
79
  task,
103
80
  ctx: { sandboxClient: sandbox },
104
81
  })
105
82
  result.winner // highest-scoring valid attempt
106
83
  ```
107
84
 
108
- Shipped drivers (`/loops/drivers`): **`createRefineDriver`** (single task, iterate until valid) and **`createFanoutVoteDriver`** (N parallel, vote). See [`examples/coder-loop/`](./examples/coder-loop/) and [`examples/researcher-loop/`](./examples/researcher-loop/).
85
+ Shipped drivers (`/loops/drivers`): `createRefineDriver` (single task, iterate until valid), `createFanoutVoteDriver` (N parallel, vote), and `createDynamicDriver` (the agent authors the topology at runtime). The dynamic driver emits one `TopologyMove` per round (`refine`, `fanout`, or `stop`) from an injected planner; a malformed move throws `PlannerError`, so the loop never runs a topology nobody chose. Topology is orthogonal to harness: the planner never names a backend, and the kernel's `agentRuns` decide which harness runs each branch.
109
86
 
110
- ### 3. Agent-authored topology — `createDynamicDriver`
87
+ `runProgram` (also in `/loops`) is the recursive op-set (`sample`, `steer`, `fork`, `parallel`, `select`, `seq`, `stop`) plus a tree executor, for programs that compose sub-loops.
111
88
 
112
- The third driver lets the **agent author the loop topology at runtime** — refine, fan out, or stop, decided per round by an injected planner. Topology is orthogonal to harness: the planner never names a backend; the kernel's `agentRuns` round-robin decides which harness runs each branch.
89
+ ## Self-improvement
113
90
 
114
- ```ts
115
- import { runLoop, createDynamicDriver, createSandboxPlanner } from '@tangle-network/agent-runtime/loops'
91
+ The same machinery, run at the optimization timescale.
116
92
 
117
- const planner = createSandboxPlanner({
118
- client: sandbox,
119
- profile: { name: 'planner', metadata: { backendType: 'claude-code' } }, // cheap model is fine
120
- decodeTask: (raw) => raw as Task,
121
- })
93
+ The one entry point is agent-eval's **`selfImprove`** (`@tangle-network/agent-eval/contract`). It runs a closed loop over any text/config surface, identity-gated by construction: it evaluates, proposes candidates (default `gepaDriver`), and a held-out gate ships a winner only if it beats the baseline. `result.winner.surface` is the baseline unless `result.gateDecision === 'ship'`, so registering a surface for optimization can never regress it.
122
94
 
123
- const result = await runLoop({
124
- driver: createDynamicDriver({ planner, maxIterations: 8 }),
125
- agentRuns: [claudeSpec, codexSpec], // the planner can fan a single round across both
126
- output, validator, task,
127
- ctx: { sandboxClient: sandbox },
95
+ ```ts
96
+ import { selfImprove } from '@tangle-network/agent-eval/contract'
97
+
98
+ const result = await selfImprove({
99
+ baselineSurface: CURRENT_SYSTEM_PROMPT,
100
+ agent: (surface, scenario, ctx) => runYourThing(surface, scenario),
101
+ scenarios,
102
+ judge,
103
+ budget: { holdoutScenarios, generations: 3 },
104
+ llm: { baseUrl, apiKey, model: 'claude-sonnet-4-6' },
128
105
  })
106
+ // result.winner.surface is the safe one — the baseline unless gateDecision === 'ship'
129
107
  ```
130
108
 
131
- The planner emits one `TopologyMove` per round (`refine` | `fanout` | `stop`) with a rationale; a malformed move throws `PlannerError` (the loop never runs a topology nobody chose).
109
+ agent-runtime contributes the runtime-specific piece: the **CODE-surface `improvementDriver`** (`/improvement`) — a git-worktree mutator you pass to `selfImprove` as `driver` to optimize code instead of a string.
132
110
 
133
- ### 4. Delegated loop-runner — `runDelegatedLoop`
111
+ `runAnalystLoop` (`/analyst-loop`) mines real run traces into findings; `createAnalystDriverHook` feeds those findings to a dynamic-driver planner via `PlannerContext.analyses`, with a firewall (`assertTraceDerivedFindings`) that rejects any finding derived from a judge verdict. Production intake — turning real run traces into the corpus `selfImprove` optimizes against — is agent-eval's `analyzeRuns` / `partitionRunsByAuthoringModel` (`/contract`).
134
112
 
135
- One configured entrypoint a worker agent (or a scheduled routine) calls to run a disciplined loop in a chosen **mode**, over the hardened engines below. Fail-loud on an unwired mode; a thrown engine is captured as `{ ok: false }` so unattended runs *record* rather than crash.
113
+ ## Delegated loops
114
+
115
+ `runDelegatedLoop` is one entrypoint a worker agent or a scheduled routine calls to run a disciplined loop in a chosen mode, over the hardened engines below. It fails loud on an unwired mode; a thrown engine is captured as `{ ok: false }`, so unattended runs record rather than crash.
136
116
 
137
117
  ```ts
138
- import {
139
- runDelegatedLoop, coderLoopRunner, researchLoopRunner, type DelegatedLoopRegistry,
140
- } from '@tangle-network/agent-runtime'
118
+ import { runDelegatedLoop, coderLoopRunner, researchLoopRunner, type DelegatedLoopRegistry } from '@tangle-network/agent-runtime'
141
119
 
142
120
  const registry: DelegatedLoopRegistry = {
143
- code: coderLoopRunner({
144
- sandboxClient,
145
- args: { goal: 'fix the flaky retry test', repoRoot: '/repo' },
146
- reviewer, // optional adversarial gate
147
- winnerSelection: 'smallest-diff',
148
- }),
121
+ code: coderLoopRunner({ sandboxClient, args: { goal: 'fix the flaky retry test', repoRoot: '/repo' }, reviewer, winnerSelection: 'smallest-diff' }),
149
122
  research: researchLoopRunner({ research, gate: { selfArtifactKinds: ['spec'] }, maxRounds: 3 }),
150
123
  }
151
-
152
124
  const result = await runDelegatedLoop('code', registry)
153
- // → { mode: 'code', ok: true, output: CoderOutput, durationMs }
154
- ```
155
-
156
- Modes: `code` · `review` · `research` · `audit` · `self-improve` · `dynamic` — each with a default factory (`coderLoopRunner`, `reviewLoopRunner`, `researchLoopRunner`, `dynamicLoopRunner`, `selfImproveLoopRunner`, `auditLoopRunner`).
157
-
158
- **Schedulable**: the `agent-runtime-loop` bin runs it from a cron/routine. The config module wires the registry (with full env/creds access):
159
-
160
- ```bash
161
- agent-runtime-loop --mode research --config ./loops.config.js
162
- # exits 0 (ok) · 1 (recorded failure) · 2 (usage/config error); prints the result as JSON
163
- ```
164
-
165
- ```ts
166
- // loops.config.js — default-exports a DelegatedLoopRegistry (or a factory)
167
- import { researchLoopRunner } from '@tangle-network/agent-runtime'
168
- export default { research: researchLoopRunner({ research: myResearchEngine, maxRounds: 3 }) }
169
- ```
170
-
171
- ### 5. Reliable build-in-a-loop — the coder delegate
172
-
173
- `createDefaultCoderDelegate` drives a coder loop with **default-on safety gates** so it never ships junk:
174
-
175
- - **no-op rejection** — an empty patch can't "pass" trivially,
176
- - **secret-path floor** — always-on, independent of `forbiddenPaths` (`.env`, keys, wallets, …),
177
- - optional **`reviewer`** gate — a candidate must pass tests/typecheck **and** be approved to win,
178
- - **`winnerSelection`** — `highest-score` (default) · `smallest-diff` · `highest-readiness` · `first-approved`.
179
-
180
- ```ts
181
- import { createDefaultCoderDelegate } from '@tangle-network/agent-runtime/mcp'
182
-
183
- const coder = createDefaultCoderDelegate({
184
- sandboxClient,
185
- fanoutHarnesses: ['claude-code', 'codex'],
186
- reviewer: async (output, task) => ({ approved: output.testResult.passed, recommendation: 'ship', readiness: 0.9 }),
187
- winnerSelection: 'highest-readiness',
188
- })
189
- const out = await coder({ goal: 'add a retry with backoff', repoRoot: '/repo', variants: 2 }, ctx)
190
125
  ```
191
126
 
192
- See [`examples/coder-loop/`](./examples/coder-loop/) and [`examples/agent-into-reviewer/`](./examples/agent-into-reviewer/).
193
-
194
- ### 6. Valid-only research — `createKbGate`
195
-
196
- A fail-closed gate so a knowledge base grows with **only grounded facts**. The always-on floor: a fact's `verbatimPassage` must literally appear in its `sourceText` (anti-hallucination), the asserted value must be in the passage, and citations can't point at self-generated artifacts (laundering). Plug in your own judges; verdict-only (remediation is yours).
197
-
198
- ```ts
199
- import { createKbGate } from '@tangle-network/agent-runtime/mcp'
200
-
201
- const gate = createKbGate({ selfArtifactKinds: ['spec', 'cad_params'] })
202
- const verdict = await gate({
203
- claim: 'revenue was $1.2B in 2025',
204
- value: 1_200_000_000,
205
- verbatimPassage: 'total revenue was $1,200,000,000 for the fiscal year',
206
- sourceText: rawSource,
207
- })
208
- if (verdict.accepted) writeToKb(fact)
209
- else console.warn('vetoed by', verdict.vetoedBy, verdict.reason)
210
- ```
211
-
212
- `researchLoopRunner` (mode `research`) wraps this with a correct-on-veto remediation loop: research → gate → re-research the vetoed gaps up to `maxRounds`, then **return** the unverified ones (escalate, never silently drop).
213
-
214
- ### 7. Identity-gated prompt optimization — `optimizePrompt`
127
+ Modes: `code`, `review`, `research`, `audit`, `self-improve`, `dynamic`. The `agent-runtime-loop` bin runs the registry from a cron or routine and exits 0 (ok), 1 (recorded failure), or 2 (usage or config error).
215
128
 
216
- Optimize any text prompt over agent-eval's `runImprovementLoop`, **identity-gated by construction**: it runs evals, proposes candidates (default `gepaDriver`), and the held-out gate compares candidate vs baseline. `result.prompt` is the **baseline unless the gate decided `ship`** — so registering a prompt for optimization can never regress it.
129
+ The coder delegate (`createDefaultCoderDelegate`, `/mcp`) has default-on safety gates: no-op rejection (an empty patch cannot pass trivially), an always-on secret-path floor (`.env`, keys, wallets), an optional `reviewer` gate, and a `winnerSelection` policy (`highest-score`, `smallest-diff`, `highest-readiness`, `first-approved`).
217
130
 
218
- ```ts
219
- import { optimizePrompt } from '@tangle-network/agent-runtime/improvement'
220
-
221
- const { prompt, improved, delta } = await optimizePrompt({
222
- baselinePrompt: CURRENT_SYSTEM_PROMPT,
223
- runWithPrompt: (candidate, scenario, ctx) => runYourThing(candidate, scenario),
224
- scenarios, holdoutScenarios, judges, runDir,
225
- reflection: { llm, model: 'claude-sonnet-4-6' },
226
- })
227
- // assign `prompt` unconditionally — it's the safe one
228
- ```
131
+ The knowledge-base gate (`createKbGate`, `/mcp`) is fail-closed: a fact's `verbatimPassage` must appear in its `sourceText`, the asserted value must be in the passage, and citations cannot point at self-generated artifacts. `researchLoopRunner` wraps it with a correct-on-veto loop that re-researches the vetoed gaps up to `maxRounds`, then returns the unverified ones rather than dropping them.
229
132
 
230
- See [`examples/self-improving-loop/`](./examples/self-improving-loop/).
133
+ ## Tracing
231
134
 
232
- ### 8. OpenTelemetry GenAI topology tracing
233
-
234
- `runLoop` emits a structured event stream; `buildLoopOtelSpans` turns it into a **nested, real-duration span tree** that any GenAI trace viewer (Phoenix, Langfuse, Grafana Tempo, Tangle Intelligence) renders natively. Attributes follow the current GenAI semantic conventions (`gen_ai.operation.name`, `gen_ai.agent.name`, `gen_ai.usage.input_tokens/output_tokens`) plus a `tangle.loop.*` extension for the topology (move kind/rationale, edge lineage, verdict, placement, cost).
135
+ `runLoop` emits a structured event stream. `buildLoopOtelSpans` turns it into a nested, real-duration span tree that any GenAI trace viewer (Phoenix, Langfuse, Grafana Tempo, Tangle Intelligence) renders natively. Attributes follow the current GenAI semantic conventions (`gen_ai.operation.name`, `gen_ai.agent.name`, `gen_ai.usage.input_tokens`, `gen_ai.usage.output_tokens`) plus a `tangle.loop.*` extension for the topology (move kind and rationale, edge lineage, verdict, placement, cost).
235
136
 
236
137
  ```ts
237
138
  import { buildLoopOtelSpans, createOtelExporter } from '@tangle-network/agent-runtime'
@@ -241,94 +142,87 @@ for (const span of buildLoopOtelSpans(loopEvents, traceId)) exporter?.exportSpan
241
142
  await exporter?.flush()
242
143
  ```
243
144
 
244
- The shape: `loop → loop.round (move + rationale) → loop.iteration (agent, usage, verdict, cost, parent edge)`. See [`examples/with-intelligence-export/`](./examples/with-intelligence-export/).
145
+ The shape: `loop` to `loop.round` (move plus rationale) to `loop.iteration` (agent, usage, verdict, cost, parent edge).
245
146
 
246
- ### 9. MCP delegation server — `agent-runtime-mcp`
147
+ ## MCP delegation server
247
148
 
248
- Expose the five delegation tools (`delegate_code`, `delegate_research`, `delegate_feedback`, `delegation_status`, `delegation_history`) to a sandbox coding-harness agent — mount the canonical server, don't fork delegation logic.
149
+ Expose the delegation tools (`delegate_code`, `delegate_research`, `delegate_feedback`, `delegation_status`, `delegation_history`) to a sandbox coding agent. Mount the canonical server instead of forking delegation logic.
249
150
 
250
151
  ```ts
251
152
  import { createMcpServer, createDefaultCoderDelegate } from '@tangle-network/agent-runtime/mcp'
252
153
 
253
- const server = createMcpServer({
254
- coderDelegate: createDefaultCoderDelegate({ sandboxClient }),
255
- researcherDelegate, // wire your KB-backed researcher
256
- })
154
+ const server = createMcpServer({ coderDelegate: createDefaultCoderDelegate({ sandboxClient }), researcherDelegate })
257
155
  ```
258
156
 
259
- Or mount the `agent-runtime-mcp` stdio bin on a production `AgentProfile.mcp`. See [`examples/mcp-delegation/`](./examples/mcp-delegation/) and [`examples/fleet-delegation/`](./examples/fleet-delegation/).
157
+ Or mount the `agent-runtime-mcp` stdio bin on a production `AgentProfile.mcp`.
260
158
 
261
- ---
159
+ ## The experiment harness (bench/)
262
160
 
263
- ## Defaults
161
+ `bench/` is the internal harness that asks the binding empirical question: does any non-blind topology beat blind compute at equal k, under a deployable (non-oracle) selector, on a real benchmark? It runs through the same kernel, not a reimplementation.
264
162
 
265
- When nothing is specified:
163
+ One entrypoint, `runExperiment(adapter, { sandboxClient, agentRun, arms, ... })`: N instances times a set of arms, each arm a topology driven through `runLoop`, judged by the adapter, written to a durable canonical corpus. An arm is one steer function `f(rootPrompt, history) => nextPrompt`: `random` ignores history (the compute control), `refine` carries the prior answer plus a directive, `diverse` rotates a strategy lens. The cost dial is the backend type (`hermes` for a direct router call, `opencode` or `claude-code` or `codex` for agent CLIs). The deep statistics (paired bootstrap with Benjamini-Hochberg correction, selector replay) come from `corpus-report.mts` and `corpus-replay.mts` over the written corpus, computed once. See `bench/HARNESS.md` and `docs/learning-flywheel.md`.
164
+
165
+ ## Defaults
266
166
 
267
167
  | Knob | Default | Override |
268
168
  |---|---|---|
269
- | Backend model | `gpt-4o-mini` (via `createOpenAICompatibleBackend`) | `model` option / `MODEL_NAME` env |
169
+ | Backend model | `gpt-4o-mini` (via `createOpenAICompatibleBackend`) | `model` option or `MODEL_NAME` env |
270
170
  | Backend provider | `openai-compat` when `TANGLE_API_KEY`, else `openai` if `OPENAI_API_KEY` | `MODEL_PROVIDER` env |
271
171
  | Router base URL | `https://router.tangle.tools/v1` | `TANGLE_ROUTER_BASE_URL` env |
272
172
  | Sandbox base URL | `https://sandbox.tangle.tools` | `SANDBOX_API_URL` env |
273
- | Loop iteration cap | 10 (`runLoop`); dynamic driver 8 | `runLoop({ maxIterations })` |
274
- | Driver | none required by `runLoop` | `createRefineDriver` / `createFanoutVoteDriver` / `createDynamicDriver` |
173
+ | Loop iteration cap | 10 (`runLoop`), 8 (dynamic driver) | `runLoop({ maxIterations })` |
174
+ | Driver | none, required by `runLoop` | `createRefineDriver`, `createFanoutVoteDriver`, `createDynamicDriver` |
275
175
  | Winner selection (coder delegate) | `highest-score` | `winnerSelection` option |
276
176
  | KB gate min passage | 12 chars | `createKbGate({ minPassageChars })` |
277
- | `optimizePrompt` gate | `heldOutGate` | `defaultProductionGate` for red-team hardening |
177
+ | `selfImprove` gate | held-out gate (default) | pass `gate: defaultProductionGate` for red-team hardening |
278
178
  | OTEL export | off | set `OTEL_EXPORTER_OTLP_ENDPOINT` |
279
179
  | Loop-runner mode failure | recorded as `{ ok: false }` | `runDelegatedLoop` never crashes on a thrown engine |
280
180
 
281
- ---
282
-
283
181
  ## Composition with the stack
284
182
 
285
183
  ```
286
- agent-runtime ── handleChatTurn · runLoop + drivers · runDelegatedLoop · createMcpServer
287
- optimizePrompt · createKbGate · buildLoopOtelSpans · defineAgent
184
+ agent-runtime handleChatTurn, runLoop + drivers, runProgram, runDelegatedLoop, createMcpServer,
185
+ improvementDriver, createKbGate, buildLoopOtelSpans, defineAgent
288
186
 
289
- agent-eval ── runEvalCampaign · runImprovementLoop (gepaDriver) · heldOutGate · runAgentMatrix
290
- (consumes runtime traces, scores, gates promotion)
187
+ agent-eval selfImprove (the optimization entry point), runEvalCampaign,
188
+ runImprovementLoop (gepaDriver), heldOutGate, runAgentMatrix, analyzeRuns.
189
+ Consumes runtime traces, scores, gates promotion. agent-runtime depends on it,
190
+ never the reverse.
291
191
 
292
- agent-knowledge ── proposeKnowledgeWrites / applyKnowledgeWriteBlocks
293
- (analyst-loop produces these; runtime + createKbGate consume them)
192
+ agent-knowledge proposeKnowledgeWrites, applyKnowledgeWriteBlocks. The analyst loop produces
193
+ these; the runtime and createKbGate consume them.
294
194
 
295
- sandbox ── AgentProfile · Sandbox.create · streamPrompt · exportTraceBundle
296
- (the harness execution surface every loop runs on)
195
+ sandbox AgentProfile, Sandbox.create, streamPrompt, exportTraceBundle. The harness
196
+ execution surface every loop runs on.
297
197
  ```
298
198
 
299
- ---
300
-
301
199
  ## Subpath exports
302
200
 
303
201
  | Import | Owns |
304
202
  |---|---|
305
203
  | `@tangle-network/agent-runtime` | chat turns, delegated loop-runner, OTEL export, errors, model resolution |
306
- | `…/agent` | `defineAgent` + surfaces / outcome adapters |
307
- | `…/loops` | `runLoop` kernel + `refine` / `fanout-vote` / **`dynamic`** drivers + `loopDispatch` |
308
- | `…/profiles` | `coderProfile`, `researcherProfile` presets |
309
- | `…/mcp` | `createMcpServer`, `createDefaultCoderDelegate`, **`createKbGate`**, `agent-runtime-mcp` bin |
310
- | `…/improvement` | **`optimizePrompt`** (text) + `improvementDriver` (code/worktree) |
311
- | `…/analyst-loop` | `runAnalystLoop` analyst registry driver |
312
- | `…/platform` | cross-site SSO + integrations hub |
313
-
314
- Bins: `agent-runtime-mcp` (delegation MCP server) · `agent-runtime-loop` (schedulable delegated loop-runner).
204
+ | `.../agent` | `defineAgent` plus surface and outcome adapters |
205
+ | `.../loops` | the `runLoop` kernel, the `refine` / `fanout-vote` / `dynamic` drivers, `runProgram`, `loopDispatch` |
206
+ | `.../profiles` | `coderProfile`, `researcherProfile` presets |
207
+ | `.../mcp` | `createMcpServer`, `createDefaultCoderDelegate`, `createKbGate`, the `agent-runtime-mcp` bin |
208
+ | `.../improvement` | `improvementDriver` (code/worktree `CandidateGenerator`), `agenticGenerator`, `reflectiveGenerator` — the code-surface driver you pass to agent-eval's `selfImprove` |
209
+ | `.../analyst-loop` | `runAnalystLoop`, the analyst registry driver |
210
+ | `.../platform` | cross-site SSO and the integrations hub |
315
211
 
316
- ---
212
+ Bins: `agent-runtime-mcp` (delegation MCP server), `agent-runtime-loop` (schedulable delegated loop-runner).
317
213
 
318
214
  ## Adoption skill
319
215
 
320
- This package ships a **self-contained adoption skill** at [`skills/agent-runtime-adoption/SKILL.md`](./skills/agent-runtime-adoption/SKILL.md) — driven loops, topology drivers, the `loopDispatch` campaign bridge, MCP delegation, and identity-gated `optimizePrompt`. It needs only this package + `@tangle-network/agent-eval`, so external consumers need nothing private. For the full self-improving pipeline (trace sink → analyst loop → scorecard → production loop → CI), see the `agent-eval-adoption` / `agent-stack-adoption` skills.
321
-
322
- ---
216
+ This package ships a self-contained adoption skill at [`skills/agent-runtime-adoption/SKILL.md`](./skills/agent-runtime-adoption/SKILL.md): driven loops, topology drivers, the `loopDispatch` campaign bridge, MCP delegation, and the code-surface `improvementDriver` for agent-eval's `selfImprove`. It needs only this package plus `@tangle-network/agent-eval`. For the full self-improving pipeline (trace sink, analyst loop, scorecard, production loop, CI), see the `agent-eval-adoption` and `agent-stack-adoption` skills.
323
217
 
324
- ## Stability · Tests · Docs
218
+ ## Stability, tests, docs
325
219
 
326
- Every public export is annotated `@stable` or `@experimental`. `@stable` exports don't change shape inside a minor; `@experimental` ones may and require a deliberate consumer bump.
220
+ Every public export is annotated `@stable` or `@experimental`. `@stable` exports do not change shape inside a minor version; `@experimental` ones may, and require a deliberate consumer bump.
327
221
 
328
222
  ```bash
329
- pnpm test # full suite across the kernel, drivers, MCP, delegate hardening, kb-gate, loop-runner, backends
223
+ pnpm test # kernel, drivers, MCP, delegate hardening, kb-gate, loop-runner, backends
330
224
  pnpm typecheck
331
225
  pnpm build
332
226
  ```
333
227
 
334
- Deeper docs: [`docs/concepts.md`](./docs/concepts.md) (mental model) · [`docs/agent-bus-protocol.md`](./docs/agent-bus-protocol.md) (cross-gateway header contract) · [`docs/conversation-economics.md`](./docs/conversation-economics.md) (who pays — `authSource`) · [`docs/durability-adapters.md`](./docs/durability-adapters.md) (SQL-backed `ConversationJournal`).
228
+ Deeper docs: [`docs/architecture.md`](./docs/architecture.md) (the canonical spine), [`docs/learning-flywheel.md`](./docs/learning-flywheel.md) (the self-improvement thesis and the open gate), [`docs/concepts.md`](./docs/concepts.md) (mental model), [`docs/agent-bus-protocol.md`](./docs/agent-bus-protocol.md) (cross-gateway header contract), [`docs/conversation-economics.md`](./docs/conversation-economics.md) (who pays), [`docs/durability-adapters.md`](./docs/durability-adapters.md) (SQL-backed `ConversationJournal`).
package/dist/agent.d.ts CHANGED
@@ -1,11 +1,12 @@
1
1
  import * as _tangle_network_agent_eval from '@tangle-network/agent-eval';
2
2
  import { TraceAnalystKindSpec, AnalystFinding } from '@tangle-network/agent-eval';
3
- import { R as RuntimeStreamEvent, L as LoopSandboxClient, a as OutputAdapter, A as AgentRunSpec } from './types-Bcp071Jg.js';
4
- import { A as AgentSurfaces } from './improvement-adapter-BC4HhuAR.js';
5
- export { C as CreateSurfaceImprovementAdapterOpts, D as DraftPatchInput, a as DraftPatchOutput, R as ResolvedSurface, S as SurfaceImprovementEdit, b as SurfaceValidationIssue, c as createSurfaceImprovementAdapter, r as renderSurfaceIssues, d as resolveSubjectPath, v as validateSurfaces } from './improvement-adapter-BC4HhuAR.js';
6
- import { K as KnowledgeAdapter, a as RunAnalystLoopResult } from './types-p8dWBIXL.js';
3
+ import { R as RuntimeStreamEvent, b as LoopSandboxClient, O as OutputAdapter, A as AgentRunSpec } from './types-DdzkffAm.js';
4
+ import { A as AgentSurfaces } from './improvement-adapter-CWegd3vw.js';
5
+ export { C as CreateSurfaceImprovementAdapterOpts, D as DraftPatchInput, a as DraftPatchOutput, R as ResolvedSurface, S as SurfaceImprovementEdit, b as SurfaceValidationIssue, c as createSurfaceImprovementAdapter, r as renderSurfaceIssues, d as resolveSubjectPath, v as validateSurfaces } from './improvement-adapter-CWegd3vw.js';
6
+ import { K as KnowledgeAdapter, a as RunAnalystLoopResult } from './types-BtRLF2U3.js';
7
7
  import { AgentProfile, SandboxEvent } from '@tangle-network/sandbox';
8
8
  import { C as ComposeProductionAgentProfileOptions } from './delegation-profile-1GbW5yA3.js';
9
+ import './runtime-hooks-C7JwKb9E.js';
9
10
 
10
11
  /**
11
12
  * The full agent manifest. Each agent ships ONE of these.
package/dist/agent.js CHANGED
@@ -2,11 +2,11 @@ import {
2
2
  composeProductionAgentProfile
3
3
  } from "./chunk-7JITYN6T.js";
4
4
  import {
5
- createSandboxForSpec,
5
+ createSandboxForSpec
6
+ } from "./chunk-KEWO4KI6.js";
7
+ import {
6
8
  mapSandboxEvent
7
- } from "./chunk-EKBSQYZE.js";
8
- import "./chunk-PY6NMZYX.js";
9
- import "./chunk-SQSCRJ7U.js";
9
+ } from "./chunk-PRX45WE2.js";
10
10
  import {
11
11
  __require
12
12
  } from "./chunk-DGUM43GV.js";
@@ -193,9 +193,7 @@ function defineAgent(manifest) {
193
193
  // src/agent/improvement-adapter.ts
194
194
  import { spawnSync } from "child_process";
195
195
  import { readFileSync } from "fs";
196
- import {
197
- parseFindingSubject
198
- } from "@tangle-network/agent-eval";
196
+ import { parseFindingSubject } from "@tangle-network/agent-eval/analyst";
199
197
  var DEFAULT_CREATE_KINDS = [
200
198
  "knowledge.wiki",
201
199
  "knowledge.claim",