@tangle-network/agent-runtime 0.48.0 → 0.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -15
- package/dist/agent.d.ts +1 -1
- package/dist/agent.js +1 -1
- package/dist/analyst-loop.d.ts +1 -1
- package/dist/{chunk-656G2XCL.js → chunk-BKAIVNFA.js} +3 -3
- package/dist/{chunk-IW2LMLK6.js → chunk-CM2IK7VS.js} +913 -152
- package/dist/chunk-CM2IK7VS.js.map +1 -0
- package/dist/{chunk-VR4JIC5H.js → chunk-ML4IXGTV.js} +2 -2
- package/dist/{chunk-TJS7S3HJ.js → chunk-NDM5VXZW.js} +19 -8
- package/dist/chunk-NDM5VXZW.js.map +1 -0
- package/dist/chunk-OM3YNZIW.js +978 -0
- package/dist/chunk-OM3YNZIW.js.map +1 -0
- package/dist/{chunk-JNPK46YH.js → chunk-RHW75JW5.js} +498 -350
- package/dist/chunk-RHW75JW5.js.map +1 -0
- package/dist/{coder-CVZNGbyg.d.ts → coder-_YCf3BAK.d.ts} +2 -2
- package/dist/{driver-DYU2sgHr.d.ts → driver-DLI1io57.d.ts} +1 -1
- package/dist/index.d.ts +34 -9
- package/dist/index.js +117 -27
- package/dist/index.js.map +1 -1
- package/dist/kb-gate-CHAyt4aI.d.ts +1571 -0
- package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-DFUNgpeK.d.ts} +4 -4
- package/dist/loop-runner-bin.d.ts +5 -5
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +6 -6
- package/dist/loops.js +17 -1
- package/dist/mcp/bin.js +206 -29
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +41 -177
- package/dist/mcp/index.js +40 -6
- package/dist/mcp/index.js.map +1 -1
- package/dist/openai-tools-D4HLDWgw.d.ts +45 -0
- package/dist/platform.js +2 -2
- package/dist/platform.js.map +1 -1
- package/dist/profiles.d.ts +2 -2
- package/dist/{run-loop-DvD4aGiE.d.ts → run-loop-BIineL1T.d.ts} +1 -1
- package/dist/runtime.d.ts +403 -24
- package/dist/runtime.js +17 -1
- package/dist/{types-BpDfCPUp.d.ts → types-5MGt5KTY.d.ts} +1 -1
- package/dist/{types-nBMuollC.d.ts → types-BEQsBhOE.d.ts} +1 -1
- package/dist/workflow.d.ts +2 -2
- package/dist/workflow.js +1 -1
- package/package.json +6 -5
- package/dist/chunk-IW2LMLK6.js.map +0 -1
- package/dist/chunk-JNPK46YH.js.map +0 -1
- package/dist/chunk-LX66I3SC.js +0 -218
- package/dist/chunk-LX66I3SC.js.map +0 -1
- package/dist/chunk-TJS7S3HJ.js.map +0 -1
- package/dist/kb-gate-51BlLlVM.d.ts +0 -529
- package/dist/otel-export-EzfsVUhh.d.ts +0 -191
- /package/dist/{chunk-656G2XCL.js.map → chunk-BKAIVNFA.js.map} +0 -0
- /package/dist/{chunk-VR4JIC5H.js.map → chunk-ML4IXGTV.js.map} +0 -0
package/README.md
CHANGED
|
@@ -2,7 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
The shared task-lifecycle skeleton for agents. It runs an agent (a chat turn, a one-shot task, or a multi-attempt loop), captures every run as a trace, and feeds those traces into eval-gated self-improvement.
|
|
4
4
|
|
|
5
|
-
It owns the lifecycle
|
|
5
|
+
It owns the lifecycle, the loop kernel, and the **optimization suite** — `Environment` + `Strategy` +
|
|
6
|
+
`runBenchmark` + `runStrategyEvolution`, the published surface for measuring and evolving how an agent
|
|
7
|
+
spends compute against a deployable check. It delegates domain behavior (models, tools, knowledge) to
|
|
8
|
+
adapters, scoring statistics and the ship gate to [`@tangle-network/agent-eval`](https://www.npmjs.com/package/@tangle-network/agent-eval), and sandboxed long-running execution to [`@tangle-network/sandbox`](https://www.npmjs.com/package/@tangle-network/sandbox).
|
|
6
9
|
|
|
7
10
|
```bash
|
|
8
11
|
pnpm add @tangle-network/agent-runtime @tangle-network/agent-eval @tangle-network/sandbox
|
|
@@ -53,8 +56,9 @@ That is the common case. Everything below is for when one chat turn is not enoug
|
|
|
53
56
|
| Run a production chat turn (most products) | `handleChatTurn` | root |
|
|
54
57
|
| Declare an agent (profile, surfaces, adapters) | `defineAgent` | `/agent` |
|
|
55
58
|
| Run a one-shot task with verification and eval | `runAgentTask` | root |
|
|
56
|
-
|
|
|
57
|
-
| Let the
|
|
59
|
+
| Compare optimization strategies on YOUR domain (5 hooks) | `runBenchmark` + `defineStrategy` | `/loops` |
|
|
60
|
+
| Let the system author + evolve its own strategies, gated | `runStrategyEvolution` · `authorStrategy` · `promotionGate` | `/loops` |
|
|
61
|
+
| Run a multi-attempt loop with a custom driver | `runLoop` + `createDriver` | `/loops` |
|
|
58
62
|
| Delegate a disciplined loop by mode (code, research, ...) | `runDelegatedLoop` or `agent-runtime-loop` | root |
|
|
59
63
|
| Build code reliably (reviewed, gated) | `createDefaultCoderDelegate` | `/mcp` |
|
|
60
64
|
| Grow a knowledge base with only grounded facts | `createKbGate` | `/mcp` |
|
|
@@ -64,15 +68,50 @@ That is the common case. Everything below is for when one chat turn is not enoug
|
|
|
64
68
|
| Mutate surfaces from trace findings | `runAnalystLoop` | `/analyst-loop` |
|
|
65
69
|
| Persist a run plus its cost ledger | `startRuntimeRun` | root |
|
|
66
70
|
|
|
71
|
+
## The optimization suite
|
|
72
|
+
|
|
73
|
+
The canonical surface. A domain is an `Environment` (five hooks: `open`/`tools`/`call`/`score`/`close`);
|
|
74
|
+
a **strategy** is how a compute budget is spent to beat the domain's own deployable check. The suite ships two
|
|
75
|
+
built-ins (`sample` = best-of-N, `refine` = critique-and-continue) plus `defineStrategy` to compose
|
|
76
|
+
your own from two steps — and `authorStrategy`, where the system writes new strategies from its own
|
|
77
|
+
per-task losses:
|
|
78
|
+
|
|
79
|
+
```ts
|
|
80
|
+
import { defineStrategy, runBenchmark, sample, refine } from '@tangle-network/agent-runtime/loops'
|
|
81
|
+
|
|
82
|
+
const doubleCheck = defineStrategy('double-check', async ({ shot, critique }) => {
|
|
83
|
+
const first = await shot()
|
|
84
|
+
const steer = first ? await critique(first.messages) : null
|
|
85
|
+
const second = steer ? await shot({ messages: first?.messages, steer }) : null
|
|
86
|
+
const score = Math.max(first?.score ?? 0, second?.score ?? 0)
|
|
87
|
+
return { score, resolved: score >= 1, completions: 2, progression: [first?.score ?? 0, score], shots: 2 }
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
const report = await runBenchmark({ environment, tasks, worker, strategies: [sample, refine, doubleCheck], budget: 3 })
|
|
91
|
+
report.perTask // the losses table an author/optimizer consumes
|
|
92
|
+
report.pareto // the (score, $) frontier
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
The measurement invariants are structural, not advisory: every strategy spends through a conserved
|
|
96
|
+
budget pool (equal compute by construction), the deliverable score is **harness-verified** from the
|
|
97
|
+
shots actually brokered (a body cannot fabricate a win), and the critic is firewalled from the check
|
|
98
|
+
(selector ≠ judge). `runStrategyEvolution` runs the multi-generation search — populations of authored
|
|
99
|
+
candidates, cost-aware champion selection, a phase ledger with resume, and ONE promotion decision via
|
|
100
|
+
`promotionGate` (seeded paired bootstrap) on a holdout slice the search never touched.
|
|
101
|
+
`createVerifierEnvironment` adapts answer-shaped domains (one `check` function); `createMcpEnvironment`
|
|
102
|
+
adapts any MCP server. The consumer surface — loops as a service with a CLI, detached runner, and MCP
|
|
103
|
+
server — lives in the [`loops`](https://github.com/drewstone/loops) repo; the experiment harness and
|
|
104
|
+
evidence ledger live in [`bench/HARNESS.md`](./bench/HARNESS.md).
|
|
105
|
+
|
|
67
106
|
## The loop kernel
|
|
68
107
|
|
|
69
108
|
`runLoop` is a topology-agnostic kernel. Each iteration spawns a sandbox on an `AgentRunSpec`, decodes the output, validates it, and asks a driver what to do next. The driver owns topology. The validator owns scoring. The kernel owns iteration accounting, concurrency, cost and token aggregation, and trace emission.
|
|
70
109
|
|
|
71
110
|
```ts
|
|
72
|
-
import { runLoop,
|
|
111
|
+
import { runLoop, createDriver } from '@tangle-network/agent-runtime/loops'
|
|
73
112
|
|
|
74
113
|
const result = await runLoop({
|
|
75
|
-
driver:
|
|
114
|
+
driver: createDriver({ planner }), // the planner emits one TopologyMove per round
|
|
76
115
|
agentRuns: [claudeSpec, codexSpec, glmSpec], // heterogeneous: one harness per branch
|
|
77
116
|
output, // events to typed Output
|
|
78
117
|
validator, // Output to { valid, score }
|
|
@@ -82,9 +121,13 @@ const result = await runLoop({
|
|
|
82
121
|
result.winner // highest-scoring valid attempt
|
|
83
122
|
```
|
|
84
123
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
124
|
+
`createDriver` lets a planner author the topology at runtime: one `TopologyMove` per round
|
|
125
|
+
(`refine`, `fanout`, `select`, or `stop`); a malformed move throws `PlannerError`, so the loop never
|
|
126
|
+
runs a topology nobody chose. Topology is orthogonal to harness: the planner never names a backend,
|
|
127
|
+
and the kernel's `agentRuns` decide which harness runs each branch. For fixed shapes, write a small
|
|
128
|
+
inline `Driver` (see `examples/coder-loop`) or use the `personify` combinators (`fanout`, `loopUntil`,
|
|
129
|
+
`panel`, `pipeline`) over the recursive `Scope`/`Supervisor` core — the newer canonical path for
|
|
130
|
+
recursive work.
|
|
88
131
|
|
|
89
132
|
## Self-improvement
|
|
90
133
|
|
|
@@ -106,7 +149,12 @@ const result = await selfImprove({
|
|
|
106
149
|
// result.winner.surface is the safe one — the baseline unless gateDecision === 'ship'
|
|
107
150
|
```
|
|
108
151
|
|
|
109
|
-
agent-runtime contributes the runtime-specific
|
|
152
|
+
agent-runtime contributes the runtime-specific pieces: the **CODE-surface `improvementDriver`**
|
|
153
|
+
(`/improvement`) — a git-worktree mutator you pass to `selfImprove` as `driver` to optimize code
|
|
154
|
+
instead of a string — and **`runStrategyEvolution`** (`/loops`), the multi-generation search over
|
|
155
|
+
STRATEGY space: the system reads its own per-task losses, authors candidate strategies as code,
|
|
156
|
+
plays them against the incumbent at equal budget, and a seeded statistical gate decides promotion
|
|
157
|
+
on a never-touched holdout slice.
|
|
110
158
|
|
|
111
159
|
`runAnalystLoop` (`/analyst-loop`) mines real run traces into findings; `createAnalystDriverHook` feeds those findings to a dynamic-driver planner via `PlannerContext.analyses`, with a firewall (`assertTraceDerivedFindings`) that rejects any finding derived from a judge verdict. Production intake — turning real run traces into the corpus `selfImprove` optimizes against — is agent-eval's `analyzeRuns` / `partitionRunsByAuthoringModel` (`/contract`).
|
|
112
160
|
|
|
@@ -156,9 +204,15 @@ const server = createMcpServer({ coderDelegate: createDefaultCoderDelegate({ san
|
|
|
156
204
|
|
|
157
205
|
Or mount the `agent-runtime-mcp` stdio bin on a production `AgentProfile.mcp`.
|
|
158
206
|
|
|
207
|
+
Delegation state is in-memory by default — a server restart drops pending delegations and history. Set `AGENT_RUNTIME_DELEGATION_STATE_FILE=/path/state.json` on the bin (or construct via `DelegationTaskQueue.restore({ store: new FileDelegationStore({ filePath }) })`) to persist records across restarts: `delegation_status`/`delegation_history` keep answering for prior runs, idempotency keys dedupe resubmissions, and in-flight records either resume through the `resumeDelegate` seam (when submitted with a `detachedSessionRef`) or settle as failed with an explicit driver-restart error. A corrupt state file refuses to load (`DelegationStateCorruptError`); `AGENT_RUNTIME_DELEGATION_STATE_RECOVER=1` archives it and starts empty. `AGENT_RUNTIME_DELEGATION_RETAIN_TERMINAL=<n>` caps retained terminal records.
|
|
208
|
+
|
|
159
209
|
## The experiment harness (bench/)
|
|
160
210
|
|
|
161
|
-
`bench/` is the internal harness
|
|
211
|
+
`bench/` is the internal harness; [`bench/HARNESS.md`](./bench/HARNESS.md) is its map — read that
|
|
212
|
+
first. The canonical path is the optimization suite (`runBenchmark`/`flywheel-evolve` over real
|
|
213
|
+
domains: the EnterpriseOps gym, commit0, answer-shaped math); the older selection-gate paths
|
|
214
|
+
(`runExperiment`, corpus-replay) remain for the legacy evidence. The live evidence ledger is
|
|
215
|
+
`.evolve/current.json` — results never live in this README.
|
|
162
216
|
|
|
163
217
|
One entrypoint, `runExperiment(adapter, { sandboxClient, agentRun, arms, ... })`: N instances times a set of arms, each arm a topology driven through `runLoop`, judged by the adapter, written to a durable canonical corpus. An arm is one steer function `f(rootPrompt, history) => nextPrompt`: `random` ignores history (the compute control), `refine` carries the prior answer plus a directive, `diverse` rotates a strategy lens. The cost dial is the backend type (`hermes` for a direct router call, `opencode` or `claude-code` or `codex` for agent CLIs). The deep statistics (paired bootstrap with Benjamini-Hochberg correction, selector replay) come from `corpus-report.mts` and `corpus-replay.mts` over the written corpus, computed once. See `bench/HARNESS.md` and `docs/learning-flywheel.md`.
|
|
164
218
|
|
|
@@ -170,8 +224,9 @@ One entrypoint, `runExperiment(adapter, { sandboxClient, agentRun, arms, ... })`
|
|
|
170
224
|
| Backend provider | `openai-compat` when `TANGLE_API_KEY`, else `openai` if `OPENAI_API_KEY` | `MODEL_PROVIDER` env |
|
|
171
225
|
| Router base URL | `https://router.tangle.tools/v1` | `TANGLE_ROUTER_BASE_URL` env |
|
|
172
226
|
| Sandbox base URL | `https://sandbox.tangle.tools` | `SANDBOX_API_URL` env |
|
|
173
|
-
| Loop iteration cap | 10 (`runLoop`)
|
|
174
|
-
| Driver | none, required by `runLoop` | `
|
|
227
|
+
| Loop iteration cap | 10 (`runLoop`) | `runLoop({ maxIterations })` |
|
|
228
|
+
| Driver | none, required by `runLoop` | `createDriver` or an inline `Driver` |
|
|
229
|
+
| Strategy budget (suite) | 3 rollouts/shots per strategy per task | `runBenchmark({ budget })` |
|
|
175
230
|
| Winner selection (coder delegate) | `highest-score` | `winnerSelection` option |
|
|
176
231
|
| KB gate min passage | 12 chars | `createKbGate({ minPassageChars })` |
|
|
177
232
|
| `selfImprove` gate | held-out gate (default) | pass `gate: defaultProductionGate` for red-team hardening |
|
|
@@ -202,18 +257,27 @@ sandbox AgentProfile, Sandbox.create, streamPrompt, exportTraceBundle. T
|
|
|
202
257
|
|---|---|
|
|
203
258
|
| `@tangle-network/agent-runtime` | chat turns, delegated loop-runner, OTEL export, errors, model resolution |
|
|
204
259
|
| `.../agent` | `defineAgent` plus surface and outcome adapters |
|
|
205
|
-
| `.../loops` | the
|
|
260
|
+
| `.../loops` | **the optimization suite** (`Environment`, `defineStrategy`, `runBenchmark`, `runStrategyEvolution`, `authorStrategy`, `promotionGate`) + the `runLoop` kernel, `createDriver`, `loopDispatch` |
|
|
206
261
|
| `.../profiles` | `coderProfile`, `researcherProfile` presets |
|
|
207
262
|
| `.../mcp` | `createMcpServer`, `createDefaultCoderDelegate`, `createKbGate`, the `agent-runtime-mcp` bin |
|
|
208
263
|
| `.../improvement` | `improvementDriver` (code/worktree `CandidateGenerator`), `agenticGenerator`, `reflectiveGenerator` — the code-surface driver you pass to agent-eval's `selfImprove` |
|
|
209
264
|
| `.../analyst-loop` | `runAnalystLoop`, the analyst registry driver |
|
|
210
265
|
| `.../platform` | cross-site SSO and the integrations hub |
|
|
266
|
+
| `.../runtime` | the recursive core by its own name (same module as `/loops`) |
|
|
267
|
+
| `.../topology` | the live agent-tree viewer (folds spawn/settle events into a renderable tree) |
|
|
268
|
+
| `.../workflow` · `.../audit` | workflow orchestration helpers · audit utilities |
|
|
211
269
|
|
|
212
270
|
Bins: `agent-runtime-mcp` (delegation MCP server), `agent-runtime-loop` (schedulable delegated loop-runner).
|
|
213
271
|
|
|
214
|
-
##
|
|
272
|
+
## Teaching an agent to build on this
|
|
215
273
|
|
|
216
|
-
|
|
274
|
+
Two agent-consumable skills live in the [`loops`](https://github.com/drewstone/loops) repo:
|
|
275
|
+
**`skills/loop-builder`** (domain → `Environment` → loop → gate → operator surface, with the
|
|
276
|
+
measured foot-gun list) and **`skills/loop-author`** (authoring a strategy body from losses;
|
|
277
|
+
read the contract with `loops contract`). The runnable on-ramp is [`examples/`](./examples/README.md)
|
|
278
|
+
— a learning progression from the production chat turn through the strategy suite to the recursive
|
|
279
|
+
supervisor. For the broader pipeline (trace sink, analyst loop, scorecard, CI), see the
|
|
280
|
+
`agent-eval-adoption` and `agent-stack-adoption` skills.
|
|
217
281
|
|
|
218
282
|
## Stability, tests, docs
|
|
219
283
|
|
package/dist/agent.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import * as _tangle_network_agent_eval from '@tangle-network/agent-eval';
|
|
2
2
|
import { TraceAnalystKindSpec, AnalystFinding } from '@tangle-network/agent-eval';
|
|
3
|
-
import { R as RuntimeStreamEvent, S as SandboxClient, O as OutputAdapter, A as AgentRunSpec } from './types-
|
|
3
|
+
import { R as RuntimeStreamEvent, S as SandboxClient, O as OutputAdapter, A as AgentRunSpec } from './types-BEQsBhOE.js';
|
|
4
4
|
import { A as AgentSurfaces } from './improvement-adapter-BC4HhuAR.js';
|
|
5
5
|
export { C as CreateSurfaceImprovementAdapterOpts, D as DraftPatchInput, a as DraftPatchOutput, R as ResolvedSurface, S as SurfaceImprovementEdit, b as SurfaceValidationIssue, c as createSurfaceImprovementAdapter, r as renderSurfaceIssues, d as resolveSubjectPath, v as validateSurfaces } from './improvement-adapter-BC4HhuAR.js';
|
|
6
6
|
import { K as KnowledgeAdapter, a as RunAnalystLoopResult } from './types-p8dWBIXL.js';
|
package/dist/agent.js
CHANGED
package/dist/analyst-loop.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { TraceAnalystByteBudgets, TraceAnalysisStore } from '@tangle-network/agent-eval';
|
|
2
|
-
import { I as Iteration } from './types-
|
|
2
|
+
import { I as Iteration } from './types-BEQsBhOE.js';
|
|
3
3
|
import { R as RunAnalystLoopOpts, a as RunAnalystLoopResult } from './types-p8dWBIXL.js';
|
|
4
4
|
export { A as AnalystLoopEvent, b as AnalystRegistryLike, c as AnalystRegistryStreamingLike, d as AutoApplyPolicy, F as FindingsStoreLike, I as ImprovementAdapter, e as ImprovementEditBatch, f as ImprovementReport, K as KnowledgeAdapter, g as KnowledgeProposalBatch, h as KnowledgeReport } from './types-p8dWBIXL.js';
|
|
5
5
|
import '@tangle-network/sandbox';
|
|
@@ -3,14 +3,14 @@ import {
|
|
|
3
3
|
} from "./chunk-FNMGYYSS.js";
|
|
4
4
|
import {
|
|
5
5
|
createDefaultCoderDelegate
|
|
6
|
-
} from "./chunk-
|
|
6
|
+
} from "./chunk-OM3YNZIW.js";
|
|
7
7
|
import {
|
|
8
8
|
runAnalystLoop
|
|
9
9
|
} from "./chunk-HNUXAZIJ.js";
|
|
10
10
|
import {
|
|
11
11
|
createDriver,
|
|
12
12
|
runLoop
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-CM2IK7VS.js";
|
|
14
14
|
import {
|
|
15
15
|
ConfigError
|
|
16
16
|
} from "./chunk-GSUO5QS6.js";
|
|
@@ -200,4 +200,4 @@ export {
|
|
|
200
200
|
runLoopRunnerCli,
|
|
201
201
|
parseLoopRunnerArgv
|
|
202
202
|
};
|
|
203
|
-
//# sourceMappingURL=chunk-
|
|
203
|
+
//# sourceMappingURL=chunk-BKAIVNFA.js.map
|