npm - @tangle-network/agent-runtime - Versions diffs - 0.48.0 → 0.50.0 - Mend

@tangle-network/agent-runtime 0.48.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51)

package/README.md +79 -15
package/dist/agent.d.ts +1 -1
package/dist/agent.js +1 -1
package/dist/analyst-loop.d.ts +1 -1
package/dist/{chunk-656G2XCL.js → chunk-BKAIVNFA.js} +3 -3
package/dist/{chunk-IW2LMLK6.js → chunk-CM2IK7VS.js} +913 -152
package/dist/chunk-CM2IK7VS.js.map +1 -0
package/dist/{chunk-VR4JIC5H.js → chunk-ML4IXGTV.js} +2 -2
package/dist/{chunk-TJS7S3HJ.js → chunk-NDM5VXZW.js} +19 -8
package/dist/chunk-NDM5VXZW.js.map +1 -0
package/dist/chunk-OM3YNZIW.js +978 -0
package/dist/chunk-OM3YNZIW.js.map +1 -0
package/dist/{chunk-JNPK46YH.js → chunk-RHW75JW5.js} +498 -350
package/dist/chunk-RHW75JW5.js.map +1 -0
package/dist/{coder-CVZNGbyg.d.ts → coder-_YCf3BAK.d.ts} +2 -2
package/dist/{driver-DYU2sgHr.d.ts → driver-DLI1io57.d.ts} +1 -1
package/dist/index.d.ts +34 -9
package/dist/index.js +117 -27
package/dist/index.js.map +1 -1
package/dist/kb-gate-CHAyt4aI.d.ts +1571 -0
package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-DFUNgpeK.d.ts} +4 -4
package/dist/loop-runner-bin.d.ts +5 -5
package/dist/loop-runner-bin.js +3 -3
package/dist/loops.d.ts +6 -6
package/dist/loops.js +17 -1
package/dist/mcp/bin.js +206 -29
package/dist/mcp/bin.js.map +1 -1
package/dist/mcp/index.d.ts +41 -177
package/dist/mcp/index.js +40 -6
package/dist/mcp/index.js.map +1 -1
package/dist/openai-tools-D4HLDWgw.d.ts +45 -0
package/dist/platform.js +2 -2
package/dist/platform.js.map +1 -1
package/dist/profiles.d.ts +2 -2
package/dist/{run-loop-DvD4aGiE.d.ts → run-loop-BIineL1T.d.ts} +1 -1
package/dist/runtime.d.ts +403 -24
package/dist/runtime.js +17 -1
package/dist/{types-BpDfCPUp.d.ts → types-5MGt5KTY.d.ts} +1 -1
package/dist/{types-nBMuollC.d.ts → types-BEQsBhOE.d.ts} +1 -1
package/dist/workflow.d.ts +2 -2
package/dist/workflow.js +1 -1
package/package.json +6 -5
package/dist/chunk-IW2LMLK6.js.map +0 -1
package/dist/chunk-JNPK46YH.js.map +0 -1
package/dist/chunk-LX66I3SC.js +0 -218
package/dist/chunk-LX66I3SC.js.map +0 -1
package/dist/chunk-TJS7S3HJ.js.map +0 -1
package/dist/kb-gate-51BlLlVM.d.ts +0 -529
package/dist/otel-export-EzfsVUhh.d.ts +0 -191
package/dist/{chunk-656G2XCL.js.map → chunk-BKAIVNFA.js.map} +0 -0
package/dist/{chunk-VR4JIC5H.js.map → chunk-ML4IXGTV.js.map} +0 -0

package/README.md CHANGED Viewed

@@ -2,7 +2,10 @@
 The shared task-lifecycle skeleton for agents. It runs an agent (a chat turn, a one-shot task, or a multi-attempt loop), captures every run as a trace, and feeds those traces into eval-gated self-improvement.
-It owns the lifecycle and the loop kernel. It delegates domain behavior (models, tools, knowledge) to adapters, scoring and the ship gate to [`@tangle-network/agent-eval`](https://www.npmjs.com/package/@tangle-network/agent-eval), and sandboxed long-running execution to [`@tangle-network/sandbox`](https://www.npmjs.com/package/@tangle-network/sandbox).
+It owns the lifecycle, the loop kernel, and the **optimization suite** — `Environment` + `Strategy` +
+`runBenchmark` + `runStrategyEvolution`, the published surface for measuring and evolving how an agent
+spends compute against a deployable check. It delegates domain behavior (models, tools, knowledge) to
+adapters, scoring statistics and the ship gate to [`@tangle-network/agent-eval`](https://www.npmjs.com/package/@tangle-network/agent-eval), and sandboxed long-running execution to [`@tangle-network/sandbox`](https://www.npmjs.com/package/@tangle-network/sandbox).
 ```bash
 pnpm add @tangle-network/agent-runtime @tangle-network/agent-eval @tangle-network/sandbox
@@ -53,8 +56,9 @@ That is the common case. Everything below is for when one chat turn is not enoug
 | Run a production chat turn (most products) | `handleChatTurn` | root |
 | Declare an agent (profile, surfaces, adapters) | `defineAgent` | `/agent` |
 | Run a one-shot task with verification and eval | `runAgentTask` | root |
-| Run a multi-attempt loop (refine or fanout-vote) | `runLoop` plus a driver | `/loops` |
-| Let the agent choose the loop shape per round | `createDriver` plus `createSandboxPlanner` | `/loops` |
+| Compare optimization strategies on YOUR domain (5 hooks) | `runBenchmark` + `defineStrategy` | `/loops` |
+| Let the system author + evolve its own strategies, gated | `runStrategyEvolution` · `authorStrategy` · `promotionGate` | `/loops` |
+| Run a multi-attempt loop with a custom driver | `runLoop` + `createDriver` | `/loops` |
 | Delegate a disciplined loop by mode (code, research, ...) | `runDelegatedLoop` or `agent-runtime-loop` | root |
 | Build code reliably (reviewed, gated) | `createDefaultCoderDelegate` | `/mcp` |
 | Grow a knowledge base with only grounded facts | `createKbGate` | `/mcp` |
@@ -64,15 +68,50 @@ That is the common case. Everything below is for when one chat turn is not enoug
 | Mutate surfaces from trace findings | `runAnalystLoop` | `/analyst-loop` |
 | Persist a run plus its cost ledger | `startRuntimeRun` | root |
+## The optimization suite
+The canonical surface. A domain is an `Environment` (five hooks: `open`/`tools`/`call`/`score`/`close`);
+a **strategy** is how a compute budget is spent to beat the domain's own deployable check. Two
+built-ins (`sample` = best-of-N, `refine` = critique-and-continue) plus `defineStrategy` to compose
+your own from two steps — and `authorStrategy`, where the system writes new strategies from its own
+per-task losses:
+```ts
+import { defineStrategy, runBenchmark, sample, refine } from '@tangle-network/agent-runtime/loops'
+const doubleCheck = defineStrategy('double-check', async ({ shot, critique }) => {
+  const first = await shot()
+  const steer = first ? await critique(first.messages) : null
+  const second = steer ? await shot({ messages: first?.messages, steer }) : null
+  const score = Math.max(first?.score ?? 0, second?.score ?? 0)
+  return { score, resolved: score >= 1, completions: 2, progression: [first?.score ?? 0, score], shots: 2 }
+})
+const report = await runBenchmark({ environment, tasks, worker, strategies: [sample, refine, doubleCheck], budget: 3 })
+report.perTask // the losses table an author/optimizer consumes
+report.pareto  // the (score, $) frontier
+```
+The measurement invariants are structural, not advisory: every strategy spends through a conserved
+budget pool (equal compute by construction), the deliverable score is **harness-verified** from the
+shots actually brokered (a body cannot fabricate a win), and the critic is firewalled from the check
+(selector ≠ judge). `runStrategyEvolution` runs the multi-generation search — populations of authored
+candidates, cost-aware champion selection, a phase ledger with resume, and ONE promotion decision via
+`promotionGate` (seeded paired bootstrap) on a holdout slice the search never touched.
+`createVerifierEnvironment` adapts answer-shaped domains (one `check` function); `createMcpEnvironment`
+adapts any MCP server. The consumer surface — loops as a service with a CLI, detached runner, and MCP
+server — lives in the [`loops`](https://github.com/drewstone/loops) repo; the experiment harness and
+evidence ledger live in [`bench/HARNESS.md`](./bench/HARNESS.md).
 ## The loop kernel
 `runLoop` is a topology-agnostic kernel. Each iteration spawns a sandbox on an `AgentRunSpec`, decodes the output, validates it, and asks a driver what to do next. The driver owns topology. The validator owns scoring. The kernel owns iteration accounting, concurrency, cost and token aggregation, and trace emission.
 ```ts
-import { runLoop, createFanoutVoteDriver } from '@tangle-network/agent-runtime/loops'
+import { runLoop, createDriver } from '@tangle-network/agent-runtime/loops'
 const result = await runLoop({
-  driver: createFanoutVoteDriver({ n: 3 }),    // 3 parallel attempts, pick the best valid one
+  driver: createDriver({ planner }),           // the planner emits one TopologyMove per round
   agentRuns: [claudeSpec, codexSpec, glmSpec], // heterogeneous: one harness per branch
   output,                                       // events to typed Output
   validator,                                    // Output to { valid, score }
@@ -82,9 +121,13 @@ const result = await runLoop({
 result.winner // highest-scoring valid attempt
 ```
-Shipped drivers (`/loops/drivers`): `createRefineDriver` (single task, iterate until valid), `createFanoutVoteDriver` (N parallel, vote), and `createDriver` (the agent authors the topology at runtime). The dynamic driver emits one `TopologyMove` per round (`refine`, `fanout`, or `stop`) from an injected planner; a malformed move throws `PlannerError`, so the loop never runs a topology nobody chose. Topology is orthogonal to harness: the planner never names a backend, and the kernel's `agentRuns` decide which harness runs each branch.
-`runProgram` (also in `/loops`) is the recursive op-set (`sample`, `steer`, `fork`, `parallel`, `select`, `seq`, `stop`) plus a tree executor, for programs that compose sub-loops.
+`createDriver` lets a planner author the topology at runtime: one `TopologyMove` per round
+(`refine`, `fanout`, `select`, or `stop`); a malformed move throws `PlannerError`, so the loop never
+runs a topology nobody chose. Topology is orthogonal to harness: the planner never names a backend,
+and the kernel's `agentRuns` decide which harness runs each branch. For fixed shapes, write a small
+inline `Driver` (see `examples/coder-loop`) or use the `personify` combinators (`fanout`, `loopUntil`,
+`panel`, `pipeline`) over the recursive `Scope`/`Supervisor` core — the newer canonical path for
+recursive work.
 ## Self-improvement
@@ -106,7 +149,12 @@ const result = await selfImprove({
 // result.winner.surface is the safe one — the baseline unless gateDecision === 'ship'
 ```
-agent-runtime contributes the runtime-specific piece: the **CODE-surface `improvementDriver`** (`/improvement`) — a git-worktree mutator you pass to `selfImprove` as `driver` to optimize code instead of a string.
+agent-runtime contributes the runtime-specific pieces: the **CODE-surface `improvementDriver`**
+(`/improvement`) — a git-worktree mutator you pass to `selfImprove` as `driver` to optimize code
+instead of a string — and **`runStrategyEvolution`** (`/loops`), the multi-generation search over
+STRATEGY space: the system reads its own per-task losses, authors candidate strategies as code,
+plays them against the incumbent at equal budget, and a seeded statistical gate decides promotion
+on a never-touched holdout slice.
 `runAnalystLoop` (`/analyst-loop`) mines real run traces into findings; `createAnalystDriverHook` feeds those findings to a dynamic-driver planner via `PlannerContext.analyses`, with a firewall (`assertTraceDerivedFindings`) that rejects any finding derived from a judge verdict. Production intake — turning real run traces into the corpus `selfImprove` optimizes against — is agent-eval's `analyzeRuns` / `partitionRunsByAuthoringModel` (`/contract`).
@@ -156,9 +204,15 @@ const server = createMcpServer({ coderDelegate: createDefaultCoderDelegate({ san
 Or mount the `agent-runtime-mcp` stdio bin on a production `AgentProfile.mcp`.
+Delegation state is in-memory by default — a server restart drops pending delegations and history. Set `AGENT_RUNTIME_DELEGATION_STATE_FILE=/path/state.json` on the bin (or construct via `DelegationTaskQueue.restore({ store: new FileDelegationStore({ filePath }) })`) to persist records across restarts: `delegation_status`/`delegation_history` keep answering for prior runs, idempotency keys dedupe resubmissions, and in-flight records either resume through the `resumeDelegate` seam (when submitted with a `detachedSessionRef`) or settle as failed with an explicit driver-restart error. A corrupt state file refuses to load (`DelegationStateCorruptError`); `AGENT_RUNTIME_DELEGATION_STATE_RECOVER=1` archives it and starts empty. `AGENT_RUNTIME_DELEGATION_RETAIN_TERMINAL=<n>` caps retained terminal records.
 ## The experiment harness (bench/)
-`bench/` is the internal harness that asks the binding empirical question: does any non-blind topology beat blind compute at equal k, under a deployable (non-oracle) selector, on a real benchmark? It runs through the same kernel, not a reimplementation.
+`bench/` is the internal harness; [`bench/HARNESS.md`](./bench/HARNESS.md) is its map — read that
+first. The canonical path is the optimization suite (`runBenchmark`/`flywheel-evolve` over real
+domains: the EnterpriseOps gym, commit0, answer-shaped math); the older selection-gate paths
+(`runExperiment`, corpus-replay) remain for the legacy evidence. The live evidence ledger is
+`.evolve/current.json` — results never live in this README.
 One entrypoint, `runExperiment(adapter, { sandboxClient, agentRun, arms, ... })`: N instances times a set of arms, each arm a topology driven through `runLoop`, judged by the adapter, written to a durable canonical corpus. An arm is one steer function `f(rootPrompt, history) => nextPrompt`: `random` ignores history (the compute control), `refine` carries the prior answer plus a directive, `diverse` rotates a strategy lens. The cost dial is the backend type (`hermes` for a direct router call, `opencode` or `claude-code` or `codex` for agent CLIs). The deep statistics (paired bootstrap with Benjamini-Hochberg correction, selector replay) come from `corpus-report.mts` and `corpus-replay.mts` over the written corpus, computed once. See `bench/HARNESS.md` and `docs/learning-flywheel.md`.
@@ -170,8 +224,9 @@ One entrypoint, `runExperiment(adapter, { sandboxClient, agentRun, arms, ... })`
 | Backend provider | `openai-compat` when `TANGLE_API_KEY`, else `openai` if `OPENAI_API_KEY` | `MODEL_PROVIDER` env |
 | Router base URL | `https://router.tangle.tools/v1` | `TANGLE_ROUTER_BASE_URL` env |
 | Sandbox base URL | `https://sandbox.tangle.tools` | `SANDBOX_API_URL` env |
-| Loop iteration cap | 10 (`runLoop`), 8 (dynamic driver) | `runLoop({ maxIterations })` |
-| Driver | none, required by `runLoop` | `createRefineDriver`, `createFanoutVoteDriver`, `createDriver` |
+| Loop iteration cap | 10 (`runLoop`) | `runLoop({ maxIterations })` |
+| Driver | none, required by `runLoop` | `createDriver` or an inline `Driver` |
+| Strategy budget (suite) | 3 rollouts/shots per strategy per task | `runBenchmark({ budget })` |
 | Winner selection (coder delegate) | `highest-score` | `winnerSelection` option |
 | KB gate min passage | 12 chars | `createKbGate({ minPassageChars })` |
 | `selfImprove` gate | held-out gate (default) | pass `gate: defaultProductionGate` for red-team hardening |
@@ -202,18 +257,27 @@ sandbox         AgentProfile, Sandbox.create, streamPrompt, exportTraceBundle. T
 |---|---|
 | `@tangle-network/agent-runtime` | chat turns, delegated loop-runner, OTEL export, errors, model resolution |
 | `.../agent` | `defineAgent` plus surface and outcome adapters |
-| `.../loops` | the `runLoop` kernel, the `refine` / `fanout-vote` / `dynamic` drivers, `runProgram`, `loopDispatch` |
+| `.../loops` | **the optimization suite** (`Environment`, `defineStrategy`, `runBenchmark`, `runStrategyEvolution`, `authorStrategy`, `promotionGate`) + the `runLoop` kernel, `createDriver`, `loopDispatch` |
 | `.../profiles` | `coderProfile`, `researcherProfile` presets |
 | `.../mcp` | `createMcpServer`, `createDefaultCoderDelegate`, `createKbGate`, the `agent-runtime-mcp` bin |
 | `.../improvement` | `improvementDriver` (code/worktree `CandidateGenerator`), `agenticGenerator`, `reflectiveGenerator` — the code-surface driver you pass to agent-eval's `selfImprove` |
 | `.../analyst-loop` | `runAnalystLoop`, the analyst registry driver |
 | `.../platform` | cross-site SSO and the integrations hub |
+| `.../runtime` | the recursive core by its own name (same module as `/loops`) |
+| `.../topology` | the live agent-tree viewer (folds spawn/settle events into a renderable tree) |
+| `.../workflow` · `.../audit` | workflow orchestration helpers · audit utilities |
 Bins: `agent-runtime-mcp` (delegation MCP server), `agent-runtime-loop` (schedulable delegated loop-runner).
-## Adoption skill
+## Teaching an agent to build on this
-This package ships a self-contained adoption skill at [`skills/agent-runtime-adoption/SKILL.md`](./skills/agent-runtime-adoption/SKILL.md): driven loops, topology drivers, the `loopDispatch` campaign bridge, MCP delegation, and the code-surface `improvementDriver` for agent-eval's `selfImprove`. It needs only this package plus `@tangle-network/agent-eval`. For the full self-improving pipeline (trace sink, analyst loop, scorecard, production loop, CI), see the `agent-eval-adoption` and `agent-stack-adoption` skills.
+Two agent-consumable skills live in the [`loops`](https://github.com/drewstone/loops) repo:
+**`skills/loop-builder`** (domain → `Environment` → loop → gate → operator surface, with the
+measured foot-gun list) and **`skills/loop-author`** (authoring a strategy body from losses;
+read the contract with `loops contract`). The runnable on-ramp is [`examples/`](./examples/README.md)
+— a learning progression from the production chat turn through the strategy suite to the recursive
+supervisor. For the broader pipeline (trace sink, analyst loop, scorecard, CI), see the
+`agent-eval-adoption` and `agent-stack-adoption` skills.
 ## Stability, tests, docs

package/dist/agent.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import * as _tangle_network_agent_eval from '@tangle-network/agent-eval';
 import { TraceAnalystKindSpec, AnalystFinding } from '@tangle-network/agent-eval';
-import { R as RuntimeStreamEvent, S as SandboxClient, O as OutputAdapter, A as AgentRunSpec } from './types-nBMuollC.js';
+import { R as RuntimeStreamEvent, S as SandboxClient, O as OutputAdapter, A as AgentRunSpec } from './types-BEQsBhOE.js';
 import { A as AgentSurfaces } from './improvement-adapter-BC4HhuAR.js';
 export { C as CreateSurfaceImprovementAdapterOpts, D as DraftPatchInput, a as DraftPatchOutput, R as ResolvedSurface, S as SurfaceImprovementEdit, b as SurfaceValidationIssue, c as createSurfaceImprovementAdapter, r as renderSurfaceIssues, d as resolveSubjectPath, v as validateSurfaces } from './improvement-adapter-BC4HhuAR.js';
 import { K as KnowledgeAdapter, a as RunAnalystLoopResult } from './types-p8dWBIXL.js';

package/dist/agent.js CHANGED Viewed

@@ -3,7 +3,7 @@ import {
 } from "./chunk-7JITYN6T.js";
 import {
   createSandboxForSpec
-} from "./chunk-IW2LMLK6.js";
+} from "./chunk-CM2IK7VS.js";
 import {
   mapSandboxEvent
 } from "./chunk-GSUO5QS6.js";

package/dist/analyst-loop.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { TraceAnalystByteBudgets, TraceAnalysisStore } from '@tangle-network/agent-eval';
-import { I as Iteration } from './types-nBMuollC.js';
+import { I as Iteration } from './types-BEQsBhOE.js';
 import { R as RunAnalystLoopOpts, a as RunAnalystLoopResult } from './types-p8dWBIXL.js';
 export { A as AnalystLoopEvent, b as AnalystRegistryLike, c as AnalystRegistryStreamingLike, d as AutoApplyPolicy, F as FindingsStoreLike, I as ImprovementAdapter, e as ImprovementEditBatch, f as ImprovementReport, K as KnowledgeAdapter, g as KnowledgeProposalBatch, h as KnowledgeReport } from './types-p8dWBIXL.js';
 import '@tangle-network/sandbox';

package/dist/{chunk-656G2XCL.js → chunk-BKAIVNFA.js} RENAMED Viewed

@@ -3,14 +3,14 @@ import {
 } from "./chunk-FNMGYYSS.js";
 import {
   createDefaultCoderDelegate
-} from "./chunk-LX66I3SC.js";
+} from "./chunk-OM3YNZIW.js";
 import {
   runAnalystLoop
 } from "./chunk-HNUXAZIJ.js";
 import {
   createDriver,
   runLoop
-} from "./chunk-IW2LMLK6.js";
+} from "./chunk-CM2IK7VS.js";
 import {
   ConfigError
 } from "./chunk-GSUO5QS6.js";
@@ -200,4 +200,4 @@ export {
   runLoopRunnerCli,
   parseLoopRunnerArgv
 };
-//# sourceMappingURL=chunk-656G2XCL.js.map
+//# sourceMappingURL=chunk-BKAIVNFA.js.map