@tangle-network/agent-runtime 0.8.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -13
- package/dist/agent.d.ts +537 -0
- package/dist/agent.js +475 -0
- package/dist/agent.js.map +1 -0
- package/dist/analyst-loop.d.ts +26 -0
- package/dist/analyst-loop.js +262 -0
- package/dist/analyst-loop.js.map +1 -0
- package/dist/chunk-DGUM43GV.js +11 -0
- package/dist/chunk-DGUM43GV.js.map +1 -0
- package/dist/index.d.ts +235 -35
- package/dist/index.js +284 -3
- package/dist/index.js.map +1 -1
- package/dist/platform.d.ts +197 -0
- package/dist/platform.js +187 -0
- package/dist/platform.js.map +1 -0
- package/dist/types-D_MXrmJP.d.ts +245 -0
- package/package.json +39 -14
- package/docs/domain-agent-runtime-integration-issues.md +0 -165
- package/docs/product-runtime-kernel.md +0 -326
package/README.md
CHANGED
|
@@ -15,8 +15,8 @@ pnpm add @tangle-network/agent-runtime @tangle-network/agent-eval
|
|
|
15
15
|
|---|---|
|
|
16
16
|
| `runAgentTask` | Single-shot adapter-driven task with eval/verification |
|
|
17
17
|
| `runAgentTaskStream` | Streaming product loop with session resume + backends |
|
|
18
|
-
| `startRuntimeRun` | Canonical production-run row + cost ledger
|
|
19
|
-
| `createTraceBridge` | Map `RuntimeStreamEvent` → `agent-eval` `TraceEvent`
|
|
18
|
+
| `startRuntimeRun` | Canonical production-run row + cost ledger |
|
|
19
|
+
| `createTraceBridge` | Map `RuntimeStreamEvent` → `agent-eval` `TraceEvent` |
|
|
20
20
|
| `decideKnowledgeReadiness` | `ready` / `blocked` / `caveat` branch for routes / UI |
|
|
21
21
|
| `createOpenAICompatibleBackend` | OpenAI-compatible streaming backend (TCloud / cli-bridge) |
|
|
22
22
|
| `createSandboxPromptBackend` | Sandbox / sidecar `streamPrompt` clients |
|
|
@@ -50,13 +50,11 @@ const result = await runAgentTask({
|
|
|
50
50
|
console.log(result.status, result.runRecords)
|
|
51
51
|
```
|
|
52
52
|
|
|
53
|
-
## Canonical production-run lifecycle
|
|
53
|
+
## Canonical production-run lifecycle
|
|
54
54
|
|
|
55
|
-
`startRuntimeRun`
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
`completeProductionAgentRun` + `persistRuntimeRun` pair is the canonical
|
|
59
|
-
example of what this subsumes).
|
|
55
|
+
`startRuntimeRun` records what the agent did on behalf of a customer,
|
|
56
|
+
what it cost, and how it ended. Replaces bespoke `agentRuns`-row helpers
|
|
57
|
+
across consumer repos with a single contract.
|
|
60
58
|
|
|
61
59
|
```ts
|
|
62
60
|
import { startRuntimeRun, runAgentTaskStream } from '@tangle-network/agent-runtime'
|
|
@@ -87,11 +85,11 @@ console.log(run.cost()) // { tokensIn, tokensOut, costUsd, wallMs, llmCalls }
|
|
|
87
85
|
|
|
88
86
|
Full runnable: [`examples/runtime-run/`](./examples/runtime-run/).
|
|
89
87
|
|
|
90
|
-
## agent-eval trace bridge
|
|
88
|
+
## agent-eval trace bridge
|
|
91
89
|
|
|
92
|
-
If you persist traces in agent-eval's `TraceStore`,
|
|
93
|
-
events to `TraceEvent`
|
|
94
|
-
|
|
90
|
+
If you persist traces in agent-eval's `TraceStore`, the bridge maps
|
|
91
|
+
runtime stream events to `TraceEvent` so consumer repos don't hand-roll
|
|
92
|
+
the adapter:
|
|
95
93
|
|
|
96
94
|
```ts
|
|
97
95
|
import { createTraceBridge } from '@tangle-network/agent-runtime'
|
|
@@ -146,10 +144,41 @@ details. Private diagnostics opt-in via `RuntimeTelemetryOptions`.
|
|
|
146
144
|
| Package | Owns |
|
|
147
145
|
|---|---|
|
|
148
146
|
| `agent-runtime` | Lifecycle, adapters, backends, `RuntimeRunHandle`, trace bridge |
|
|
147
|
+
| `agent-runtime/platform` | Server-side clients for the Tangle platform: cross-site SSO (`PlatformAuthClient`) and integrations hub (`PlatformHubClient`) |
|
|
149
148
|
| `agent-eval` | Control loops, readiness scoring, traces, evals, failure classes, release evidence |
|
|
150
149
|
| `agent-knowledge` | Evidence, claims, wiki pages, retrieval, knowledge bundle builders |
|
|
151
150
|
| Domain packages | Domain tools, policies, credentials, UI text, rubrics |
|
|
152
151
|
|
|
152
|
+
### `agent-runtime/platform` — Login with Tangle + integrations hub
|
|
153
|
+
|
|
154
|
+
```ts
|
|
155
|
+
import {
|
|
156
|
+
PlatformAuthClient,
|
|
157
|
+
PlatformHubClient,
|
|
158
|
+
} from '@tangle-network/agent-runtime/platform'
|
|
159
|
+
|
|
160
|
+
// Login with Tangle (cross-site SSO bridge).
|
|
161
|
+
const auth = new PlatformAuthClient({
|
|
162
|
+
baseUrl: process.env.TANGLE_PLATFORM_URL!, // https://id.tangle.tools
|
|
163
|
+
appId: 'gtm-agent', // must be registered in TRUSTED_APPS
|
|
164
|
+
})
|
|
165
|
+
const url = auth.authorizeUrl({ state: csrfToken, redirectUri: callbackUrl })
|
|
166
|
+
// …user redirected to `url`, returns to callbackUrl with ?code=…
|
|
167
|
+
const { apiKey, user } = await auth.exchange(code)
|
|
168
|
+
|
|
169
|
+
// Integrations hub (uses the user's apiKey from cross-site exchange).
|
|
170
|
+
const hub = new PlatformHubClient({
|
|
171
|
+
baseUrl: process.env.TANGLE_PLATFORM_URL!,
|
|
172
|
+
bearer: apiKey,
|
|
173
|
+
})
|
|
174
|
+
const connections = await hub.listConnections()
|
|
175
|
+
const { authorizationUrl } = await hub.startAuth({
|
|
176
|
+
providerId: 'google',
|
|
177
|
+
connectorId: 'gmail',
|
|
178
|
+
returnUrl: 'https://gtm.tangle.tools/integrations',
|
|
179
|
+
})
|
|
180
|
+
```
|
|
181
|
+
|
|
153
182
|
The API uses `runAgentTask`, not `runVerticalAgentTask`. `domain` is
|
|
154
183
|
metadata on the task because the runtime is reusable across many kinds of
|
|
155
184
|
agents without baking taxonomy into type names.
|
|
@@ -165,4 +194,4 @@ Runnable in [`examples/`](./examples/):
|
|
|
165
194
|
- [`sse-stream/`](./examples/sse-stream/) — Server-Sent Events for browser clients
|
|
166
195
|
- [`sandbox-stream-backend/`](./examples/sandbox-stream-backend/) — `createSandboxPromptBackend`
|
|
167
196
|
- [`openai-stream-backend/`](./examples/openai-stream-backend/) — `createOpenAICompatibleBackend`
|
|
168
|
-
- [`runtime-run/`](./examples/runtime-run/) — `startRuntimeRun` + cost ledger + persistence adapter
|
|
197
|
+
- [`runtime-run/`](./examples/runtime-run/) — `startRuntimeRun` + cost ledger + persistence adapter
|
package/dist/agent.d.ts
ADDED
|
@@ -0,0 +1,537 @@
|
|
|
1
|
+
import * as _tangle_network_agent_eval from '@tangle-network/agent-eval';
|
|
2
|
+
import { FindingSubject, TraceAnalystKindSpec, AnalystFinding } from '@tangle-network/agent-eval';
|
|
3
|
+
import { I as ImprovementAdapter, K as KnowledgeAdapter, a as RunAnalystLoopResult } from './types-D_MXrmJP.js';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* `AgentSurfaces` — declarative map of the mutable file/directory paths
|
|
7
|
+
* the self-improvement loop can edit on behalf of an agent.
|
|
8
|
+
*
|
|
9
|
+
* The substrate uses this map to resolve every parsed `FindingSubject`
|
|
10
|
+
* (from agent-eval) to a real on-disk path. No per-vertical glue;
|
|
11
|
+
* no fabricated paths; no silent `existsSync(...)` skips that hide
|
|
12
|
+
* misconfiguration from the operator.
|
|
13
|
+
*
|
|
14
|
+
* Surfaces are validated at `defineAgent` time — missing paths fail
|
|
15
|
+
* loud with a list of every offender. A surface that's not needed
|
|
16
|
+
* (e.g. an agent with no RAG corpora) is simply omitted; the loop
|
|
17
|
+
* refuses to route those subjects rather than fabricating a target.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Surface declarations. Every path is repo-relative (or absolute) at
|
|
22
|
+
* `defineAgent` time. At resolution time, paths are joined against the
|
|
23
|
+
* agent's `repoRoot`.
|
|
24
|
+
*
|
|
25
|
+
* `systemPrompt`, `tools`, `personas` are DIRECTORIES; the loop appends
|
|
26
|
+
* `<section>.md`, `<tool>/README.md`, `<persona-id>.yaml` etc.
|
|
27
|
+
* `rubric`, `outputSchema` are SINGLE FILES; the loop edits them in
|
|
28
|
+
* place.
|
|
29
|
+
*
|
|
30
|
+
* `knowledge` is the agent-knowledge root (typically `.agent-knowledge`);
|
|
31
|
+
* `applyKnowledgeWriteBlocks` writes pages relative to it.
|
|
32
|
+
*
|
|
33
|
+
* Optional surfaces (`scaffolding`, `memory`, `rag`, `outputSchema`)
|
|
34
|
+
* can be omitted — the loop will reject findings targeting them with a
|
|
35
|
+
* clear log message instead of fabricating a path.
|
|
36
|
+
*/
|
|
37
|
+
interface AgentSurfaces {
|
|
38
|
+
/** Directory containing one markdown file per system-prompt section. */
|
|
39
|
+
systemPrompt: string;
|
|
40
|
+
/** Directory containing one subdir per tool (`<tool>/README.md`). */
|
|
41
|
+
tools: string;
|
|
42
|
+
/** Single file (TypeScript module) defining the rubric weights + dimensions. */
|
|
43
|
+
rubric: string;
|
|
44
|
+
/** Knowledge-base root; typically `.agent-knowledge`. */
|
|
45
|
+
knowledge: string;
|
|
46
|
+
/** Directory containing one YAML/JSON file per persona. */
|
|
47
|
+
personas: string;
|
|
48
|
+
/** Optional: directory containing scaffolding rules (precondition checks, retry policies). */
|
|
49
|
+
scaffolding?: string;
|
|
50
|
+
/** Optional: memory store path (JSONL / SQLite / DB). */
|
|
51
|
+
memory?: string;
|
|
52
|
+
/** Optional: directory containing RAG corpora (`<corpus>/<doc-id>.md`). */
|
|
53
|
+
rag?: string;
|
|
54
|
+
/** Optional: single file defining the output schema (Zod / JSON Schema). */
|
|
55
|
+
outputSchema?: string;
|
|
56
|
+
}
|
|
57
|
+
interface ResolvedSurface {
|
|
58
|
+
/** Absolute filesystem path the operator can `cat` / `vim`. */
|
|
59
|
+
absolutePath: string;
|
|
60
|
+
/** Repo-relative path for PR descriptions, diffs, audit logs. */
|
|
61
|
+
repoRelativePath: string;
|
|
62
|
+
/** Whether the path currently exists on disk. */
|
|
63
|
+
exists: boolean;
|
|
64
|
+
/** The substrate's intent: edit an existing file or create a new one. */
|
|
65
|
+
intent: 'edit-existing' | 'create-new';
|
|
66
|
+
}
|
|
67
|
+
/**
|
|
68
|
+
* Resolve a parsed `FindingSubject` to the file path the substrate
|
|
69
|
+
* should edit (or create) on disk.
|
|
70
|
+
*
|
|
71
|
+
* Returns `null` when:
|
|
72
|
+
* - the subject targets a surface the agent didn't declare
|
|
73
|
+
* (e.g. `rag:*` when `surfaces.rag` is undefined), OR
|
|
74
|
+
* - the subject is a `cluster` (failure-mode emits these as evidence,
|
|
75
|
+
* not actionable mutations — they don't route to a file).
|
|
76
|
+
*
|
|
77
|
+
* Returns a `ResolvedSurface` with `intent: 'create-new'` when the
|
|
78
|
+
* subject names a path that doesn't yet exist (e.g. a new wiki page).
|
|
79
|
+
* The caller chooses whether to honour the create — for tightly-managed
|
|
80
|
+
* surfaces like `systemPrompt` it's usually a contract violation
|
|
81
|
+
* (the analyst named a section that doesn't exist); for `knowledge`
|
|
82
|
+
* it's the whole point.
|
|
83
|
+
*/
|
|
84
|
+
declare function resolveSubjectPath(subject: FindingSubject, surfaces: AgentSurfaces, repoRoot: string): ResolvedSurface | null;
|
|
85
|
+
/**
|
|
86
|
+
* Validate that every declared surface exists on disk under `repoRoot`.
|
|
87
|
+
*
|
|
88
|
+
* Returns an array of `SurfaceValidationIssue` — empty when all required
|
|
89
|
+
* surfaces resolve. `defineAgent` throws with the issues rendered, so
|
|
90
|
+
* a misconfigured manifest fails at startup (not at the first finding
|
|
91
|
+
* the loop produces 20 minutes later).
|
|
92
|
+
*/
|
|
93
|
+
interface SurfaceValidationIssue {
|
|
94
|
+
surface: keyof AgentSurfaces;
|
|
95
|
+
path: string;
|
|
96
|
+
reason: 'missing' | 'not-directory' | 'not-file';
|
|
97
|
+
}
|
|
98
|
+
declare function validateSurfaces(surfaces: AgentSurfaces, repoRoot: string): ReadonlyArray<SurfaceValidationIssue>;
|
|
99
|
+
declare function renderSurfaceIssues(issues: ReadonlyArray<SurfaceValidationIssue>, repoRoot: string): string;
|
|
100
|
+
|
|
101
|
+
/**
|
|
102
|
+
* The full agent manifest. Each agent ships ONE of these.
|
|
103
|
+
*
|
|
104
|
+
* Generics:
|
|
105
|
+
* `TPersona` — the agent's persona shape (loaded from
|
|
106
|
+
* `surfaces.personas`). Defaults to `unknown` so the substrate's
|
|
107
|
+
* persona discovery (`loadPersonas`) can accept anything; per-agent
|
|
108
|
+
* code re-narrows when it matters.
|
|
109
|
+
* `TRunOutput` — the shape `runtime.act` returns. Used by the rubric
|
|
110
|
+
* scorers and emitted into the trace.
|
|
111
|
+
*/
|
|
112
|
+
interface AgentManifest<TPersona = unknown, TRunOutput = unknown> {
|
|
113
|
+
/**
|
|
114
|
+
* Stable identifier — used as `projectId` in traces, as the analyst
|
|
115
|
+
* loop's `runId` prefix, and as the namespace under which findings
|
|
116
|
+
* are persisted. MUST match the agent's repo name to keep
|
|
117
|
+
* cross-repo telemetry joinable.
|
|
118
|
+
*/
|
|
119
|
+
id: string;
|
|
120
|
+
/**
|
|
121
|
+
* Filesystem root the substrate resolves surface paths against.
|
|
122
|
+
* Typically `process.cwd()` or a fixed absolute path. Use an
|
|
123
|
+
* absolute path when the agent's tests may run from subdirectories
|
|
124
|
+
* (vitest sometimes shifts cwd).
|
|
125
|
+
*/
|
|
126
|
+
repoRoot: string;
|
|
127
|
+
/**
|
|
128
|
+
* Map of mutable surfaces the self-improvement loop can edit. See
|
|
129
|
+
* `AgentSurfaces` — required: `systemPrompt`, `tools`, `rubric`,
|
|
130
|
+
* `knowledge`, `personas`. Optional: `scaffolding`, `memory`, `rag`,
|
|
131
|
+
* `outputSchema`.
|
|
132
|
+
*
|
|
133
|
+
* Every required path is validated at `defineAgent` time. Missing
|
|
134
|
+
* paths throw with the full list of offenders.
|
|
135
|
+
*/
|
|
136
|
+
surfaces: AgentSurfaces;
|
|
137
|
+
/**
|
|
138
|
+
* Rubric the substrate uses to score each run. Dimensions × weights
|
|
139
|
+
* × judges. The substrate computes the weighted composite and
|
|
140
|
+
* stamps it into the RunRecord.
|
|
141
|
+
*/
|
|
142
|
+
rubric: AgentRubric<TRunOutput>;
|
|
143
|
+
/**
|
|
144
|
+
* Runtime adapter — how the substrate INVOKES the agent against a
|
|
145
|
+
* persona. The `act` function takes a persona + a context (with the
|
|
146
|
+
* tracer the substrate threads through for span emission) and
|
|
147
|
+
* returns the run output the rubric will score.
|
|
148
|
+
*
|
|
149
|
+
* The agent's existing production runtime goes in here; the
|
|
150
|
+
* substrate is intentionally thin around it.
|
|
151
|
+
*/
|
|
152
|
+
runtime: AgentRuntime<TPersona, TRunOutput>;
|
|
153
|
+
/**
|
|
154
|
+
* Persona discovery — the substrate loads personas via this function
|
|
155
|
+
* at eval start. Can read from `surfaces.personas`, an API, or be
|
|
156
|
+
* hardcoded. The substrate calls it once per `runAgentEval` call;
|
|
157
|
+
* persona ordering is preserved.
|
|
158
|
+
*/
|
|
159
|
+
personas: () => Promise<ReadonlyArray<TPersona>>;
|
|
160
|
+
/**
|
|
161
|
+
* Analyst kinds the substrate runs against each persona's trace.
|
|
162
|
+
* Typically `DEFAULT_TRACE_ANALYST_KINDS` from agent-eval (the field is required, so pass it explicitly). Per-agent
|
|
163
|
+
* authors can prune (e.g. skip `knowledge-poisoning` when there's no
|
|
164
|
+
* knowledge base) or extend (custom domain kinds).
|
|
165
|
+
*
|
|
166
|
+
* Empty array disables the loop — useful for `pnpm eval --no-analyst`.
|
|
167
|
+
*/
|
|
168
|
+
analystKinds: ReadonlyArray<TraceAnalystKindSpec>;
|
|
169
|
+
/**
|
|
170
|
+
* Analyst LLM configuration. The substrate uses these for all four
|
|
171
|
+
* kinds (override per-kind via `analystKinds` if needed).
|
|
172
|
+
*/
|
|
173
|
+
analyst: AnalystConfig;
|
|
174
|
+
/**
|
|
175
|
+
* Auto-apply policy. Knowledge / improvement edits land only when
|
|
176
|
+
* `enabled === true` AND the source finding's confidence meets the
|
|
177
|
+
* threshold. `mode` controls how applies happen: `'write'` mutates
|
|
178
|
+
* files in-place; `'open-pr'` writes to a branch and opens a PR.
|
|
179
|
+
*
|
|
180
|
+
* Default: knowledge auto-applies at confidence ≥0.85 in `'write'`
|
|
181
|
+
* mode (wiki edits are git-reversible); improvement stays at
|
|
182
|
+
* `enabled: false` until the agent author has measured precision.
|
|
183
|
+
*/
|
|
184
|
+
autoApply?: AutoApplyPolicy;
|
|
185
|
+
}
|
|
186
|
+
interface AgentRubric<TRunOutput> {
|
|
187
|
+
/** Dimensions composing the weighted score. Weights sum to 1.0 by convention. */
|
|
188
|
+
dimensions: ReadonlyArray<RubricDimension<TRunOutput>>;
|
|
189
|
+
/**
|
|
190
|
+
* Optional judges layered on top of deterministic dimensions. Each
|
|
191
|
+
* judge returns a score per dimension; the substrate averages judges
|
|
192
|
+
* (mean by default) for the LLM contribution.
|
|
193
|
+
*/
|
|
194
|
+
judges?: ReadonlyArray<JudgeConfig<TRunOutput>>;
|
|
195
|
+
}
|
|
196
|
+
interface RubricDimension<TRunOutput> {
|
|
197
|
+
/** Unique identifier — appears in finding subjects (`rubric:<id>`). */
|
|
198
|
+
id: string;
|
|
199
|
+
/** 0..1 — weight in the composite. */
|
|
200
|
+
weight: number;
|
|
201
|
+
/**
|
|
202
|
+
* Deterministic scorer: given the persona + run output, returns a
|
|
203
|
+
* 0..1 score. The substrate sums weight × score across dimensions
|
|
204
|
+
* for the deterministic composite; judges supplement subjective dims.
|
|
205
|
+
*/
|
|
206
|
+
score: (input: {
|
|
207
|
+
persona: unknown;
|
|
208
|
+
output: TRunOutput;
|
|
209
|
+
}) => number;
|
|
210
|
+
/** Optional human-readable label for reports. */
|
|
211
|
+
label?: string;
|
|
212
|
+
}
|
|
213
|
+
interface JudgeConfig<TRunOutput> {
|
|
214
|
+
/** Judge identifier — appears in trace spans + manifest. */
|
|
215
|
+
id: string;
|
|
216
|
+
/** Model snapshot to invoke. Pin the snapshot (`claude-sonnet-4-6@2025-04-15`); the validator rejects bare aliases. */
|
|
217
|
+
model: string;
|
|
218
|
+
/** Dimensions this judge scores. */
|
|
219
|
+
dimensions: ReadonlyArray<string>;
|
|
220
|
+
/**
|
|
221
|
+
* Optional rubric anchors — text examples the judge sees as a
|
|
222
|
+
* few-shot prompt to calibrate. STRONGLY recommended for subjective
|
|
223
|
+
* dimensions; required by the calibration gate (Pearson ≥0.7).
|
|
224
|
+
*/
|
|
225
|
+
anchors?: ReadonlyArray<{
|
|
226
|
+
input: string;
|
|
227
|
+
output: TRunOutput;
|
|
228
|
+
expected: Record<string, number>;
|
|
229
|
+
}>;
|
|
230
|
+
}
|
|
231
|
+
interface AgentRuntime<TPersona, TRunOutput> {
|
|
232
|
+
/**
|
|
233
|
+
* Invoke the agent against one persona. Returns the structured run
|
|
234
|
+
* output the rubric will score.
|
|
235
|
+
*
|
|
236
|
+
* `ctx.emitter` is the substrate-threaded `TraceEmitter` — agents
|
|
237
|
+
* SHOULD record their LLM calls / tool calls through it for capture
|
|
238
|
+
* integrity. `ctx.deadlineMs` is wall-clock; the runtime SHOULD
|
|
239
|
+
* honour it for graceful cancel.
|
|
240
|
+
*/
|
|
241
|
+
act: (persona: TPersona, ctx: AgentRunContext) => Promise<TRunOutput>;
|
|
242
|
+
}
|
|
243
|
+
interface AgentRunContext {
|
|
244
|
+
/** Substrate-managed trace emitter. */
|
|
245
|
+
emitter: _tangle_network_agent_eval.TraceEmitter;
|
|
246
|
+
/** Stable run id for this persona × variant cell. */
|
|
247
|
+
runId: string;
|
|
248
|
+
/** Variant the runtime is exercising (e.g. `'baseline'`, `'source-grounded'`). */
|
|
249
|
+
variantId?: string;
|
|
250
|
+
/** Wall-clock deadline (epoch ms). The runtime SHOULD honour for graceful cancel. */
|
|
251
|
+
deadlineMs?: number;
|
|
252
|
+
/** Optional abort signal. */
|
|
253
|
+
signal?: AbortSignal;
|
|
254
|
+
}
|
|
255
|
+
interface AnalystConfig {
|
|
256
|
+
/** Model the analyst kinds use. Override per-kind via `analystKinds[i].cost.models`. */
|
|
257
|
+
model: string;
|
|
258
|
+
/** Optional total budget across all kinds for one run. Substrate enforces via `BudgetGuard`. */
|
|
259
|
+
budgetUsd?: number;
|
|
260
|
+
/** Backend hint for the AxAIService factory — same shape every kind uses. */
|
|
261
|
+
backend?: {
|
|
262
|
+
name?: 'openai' | 'router';
|
|
263
|
+
apiKey?: string;
|
|
264
|
+
baseUrl?: string;
|
|
265
|
+
};
|
|
266
|
+
}
|
|
267
|
+
interface AutoApplyPolicy {
|
|
268
|
+
knowledge?: {
|
|
269
|
+
enabled: boolean;
|
|
270
|
+
confidenceThreshold?: number;
|
|
271
|
+
mode?: 'write' | 'open-pr';
|
|
272
|
+
};
|
|
273
|
+
improvement?: {
|
|
274
|
+
enabled: boolean;
|
|
275
|
+
confidenceThreshold?: number;
|
|
276
|
+
mode?: 'write' | 'open-pr';
|
|
277
|
+
};
|
|
278
|
+
}
|
|
279
|
+
declare class AgentManifestError extends Error {
|
|
280
|
+
readonly agentId: string;
|
|
281
|
+
readonly issues: ReadonlyArray<unknown>;
|
|
282
|
+
constructor(message: string, agentId: string, issues?: ReadonlyArray<unknown>);
|
|
283
|
+
}
|
|
284
|
+
/**
|
|
285
|
+
* Construct a validated agent manifest. Throws `AgentManifestError`
|
|
286
|
+
* if any required surface is missing on disk.
|
|
287
|
+
*
|
|
288
|
+
* Generics: pass your persona / output types if you want narrowed
|
|
289
|
+
* `runtime.act` signatures:
|
|
290
|
+
* `defineAgent<TaxPersona, TaxRunOutput>({ ... })`
|
|
291
|
+
*
|
|
292
|
+
* Most callers don't need the generics — the substrate operates on
|
|
293
|
+
* `unknown` payloads internally and the manifest's `score` /
|
|
294
|
+
* `runtime.act` see the typed shapes via TypeScript inference at
|
|
295
|
+
* the call site.
|
|
296
|
+
*/
|
|
297
|
+
declare function defineAgent<TPersona = unknown, TRunOutput = unknown>(manifest: AgentManifest<TPersona, TRunOutput>): AgentManifest<TPersona, TRunOutput>;
|
|
298
|
+
|
|
299
|
+
/**
|
|
300
|
+
* Substrate-default `ImprovementAdapter` — surfaces-driven, LLM-drafted
|
|
301
|
+
* patches, optional auto-apply or PR-open.
|
|
302
|
+
*
|
|
303
|
+
* This is the one ImprovementAdapter every vertical agent uses. The
|
|
304
|
+
* substrate parses each finding's `subject` via
|
|
305
|
+
* `parseFindingSubject` (agent-eval), resolves it to a real file path
|
|
306
|
+
* via the agent's `AgentSurfaces`, reads the current content, and asks
|
|
307
|
+
* an LLM to draft a unified-diff patch given the finding + current
|
|
308
|
+
* content + per-kind editing-discipline rules.
|
|
309
|
+
*
|
|
310
|
+
* Auto-apply gates on the source-finding's confidence and the
|
|
311
|
+
* autoApply.improvement policy. Two modes:
|
|
312
|
+
* `write` — apply the patch in-place via `git apply -p0`. Operator
|
|
313
|
+
* reviews via `git diff`.
|
|
314
|
+
* `open-pr` — write to a branch, commit, push, open a PR via `gh`.
|
|
315
|
+
* Operator reviews via the PR UI.
|
|
316
|
+
*
|
|
317
|
+
* Fail-loud rules:
|
|
318
|
+
* - Findings whose subject doesn't parse → counted in `errors`.
|
|
319
|
+
* - Findings whose subject targets an undeclared surface → counted in
|
|
320
|
+
* `errors` with the offending kind in the message.
|
|
321
|
+
* - Findings whose target path doesn't exist AND the kind isn't a
|
|
322
|
+
* create-new variant (`new-tool`, `knowledge.wiki`) → counted in
|
|
323
|
+
* `errors` with the resolved path in the message.
|
|
324
|
+
* - LLM drafts that fail JSON-schema validation → counted in
|
|
325
|
+
* `errors` with the schema issue.
|
|
326
|
+
*
|
|
327
|
+
* No silent skips. Every dropped finding has a recorded reason the
|
|
328
|
+
* loop's report surfaces.
|
|
329
|
+
*/
|
|
330
|
+
|
|
331
|
+
interface SurfaceImprovementEdit {
|
|
332
|
+
/** Stable id derived from the source finding so re-proposals are idempotent. */
|
|
333
|
+
id: string;
|
|
334
|
+
/** The finding that produced this edit — for revert + audit trail. */
|
|
335
|
+
sourceFindingId: string;
|
|
336
|
+
/** Parsed subject; included so the apply step doesn't re-parse. */
|
|
337
|
+
subject: FindingSubject;
|
|
338
|
+
/** Resolved on-disk target. */
|
|
339
|
+
target: ResolvedSurface;
|
|
340
|
+
/** SHA-256 of the current file content the patch was drafted against. */
|
|
341
|
+
baseSha256: string;
|
|
342
|
+
/** Unified-diff patch the LLM drafted (relative to `target.absolutePath`). */
|
|
343
|
+
patch: string;
|
|
344
|
+
/** One-line summary the operator sees in the report / PR title. */
|
|
345
|
+
summary: string;
|
|
346
|
+
/** Multi-line rationale for the PR body — finding context + LLM reasoning. */
|
|
347
|
+
rationale: string;
|
|
348
|
+
/** Carry-forward from the finding so the apply gate can check the threshold. */
|
|
349
|
+
confidence: number;
|
|
350
|
+
/** Carry-forward severity for prioritization. */
|
|
351
|
+
severity: AnalystFinding['severity'];
|
|
352
|
+
}
|
|
353
|
+
interface CreateSurfaceImprovementAdapterOpts {
|
|
354
|
+
surfaces: AgentSurfaces;
|
|
355
|
+
repoRoot: string;
|
|
356
|
+
/**
|
|
357
|
+
* LLM-draft callback. Given a finding + current file content + the
|
|
358
|
+
* resolved target, returns a unified-diff patch + summary + rationale.
|
|
359
|
+
*
|
|
360
|
+
* Required — the substrate doesn't ship a hardcoded prompt; the agent
|
|
361
|
+
* author picks the model (Haiku for cheap routine drafts, Sonnet for
|
|
362
|
+
* substantive prompt rewrites, etc.) via this callback.
|
|
363
|
+
*/
|
|
364
|
+
draftPatch: (input: DraftPatchInput) => Promise<DraftPatchOutput>;
|
|
365
|
+
/**
|
|
366
|
+
* Apply mode:
|
|
367
|
+
* `write` — `git apply` in-place; operator reviews via `git diff`
|
|
368
|
+
* `open-pr` — branch + commit + push + `gh pr create`
|
|
369
|
+
* `none` — never apply; collect proposals for the report only
|
|
370
|
+
*
|
|
371
|
+
* The `apply` method honours this even when the loop calls it; the
|
|
372
|
+
* effective behaviour is also gated on the per-finding confidence
|
|
373
|
+
* threshold via `runAnalystLoop`'s `autoApply` policy.
|
|
374
|
+
*/
|
|
375
|
+
mode?: 'write' | 'open-pr' | 'none';
|
|
376
|
+
/** When `mode === 'open-pr'`, the base branch new PRs target. Default: `main`. */
|
|
377
|
+
baseBranch?: string;
|
|
378
|
+
/** Required for `mode === 'open-pr'` — the GH owner/repo (`tangle-network/tax-agent`). */
|
|
379
|
+
ghRepo?: string;
|
|
380
|
+
/**
|
|
381
|
+
* When the resolved target doesn't exist, allow the substrate to
|
|
382
|
+
* CREATE the file (for `knowledge.wiki`, `new-tool` subjects). Default
|
|
383
|
+
* true for those kinds, false for `system-prompt` / `rubric` / etc.
|
|
384
|
+
* (named sections that don't exist are a contract violation, not a
|
|
385
|
+
* scaffolding opportunity).
|
|
386
|
+
*/
|
|
387
|
+
allowCreateForKinds?: ReadonlyArray<FindingSubject['kind']>;
|
|
388
|
+
}
|
|
389
|
+
interface DraftPatchInput {
|
|
390
|
+
finding: AnalystFinding;
|
|
391
|
+
subject: FindingSubject;
|
|
392
|
+
target: ResolvedSurface;
|
|
393
|
+
/** Current file content (empty string when `intent === 'create-new'`). */
|
|
394
|
+
currentContent: string;
|
|
395
|
+
}
|
|
396
|
+
interface DraftPatchOutput {
|
|
397
|
+
/** Unified diff against the current file content. Empty string skips this finding. */
|
|
398
|
+
patch: string;
|
|
399
|
+
/** One-line summary for the operator. */
|
|
400
|
+
summary: string;
|
|
401
|
+
/** Multi-line rationale for the PR body. */
|
|
402
|
+
rationale: string;
|
|
403
|
+
}
|
|
404
|
+
declare function createSurfaceImprovementAdapter(opts: CreateSurfaceImprovementAdapterOpts): ImprovementAdapter<SurfaceImprovementEdit>;
|
|
405
|
+
|
|
406
|
+
/**
|
|
407
|
+
* Substrate-default `KnowledgeAdapter` — wraps agent-knowledge's
|
|
408
|
+
* `proposeFromFindings` + `applyKnowledgeWriteBlocks` with substrate
|
|
409
|
+
* defaults (auto-lint after apply, source linkage via finding id).
|
|
410
|
+
*
|
|
411
|
+
* Every agent that ships a `.agent-knowledge/` tree uses this adapter
|
|
412
|
+
* unmodified. Per-agent customization happens at the manifest level
|
|
413
|
+
* (`autoApply.knowledge.confidenceThreshold`, etc.), not by writing a
|
|
414
|
+
* new adapter.
|
|
415
|
+
*
|
|
416
|
+
* Lint discipline: after each apply we run agent-knowledge's
|
|
417
|
+
* `lintKnowledgeIndex` to catch broken links / circular claims /
|
|
418
|
+
* duplicate pages introduced by the new writes. Findings that fail the
|
|
419
|
+
* post-apply lint are recorded in `warnings`; the apply itself is not
|
|
420
|
+
* rolled back (lint failures are soft — humans review the wiki state).
|
|
421
|
+
*/
|
|
422
|
+
|
|
423
|
+
interface CreateSurfaceKnowledgeAdapterOpts {
|
|
424
|
+
/** `.agent-knowledge/` root (absolute path the substrate writes blocks against). */
|
|
425
|
+
knowledgeRoot: string;
|
|
426
|
+
}
|
|
427
|
+
/**
|
|
428
|
+
* Build the adapter. We accept the agent-knowledge functions as DI so
|
|
429
|
+
* the substrate stays decoupled from a specific agent-knowledge
|
|
430
|
+
* version — the agent author imports them in their manifest module
|
|
431
|
+
* and hands them to the factory.
|
|
432
|
+
*
|
|
433
|
+
* `proposeFromFindings(findings)` returns
|
|
434
|
+
* `{ proposals: KnowledgeProposal[]; skipped: number; errors: ... }`.
|
|
435
|
+
*
|
|
436
|
+
* `applyKnowledgeWriteBlocks(root, content)` returns
|
|
437
|
+
* `{ written: string[]; warnings: string[] }`.
|
|
438
|
+
*
|
|
439
|
+
* `lintAfterApply(root)` (optional) resolves to warning strings — presumably a wrapper around agent-knowledge's `lintKnowledgeIndex`.
|
|
440
|
+
*/
|
|
441
|
+
interface KnowledgeAdapterDeps<TProposal> {
|
|
442
|
+
proposeFromFindings: (findings: ReadonlyArray<AnalystFinding>) => {
|
|
443
|
+
proposals: TProposal[];
|
|
444
|
+
skipped: number;
|
|
445
|
+
errors: Array<{
|
|
446
|
+
findingId: string;
|
|
447
|
+
subject: string;
|
|
448
|
+
message: string;
|
|
449
|
+
}>;
|
|
450
|
+
};
|
|
451
|
+
applyKnowledgeWriteBlocks: (root: string, proposalText: string) => Promise<{
|
|
452
|
+
written: string[];
|
|
453
|
+
warnings: string[];
|
|
454
|
+
}>;
|
|
455
|
+
/**
|
|
456
|
+
* Optional post-apply lint hook. The substrate runs it after each
|
|
457
|
+
* batch of writes; failures land in `warnings` (the apply is not
|
|
458
|
+
* rolled back — lint signals drift to review, not block).
|
|
459
|
+
*/
|
|
460
|
+
lintAfterApply?: (root: string) => Promise<ReadonlyArray<string>>;
|
|
461
|
+
}
|
|
462
|
+
declare function createSurfaceKnowledgeAdapter<TProposal>(opts: CreateSurfaceKnowledgeAdapterOpts, deps: KnowledgeAdapterDeps<TProposal>): KnowledgeAdapter<TProposal>;
|
|
463
|
+
|
|
464
|
+
/**
|
|
465
|
+
* `OutcomeMeasurement` — the missing metric that turns the analyst
|
|
466
|
+
* loop from "observability" into "self-improvement".
|
|
467
|
+
*
|
|
468
|
+
* Without this hook, the loop reports process counts (`findings: 42`,
|
|
469
|
+
* `applied: 7`) and never proves the applied edits actually improved
|
|
470
|
+
* anything. With this hook, the substrate re-runs the cohort against
|
|
471
|
+
* the same personas after each apply pass and reports a composite
|
|
472
|
+
* score delta. A negative delta is the substrate's strongest signal
|
|
473
|
+
* to either roll back or surface for review.
|
|
474
|
+
*
|
|
475
|
+
* Wiring is intentionally simple: pass the baseline per-persona composites
|
|
476
|
+
* and a `reRunCohort` callback (your `runAgentEval` equivalent). The wrapper:
|
|
477
|
+
* 1. Captures the baseline composite from the just-finished run.
|
|
478
|
+
* 2. After `runAnalystLoop` returns, re-invokes `runAgentEval` against
|
|
479
|
+
* the same persona slice.
|
|
480
|
+
* 3. Computes the delta and appends to `loop-report.json`.
|
|
481
|
+
* 4. If `rollbackOnRegression` and delta < 0, reverts applied edits.
|
|
482
|
+
*/
|
|
483
|
+
|
|
484
|
+
interface OutcomeMeasurement {
|
|
485
|
+
/** Baseline composite before applies — captured from the most-recent eval run. */
|
|
486
|
+
baselineComposite: number;
|
|
487
|
+
/** Composite after re-running the cohort with applied edits. */
|
|
488
|
+
afterComposite: number;
|
|
489
|
+
/** `afterComposite - baselineComposite`. Positive = the loop improved the agent. */
|
|
490
|
+
delta: number;
|
|
491
|
+
/** Per-persona deltas for finer-grained review. */
|
|
492
|
+
perPersona: ReadonlyArray<{
|
|
493
|
+
personaId: string;
|
|
494
|
+
before: number;
|
|
495
|
+
after: number;
|
|
496
|
+
delta: number;
|
|
497
|
+
}>;
|
|
498
|
+
/** When the substrate rolled back applies due to regression, the paths reverted. */
|
|
499
|
+
rolledBackPaths: ReadonlyArray<string>;
|
|
500
|
+
}
|
|
501
|
+
interface OutcomeMeasurementOpts {
|
|
502
|
+
/** Composite scores from the run that produced the findings. */
|
|
503
|
+
baseline: ReadonlyArray<{
|
|
504
|
+
personaId: string;
|
|
505
|
+
composite: number;
|
|
506
|
+
}>;
|
|
507
|
+
/**
|
|
508
|
+
* Re-run callback — the substrate invokes this after applies. The
|
|
509
|
+
* agent author provides their `runAgentEval`-equivalent so the
|
|
510
|
+
* substrate can ask "score this persona slice now."
|
|
511
|
+
*
|
|
512
|
+
* The callback SHOULD reuse the same cohort + judges + variant as
|
|
513
|
+
* the baseline run; only the agent's mutable surfaces have changed.
|
|
514
|
+
*/
|
|
515
|
+
reRunCohort: (personaIds: ReadonlyArray<string>) => Promise<ReadonlyArray<{
|
|
516
|
+
personaId: string;
|
|
517
|
+
composite: number;
|
|
518
|
+
}>>;
|
|
519
|
+
/** When `true`, applied edits are reverted on negative delta. Default `false`. */
|
|
520
|
+
rollbackOnRegression?: boolean;
|
|
521
|
+
/** Callback to revert a list of paths (typically `git checkout HEAD --`). */
|
|
522
|
+
revert?: (paths: ReadonlyArray<string>) => Promise<void>;
|
|
523
|
+
}
|
|
524
|
+
/**
|
|
525
|
+
* Run `runAnalystLoop` and stamp an `OutcomeMeasurement` onto the
|
|
526
|
+
* result. The substrate calls this after each canonical eval; the
|
|
527
|
+
* delta lands in `loop-report.json` for cross-run trend analysis.
|
|
528
|
+
*
|
|
529
|
+
* The function returns the original `RunAnalystLoopResult` enriched
|
|
530
|
+
* with `outcome` so callers stay backwards-compatible (the field is
|
|
531
|
+
* optional on the type; missing means no measurement was wired).
|
|
532
|
+
*/
|
|
533
|
+
declare function measureOutcome<TProposal, TEdit>(result: RunAnalystLoopResult<TProposal, TEdit>, opts: OutcomeMeasurementOpts): Promise<RunAnalystLoopResult<TProposal, TEdit> & {
|
|
534
|
+
outcome: OutcomeMeasurement;
|
|
535
|
+
}>;
|
|
536
|
+
|
|
537
|
+
export { type AgentManifest, AgentManifestError, type AgentRubric, type AgentRunContext, type AgentRuntime, type AgentSurfaces, type AnalystConfig, type AutoApplyPolicy, type CreateSurfaceImprovementAdapterOpts, type CreateSurfaceKnowledgeAdapterOpts, type DraftPatchInput, type DraftPatchOutput, type JudgeConfig, type KnowledgeAdapterDeps, type OutcomeMeasurement, type OutcomeMeasurementOpts, type ResolvedSurface, type RubricDimension, type SurfaceImprovementEdit, type SurfaceValidationIssue, createSurfaceImprovementAdapter, createSurfaceKnowledgeAdapter, defineAgent, measureOutcome, renderSurfaceIssues, resolveSubjectPath, validateSurfaces };
|