@tangle-network/agent-eval 0.43.2 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/langchain.d.ts +91 -0
- package/dist/adapters/langchain.js +34 -0
- package/dist/adapters/langchain.js.map +1 -0
- package/dist/campaign/index.d.ts +7 -401
- package/dist/campaign/index.js +24 -634
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-3RF76KTD.js +84 -0
- package/dist/chunk-3RF76KTD.js.map +1 -0
- package/dist/chunk-H5BGRSN4.js +642 -0
- package/dist/chunk-H5BGRSN4.js.map +1 -0
- package/dist/contract/index.d.ts +10 -0
- package/dist/contract/index.js +41 -0
- package/dist/contract/index.js.map +1 -0
- package/dist/governance/index.d.ts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/meta-eval/index.js +4 -79
- package/dist/meta-eval/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-BxJ3DQKJ.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/rl.d.ts +5 -4
- package/dist/rl.js +6 -0
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-CJ08tGwq.d.ts} +1 -1
- package/dist/run-improvement-loop-CbilHQAb.d.ts +401 -0
- package/dist/{types-BLbRTxoc.d.ts → types-DToGONFA.d.ts} +1 -1
- package/docs/quickstart-external.md +190 -0
- package/package.json +11 -1
package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-CJ08tGwq.d.ts}
RENAMED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
|
|
2
|
+
import { O as OutcomeStore } from './outcome-store-BxJ3DQKJ.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Rubric predictive validity — does our eval rubric predict deployment
|
|
package/dist/run-improvement-loop-CbilHQAb.d.ts
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-DToGONFA.js';
|
|
2
|
+
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
|
+
import { RunRecord } from '@tangle-network/agent-runtime';
|
|
4
|
+
import { R as RedTeamCase } from './red-team-30II1T4o.js';
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* @experimental
|
|
8
|
+
*
|
|
9
|
+
* `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's
|
|
10
|
+
* `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening
|
|
11
|
+
* code consumers duplicated 4 times. The PR body includes the campaign's
|
|
12
|
+
* manifest hash, gate verdict, and scorecard summary so reviewers can see
|
|
13
|
+
* exactly what was promoted + why.
|
|
14
|
+
*
|
|
15
|
+
* NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.
|
|
16
|
+
* The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is
|
|
17
|
+
* deferred to Pass B with the full shadow / canary / rollback stack.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {
|
|
21
|
+
/** Campaign result to attach to the PR. */
|
|
22
|
+
result: CampaignResult<TArtifact, TScenario>;
|
|
23
|
+
/** Gate verdict explaining the promotion. Substrate refuses to open a PR
|
|
24
|
+
* when `gate.decision !== 'ship'` — fails loud. */
|
|
25
|
+
gate: GateResult;
|
|
26
|
+
/** Promoted surface diff — typically the new system prompt addendum or
|
|
27
|
+
* full profile diff. Substrate writes it as the PR body. */
|
|
28
|
+
promotedDiff: string;
|
|
29
|
+
/** GH owner/repo target (e.g., `tangle-network/gtm-agent`). */
|
|
30
|
+
ghOwner: string;
|
|
31
|
+
ghRepo: string;
|
|
32
|
+
/** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */
|
|
33
|
+
branch?: string;
|
|
34
|
+
/** PR title. Default includes manifest hash. */
|
|
35
|
+
title?: string;
|
|
36
|
+
/** Whether to actually open the PR or just dry-run. Default reads
|
|
37
|
+
* `GH_AUTO_PR_TOKEN` env — present = open, absent = dry-run. */
|
|
38
|
+
dryRun?: boolean;
|
|
39
|
+
/** Test seam — substitute `gh pr create` invocation. */
|
|
40
|
+
ghExec?: (args: string[]) => {
|
|
41
|
+
stdout: string;
|
|
42
|
+
stderr: string;
|
|
43
|
+
status: number;
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
interface OpenAutoPrResult {
|
|
47
|
+
opened: boolean;
|
|
48
|
+
prUrl?: string;
|
|
49
|
+
dryRun: boolean;
|
|
50
|
+
reason: string;
|
|
51
|
+
}
|
|
52
|
+
declare function openAutoPr<TArtifact, TScenario extends Scenario>(options: OpenAutoPrOptions<TArtifact, TScenario>): OpenAutoPrResult;
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* @experimental
|
|
56
|
+
*
|
|
57
|
+
* `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
|
|
58
|
+
* GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
|
|
59
|
+
* the evolutionary strategy: each generation, mutate the current best surface
|
|
60
|
+
* into N candidates, measure, select. No generation memory beyond the current
|
|
61
|
+
* surface; the loop body handles ranking + promotion.
|
|
62
|
+
*
|
|
63
|
+
* The reflective alternative is agent-runtime's `improvementDriver` with a
|
|
64
|
+
* `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +
|
|
65
|
+
* trace findings to propose targeted edits rather than blind mutations. Both
|
|
66
|
+
* conform to `ImprovementDriver`; the improvement loop is identical regardless
|
|
67
|
+
* of which drives it.
|
|
68
|
+
*/
|
|
69
|
+
|
|
70
|
+
interface EvolutionaryDriverOptions<TFindings = unknown> {
|
|
71
|
+
mutator: Mutator<TFindings>;
|
|
72
|
+
/** External findings fed to the mutator each generation. Default: []. */
|
|
73
|
+
findings?: TFindings[];
|
|
74
|
+
}
|
|
75
|
+
declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* @experimental
|
|
79
|
+
*
|
|
80
|
+
* `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.
|
|
81
|
+
* Each generation it reflects on the prior best candidate's per-scenario
|
|
82
|
+
* scores + weakest dimensions (the `GenerationCandidate` evidence from
|
|
83
|
+
* `runOptimization`), asks an LLM to propose targeted rewrites of the current
|
|
84
|
+
* surface, and returns them as the next population.
|
|
85
|
+
*
|
|
86
|
+
* This is the substrate's best-in-class prompt optimizer: surface-agnostic, so
|
|
87
|
+
* ANY string surface in ANY consumer opts in by selecting it — system prompts,
|
|
88
|
+
* prompt addenda, judge/reviewer prompts, even a driver's own reflection
|
|
89
|
+
* prompt. It reuses the generic reflection primitive (`buildReflectionPrompt` /
|
|
90
|
+
* `parseReflectionResponse`) and the router client; it has NO dependency on the
|
|
91
|
+
* legacy `runMultiShotOptimization` / `prompt-evolution` orchestration.
|
|
92
|
+
*
|
|
93
|
+
* It earns its keep where there is real per-instance signal (which the
|
|
94
|
+
* dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
|
|
95
|
+
* now provide). For thin-signal surfaces it degrades to plain reflection — so
|
|
96
|
+
* it is a SELECTABLE driver, never a forced default. On generation 0 (no
|
|
97
|
+
* history) it reflects on the current surface against the mutation primitives
|
|
98
|
+
* alone.
|
|
99
|
+
*/
|
|
100
|
+
|
|
101
|
+
interface GepaDriverOptions {
|
|
102
|
+
/** Router transport (apiKey/baseUrl). */
|
|
103
|
+
llm: LlmClientOptions;
|
|
104
|
+
/** Model that performs the reflection. */
|
|
105
|
+
model: string;
|
|
106
|
+
/** What is being optimized — appears in the reflection prompt for orientation. */
|
|
107
|
+
target: string;
|
|
108
|
+
/** Surface-specific mutation levers offered to the model. */
|
|
109
|
+
mutationPrimitives?: string[];
|
|
110
|
+
/** Top/bottom scenarios surfaced as evidence each generation. Default 3. */
|
|
111
|
+
evidenceK?: number;
|
|
112
|
+
/** Reflection sampling temperature. Default 0.7. */
|
|
113
|
+
temperature?: number;
|
|
114
|
+
/** Reflection max tokens. Default 6000. */
|
|
115
|
+
maxTokens?: number;
|
|
116
|
+
}
|
|
117
|
+
declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* @experimental
|
|
121
|
+
*
|
|
122
|
+
* Compose multiple `Gate` implementations — every gate must pass for the
|
|
123
|
+
* composite to ship. Closes the alignment reviewer's "default-only
|
|
124
|
+
* heldOutGate + costGate would happily promote a reward-hacked prompt"
|
|
125
|
+
* concern by making safety gates first-class composable defaults.
|
|
126
|
+
*/
|
|
127
|
+
|
|
128
|
+
/** Compose gates — all must `ship` for the composite to `ship`. First
|
|
129
|
+
* non-ship verdict determines the composite verdict, but ALL gates still run
|
|
130
|
+
* (so the result records every gate's reason — useful for diagnostics). */
|
|
131
|
+
declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* @experimental
|
|
135
|
+
*
|
|
136
|
+
* `defaultProductionGate` — composes the substrate's existing safety
|
|
137
|
+
* primitives (red-team / reward-hacking / canary / heldout) into a single
|
|
138
|
+
* Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
|
|
139
|
+
* primitives are off the critical path" blocker.
|
|
140
|
+
*
|
|
141
|
+
* The composition is opinionated — when consumers wire `runImprovementLoop`,
|
|
142
|
+
* THIS gate is the default. Consumers can still pass a custom gate to
|
|
143
|
+
* override; the recommended pattern is to compose THIS gate with whatever
|
|
144
|
+
* extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
|
|
145
|
+
*/
|
|
146
|
+
|
|
147
|
+
interface DefaultProductionGateOptions {
|
|
148
|
+
/** Required: scenarios held out from training; substrate compares
|
|
149
|
+
* candidate-on-holdout vs baseline-on-holdout. */
|
|
150
|
+
holdoutScenarios: Scenario[];
|
|
151
|
+
/** Minimum mean-composite improvement required to ship. Default 0.5. */
|
|
152
|
+
deltaThreshold?: number;
|
|
153
|
+
/** Total $ budget for ALL cells in this campaign — including baseline + candidate.
|
|
154
|
+
* Composite verdict refuses to ship when spend exceeded budget. */
|
|
155
|
+
budgetUsd?: number;
|
|
156
|
+
/** Red-team cases to probe candidate outputs against. When omitted the
|
|
157
|
+
* substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
|
|
158
|
+
* battery for tighter coverage. */
|
|
159
|
+
redTeamBattery?: RedTeamCase[];
|
|
160
|
+
/** Run records (oldest-first) needed for the reward-hacking detector.
|
|
161
|
+
* Substrate populates from prior production-loop generations. */
|
|
162
|
+
recentRuns?: RunRecord[];
|
|
163
|
+
/** When true, the gate refuses to ship if the reward-hacking detector
|
|
164
|
+
* fires at the `gaming` severity. Default true. */
|
|
165
|
+
blockOnRewardHackingGaming?: boolean;
|
|
166
|
+
}
|
|
167
|
+
declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* @experimental
|
|
171
|
+
*
|
|
172
|
+
* Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
|
|
173
|
+
* `Gate`. Use when you want held-out as one of N composed gates instead of
|
|
174
|
+
* the full `defaultProductionGate` stack.
|
|
175
|
+
*/
|
|
176
|
+
|
|
177
|
+
interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
|
|
178
|
+
scenarios: TScenario[];
|
|
179
|
+
deltaThreshold?: number;
|
|
180
|
+
}
|
|
181
|
+
declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
|
|
182
|
+
|
|
183
|
+
/**
|
|
184
|
+
* @experimental
|
|
185
|
+
*
|
|
186
|
+
* `CampaignStorage` — the filesystem seam `runCampaign` writes through
|
|
187
|
+
* (run/cell dirs, the resumability cache, per-cell artifacts, trace spans).
|
|
188
|
+
*
|
|
189
|
+
* The default (`fsCampaignStorage`) is the Node filesystem — identical
|
|
190
|
+
* behavior to the inline `node:fs` calls it replaces, so existing CLI
|
|
191
|
+
* consumers are unaffected. `inMemoryCampaignStorage` keeps everything in a
|
|
192
|
+
* `Map`, so the substrate runs in environments WITHOUT a filesystem
|
|
193
|
+
* (Cloudflare Workers, Deno Deploy, other edge runtimes) — the campaign
|
|
194
|
+
* still produces its `CampaignResult` (cells + aggregates) in memory;
|
|
195
|
+
* artifacts/traces simply aren't persisted to disk.
|
|
196
|
+
*
|
|
197
|
+
* Paths are opaque keys to the in-memory adapter — it does not parse them,
|
|
198
|
+
* so the same `join(...)`-built paths work unchanged across both adapters.
|
|
199
|
+
*/
|
|
200
|
+
interface CampaignStorage {
|
|
201
|
+
/** Ensure a directory exists (recursive). No-op for in-memory. */
|
|
202
|
+
ensureDir(dir: string): void;
|
|
203
|
+
/** Does this path exist (as a written file or an ensured dir)? */
|
|
204
|
+
exists(path: string): boolean;
|
|
205
|
+
/** Read a UTF-8 file; `undefined` when missing or unreadable. */
|
|
206
|
+
read(path: string): string | undefined;
|
|
207
|
+
/** Write a file (string or bytes). Parent dir is assumed ensured. */
|
|
208
|
+
write(path: string, content: string | Uint8Array): void;
|
|
209
|
+
}
|
|
210
|
+
/** Node-filesystem storage — the default. Lazily requires `node:fs` so the
|
|
211
|
+
* module imports cleanly in non-Node runtimes (where the caller passes
|
|
212
|
+
* `inMemoryCampaignStorage` instead and never constructs this). */
|
|
213
|
+
declare function fsCampaignStorage(): CampaignStorage;
|
|
214
|
+
/** In-memory storage for filesystem-less runtimes. Artifacts + trace spans
|
|
215
|
+
* live in a `Map` for the duration of the run; the `CampaignResult` is
|
|
216
|
+
* fully populated, but nothing is persisted to disk. */
|
|
217
|
+
declare function inMemoryCampaignStorage(): CampaignStorage;
|
|
218
|
+
|
|
219
|
+
/**
|
|
220
|
+
* @experimental
|
|
221
|
+
*
|
|
222
|
+
* `runCampaign` — Pass A substrate primitive. ONE function that orchestrates
|
|
223
|
+
* scenarios → dispatch → artifacts → judges → aggregates, with full
|
|
224
|
+
* reproducibility (seed + manifest hash), cell-level resumability, bootstrap
|
|
225
|
+
* CIs, and the `LabeledScenarioStore` capture flywheel.
|
|
226
|
+
*
|
|
227
|
+
* Improvement loops (optimizer / gate / autoOnPromote) ride on top of this
|
|
228
|
+
* primitive but live in `presets/run-improvement-loop.ts`. This file keeps
|
|
229
|
+
* the core orchestrator minimal — Phase 1 of the Pass A track.
|
|
230
|
+
*/
|
|
231
|
+
|
|
232
|
+
interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
|
|
233
|
+
scenarios: TScenario[];
|
|
234
|
+
dispatch: DispatchFn<TScenario, TArtifact>;
|
|
235
|
+
judges?: JudgeConfig<TArtifact, TScenario>[];
|
|
236
|
+
/** Seed for reproducibility. Optional; defaults to 42. */
|
|
237
|
+
seed?: number;
|
|
238
|
+
/** Per-scenario replicates for CI bands. Default 1; raise to 5+ for
|
|
239
|
+
* bootstrap-tight intervals on critical eval. */
|
|
240
|
+
reps?: number;
|
|
241
|
+
/** When true (default), completed cells are cached by
|
|
242
|
+
* (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */
|
|
243
|
+
resumable?: boolean;
|
|
244
|
+
/** Optional store — when present, every artifact + judge score is captured
|
|
245
|
+
* with the configured `captureSource`. Capture is default ON; pass `'off'`
|
|
246
|
+
* to disable. */
|
|
247
|
+
labeledStore?: LabeledScenarioStore | 'off';
|
|
248
|
+
captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
|
|
249
|
+
captureSourceVersionHash?: string;
|
|
250
|
+
/** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */
|
|
251
|
+
costCeiling?: number;
|
|
252
|
+
/** Max concurrent cells. Default 2. */
|
|
253
|
+
maxConcurrency?: number;
|
|
254
|
+
/** Required: where artifacts + traces land. */
|
|
255
|
+
runDir: string;
|
|
256
|
+
/** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted
|
|
257
|
+
* at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
|
|
258
|
+
* refuses this when the caller wires `autoOnPromote !== 'none'`. */
|
|
259
|
+
tracing?: 'on' | 'off';
|
|
260
|
+
/** Test seam — override the wall clock for deterministic tests. */
|
|
261
|
+
now?: () => Date;
|
|
262
|
+
/** Test seam — override per-cell trace writer factory. */
|
|
263
|
+
buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter;
|
|
264
|
+
/** Storage backend for run/cell dirs, the resumability cache, artifacts,
|
|
265
|
+
* and trace spans. Default: the Node filesystem (`fsCampaignStorage`).
|
|
266
|
+
* Pass `inMemoryCampaignStorage()` to run in a filesystem-less runtime
|
|
267
|
+
* (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still
|
|
268
|
+
* produced; artifacts/traces just aren't persisted to disk. */
|
|
269
|
+
storage?: CampaignStorage;
|
|
270
|
+
}
|
|
271
|
+
declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
|
|
272
|
+
|
|
273
|
+
/**
|
|
274
|
+
* @experimental
|
|
275
|
+
*
|
|
276
|
+
* `runEval` — the simplest preset over `runCampaign`. No optimizer, no
|
|
277
|
+
* gate, no auto-PR. Just: run scenarios through dispatch, score with
|
|
278
|
+
* judges, return CampaignResult.
|
|
279
|
+
*
|
|
280
|
+
* The 80% case for consumers who want a scorecard, not an improvement loop.
|
|
281
|
+
*/
|
|
282
|
+
|
|
283
|
+
interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
|
|
284
|
+
runDir: string;
|
|
285
|
+
}
|
|
286
|
+
declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
|
|
287
|
+
|
|
288
|
+
/**
|
|
289
|
+
* @experimental
|
|
290
|
+
*
|
|
291
|
+
* `runOptimization` — the improvement loop body. Runs N generations: the
|
|
292
|
+
* `ImprovementDriver` proposes K candidate surfaces per generation, each
|
|
293
|
+
* candidate runs a campaign (the measurement), top-scoring promote to the
|
|
294
|
+
* next generation. Driver-agnostic — the same loop runs an evolutionary
|
|
295
|
+
* population mutator (`evolutionaryDriver`) or agent-runtime's
|
|
296
|
+
* `improvementDriver` (reflective / agentic generators); they differ only in
|
|
297
|
+
* how `propose()` picks candidates.
|
|
298
|
+
*
|
|
299
|
+
* This is `runLoop`'s shape (plan → measure → decide) specialized to surface
|
|
300
|
+
* improvement: `driver.propose` = plan, `runCampaign` = the measurement (which
|
|
301
|
+
* runs the worker behind `dispatch`), the mean-composite ranking = the
|
|
302
|
+
* validator, `driver.decide` = the stop check.
|
|
303
|
+
*
|
|
304
|
+
* The gated-promotion shell (`runImprovementLoop`) wraps this with a holdout
|
|
305
|
+
* re-score + release gate + optional PR.
|
|
306
|
+
*/
|
|
307
|
+
|
|
308
|
+
interface RunOptimizationOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch'> {
|
|
309
|
+
/** Initial mutable surface (typically system prompt or addendum). */
|
|
310
|
+
baselineSurface: MutableSurface;
|
|
311
|
+
/** Dispatcher that takes the CURRENT surface + scenario → artifact. */
|
|
312
|
+
dispatchWithSurface: (surface: MutableSurface, scenario: TScenario, ctx: Parameters<RunCampaignOptions<TScenario, TArtifact>['dispatch']>[1]) => Promise<TArtifact>;
|
|
313
|
+
/** The improvement strategy. Wrap a population `Mutator` via
|
|
314
|
+
* `evolutionaryDriver({ mutator })`, or pass agent-runtime's
|
|
315
|
+
* `improvementDriver` (reflective / agentic generators). */
|
|
316
|
+
driver: ImprovementDriver;
|
|
317
|
+
populationSize: number;
|
|
318
|
+
maxGenerations: number;
|
|
319
|
+
/** How many top-scoring candidates carry to the next generation. Default 2. */
|
|
320
|
+
promoteTopK?: number;
|
|
321
|
+
/** DEPTH knob forwarded to the driver's `propose()` — max iterations the
|
|
322
|
+
* agentic generator may take per candidate. */
|
|
323
|
+
maxImprovementShots?: number;
|
|
324
|
+
/** Phase-2 research report forwarded to `propose()` (analyst findings +
|
|
325
|
+
* diff). Opaque here; the driver types it. */
|
|
326
|
+
report?: unknown;
|
|
327
|
+
}
|
|
328
|
+
interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
|
|
329
|
+
generations: Array<{
|
|
330
|
+
record: GenerationRecord;
|
|
331
|
+
surfaces: Array<{
|
|
332
|
+
surfaceHash: string;
|
|
333
|
+
surface: MutableSurface;
|
|
334
|
+
campaign: CampaignResult<TArtifact, TScenario>;
|
|
335
|
+
}>;
|
|
336
|
+
}>;
|
|
337
|
+
winnerSurface: MutableSurface;
|
|
338
|
+
winnerSurfaceHash: string;
|
|
339
|
+
baselineCampaign: CampaignResult<TArtifact, TScenario>;
|
|
340
|
+
}
|
|
341
|
+
declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
|
|
342
|
+
declare function surfaceHash(surface: MutableSurface): string;
|
|
343
|
+
|
|
344
|
+
/**
|
|
345
|
+
* @experimental
|
|
346
|
+
*
|
|
347
|
+
* `runImprovementLoop` — the gated-promotion shell around the improvement
|
|
348
|
+
* loop body (`runOptimization`). Drives candidate surfaces via the
|
|
349
|
+
* `ImprovementDriver`, re-scores the winner against the baseline on a
|
|
350
|
+
* holdout set, runs the release gate, and optionally opens a PR.
|
|
351
|
+
*
|
|
352
|
+
* Role vocabulary (see docs/design/loop-taxonomy.md):
|
|
353
|
+
* - DRIVER = the `ImprovementDriver` (evolutionary GEPA mutator OR
|
|
354
|
+
* reflective analyst). Proposes candidate SURFACES — the
|
|
355
|
+
* worker's system prompt / tool config — NOT conversation
|
|
356
|
+
* turns.
|
|
357
|
+
* - MEASUREMENT= `runCampaign`. Scores one surface by running the worker
|
|
358
|
+
* (via `dispatch`) over scenarios and judging the output.
|
|
359
|
+
* - WORKER = the agent harness in the sandbox, invoked behind the
|
|
360
|
+
* topology-opaque `dispatch` seam — never referenced here.
|
|
361
|
+
*
|
|
362
|
+
* Distinct from `runLoop` in `@tangle-network/agent-runtime`, which is the
|
|
363
|
+
* INNER conversation loop (driver↔workers in a sandbox). `runImprovementLoop`
|
|
364
|
+
* is the OUTER loop: it improves the surface that those workers run.
|
|
365
|
+
*
|
|
366
|
+
* Hard-refuses unsafe configurations:
|
|
367
|
+
* - `tracing: 'off'` when a driver is wired (improvement is unattributable)
|
|
368
|
+
* - `autoOnPromote: 'config'` — DEFERRED to Pass B; v0.40 only ships
|
|
369
|
+
* `'pr'` and `'none'`.
|
|
370
|
+
*/
|
|
371
|
+
|
|
372
|
+
interface RunImprovementLoopOptions<TScenario extends Scenario, TArtifact> extends RunOptimizationOptions<TScenario, TArtifact> {
|
|
373
|
+
/** Holdout scenarios kept OUT of the training optimization pool — used
|
|
374
|
+
* ONLY to score baseline vs winner for the gate. */
|
|
375
|
+
holdoutScenarios: TScenario[];
|
|
376
|
+
/** Promotion gate. Substrate strongly recommends `defaultProductionGate`
|
|
377
|
+
* for production wiring (composes red-team / reward-hacking / canary /
|
|
378
|
+
* heldout). */
|
|
379
|
+
gate: Gate<TArtifact, TScenario>;
|
|
380
|
+
/** What to do when the gate ships:
|
|
381
|
+
* - `'pr'`: open a PR via `openAutoPr`
|
|
382
|
+
* - `'none'`: just report — caller decides what to do with the winner
|
|
383
|
+
* v0.40 does NOT support `'config'` (live-runtime self-mutation) —
|
|
384
|
+
* deferred to Pass B behind safety stack. */
|
|
385
|
+
autoOnPromote: 'pr' | 'none';
|
|
386
|
+
/** GH owner / repo for the auto-PR. Required when autoOnPromote === 'pr'. */
|
|
387
|
+
ghOwner?: string;
|
|
388
|
+
ghRepo?: string;
|
|
389
|
+
/** Optional render override — substrate writes a diff-shaped surface; pass
|
|
390
|
+
* a function to format the promoted surface differently. */
|
|
391
|
+
renderPromotedDiff?: (winnerSurface: MutableSurface, baselineSurface: MutableSurface) => string;
|
|
392
|
+
}
|
|
393
|
+
interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extends RunOptimizationResult<TArtifact, TScenario> {
|
|
394
|
+
baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
395
|
+
winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
396
|
+
gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>;
|
|
397
|
+
prResult?: ReturnType<typeof openAutoPr>;
|
|
398
|
+
}
|
|
399
|
+
declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
|
|
400
|
+
|
|
401
|
+
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type RunEvalOptions as a, type RunImprovementLoopOptions as b, type RunImprovementLoopResult as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
|
|
package/dist/{types-BLbRTxoc.d.ts → types-DToGONFA.d.ts}
RENAMED
|
@@ -364,4 +364,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
|
|
|
364
364
|
scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
|
|
365
365
|
}
|
|
366
366
|
|
|
367
|
-
export type {
|
|
367
|
+
export type { CampaignAggregates as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchContext as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
|
|
package/docs/quickstart-external.md
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# Quickstart — self-improvement loop for any agent (15 minutes)
|
|
2
|
+
|
|
3
|
+
The standalone walkthrough mirroring
|
|
4
|
+
`examples/foreign-agent-quickstart/`. Read this first; copy the runnable
|
|
5
|
+
example second.
|
|
6
|
+
|
|
7
|
+
## What you get
|
|
8
|
+
|
|
9
|
+
After 15 minutes you have a closed self-improvement loop running
|
|
10
|
+
against your agent — measured, gated, and reproducible — with no
|
|
11
|
+
Tangle sandbox, no Tangle account, and no hosted infrastructure.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```sh
|
|
16
|
+
npm i @tangle-network/agent-eval@^0.44.0
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
The package's `@tangle-network/sandbox` peer is `optional` (as of
|
|
20
|
+
0.44.0). Foreign consumers can install agent-eval and run the full LAND
|
|
21
|
+
tier without our sandbox or its dependencies.
|
|
22
|
+
|
|
23
|
+
## Five types, four functions
|
|
24
|
+
|
|
25
|
+
```ts
|
|
26
|
+
import {
|
|
27
|
+
// Types
|
|
28
|
+
type Scenario, // what you evaluate against (id + kind + your fields)
|
|
29
|
+
type Dispatch, // your agent, wrapped as one function
|
|
30
|
+
type JudgeConfig, // pluggable dimensional scorer
|
|
31
|
+
type Mutator, // proposes a next surface
|
|
32
|
+
type Gate, // promotion guard
|
|
33
|
+
|
|
34
|
+
// Functions
|
|
35
|
+
runEval,
|
|
36
|
+
runCampaign,
|
|
37
|
+
runImprovementLoop,
|
|
38
|
+
defaultProductionGate,
|
|
39
|
+
|
|
40
|
+
// Storage
|
|
41
|
+
fsCampaignStorage,
|
|
42
|
+
inMemoryCampaignStorage,
|
|
43
|
+
} from '@tangle-network/agent-eval/contract'
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Every export above is committed under semver. New minors only ADD;
|
|
47
|
+
nothing here changes shape in a 0.x minor.
|
|
48
|
+
|
|
49
|
+
## Three steps to wire your agent
|
|
50
|
+
|
|
51
|
+
### 1. Scenarios
|
|
52
|
+
|
|
53
|
+
```ts
|
|
54
|
+
interface MarketingScenario extends Scenario {
|
|
55
|
+
blurb: string
|
|
56
|
+
surface: 'landing-hero' | 'tweet' | 'email-subject'
|
|
57
|
+
audience: string
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
const scenarios: MarketingScenario[] = [
|
|
61
|
+
{ id: 's1', kind: 'marketing-rewrite', blurb: '...', surface: 'tweet', audience: '...' },
|
|
62
|
+
// ...
|
|
63
|
+
]
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Wrap your agent as `Dispatch`
|
|
67
|
+
|
|
68
|
+
```ts
|
|
69
|
+
const dispatch: Dispatch<MarketingScenario, MarketingArtifact> = async (scenario, ctx) => {
|
|
70
|
+
const rewrite = await callYourAgent(scenario, { signal: ctx.signal })
|
|
71
|
+
return { rewrite, modelUsed: '...' }
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
`ctx` carries `signal` (cancellation), `trace` (write spans), `artifacts`
|
|
76
|
+
(write blobs), `cost` (token + $ meter). Use them or ignore them.
|
|
77
|
+
|
|
78
|
+
### 3. Bring a judge
|
|
79
|
+
|
|
80
|
+
```ts
|
|
81
|
+
const judge: JudgeConfig<MarketingArtifact, MarketingScenario> = {
|
|
82
|
+
name: 'marketing-quality',
|
|
83
|
+
dimensions: [
|
|
84
|
+
{ key: 'hook_strength', description: '...' },
|
|
85
|
+
{ key: 'voice_match', description: '...' },
|
|
86
|
+
{ key: 'cta_clarity', description: '...' },
|
|
87
|
+
{ key: 'factual_grounding', description: '...' },
|
|
88
|
+
],
|
|
89
|
+
async score({ artifact, scenario, signal }) {
|
|
90
|
+
// LLM call, heuristic, ensemble — anything. Return JudgeScore.
|
|
91
|
+
return { dimensions: { ... }, composite: 0.72, notes: '...' }
|
|
92
|
+
},
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Throw on failure; the substrate records it as a failed cell. No silent
|
|
97
|
+
zeros.
|
|
98
|
+
|
|
99
|
+
## Baseline
|
|
100
|
+
|
|
101
|
+
```ts
|
|
102
|
+
const baseline = await runEval({
|
|
103
|
+
scenarios,
|
|
104
|
+
dispatch,
|
|
105
|
+
judges: [judge],
|
|
106
|
+
storage: inMemoryCampaignStorage(),
|
|
107
|
+
runDir: 'mem://my-baseline',
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
const score = Object.values(baseline.aggregates.byScenario)
|
|
111
|
+
.reduce((sum, s) => sum + s.meanComposite, 0) / scenarios.length
|
|
112
|
+
|
|
113
|
+
console.log(`Baseline composite: ${score.toFixed(3)}`)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Self-improvement loop
|
|
117
|
+
|
|
118
|
+
```ts
|
|
119
|
+
import { gepaDriver, defaultProductionGate } from '@tangle-network/agent-eval/contract'
|
|
120
|
+
|
|
121
|
+
const result = await runImprovementLoop({
|
|
122
|
+
scenarios: trainScenarios,
|
|
123
|
+
baselineSurface,
|
|
124
|
+
dispatchWithSurface: (surface, scenario, ctx) =>
|
|
125
|
+
runYourAgent({ systemPrompt: surface as string }, scenario, ctx),
|
|
126
|
+
driver: gepaDriver({
|
|
127
|
+
llm: { apiKey: process.env.OPENAI_API_KEY, baseUrl: '...' },
|
|
128
|
+
model: 'gpt-4o-mini',
|
|
129
|
+
target: 'marketing copywriting system prompt',
|
|
130
|
+
mutationPrimitives: [
|
|
131
|
+
'Tighten the hook: lead with the concrete user outcome.',
|
|
132
|
+
'Replace generic adjectives with specific verbs.',
|
|
133
|
+
// ...
|
|
134
|
+
],
|
|
135
|
+
}),
|
|
136
|
+
judges: [judge],
|
|
137
|
+
populationSize: 2,
|
|
138
|
+
maxGenerations: 3,
|
|
139
|
+
holdoutScenarios,
|
|
140
|
+
gate: defaultProductionGate({
|
|
141
|
+
holdoutScenarios,
|
|
142
|
+
deltaThreshold: 0.05,
|
|
143
|
+
}),
|
|
144
|
+
autoOnPromote: 'none',
|
|
145
|
+
storage: inMemoryCampaignStorage(),
|
|
146
|
+
runDir: 'mem://my-improve',
|
|
147
|
+
})
|
|
148
|
+
|
|
149
|
+
if (result.gateResult.decision === 'ship') {
|
|
150
|
+
// Deploy result.winnerSurface — we don't push it for you.
|
|
151
|
+
}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
The gate decision is `'ship'` | `'hold'` | `'need_more_work'` |
|
|
155
|
+
`'model_ceiling'` | `'arch_ceiling'`. You define what each means in
|
|
156
|
+
your deploy pipeline.
|
|
157
|
+
|
|
158
|
+
## What you control
|
|
159
|
+
|
|
160
|
+
- The agent (any framework, any model, any backend).
|
|
161
|
+
- The judge (LLM, heuristic, ensemble; we don't pick).
|
|
162
|
+
- The mutation strategy (`gepaDriver` for reflective LLM mutation,
|
|
163
|
+
`evolutionaryDriver({ mutator })` for population search, or
|
|
164
|
+
implement `ImprovementDriver` directly).
|
|
165
|
+
- The gate (compose `defaultProductionGate` with custom checks via
|
|
166
|
+
`composeGate`).
|
|
167
|
+
- The deploy step (`autoOnPromote: 'pr'` opens a GitHub PR with the
|
|
168
|
+
winner; `'none'` returns the surface and you ship however you ship).
|
|
169
|
+
|
|
170
|
+
## What this does NOT install
|
|
171
|
+
|
|
172
|
+
- No `@tangle-network/sandbox` — nothing runs in a Tangle sandbox.
|
|
173
|
+
- No hosted orchestrator — traces, artifacts, judge scores stay on
|
|
174
|
+
your machine (or in `inMemoryCampaignStorage` for Workers/edge).
|
|
175
|
+
- No daemons — `runEval` and `runImprovementLoop` complete in-process
|
|
176
|
+
and return.
|
|
177
|
+
|
|
178
|
+
## When you want more
|
|
179
|
+
|
|
180
|
+
The wedge doc (`docs/design/external-agent-wedge.md`) lays out three
|
|
181
|
+
graduated tiers:
|
|
182
|
+
|
|
183
|
+
| Tier | What you do | What you get |
|
|
184
|
+
|---|---|---|
|
|
185
|
+
| **LAND** (this quickstart) | `npm i @tangle-network/agent-eval`, wrap dispatch + judge, run loops | Local artifacts; full self-improvement; no Tangle infra |
|
|
186
|
+
| **EXPAND** | Point trace/eval data at our hosted orchestrator | Hosted dashboards, cross-run intelligence, billing on data routed to us |
|
|
187
|
+
| **PLATFORM** | Move execution into our sandbox | Substrate + orchestrator data pre-wired; sandbox usage billing |
|
|
188
|
+
|
|
189
|
+
Each tier is opt-in. EXPAND and PLATFORM build on the same primitives;
|
|
190
|
+
upgrading is adding configuration, not rewriting your wiring.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.43.2",
|
|
3
|
+
"version": "0.44.1",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -104,6 +104,16 @@
|
|
|
104
104
|
"import": "./dist/campaign/index.js",
|
|
105
105
|
"default": "./dist/campaign/index.js"
|
|
106
106
|
},
|
|
107
|
+
"./contract": {
|
|
108
|
+
"types": "./dist/contract/index.d.ts",
|
|
109
|
+
"import": "./dist/contract/index.js",
|
|
110
|
+
"default": "./dist/contract/index.js"
|
|
111
|
+
},
|
|
112
|
+
"./adapters/langchain": {
|
|
113
|
+
"types": "./dist/adapters/langchain.d.ts",
|
|
114
|
+
"import": "./dist/adapters/langchain.js",
|
|
115
|
+
"default": "./dist/adapters/langchain.js"
|
|
116
|
+
},
|
|
107
117
|
"./openapi.json": {
|
|
108
118
|
"default": "./dist/openapi.json"
|
|
109
119
|
}
|