@tangle-network/agent-eval 0.65.0 → 0.66.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/CHANGELOG.md +12 -0
  2. package/dist/adapters/otel.d.ts +1 -1
  3. package/dist/campaign/index.d.ts +4 -3
  4. package/dist/campaign/index.js +18 -19
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
  7. package/dist/chunk-6XQIEUQ2.js.map +1 -0
  8. package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
  9. package/dist/chunk-DFS3FEXO.js.map +1 -0
  10. package/dist/{chunk-4ODZXQV2.js → chunk-Q56RRLEC.js} +635 -2
  11. package/dist/chunk-Q56RRLEC.js.map +1 -0
  12. package/dist/chunk-RDK3P4JE.js +482 -0
  13. package/dist/chunk-RDK3P4JE.js.map +1 -0
  14. package/dist/contract/index.d.ts +10 -8
  15. package/dist/contract/index.js +11 -12
  16. package/dist/contract/index.js.map +1 -1
  17. package/dist/hosted/index.d.ts +1 -1
  18. package/dist/hosted/index.js +1 -1
  19. package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
  20. package/dist/index.d.ts +246 -3
  21. package/dist/index.js +292 -2
  22. package/dist/index.js.map +1 -1
  23. package/dist/openapi.json +1 -1
  24. package/dist/provenance-BZUFC1_D.d.ts +292 -0
  25. package/dist/{registry-DPly4_hZ.d.ts → registry-BzAEvqAt.d.ts} +1 -1
  26. package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
  27. package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
  28. package/package.json +1 -1
  29. package/dist/chunk-4ODZXQV2.js.map +0 -1
  30. package/dist/chunk-7TPYV2ER.js.map +0 -1
  31. package/dist/chunk-CZRKD2X2.js +0 -1104
  32. package/dist/chunk-CZRKD2X2.js.map +0 -1
  33. package/dist/chunk-E22YUOAL.js +0 -111
  34. package/dist/chunk-E22YUOAL.js.map +0 -1
  35. package/dist/chunk-HKINEDRZ.js.map +0 -1
  36. package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.65.0",
5
+ "version": "0.66.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -0,0 +1,292 @@
1
+ import { o as Mutator, I as ImprovementDriver, S as Scenario, G as Gate, f as CampaignResult, M as MutableSurface, k as GateResult, j as GateDecision } from './types-c2R2kfmv.js';
2
+ import { R as RedTeamCase } from './red-team-DW9Ca_tj.js';
3
+ import { R as RunRecord } from './run-record-BgTFzO2r.js';
4
+ import { a as RunCampaignOptions, C as CampaignStorage } from './run-improvement-loop-BKpM5T4t.js';
5
+ import { H as HostedClient, T as TraceSpanEvent } from './index-DSEHMwvS.js';
6
+
7
+ /**
8
+ * @experimental
9
+ *
10
+ * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
11
+ * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
12
+ * the evolutionary strategy: each generation, mutate the current best surface
13
+ * into N candidates, measure, select. No generation memory beyond the current
14
+ * surface; the loop body handles ranking + promotion.
15
+ *
16
+ * The reflective alternative is agent-runtime's `improvementDriver` with a
17
+ * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +
18
+ * trace findings to propose targeted edits rather than blind mutations. Both
19
+ * conform to `ImprovementDriver`; the improvement loop is identical regardless
20
+ * of which drives it.
21
+ */
22
+
23
+ interface EvolutionaryDriverOptions<TFindings = unknown> {
24
+ mutator: Mutator<TFindings>;
25
+ /** External findings fed to the mutator each generation. Default: []. */
26
+ findings?: TFindings[];
27
+ }
28
+ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
29
+
30
+ /**
31
+ * @experimental
32
+ *
33
+ * Compose multiple `Gate` implementations — every gate must pass for the
34
+ * composite to ship. Closes the alignment reviewer's "default-only
35
+ * heldOutGate + costGate would happily promote a reward-hacked prompt"
36
+ * concern by making safety gates first-class composable defaults.
37
+ */
38
+
39
+ /** Compose gates — all must `ship` for the composite to `ship`. First
40
+ * non-ship verdict short-circuits the composite verdict, but ALL gates run
41
+ * (so the result records every gate's reason — useful for diagnostics). */
42
+ declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
43
+
44
+ /**
45
+ * @experimental
46
+ *
47
+ * `defaultProductionGate` — composes the substrate's existing safety
48
+ * primitives (red-team / reward-hacking / canary / heldout) into a single
49
+ * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
50
+ * primitives are off the critical path" blocker.
51
+ *
52
+ * The composition is opinionated — when consumers wire `runImprovementLoop`,
53
+ * THIS gate is the default. Consumers can still pass a custom gate to
54
+ * override; the recommended pattern is to compose THIS gate with whatever
55
+ * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
56
+ */
57
+
58
+ interface DefaultProductionGateOptions {
59
+ /** Required: scenarios held out from training; substrate compares
60
+ * candidate-on-holdout vs baseline-on-holdout. */
61
+ holdoutScenarios: Scenario[];
62
+ /** Minimum mean-composite improvement required to ship. Default 0.5. */
63
+ deltaThreshold?: number;
64
+ /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
65
+ * Composite verdict refuses to ship when spend exceeded budget. */
66
+ budgetUsd?: number;
67
+ /** Red-team cases to probe candidate outputs against. When omitted the
68
+ * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
69
+ * battery for tighter coverage. */
70
+ redTeamBattery?: RedTeamCase[];
71
+ /** Run records (oldest-first) needed for the reward-hacking detector.
72
+ * Substrate populates from prior production-loop generations. */
73
+ recentRuns?: RunRecord[];
74
+ /** When true, the gate refuses to ship if the reward-hacking detector
75
+ * fires at the `gaming` severity. Default true. */
76
+ blockOnRewardHackingGaming?: boolean;
77
+ }
78
+ declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
79
+
80
+ /**
81
+ * @experimental
82
+ *
83
+ * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
84
+ * `Gate`. Use when you want held-out as one of N composed gates instead of
85
+ * the full `defaultProductionGate` stack.
86
+ */
87
+
88
+ interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
89
+ scenarios: TScenario[];
90
+ deltaThreshold?: number;
91
+ }
92
+ declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
93
+
94
+ /**
95
+ * @experimental
96
+ *
97
+ * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
98
+ * gate, no auto-PR. Just: run scenarios through dispatch, score with
99
+ * judges, return CampaignResult.
100
+ *
101
+ * The 80% case for consumers who want a scorecard, not an improvement loop.
102
+ */
103
+
104
+ interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
105
+ runDir: string;
106
+ }
107
+ declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
108
+
109
+ /**
110
+ * @experimental
111
+ *
112
+ * Loop provenance — the durable, queryable record of WHAT a self-improvement
113
+ * loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
114
+ * an eval-run to the underlying candidate→cell→gate→promote chain.
115
+ *
116
+ * Two artifacts, one source of truth:
117
+ *
118
+ * 1. `LoopProvenanceRecord` — a structured JSON record capturing every
119
+ * candidate (surfaceHash + label + rationale), its measured composite,
120
+ * the gate decision + reasons + delta, the held-out lift, the explicit
121
+ * baseline→candidate diff, and BACKEND PROVENANCE (the
122
+ * `assertRealBackend` verdict + worker call count + model). This is the
123
+ * ingestable audit artifact: the +lift recomputes from it, the "because
124
+ * Z" rationale survives in it, and a stub backend is detectable from it.
125
+ *
126
+ * 2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
127
+ * `TraceSpanEvent`s, pivoted on the substrate's standard
128
+ * `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
129
+ * `tangle.generation` attributes (the same pivots `/adapters/otel`
130
+ * reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
131
+ * not just the `cost.*` spans `runCampaign` already emits per cell.
132
+ *
133
+ * The record is built from the substrate's own loop result + the per-call
134
+ * `RunRecord`s the worker emitted — no new measurement, no recomputation that
135
+ * could drift from what the gate actually saw.
136
+ */
137
+
138
+ /** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
139
+ * their worktree+base identity since the content lives in git. Distinct from
140
+ * `surfaceHash` (16-char content fingerprint used as a loop identity key);
141
+ * this is the byte-identical-verifiable content hash the provenance record +
142
+ * `RunRecord.promptHash` carry. */
143
+ declare function surfaceContentHash(surface: MutableSurface): string;
144
+ interface LoopProvenanceCandidate {
145
+ /** Generation index this candidate was proposed in. */
146
+ generation: number;
147
+ /** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
148
+ surfaceHash: string;
149
+ /** Full sha256 content hash — byte-identical-verifiable. */
150
+ contentHash: string;
151
+ /** Driver label, when the driver returned a `ProposedCandidate`. */
152
+ label?: string;
153
+ /** Driver rationale — the "because Z". When the driver returned a bare
154
+ * surface (blind mutator) this is absent. */
155
+ rationale?: string;
156
+ /** Mean composite this candidate scored on the search split. */
157
+ composite: number;
158
+ /** Whether this candidate was promoted out of its generation. */
159
+ promoted: boolean;
160
+ }
161
+ interface LoopProvenanceBackend {
162
+ /** `assertRealBackend`-grade verdict over the worker call records. */
163
+ verdict: 'real' | 'mixed' | 'stub';
164
+ /** Number of worker LLM calls captured (the audit's "worker call count"). */
165
+ workerCallCount: number;
166
+ /** Distinct model ids observed across worker calls. */
167
+ models: string[];
168
+ totalInputTokens: number;
169
+ totalOutputTokens: number;
170
+ totalCostUsd: number;
171
+ }
172
+ /**
173
+ * The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
174
+ * ADDS the rationale + the explicit baseline→candidate diff (both omitted from
175
+ * the bare hosted event) + backend provenance.
176
+ */
177
+ interface LoopProvenanceRecord {
178
+ schema: 'tangle.loop-provenance.v1';
179
+ runId: string;
180
+ runDir: string;
181
+ timestamp: string;
182
+ /** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
183
+ baselineContentHash: string;
184
+ winnerContentHash: string;
185
+ /** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
186
+ winnerLabel?: string;
187
+ winnerRationale?: string;
188
+ /** The explicit baseline→winner unified diff the gate decided on. */
189
+ diff: string;
190
+ /** Every candidate across every generation, each carrying its rationale. */
191
+ candidates: LoopProvenanceCandidate[];
192
+ /** The gate verdict — decision + reasons + contributing gates + delta. */
193
+ gate: {
194
+ decision: GateDecision;
195
+ reasons: string[];
196
+ delta?: number;
197
+ contributingGates: Array<{
198
+ name: string;
199
+ passed: boolean;
200
+ }>;
201
+ };
202
+ /** baseline-on-holdout composite mean. */
203
+ baselineHoldoutComposite: number;
204
+ /** winner-on-holdout composite mean. */
205
+ winnerHoldoutComposite: number;
206
+ /** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
207
+ heldOutLift: number;
208
+ /** Backend provenance: stub-vs-real verdict + worker call count + models. */
209
+ backend: LoopProvenanceBackend;
210
+ totalCostUsd: number;
211
+ totalDurationMs: number;
212
+ }
213
+ interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
214
+ runId: string;
215
+ runDir: string;
216
+ timestamp: string;
217
+ baselineSurface: MutableSurface;
218
+ winnerSurface: MutableSurface;
219
+ winnerLabel?: string;
220
+ winnerRationale?: string;
221
+ diff: string;
222
+ /** Per-generation candidate records straight off the loop result. */
223
+ generations: Array<{
224
+ generationIndex: number;
225
+ candidates: Array<{
226
+ surfaceHash: string;
227
+ composite: number;
228
+ label?: string;
229
+ rationale?: string;
230
+ }>;
231
+ promoted: string[];
232
+ /** Surfaces measured this generation, keyed positionally to candidates so
233
+ * the content hash can be computed from the real surface text. */
234
+ surfaces: Array<{
235
+ surfaceHash: string;
236
+ surface: MutableSurface;
237
+ }>;
238
+ }>;
239
+ gate: GateResult;
240
+ baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
241
+ winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
242
+ /** Worker call records — the source for backend provenance. */
243
+ workerRecords: ReadonlyArray<RunRecord>;
244
+ totalCostUsd: number;
245
+ totalDurationMs: number;
246
+ }
247
+ /** Build the durable provenance record from a completed loop result. */
248
+ declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
249
+ /**
250
+ * Build the loop's OTLP-ingestable spans from a provenance record. One root
251
+ * span per loop (`tangle.runId`), one span per generation, one span per
252
+ * candidate (carrying its surfaceHash + label), and one span for the gate
253
+ * decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
254
+ * the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
255
+ * reads, so the hosted collector reconstructs the full tree.
256
+ *
257
+ * Times are synthesized monotonically off a single base so the span tree is
258
+ * orderable; the substrate does not retain per-candidate wall-clock starts.
259
+ */
260
+ declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
261
+ baseTimeMs?: number;
262
+ }): TraceSpanEvent[];
263
+ /** Canonical durable paths under the run dir. */
264
+ declare function provenanceRecordPath(runDir: string): string;
265
+ declare function provenanceSpansPath(runDir: string): string;
266
+ interface EmitLoopProvenanceResult {
267
+ record: LoopProvenanceRecord;
268
+ spans: TraceSpanEvent[];
269
+ /** Absolute paths the record + spans were written to, when storage persists. */
270
+ recordPath: string;
271
+ spansPath: string;
272
+ }
273
+ interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
274
+ /** Storage the record + spans are written through. */
275
+ storage: CampaignStorage;
276
+ /** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
277
+ * endpoint so the collector receives the full loop, not just `cost.*`. */
278
+ hostedClient?: HostedClient;
279
+ }
280
+ /**
281
+ * Build the provenance record + OTel spans and persist them durably under the
282
+ * run dir (and ship spans to a hosted collector when one is wired). Returns
283
+ * both artifacts so the caller can assert on / re-derive from them.
284
+ *
285
+ * Fail-loud: the durable write throws on storage failure (a swallowed write is
286
+ * exactly the "emitted but lost" failure this closes). The hosted span ship is
287
+ * the one best-effort leg — its failure is logged, not thrown, so an offline
288
+ * collector never fails the loop (the durable artifact is the source of truth).
289
+ */
290
+ declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
291
+
292
+ export { type BuildLoopProvenanceArgs as B, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type HeldOutGateOptions as H, type LoopProvenanceRecord as L, type RunEvalOptions as R, type EmitLoopProvenanceArgs as a, type EmitLoopProvenanceResult as b, composeGate as c, defaultProductionGate as d, evolutionaryDriver as e, type LoopProvenanceBackend as f, type LoopProvenanceCandidate as g, heldOutGate as h, buildLoopProvenanceRecord as i, emitLoopProvenance as j, provenanceSpansPath as k, loopProvenanceSpans as l, provenanceRecordPath as p, runEval as r, surfaceContentHash as s };
@@ -454,4 +454,4 @@ declare class AnalystRegistry {
454
454
  private routeInput;
455
455
  }
456
456
 
457
- export { AnalystRegistry as A, type BudgetPolicy as B, type ChatCallOpts as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type AnalystHooks as f, type AnalystInputKind as g, type AnalystRegistryOptions as h, type AnalystRequirements as i, type AnalystRunEvent as j, type AnalystRunInputs as k, type AnalystRunResult as l, type AnalystRunSummary as m, type ChatClient as n, type ChatRequest as o, type ChatResponse as p, type ChatTransport as q, type CliBridgeTransportOpts as r, type CreateChatClientOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };
457
+ export { AnalystRegistry as A, type BudgetPolicy as B, type ChatRequest as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type CreateChatClientOpts as f, type AnalystHooks as g, type AnalystInputKind as h, type AnalystRegistryOptions as i, type AnalystRequirements as j, type AnalystRunEvent as k, type AnalystRunInputs as l, type AnalystRunResult as m, type AnalystRunSummary as n, type ChatCallOpts as o, type ChatClient as p, type ChatResponse as q, type ChatTransport as r, type CliBridgeTransportOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };
@@ -1,11 +1,10 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-7TPYV2ER.js";
4
- import "./chunk-E22YUOAL.js";
3
+ } from "./chunk-6XQIEUQ2.js";
5
4
  import "./chunk-ITBRCT73.js";
6
5
  import "./chunk-3BFEG2F6.js";
7
6
  import "./chunk-PZ5AY32C.js";
8
7
  export {
9
8
  runCampaign
10
9
  };
11
- //# sourceMappingURL=run-campaign-5J3ED2UJ.js.map
10
+ //# sourceMappingURL=run-campaign-BVY3RGAZ.js.map