@tangle-network/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/CHANGELOG.md +236 -1
  2. package/README.md +17 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  5. package/dist/chunk-4W4NCYM2.js.map +1 -0
  6. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  7. package/dist/chunk-6M774GY6.js +53 -0
  8. package/dist/chunk-6M774GY6.js.map +1 -0
  9. package/dist/chunk-7EAUOUQS.js +495 -0
  10. package/dist/chunk-7EAUOUQS.js.map +1 -0
  11. package/dist/chunk-AXHNWLIX.js +246 -0
  12. package/dist/chunk-AXHNWLIX.js.map +1 -0
  13. package/dist/chunk-EXGR4XEM.js +283 -0
  14. package/dist/chunk-EXGR4XEM.js.map +1 -0
  15. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  16. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  17. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  18. package/dist/chunk-LZKIOBG2.js +2026 -0
  19. package/dist/chunk-LZKIOBG2.js.map +1 -0
  20. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  21. package/dist/chunk-QBW3YBTR.js.map +1 -0
  22. package/dist/chunk-QUKKGHTZ.js +121 -0
  23. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  24. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  25. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  26. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  27. package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
  28. package/dist/cli.js +3 -3
  29. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  30. package/dist/control.d.ts +3 -3
  31. package/dist/control.js +2 -2
  32. package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
  33. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  34. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  35. package/dist/index-ekBXweiQ.d.ts +1894 -0
  36. package/dist/index.d.ts +20 -430
  37. package/dist/index.js +154 -34
  38. package/dist/index.js.map +1 -1
  39. package/dist/integrity-Cr5YodSY.d.ts +210 -0
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +7 -145
  42. package/dist/optimization.js +12 -3
  43. package/dist/reporting.d.ts +294 -4
  44. package/dist/reporting.js +18 -9
  45. package/dist/rl.d.ts +8 -0
  46. package/dist/rl.js +113 -0
  47. package/dist/rl.js.map +1 -0
  48. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  49. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  50. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
  51. package/dist/traces.d.ts +101 -181
  52. package/dist/traces.js +19 -8
  53. package/dist/wire/index.js +3 -3
  54. package/docs/auto-research-loop-end-to-end.md +186 -0
  55. package/docs/research-report-methodology.md +19 -4
  56. package/docs/three-package-architecture.md +180 -0
  57. package/docs/wire-protocol.md +1 -1
  58. package/package.json +7 -2
  59. package/dist/chunk-3IX6QTB7.js.map +0 -1
  60. package/dist/chunk-KRR4VMH7.js +0 -423
  61. package/dist/chunk-KRR4VMH7.js.map +0 -1
  62. package/dist/chunk-WOK2RTWG.js.map +0 -1
  63. package/dist/chunk-YUFXO3TU.js.map +0 -1
  64. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  65. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  66. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
  67. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
  68. /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
@@ -0,0 +1,573 @@
1
+ import { C as GateDecision, F as ResearchReportOptions, H as ResearchReport } from './summary-report-Ce1r4EYo.js';
2
+ import { R as RunRecord, a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata } from './run-record-DNiOMBrZ.js';
3
+ import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-Cr5YodSY.js';
4
+ import { T as TraceEmitter, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
5
+ import { T as TraceStore } from './store-u47QaJ9G.js';
6
+
7
+ /**
8
+ * LLM client with graceful degrade.
9
+ *
10
+ * OpenAI-compatible `/v1/chat/completions` client with:
11
+ * - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
12
+ * - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
13
+ * - Graceful json_schema → json_object degrade on 400 with schema-reject body.
14
+ * - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
15
+ * - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
16
+ * directly, cli-bridge subscriptions, and any router that speaks the spec.
17
+ *
18
+ * Usage:
19
+ * const { value, result } = await callLlmJson<MyType>(
20
+ * { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
21
+ * { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
22
+ * )
23
+ *
24
+ * This is THE llm-calling seam for agent-eval primitives that need structured
25
+ * output (semantic concept judge, reviewer directives, critic scores). Primitives
26
+ * that need free-form text use `callLlm` and parse output themselves.
27
+ */
28
+
29
+ interface LlmMessage {
30
+ role: 'system' | 'user' | 'assistant';
31
+ /**
32
+ * Either a plain text content string OR a multimodal content array
33
+ * (text + image_url parts) for vision-capable models.
34
+ */
35
+ content: string | Array<{
36
+ type: 'text';
37
+ text: string;
38
+ } | {
39
+ type: 'image_url';
40
+ image_url: {
41
+ url: string;
42
+ detail?: 'auto' | 'low' | 'high';
43
+ };
44
+ }>;
45
+ }
46
+ interface LlmCallRequest {
47
+ model: string;
48
+ messages: LlmMessage[];
49
+ /** Optional JSON-mode response format (response_format: json_object). */
50
+ jsonMode?: boolean;
51
+ /** Optional structured output via JSON Schema. Falls back to json_object on 400. */
52
+ jsonSchema?: {
53
+ name: string;
54
+ schema: Record<string, unknown>;
55
+ };
56
+ temperature?: number;
57
+ maxTokens?: number;
58
+ /** Per-call timeout, default 60s. */
59
+ timeoutMs?: number;
60
+ }
61
+ interface LlmUsage {
62
+ promptTokens: number;
63
+ completionTokens: number;
64
+ totalTokens: number;
65
+ /** Proxies populate this when prompt caching is on. */
66
+ cachedPromptTokens?: number;
67
+ }
68
+ interface LlmCallResult {
69
+ /** The text content of the first choice. Empty string if none. */
70
+ content: string;
71
+ usage: LlmUsage;
72
+ /**
73
+ * Cost in USD. Pulled from proxy's `_response_cost` field when present;
74
+ * `null` when neither the proxy nor the caller can derive it.
75
+ */
76
+ costUsd: number | null;
77
+ /** Model name actually used (echoed from response). */
78
+ model: string;
79
+ /** Wall-clock duration of the HTTP call (last attempt, if retried). */
80
+ durationMs: number;
81
+ /** Raw response body. */
82
+ raw: Record<string, unknown>;
83
+ }
84
+ declare class LlmCallError extends Error {
85
+ readonly status: number;
86
+ readonly body: string;
87
+ readonly model: string;
88
+ constructor(message: string, status: number, body: string, model: string);
89
+ }
90
+ interface LlmClientOptions {
91
+ /** Base URL, without a trailing slash. Must end with the `/v1` path segment (e.g. `https://router.tangle.tools/v1`). */
92
+ baseUrl?: string;
93
+ /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
94
+ apiKey?: string;
95
+ bearer?: string;
96
+ /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
97
+ authHeader?: {
98
+ name: string;
99
+ value: string;
100
+ };
101
+ /** Default timeout in ms. Per-call can override. */
102
+ defaultTimeoutMs?: number;
103
+ /** Max total attempts (the initial call plus retries) on retriable errors. Default 3 (1 initial + 2 retries). */
104
+ maxRetries?: number;
105
+ /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
106
+ fetch?: typeof fetch;
107
+ /**
108
+ * Optional raw HTTP capture sink. When provided, every request, response,
109
+ * and error (across all retry attempts) is recorded to the sink, with auth
110
+ * headers and credential-shaped body fields redacted by default. This is
111
+ * the layer-1 forensics primitive: structured `LlmSpan`s record intent,
112
+ * raw events record what actually crossed the wire.
113
+ */
114
+ rawSink?: RawProviderSink;
115
+ /**
116
+ * Logical provider id attached to raw events. When omitted, derived from
117
+ * `baseUrl` via `providerFromBaseUrl`.
118
+ */
119
+ provider?: string;
120
+ /** Trace context attached to raw events; populated by emitter-aware callers. */
121
+ traceContext?: {
122
+ runId?: string;
123
+ spanId?: string;
124
+ };
125
+ /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
126
+ redactor?: ProviderRedactor;
127
+ }
128
+ /**
129
+ * Strip a ```json / ``` code fence if the model emitted one.
130
+ * Idempotent for naked JSON. Some models (claude-code via router, certain
131
+ * deepseek models) wrap output even under json_object.
132
+ */
133
+ declare function stripFencedJson(raw: string): string;
134
+ /**
135
+ * Low-level call. Returns raw content + usage + cost. Retries on transient
136
+ * failures; does NOT degrade schema here — callers that want graceful
137
+ * degrade use `callLlmJson`.
138
+ */
139
+ declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
140
+ /**
141
+ * Structured-output call. Returns parsed JSON plus the raw result envelope.
142
+ * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
143
+ * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
144
+ * the `response_format.json_schema` shape but DO accept `json_object`.
145
+ */
146
+ declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
147
+ value: T;
148
+ result: LlmCallResult;
149
+ }>;
150
+ declare class LlmRouteAssertionError extends Error {
151
+ readonly code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
152
+ readonly baseUrl: string;
153
+ constructor(message: string, code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider', baseUrl: string);
154
+ }
155
+ interface LlmRouteRequirements {
156
+ /**
157
+ * Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
158
+ * `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
159
+ * the public/free-tier router is a defect — the launch reviewer needs to
160
+ * know exactly which provider answered.
161
+ */
162
+ requireExplicitBaseUrl?: boolean;
163
+ /**
164
+ * Allowlist of acceptable base URLs. Strings match by prefix
165
+ * (case-insensitive); RegExps test against the full base URL.
166
+ */
167
+ allowedBaseUrls?: Array<string | RegExp>;
168
+ /** Blocklist that takes precedence over `allowedBaseUrls`. */
169
+ blockedBaseUrls?: Array<string | RegExp>;
170
+ /** Throw if no auth header / api key is configured. */
171
+ requireAuth?: boolean;
172
+ /**
173
+ * Logical provider id the configured `baseUrl` is expected to match (via
174
+ * `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
175
+ */
176
+ expectedProvider?: string;
177
+ }
178
+ /**
179
+ * Fail-loud assertion that the configured LLM client points at the route
180
+ * the caller intends. Designed for the matrix-runner preflight: invoke
181
+ * once before any LLM call to catch misconfiguration before a sweep burns
182
+ * dollars on the wrong provider.
183
+ *
184
+ * Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
185
+ * from constructors and CI gates.
186
+ */
187
+ declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
188
+ /**
189
+ * Probe whether a model is reachable. Returns latency + null error on
190
+ * success; `ok=false` + error message on any failure (HTTP, timeout,
191
+ * network, parse). Designed for sweep preflights — fail loud at the
192
+ * boundary before burning a 30-leaf run on a misconfigured router.
193
+ *
194
+ * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
195
+ * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
196
+ * for short prompts, so don't tighten this further. We don't validate
197
+ * content; HTTP 200 means reachable.
198
+ */
199
+ declare function probeLlm(model: string, opts?: LlmClientOptions & {
200
+ timeoutMs?: number;
201
+ }): Promise<{
202
+ ok: boolean;
203
+ latencyMs: number;
204
+ error: string | null;
205
+ }>;
206
+ /**
207
+ * Stateful client — construct once with defaults, call many times.
208
+ * Thin wrapper around the free functions; exists for callers that want
209
+ * to inject a single configured instance into multiple primitives.
210
+ */
211
+ declare class LlmClient {
212
+ private readonly opts;
213
+ constructor(opts?: LlmClientOptions);
214
+ call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
215
+ callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
216
+ value: T;
217
+ result: LlmCallResult;
218
+ }>;
219
+ }
220
+
221
+ /**
222
+ * Researcher interface — stable hook for an external autonomous-research
223
+ * agent to drive the meta-loop.
224
+ *
225
+ * Implementations live downstream (typically in a private repo that
226
+ * runs the actual LLM). This package ships only the contract + a
227
+ * `NoopResearcher` so consumers can wire the surface without being
228
+ * forced to implement every method up front.
229
+ *
230
+ * The four methods mirror the four stages of the paper "Two Loops,
231
+ * Three Roles":
232
+ *
233
+ * inspectFailures — given the observed runs, what failure modes
234
+ * are present? (data → diagnosis)
235
+ * proposeChange — given diagnosed failure modes, what
236
+ * structural changes should we try?
237
+ * (diagnosis → plan delta)
238
+ * applyChange — fold the proposed deltas into a concrete
239
+ * experiment plan against an existing baseline.
240
+ * (plan delta → executable plan)
241
+ * evaluateChange — run the plan, return runs + the gate verdict.
242
+ * (executable plan → verdict)
243
+ *
244
+ * Composition is the discipline: a Researcher implementation MUST
245
+ * keep these four steps separate and inspectable. Conflating
246
+ * "diagnose + propose + run" into a single LLM call defeats the
247
+ * point of the framework — you can't audit which step lied.
248
+ *
249
+ * THIS INTERFACE IS STABLE. Breaking changes require a new module
250
+ * (e.g. `Researcher2`) so existing implementations keep working.
251
+ */
252
+
253
+ /** A diagnosed failure mode with the run-IDs that exhibit it. */
254
+ interface FailureMode {
255
+ /** Short machine-readable code. Must be stable across runs of the
256
+ * same researcher to enable longitudinal tracking. */
257
+ code: string;
258
+ /** Human-readable description for the paper / dashboard. */
259
+ description: string;
260
+ evidence: {
261
+ /** Run IDs (from `RunRecord.runId`) where this failure mode was
262
+ * observed. */
263
+ runIds: string[];
264
+ /** Number of run samples that informed the diagnosis. */
265
+ samples: number;
266
+ };
267
+ }
268
+ /** A single steering change the researcher wants to try. */
269
+ interface SteeringChange {
270
+ kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
271
+ /** Implementation-specific payload. Researcher implementations
272
+ * define the schema — keep this `unknown` here to avoid coupling
273
+ * the public interface to any one researcher's internal model. */
274
+ payload: unknown;
275
+ /** Why the researcher proposed this change. Goes into the audit
276
+ * trail next to the failure-mode evidence. */
277
+ rationale: string;
278
+ /** Optional self-reported expected delta on the headline metric. */
279
+ expectedDelta?: number;
280
+ }
281
+ /** A single experiment plan, mapped onto the search/holdout splits. */
282
+ interface ExperimentPlan {
283
+ baselineCandidateId: string;
284
+ proposedCandidateId: string;
285
+ changes: SteeringChange[];
286
+ /** USD ceiling for the entire experiment. The runner must stop
287
+ * before exceeding this and report a partial result. */
288
+ evaluationBudgetUsd: number;
289
+ /** Item IDs (your dataset keys) for the search vs holdout splits. */
290
+ splits: {
291
+ search: string[];
292
+ holdout: string[];
293
+ };
294
+ }
295
+ /** Result of running a plan: every run, plus the gate verdict. */
296
+ interface ExperimentResult {
297
+ plan: ExperimentPlan;
298
+ runs: RunRecord[];
299
+ gateDecision: GateDecision;
300
+ }
301
+ /**
302
+ * The researcher loop. Stable, four-step, inspectable.
303
+ *
304
+ * ┌──────────┐ inspectFailures ┌──────────┐ proposeChange ┌──────────┐
305
+ * │ runs │ ─────────────────▶│ failures │ ──────────────▶│ changes │
306
+ * └──────────┘ └──────────┘ └────┬─────┘
307
+ * │
308
+ * ▼
309
+ * ┌────────────────┐ applyChange ┌────────┐
310
+ * │ ExperimentPlan │ ◀────────────│ base │
311
+ * └────────┬───────┘ └────────┘
312
+ * │
313
+ * evaluateChange ▼
314
+ * ┌────────────────┐
315
+ * │ ExperimentResult│
316
+ * └────────────────┘
317
+ */
318
+ interface Researcher {
319
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
320
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
321
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
322
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
323
+ }
324
+ interface CallbackResearcherOptions {
325
+ inspectFailures: Researcher['inspectFailures'];
326
+ proposeChange: Researcher['proposeChange'];
327
+ applyChange: Researcher['applyChange'];
328
+ evaluateChange: Researcher['evaluateChange'];
329
+ }
330
+ /**
331
+ * Minimal concrete researcher for tests, scripts, and small integrations.
332
+ * Larger autonomous researchers can still implement `Researcher` directly.
333
+ */
334
+ declare class CallbackResearcher implements Researcher {
335
+ private readonly callbacks;
336
+ constructor(callbacks: CallbackResearcherOptions);
337
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
338
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
339
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
340
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
341
+ }
342
+ /**
343
+ * No-op researcher — fails loud on every method. Use as a placeholder
344
+ * in code paths that wire the interface but don't have an implementation
345
+ * yet. Importantly, this does NOT silently succeed: a no-op researcher
346
+ * that returned empty arrays would muffle the loop's signal that
347
+ * nobody implemented the brain.
348
+ */
349
+ declare class NoopResearcher implements Researcher {
350
+ private readonly hint;
351
+ constructor(hint?: string);
352
+ inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
353
+ proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
354
+ applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
355
+ evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
356
+ }
357
+
358
+ /**
359
+ * EvalCampaign — opinionated matrix runner that wires the four
360
+ * capture-integrity directives by construction.
361
+ *
362
+ * Every consumer that ran a launch-grade benchmark before 0.22 reinvented
363
+ * the same shape: matrix runner → for each (variant, scenario, seed) →
364
+ * start a TraceEmitter → call LLMs → end the run → maybe analyze.
365
+ * The bug class blueprint-agent reported (raw events not captured, route
366
+ * silently wrong, integrity not asserted, analyst never ran) lives at the
367
+ * integration boundary — not the agent-eval API surface. The four
368
+ * directives in `SKILL.md § Capture integrity` are mitigations.
369
+ *
370
+ * `EvalCampaign` is the structural fix. Consumers don't wire the integrity
371
+ * surface anymore; the campaign owns it. Specifically, the campaign:
372
+ *
373
+ * - calls `assertLlmRoute` once at preflight before any work runs
374
+ * - constructs a per-run `TraceStore` and `RawProviderSink` via factories
375
+ * - constructs the `TraceEmitter` with `onRunComplete: [analyst hook]`
376
+ * - hands the runner an `LlmClientOptions` pre-wired with the sink and
377
+ * trace context — the runner can't accidentally call an LLM without
378
+ * capturing the raw HTTP envelope
379
+ * - calls `assertRunCaptured` after every `endRun` and routes failures
380
+ * through a configurable policy (`throw` / `mark_failed` / `log`)
381
+ * - assembles per-run `RunRecord`s and runs `researchReport` at the end
382
+ * so the campaign artifact is launch-decision-grade by default
383
+ * - embeds the campaign fingerprint (a SHA-256 over the canonicalised
384
+ * run set) and optional `preregistrationHash` in the report
385
+ *
386
+ * The runner contract is intentionally narrow: produce a `CampaignRunOutcome`
387
+ * given a fully-wired `CampaignRunContext`. Everything orchestration-shaped
388
+ * lives in the campaign. This is the inversion-of-control point — consumers
389
+ * stop writing matrix runners and start writing scenario-runners.
390
+ *
391
+ * Out of scope for v1 (tracked in `docs/research-report-methodology.md`):
392
+ *
393
+ * - Distributed/cluster execution (concurrency is local async)
394
+ * - Adaptive sampling / sequential interim looks
395
+ * - Resume from partial state across crashes
396
+ * - LLM-call retry beyond what `LlmClient` already does
397
+ */
398
+
399
+ interface CampaignVariant<V> {
400
+ id: string;
401
+ payload: V;
402
+ }
403
+ interface CampaignScenario {
404
+ scenarioId: string;
405
+ /** Free-form metadata propagated to runs and reports. */
406
+ tags?: Record<string, string>;
407
+ }
408
+ interface CampaignRunContext<V> {
409
+ /** Stable run id. The campaign generates this; the runner does not. */
410
+ runId: string;
411
+ /** Logical experiment id (campaignId by default; overridable per-run via opts). */
412
+ experimentId: string;
413
+ variant: V;
414
+ variantId: string;
415
+ scenarioId: string;
416
+ scenarioTags: Record<string, string>;
417
+ seed: number;
418
+ splitTag: RunSplitTag;
419
+ /**
420
+ * The TraceEmitter for this run, with `onRunComplete` hooks pre-wired
421
+ * (analyst auto-execution if configured, plus integrity check). The
422
+ * runner MUST call `emitter.startRun` before doing any work and either
423
+ * `emitter.endRun` or `emitter.abortRun` before returning.
424
+ */
425
+ emitter: TraceEmitter;
426
+ store: TraceStore;
427
+ rawSink: RawProviderSink;
428
+ /**
429
+ * Pre-wired LLM client options — `rawSink` and `traceContext` are populated
430
+ * so any `callLlm(req, ctx.llmOpts)` automatically captures raw HTTP. The
431
+ * runner can spread additional fields if needed.
432
+ */
433
+ llmOpts: LlmClientOptions;
434
+ }
435
+ interface CampaignRunOutcome {
436
+ /** Did the run pass? Mirrors `RunOutcome.pass` semantics. */
437
+ pass: boolean;
438
+ /** Score for the run on its split. Maps to `searchScore` or `holdoutScore`. */
439
+ score: number;
440
+ /** Mandatory cost in USD. Report `0` together with `raw.cost_unknown = 1` only if the cost is truly unknown. */
441
+ costUsd: number;
442
+ tokenUsage: RunTokenUsage;
443
+ /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */
444
+ model: string;
445
+ /** sha256 of the effective prompt sent to the model. */
446
+ promptHash: string;
447
+ /** sha256 of the effective config (model, temperature, tools, judges, splits). */
448
+ configHash: string;
449
+ /** Optional extra numeric metrics to land in `outcome.raw`. */
450
+ raw?: Record<string, number>;
451
+ /** Optional failure-taxonomy tag if the run failed. */
452
+ failureMode?: string;
453
+ /** Optional judge metadata when a judge was used. */
454
+ judgeMetadata?: RunJudgeMetadata;
455
+ }
456
+ type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
457
+ type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';
458
+ interface EvalCampaignOptions<V> {
459
+ /**
460
+ * Stable id for the campaign. Used as the default `experimentId` on
461
+ * every run, and folded into the campaign fingerprint.
462
+ */
463
+ campaignId: string;
464
+ variants: CampaignVariant<V>[];
465
+ scenarios: CampaignScenario[];
466
+ /** Default `[0, 1, 2]`. */
467
+ seeds?: number[];
468
+ /** Default `'holdout'` — the split that anchors a launch decision. */
469
+ splitTag?: RunSplitTag;
470
+ /** Git SHA the campaign is run against. Mandatory; `RunRecord` rejects an unset value. */
471
+ commitSha: string;
472
+ /**
473
+ * LLM client config. Augmented per-run with `rawSink` and `traceContext`
474
+ * before being passed to the runner. The campaign asserts this config
475
+ * matches `routeRequirements` once at preflight.
476
+ */
477
+ llmOpts: LlmClientOptions;
478
+ /**
479
+ * Default `{ requireExplicitBaseUrl: true, requireAuth: true }` — fail
480
+ * loud if the campaign would silently fall back to the public router or
481
+ * run unauthenticated. Override with an empty object to disable.
482
+ */
483
+ routeRequirements?: LlmRouteRequirements;
484
+ /**
485
+ * Per-run TraceStore factory. Common shape: a fresh store per run keyed
486
+ * on `runId`. Implementations that share a store across the campaign
487
+ * are valid — the campaign only writes through `emitter`.
488
+ */
489
+ storeFactory: (params: CampaignFactoryParams) => TraceStore;
490
+ /**
491
+ * Per-run RawProviderSink factory. Defaults to `FileSystemRawProviderSink`
492
+ * rooted at `${workDir}/raw-events/${runId}` if `workDir` is supplied;
493
+ * otherwise required. Forensic capture is non-negotiable in a campaign
494
+ * run — pass `NoopRawProviderSink` explicitly if you want to opt out.
495
+ */
496
+ rawSinkFactory?: (params: CampaignFactoryParams) => RawProviderSink;
497
+ /**
498
+ * Filesystem root for default `rawSinkFactory`. Ignored if
499
+ * `rawSinkFactory` is supplied.
500
+ */
501
+ workDir?: string;
502
+ /**
503
+ * Extra `onRunComplete` hooks the campaign appends (after its own
504
+ * integrity-check hook). Pass `traceAnalystOnRunComplete(...)` here.
505
+ */
506
+ onRunComplete?: RunCompleteHook[];
507
+ /**
508
+ * Per-run integrity expectations. Defaults to:
509
+ * `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }`.
510
+ * Override (e.g. `{ llmSpansMin: 0 }`) for runs that don't call LLMs.
511
+ */
512
+ integrity?: RunIntegrityExpectations;
513
+ /** Behaviour when integrity fails. Default `'mark_failed'`. */
514
+ onIntegrityFailure?: CampaignIntegrityPolicy;
515
+ /**
516
+ * Per-run runner. Receives a fully-wired context; produces an outcome
517
+ * the campaign converts into a `RunRecord`.
518
+ */
519
+ runner: CampaignRunner<V>;
520
+ /**
521
+ * If set, the campaign computes `researchReport` at the end. `comparator`
522
+ * is a `variantId`. Other fields are forwarded verbatim.
523
+ */
524
+ report?: {
525
+ comparator?: string;
526
+ } & Omit<ResearchReportOptions, 'comparator' | 'preregistrationHash' | 'generatedAt'>;
527
+ /**
528
+ * Hash of a signed `HypothesisManifest` (see `pre-registration.ts`).
529
+ * Embedded in the campaign fingerprint and the research report.
530
+ */
531
+ preregistrationHash?: string;
532
+ /** Local concurrency. Default `1` (sequential). */
533
+ concurrency?: number;
534
+ /**
535
+ * Override the time source. Tests pass a mock to make wallMs deterministic.
536
+ */
537
+ now?: () => number;
538
+ /** Override the runId generator. Tests pin this. */
539
+ runId?: (params: CampaignFactoryParams) => string;
540
+ }
541
+ interface CampaignFactoryParams {
542
+ campaignId: string;
543
+ runId: string;
544
+ variantId: string;
545
+ scenarioId: string;
546
+ seed: number;
547
+ }
548
+ interface FailedRun {
549
+ runId: string;
550
+ variantId: string;
551
+ scenarioId: string;
552
+ seed: number;
553
+ reason: string;
554
+ error?: string;
555
+ }
556
+ interface EvalCampaignResult {
557
+ campaignId: string;
558
+ /** SHA-256 over canonicalised `(variantIds, scenarioIds, seeds, comparator, splitTag, baseUrl, provider, preregistrationHash)`. */
559
+ campaignFingerprint: string;
560
+ preregistrationHash: string | null;
561
+ /** Successful runs only. Failed runs land in `failedRuns`. */
562
+ runs: RunRecord[];
563
+ /** Integrity reports for every successful run. */
564
+ integrityReports: RunIntegrityReport[];
565
+ failedRuns: FailedRun[];
566
+ /** Computed when `report` is set on options. */
567
+ report?: ResearchReport;
568
+ startedAt: string;
569
+ endedAt: string;
570
+ }
571
+ declare function runEvalCampaign<V>(opts: EvalCampaignOptions<V>): Promise<EvalCampaignResult>;
572
+
573
+ export { CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, LlmCallError as m, type LlmCallRequest as n, type LlmCallResult as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v, callLlm as w, callLlmJson as x, probeLlm as y, stripFencedJson as z };
@@ -343,4 +343,4 @@ declare function controlRunToFeedbackTrajectory<TState, TAction, TActionResult>(
343
343
  createdAt?: string;
344
344
  }): FeedbackTrajectory;
345
345
 
346
- export { type ProposedSideEffect as A, allCriticalPassed as B, type ControlSeverity as C, assignFeedbackSplit as D, controlRunToFeedbackTrajectory as E, type FeedbackLabel as F, createFeedbackTrajectory as G, feedbackTrajectoriesToDatasetScenarios as H, InMemoryFeedbackTrajectoryStore as I, feedbackTrajectoriesToOptimizerRows as J, feedbackTrajectoryToDatasetScenario as K, feedbackTrajectoryToOptimizerRow as L, objectiveEval as M, parseFeedbackTrajectoriesJsonl as N, renderPreferenceMemoryMarkdown as O, type PreferenceMemoryEntry as P, replayFeedbackTrajectories as Q, replayFeedbackTrajectory as R, type StopDecision as S, runAgentControlLoop as T, serializeFeedbackTrajectoriesJsonl as U, stopOnNoProgress as V, stopOnRepeatedAction as W, subjectiveEval as X, summarizePreferenceMemory as Y, withAssignedFeedbackSplit as Z, type FeedbackTrajectoryStore as a, type FeedbackTrajectory as b, type ControlEvalResult as c, type ControlActionFailureMode as d, type ControlActionOutcome as e, type ControlBudget as f, type ControlContext as g, type ControlDecision as h, type ControlRunResult as i, type ControlRuntimeConfig as j, type ControlRuntimeError as k, type ControlStep as l, type ControlStopPolicies as m, type FeedbackArtifactType as n, type FeedbackAttempt as o, type FeedbackLabelKind as p, type FeedbackLabelSource as q, type FeedbackOptimizerRow as r, type FeedbackOutcome as s, type FeedbackReplayAdapter as t, type FeedbackReplayResult as u, type FeedbackSeverity as v, type FeedbackSplitPolicy as w, type FeedbackTask as x, type FeedbackTrajectoryFilter as y, FileSystemFeedbackTrajectoryStore as z };
346
+ export { replayFeedbackTrajectory as A, serializeFeedbackTrajectoriesJsonl as B, summarizePreferenceMemory as C, withAssignedFeedbackSplit as D, type ControlSeverity as E, type FeedbackArtifactType as F, type ControlEvalResult as G, type ControlActionFailureMode as H, InMemoryFeedbackTrajectoryStore as I, type ControlActionOutcome as J, type ControlBudget as K, type ControlContext as L, type ControlDecision as M, type ControlRunResult as N, type ControlRuntimeConfig as O, type PreferenceMemoryEntry as P, type ControlRuntimeError as Q, type ControlStep as R, type ControlStopPolicies as S, type StopDecision as T, allCriticalPassed as U, objectiveEval as V, runAgentControlLoop as W, stopOnNoProgress as X, stopOnRepeatedAction as Y, subjectiveEval as Z, type FeedbackAttempt as a, type FeedbackLabel as b, type FeedbackLabelKind as c, type FeedbackLabelSource as d, type FeedbackOptimizerRow as e, type FeedbackOutcome as f, type FeedbackReplayAdapter as g, type FeedbackReplayResult as h, type FeedbackSeverity as i, type FeedbackSplitPolicy as j, type FeedbackTask as k, type FeedbackTrajectory as l, type FeedbackTrajectoryFilter as m, type FeedbackTrajectoryStore as n, FileSystemFeedbackTrajectoryStore as o, type ProposedSideEffect as p, assignFeedbackSplit as q, controlRunToFeedbackTrajectory as r, createFeedbackTrajectory as s, feedbackTrajectoriesToDatasetScenarios as t, feedbackTrajectoriesToOptimizerRows as u, feedbackTrajectoryToDatasetScenario as v, feedbackTrajectoryToOptimizerRow as w, parseFeedbackTrajectoriesJsonl as x, renderPreferenceMemoryMarkdown as y, replayFeedbackTrajectories as z };
@@ -1,4 +1,4 @@
1
- import { R as RunSplitTag } from './run-record-CX_jcAyr.js';
1
+ import { a as RunSplitTag } from './run-record-DNiOMBrZ.js';
2
2
 
3
3
  /**
4
4
  * Shared types for the reference benchmark wrappers under