@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +102 -1
  2. package/README.md +4 -0
  3. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  6. package/dist/chunk-6M774GY6.js +53 -0
  7. package/dist/chunk-6M774GY6.js.map +1 -0
  8. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  9. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  10. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  11. package/dist/chunk-QUKKGHTZ.js +121 -0
  12. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  13. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  14. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  15. package/dist/chunk-UAND2LOT.js +738 -0
  16. package/dist/chunk-UAND2LOT.js.map +1 -0
  17. package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
  18. package/dist/chunk-USHQBPMH.js.map +1 -0
  19. package/dist/cli.js +3 -3
  20. package/dist/index.d.ts +10 -284
  21. package/dist/index.js +39 -19
  22. package/dist/index.js.map +1 -1
  23. package/dist/integrity-K2oVlF57.d.ts +210 -0
  24. package/dist/openapi.json +1 -1
  25. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  26. package/dist/optimization.d.ts +6 -144
  27. package/dist/optimization.js +9 -2
  28. package/dist/reporting-B82RSv9C.d.ts +593 -0
  29. package/dist/reporting.d.ts +2 -2
  30. package/dist/reporting.js +15 -8
  31. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  32. package/dist/traces.d.ts +101 -181
  33. package/dist/traces.js +16 -5
  34. package/dist/wire/index.js +3 -3
  35. package/docs/research-report-methodology.md +19 -4
  36. package/docs/wire-protocol.md +1 -1
  37. package/package.json +2 -2
  38. package/dist/chunk-3IX6QTB7.js.map +0 -1
  39. package/dist/chunk-HRZELXCR.js.map +0 -1
  40. package/dist/chunk-KRR4VMH7.js +0 -423
  41. package/dist/chunk-KRR4VMH7.js.map +0 -1
  42. package/dist/chunk-WOK2RTWG.js.map +0 -1
  43. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  44. package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  45. package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
@@ -0,0 +1,574 @@
1
+ import { d as RawProviderSink, P as ProviderRedactor, g as RunIntegrityExpectations, j as RunIntegrityReport } from './integrity-K2oVlF57.js';
2
+ import { T as TraceEmitter, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
3
+ import { T as TraceStore } from './store-u47QaJ9G.js';
4
+ import { a as RunRecord, R as RunSplitTag, e as RunTokenUsage, b as RunJudgeMetadata } from './run-record-CX_jcAyr.js';
5
+ import { k as GateDecision, $ as ResearchReportOptions, X as ResearchReport } from './summary-report-D4p7RlDu.js';
6
+ import './feedback-trajectory-CB0A32o3.js';
7
+
8
+ /**
9
+ * LLM client with graceful degrade.
10
+ *
11
+ * OpenAI-compatible `/v1/chat/completions` client with:
12
+ * - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
13
+ * - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
14
+ * - Graceful json_schema → json_object degrade on 400 with schema-reject body.
15
+ * - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
16
+ * - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
17
+ * directly, cli-bridge subscriptions, and any router that speaks the spec.
18
+ *
19
+ * Usage:
20
+ * const { value, result } = await callLlmJson<MyType>(
21
+ * { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
22
+ * { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
23
+ * )
24
+ *
25
+ * This is THE llm-calling seam for agent-eval primitives that need structured
26
+ * output (semantic concept judge, reviewer directives, critic scores). Primitives
27
+ * that need free-form text use `callLlm` and parse output themselves.
28
+ */
29
+
30
+ interface LlmMessage {
31
+ role: 'system' | 'user' | 'assistant';
32
+ /**
33
+ * Either a plain text content string OR a multimodal content array
34
+ * (text + image_url parts) for vision-capable models.
35
+ */
36
+ content: string | Array<{
37
+ type: 'text';
38
+ text: string;
39
+ } | {
40
+ type: 'image_url';
41
+ image_url: {
42
+ url: string;
43
+ detail?: 'auto' | 'low' | 'high';
44
+ };
45
+ }>;
46
+ }
47
+ interface LlmCallRequest {
48
+ model: string;
49
+ messages: LlmMessage[];
50
+ /** Optional JSON-mode response format (response_format: json_object). */
51
+ jsonMode?: boolean;
52
+ /** Optional structured output via JSON Schema. `callLlmJson` falls back to json_object when the provider rejects the schema with a 400; `callLlm` does not degrade. */
53
+ jsonSchema?: {
54
+ name: string;
55
+ schema: Record<string, unknown>;
56
+ };
57
+ temperature?: number;
58
+ maxTokens?: number;
59
+ /** Per-call timeout, default 60s. */
60
+ timeoutMs?: number;
61
+ }
62
+ interface LlmUsage {
63
+ promptTokens: number;
64
+ completionTokens: number;
65
+ totalTokens: number;
66
+ /** Proxies populate this when prompt caching is on. */
67
+ cachedPromptTokens?: number;
68
+ }
69
+ interface LlmCallResult {
70
+ /** The text content of the first choice. Empty string if none. */
71
+ content: string;
72
+ usage: LlmUsage;
73
+ /**
74
+ * Cost in USD. Pulled from proxy's `_response_cost` field when present;
75
+ * `null` when neither the proxy nor the caller can derive it.
76
+ */
77
+ costUsd: number | null;
78
+ /** Model name actually used (echoed from response). */
79
+ model: string;
80
+ /** Wall-clock duration of the HTTP call (last attempt, if retried). */
81
+ durationMs: number;
82
+ /** Raw response body. */
83
+ raw: Record<string, unknown>;
84
+ }
85
+ declare class LlmCallError extends Error {
86
+ readonly status: number;
87
+ readonly body: string;
88
+ readonly model: string;
89
+ constructor(message: string, status: number, body: string, model: string);
90
+ }
91
+ interface LlmClientOptions {
92
+ /** Base URL (without trailing slash). Must end at the `/v1` prefix. */
93
+ baseUrl?: string;
94
+ /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
95
+ apiKey?: string;
96
+ bearer?: string;
97
+ /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
98
+ authHeader?: {
99
+ name: string;
100
+ value: string;
101
+ };
102
+ /** Default timeout in ms. Per-call can override. */
103
+ defaultTimeoutMs?: number;
104
+ /** Max total attempts on retriable errors, counting the initial call. Default 3 (1 initial + 2 retries). */
105
+ maxRetries?: number;
106
+ /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
107
+ fetch?: typeof fetch;
108
+ /**
109
+ * Optional raw HTTP capture sink. When provided, every request, response,
110
+ * and error (across all retry attempts) is recorded to the sink, with auth
111
+ * headers and credential-shaped body fields redacted by default. This is
112
+ * the layer-1 forensics primitive: structured `LlmSpan`s record intent,
113
+ * raw events record what actually crossed the wire.
114
+ */
115
+ rawSink?: RawProviderSink;
116
+ /**
117
+ * Logical provider id attached to raw events. When omitted, derived from
118
+ * `baseUrl` via `providerFromBaseUrl`.
119
+ */
120
+ provider?: string;
121
+ /** Trace context attached to raw events; populated by emitter-aware callers. */
122
+ traceContext?: {
123
+ runId?: string;
124
+ spanId?: string;
125
+ };
126
+ /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
127
+ redactor?: ProviderRedactor;
128
+ }
129
+ /**
130
+ * Strip a ```json / ``` code fence if the model emitted one.
131
+ * Idempotent for naked JSON. Some models (claude-code via router, certain
132
+ * deepseek models) wrap output even under json_object.
133
+ */
134
+ declare function stripFencedJson(raw: string): string;
135
+ /**
136
+ * Low-level call. Returns raw content + usage + cost. Retries on transient
137
+ * failures; does NOT degrade schema here — callers that want graceful
138
+ * degrade use `callLlmJson`.
139
+ */
140
+ declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
141
+ /**
142
+ * Structured-output call. Returns parsed JSON plus the raw result envelope.
143
+ * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
144
+ * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
145
+ * the `response_format.json_schema` shape but DO accept `json_object`.
146
+ */
147
+ declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
148
+ value: T;
149
+ result: LlmCallResult;
150
+ }>;
151
+ declare class LlmRouteAssertionError extends Error {
152
+ readonly code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
153
+ readonly baseUrl: string;
154
+ constructor(message: string, code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider', baseUrl: string);
155
+ }
156
+ interface LlmRouteRequirements {
157
+ /**
158
+ * Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
159
+ * `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
160
+ * the public/free-tier router is a defect — the launch reviewer needs to
161
+ * know exactly which provider answered.
162
+ */
163
+ requireExplicitBaseUrl?: boolean;
164
+ /**
165
+ * Allowlist of acceptable base URLs. Strings match by prefix
166
+ * (case-insensitive); RegExps test against the full base URL.
167
+ */
168
+ allowedBaseUrls?: Array<string | RegExp>;
169
+ /** Blocklist that takes precedence over `allowedBaseUrls`. */
170
+ blockedBaseUrls?: Array<string | RegExp>;
171
+ /** Throw if no auth header / api key is configured. */
172
+ requireAuth?: boolean;
173
+ /**
174
+ * Logical provider id the configured `baseUrl` is expected to match (via
175
+ * `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
176
+ */
177
+ expectedProvider?: string;
178
+ }
179
+ /**
180
+ * Fail-loud assertion that the configured LLM client points at the route
181
+ * the caller intends. Designed for the matrix-runner preflight: invoke
182
+ * once before any LLM call to catch misconfiguration before a sweep burns
183
+ * dollars on the wrong provider.
184
+ *
185
+ * Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
186
+ * from constructors and CI gates.
187
+ */
188
+ declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
189
+ /**
190
+ * Probe whether a model is reachable. Returns `ok=true`, latency, and a null
191
+ * error on success; `ok=false` + error message on any failure (HTTP, timeout,
192
+ * network, parse). Designed for sweep preflights — fail loud at the
193
+ * boundary before burning a 30-leaf run on a misconfigured router.
194
+ *
195
+ * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
196
+ * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
197
+ * for short prompts, so don't tighten this further. We don't validate
198
+ * content; HTTP 200 means reachable.
199
+ */
200
+ declare function probeLlm(model: string, opts?: LlmClientOptions & {
201
+ timeoutMs?: number;
202
+ }): Promise<{
203
+ ok: boolean;
204
+ latencyMs: number;
205
+ error: string | null;
206
+ }>;
207
+ /**
208
+ * Stateful client — construct once with defaults, call many times.
209
+ * Thin wrapper around the free functions; exists for callers that want
210
+ * to inject a single configured instance into multiple primitives.
211
+ */
212
+ declare class LlmClient {
213
+ private readonly opts;
214
+ constructor(opts?: LlmClientOptions);
215
+ call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
216
+ callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
217
+ value: T;
218
+ result: LlmCallResult;
219
+ }>;
220
+ }
221
+
222
+ /**
223
+ * Researcher interface — stable hook for an external autonomous-research
224
+ * agent to drive the meta-loop.
225
+ *
226
+ * Implementations live downstream (typically in a private repo that
227
+ * runs the actual LLM). This package ships only the contract + a
228
+ * `NoopResearcher` so consumers can wire the surface without being
229
+ * forced to implement every method up front.
230
+ *
231
+ * The four methods mirror the four stages of the paper "Two Loops,
232
+ * Three Roles":
233
+ *
234
+ * inspectFailures — given the observed runs, what failure modes
235
+ * are present? (data → diagnosis)
236
+ * proposeChange — given diagnosed failure modes, what
237
+ * structural changes should we try?
238
+ * (diagnosis → plan delta)
239
+ * applyChange — fold the proposed deltas into a concrete
240
+ * experiment plan against an existing baseline.
241
+ * (plan delta → executable plan)
242
+ * evaluateChange — run the plan, return runs + the gate verdict.
243
+ * (executable plan → verdict)
244
+ *
245
+ * Composition is the discipline: a Researcher implementation MUST
246
+ * keep these four steps separate and inspectable. Conflating
247
+ * "diagnose + propose + run" into a single LLM call defeats the
248
+ * point of the framework — you can't audit which step lied.
249
+ *
250
+ * THIS INTERFACE IS STABLE. Breaking changes require a new module
251
+ * (e.g. `Researcher2`) so existing implementations keep working.
252
+ */
253
+
254
+ /** A diagnosed failure mode with the run-IDs that exhibit it. */
255
+ interface FailureMode {
256
+ /** Short machine-readable code. Must be stable across runs of the
257
+ * same researcher to enable longitudinal tracking. */
258
+ code: string;
259
+ /** Human-readable description for the paper / dashboard. */
260
+ description: string;
261
+ evidence: {
262
+ /** Run IDs (from `RunRecord.runId`) where this failure mode was
263
+ * observed. */
264
+ runIds: string[];
265
+ /** Number of run samples that informed the diagnosis. */
266
+ samples: number;
267
+ };
268
+ }
269
+ /** A single steering change the researcher wants to try. */
270
+ interface SteeringChange {
271
+ kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
272
+ /** Implementation-specific payload. Researcher implementations
273
+ * define the schema — keep this `unknown` here to avoid coupling
274
+ * the public interface to any one researcher's internal model. */
275
+ payload: unknown;
276
+ /** Why the researcher proposed this change. Goes into the audit
277
+ * trail next to the failure-mode evidence. */
278
+ rationale: string;
279
+ /** Optional self-reported expected delta on the headline metric. */
280
+ expectedDelta?: number;
281
+ }
282
+ /** A single experiment plan, mapped onto the search/holdout splits. */
283
+ interface ExperimentPlan {
284
+ baselineCandidateId: string;
285
+ proposedCandidateId: string;
286
+ changes: SteeringChange[];
287
+ /** USD ceiling for the entire experiment. The runner must stop
288
+ * before exceeding this and report a partial result. */
289
+ evaluationBudgetUsd: number;
290
+ /** Item IDs (your dataset keys) for the search vs holdout splits. */
291
+ splits: {
292
+ search: string[];
293
+ holdout: string[];
294
+ };
295
+ }
296
+ /** Result of running a plan: every run, plus the gate verdict. */
297
+ interface ExperimentResult {
298
+ plan: ExperimentPlan;
299
+ runs: RunRecord[];
300
+ gateDecision: GateDecision;
301
+ }
302
+ /**
303
+ * The researcher loop. Stable, four-step, inspectable.
304
+ *
305
+ *   ┌──────────┐  inspectFailures  ┌──────────┐  proposeChange ┌──────────┐
306
+ *   │   runs   │ ─────────────────▶│ failures │ ──────────────▶│ changes  │
307
+ *   └──────────┘                   └──────────┘                └────┬─────┘
308
+ *                                                                   │
309
+ *                                                                   ▼
310
+ *                              ┌────────────────┐  applyChange ┌────────┐
311
+ *                              │ ExperimentPlan │ ◀────────────│  base  │
312
+ *                              └────────┬───────┘              └────────┘
313
+ *                                       │
314
+ *                        evaluateChange ▼
315
+ *                              ┌────────────────┐
316
+ *                              │ExperimentResult│
317
+ *                              └────────────────┘
318
+ */
319
+ interface Researcher {
320
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
321
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
322
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
323
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
324
+ }
325
+ interface CallbackResearcherOptions {
326
+ inspectFailures: Researcher['inspectFailures'];
327
+ proposeChange: Researcher['proposeChange'];
328
+ applyChange: Researcher['applyChange'];
329
+ evaluateChange: Researcher['evaluateChange'];
330
+ }
331
+ /**
332
+ * Minimal concrete researcher for tests, scripts, and small integrations.
333
+ * Larger autonomous researchers can still implement `Researcher` directly.
334
+ */
335
+ declare class CallbackResearcher implements Researcher {
336
+ private readonly callbacks;
337
+ constructor(callbacks: CallbackResearcherOptions);
338
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
339
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
340
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
341
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
342
+ }
343
+ /**
344
+ * No-op researcher — fails loud on every method. Use as a placeholder
345
+ * in code paths that wire the interface but don't have an implementation
346
+ * yet. Importantly, this does NOT silently succeed: a no-op researcher
347
+ * that returned empty arrays would muffle the loop's signal that
348
+ * nobody implemented the brain.
349
+ */
350
+ declare class NoopResearcher implements Researcher {
351
+ private readonly hint;
352
+ constructor(hint?: string);
353
+ inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
354
+ proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
355
+ applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
356
+ evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
357
+ }
358
+
359
+ /**
360
+ * EvalCampaign — opinionated matrix runner that wires the four
361
+ * capture-integrity directives by construction.
362
+ *
363
+ * Every consumer that ran a launch-grade benchmark before 0.22 reinvented
364
+ * the same shape: matrix runner → for each (variant, scenario, seed) →
365
+ * start a TraceEmitter → call LLMs → end the run → maybe analyze.
366
+ * The bug class blueprint-agent reported (raw events not captured, route
367
+ * silently wrong, integrity not asserted, analyst never ran) lives at the
368
+ * integration boundary — not the agent-eval API surface. The four
369
+ * directives in `SKILL.md § Capture integrity` are mitigations.
370
+ *
371
+ * `EvalCampaign` is the structural fix. Consumers don't wire the integrity
372
+ * surface anymore; the campaign owns it. Specifically, the campaign:
373
+ *
374
+ * - calls `assertLlmRoute` once at preflight before any work runs
375
+ * - constructs a per-run `TraceStore` and `RawProviderSink` via factories
376
+ * - constructs the `TraceEmitter` with `onRunComplete: [analyst hook]`
377
+ * - hands the runner an `LlmClientOptions` pre-wired with the sink and
378
+ * trace context — the runner can't accidentally call an LLM without
379
+ * capturing the raw HTTP envelope
380
+ * - calls `assertRunCaptured` after every `endRun` and routes failures
381
+ * through a configurable policy (`throw` / `mark_failed` / `log`)
382
+ * - assembles per-run `RunRecord`s and, when `report` is configured, runs
383
+ * `researchReport` at the end so the campaign artifact is launch-decision-grade
384
+ * - embeds the campaign fingerprint (a SHA-256 over the canonicalised
385
+ * run set) and optional `preregistrationHash` in the report
386
+ *
387
+ * The runner contract is intentionally narrow: produce a `CampaignRunOutcome`
388
+ * given a fully-wired `CampaignRunContext`. Everything orchestration-shaped
389
+ * lives in the campaign. This is the inversion-of-control point — consumers
390
+ * stop writing matrix runners and start writing scenario-runners.
391
+ *
392
+ * Out of scope for v1 (tracked in `docs/research-report-methodology.md`):
393
+ *
394
+ * - Distributed/cluster execution (concurrency is local async)
395
+ * - Adaptive sampling / sequential interim looks
396
+ * - Resume from partial state across crashes
397
+ * - LLM-call retry beyond what `LlmClient` already does
398
+ */
399
+
400
+ interface CampaignVariant<V> {
401
+ id: string;
402
+ payload: V;
403
+ }
404
+ interface CampaignScenario {
405
+ scenarioId: string;
406
+ /** Free-form metadata propagated to runs and reports. */
407
+ tags?: Record<string, string>;
408
+ }
409
+ interface CampaignRunContext<V> {
410
+ /** Stable run id. The campaign generates this; the runner does not. */
411
+ runId: string;
412
+ /** Logical experiment id (campaignId by default; overridable per-run via opts). */
413
+ experimentId: string;
414
+ variant: V;
415
+ variantId: string;
416
+ scenarioId: string;
417
+ scenarioTags: Record<string, string>;
418
+ seed: number;
419
+ splitTag: RunSplitTag;
420
+ /**
421
+ * The TraceEmitter for this run, with `onRunComplete` hooks pre-wired
422
+ * (analyst auto-execution if configured, plus integrity check). The
423
+ * runner MUST call `emitter.startRun` before doing any work and either
424
+ * `emitter.endRun` or `emitter.abortRun` before returning.
425
+ */
426
+ emitter: TraceEmitter;
427
+ store: TraceStore;
428
+ rawSink: RawProviderSink;
429
+ /**
430
+ * Pre-wired LLM client options — `rawSink` and `traceContext` are populated
431
+ * so any `callLlm(req, ctx.llmOpts)` automatically captures raw HTTP. The
432
+ * runner can spread additional fields if needed.
433
+ */
434
+ llmOpts: LlmClientOptions;
435
+ }
436
+ interface CampaignRunOutcome {
437
+ /** Did the run pass? Mirrors `RunOutcome.pass` semantics. */
438
+ pass: boolean;
439
+ /** Score for the run on its split. Maps to `searchScore` or `holdoutScore`. */
440
+ score: number;
441
+ /** Mandatory cost in USD. Report `0` together with `raw.cost_unknown = 1` only if the cost is truly unknown. */
442
+ costUsd: number;
443
+ tokenUsage: RunTokenUsage;
444
+ /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */
445
+ model: string;
446
+ /** sha256 of the effective prompt sent to the model. */
447
+ promptHash: string;
448
+ /** sha256 of the effective config (model, temperature, tools, judges, splits). */
449
+ configHash: string;
450
+ /** Optional extra numeric metrics to land in `outcome.raw`. */
451
+ raw?: Record<string, number>;
452
+ /** Optional failure-taxonomy tag if the run failed. */
453
+ failureMode?: string;
454
+ /** Optional judge metadata when a judge was used. */
455
+ judgeMetadata?: RunJudgeMetadata;
456
+ }
457
+ type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
458
+ type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';
459
+ interface EvalCampaignOptions<V> {
460
+ /**
461
+ * Stable id for the campaign. Used as the default `experimentId` on
462
+ * every run, and folded into the campaign fingerprint.
463
+ */
464
+ campaignId: string;
465
+ variants: CampaignVariant<V>[];
466
+ scenarios: CampaignScenario[];
467
+ /** Default `[0, 1, 2]`. */
468
+ seeds?: number[];
469
+ /** Default `'holdout'` — the split that anchors a launch decision. */
470
+ splitTag?: RunSplitTag;
471
+ /** Git SHA the campaign is run against. Mandatory; `RunRecord` rejects an unset value. */
472
+ commitSha: string;
473
+ /**
474
+ * LLM client config. Augmented per-run with `rawSink` and `traceContext`
475
+ * before being passed to the runner. The campaign asserts this config
476
+ * matches `routeRequirements` once at preflight.
477
+ */
478
+ llmOpts: LlmClientOptions;
479
+ /**
480
+ * Default `{ requireExplicitBaseUrl: true, requireAuth: true }` — fail
481
+ * loud if the campaign would silently fall back to the public router or
482
+ * run unauthenticated. Override with an empty object to disable.
483
+ */
484
+ routeRequirements?: LlmRouteRequirements;
485
+ /**
486
+ * Per-run TraceStore factory. Common shape: a fresh store per run keyed
487
+ * on `runId`. Implementations that share a store across the campaign
488
+ * are valid — the campaign only writes through `emitter`.
489
+ */
490
+ storeFactory: (params: CampaignFactoryParams) => TraceStore;
491
+ /**
492
+ * Per-run RawProviderSink factory. Defaults to `FileSystemRawProviderSink`
493
+ * rooted at `${workDir}/raw-events/${runId}` if `workDir` is supplied;
494
+ * otherwise this factory is required. Forensic capture is non-negotiable in a campaign
495
+ * run — pass `NoopRawProviderSink` explicitly if you want to opt out.
496
+ */
497
+ rawSinkFactory?: (params: CampaignFactoryParams) => RawProviderSink;
498
+ /**
499
+ * Filesystem root for default `rawSinkFactory`. Ignored if
500
+ * `rawSinkFactory` is supplied.
501
+ */
502
+ workDir?: string;
503
+ /**
504
+ * Extra `onRunComplete` hooks the campaign appends (after its own
505
+ * integrity-check hook). Pass `traceAnalystOnRunComplete(...)` here.
506
+ */
507
+ onRunComplete?: RunCompleteHook[];
508
+ /**
509
+ * Per-run integrity expectations. Defaults to:
510
+ * `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }`.
511
+ * Override (e.g. `{ llmSpansMin: 0 }`) for runs that don't call LLMs.
512
+ */
513
+ integrity?: RunIntegrityExpectations;
514
+ /** Behaviour when integrity fails. Default `'mark_failed'`. */
515
+ onIntegrityFailure?: CampaignIntegrityPolicy;
516
+ /**
517
+ * Per-run runner. Receives a fully-wired context; produces an outcome
518
+ * the campaign converts into a `RunRecord`.
519
+ */
520
+ runner: CampaignRunner<V>;
521
+ /**
522
+ * If set, the campaign computes `researchReport` at the end. `comparator`
523
+ * is a `variantId`. Other fields are forwarded verbatim.
524
+ */
525
+ report?: {
526
+ comparator?: string;
527
+ } & Omit<ResearchReportOptions, 'comparator' | 'preregistrationHash' | 'generatedAt'>;
528
+ /**
529
+ * Hash of a signed `HypothesisManifest` (see `pre-registration.ts`).
530
+ * Embedded in the campaign fingerprint and the research report.
531
+ */
532
+ preregistrationHash?: string;
533
+ /** Local concurrency. Default `1` (sequential). */
534
+ concurrency?: number;
535
+ /**
536
+ * Override the time source. Tests pass a mock to make wallMs deterministic.
537
+ */
538
+ now?: () => number;
539
+ /** Override the runId generator. Tests pin this. */
540
+ runId?: (params: CampaignFactoryParams) => string;
541
+ }
542
+ interface CampaignFactoryParams {
543
+ campaignId: string;
544
+ runId: string;
545
+ variantId: string;
546
+ scenarioId: string;
547
+ seed: number;
548
+ }
549
+ interface FailedRun {
550
+ runId: string;
551
+ variantId: string;
552
+ scenarioId: string;
553
+ seed: number;
554
+ reason: string;
555
+ error?: string;
556
+ }
557
+ interface EvalCampaignResult {
558
+ campaignId: string;
559
+ /** SHA-256 over canonicalised `(variantIds, scenarioIds, seeds, comparator, splitTag, baseUrl, provider, preregistrationHash)`. */
560
+ campaignFingerprint: string;
561
+ preregistrationHash: string | null;
562
+ /** Successful runs only. Failed runs land in `failedRuns`. */
563
+ runs: RunRecord[];
564
+ /** Integrity reports for every successful run. */
565
+ integrityReports: RunIntegrityReport[];
566
+ failedRuns: FailedRun[];
567
+ /** Computed when `report` is set on options. */
568
+ report?: ResearchReport;
569
+ startedAt: string;
570
+ endedAt: string;
571
+ }
572
+ declare function runEvalCampaign<V>(opts: EvalCampaignOptions<V>): Promise<EvalCampaignResult>;
573
+
574
+ export { CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, LlmCallError as m, type LlmCallRequest as n, type LlmCallResult as o, LlmClient as p, type LlmMessage as q, LlmRouteAssertionError as r, type LlmRouteRequirements as s, type LlmUsage as t, assertLlmRoute as u, callLlm as v, callLlmJson as w, probeLlm as x, runEvalCampaign as y, stripFencedJson as z };