@tangle-network/agent-eval 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -1
- package/README.md +4 -0
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/chunk-UAND2LOT.js +738 -0
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/index.d.ts +10 -284
- package/dist/index.js +39 -19
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +6 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +15 -8
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +16 -5
- package/dist/wire/index.js +3 -3
- package/docs/research-report-methodology.md +19 -4
- package/docs/wire-protocol.md +1 -1
- package/package.json +2 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-HRZELXCR.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
|
@@ -0,0 +1,574 @@
|
|
|
1
|
+
import { d as RawProviderSink, P as ProviderRedactor, g as RunIntegrityExpectations, j as RunIntegrityReport } from './integrity-K2oVlF57.js';
|
|
2
|
+
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
|
|
3
|
+
import { T as TraceStore } from './store-u47QaJ9G.js';
|
|
4
|
+
import { a as RunRecord, R as RunSplitTag, e as RunTokenUsage, b as RunJudgeMetadata } from './run-record-CX_jcAyr.js';
|
|
5
|
+
import { k as GateDecision, $ as ResearchReportOptions, X as ResearchReport } from './summary-report-D4p7RlDu.js';
|
|
6
|
+
import './feedback-trajectory-CB0A32o3.js';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* LLM client with graceful degrade.
|
|
10
|
+
*
|
|
11
|
+
* OpenAI-compatible `/v1/chat/completions` client with:
|
|
12
|
+
* - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
|
|
13
|
+
* - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
|
|
14
|
+
* - Graceful json_schema → json_object degrade on 400 with schema-reject body.
|
|
15
|
+
* - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
|
|
16
|
+
* - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
|
|
17
|
+
* directly, cli-bridge subscriptions, and any router that speaks the spec.
|
|
18
|
+
*
|
|
19
|
+
* Usage:
|
|
20
|
+
* const { value, result } = await callLlmJson<MyType>(
|
|
21
|
+
* { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
|
|
22
|
+
* { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
|
|
23
|
+
* )
|
|
24
|
+
*
|
|
25
|
+
* This is THE llm-calling seam for agent-eval primitives that need structured
|
|
26
|
+
* output (semantic concept judge, reviewer directives, critic scores). Primitives
|
|
27
|
+
* that need free-form text use `callLlm` and parse output themselves.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/**
 * One chat message in an `LlmCallRequest.messages` array.
 */
interface LlmMessage {
    /** Speaker of the message. */
    role: 'system' | 'user' | 'assistant';
    /**
     * Message body: either a plain text string, or a multimodal array of
     * text and `image_url` parts for vision-capable models.
     */
    content: string | Array<{
        type: 'text';
        text: string;
    } | {
        type: 'image_url';
        image_url: {
            url: string;
            detail?: 'auto' | 'low' | 'high';
        };
    }>;
}
|
|
47
|
+
/**
 * Parameters of a single LLM call.
 */
interface LlmCallRequest {
    /** Model identifier to request. */
    model: string;
    /** Conversation to send, in order. */
    messages: LlmMessage[];
    /** Optional JSON-mode response format (`response_format: json_object`). */
    jsonMode?: boolean;
    /** Optional structured output via JSON Schema. Falls back to `json_object` on a 400. */
    jsonSchema?: {
        name: string;
        schema: Record<string, unknown>;
    };
    /** NOTE(review): presumably the sampling temperature passed to the provider — confirm in the implementation. */
    temperature?: number;
    /** NOTE(review): presumably the completion token cap passed to the provider — confirm in the implementation. */
    maxTokens?: number;
    /** Per-call timeout in milliseconds. Defaults to 60s. */
    timeoutMs?: number;
}
|
|
62
|
+
/**
 * Token accounting reported for one LLM call.
 */
interface LlmUsage {
    promptTokens: number;
    completionTokens: number;
    totalTokens: number;
    /** Populated by proxies when prompt caching is on. */
    cachedPromptTokens?: number;
}
|
|
69
|
+
/**
 * Result envelope returned by a successful LLM call.
 */
interface LlmCallResult {
    /** Text content of the first choice. Empty string if there is none. */
    content: string;
    /** Token usage for the call. */
    usage: LlmUsage;
    /**
     * Cost in USD. Taken from the proxy's `_response_cost` field when
     * present; `null` when neither the proxy nor the caller can derive it.
     */
    costUsd: number | null;
    /** Model name actually used (echoed from the response). */
    model: string;
    /** Wall-clock duration of the HTTP call (the last attempt, if retried). */
    durationMs: number;
    /** Raw response body. */
    raw: Record<string, unknown>;
}
|
|
85
|
+
/**
 * Error carrying the HTTP status, response body and model name of an LLM
 * call.
 *
 * NOTE(review): the conditions under which this is thrown are not visible
 * in this declaration — confirm against the implementation.
 */
declare class LlmCallError extends Error {
    /** HTTP status associated with the error. */
    readonly status: number;
    /** Response body text associated with the error. */
    readonly body: string;
    /** Model the call was made for. */
    readonly model: string;
    constructor(message: string, status: number, body: string, model: string);
}
|
|
91
|
+
/**
 * Transport, auth, retry and capture configuration for LLM calls.
 */
interface LlmClientOptions {
    /** Base URL without a trailing slash. Must end at the `/v1` prefix. */
    baseUrl?: string;
    /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
    apiKey?: string;
    /** Bearer token — alternative to `apiKey`; populates the same header. */
    bearer?: string;
    /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over `apiKey`/`bearer`. */
    authHeader?: {
        name: string;
        value: string;
    };
    /** Default timeout in ms. A per-call `timeoutMs` can override it. */
    defaultTimeoutMs?: number;
    /**
     * Maximum number of attempts on retriable errors, counting the initial
     * call. Default 3 (1 initial + 2 retries).
     */
    maxRetries?: number;
    /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
    fetch?: typeof fetch;
    /**
     * Optional raw HTTP capture sink. When provided, every request,
     * response and error (across all retry attempts) is recorded to the
     * sink, with auth headers and credential-shaped body fields redacted by
     * default. This is the layer-1 forensics primitive: structured
     * `LlmSpan`s record intent, raw events record what actually crossed
     * the wire.
     */
    rawSink?: RawProviderSink;
    /**
     * Logical provider id attached to raw events. When omitted, it is
     * derived from `baseUrl` via `providerFromBaseUrl`.
     */
    provider?: string;
    /** Trace context attached to raw events; populated by emitter-aware callers. */
    traceContext?: {
        runId?: string;
        spanId?: string;
    };
    /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
    redactor?: ProviderRedactor;
}
|
|
129
|
+
/**
 * Strip a ```json / ``` code fence if the model emitted one.
 *
 * Idempotent for naked JSON. Some models (claude-code via router, certain
 * deepseek models) wrap their output in a fence even under `json_object`.
 *
 * @param raw - Model output that may or may not be fenced.
 * @returns The input with the fence removed.
 */
declare function stripFencedJson(raw: string): string;
|
|
135
|
+
/**
 * Low-level call. Returns raw content, usage and cost.
 *
 * Retries on transient failures. Does NOT degrade the schema — callers
 * that want the graceful degrade use `callLlmJson`.
 *
 * @param req - The request to send.
 * @param opts - Optional client configuration.
 * @returns The result envelope for the call.
 */
declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
|
|
141
|
+
/**
 * Structured-output call. Returns the parsed JSON plus the raw result
 * envelope.
 *
 * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param.
 * This is critical for deepseek-v3/v4, kimi-k2.6 and other models that do
 * not accept the `response_format.json_schema` shape but DO accept
 * `json_object`.
 *
 * @typeParam T - Expected shape of the parsed value. NOTE(review): nothing
 *   in this declaration shows the value being validated against `T` —
 *   treat it as a caller-side assertion unless the implementation says
 *   otherwise.
 * @param req - The request to send.
 * @param opts - Optional client configuration.
 * @returns The parsed `value` together with the underlying `result`.
 */
declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
    value: T;
    result: LlmCallResult;
}>;
|
|
151
|
+
/**
 * Error describing a failed LLM route assertion. `code` identifies which
 * requirement was violated and `baseUrl` records the URL it concerns.
 */
declare class LlmRouteAssertionError extends Error {
    /** Machine-readable reason for the failure. */
    readonly code: 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
    /** Base URL the assertion was evaluated against. */
    readonly baseUrl: string;
    // The `code` parameter reuses the property type so the union is declared once.
    constructor(message: string, code: LlmRouteAssertionError['code'], baseUrl: string);
}
|
|
156
|
+
/**
 * Constraints an LLM client configuration must satisfy before it is used.
 */
interface LlmRouteRequirements {
    /**
     * Throw if `opts.baseUrl` is undefined, i.e. the call would fall back
     * to `DEFAULT_BASE_URL`. Set this for evaluation runs where silently
     * using the public/free-tier router is a defect — the launch reviewer
     * needs to know exactly which provider answered.
     */
    requireExplicitBaseUrl?: boolean;
    /**
     * Allowlist of acceptable base URLs. Strings match by prefix
     * (case-insensitive); RegExps test against the full base URL.
     */
    allowedBaseUrls?: Array<string | RegExp>;
    /** Blocklist that takes precedence over `allowedBaseUrls`. */
    blockedBaseUrls?: Array<string | RegExp>;
    /** Throw if no auth header / api key is configured. */
    requireAuth?: boolean;
    /**
     * Logical provider id the configured `baseUrl` is expected to match
     * (via `providerFromBaseUrl`). Mainly useful when paired with
     * `requireExplicitBaseUrl`.
     */
    expectedProvider?: string;
}
|
|
179
|
+
/**
 * Fail-loud assertion that the configured LLM client points at the route
 * the caller intends.
 *
 * Designed for the matrix-runner preflight: invoke it once before any LLM
 * call to catch misconfiguration before a sweep burns dollars on the wrong
 * provider. Pure — no I/O — so it is safe to call from constructors and CI
 * gates.
 *
 * @param opts - Client configuration to check.
 * @param req - Requirements the configuration must satisfy.
 * @throws LlmRouteAssertionError when a requirement is violated.
 */
declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
|
|
189
|
+
/**
 * Probe whether a model is reachable.
 *
 * Returns the latency and a `null` error on success; `ok=false` plus an
 * error message on any failure (HTTP, timeout, network, parse). Designed
 * for sweep preflights — fail loud at the boundary before burning a
 * 30-leaf run on a misconfigured router.
 *
 * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
 * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
 * for short prompts, so do not tighten this further. Content is not
 * validated; HTTP 200 means reachable.
 *
 * @param model - Model identifier to probe.
 * @param opts - Client configuration, plus an optional probe `timeoutMs`.
 * @returns Reachability flag, measured latency, and the error message if any.
 */
declare function probeLlm(model: string, opts?: LlmClientOptions & {
    timeoutMs?: number;
}): Promise<{
    ok: boolean;
    latencyMs: number;
    error: string | null;
}>;
|
|
207
|
+
/**
 * Stateful client — construct once with defaults, call many times.
 *
 * A thin wrapper around the free functions; it exists for callers that
 * want to inject a single configured instance into multiple primitives.
 */
declare class LlmClient {
    private readonly opts;
    /** @param opts - Defaults stored on the instance. */
    constructor(opts?: LlmClientOptions);
    /**
     * Make a low-level call.
     *
     * @param req - The request to send.
     * @param per - Per-call options. NOTE(review): how these combine with
     *   the constructor defaults is not visible here — confirm.
     */
    call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
    /**
     * Make a structured-output call.
     *
     * @param req - The request to send.
     * @param per - Per-call options. NOTE(review): how these combine with
     *   the constructor defaults is not visible here — confirm.
     */
    callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
        value: T;
        result: LlmCallResult;
    }>;
}
|
|
221
|
+
|
|
222
|
+
/**
|
|
223
|
+
* Researcher interface — stable hook for an external autonomous-research
|
|
224
|
+
* agent to drive the meta-loop.
|
|
225
|
+
*
|
|
226
|
+
* Implementations live downstream (typically in a private repo that
|
|
227
|
+
* runs the actual LLM). This package ships only the contract + a
|
|
228
|
+
* `NoopResearcher` so consumers can wire the surface without being
|
|
229
|
+
* forced to implement every method up front.
|
|
230
|
+
*
|
|
231
|
+
* The four methods mirror the four stages of the paper "Two Loops,
|
|
232
|
+
* Three Roles":
|
|
233
|
+
*
|
|
234
|
+
* inspectFailures — given the observed runs, what failure modes
|
|
235
|
+
* are present? (data → diagnosis)
|
|
236
|
+
* proposeChange — given diagnosed failure modes, what
|
|
237
|
+
* structural changes should we try?
|
|
238
|
+
* (diagnosis → plan delta)
|
|
239
|
+
* applyChange — fold the proposed deltas into a concrete
|
|
240
|
+
* experiment plan against an existing baseline.
|
|
241
|
+
* (plan delta → executable plan)
|
|
242
|
+
* evaluateChange — run the plan, return runs + the gate verdict.
|
|
243
|
+
* (executable plan → verdict)
|
|
244
|
+
*
|
|
245
|
+
* Composition is the discipline: a Researcher implementation MUST
|
|
246
|
+
* keep these four steps separate and inspectable. Conflating
|
|
247
|
+
* "diagnose + propose + run" into a single LLM call defeats the
|
|
248
|
+
* point of the framework — you can't audit which step lied.
|
|
249
|
+
*
|
|
250
|
+
* THIS INTERFACE IS STABLE. Breaking changes require a new module
|
|
251
|
+
* (e.g. `Researcher2`) so existing implementations keep working.
|
|
252
|
+
*/
|
|
253
|
+
|
|
254
|
+
/** A diagnosed failure mode together with the run IDs that exhibit it. */
interface FailureMode {
    /**
     * Short machine-readable code. Must be stable across runs of the same
     * researcher to enable longitudinal tracking.
     */
    code: string;
    /** Human-readable description for the paper / dashboard. */
    description: string;
    /** Supporting evidence for the diagnosis. */
    evidence: {
        /** Run IDs (from `RunRecord.runId`) where this failure mode was observed. */
        runIds: string[];
        /** Number of run samples that informed the diagnosis. */
        samples: number;
    };
}
|
|
269
|
+
/** A single steering change the researcher wants to try. */
interface SteeringChange {
    /** Category of change being proposed. */
    kind: 'reviewer_prompt' | 'skill_add' | 'skill_remove' | 'threshold' | 'budget';
    /**
     * Implementation-specific payload. Researcher implementations define
     * the schema — it stays `unknown` here to avoid coupling the public
     * interface to any one researcher's internal model.
     */
    payload: unknown;
    /**
     * Why the researcher proposed this change. Goes into the audit trail
     * next to the failure-mode evidence.
     */
    rationale: string;
    /** Optional self-reported expected delta on the headline metric. */
    expectedDelta?: number;
}
|
|
282
|
+
/** A single experiment plan, mapped onto the search/holdout splits. */
interface ExperimentPlan {
    /** Id of the candidate used as the baseline. */
    baselineCandidateId: string;
    /** Id of the candidate being proposed. */
    proposedCandidateId: string;
    /** Steering changes the plan carries. */
    changes: SteeringChange[];
    /**
     * USD ceiling for the entire experiment. The runner must stop before
     * exceeding this and report a partial result.
     */
    evaluationBudgetUsd: number;
    /** Item IDs (your dataset keys) for the search vs holdout splits. */
    splits: {
        search: string[];
        holdout: string[];
    };
}
|
|
296
|
+
/** Result of running a plan: every run, plus the gate verdict. */
interface ExperimentResult {
    /** The plan that was run. */
    plan: ExperimentPlan;
    /** Every run produced while executing the plan. */
    runs: RunRecord[];
    /** Gate verdict for the plan. */
    gateDecision: GateDecision;
}
|
|
302
|
+
/**
 * The researcher loop. Stable, four-step, inspectable.
 *
 *   ┌──────────┐  inspectFailures  ┌──────────┐  proposeChange ┌──────────┐
 *   │   runs   │ ─────────────────▶│ failures │ ──────────────▶│ changes  │
 *   └──────────┘                   └──────────┘                └────┬─────┘
 *                                                                   │
 *                                                                   ▼
 *                               ┌────────────────┐ applyChange  ┌────────┐
 *                               │ ExperimentPlan │ ◀────────────│  base  │
 *                               └────────┬───────┘              └────────┘
 *                                        │
 *                         evaluateChange ▼
 *                              ┌──────────────────┐
 *                              │ ExperimentResult │
 *                              └──────────────────┘
 */
interface Researcher {
    /** Data → diagnosis: derive failure modes from the observed runs. */
    inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
    /** Diagnosis → plan delta: propose changes for the diagnosed failure modes. */
    proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
    /** Plan delta → executable plan: fold the changes into a plan against `baseline`. */
    applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
    /** Executable plan → verdict: produce the experiment result for `plan`. */
    evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
}
|
|
325
|
+
/**
 * Callback bundle accepted by `CallbackResearcher`: one function per
 * `Researcher` method, each with that method's exact signature.
 */
interface CallbackResearcherOptions {
    inspectFailures: Researcher['inspectFailures'];
    proposeChange: Researcher['proposeChange'];
    applyChange: Researcher['applyChange'];
    evaluateChange: Researcher['evaluateChange'];
}
|
|
331
|
+
/**
 * Minimal concrete researcher for tests, scripts and small integrations.
 * Larger autonomous researchers can still implement `Researcher` directly.
 *
 * NOTE(review): each method presumably forwards to the matching callback
 * supplied at construction — the bodies are not visible in this
 * declaration, so confirm against the implementation.
 */
declare class CallbackResearcher implements Researcher {
    private readonly callbacks;
    /** @param callbacks - One callback per `Researcher` method. */
    constructor(callbacks: CallbackResearcherOptions);
    inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
    proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
    applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
    evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
}
|
|
343
|
+
/**
 * No-op researcher — fails loud on every method.
 *
 * Use it as a placeholder in code paths that wire the interface but do not
 * have an implementation yet. Importantly, it does NOT silently succeed: a
 * no-op researcher that returned empty arrays would muffle the loop's
 * signal that nobody implemented the brain.
 */
declare class NoopResearcher implements Researcher {
    private readonly hint;
    /**
     * @param hint - Optional text stored on the instance. NOTE(review):
     *   presumably included in the failure message — confirm.
     */
    constructor(hint?: string);
    inspectFailures(_runs: RunRecord[]): Promise<FailureMode[]>;
    proposeChange(_failures: FailureMode[]): Promise<SteeringChange[]>;
    applyChange(_changes: SteeringChange[], _baseline: ExperimentPlan): Promise<ExperimentPlan>;
    evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
}
|
|
358
|
+
|
|
359
|
+
/**
|
|
360
|
+
* EvalCampaign — opinionated matrix runner that wires the four
|
|
361
|
+
* capture-integrity directives by construction.
|
|
362
|
+
*
|
|
363
|
+
* Every consumer that ran a launch-grade benchmark before 0.22 reinvented
|
|
364
|
+
* the same shape: matrix runner → for each (variant, scenario, seed) →
|
|
365
|
+
* start a TraceEmitter → call LLMs → end the run → maybe analyze.
|
|
366
|
+
* The bug class blueprint-agent reported (raw events not captured, route
|
|
367
|
+
* silently wrong, integrity not asserted, analyst never ran) lives at the
|
|
368
|
+
* integration boundary — not the agent-eval API surface. The four
|
|
369
|
+
* directives in `SKILL.md § Capture integrity` are mitigations.
|
|
370
|
+
*
|
|
371
|
+
* `EvalCampaign` is the structural fix. Consumers don't wire the integrity
|
|
372
|
+
* surface anymore; the campaign owns it. Specifically, the campaign:
|
|
373
|
+
*
|
|
374
|
+
* - calls `assertLlmRoute` once at preflight before any work runs
|
|
375
|
+
* - constructs a per-run `TraceStore` and `RawProviderSink` via factories
|
|
376
|
+
* - constructs the `TraceEmitter` with `onRunComplete: [analyst hook]`
|
|
377
|
+
* - hands the runner an `LlmClientOptions` pre-wired with the sink and
|
|
378
|
+
* trace context — the runner can't accidentally call an LLM without
|
|
379
|
+
* capturing the raw HTTP envelope
|
|
380
|
+
* - calls `assertRunCaptured` after every `endRun` and routes failures
|
|
381
|
+
* through a configurable policy (`throw` / `mark_failed` / `log`)
|
|
382
|
+
* - assembles per-run `RunRecord`s and runs `researchReport` at the end
|
|
383
|
+
* so the campaign artifact is launch-decision-grade by default
|
|
384
|
+
* - embeds the campaign fingerprint (a SHA-256 over the canonicalised
|
|
385
|
+
* run set) and optional `preregistrationHash` in the report
|
|
386
|
+
*
|
|
387
|
+
* The runner contract is intentionally narrow: produce a `CampaignRunOutcome`
|
|
388
|
+
* given a fully-wired `CampaignRunContext`. Everything orchestration-shaped
|
|
389
|
+
* lives in the campaign. This is the inversion-of-control point — consumers
|
|
390
|
+
* stop writing matrix runners and start writing scenario-runners.
|
|
391
|
+
*
|
|
392
|
+
* Out of scope for v1 (tracked in `docs/research-report-methodology.md`):
|
|
393
|
+
*
|
|
394
|
+
* - Distributed/cluster execution (concurrency is local async)
|
|
395
|
+
* - Adaptive sampling / sequential interim looks
|
|
396
|
+
* - Resume from partial state across crashes
|
|
397
|
+
* - LLM-call retry beyond what `LlmClient` already does
|
|
398
|
+
*/
|
|
399
|
+
|
|
400
|
+
/**
 * One variant in a campaign matrix: an id plus a caller-defined payload.
 *
 * @typeParam V - Payload type, chosen by the caller.
 */
interface CampaignVariant<V> {
    /** Identifier of the variant. */
    id: string;
    /** Caller-defined data for the variant. */
    payload: V;
}
|
|
404
|
+
/**
 * One scenario in a campaign matrix.
 */
interface CampaignScenario {
    /** Identifier of the scenario. */
    scenarioId: string;
    /** Free-form metadata propagated to runs and reports. */
    tags?: Record<string, string>;
}
|
|
409
|
+
/**
 * Fully-wired context handed to a campaign runner for one run.
 *
 * @typeParam V - Variant payload type.
 */
interface CampaignRunContext<V> {
    /** Stable run id. The campaign generates this; the runner does not. */
    runId: string;
    /** Logical experiment id (`campaignId` by default; overridable per-run via opts). */
    experimentId: string;
    /** Payload of the variant under test. */
    variant: V;
    /** Id of the variant under test. */
    variantId: string;
    /** Id of the scenario being run. */
    scenarioId: string;
    /** Tags of the scenario being run. */
    scenarioTags: Record<string, string>;
    /** Seed for this run. */
    seed: number;
    /** Split this run belongs to. */
    splitTag: RunSplitTag;
    /**
     * The TraceEmitter for this run, with `onRunComplete` hooks pre-wired
     * (analyst auto-execution if configured, plus the integrity check). The
     * runner MUST call `emitter.startRun` before doing any work, and either
     * `emitter.endRun` or `emitter.abortRun` before returning.
     */
    emitter: TraceEmitter;
    /** Trace store for this run. */
    store: TraceStore;
    /** Raw provider sink for this run. */
    rawSink: RawProviderSink;
    /**
     * Pre-wired LLM client options — `rawSink` and `traceContext` are
     * populated, so any `callLlm(req, ctx.llmOpts)` automatically captures
     * raw HTTP. The runner can spread additional fields if needed.
     */
    llmOpts: LlmClientOptions;
}
|
|
436
|
+
/**
 * What a campaign runner reports back for one run.
 */
interface CampaignRunOutcome {
    /** Did the run pass? Mirrors `RunOutcome.pass` semantics. */
    pass: boolean;
    /** Score for the run on its split. Maps to `searchScore` or `holdoutScore`. */
    score: number;
    /** Mandatory cost in USD. Use 0 + `raw.cost_unknown=1` only if truly unknown. */
    costUsd: number;
    /** Token usage for the run. */
    tokenUsage: RunTokenUsage;
    /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */
    model: string;
    /** sha256 of the effective prompt sent to the model. */
    promptHash: string;
    /** sha256 of the effective config (model, temperature, tools, judges, splits). */
    configHash: string;
    /** Optional extra numeric metrics to land in `outcome.raw`. */
    raw?: Record<string, number>;
    /** Optional failure-taxonomy tag if the run failed. */
    failureMode?: string;
    /** Optional judge metadata when a judge was used. */
    judgeMetadata?: RunJudgeMetadata;
}
|
|
457
|
+
/**
 * Per-run runner: receives a fully-wired context and resolves to the
 * run's outcome.
 *
 * @typeParam V - Variant payload type.
 */
type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>;
|
|
458
|
+
/** Policy a campaign applies when a run fails its integrity check. */
type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log';
|
|
459
|
+
/**
 * Configuration for one evaluation campaign.
 *
 * @typeParam V - Variant payload type.
 */
interface EvalCampaignOptions<V> {
    /**
     * Stable id for the campaign. Used as the default `experimentId` on
     * every run, and folded into the campaign fingerprint.
     */
    campaignId: string;
    /** Variants in the campaign matrix. */
    variants: CampaignVariant<V>[];
    /** Scenarios in the campaign matrix. */
    scenarios: CampaignScenario[];
    /** Seeds to run. Default `[0, 1, 2]`. */
    seeds?: number[];
    /** Default `'holdout'` — the split that anchors a launch decision. */
    splitTag?: RunSplitTag;
    /** Git SHA the campaign is run against. Mandatory; `RunRecord` rejects unset. */
    commitSha: string;
    /**
     * LLM client config. Augmented per-run with `rawSink` and
     * `traceContext` before being passed to the runner. The campaign
     * asserts this config matches `routeRequirements` once at preflight.
     */
    llmOpts: LlmClientOptions;
    /**
     * Default `{ requireExplicitBaseUrl: true, requireAuth: true }` — fail
     * loud if the campaign would silently fall back to the public router
     * or run unauthenticated. Override with an empty object to disable.
     */
    routeRequirements?: LlmRouteRequirements;
    /**
     * Per-run TraceStore factory. Common shape: a fresh store per run keyed
     * on `runId`. Implementations that share a store across the campaign
     * are valid — the campaign only writes through `emitter`.
     */
    storeFactory: (params: CampaignFactoryParams) => TraceStore;
    /**
     * Per-run RawProviderSink factory. Defaults to
     * `FileSystemRawProviderSink` rooted at `${workDir}/raw-events/${runId}`
     * if `workDir` is supplied; otherwise required. Forensic capture is
     * non-negotiable in a campaign run — pass `NoopRawProviderSink`
     * explicitly if you want to opt out.
     */
    rawSinkFactory?: (params: CampaignFactoryParams) => RawProviderSink;
    /**
     * Filesystem root for the default `rawSinkFactory`. Ignored if
     * `rawSinkFactory` is supplied.
     */
    workDir?: string;
    /**
     * Extra `onRunComplete` hooks the campaign appends (after its own
     * integrity-check hook). Pass `traceAnalystOnRunComplete(...)` here.
     */
    onRunComplete?: RunCompleteHook[];
    /**
     * Per-run integrity expectations. Defaults to
     * `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }`.
     * Override (e.g. `{ llmSpansMin: 0 }`) for runs that don't call LLMs.
     */
    integrity?: RunIntegrityExpectations;
    /** Behaviour when integrity fails. Default `'mark_failed'`. */
    onIntegrityFailure?: CampaignIntegrityPolicy;
    /**
     * Per-run runner. Receives a fully-wired context; produces an outcome
     * the campaign converts into a `RunRecord`.
     */
    runner: CampaignRunner<V>;
    /**
     * If set, the campaign computes `researchReport` at the end.
     * `comparator` is a `variantId`. Other fields are forwarded verbatim.
     */
    report?: {
        comparator?: string;
    } & Omit<ResearchReportOptions, 'comparator' | 'preregistrationHash' | 'generatedAt'>;
    /**
     * Hash of a signed `HypothesisManifest` (see `pre-registration.ts`).
     * Embedded in the campaign fingerprint and the research report.
     */
    preregistrationHash?: string;
    /** Local concurrency. Default `1` (sequential). */
    concurrency?: number;
    /** Override the time source. Tests pass a mock to make wallMs deterministic. */
    now?: () => number;
    /** Override the runId generator. Tests pin this. */
    runId?: (params: CampaignFactoryParams) => string;
}
|
|
542
|
+
/**
 * Coordinates of a single run, passed to the campaign's per-run factory
 * callbacks (`storeFactory`, `rawSinkFactory`, `runId`).
 */
interface CampaignFactoryParams {
    campaignId: string;
    runId: string;
    variantId: string;
    scenarioId: string;
    seed: number;
}
|
|
549
|
+
/**
 * Record of a run that did not succeed, identified by its matrix
 * coordinates.
 */
interface FailedRun {
    runId: string;
    variantId: string;
    scenarioId: string;
    seed: number;
    /** Why the run is counted as failed. */
    reason: string;
    /** Optional error text associated with the failure. */
    error?: string;
}
|
|
557
|
+
/**
 * Outcome of a whole campaign.
 */
interface EvalCampaignResult {
    /** Id of the campaign that produced this result. */
    campaignId: string;
    /** SHA-256 over canonicalised `(variantIds, scenarioIds, seeds, comparator, splitTag, baseUrl, provider, preregistrationHash)`. */
    campaignFingerprint: string;
    /** Pre-registration hash, or `null`. */
    preregistrationHash: string | null;
    /** Successful runs only. Failed runs land in `failedRuns`. */
    runs: RunRecord[];
    /** Integrity reports for every successful run. */
    integrityReports: RunIntegrityReport[];
    /** Runs that did not succeed. */
    failedRuns: FailedRun[];
    /** Computed when `report` is set on options. */
    report?: ResearchReport;
    /** NOTE(review): presumably an ISO-8601 timestamp — the format is not visible here; confirm. */
    startedAt: string;
    /** NOTE(review): presumably an ISO-8601 timestamp — the format is not visible here; confirm. */
    endedAt: string;
}
|
|
572
|
+
/**
 * Run an evaluation campaign described by `opts`.
 *
 * @typeParam V - Variant payload type.
 * @param opts - Campaign configuration.
 * @returns The campaign result.
 */
declare function runEvalCampaign<V>(opts: EvalCampaignOptions<V>): Promise<EvalCampaignResult>;
|
|
573
|
+
|
|
574
|
+
export { CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, LlmCallError as m, type LlmCallRequest as n, type LlmCallResult as o, LlmClient as p, type LlmMessage as q, LlmRouteAssertionError as r, type LlmRouteRequirements as s, type LlmUsage as t, assertLlmRoute as u, callLlm as v, callLlmJson as w, probeLlm as x, runEvalCampaign as y, stripFencedJson as z };
|