@tangle-network/agent-eval 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -1
- package/README.md +4 -0
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/chunk-UAND2LOT.js +738 -0
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/index.d.ts +10 -284
- package/dist/index.js +39 -19
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +6 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +15 -8
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +16 -5
- package/dist/wire/index.js +3 -3
- package/docs/research-report-methodology.md +19 -4
- package/docs/wire-protocol.md +1 -1
- package/package.json +2 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-HRZELXCR.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
package/dist/traces.d.ts
CHANGED
|
@@ -2,6 +2,8 @@ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureCl
|
|
|
2
2
|
export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
|
|
3
3
|
import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
|
|
4
4
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
|
|
5
|
+
import { d as RawProviderSink, c as RawProviderEvent } from './integrity-K2oVlF57.js';
|
|
6
|
+
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
|
|
5
7
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
6
8
|
|
|
7
9
|
/**
|
|
@@ -133,204 +135,122 @@ interface OtlpExport {
|
|
|
133
135
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
134
136
|
|
|
135
137
|
/**
|
|
136
|
-
*
|
|
137
|
-
*
|
|
138
|
+
* Replay-from-raw-events — turn every captured campaign run into a
|
|
139
|
+
* re-runnable artifact.
|
|
138
140
|
*
|
|
139
|
-
*
|
|
141
|
+
* The premise: 0.21 made `RawProviderSink` capture every provider HTTP
|
|
142
|
+
* envelope. 0.22's `runEvalCampaign` makes capture the default. Together
|
|
143
|
+
* they mean every past run is a complete fingerprint of what happened on
|
|
144
|
+
* the wire — and that fingerprint is enough to replay the run without
|
|
145
|
+
* burning new LLM cost.
|
|
140
146
|
*
|
|
141
|
-
*
|
|
142
|
-
* usage. It's what dashboards read; it's NOT enough for forensics.
|
|
143
|
-
* - When a downstream consumer reports "the verifier used the wrong route"
|
|
144
|
-
* or "tokens look right but reasoning was missing," the only way to
|
|
145
|
-
* answer is the raw HTTP body. Span fields can lie (a proxy can echo
|
|
146
|
-
* a different `model` value than what actually answered); the raw
|
|
147
|
-
* response is ground truth.
|
|
147
|
+
* Three use cases this primitive enables:
|
|
148
148
|
*
|
|
149
|
-
*
|
|
150
|
-
*
|
|
151
|
-
*
|
|
152
|
-
*
|
|
149
|
+
* 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
|
|
150
|
+
* to last week's runs without re-calling any LLM. The cost of trying
|
|
151
|
+
* a new rubric drops from "another full sweep" to a CPU-bound replay.
|
|
152
|
+
* 2. **Determinism audits** — replay the same campaign and verify the
|
|
153
|
+
* raw responses match byte-for-byte. Any drift is a non-determinism
|
|
154
|
+
* bug (in the harness, the prompt builder, the sandbox, …).
|
|
155
|
+
* 3. **Free judge calibration** — run two judges on identical responses
|
|
156
|
+
* and measure inter-judge agreement without doubling LLM spend.
|
|
153
157
|
*
|
|
154
|
-
*
|
|
155
|
-
*
|
|
156
|
-
*
|
|
157
|
-
*
|
|
158
|
-
* the per-call `redactor`. The `redactedFields` array on the persisted
|
|
159
|
-
* event lets a reviewer see what was stripped without exposing the values.
|
|
158
|
+
* The interface is deliberately fetch-shaped. Inject `createReplayFetch`
|
|
159
|
+
* into `LlmClientOptions.fetch` and every `callLlm` transparently reads
|
|
160
|
+
* from the cache instead of calling the network. No new code path through
|
|
161
|
+
* the LLM client is needed; the cache hit is invisible to the runner.
|
|
160
162
|
*/
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
runId?: string;
|
|
167
|
-
spanId?: string;
|
|
168
|
-
/**
|
|
169
|
-
* Logical provider name. Free-form so callers can use whatever id matches
|
|
170
|
-
* their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
|
|
171
|
-
* omitted, derived from `baseUrl` in `LlmClientOptions`.
|
|
172
|
-
*/
|
|
173
|
-
provider: string;
|
|
174
|
-
model: string;
|
|
175
|
-
/** Endpoint path, e.g. `'/v1/chat/completions'`. */
|
|
176
|
-
endpoint: string;
|
|
177
|
-
/** Base URL used for the call (already-normalised — no trailing slash). */
|
|
178
|
-
baseUrl: string;
|
|
179
|
-
/** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
|
|
180
|
-
attemptIndex: number;
|
|
181
|
-
direction: RawProviderDirection;
|
|
182
|
-
/** Unix ms. */
|
|
183
|
-
timestamp: number;
|
|
184
|
-
/** Wall-clock duration of the call leg. Set on `response` and `error` events; null on `request`. */
|
|
185
|
-
durationMs?: number;
|
|
186
|
-
statusCode?: number;
|
|
187
|
-
requestHeaders?: Record<string, string>;
|
|
188
|
-
requestBody?: unknown;
|
|
189
|
-
responseHeaders?: Record<string, string>;
|
|
190
|
-
responseBody?: unknown;
|
|
191
|
-
/** Set on `direction: 'error'` events. */
|
|
192
|
-
errorMessage?: string;
|
|
193
|
-
/** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
|
|
194
|
-
redactedFields: string[];
|
|
195
|
-
}
|
|
196
|
-
interface RawProviderSinkFilter {
|
|
197
|
-
runId?: string;
|
|
198
|
-
spanId?: string;
|
|
199
|
-
direction?: RawProviderDirection;
|
|
200
|
-
attemptIndex?: number;
|
|
163
|
+
|
|
164
|
+
declare class ReplayCacheMissError extends Error {
|
|
165
|
+
readonly url: string;
|
|
166
|
+
readonly requestKey: string;
|
|
167
|
+
constructor(url: string, requestKey: string, message?: string);
|
|
201
168
|
}
|
|
202
|
-
interface
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
206
|
-
/** Optional teardown for backed implementations. */
|
|
207
|
-
close?(): Promise<void>;
|
|
169
|
+
interface ReplayCacheEntry {
|
|
170
|
+
request: RawProviderEvent;
|
|
171
|
+
response: RawProviderEvent;
|
|
208
172
|
}
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
|
|
216
|
-
interface InMemoryRawProviderSinkOptions {
|
|
217
|
-
redactor?: ProviderRedactor;
|
|
218
|
-
}
|
|
219
|
-
declare class InMemoryRawProviderSink implements RawProviderSink {
|
|
220
|
-
private events;
|
|
221
|
-
private redactor;
|
|
222
|
-
constructor(opts?: InMemoryRawProviderSinkOptions);
|
|
223
|
-
record(event: RawProviderEvent): Promise<void>;
|
|
224
|
-
list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
225
|
-
size(): number;
|
|
226
|
-
}
|
|
227
|
-
declare class NoopRawProviderSink implements RawProviderSink {
|
|
228
|
-
record(): Promise<void>;
|
|
229
|
-
}
|
|
230
|
-
interface FileSystemRawProviderSinkOptions {
|
|
231
|
-
/** Directory the NDJSON file is written into. Created if missing. */
|
|
232
|
-
dir: string;
|
|
233
|
-
/** File name; default `'raw-provider-events.ndjson'`. */
|
|
234
|
-
fileName?: string;
|
|
235
|
-
/** Bytes after which the writer rolls over to a new file (default 32 MiB). */
|
|
236
|
-
rollAtBytes?: number;
|
|
237
|
-
redactor?: ProviderRedactor;
|
|
238
|
-
}
|
|
239
|
-
declare class FileSystemRawProviderSink implements RawProviderSink {
|
|
240
|
-
private dir;
|
|
241
|
-
private fileName;
|
|
242
|
-
private rollAtBytes;
|
|
243
|
-
private redactor;
|
|
244
|
-
private bytesWritten;
|
|
245
|
-
private rollIndex;
|
|
246
|
-
private initPromise;
|
|
247
|
-
constructor(opts: FileSystemRawProviderSinkOptions);
|
|
248
|
-
private ensureInit;
|
|
249
|
-
private currentPath;
|
|
250
|
-
record(event: RawProviderEvent): Promise<void>;
|
|
251
|
-
list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
173
|
+
interface ReplayCacheStats {
|
|
174
|
+
total: number;
|
|
175
|
+
byProvider: Record<string, number>;
|
|
176
|
+
byModel: Record<string, number>;
|
|
177
|
+
/** Spans for which we have a request but no response (run aborted mid-call). */
|
|
178
|
+
orphanRequests: number;
|
|
252
179
|
}
|
|
253
180
|
/**
|
|
254
|
-
*
|
|
255
|
-
*
|
|
256
|
-
|
|
257
|
-
declare function providerFromBaseUrl(baseUrl: string): string;
|
|
258
|
-
|
|
259
|
-
/**
|
|
260
|
-
* Run-completion integrity check — at end of run, verify the expected event
|
|
261
|
-
* types were actually captured. The point is the launch-review failure mode:
|
|
262
|
-
* a run *appears* successful but the raw provider events were never written,
|
|
263
|
-
* so a downstream reviewer can't reconstruct what happened.
|
|
181
|
+
* In-memory deterministic cache of (request → response) keyed on a stable
|
|
182
|
+
* hash of the request body. Built from a `RawProviderSink` containing
|
|
183
|
+
* paired `request` and `response` events from a previous run.
|
|
264
184
|
*
|
|
265
|
-
*
|
|
266
|
-
*
|
|
267
|
-
* const report = await assertRunCaptured(store, runId, {
|
|
268
|
-
* llmSpansMin: 1,
|
|
269
|
-
* judgeSpansMin: 1,
|
|
270
|
-
* rawSink: providerSink, // must have ≥ 1 event for this run
|
|
271
|
-
* requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
|
|
272
|
-
* })
|
|
273
|
-
* if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
|
|
274
|
-
*
|
|
275
|
-
* The function is read-only on the store and returns a structured report;
|
|
276
|
-
* the caller chooses the failure mode (throw, mark run failed, log warning).
|
|
277
|
-
* `throwIfRunIncomplete` is the convenient strict mode.
|
|
185
|
+
* The cache is the source of truth for replay; `createReplayFetch` is a
|
|
186
|
+
* thin wrapper that reads from it.
|
|
278
187
|
*/
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
judgeSpansMin?: number;
|
|
285
|
-
/** Minimum tool span count. Default 0. */
|
|
286
|
-
toolSpansMin?: number;
|
|
188
|
+
declare class ReplayCache {
|
|
189
|
+
private byKey;
|
|
190
|
+
private orphans;
|
|
191
|
+
private byProvider;
|
|
192
|
+
private byModel;
|
|
287
193
|
/**
|
|
288
|
-
*
|
|
289
|
-
*
|
|
194
|
+
* Build a cache from a sink's events. The sink must implement `list()`.
|
|
195
|
+
* Filter by `runId` / `spanId` to scope to a specific replay.
|
|
290
196
|
*/
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
197
|
+
static fromSink(sink: RawProviderSink, filter?: {
|
|
198
|
+
runId?: string;
|
|
199
|
+
spanId?: string;
|
|
200
|
+
}): Promise<ReplayCache>;
|
|
201
|
+
/** Build a cache from an in-memory event list. */
|
|
202
|
+
static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
|
|
203
|
+
/** Number of cacheable (request, response) pairs in the cache. */
|
|
204
|
+
size(): number;
|
|
205
|
+
stats(): ReplayCacheStats;
|
|
294
206
|
/**
|
|
295
|
-
*
|
|
296
|
-
*
|
|
297
|
-
*
|
|
207
|
+
* Look up a cached response by hashing the (model, messages, temperature,
|
|
208
|
+
* maxTokens, response_format) shape. Returns `undefined` on miss; the
|
|
209
|
+
* caller decides whether to throw, fall back to the network, or skip.
|
|
298
210
|
*/
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
}
|
|
303
|
-
type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
|
|
304
|
-
interface RunIntegrityIssue {
|
|
305
|
-
code: RunIntegrityIssueCode;
|
|
306
|
-
message: string;
|
|
307
|
-
detail?: Record<string, unknown>;
|
|
308
|
-
}
|
|
309
|
-
interface RunIntegrityReport {
|
|
310
|
-
ok: boolean;
|
|
311
|
-
runId: string;
|
|
312
|
-
llmSpanCount: number;
|
|
313
|
-
judgeSpanCount: number;
|
|
314
|
-
toolSpanCount: number;
|
|
315
|
-
rawProviderEventCount: number;
|
|
211
|
+
lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
|
|
212
|
+
}
|
|
213
|
+
interface ReplayFetchOptions {
|
|
316
214
|
/**
|
|
317
|
-
*
|
|
318
|
-
* `
|
|
319
|
-
*
|
|
215
|
+
* Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
|
|
216
|
+
* `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
|
|
217
|
+
* still complete; `'fail-closed'` returns a synthetic 599 response so the
|
|
218
|
+
* call site sees a non-retriable failure.
|
|
320
219
|
*/
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
220
|
+
onMiss?: 'throw' | 'fallback' | 'fail-closed';
|
|
221
|
+
fallbackFetch?: typeof fetch;
|
|
222
|
+
/** Optional callback fired once per replayed call (for telemetry / counters). */
|
|
223
|
+
onHit?: (info: {
|
|
224
|
+
url: string;
|
|
225
|
+
provider: string;
|
|
226
|
+
model: string;
|
|
227
|
+
}) => void;
|
|
228
|
+
/** Optional callback fired on cache miss before the `onMiss` policy applies. */
|
|
229
|
+
onMissNotify?: (info: {
|
|
230
|
+
url: string;
|
|
231
|
+
requestBody: unknown;
|
|
232
|
+
}) => void;
|
|
330
233
|
}
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
234
|
+
/**
|
|
235
|
+
* Build a `fetch`-shaped function that serves cached responses out of a
|
|
236
|
+
* `ReplayCache` for any URL ending in `/chat/completions`. Pass through
|
|
237
|
+
* `LlmClientOptions.fetch` and `callLlm` becomes free.
|
|
238
|
+
*
|
|
239
|
+
* Non-`/chat/completions` URLs are passed straight to the fallback fetch
|
|
240
|
+
* (default: `globalThis.fetch`). This matters because non-LLM HTTP work
|
|
241
|
+
* (judge HTTP servers, sandbox callbacks) sometimes flows through the same
|
|
242
|
+
* `fetch` and shouldn't be intercepted.
|
|
243
|
+
*/
|
|
244
|
+
declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
|
|
245
|
+
/**
|
|
246
|
+
* Convenience iterator over `(request, response)` pairs in a sink — for
|
|
247
|
+
* post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
|
|
248
|
+
* runs purely in-process over cached LLM outputs.
|
|
249
|
+
*/
|
|
250
|
+
declare function iterateRawCalls(sink: RawProviderSink, filter?: {
|
|
251
|
+
runId?: string;
|
|
252
|
+
spanId?: string;
|
|
253
|
+
}): AsyncGenerator<ReplayCacheEntry>;
|
|
334
254
|
|
|
335
255
|
/**
|
|
336
256
|
* Shared types for the trace-analyst module.
|
|
@@ -911,4 +831,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
|
|
|
911
831
|
declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
|
|
912
832
|
declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
|
|
913
833
|
|
|
914
|
-
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass,
|
|
834
|
+
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, Run, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
package/dist/traces.js
CHANGED
|
@@ -7,7 +7,8 @@ import {
|
|
|
7
7
|
OTEL_AGENT_EVAL_SCOPE,
|
|
8
8
|
OtlpFileTraceStore,
|
|
9
9
|
REDACTION_VERSION,
|
|
10
|
-
|
|
10
|
+
ReplayCache,
|
|
11
|
+
ReplayCacheMissError,
|
|
11
12
|
SpanNotFoundError,
|
|
12
13
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
13
14
|
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
@@ -19,10 +20,10 @@ import {
|
|
|
19
20
|
aggregateLlm,
|
|
20
21
|
analyzeTraces,
|
|
21
22
|
argHash,
|
|
22
|
-
assertRunCaptured,
|
|
23
23
|
buildTraceAnalystTools,
|
|
24
24
|
buildTraceInsightContext,
|
|
25
25
|
buildTraceInsightPrompt,
|
|
26
|
+
createReplayFetch,
|
|
26
27
|
defaultTraceInsightPanel,
|
|
27
28
|
describeTraceInsightScope,
|
|
28
29
|
domainEvidencePattern,
|
|
@@ -34,6 +35,7 @@ import {
|
|
|
34
35
|
isRetrievalSpan,
|
|
35
36
|
isSandboxSpan,
|
|
36
37
|
isToolSpan,
|
|
38
|
+
iterateRawCalls,
|
|
37
39
|
judgeSpans,
|
|
38
40
|
llmSpans,
|
|
39
41
|
planTraceInsightQuestions,
|
|
@@ -42,23 +44,28 @@ import {
|
|
|
42
44
|
runFailureClass,
|
|
43
45
|
runsForScenario,
|
|
44
46
|
scoreTraceInsightReadiness,
|
|
45
|
-
throwIfRunIncomplete,
|
|
46
47
|
tokenizeDomainWords,
|
|
47
48
|
toolSpans,
|
|
48
49
|
traceAnalystFunctionGroup,
|
|
49
50
|
traceAnalystOnRunComplete
|
|
50
|
-
} from "./chunk-WOK2RTWG.js";
|
|
51
|
+
} from "./chunk-4W4NCYM2.js";
|
|
52
|
+
import {
|
|
53
|
+
RunIntegrityError,
|
|
54
|
+
assertRunCaptured,
|
|
55
|
+
throwIfRunIncomplete
|
|
56
|
+
} from "./chunk-QUKKGHTZ.js";
|
|
51
57
|
import {
|
|
52
58
|
TraceEmitter,
|
|
53
59
|
llmSpanFromProvider
|
|
54
60
|
} from "./chunk-5IIQKMD5.js";
|
|
61
|
+
import "./chunk-6M774GY6.js";
|
|
55
62
|
import {
|
|
56
63
|
FileSystemRawProviderSink,
|
|
57
64
|
InMemoryRawProviderSink,
|
|
58
65
|
NoopRawProviderSink,
|
|
59
66
|
defaultProviderRedactor,
|
|
60
67
|
providerFromBaseUrl
|
|
61
|
-
} from "./chunk-SNUHRBDL.js";
|
|
68
|
+
} from "./chunk-SQQLHODJ.js";
|
|
62
69
|
import "./chunk-PZ5AY32C.js";
|
|
63
70
|
export {
|
|
64
71
|
DEFAULT_REDACTION_RULES,
|
|
@@ -72,6 +79,8 @@ export {
|
|
|
72
79
|
OTEL_AGENT_EVAL_SCOPE,
|
|
73
80
|
OtlpFileTraceStore,
|
|
74
81
|
REDACTION_VERSION,
|
|
82
|
+
ReplayCache,
|
|
83
|
+
ReplayCacheMissError,
|
|
75
84
|
RunIntegrityError,
|
|
76
85
|
SpanNotFoundError,
|
|
77
86
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
@@ -89,6 +98,7 @@ export {
|
|
|
89
98
|
buildTraceAnalystTools,
|
|
90
99
|
buildTraceInsightContext,
|
|
91
100
|
buildTraceInsightPrompt,
|
|
101
|
+
createReplayFetch,
|
|
92
102
|
defaultProviderRedactor,
|
|
93
103
|
defaultTraceInsightPanel,
|
|
94
104
|
describeTraceInsightScope,
|
|
@@ -101,6 +111,7 @@ export {
|
|
|
101
111
|
isRetrievalSpan,
|
|
102
112
|
isSandboxSpan,
|
|
103
113
|
isToolSpan,
|
|
114
|
+
iterateRawCalls,
|
|
104
115
|
judgeSpans,
|
|
105
116
|
llmSpanFromProvider,
|
|
106
117
|
llmSpans,
|
package/dist/wire/index.js
CHANGED
|
@@ -24,9 +24,9 @@ import {
|
|
|
24
24
|
runRpcBatch,
|
|
25
25
|
runRpcOnce,
|
|
26
26
|
startServer
|
|
27
|
-
} from "../chunk-WOPGKVN4.js";
|
|
28
|
-
import "../chunk-3GN6U53I.js";
|
|
29
|
-
import "../chunk-SNUHRBDL.js";
|
|
27
|
+
} from "../chunk-6KQG5HAH.js";
|
|
28
|
+
import "../chunk-KAO3Q65R.js";
|
|
29
|
+
import "../chunk-SQQLHODJ.js";
|
|
30
30
|
import "../chunk-PZ5AY32C.js";
|
|
31
31
|
export {
|
|
32
32
|
BUILTIN_RUBRICS,
|
|
@@ -113,15 +113,30 @@ risks list and the executive summary. Treat them as descriptive only.
|
|
|
113
113
|
- **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
|
|
114
114
|
and unpaired tests throw away the variance reduction. Use the paired test
|
|
115
115
|
by default.
|
|
116
|
-
- **Sequential / always-valid inference (e-values,
|
|
117
|
-
|
|
118
|
-
|
|
116
|
+
- **Sequential / always-valid inference (e-values, alpha-spending).**
|
|
117
|
+
**Shipped in 0.22.** `pairedEvalueSequence` and
|
|
118
|
+
`evaluateInterimReleaseConfidence` provide time-uniform inference using
|
|
119
|
+
the predictable plug-in betting martingale (Waudby-Smith & Ramdas 2024)
|
|
120
|
+
paired with the empirical Bernstein confidence sequence (Howard et al.
|
|
121
|
+
2021). For *rolling* analyses (interim looks at a campaign that's still
|
|
122
|
+
accumulating data) call those primitives directly; `researchReport`
|
|
123
|
+
remains the single-look summary. Paper-grade pre-registration covers the
|
|
124
|
+
static analysis; the sequential primitives cover the iterative one.
|
|
119
125
|
- **Hierarchical Bayesian shrinkage across many candidates.** Future work.
|
|
120
126
|
The current ranking is on raw paired statistics and over-credits the top
|
|
121
|
-
candidate when many are tested.
|
|
127
|
+
candidate when many are tested. A Bayesian hierarchical model with a
|
|
128
|
+
weakly informative prior would shrink each variant toward the grand mean,
|
|
129
|
+
reducing rank flips between near-tied candidates.
|
|
122
130
|
- **Calibration / coverage simulation on the bootstrap CI.** Future work; we
|
|
123
131
|
rely on the asymptotic guarantee plus the hard pair floor to keep coverage
|
|
124
132
|
reasonable.
|
|
133
|
+
- **Outcome-anchored calibration.** **Shipped in 0.22.**
|
|
134
|
+
`rubricPredictiveValidity` joins `RunRecord`s to a `DeploymentOutcomeStore`
|
|
135
|
+
and reports per-rubric Spearman against deployment outcomes (revenue,
|
|
136
|
+
retention, CSAT, …). Combined with the static methodology in this
|
|
137
|
+
document, the loop is: pre-register → measure with `researchReport` →
|
|
138
|
+
ship → observe outcomes → recalibrate rubric weights with
|
|
139
|
+
`rubricPredictiveValidity`.
|
|
125
140
|
|
|
126
141
|
## When NOT to apply
|
|
127
142
|
|
package/docs/wire-protocol.md
CHANGED
|
@@ -188,7 +188,7 @@ Each invocation is one process — Node startup adds ~500 ms. For more than a fe
|
|
|
188
188
|
4. **RPC case** — add `case 'x':` in `dispatchRpc` in `src/wire/rpc.ts`.
|
|
189
189
|
5. **OpenAPI route** — register in `src/wire/openapi.ts` so it shows up in the spec.
|
|
190
190
|
6. **Test** — add to `tests/wire/`. At minimum: schema validation, happy-path, error-path.
|
|
191
|
-
7. **Python client** — add a method on `Client` in `clients/python/src/
|
|
191
|
+
7. **Python client** — add a method on `Client` in `clients/python/src/agent_eval_rpc/client.py`, plus pydantic models in `models.py` mirroring the new schemas.
|
|
192
192
|
|
|
193
193
|
The pattern is mechanical. When the surface grows past ~10 methods, swap the hand-written Python models for `datamodel-code-generator -i openapi.json -o models.py`.
|
|
194
194
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.21.0",
|
|
3
|
+
"version": "0.22.0",
|
|
4
4
|
"description": "Trace-first evaluation infrastructure for agent systems: traces, harnesses, verifier pipelines, judges, datasets, gates, optimization, and reporting.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -79,7 +79,7 @@
|
|
|
79
79
|
"@ax-llm/ax": "^19.0.25",
|
|
80
80
|
"@hono/node-server": "^2.0.0",
|
|
81
81
|
"@tangle-network/tcloud": "^0.4.6",
|
|
82
|
-
"hono": "^4.12.
|
|
82
|
+
"hono": "^4.12.16",
|
|
83
83
|
"zod": "^4.3.6"
|
|
84
84
|
},
|
|
85
85
|
"devDependencies": {
|