@tangle-network/agent-eval 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +236 -1
- package/README.md +17 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/chunk-7EAUOUQS.js +495 -0
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
- package/dist/cli.js +3 -3
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +20 -430
- package/dist/index.js +154 -34
- package/dist/index.js.map +1 -1
- package/dist/integrity-Cr5YodSY.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +7 -145
- package/dist/optimization.js +12 -3
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +18 -9
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +19 -8
- package/dist/wire/index.js +3 -3
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/research-report-methodology.md +19 -4
- package/docs/three-package-architecture.md +180 -0
- package/docs/wire-protocol.md +1 -1
- package/package.json +7 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
- /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
package/dist/traces.d.ts
CHANGED
|
@@ -2,6 +2,8 @@ import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureCl
|
|
|
2
2
|
export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
|
|
3
3
|
import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
|
|
4
4
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
|
|
5
|
+
import { R as RawProviderSink, f as RawProviderEvent } from './integrity-Cr5YodSY.js';
|
|
6
|
+
export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
|
|
5
7
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
6
8
|
|
|
7
9
|
/**
|
|
@@ -133,204 +135,122 @@ interface OtlpExport {
|
|
|
133
135
|
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
134
136
|
|
|
135
137
|
/**
|
|
136
|
-
*
|
|
137
|
-
*
|
|
138
|
+
* Replay-from-raw-events — turn every captured campaign run into a
|
|
139
|
+
* re-runnable artifact.
|
|
138
140
|
*
|
|
139
|
-
*
|
|
141
|
+
* The premise: 0.21 made `RawProviderSink` capture every provider HTTP
|
|
142
|
+
* envelope. 0.22's `runEvalCampaign` makes capture the default. Together
|
|
143
|
+
* they mean every past run is a complete fingerprint of what happened on
|
|
144
|
+
* the wire — and that fingerprint is enough to replay the run without
|
|
145
|
+
* burning new LLM cost.
|
|
140
146
|
*
|
|
141
|
-
*
|
|
142
|
-
* usage. It's what dashboards read; it's NOT enough for forensics.
|
|
143
|
-
* - When a downstream consumer reports "the verifier used the wrong route"
|
|
144
|
-
* or "tokens look right but reasoning was missing," the only way to
|
|
145
|
-
* answer is the raw HTTP body. Span fields can lie (a proxy can echo
|
|
146
|
-
* a different `model` value than what actually answered); the raw
|
|
147
|
-
* response is ground truth.
|
|
147
|
+
* Three use cases this primitive enables:
|
|
148
148
|
*
|
|
149
|
-
*
|
|
150
|
-
*
|
|
151
|
-
*
|
|
152
|
-
*
|
|
149
|
+
* 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
|
|
150
|
+
* to last week's runs without re-calling any LLM. The cost of trying
|
|
151
|
+
* a new rubric drops from "another full sweep" to a CPU-bound replay.
|
|
152
|
+
* 2. **Determinism audits** — replay the same campaign and verify the
|
|
153
|
+
* raw responses match byte-for-byte. Any drift is a non-determinism
|
|
154
|
+
* bug (in the harness, the prompt builder, the sandbox, …).
|
|
155
|
+
* 3. **Free judge calibration** — run two judges on identical responses
|
|
156
|
+
* and measure inter-judge agreement without doubling LLM spend.
|
|
153
157
|
*
|
|
154
|
-
*
|
|
155
|
-
*
|
|
156
|
-
*
|
|
157
|
-
*
|
|
158
|
-
* the per-call `redactor`. The `redactedFields` array on the persisted
|
|
159
|
-
* event lets a reviewer see what was stripped without exposing the values.
|
|
158
|
+
* The interface is deliberately fetch-shaped. Inject `createReplayFetch`
|
|
159
|
+
* into `LlmClientOptions.fetch` and every `callLlm` transparently reads
|
|
160
|
+
* from the cache instead of calling the network. No new code path through
|
|
161
|
+
* the LLM client is needed; the cache hit is invisible to the runner.
|
|
160
162
|
*/
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
runId?: string;
|
|
167
|
-
spanId?: string;
|
|
168
|
-
/**
|
|
169
|
-
* Logical provider name. Free-form so callers can use whatever id matches
|
|
170
|
-
* their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
|
|
171
|
-
* omitted, derived from `baseUrl` in `LlmClientOptions`.
|
|
172
|
-
*/
|
|
173
|
-
provider: string;
|
|
174
|
-
model: string;
|
|
175
|
-
/** Endpoint path, e.g. `'/v1/chat/completions'`. */
|
|
176
|
-
endpoint: string;
|
|
177
|
-
/** Base URL used for the call (already-normalised — no trailing slash). */
|
|
178
|
-
baseUrl: string;
|
|
179
|
-
/** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
|
|
180
|
-
attemptIndex: number;
|
|
181
|
-
direction: RawProviderDirection;
|
|
182
|
-
/** Unix ms. */
|
|
183
|
-
timestamp: number;
|
|
184
|
-
/** Wall-clock duration of the call leg. Set on `response` and `error` events; omitted on `request`. */
|
|
185
|
-
durationMs?: number;
|
|
186
|
-
statusCode?: number;
|
|
187
|
-
requestHeaders?: Record<string, string>;
|
|
188
|
-
requestBody?: unknown;
|
|
189
|
-
responseHeaders?: Record<string, string>;
|
|
190
|
-
responseBody?: unknown;
|
|
191
|
-
/** Set on `direction: 'error'` events. */
|
|
192
|
-
errorMessage?: string;
|
|
193
|
-
/** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
|
|
194
|
-
redactedFields: string[];
|
|
195
|
-
}
|
|
196
|
-
interface RawProviderSinkFilter {
|
|
197
|
-
runId?: string;
|
|
198
|
-
spanId?: string;
|
|
199
|
-
direction?: RawProviderDirection;
|
|
200
|
-
attemptIndex?: number;
|
|
163
|
+
|
|
164
|
+
declare class ReplayCacheMissError extends Error {
|
|
165
|
+
readonly url: string;
|
|
166
|
+
readonly requestKey: string;
|
|
167
|
+
constructor(url: string, requestKey: string, message?: string);
|
|
201
168
|
}
|
|
202
|
-
interface
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
206
|
-
/** Optional teardown for backed implementations. */
|
|
207
|
-
close?(): Promise<void>;
|
|
169
|
+
interface ReplayCacheEntry {
|
|
170
|
+
request: RawProviderEvent;
|
|
171
|
+
response: RawProviderEvent;
|
|
208
172
|
}
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
|
|
216
|
-
interface InMemoryRawProviderSinkOptions {
|
|
217
|
-
redactor?: ProviderRedactor;
|
|
218
|
-
}
|
|
219
|
-
declare class InMemoryRawProviderSink implements RawProviderSink {
|
|
220
|
-
private events;
|
|
221
|
-
private redactor;
|
|
222
|
-
constructor(opts?: InMemoryRawProviderSinkOptions);
|
|
223
|
-
record(event: RawProviderEvent): Promise<void>;
|
|
224
|
-
list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
225
|
-
size(): number;
|
|
226
|
-
}
|
|
227
|
-
declare class NoopRawProviderSink implements RawProviderSink {
|
|
228
|
-
record(): Promise<void>;
|
|
229
|
-
}
|
|
230
|
-
interface FileSystemRawProviderSinkOptions {
|
|
231
|
-
/** Directory the NDJSON file is written into. Created if missing. */
|
|
232
|
-
dir: string;
|
|
233
|
-
/** File name; default `'raw-provider-events.ndjson'`. */
|
|
234
|
-
fileName?: string;
|
|
235
|
-
/** Bytes after which the writer rolls over to a new file (default 32 MiB). */
|
|
236
|
-
rollAtBytes?: number;
|
|
237
|
-
redactor?: ProviderRedactor;
|
|
238
|
-
}
|
|
239
|
-
declare class FileSystemRawProviderSink implements RawProviderSink {
|
|
240
|
-
private dir;
|
|
241
|
-
private fileName;
|
|
242
|
-
private rollAtBytes;
|
|
243
|
-
private redactor;
|
|
244
|
-
private bytesWritten;
|
|
245
|
-
private rollIndex;
|
|
246
|
-
private initPromise;
|
|
247
|
-
constructor(opts: FileSystemRawProviderSinkOptions);
|
|
248
|
-
private ensureInit;
|
|
249
|
-
private currentPath;
|
|
250
|
-
record(event: RawProviderEvent): Promise<void>;
|
|
251
|
-
list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
173
|
+
interface ReplayCacheStats {
|
|
174
|
+
total: number;
|
|
175
|
+
byProvider: Record<string, number>;
|
|
176
|
+
byModel: Record<string, number>;
|
|
177
|
+
/** Spans for which we have a request but no response (run aborted mid-call). */
|
|
178
|
+
orphanRequests: number;
|
|
252
179
|
}
|
|
253
180
|
/**
|
|
254
|
-
*
|
|
255
|
-
*
|
|
256
|
-
|
|
257
|
-
declare function providerFromBaseUrl(baseUrl: string): string;
|
|
258
|
-
|
|
259
|
-
/**
|
|
260
|
-
* Run-completion integrity check — at end of run, verify the expected event
|
|
261
|
-
* types were actually captured. The point is the launch-review failure mode:
|
|
262
|
-
* a run *appears* successful but the raw provider events were never written,
|
|
263
|
-
* so a downstream reviewer can't reconstruct what happened.
|
|
181
|
+
* In-memory deterministic cache of (request → response) keyed on a stable
|
|
182
|
+
* hash of the request body. Built from a `RawProviderSink` containing
|
|
183
|
+
* paired `request` and `response` events from a previous run.
|
|
264
184
|
*
|
|
265
|
-
*
|
|
266
|
-
*
|
|
267
|
-
* const report = await assertRunCaptured(store, runId, {
|
|
268
|
-
* llmSpansMin: 1,
|
|
269
|
-
* judgeSpansMin: 1,
|
|
270
|
-
* rawSink: providerSink, // must have ≥ 1 event for this run
|
|
271
|
-
* requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
|
|
272
|
-
* })
|
|
273
|
-
* if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
|
|
274
|
-
*
|
|
275
|
-
* The function is read-only on the store and returns a structured report;
|
|
276
|
-
* the caller chooses the failure mode (throw, mark run failed, log warning).
|
|
277
|
-
* `throwIfRunIncomplete` is the convenient strict mode.
|
|
185
|
+
* The cache is the source of truth for replay; `createReplayFetch` is a
|
|
186
|
+
* thin wrapper that reads from it.
|
|
278
187
|
*/
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
judgeSpansMin?: number;
|
|
285
|
-
/** Minimum tool span count. Default 0. */
|
|
286
|
-
toolSpansMin?: number;
|
|
188
|
+
declare class ReplayCache {
|
|
189
|
+
private byKey;
|
|
190
|
+
private orphans;
|
|
191
|
+
private byProvider;
|
|
192
|
+
private byModel;
|
|
287
193
|
/**
|
|
288
|
-
*
|
|
289
|
-
*
|
|
194
|
+
* Build a cache from a sink's events. The sink must implement `list()`.
|
|
195
|
+
* Filter by `runId` / `spanId` to scope to a specific replay.
|
|
290
196
|
*/
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
197
|
+
static fromSink(sink: RawProviderSink, filter?: {
|
|
198
|
+
runId?: string;
|
|
199
|
+
spanId?: string;
|
|
200
|
+
}): Promise<ReplayCache>;
|
|
201
|
+
/** Build a cache from an in-memory event list. */
|
|
202
|
+
static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
|
|
203
|
+
/** Number of cacheable (request, response) pairs in the cache. */
|
|
204
|
+
size(): number;
|
|
205
|
+
stats(): ReplayCacheStats;
|
|
294
206
|
/**
|
|
295
|
-
*
|
|
296
|
-
*
|
|
297
|
-
*
|
|
207
|
+
* Look up a cached response by hashing the (model, messages, temperature,
|
|
208
|
+
* maxTokens, response_format) shape. Returns `undefined` on miss; the
|
|
209
|
+
* caller decides whether to throw, fall back to the network, or skip.
|
|
298
210
|
*/
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
}
|
|
303
|
-
type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
|
|
304
|
-
interface RunIntegrityIssue {
|
|
305
|
-
code: RunIntegrityIssueCode;
|
|
306
|
-
message: string;
|
|
307
|
-
detail?: Record<string, unknown>;
|
|
308
|
-
}
|
|
309
|
-
interface RunIntegrityReport {
|
|
310
|
-
ok: boolean;
|
|
311
|
-
runId: string;
|
|
312
|
-
llmSpanCount: number;
|
|
313
|
-
judgeSpanCount: number;
|
|
314
|
-
toolSpanCount: number;
|
|
315
|
-
rawProviderEventCount: number;
|
|
211
|
+
lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
|
|
212
|
+
}
|
|
213
|
+
interface ReplayFetchOptions {
|
|
316
214
|
/**
|
|
317
|
-
*
|
|
318
|
-
* `
|
|
319
|
-
*
|
|
215
|
+
* Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
|
|
216
|
+
* `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
|
|
217
|
+
* still complete; `'fail-closed'` returns a synthetic 599 response so the
|
|
218
|
+
* call site sees a non-retriable failure.
|
|
320
219
|
*/
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
220
|
+
onMiss?: 'throw' | 'fallback' | 'fail-closed';
|
|
221
|
+
fallbackFetch?: typeof fetch;
|
|
222
|
+
/** Optional callback fired once per replayed call (for telemetry / counters). */
|
|
223
|
+
onHit?: (info: {
|
|
224
|
+
url: string;
|
|
225
|
+
provider: string;
|
|
226
|
+
model: string;
|
|
227
|
+
}) => void;
|
|
228
|
+
/** Optional callback fired on cache miss before the `onMiss` policy applies. */
|
|
229
|
+
onMissNotify?: (info: {
|
|
230
|
+
url: string;
|
|
231
|
+
requestBody: unknown;
|
|
232
|
+
}) => void;
|
|
330
233
|
}
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
234
|
+
/**
|
|
235
|
+
* Build a `fetch`-shaped function that serves cached responses out of a
|
|
236
|
+
* `ReplayCache` for any URL ending in `/chat/completions`. Pass it through
|
|
237
|
+
* `LlmClientOptions.fetch` and `callLlm` becomes free.
|
|
238
|
+
*
|
|
239
|
+
* Non-`/chat/completions` URLs are passed straight to the fallback fetch
|
|
240
|
+
* (default: `globalThis.fetch`). This matters because non-LLM HTTP work
|
|
241
|
+
* (judge HTTP servers, sandbox callbacks) sometimes flows through the same
|
|
242
|
+
* `fetch` and shouldn't be intercepted.
|
|
243
|
+
*/
|
|
244
|
+
declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
|
|
245
|
+
/**
|
|
246
|
+
* Convenience iterator over `(request, response)` pairs in a sink — for
|
|
247
|
+
* post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
|
|
248
|
+
* runs purely in-process over cached LLM outputs.
|
|
249
|
+
*/
|
|
250
|
+
declare function iterateRawCalls(sink: RawProviderSink, filter?: {
|
|
251
|
+
runId?: string;
|
|
252
|
+
spanId?: string;
|
|
253
|
+
}): AsyncGenerator<ReplayCacheEntry>;
|
|
334
254
|
|
|
335
255
|
/**
|
|
336
256
|
* Shared types for the trace-analyst module.
|
|
@@ -911,4 +831,4 @@ declare function scoreTraceInsightReadiness(context: TraceInsightContext): Trace
|
|
|
911
831
|
declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
|
|
912
832
|
declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
|
|
913
833
|
|
|
914
|
-
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass,
|
|
834
|
+
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, Run, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|
package/dist/traces.js
CHANGED
|
@@ -7,7 +7,8 @@ import {
|
|
|
7
7
|
OTEL_AGENT_EVAL_SCOPE,
|
|
8
8
|
OtlpFileTraceStore,
|
|
9
9
|
REDACTION_VERSION,
|
|
10
|
-
|
|
10
|
+
ReplayCache,
|
|
11
|
+
ReplayCacheMissError,
|
|
11
12
|
SpanNotFoundError,
|
|
12
13
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
13
14
|
TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
|
|
@@ -19,10 +20,10 @@ import {
|
|
|
19
20
|
aggregateLlm,
|
|
20
21
|
analyzeTraces,
|
|
21
22
|
argHash,
|
|
22
|
-
assertRunCaptured,
|
|
23
23
|
buildTraceAnalystTools,
|
|
24
24
|
buildTraceInsightContext,
|
|
25
25
|
buildTraceInsightPrompt,
|
|
26
|
+
createReplayFetch,
|
|
26
27
|
defaultTraceInsightPanel,
|
|
27
28
|
describeTraceInsightScope,
|
|
28
29
|
domainEvidencePattern,
|
|
@@ -34,6 +35,7 @@ import {
|
|
|
34
35
|
isRetrievalSpan,
|
|
35
36
|
isSandboxSpan,
|
|
36
37
|
isToolSpan,
|
|
38
|
+
iterateRawCalls,
|
|
37
39
|
judgeSpans,
|
|
38
40
|
llmSpans,
|
|
39
41
|
planTraceInsightQuestions,
|
|
@@ -42,23 +44,28 @@ import {
|
|
|
42
44
|
runFailureClass,
|
|
43
45
|
runsForScenario,
|
|
44
46
|
scoreTraceInsightReadiness,
|
|
45
|
-
throwIfRunIncomplete,
|
|
46
47
|
tokenizeDomainWords,
|
|
47
48
|
toolSpans,
|
|
48
49
|
traceAnalystFunctionGroup,
|
|
49
50
|
traceAnalystOnRunComplete
|
|
50
|
-
} from "./chunk-
|
|
51
|
+
} from "./chunk-4W4NCYM2.js";
|
|
51
52
|
import {
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
53
|
+
RunIntegrityError,
|
|
54
|
+
assertRunCaptured,
|
|
55
|
+
throwIfRunIncomplete
|
|
56
|
+
} from "./chunk-QUKKGHTZ.js";
|
|
55
57
|
import {
|
|
56
58
|
FileSystemRawProviderSink,
|
|
57
59
|
InMemoryRawProviderSink,
|
|
58
60
|
NoopRawProviderSink,
|
|
59
61
|
defaultProviderRedactor,
|
|
60
62
|
providerFromBaseUrl
|
|
61
|
-
} from "./chunk-
|
|
63
|
+
} from "./chunk-SQQLHODJ.js";
|
|
64
|
+
import {
|
|
65
|
+
TraceEmitter,
|
|
66
|
+
llmSpanFromProvider
|
|
67
|
+
} from "./chunk-5IIQKMD5.js";
|
|
68
|
+
import "./chunk-6M774GY6.js";
|
|
62
69
|
import "./chunk-PZ5AY32C.js";
|
|
63
70
|
export {
|
|
64
71
|
DEFAULT_REDACTION_RULES,
|
|
@@ -72,6 +79,8 @@ export {
|
|
|
72
79
|
OTEL_AGENT_EVAL_SCOPE,
|
|
73
80
|
OtlpFileTraceStore,
|
|
74
81
|
REDACTION_VERSION,
|
|
82
|
+
ReplayCache,
|
|
83
|
+
ReplayCacheMissError,
|
|
75
84
|
RunIntegrityError,
|
|
76
85
|
SpanNotFoundError,
|
|
77
86
|
TRACE_ANALYST_ACTOR_DESCRIPTION,
|
|
@@ -89,6 +98,7 @@ export {
|
|
|
89
98
|
buildTraceAnalystTools,
|
|
90
99
|
buildTraceInsightContext,
|
|
91
100
|
buildTraceInsightPrompt,
|
|
101
|
+
createReplayFetch,
|
|
92
102
|
defaultProviderRedactor,
|
|
93
103
|
defaultTraceInsightPanel,
|
|
94
104
|
describeTraceInsightScope,
|
|
@@ -101,6 +111,7 @@ export {
|
|
|
101
111
|
isRetrievalSpan,
|
|
102
112
|
isSandboxSpan,
|
|
103
113
|
isToolSpan,
|
|
114
|
+
iterateRawCalls,
|
|
104
115
|
judgeSpans,
|
|
105
116
|
llmSpanFromProvider,
|
|
106
117
|
llmSpans,
|
package/dist/wire/index.js
CHANGED
|
@@ -24,9 +24,9 @@ import {
|
|
|
24
24
|
runRpcBatch,
|
|
25
25
|
runRpcOnce,
|
|
26
26
|
startServer
|
|
27
|
-
} from "../chunk-
|
|
28
|
-
import "../chunk-
|
|
29
|
-
import "../chunk-
|
|
27
|
+
} from "../chunk-6KQG5HAH.js";
|
|
28
|
+
import "../chunk-KAO3Q65R.js";
|
|
29
|
+
import "../chunk-SQQLHODJ.js";
|
|
30
30
|
import "../chunk-PZ5AY32C.js";
|
|
31
31
|
export {
|
|
32
32
|
BUILTIN_RUBRICS,
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# Auto-research loop end-to-end
|
|
2
|
+
|
|
3
|
+
This is the runnable composition pattern that closes the loop the package
|
|
4
|
+
was originally designed for: capture-integrity → eval → preferences →
|
|
5
|
+
mutation → improved candidate → repeat.
|
|
6
|
+
|
|
7
|
+
There's no new orchestrator primitive that runs this for you (and we
|
|
8
|
+
deliberately resisted shipping one — every consumer's loop has different
|
|
9
|
+
invariants). What this doc gives you is **the integration recipe**: the
|
|
10
|
+
imports, the wiring, and the explicit invariants every iteration must
|
|
11
|
+
preserve.
|
|
12
|
+
|
|
13
|
+
A working version of this recipe lives at
|
|
14
|
+
[`examples/auto-research-with-agent-builder/`](../examples/auto-research-with-agent-builder/) —
|
|
15
|
+
runnable, ~250 lines, demonstrates the score climbing across iterations.
|
|
16
|
+
|
|
17
|
+
## The pattern
|
|
18
|
+
|
|
19
|
+
```ts
|
|
20
|
+
import {
|
|
21
|
+
runEvalCampaign,
|
|
22
|
+
analyzeOptimizationResult,
|
|
23
|
+
trialsToRunRecords,
|
|
24
|
+
PredictiveValidityResearcher,
|
|
25
|
+
} from '@tangle-network/agent-eval'
|
|
26
|
+
import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces'
|
|
27
|
+
|
|
28
|
+
async function runAutoResearchLoop(opts: {
|
|
29
|
+
task: string
|
|
30
|
+
initialVariants: Variant[]
|
|
31
|
+
scenarios: Scenario[]
|
|
32
|
+
iterations: number
|
|
33
|
+
// The thing that turns a Variant into a scoreable artifact.
|
|
34
|
+
// For agent-builder this is `runForgeBuilderSim`; for tax-agent it's
|
|
35
|
+
// their domain runner; for the multi-shot prompt evolution case it's
|
|
36
|
+
// already wired inside `runPromptEvolution`.
|
|
37
|
+
candidateRunner: CandidateRunner<Variant>
|
|
38
|
+
// The thing that proposes the next variants given the analysis output.
|
|
39
|
+
// For prompt-only optimization, this is `reflective-mutation` against
|
|
40
|
+
// the top/bottom trials. For code+prompt, this is `createCompositeMutator`.
|
|
41
|
+
// For agent-builder, this can be a hand-rolled "edit the system prompt"
|
|
42
|
+
// function — the example shows one.
|
|
43
|
+
mutator: (champion: Variant, analysis: AnalysisReport) => Promise<Variant[]>
|
|
44
|
+
// Optional: outcome store for predictive validity. When present, the
|
|
45
|
+
// loop learns which scoring rubrics actually predict deployment outcomes
|
|
46
|
+
// and reweights the composite score accordingly.
|
|
47
|
+
outcomes?: { store: OutcomeStore; metrics: string[] }
|
|
48
|
+
}): Promise<IterationReport[]> {
|
|
49
|
+
const reports: IterationReport[] = []
|
|
50
|
+
let variants = opts.initialVariants
|
|
51
|
+
|
|
52
|
+
// (Optional) standing researcher that drives rubric reweighting.
|
|
53
|
+
const researcher = opts.outcomes
|
|
54
|
+
? new PredictiveValidityResearcher({
|
|
55
|
+
outcomes: opts.outcomes.store,
|
|
56
|
+
outcomeMetrics: opts.outcomes.metrics,
|
|
57
|
+
})
|
|
58
|
+
: null
|
|
59
|
+
|
|
60
|
+
for (let iter = 0; iter < opts.iterations; iter++) {
|
|
61
|
+
// 1. Capture-integrity-by-construction matrix run.
|
|
62
|
+
const campaign = await runEvalCampaign({
|
|
63
|
+
campaignId: `auto-research-iter-${iter}`,
|
|
64
|
+
commitSha: opts.task,
|
|
65
|
+
variants: variants.map((v) => ({ id: v.id, payload: v })),
|
|
66
|
+
scenarios: opts.scenarios,
|
|
67
|
+
seeds: [0, 1, 2],
|
|
68
|
+
llmOpts: { ... },
|
|
69
|
+
storeFactory: () => new InMemoryTraceStore(),
|
|
70
|
+
rawSinkFactory: () => new InMemoryRawProviderSink(),
|
|
71
|
+
runner: makeCampaignRunner(opts.candidateRunner),
|
|
72
|
+
onRunComplete: opts.outcomes
|
|
73
|
+
? [traceAnalystOnRunComplete({ analyze: ..., save: ... })]
|
|
74
|
+
: [],
|
|
75
|
+
report: { comparator: variants[0]!.id },
|
|
76
|
+
})
|
|
77
|
+
|
|
78
|
+
// 2. RL-bridge analysis: preferences, verifiable rewards, sequential
|
|
79
|
+
// interim verdict, reward-hacking diagnosis.
|
|
80
|
+
const analysis = await analyzeOptimizationResult({
|
|
81
|
+
result: pretendItsAPromptEvolution(campaign),
|
|
82
|
+
ctx: { experimentId: 'task', model: '...', commitSha: '...', promptHash: '...', configHash: '...' },
|
|
83
|
+
comparator: variants[0]!.id,
|
|
84
|
+
outcomes: opts.outcomes,
|
|
85
|
+
})
|
|
86
|
+
|
|
87
|
+
// 3. Periodic rubric recalibration via predictive validity.
|
|
88
|
+
if (researcher && iter > 0 && iter % 5 === 0) {
|
|
89
|
+
await researcher.runValidityCheck(campaign.runs)
|
|
90
|
+
// The researcher's `proposeChange` output can be folded into the
|
|
91
|
+
// mutator as a steering signal in the next iteration.
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// 4. Pick champion + record this iteration.
|
|
95
|
+
const champion = pickChampion(campaign.runs)
|
|
96
|
+
reports.push({ iter, champion, score: champion.score, analysis })
|
|
97
|
+
|
|
98
|
+
// 5. Sequential stop: the anytime-valid e-value can decisively call
|
|
99
|
+
// 'promote_now' or 'reject_now' before the iterations are exhausted.
|
|
100
|
+
if (analysis.interimConfidence?.recommendation.decision === 'promote_now') {
|
|
101
|
+
break
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// 6. Propose next variants via the mutator.
|
|
105
|
+
if (iter < opts.iterations - 1) {
|
|
106
|
+
variants = await opts.mutator(champion.variant, analysis)
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
return reports
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Invariants every iteration must preserve
|
|
115
|
+
|
|
116
|
+
1. **The campaign produces `RunRecord[]` with `scenarioId` populated.** Every
|
|
117
|
+
downstream primitive (preferences, sequential, predictive validity,
|
|
118
|
+
tournament) keys on this. `runEvalCampaign` populates it canonically;
|
|
119
|
+
if you adapt from `runPromptEvolution` use `trialsToRunRecords`.
|
|
120
|
+
|
|
121
|
+
2. **Capture is wired by construction.** Don't pass `NoopRawProviderSink`
|
|
122
|
+
to `rawSinkFactory` unless the iteration is exploratory. Every
|
|
123
|
+
captured run is replayable, every replayable run is free judge-iteration
|
|
124
|
+
data for the next loop.
|
|
125
|
+
|
|
126
|
+
3. **`commitSha` is real.** It's how downstream tooling (predictive
|
|
127
|
+
validity, contamination probe, tournament) ties iterations together.
|
|
128
|
+
|
|
129
|
+
4. **The comparator is stable across iterations.** Either the original
|
|
130
|
+
`baseline` or whichever champion you froze. Shifting the comparator
|
|
131
|
+
between iterations corrupts the paired-delta semantics.
|
|
132
|
+
|
|
133
|
+
5. **The mutator is deterministic given the analysis output.** Otherwise
|
|
134
|
+
the iteration isn't reproducible and the auto-research artifacts
|
|
135
|
+
become unfalsifiable. If you need stochastic mutation, seed the
|
|
136
|
+
mutator and emit the seed onto the run record.
|
|
137
|
+
|
|
138
|
+
## When to run each primitive
|
|
139
|
+
|
|
140
|
+
| Frequency | Primitive | Why |
|
|
141
|
+
|---|---|---|
|
|
142
|
+
| Every iteration | `runEvalCampaign` | core measurement |
|
|
143
|
+
| Every iteration | `analyzeOptimizationResult` | preferences + verifiable rewards + reward-hacking |
|
|
144
|
+
| Every iteration | `evaluateInterimReleaseConfidence` (via `analyzeOptimizationResult`) | anytime-valid stop signal |
|
|
145
|
+
| Every 5–10 iterations | `rubricPredictiveValidity` | rubric weights drift; recalibrate |
|
|
146
|
+
| Every release | `runContaminationProbe` | scenario set freshness |
|
|
147
|
+
| Once per task | `runComputeCurve` | cost-quality frontier |
|
|
148
|
+
| As-needed | `adversarialScenarioSearch` | discover failure modes the curated set missed |
|
|
149
|
+
|
|
150
|
+
## When to drop into the smaller primitives
|
|
151
|
+
|
|
152
|
+
Two cases:
|
|
153
|
+
|
|
154
|
+
1. **Trajectory-shaped optimization with steering.** Use
|
|
155
|
+
`runMultiShotOptimization` directly — it already runs the inner
|
|
156
|
+
search-vs-holdout loop. Wrap with `analyzeOptimizationResult` after
|
|
157
|
+
for the RL bridge.
|
|
158
|
+
|
|
159
|
+
2. **Prompt + code evolution with sandboxed code mutation.** Use
|
|
160
|
+
`runPromptEvolution` + `createCompositeMutator` directly. Same wrap
|
|
161
|
+
pattern.
|
|
162
|
+
|
|
163
|
+
The auto-research loop above wraps these primitives in a higher-level
|
|
164
|
+
loop that runs them across multiple campaigns. They're each one tick of
|
|
165
|
+
the bigger loop.
|
|
166
|
+
|
|
167
|
+
## What this does NOT do
|
|
168
|
+
|
|
169
|
+
- It doesn't fine-tune model weights. That's the
|
|
170
|
+
[`fine-tune-with-prime-rl`](../examples/fine-tune-with-prime-rl/) example
|
|
171
|
+
— separate concern, separate trainer.
|
|
172
|
+
- It doesn't drive a production deployment decision on its own. The
|
|
173
|
+
artifacts feed a launch-review process (humans, the `researchReport`
|
|
174
|
+
output, the `assertReleaseConfidence` gate). Loop ≠ promotion gate.
|
|
175
|
+
- It doesn't substitute for a real preregistration trail. The
|
|
176
|
+
`preregistrationHash` field on the report exists so iterations can be
|
|
177
|
+
audited, but the auto-research loop *is* iterative and post-hoc by
|
|
178
|
+
definition. Use the standing `assertReleaseConfidence` gate at the
|
|
179
|
+
release boundary; use the auto-research loop everywhere upstream of it.
|
|
180
|
+
|
|
181
|
+
## Reading order for the example
|
|
182
|
+
|
|
183
|
+
1. [`examples/auto-research-with-agent-builder/README.md`](../examples/auto-research-with-agent-builder/README.md) — architectural picture.
|
|
184
|
+
2. [`examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`](../examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts) — runnable demo.
|
|
185
|
+
3. Run it: `npx tsx examples/auto-research-with-agent-builder/auto-research-with-agent-builder.ts`.
|
|
186
|
+
It prints the iteration progression and the score climbing.
|
|
@@ -113,15 +113,30 @@ risks list and the executive summary. Treat them as descriptive only.
|
|
|
113
113
|
- **Unpaired Mann–Whitney.** Rejected: matched scenarios make pairing free,
|
|
114
114
|
and unpaired tests throw away the variance reduction. Use the paired test
|
|
115
115
|
by default.
|
|
116
|
-
- **Sequential / always-valid inference (e-values,
|
|
117
|
-
|
|
118
|
-
|
|
116
|
+
- **Sequential / always-valid inference (e-values, alpha-spending).**
|
|
117
|
+
**Shipped in 0.22.** `pairedEvalueSequence` and
|
|
118
|
+
`evaluateInterimReleaseConfidence` provide time-uniform inference using
|
|
119
|
+
the predictable plug-in betting martingale (Waudby-Smith & Ramdas 2024)
|
|
120
|
+
paired with the empirical Bernstein confidence sequence (Howard et al.
|
|
121
|
+
2021). For *rolling* analyses (interim looks at a campaign that's still
|
|
122
|
+
accumulating data) call those primitives directly; `researchReport`
|
|
123
|
+
remains the single-look summary. Paper-grade pre-registration covers the
|
|
124
|
+
static analysis; the sequential primitives cover the iterative one.
|
|
119
125
|
- **Hierarchical Bayesian shrinkage across many candidates.** Future work.
|
|
120
126
|
The current ranking is based on raw paired statistics and over-credits the top
|
|
121
|
-
candidate when many are tested.
|
|
127
|
+
candidate when many are tested. A Bayesian hierarchical model with a
|
|
128
|
+
weakly informative prior would shrink each variant toward the grand mean,
|
|
129
|
+
reducing rank flips between near-tied candidates.
|
|
122
130
|
- **Calibration / coverage simulation on the bootstrap CI.** Future work; we
|
|
123
131
|
rely on the asymptotic guarantee plus the hard pair floor to keep coverage
|
|
124
132
|
reasonable.
|
|
133
|
+
- **Outcome-anchored calibration.** **Shipped in 0.22.**
|
|
134
|
+
`rubricPredictiveValidity` joins `RunRecord`s to a `DeploymentOutcomeStore`
|
|
135
|
+
and reports the per-rubric Spearman rank correlation against deployment outcomes (revenue,
|
|
136
|
+
retention, CSAT, …). Combined with the static methodology in this
|
|
137
|
+
document, the loop is: pre-register → measure with `researchReport` →
|
|
138
|
+
ship → observe outcomes → recalibrate rubric weights with
|
|
139
|
+
`rubricPredictiveValidity`.
|
|
125
140
|
|
|
126
141
|
## When NOT to apply
|
|
127
142
|
|