@tangle-network/agent-eval 0.20.11 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/README.md +137 -170
- package/dist/benchmarks/index.d.ts +2 -1
- package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
- package/dist/chunk-3GN6U53I.js.map +1 -0
- package/dist/chunk-3IX6QTB7.js +1349 -0
- package/dist/chunk-3IX6QTB7.js.map +1 -0
- package/dist/chunk-5IIQKMD5.js +236 -0
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/chunk-ARZ6BEV6.js +1310 -0
- package/dist/chunk-ARZ6BEV6.js.map +1 -0
- package/dist/chunk-HRZELXCR.js +1354 -0
- package/dist/chunk-HRZELXCR.js.map +1 -0
- package/dist/chunk-KRR4VMH7.js +423 -0
- package/dist/chunk-KRR4VMH7.js.map +1 -0
- package/dist/chunk-SNUHRBDL.js +154 -0
- package/dist/chunk-SNUHRBDL.js.map +1 -0
- package/dist/chunk-WOK2RTWG.js +1920 -0
- package/dist/chunk-WOK2RTWG.js.map +1 -0
- package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
- package/dist/chunk-YUFXO3TU.js +148 -0
- package/dist/chunk-YUFXO3TU.js.map +1 -0
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/control-cxwMOAsy.d.ts +259 -0
- package/dist/control.d.ts +6 -0
- package/dist/control.js +30 -0
- package/dist/control.js.map +1 -0
- package/dist/dataset-B9qvlm_o.d.ts +112 -0
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
- package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
- package/dist/index.d.ts +178 -2945
- package/dist/index.js +1066 -6185
- package/dist/index.js.map +1 -1
- package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +146 -0
- package/dist/optimization.js +60 -0
- package/dist/optimization.js.map +1 -0
- package/dist/reporting-Da2ihlcM.d.ts +672 -0
- package/dist/reporting.d.ts +5 -0
- package/dist/reporting.js +36 -0
- package/dist/reporting.js.map +1 -0
- package/dist/run-record-CX_jcAyr.d.ts +134 -0
- package/dist/store-u47QaJ9G.d.ts +297 -0
- package/dist/traces.d.ts +914 -0
- package/dist/traces.js +120 -0
- package/dist/traces.js.map +1 -0
- package/dist/wire/index.js +3 -2
- package/docs/concepts.md +16 -11
- package/docs/feature-guide.md +10 -17
- package/docs/integration-launch-gates.md +77 -0
- package/docs/product-eval-adoption.md +27 -0
- package/docs/research-report-methodology.md +155 -0
- package/docs/trace-analysis.md +75 -0
- package/package.json +30 -12
- package/dist/chunk-JAOLXRIA.js.map +0 -1
- /package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
package/dist/traces.d.ts
ADDED
|
@@ -0,0 +1,914 @@
|
|
|
1
|
+
import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureClass, c as ToolSpan } from './store-u47QaJ9G.js';
|
|
2
|
+
export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
|
|
3
|
+
import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
|
|
4
|
+
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
|
|
5
|
+
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Typed query helpers over TraceStore.
|
|
9
|
+
*
|
|
10
|
+
* Not a full SQL engine — a minimal, composable set of operators that
|
|
11
|
+
* cover the canned-pipeline use cases. For ad-hoc analytics, persist to
|
|
12
|
+
* NDJSON and point DuckDB at it; the schema is stable so external SQL
|
|
13
|
+
* tooling works out of the box.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
|
|
17
|
+
declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
|
|
18
|
+
declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
|
|
19
|
+
declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
|
|
20
|
+
/** Group items (e.g. spans) by any key selector. */
|
|
21
|
+
declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
|
|
22
|
+
/** Hash tool arguments to a stable string that does not depend on object key order, for de-duplication. */
|
|
23
|
+
declare function argHash(args: unknown): string;
|
|
24
|
+
/** Sum an LLM-span array into aggregate token + cost. */
|
|
25
|
+
declare function aggregateLlm(spans: LlmSpan[]): {
|
|
26
|
+
inputTokens: number;
|
|
27
|
+
outputTokens: number;
|
|
28
|
+
cachedTokens: number;
|
|
29
|
+
costUsd: number;
|
|
30
|
+
};
|
|
31
|
+
/** Pick the outcome's failure class when present, else derive 'success' from run status. */
|
|
32
|
+
declare function runFailureClass(run: Run): FailureClass;
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Redaction — remove PII / secrets from trace payloads before persist.
|
|
36
|
+
*
|
|
37
|
+
* Pre-persistence rules mean raw traces in storage are already scrubbed.
|
|
38
|
+
* Unredacted variants (for debugging / post-mortems) live in a separate
|
|
39
|
+
* storage layer with stricter access controls; this module only covers
|
|
40
|
+
* the default scrub-then-persist path.
|
|
41
|
+
*
|
|
42
|
+
* Rules compose: pass an array of `RedactionRule`, each is applied in
|
|
43
|
+
* order. Strings that match get replaced with a tagged sentinel so the
|
|
44
|
+
* eval framework can count how many redactions happened per run
|
|
45
|
+
* (surfaced via `redaction_applied` events).
|
|
46
|
+
*/
|
|
47
|
+
interface RedactionRule {
|
|
48
|
+
id: string;
|
|
49
|
+
pattern: RegExp;
|
|
50
|
+
/** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
|
|
51
|
+
replacement?: string;
|
|
52
|
+
}
|
|
53
|
+
interface RedactionReport {
|
|
54
|
+
redactionCount: number;
|
|
55
|
+
byRule: Record<string, number>;
|
|
56
|
+
}
|
|
57
|
+
/** OWASP / common-sense defaults — extend per-domain. */
|
|
58
|
+
declare const DEFAULT_REDACTION_RULES: RedactionRule[];
|
|
59
|
+
declare const REDACTION_VERSION = "1.0.0";
|
|
60
|
+
/**
|
|
61
|
+
* Redact a single string. Returns the new string and a per-rule count of
|
|
62
|
+
* how many substitutions fired.
|
|
63
|
+
*/
|
|
64
|
+
declare function redactString(input: string, rules?: RedactionRule[]): {
|
|
65
|
+
output: string;
|
|
66
|
+
report: RedactionReport;
|
|
67
|
+
};
|
|
68
|
+
/**
|
|
69
|
+
* Walk a JSON-ish value applying `redactString` to every string leaf.
|
|
70
|
+
* Arrays and plain objects are recursed; other types pass through
|
|
71
|
+
* untouched. Circular references throw — traces should be tree-shaped.
|
|
72
|
+
*/
|
|
73
|
+
declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
|
|
74
|
+
value: unknown;
|
|
75
|
+
report: RedactionReport;
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
|
|
80
|
+
* traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
|
|
81
|
+
*
|
|
82
|
+
* Wire format only. We do NOT depend on the @opentelemetry SDK — that
|
|
83
|
+
* would drag in polyfills incompatible with Workers/Edge. Consumers
|
|
84
|
+
* push the JSON to their collector of choice via HTTP.
|
|
85
|
+
*
|
|
86
|
+
* Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
|
|
87
|
+
*/
|
|
88
|
+
|
|
89
|
+
declare const OTEL_AGENT_EVAL_SCOPE: {
|
|
90
|
+
name: string;
|
|
91
|
+
version: string;
|
|
92
|
+
};
|
|
93
|
+
interface OtlpSpan {
|
|
94
|
+
traceId: string;
|
|
95
|
+
spanId: string;
|
|
96
|
+
parentSpanId?: string;
|
|
97
|
+
name: string;
|
|
98
|
+
kind: number;
|
|
99
|
+
startTimeUnixNano: string;
|
|
100
|
+
endTimeUnixNano: string;
|
|
101
|
+
attributes: Array<{
|
|
102
|
+
key: string;
|
|
103
|
+
value: {
|
|
104
|
+
stringValue?: string;
|
|
105
|
+
intValue?: string;
|
|
106
|
+
doubleValue?: number;
|
|
107
|
+
boolValue?: boolean;
|
|
108
|
+
};
|
|
109
|
+
}>;
|
|
110
|
+
events?: Array<{
|
|
111
|
+
timeUnixNano: string;
|
|
112
|
+
name: string;
|
|
113
|
+
attributes?: OtlpSpan['attributes'];
|
|
114
|
+
}>;
|
|
115
|
+
status?: {
|
|
116
|
+
code: number;
|
|
117
|
+
message?: string;
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
interface OtlpResourceSpans {
|
|
121
|
+
resource: {
|
|
122
|
+
attributes: OtlpSpan['attributes'];
|
|
123
|
+
};
|
|
124
|
+
scopeSpans: Array<{
|
|
125
|
+
scope: typeof OTEL_AGENT_EVAL_SCOPE;
|
|
126
|
+
spans: OtlpSpan[];
|
|
127
|
+
}>;
|
|
128
|
+
}
|
|
129
|
+
interface OtlpExport {
|
|
130
|
+
resourceSpans: OtlpResourceSpans[];
|
|
131
|
+
}
|
|
132
|
+
/** Export a single run's spans + events in OTLP/JSON. */
|
|
133
|
+
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* RawProviderSink — first-class persistence for the actual HTTP-level
|
|
137
|
+
* request/response bodies of every LLM provider call.
|
|
138
|
+
*
|
|
139
|
+
* Why this is a separate sink from the structured `LlmSpan`:
|
|
140
|
+
*
|
|
141
|
+
* - `LlmSpan` records the *intent* — model name, messages, output text,
|
|
142
|
+
* usage. It's what dashboards read; it's NOT enough for forensics.
|
|
143
|
+
* - When a downstream consumer reports "the verifier used the wrong route"
|
|
144
|
+
* or "tokens look right but reasoning was missing," the only way to
|
|
145
|
+
* answer is the raw HTTP body. Span fields can lie (a proxy can echo
|
|
146
|
+
* a different `model` value than what actually answered); the raw
|
|
147
|
+
* response is ground truth.
|
|
148
|
+
*
|
|
149
|
+
* Default behaviour: opt-in. Pass `rawSink` to `LlmClientOptions` (or the
|
|
150
|
+
* matrix runner / BuilderSession sets it up automatically) and every
|
|
151
|
+
* request, response, and error is recorded — including retries, with the
|
|
152
|
+
* attempt index attached so a flaky call's full event chain is recoverable.
|
|
153
|
+
*
|
|
154
|
+
* Redaction is enforced at sink time. The default redactor strips
|
|
155
|
+
* `Authorization`, `X-Api-Key`, `X-Auth-Token`, `Cookie` headers and any
|
|
156
|
+
* payload field whose key matches `apiKey | api_key | bearer | password |
|
|
157
|
+
* secret | token` (case-insensitive). Override via the sink constructor or
|
|
158
|
+
* the per-call `redactor`. The `redactedFields` array on the persisted
|
|
159
|
+
* event lets a reviewer see what was stripped without exposing the values.
|
|
160
|
+
*/
|
|
161
|
+
type RawProviderDirection = 'request' | 'response' | 'error';
|
|
162
|
+
interface RawProviderEvent {
|
|
163
|
+
/** Stable id. Generated by the sink if omitted. */
|
|
164
|
+
eventId: string;
|
|
165
|
+
/** Trace context populated by `LlmClient` when the call is wrapped in a span. */
|
|
166
|
+
runId?: string;
|
|
167
|
+
spanId?: string;
|
|
168
|
+
/**
|
|
169
|
+
* Logical provider name. Free-form so callers can use whatever id matches
|
|
170
|
+
* their topology (`'openai'`, `'anthropic'`, `'tangle-router'`, …). When
|
|
171
|
+
* omitted, derived from `baseUrl` in `LlmClientOptions`.
|
|
172
|
+
*/
|
|
173
|
+
provider: string;
|
|
174
|
+
model: string;
|
|
175
|
+
/** Endpoint path, e.g. `'/v1/chat/completions'`. */
|
|
176
|
+
endpoint: string;
|
|
177
|
+
/** Base URL used for the call (already-normalised — no trailing slash). */
|
|
178
|
+
baseUrl: string;
|
|
179
|
+
/** 0-indexed retry attempt. The first attempt is 0; a retried call gets 1, 2, … */
|
|
180
|
+
attemptIndex: number;
|
|
181
|
+
direction: RawProviderDirection;
|
|
182
|
+
/** Unix ms. */
|
|
183
|
+
timestamp: number;
|
|
184
|
+
/** Wall-clock duration of the call leg. Set on `response` and `error` events; omitted (undefined) on `request`. */
|
|
185
|
+
durationMs?: number;
|
|
186
|
+
statusCode?: number;
|
|
187
|
+
requestHeaders?: Record<string, string>;
|
|
188
|
+
requestBody?: unknown;
|
|
189
|
+
responseHeaders?: Record<string, string>;
|
|
190
|
+
responseBody?: unknown;
|
|
191
|
+
/** Set on `direction: 'error'` events. */
|
|
192
|
+
errorMessage?: string;
|
|
193
|
+
/** Field paths the redactor stripped from this event ('header:Authorization', 'body.apiKey', …). */
|
|
194
|
+
redactedFields: string[];
|
|
195
|
+
}
|
|
196
|
+
interface RawProviderSinkFilter {
|
|
197
|
+
runId?: string;
|
|
198
|
+
spanId?: string;
|
|
199
|
+
direction?: RawProviderDirection;
|
|
200
|
+
attemptIndex?: number;
|
|
201
|
+
}
|
|
202
|
+
interface RawProviderSink {
|
|
203
|
+
record(event: RawProviderEvent): Promise<void>;
|
|
204
|
+
/** Optional listing — implementations that durably persist (file, db) should support this. */
|
|
205
|
+
list?(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
206
|
+
/** Optional teardown for backed implementations. */
|
|
207
|
+
close?(): Promise<void>;
|
|
208
|
+
}
|
|
209
|
+
type ProviderRedactor = (event: RawProviderEvent) => RawProviderEvent;
|
|
210
|
+
/**
|
|
211
|
+
* Default redactor — strips well-known auth headers and any body field whose
|
|
212
|
+
* key matches the credential pattern. Records every redacted path on
|
|
213
|
+
* `event.redactedFields` so a downstream reviewer can see what was removed.
|
|
214
|
+
*/
|
|
215
|
+
declare function defaultProviderRedactor(event: RawProviderEvent): RawProviderEvent;
|
|
216
|
+
interface InMemoryRawProviderSinkOptions {
|
|
217
|
+
redactor?: ProviderRedactor;
|
|
218
|
+
}
|
|
219
|
+
declare class InMemoryRawProviderSink implements RawProviderSink {
|
|
220
|
+
private events;
|
|
221
|
+
private redactor;
|
|
222
|
+
constructor(opts?: InMemoryRawProviderSinkOptions);
|
|
223
|
+
record(event: RawProviderEvent): Promise<void>;
|
|
224
|
+
list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
225
|
+
size(): number;
|
|
226
|
+
}
|
|
227
|
+
declare class NoopRawProviderSink implements RawProviderSink {
|
|
228
|
+
record(): Promise<void>;
|
|
229
|
+
}
|
|
230
|
+
interface FileSystemRawProviderSinkOptions {
|
|
231
|
+
/** Directory the NDJSON file is written into. Created if missing. */
|
|
232
|
+
dir: string;
|
|
233
|
+
/** File name; default `'raw-provider-events.ndjson'`. */
|
|
234
|
+
fileName?: string;
|
|
235
|
+
/** Bytes after which the writer rolls over to a new file (default 32 MiB). */
|
|
236
|
+
rollAtBytes?: number;
|
|
237
|
+
redactor?: ProviderRedactor;
|
|
238
|
+
}
|
|
239
|
+
declare class FileSystemRawProviderSink implements RawProviderSink {
|
|
240
|
+
private dir;
|
|
241
|
+
private fileName;
|
|
242
|
+
private rollAtBytes;
|
|
243
|
+
private redactor;
|
|
244
|
+
private bytesWritten;
|
|
245
|
+
private rollIndex;
|
|
246
|
+
private initPromise;
|
|
247
|
+
constructor(opts: FileSystemRawProviderSinkOptions);
|
|
248
|
+
private ensureInit;
|
|
249
|
+
private currentPath;
|
|
250
|
+
record(event: RawProviderEvent): Promise<void>;
|
|
251
|
+
list(filter?: RawProviderSinkFilter): Promise<RawProviderEvent[]>;
|
|
252
|
+
}
|
|
253
|
+
/**
|
|
254
|
+
* Best-effort provider id from a base URL. Falls back to the URL host when
|
|
255
|
+
* none of the well-known patterns match.
|
|
256
|
+
*/
|
|
257
|
+
declare function providerFromBaseUrl(baseUrl: string): string;
|
|
258
|
+
|
|
259
|
+
/**
|
|
260
|
+
* Run-completion integrity check — at end of run, verify the expected event
|
|
261
|
+
* types were actually captured. The point is the launch-review failure mode:
|
|
262
|
+
* a run *appears* successful but the raw provider events were never written,
|
|
263
|
+
* so a downstream reviewer can't reconstruct what happened.
|
|
264
|
+
*
|
|
265
|
+
* Pattern:
|
|
266
|
+
*
|
|
267
|
+
* const report = await assertRunCaptured(store, runId, {
|
|
268
|
+
* llmSpansMin: 1,
|
|
269
|
+
* judgeSpansMin: 1,
|
|
270
|
+
* rawSink: providerSink, // must have ≥ 1 event for this run
|
|
271
|
+
* requireRawCoverageOfLlmSpans: true, // every llm span has matching raw events
|
|
272
|
+
* })
|
|
273
|
+
* if (!report.ok) throwIfRunIncomplete(report) // or mark run failed and continue
|
|
274
|
+
*
|
|
275
|
+
* The function is read-only on the store and returns a structured report;
|
|
276
|
+
* the caller chooses the failure mode (throw, mark run failed, log warning).
|
|
277
|
+
* `throwIfRunIncomplete` is the convenient strict mode.
|
|
278
|
+
*/
|
|
279
|
+
|
|
280
|
+
interface RunIntegrityExpectations {
|
|
281
|
+
/** Minimum LLM span count. Default 0 (no requirement). */
|
|
282
|
+
llmSpansMin?: number;
|
|
283
|
+
/** Minimum judge span count. Default 0. */
|
|
284
|
+
judgeSpansMin?: number;
|
|
285
|
+
/** Minimum tool span count. Default 0. */
|
|
286
|
+
toolSpansMin?: number;
|
|
287
|
+
/**
|
|
288
|
+
* Raw provider sink to consult for capture verification. When present,
|
|
289
|
+
* the check requires at least one raw event for the run.
|
|
290
|
+
*/
|
|
291
|
+
rawSink?: RawProviderSink;
|
|
292
|
+
/** Minimum raw provider event count. Default 0; ignored when `rawSink` absent. */
|
|
293
|
+
rawProviderEventsMin?: number;
|
|
294
|
+
/**
|
|
295
|
+
* Every LLM span must have at least one matching raw `request` event
|
|
296
|
+
* (matched by spanId). Catches the common bug where the structured span
|
|
297
|
+
* was emitted but the raw HTTP capture was wired to a different sink.
|
|
298
|
+
*/
|
|
299
|
+
requireRawCoverageOfLlmSpans?: boolean;
|
|
300
|
+
/** Run outcome must be set (not null/undefined). Default false. */
|
|
301
|
+
requireOutcome?: boolean;
|
|
302
|
+
}
|
|
303
|
+
type RunIntegrityIssueCode = 'no_run' | 'missing_llm_spans' | 'missing_judge_spans' | 'missing_tool_spans' | 'missing_raw_events' | 'no_raw_sink' | 'orphan_llm_span' | 'missing_outcome';
|
|
304
|
+
interface RunIntegrityIssue {
|
|
305
|
+
code: RunIntegrityIssueCode;
|
|
306
|
+
message: string;
|
|
307
|
+
detail?: Record<string, unknown>;
|
|
308
|
+
}
|
|
309
|
+
interface RunIntegrityReport {
|
|
310
|
+
ok: boolean;
|
|
311
|
+
runId: string;
|
|
312
|
+
llmSpanCount: number;
|
|
313
|
+
judgeSpanCount: number;
|
|
314
|
+
toolSpanCount: number;
|
|
315
|
+
rawProviderEventCount: number;
|
|
316
|
+
/**
|
|
317
|
+
* Coverage of LLM spans by raw provider events keyed on spanId.
|
|
318
|
+
* `total` is the number of LLM spans; `covered` is the count with at
|
|
319
|
+
* least one matching `request` raw event.
|
|
320
|
+
*/
|
|
321
|
+
rawSpanCoverage: {
|
|
322
|
+
covered: number;
|
|
323
|
+
total: number;
|
|
324
|
+
};
|
|
325
|
+
issues: RunIntegrityIssue[];
|
|
326
|
+
}
|
|
327
|
+
declare class RunIntegrityError extends Error {
|
|
328
|
+
readonly report: RunIntegrityReport;
|
|
329
|
+
constructor(report: RunIntegrityReport);
|
|
330
|
+
}
|
|
331
|
+
declare function assertRunCaptured(store: TraceStore, runId: string, expectations?: RunIntegrityExpectations): Promise<RunIntegrityReport>;
|
|
332
|
+
/** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
|
|
333
|
+
declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
|
|
334
|
+
|
|
335
|
+
/**
|
|
336
|
+
* Shared types for the trace-analyst module.
|
|
337
|
+
*
|
|
338
|
+
* Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
|
|
339
|
+
* line per span, OTLP-shaped. We do NOT depend on a specific tracing
|
|
340
|
+
* vendor at the type level. Adapter
|
|
341
|
+
* layers map upstream shapes onto this interface.
|
|
342
|
+
*
|
|
343
|
+
* Design constraint. Every read operation that can return arbitrary
|
|
344
|
+
* payload must carry a byte budget so the agent's tool result stays
|
|
345
|
+
* bounded regardless of input trace size. Oversized responses
|
|
346
|
+
* substitute a deterministic summary instead of bytes — see
|
|
347
|
+
* `ViewTraceOversized`.
|
|
348
|
+
*/
|
|
349
|
+
/** OTLP span kind (subset we actually use). */
|
|
350
|
+
type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
|
|
351
|
+
type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
|
|
352
|
+
/** Subset of OTLP span fields the analyst exposes to the agent. The
|
|
353
|
+
* store's job is to project upstream's full span shape down to this
|
|
354
|
+
* view — the analyst never sees vendor extensions directly. */
|
|
355
|
+
interface TraceAnalystSpan {
|
|
356
|
+
trace_id: string;
|
|
357
|
+
span_id: string;
|
|
358
|
+
parent_span_id: string | null;
|
|
359
|
+
name: string;
|
|
360
|
+
kind: TraceAnalystSpanKind;
|
|
361
|
+
start_time: string;
|
|
362
|
+
end_time: string;
|
|
363
|
+
duration_ms: number;
|
|
364
|
+
status: TraceAnalystSpanStatus;
|
|
365
|
+
status_message?: string;
|
|
366
|
+
service_name: string | null;
|
|
367
|
+
agent_name: string | null;
|
|
368
|
+
model_name: string | null;
|
|
369
|
+
tool_name: string | null;
|
|
370
|
+
/** Raw JSON-serialisable attribute map. May contain large strings;
|
|
371
|
+
* callers must respect the per-attribute byte cap. */
|
|
372
|
+
attributes: Record<string, unknown>;
|
|
373
|
+
}
|
|
374
|
+
interface TraceAnalystTraceSummary {
|
|
375
|
+
trace_id: string;
|
|
376
|
+
service_name: string | null;
|
|
377
|
+
agent_name: string | null;
|
|
378
|
+
span_count: number;
|
|
379
|
+
has_errors: boolean;
|
|
380
|
+
start_time: string;
|
|
381
|
+
end_time: string;
|
|
382
|
+
duration_ms: number;
|
|
383
|
+
raw_jsonl_bytes: number;
|
|
384
|
+
models: string[];
|
|
385
|
+
tools: string[];
|
|
386
|
+
}
|
|
387
|
+
interface TraceAnalystFilters {
|
|
388
|
+
/** Restrict to traces that contain at least one error span. */
|
|
389
|
+
has_errors?: boolean;
|
|
390
|
+
/** Match if any span's `service.name` is in this list. */
|
|
391
|
+
service_names?: string[];
|
|
392
|
+
/** Match if any span's `agent.name` is in this list. */
|
|
393
|
+
agent_names?: string[];
|
|
394
|
+
/** Match if any LLM span's `llm.model_name` is in this list. */
|
|
395
|
+
model_names?: string[];
|
|
396
|
+
/** Match if any tool span's `tool.name` is in this list. */
|
|
397
|
+
tool_names?: string[];
|
|
398
|
+
/** ISO-8601 lower bound on the trace's earliest start time. */
|
|
399
|
+
start_time_after?: string;
|
|
400
|
+
/** ISO-8601 upper bound on the trace's earliest start time. */
|
|
401
|
+
start_time_before?: string;
|
|
402
|
+
/** Single regex applied to raw JSONL bytes for the trace. Opt-in;
|
|
403
|
+
* expensive on large datasets. Use the indexed filters above first. */
|
|
404
|
+
regex_pattern?: string;
|
|
405
|
+
}
|
|
406
|
+
interface DatasetOverview {
|
|
407
|
+
total_traces: number;
|
|
408
|
+
raw_jsonl_bytes: number;
|
|
409
|
+
services: string[];
|
|
410
|
+
agents: string[];
|
|
411
|
+
models: string[];
|
|
412
|
+
tool_names: string[];
|
|
413
|
+
/** Up to 20 real trace ids the agent may pass to view/search tools. */
|
|
414
|
+
sample_trace_ids: string[];
|
|
415
|
+
errors: {
|
|
416
|
+
trace_count: number;
|
|
417
|
+
span_count: number;
|
|
418
|
+
};
|
|
419
|
+
time_range: {
|
|
420
|
+
earliest: string;
|
|
421
|
+
latest: string;
|
|
422
|
+
} | null;
|
|
423
|
+
}
|
|
424
|
+
interface QueryTracesPage {
|
|
425
|
+
traces: TraceAnalystTraceSummary[];
|
|
426
|
+
total: number;
|
|
427
|
+
has_more: boolean;
|
|
428
|
+
}
|
|
429
|
+
/** Full-trace view. When the response would exceed the per-call byte
|
|
430
|
+
* budget, `oversized` is populated INSTEAD of `spans` so the agent
|
|
431
|
+
* knows to switch to `searchTrace` / `viewSpans`. */
|
|
432
|
+
interface ViewTraceResult {
|
|
433
|
+
trace_id: string;
|
|
434
|
+
spans?: TraceAnalystSpan[];
|
|
435
|
+
oversized?: ViewTraceOversized;
|
|
436
|
+
}
|
|
437
|
+
interface ViewTraceOversized {
|
|
438
|
+
span_count: number;
|
|
439
|
+
/** Names with their counts, sorted desc. Capped at 20 entries. */
|
|
440
|
+
top_span_names: Array<[string, number]>;
|
|
441
|
+
/** Largest single span body (bytes after attribute-cap projection). */
|
|
442
|
+
span_response_bytes_max: number;
|
|
443
|
+
error_span_count: number;
|
|
444
|
+
}
|
|
445
|
+
interface ViewSpansResult {
|
|
446
|
+
trace_id: string;
|
|
447
|
+
spans: TraceAnalystSpan[];
|
|
448
|
+
/** Requested span ids that were not found in the trace. */
|
|
449
|
+
missing_span_ids: string[];
|
|
450
|
+
/** Number of attribute fields truncated to fit the per-attribute cap. */
|
|
451
|
+
truncated_attribute_count: number;
|
|
452
|
+
}
|
|
453
|
+
interface SpanMatchRecord {
|
|
454
|
+
trace_id: string;
|
|
455
|
+
span_id: string;
|
|
456
|
+
span_name: string;
|
|
457
|
+
span_kind: TraceAnalystSpanKind;
|
|
458
|
+
/** JSON pointer-style path to the matched value, e.g.
|
|
459
|
+
* `attributes."llm.input_messages"[2].content`. */
|
|
460
|
+
attribute_path: string;
|
|
461
|
+
matched_text: string;
|
|
462
|
+
context_before: string;
|
|
463
|
+
context_after: string;
|
|
464
|
+
match_offset: number;
|
|
465
|
+
}
|
|
466
|
+
interface SearchTraceResult {
|
|
467
|
+
trace_id: string;
|
|
468
|
+
hits: SpanMatchRecord[];
|
|
469
|
+
total_matches: number;
|
|
470
|
+
has_more: boolean;
|
|
471
|
+
}
|
|
472
|
+
interface SearchSpanResult {
|
|
473
|
+
trace_id: string;
|
|
474
|
+
span_id: string;
|
|
475
|
+
hits: SpanMatchRecord[];
|
|
476
|
+
total_matches: number;
|
|
477
|
+
has_more: boolean;
|
|
478
|
+
}
|
|
479
|
+
/** Tunable byte budgets for bounded RLM tool output. */
|
|
480
|
+
interface TraceAnalystByteBudgets {
|
|
481
|
+
/** Max bytes any single tool response may emit. Hard ceiling enforced
|
|
482
|
+
* by the store; oversized → summary. Default 150_000. */
|
|
483
|
+
perCallByteCeiling: number;
|
|
484
|
+
/** Per-attribute string truncation cap on `viewTrace` (discovery scan).
|
|
485
|
+
* Default 4096. */
|
|
486
|
+
perAttributeViewBudget: number;
|
|
487
|
+
/** Per-attribute string truncation cap on `viewSpans` (surgical reads).
|
|
488
|
+
* Default 16384. */
|
|
489
|
+
perAttributeSpanBudget: number;
|
|
490
|
+
/** Per-attribute cap on a single match record's `matched_text` and
|
|
491
|
+
* context window. Default 1024. */
|
|
492
|
+
perMatchTextBudget: number;
|
|
493
|
+
}
|
|
494
|
+
declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
|
|
495
|
+
/** Marker substituted in place of truncated string payloads. Callers
|
|
496
|
+
* parsing tool output can detect it deterministically. */
|
|
497
|
+
declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
|
|
498
|
+
|
|
499
|
+
/**
|
|
500
|
+
* `TraceAnalysisStore` — read-side interface the trace-analyst calls
|
|
501
|
+
* through. Seven operations, all bounded:
|
|
502
|
+
*
|
|
503
|
+
* - `getOverview(filters?)` — dataset rollup + sample trace ids.
|
|
504
|
+
* - `queryTraces(filters?, limit, offset)` — paginated summaries.
|
|
505
|
+
* - `countTraces(filters?)` — cheap count without materialisation.
|
|
506
|
+
* - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
|
|
507
|
+
* - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
|
|
508
|
+
* - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
|
|
509
|
+
* - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
|
|
510
|
+
*
|
|
511
|
+
* The core ships an implementation (`OtlpFileTraceStore`).
|
|
512
|
+
* Downstream callers can supply their own — e.g. a DuckDB-backed
|
|
513
|
+
* adapter or an in-memory adapter for tests — by implementing this
|
|
514
|
+
* interface.
|
|
515
|
+
*
|
|
516
|
+
* Filters compose with AND semantics. Empty/undefined fields impose
|
|
517
|
+
* no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
|
|
518
|
+
* implementations may skip it via `count`/`overview` when not set.
|
|
519
|
+
*/
|
|
520
|
+
|
|
521
|
+
interface TraceAnalysisStore {
|
|
522
|
+
getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
|
|
523
|
+
queryTraces(opts: {
|
|
524
|
+
filters?: TraceAnalystFilters;
|
|
525
|
+
limit: number;
|
|
526
|
+
offset?: number;
|
|
527
|
+
}): Promise<QueryTracesPage>;
|
|
528
|
+
countTraces(filters?: TraceAnalystFilters): Promise<number>;
|
|
529
|
+
viewTrace(opts: {
|
|
530
|
+
trace_id: string;
|
|
531
|
+
/** Override per-attribute byte cap. Defaults to discovery budget. */
|
|
532
|
+
per_attribute_byte_cap?: number;
|
|
533
|
+
}): Promise<ViewTraceResult>;
|
|
534
|
+
viewSpans(opts: {
|
|
535
|
+
trace_id: string;
|
|
536
|
+
span_ids: readonly string[];
|
|
537
|
+
/** Override per-attribute byte cap. Defaults to surgical budget. */
|
|
538
|
+
per_attribute_byte_cap?: number;
|
|
539
|
+
}): Promise<ViewSpansResult>;
|
|
540
|
+
searchTrace(opts: {
|
|
541
|
+
trace_id: string;
|
|
542
|
+
regex_pattern: string;
|
|
543
|
+
/** Hard cap on matches returned. Default 50. */
|
|
544
|
+
max_matches?: number;
|
|
545
|
+
}): Promise<SearchTraceResult>;
|
|
546
|
+
searchSpan(opts: {
|
|
547
|
+
trace_id: string;
|
|
548
|
+
span_id: string;
|
|
549
|
+
regex_pattern: string;
|
|
550
|
+
max_matches?: number;
|
|
551
|
+
}): Promise<SearchSpanResult>;
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
interface AnalyzeTracesInput {
|
|
555
|
+
/** The user-facing question. Domain framing belongs here, not in the
|
|
556
|
+
* actor description. */
|
|
557
|
+
question: string;
|
|
558
|
+
}
|
|
559
|
+
interface AnalyzeTracesResult {
|
|
560
|
+
/** The responder's prose answer. */
|
|
561
|
+
answer: string;
|
|
562
|
+
/** Bulleted findings extracted from the responder's structured output. */
|
|
563
|
+
findings: string[];
|
|
564
|
+
/** Per-actor-turn snapshots captured via `actorTurnCallback`. */
|
|
565
|
+
turns: AnalyzeTracesTurnSnapshot[];
|
|
566
|
+
/** Total turns the actor took. */
|
|
567
|
+
turnCount: number;
|
|
568
|
+
/** Token usage by role. */
|
|
569
|
+
usage: TraceAnalystUsage;
|
|
570
|
+
/** Full system + assistant + tool message log by role. */
|
|
571
|
+
chatLog: TraceAnalystChatLog;
|
|
572
|
+
/** Prompt version that produced this run. */
|
|
573
|
+
actorPromptVersion: string;
|
|
574
|
+
}
|
|
575
|
+
interface TraceAnalystUsage {
|
|
576
|
+
actor: TraceAnalystUsageEntry[];
|
|
577
|
+
responder: TraceAnalystUsageEntry[];
|
|
578
|
+
}
|
|
579
|
+
interface TraceAnalystUsageEntry {
|
|
580
|
+
[key: string]: unknown;
|
|
581
|
+
}
|
|
582
|
+
interface TraceAnalystChatLog {
|
|
583
|
+
actor: TraceAnalystChatMessage[];
|
|
584
|
+
responder: TraceAnalystChatMessage[];
|
|
585
|
+
}
|
|
586
|
+
interface TraceAnalystChatMessage {
|
|
587
|
+
[key: string]: unknown;
|
|
588
|
+
}
|
|
589
|
+
interface AnalyzeTracesTurnSnapshot {
|
|
590
|
+
turn: number;
|
|
591
|
+
isError: boolean;
|
|
592
|
+
/** The JS code the actor produced for this turn. */
|
|
593
|
+
code: string;
|
|
594
|
+
/** The formatted action-log entry the actor sees on the next turn. */
|
|
595
|
+
output: string;
|
|
596
|
+
/** Provider thought (when `actorOptions.showThoughts` is true and the
|
|
597
|
+
* provider returns it). */
|
|
598
|
+
thought?: string;
|
|
599
|
+
}
|
|
600
|
+
/** Configuration accepted by `analyzeTraces`. */
interface AnalyzeTracesOptions {
  /** Where traces come from: an OTLP-JSONL file path or a custom store. */
  source: string | TraceAnalysisStore;
  /** AxAIService instance supplied by the caller. */
  ai: AxAIService;
  /** Model id passed through to both the actor and the responder. */
  model?: string;
  /** Recursion depth; 0 disables sub-agent dispatch. Defaults to 1. */
  maxDepth?: number;
  /** Cap on actor turns. Defaults to 12. */
  maxTurns?: number;
  /** Cap on parallel sub-agent calls in a batched llmQuery. Defaults to 2. */
  maxParallelSubagents?: number;
  /** Replacement for the actor description. */
  actorDescription?: string;
  /** Replacement for the subagent description. */
  subagentDescription?: string;
  /** Observability hook invoked per turn; may return a promise. */
  onTurn?: (turn: AnalyzeTracesTurnSnapshot) => void | Promise<void>;
  /** Cap on runtime characters per turn. Defaults to 6000. */
  maxRuntimeChars?: number;
  /**
   * Path of a JSONL file that receives every turn's snapshot as soon as it
   * is produced. Should the analyst crash mid-loop (provider 503, network
   * error, validator reject), the partial reasoning is already on disk and
   * can be replayed with the responder afterward to recover evidence.
   */
  progressLogPath?: string;
}
|
|
628
|
+
/**
 * Runs the trace analyst and resolves with its result.
 *
 * @param input - Carries the question to answer.
 * @param options - Trace source, AI service, and optional tuning knobs.
 * @returns A promise for the analyst's result.
 * @throws TraceFileMissingError when `source` is a path that doesn't exist.
 * @throws AxAgentClarificationError when the analyst asks for clarification.
 * @throws Provider errors (auth, rate limits), which propagate from the AI service.
 */
declare function analyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions): Promise<AnalyzeTracesResult>;
|
|
637
|
+
|
|
638
|
+
/**
|
|
639
|
+
* `OtlpFileTraceStore` — read-only OTLP-JSONL trace store for the
|
|
640
|
+
* trace-analyst.
|
|
641
|
+
*
|
|
642
|
+
* Wire shape. Each line of the input file is one OTLP-shaped span. The
|
|
643
|
+
* store understands flattened OTLP JSONL plus the OpenInference vocab.
|
|
644
|
+
* We project upstream's full
|
|
645
|
+
* span shape down to `TraceAnalystSpan` lazily — full materialisation
|
|
646
|
+
* only happens for the spans the agent actually requests.
|
|
647
|
+
*
|
|
648
|
+
* Indexing. On first read the store builds an in-memory index keyed
|
|
649
|
+
* by `trace_id` carrying:
|
|
650
|
+
* - byte offsets + lengths for each span line (for surgical reads
|
|
651
|
+
* without re-parsing the whole file)
|
|
652
|
+
* - a `TraceAnalystTraceSummary` rollup
|
|
653
|
+
* - sets of services / agents / models / tools / has_errors
|
|
654
|
+
* - byte size of the trace's JSONL slab
|
|
655
|
+
*
|
|
656
|
+
* Memory bound. The index keeps span metadata only — names, kinds,
|
|
657
|
+
* offsets, status. Attribute payloads stay on disk until requested.
|
|
658
|
+
* For a 50MB JSONL with 50k spans, the index is ~5MB.
|
|
659
|
+
*
|
|
660
|
+
* Concurrency. The store builds the index once on first read and
|
|
661
|
+
* caches it. Subsequent reads reuse the index. The file is opened on
|
|
662
|
+
* each read; we never hold a long-lived FD.
|
|
663
|
+
*/
|
|
664
|
+
|
|
665
|
+
/** Constructor options for `OtlpFileTraceStore`. */
interface OtlpFileTraceStoreOptions {
  /** Location of the OTLP-JSONL file. */
  path: string;
  /** Replacement for the discovery (`viewTrace`) per-attribute byte cap. */
  perAttributeViewBudget?: number;
  /** Replacement for the surgical (`viewSpans`) per-attribute byte cap. */
  perAttributeSpanBudget?: number;
  /** Replacement for the per-call ceiling that triggers oversized summaries. */
  perCallByteCeiling?: number;
  /** Replacement for the per-match text budget. */
  perMatchTextBudget?: number;
}
|
|
677
|
+
/**
 * Read-only trace store over an OTLP-JSONL file, implementing the
 * `TraceAnalysisStore` contract used by the trace analyst.
 */
declare class OtlpFileTraceStore implements TraceAnalysisStore {
  private readonly path;
  private readonly perAttributeViewBudget;
  private readonly perAttributeSpanBudget;
  private readonly perCallByteCeiling;
  private readonly perMatchTextBudget;
  private indexPromise?;
  /**
   * UTF-8 buffer of the file, cached. It is pinned once since every read
   * needs slice access, and re-reading per call inflates the syscall count.
   * NOTE(review): the file-level comment states the file is opened on each
   * read — confirm which description matches the implementation.
   */
  private bufferPromise?;
  constructor(opts: OtlpFileTraceStoreOptions);
  /** Resolves a dataset overview, optionally restricted by `filters`. */
  getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
  /** Resolves one page of traces; `limit` is required, `filters` and `offset` are optional. */
  queryTraces(opts: {
    filters?: TraceAnalystFilters;
    limit: number;
    offset?: number;
  }): Promise<QueryTracesPage>;
  /** Resolves a trace count, optionally restricted by `filters`. */
  countTraces(filters?: TraceAnalystFilters): Promise<number>;
  /** Views one trace by id, with an optional per-call `per_attribute_byte_cap`. */
  viewTrace(opts: {
    trace_id: string;
    per_attribute_byte_cap?: number;
  }): Promise<ViewTraceResult>;
  /** Views the listed spans of one trace, with an optional per-call `per_attribute_byte_cap`. */
  viewSpans(opts: {
    trace_id: string;
    span_ids: readonly string[];
    per_attribute_byte_cap?: number;
  }): Promise<ViewSpansResult>;
  /** Searches one trace with `regex_pattern`; `max_matches` is optional. */
  searchTrace(opts: {
    trace_id: string;
    regex_pattern: string;
    max_matches?: number;
  }): Promise<SearchTraceResult>;
  /** Searches one span of one trace with `regex_pattern`; `max_matches` is optional. */
  searchSpan(opts: {
    trace_id: string;
    span_id: string;
    regex_pattern: string;
    max_matches?: number;
  }): Promise<SearchSpanResult>;
  /**
   * Forces the index to materialise. Handy for paying the startup cost
   * ahead of the first agent call.
   */
  ensureIndexed(): Promise<void>;
  private buffer;
  private index;
  private buildIndex;
  private matchedTraces;
  private toSummary;
  private projectSpan;
  private buildOversizedSummary;
  private scanSpanForMatches;
}
|
|
728
|
+
/**
 * Error constructed from a file path. The `analyzeTraces` docs list it as
 * thrown when `source` is a path that doesn't exist.
 */
declare class TraceFileMissingError extends Error {
  constructor(path: string);
}
|
|
731
|
+
/**
 * Error exposing a read-only `trace_id`.
 * NOTE(review): presumably the id passed to the constructor — confirm.
 */
declare class TraceNotFoundError extends Error {
  readonly trace_id: string;
  constructor(trace_id: string);
}
|
|
735
|
+
/**
 * Error exposing read-only `trace_id` and `span_id`.
 * NOTE(review): presumably the ids passed to the constructor — confirm.
 */
declare class SpanNotFoundError extends Error {
  readonly trace_id: string;
  readonly span_id: string;
  constructor(trace_id: string, span_id: string);
}
|
|
740
|
+
|
|
741
|
+
/**
|
|
742
|
+
* Trace-analyst tool surface — seven namespaced AxFunctions the analyst
|
|
743
|
+
* agent calls from generated JS code via `traces.<name>(...)`.
|
|
744
|
+
*
|
|
745
|
+
* Discovery → narrow → deep-read protocol. Tool names + ordering
|
|
746
|
+
* support RLM discovery:
|
|
747
|
+
*
|
|
748
|
+
* 1. `getDatasetOverview` (cheap) — first call, sizes the dataset
|
|
749
|
+
* 2. `queryTraces` — paginated summaries with `raw_jsonl_bytes`
|
|
750
|
+
* 3. `countTraces` — cheap pre-flight before regex
|
|
751
|
+
* 4. `viewTrace` — full span list, oversized → summary
|
|
752
|
+
* 5. `viewSpans` — surgical 16KB-cap reads
|
|
753
|
+
* 6. `searchTrace` / `searchSpan` — bounded regex hits
|
|
754
|
+
*
|
|
755
|
+
* Failure mode. Tool handlers throw on bad input (invalid trace ids,
|
|
756
|
+
* out-of-range pagination, malformed regex). Ax converts thrown errors
|
|
757
|
+
* into actor-visible `[ERROR]` strings so the analyst can adjust on
|
|
758
|
+
* the next turn instead of looping.
|
|
759
|
+
*/
|
|
760
|
+
|
|
761
|
+
/** Options shared by `buildTraceAnalystTools` and `traceAnalystFunctionGroup`. */
interface BuildTraceAnalystToolsOpts {
  /** Store the tools are built over. */
  store: TraceAnalysisStore;
  /** Replacement for the default sample-trace-id slot count (20); mainly for tests. */
  sampleTraceLimit?: number;
}
|
|
766
|
+
/**
 * Builds the trace-analyst function set.
 *
 * @param opts - Store to build over, plus an optional sample-trace limit.
 * @returns The functions; hand the array to `agent(...).functions.local`.
 */
declare function buildTraceAnalystTools(opts: BuildTraceAnalystToolsOpts): AxFunction[];
|
|
771
|
+
/**
 * Convenience wrapper taking the same input as `buildTraceAnalystTools`.
 * It returns the grouped form expected when trace tools are registered
 * alongside other agent function modules.
 */
declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
  namespace: string;
  title: string;
  selectionCriteria: string;
  description: string;
  functions: AxFunction[];
};
|
|
782
|
+
|
|
783
|
+
/**
|
|
784
|
+
* Trace-analyst auto-execution hook.
|
|
785
|
+
*
|
|
786
|
+
* Wires `analyzeTraces` into a `TraceEmitter`'s `onRunComplete` so a
|
|
787
|
+
* direct matrix run produces an analysis artifact without an out-of-band
|
|
788
|
+
* step. Designed for the case where the consumer reports "the analyst
|
|
789
|
+
* never ran" — the cause is almost always orchestration, not the analyst.
|
|
790
|
+
*
|
|
791
|
+
* Usage:
|
|
792
|
+
*
|
|
793
|
+
* const emitter = new TraceEmitter(store, {
|
|
794
|
+
* onRunComplete: [traceAnalystOnRunComplete({ analyze: opts, save })],
|
|
795
|
+
* })
|
|
796
|
+
*
|
|
797
|
+
* Hooks are best-effort by default — they never crash the underlying run.
|
|
798
|
+
* The caller decides whether to gate the run on the analysis result via
|
|
799
|
+
* the `gateOn` callback.
|
|
800
|
+
*/
|
|
801
|
+
|
|
802
|
+
/** Options for `traceAnalystOnRunComplete`. */
interface TraceAnalystHookOptions {
  /**
   * `analyzeTraces` options passed through by the hook; `source` is
   * optional here. When no question is given the hook provides one,
   * defaulting to a launch-grade prompt that asks for failure modes,
   * surprising findings, and a recommendation.
   */
  analyze: Omit<AnalyzeTracesOptions, 'source'> & {
    source?: AnalyzeTracesOptions['source'];
  };
  /**
   * Replacement question. The built-in default is deliberately generic:
   * "Summarise what happened in this run, surface any failure modes,
   * surprising findings, or evidence the verdict is wrong."
   */
  question?: string;
  /**
   * Persistence callback. The hook invokes it with the analysis output and
   * the run context. Typical implementations write to a TraceAnalysisStore
   * or append to a per-run JSONL.
   */
  save?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => Promise<void>;
  /**
   * Per-run predicate deciding whether the hook executes. By default it
   * runs for every completed run. Useful for skipping aborted runs, debug
   * runs, or runs without LLM activity.
   */
  shouldRun?: (ctx: RunCompleteHookContext) => boolean;
  /**
   * Optional gate. When provided and it returns false, the hook records
   * the failure as a log event on the run rather than staying quiet, so
   * the caller can trigger downstream alerts off `analyst_gate_failed`
   * log events.
   */
  gateOn?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => boolean;
}
|
|
835
|
+
/**
 * Creates a `RunCompleteHook` from the given options. Per the file-level
 * usage notes, the result is meant to be listed in a `TraceEmitter`'s
 * `onRunComplete` so a run produces an analysis artifact via `analyzeTraces`.
 */
declare function traceAnalystOnRunComplete(opts: TraceAnalystHookOptions): RunCompleteHook;
|
|
836
|
+
|
|
837
|
+
/**
 * Ax RLM prompt for bounded trace discovery and evidence-backed analysis.
 * The text tells the actor to follow a discovery → narrow → deep-read
 * protocol over the `traces` tools, and spells out observability rules,
 * the `final()` payload contract, and the output contract.
 * NOTE(review): presumably the default that
 * `AnalyzeTracesOptions.actorDescription` overrides — confirm.
 */
|
|
838
|
+
declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. 
ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. 
Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n ```js\n await final(\"Format the per-item verdict report from the evidence below.\", {\n findings: [\n { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n ...all items\n ],\n systemic_summary: '3 sentences I wrote based on the evidence above'\n });\n ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. `spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. 
read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
|
|
839
|
+
/**
 * Version tag for the actor description prompt.
 * NOTE(review): presumably the value reported as
 * `AnalyzeTracesResult.actorPromptVersion` — confirm.
 */
declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
|
|
840
|
+
/**
 * Subagent prompt for focused trace-inspection subtasks. The text asks the
 * subagent to stay tightly scoped, answer concisely, and cite trace and
 * span ids for every claim.
 * NOTE(review): presumably the default that
 * `AnalyzeTracesOptions.subagentDescription` overrides — confirm.
 */
|
|
841
|
+
declare const TRACE_ANALYST_SUBAGENT_DESCRIPTION = "You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.\n\nCite trace ids and span ids for every claim. Do NOT invent ids.";
|
|
842
|
+
|
|
843
|
+
/**
 * One task entry of a `TraceInsightSuite` (see `TraceInsightSuite.tasks`).
 * Only `id` and `name` are required; every other field is optional.
 */
interface TraceInsightTask {
  id: string;
  name: string;
  prompt?: string;
  difficulty?: string;
  tags?: string[];
  outcome?: string;
  score?: number;
  gaps?: string[];
}
|
|
853
|
+
/** A named suite: a list of tasks plus an optional collection id. */
interface TraceInsightSuite {
  name: string;
  collectionId?: string;
  tasks: TraceInsightTask[];
}
|
|
858
|
+
/**
 * A finding with a `kind` and a list of task ids.
 * NOTE(review): `taskIds` presumably refer to `TraceInsightTask.id` — confirm.
 */
interface TraceInsightFinding {
  kind: string;
  severity?: string;
  taskIds: string[];
  evidence?: string;
  proposedFixClass?: string;
}
|
|
865
|
+
/** A planned question, as returned by `planTraceInsightQuestions`. */
interface TraceInsightQuestion {
  id: string;
  question: string;
  why: string;
}
|
|
870
|
+
/** A panel role, as returned by `defaultTraceInsightPanel`. */
interface TraceInsightPanelRole {
  id: string;
  name: string;
  responsibility: string;
}
|
|
875
|
+
/**
 * Input accepted by `planTraceInsightQuestions`, `buildTraceInsightContext`,
 * and `buildTraceInsightPrompt`. Only `suite` is required.
 */
interface TraceInsightPromptInput {
  suite: TraceInsightSuite;
  findings?: TraceInsightFinding[];
  agent?: Record<string, unknown>;
  totals?: Record<string, unknown>;
  maxRepresentativeTraces?: number;
}
|
|
882
|
+
/**
 * Context returned by `buildTraceInsightContext` and consumed by
 * `scoreTraceInsightReadiness`. Every field is required here; `agent` and
 * `totals` are nullable instead of optional.
 */
interface TraceInsightContext {
  suite: TraceInsightSuite;
  scope: string;
  keywords: string[];
  questions: TraceInsightQuestion[];
  panel: TraceInsightPanelRole[];
  findings: TraceInsightFinding[];
  agent: Record<string, unknown> | null;
  totals: Record<string, unknown> | null;
}
|
|
892
|
+
/** One quality gate listed in `TraceInsightReadiness.gates`. */
interface TraceInsightQualityGate {
  id: string;
  label: string;
  /** Pass/fail flag for this gate. */
  passed: boolean;
  /** One of four fixed severity levels. */
  severity: 'critical' | 'high' | 'medium' | 'low';
  detail: string;
}
|
|
899
|
+
/** Readiness verdict returned by `scoreTraceInsightReadiness`. */
interface TraceInsightReadiness {
  /** Numeric score. NOTE(review): range and scale are not visible in this declaration. */
  score: number;
  /** One of three fixed grade labels. */
  grade: 'external-ready' | 'internal-review' | 'raw-analysis';
  /** Gates reported alongside the score. */
  gates: TraceInsightQualityGate[];
}
|
|
904
|
+
/**
 * Turns a string into a list of strings.
 * NOTE(review): presumably splits `value` into domain word tokens; the
 * exact rules live in the implementation — confirm.
 */
declare function tokenizeDomainWords(value: string): string[];
|
|
905
|
+
/**
 * Derives a list of keyword strings from a suite.
 * NOTE(review): presumably the source of `TraceInsightContext.keywords` — confirm.
 */
declare function inferDomainKeywords(suite: TraceInsightSuite): string[];
|
|
906
|
+
/**
 * Builds a `RegExp` from a keyword list.
 * NOTE(review): flags, escaping, and empty-list behaviour are not visible
 * in this declaration — confirm before relying on them.
 */
declare function domainEvidencePattern(keywords: string[]): RegExp;
|
|
907
|
+
/**
 * Produces a string describing the scope of a suite.
 * NOTE(review): presumably the source of `TraceInsightContext.scope` — confirm.
 */
declare function describeTraceInsightScope(suite: TraceInsightSuite): string;
|
|
908
|
+
/** Returns the list of questions planned for the given prompt input. */
declare function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[];
|
|
909
|
+
/** Builds a `TraceInsightContext` from the given prompt input. */
declare function buildTraceInsightContext(input: TraceInsightPromptInput): TraceInsightContext;
|
|
910
|
+
/** Scores a context and returns its readiness: a score, a grade, and the gates. */
declare function scoreTraceInsightReadiness(context: TraceInsightContext): TraceInsightReadiness;
|
|
911
|
+
/** Returns the default list of panel roles; takes no arguments. */
declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
|
|
912
|
+
/** Builds the trace-insight prompt string from the given prompt input. */
declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
|
|
913
|
+
|
|
914
|
+
export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, FileSystemRawProviderSink, type FileSystemRawProviderSinkOptions, InMemoryRawProviderSink, type InMemoryRawProviderSinkOptions, JudgeSpan, LlmSpan, NoopRawProviderSink, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type ProviderRedactor, type QueryTracesPage, REDACTION_VERSION, type RawProviderDirection, type RawProviderEvent, type RawProviderSink, type RawProviderSinkFilter, type RedactionReport, type RedactionRule, Run, RunCompleteHook, RunCompleteHookContext, RunIntegrityError, type RunIntegrityExpectations, type RunIntegrityIssue, type RunIntegrityIssueCode, type RunIntegrityReport, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, assertRunCaptured, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultProviderRedactor, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, 
inferDomainKeywords, judgeSpans, llmSpans, planTraceInsightQuestions, providerFromBaseUrl, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, throwIfRunIncomplete, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
|