@tangle-network/agent-eval 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +102 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +35 -2
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-4W4NCYM2.js +0 -1945
  129. package/dist/chunk-4W4NCYM2.js.map +0 -1
  130. package/dist/chunk-5IIQKMD5.js.map +0 -1
  131. package/dist/chunk-6KQG5HAH.js.map +0 -1
  132. package/dist/chunk-6M774GY6.js.map +0 -1
  133. package/dist/chunk-7EAUOUQS.js.map +0 -1
  134. package/dist/chunk-AXHNWLIX.js.map +0 -1
  135. package/dist/chunk-EXGR4XEM.js.map +0 -1
  136. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  137. package/dist/chunk-KAO3Q65R.js.map +0 -1
  138. package/dist/chunk-LZKIOBG2.js +0 -2026
  139. package/dist/chunk-LZKIOBG2.js.map +0 -1
  140. package/dist/chunk-QBW3YBTR.js.map +0 -1
  141. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  142. package/dist/chunk-SQQLHODJ.js.map +0 -1
  143. package/dist/chunk-V5QSWN7L.js +0 -1310
  144. package/dist/chunk-V5QSWN7L.js.map +0 -1
  145. package/dist/chunk-VQQSPGSM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
package/dist/traces.d.ts CHANGED
@@ -1,256 +1,11 @@
1
- import { T as TraceStore, L as LlmSpan, J as JudgeSpan, a as Run, F as FailureClass, c as ToolSpan } from './store-u47QaJ9G.js';
2
- export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, h as RunFilter, m as RunLayer, R as RunOutcome, n as RunStatus, e as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, f as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
3
- import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-B2XqDKFU.js';
4
- export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
5
- import { R as RawProviderSink, f as RawProviderEvent } from './integrity-Cr5YodSY.js';
6
- export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
1
+ export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-BL96gCEP.js';
2
+ import { a as RunCompleteHookContext, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
3
+ export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
4
+ export { F as FileSystemRawProviderSink, d as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, e as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, f as RawProviderDirection, c as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DK2EBVZC.js';
5
+ export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
6
+ export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, T as TraceStore, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
7
7
  import { AxAIService, AxFunction } from '@ax-llm/ax';
8
-
9
- /**
10
- * Typed query helpers over TraceStore.
11
- *
12
- * Not a full SQL engine — a minimal, composable set of operators that
13
- * cover the canned-pipeline use cases. For ad-hoc analytics, persist to
14
- * NDJSON and point DuckDB at it; the schema is stable so external SQL
15
- * tooling works out of the box.
16
- */
17
-
18
- declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
19
- declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
20
- declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
21
- declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
22
- /** Group spans by any key selector. */
23
- declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
24
- /** Hash tool arguments to an orderless-key-stable string for de-duplication. */
25
- declare function argHash(args: unknown): string;
26
- /** Sum an LLM-span array into aggregate token + cost. */
27
- declare function aggregateLlm(spans: LlmSpan[]): {
28
- inputTokens: number;
29
- outputTokens: number;
30
- cachedTokens: number;
31
- costUsd: number;
32
- };
33
- /** Pick the outcome's failure class when present, else derive 'success' from run status. */
34
- declare function runFailureClass(run: Run): FailureClass;
35
-
36
- /**
37
- * Redaction — remove PII / secrets from trace payloads before persist.
38
- *
39
- * Pre-persistence rules mean raw traces in storage are already scrubbed.
40
- * Unredacted variants (for debugging / post-mortems) live in a separate
41
- * storage layer with stricter access controls; this module only covers
42
- * the default scrub-then-persist path.
43
- *
44
- * Rules compose: pass an array of `RedactionRule`, each is applied in
45
- * order. Strings that match get replaced with a tagged sentinel so the
46
- * eval framework can count how many redactions happened per run
47
- * (surfaced via `redaction_applied` events).
48
- */
49
- interface RedactionRule {
50
- id: string;
51
- pattern: RegExp;
52
- /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
53
- replacement?: string;
54
- }
55
- interface RedactionReport {
56
- redactionCount: number;
57
- byRule: Record<string, number>;
58
- }
59
- /** OWASP / common-sense defaults — extend per-domain. */
60
- declare const DEFAULT_REDACTION_RULES: RedactionRule[];
61
- declare const REDACTION_VERSION = "1.0.0";
62
- /**
63
- * Redact a single string. Returns the new string and a per-rule count of
64
- * how many substitutions fired.
65
- */
66
- declare function redactString(input: string, rules?: RedactionRule[]): {
67
- output: string;
68
- report: RedactionReport;
69
- };
70
- /**
71
- * Walk a JSON-ish value applying `redactString` to every string leaf.
72
- * Arrays and plain objects are recursed; other types pass through
73
- * untouched. Circular references throw — traces should be tree-shaped.
74
- */
75
- declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
76
- value: unknown;
77
- report: RedactionReport;
78
- };
79
-
80
- /**
81
- * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
82
- * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
83
- *
84
- * Wire format only. We do NOT depend on the @opentelemetry SDK — that
85
- * would drag in polyfills incompatible with Workers/Edge. Consumers
86
- * push the JSON to their collector of choice via HTTP.
87
- *
88
- * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
89
- */
90
-
91
- declare const OTEL_AGENT_EVAL_SCOPE: {
92
- name: string;
93
- version: string;
94
- };
95
- interface OtlpSpan {
96
- traceId: string;
97
- spanId: string;
98
- parentSpanId?: string;
99
- name: string;
100
- kind: number;
101
- startTimeUnixNano: string;
102
- endTimeUnixNano: string;
103
- attributes: Array<{
104
- key: string;
105
- value: {
106
- stringValue?: string;
107
- intValue?: string;
108
- doubleValue?: number;
109
- boolValue?: boolean;
110
- };
111
- }>;
112
- events?: Array<{
113
- timeUnixNano: string;
114
- name: string;
115
- attributes?: OtlpSpan['attributes'];
116
- }>;
117
- status?: {
118
- code: number;
119
- message?: string;
120
- };
121
- }
122
- interface OtlpResourceSpans {
123
- resource: {
124
- attributes: OtlpSpan['attributes'];
125
- };
126
- scopeSpans: Array<{
127
- scope: typeof OTEL_AGENT_EVAL_SCOPE;
128
- spans: OtlpSpan[];
129
- }>;
130
- }
131
- interface OtlpExport {
132
- resourceSpans: OtlpResourceSpans[];
133
- }
134
- /** Export a single run's spans + events in OTLP/JSON. */
135
- declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
136
-
137
- /**
138
- * Replay-from-raw-events — turn every captured campaign run into a
139
- * re-runnable artifact.
140
- *
141
- * The premise: 0.21 made `RawProviderSink` capture every provider HTTP
142
- * envelope. 0.22's `runEvalCampaign` makes capture the default. Together
143
- * they mean every past run is a complete fingerprint of what happened on
144
- * the wire — and that fingerprint is enough to replay the run without
145
- * burning new LLM cost.
146
- *
147
- * Three use cases this primitive enables:
148
- *
149
- * 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
150
- * to last week's runs without re-calling any LLM. The cost of trying
151
- * a new rubric drops from "another full sweep" to a CPU-bound replay.
152
- * 2. **Determinism audits** — replay the same campaign and verify the
153
- * raw responses match byte-for-byte. Any drift is a non-determinism
154
- * bug (in the harness, the prompt builder, the sandbox, …).
155
- * 3. **Free judge calibration** — run two judges on identical responses
156
- * and measure inter-judge agreement without doubling LLM spend.
157
- *
158
- * The interface is deliberately fetch-shaped. Inject `createReplayFetch`
159
- * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
160
- * from the cache instead of calling the network. No new code path through
161
- * the LLM client is needed; the cache hit is invisible to the runner.
162
- */
163
-
164
- declare class ReplayCacheMissError extends Error {
165
- readonly url: string;
166
- readonly requestKey: string;
167
- constructor(url: string, requestKey: string, message?: string);
168
- }
169
- interface ReplayCacheEntry {
170
- request: RawProviderEvent;
171
- response: RawProviderEvent;
172
- }
173
- interface ReplayCacheStats {
174
- total: number;
175
- byProvider: Record<string, number>;
176
- byModel: Record<string, number>;
177
- /** Spans for which we have a request but no response (run aborted mid-call). */
178
- orphanRequests: number;
179
- }
180
- /**
181
- * In-memory deterministic cache of (request → response) keyed on a stable
182
- * hash of the request body. Built from a `RawProviderSink` containing
183
- * paired `request` and `response` events from a previous run.
184
- *
185
- * The cache is the source of truth for replay; `createReplayFetch` is a
186
- * thin wrapper that reads from it.
187
- */
188
- declare class ReplayCache {
189
- private byKey;
190
- private orphans;
191
- private byProvider;
192
- private byModel;
193
- /**
194
- * Build a cache from a sink's events. The sink must implement `list()`.
195
- * Filter by `runId` / `spanId` to scope to a specific replay.
196
- */
197
- static fromSink(sink: RawProviderSink, filter?: {
198
- runId?: string;
199
- spanId?: string;
200
- }): Promise<ReplayCache>;
201
- /** Build a cache from an in-memory event list. */
202
- static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
203
- /** Number of cacheable (request, response) pairs in the cache. */
204
- size(): number;
205
- stats(): ReplayCacheStats;
206
- /**
207
- * Look up a cached response by hashing the (model, messages, temperature,
208
- * maxTokens, response_format) shape. Returns `undefined` on miss; the
209
- * caller decides whether to throw, fall back to the network, or skip.
210
- */
211
- lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
212
- }
213
- interface ReplayFetchOptions {
214
- /**
215
- * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
216
- * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
217
- * still complete; `'fail-closed'` returns a synthetic 599 response so the
218
- * call site sees a non-retriable failure.
219
- */
220
- onMiss?: 'throw' | 'fallback' | 'fail-closed';
221
- fallbackFetch?: typeof fetch;
222
- /** Optional callback fired once per replayed call (for telemetry / counters). */
223
- onHit?: (info: {
224
- url: string;
225
- provider: string;
226
- model: string;
227
- }) => void;
228
- /** Optional callback fired on cache miss before the `onMiss` policy applies. */
229
- onMissNotify?: (info: {
230
- url: string;
231
- requestBody: unknown;
232
- }) => void;
233
- }
234
- /**
235
- * Build a `fetch`-shaped function that serves cached responses out of a
236
- * `ReplayCache` for any URL ending in `/chat/completions`. Pass through
237
- * `LlmClientOptions.fetch` and `callLlm` becomes free.
238
- *
239
- * Non-`/chat/completions` URLs are passed straight to the fallback fetch
240
- * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
241
- * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
242
- * `fetch` and shouldn't be intercepted.
243
- */
244
- declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
245
- /**
246
- * Convenience iterator over `(request, response)` pairs in a sink — for
247
- * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
248
- * runs purely in-process over cached LLM outputs.
249
- */
250
- declare function iterateRawCalls(sink: RawProviderSink, filter?: {
251
- runId?: string;
252
- spanId?: string;
253
- }): AsyncGenerator<ReplayCacheEntry>;
8
+ import { N as NotFoundError } from './errors-BZ9sTdz7.js';
254
9
 
255
10
  /**
256
11
  * Shared types for the trace-analyst module.
@@ -555,6 +310,137 @@ interface AnalyzeTracesOptions {
555
310
  */
556
311
  declare function analyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions): Promise<AnalyzeTracesResult>;
557
312
 
313
+ /**
314
+ * Trace-analyst auto-execution hook.
315
+ *
316
+ * Wires `analyzeTraces` into a `TraceEmitter`'s `onRunComplete` so a
317
+ * direct matrix run produces an analysis artifact without an out-of-band
318
+ * step. Designed for the case where the consumer reports "the analyst
319
+ * never ran" — the cause is almost always orchestration, not the analyst.
320
+ *
321
+ * Usage:
322
+ *
323
+ * const emitter = new TraceEmitter(store, {
324
+ * onRunComplete: [traceAnalystOnRunComplete({ analyze: opts, save })],
325
+ * })
326
+ *
327
+ * Hooks are best-effort by default — they never crash the underlying run.
328
+ * The caller decides whether to gate the run on the analysis result via
329
+ * the `gateOn` callback.
330
+ */
331
+
332
+ interface TraceAnalystHookOptions {
333
+ /**
334
+ * Options forwarded to `analyzeTraces`. The hook supplies the question
335
+ * if you don't pass one — defaulting to a launch-grade prompt that asks
336
+ * for failure modes, surprising findings, and a recommendation.
337
+ */
338
+ analyze: Omit<AnalyzeTracesOptions, 'source'> & {
339
+ source?: AnalyzeTracesOptions['source'];
340
+ };
341
+ /**
342
+ * Override the question. The default is intentionally generic:
343
+ * "Summarise what happened in this run, surface any failure modes,
344
+ * surprising findings, or evidence the verdict is wrong."
345
+ */
346
+ question?: string;
347
+ /**
348
+ * Persist the result. The hook calls this with the analysis output and
349
+ * the run context. Common implementations write to a TraceAnalysisStore
350
+ * or append to a per-run JSONL.
351
+ */
352
+ save?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => Promise<void>;
353
+ /**
354
+ * Predicate gating execution per run. Default: every completed run.
355
+ * Use to skip aborted runs, debug runs, or runs without LLM activity.
356
+ */
357
+ shouldRun?: (ctx: RunCompleteHookContext) => boolean;
358
+ /**
359
+ * Optional gate: if set and returns false, the hook records the failure
360
+ * as a log event on the run instead of staying quiet. The caller can
361
+ * then trigger downstream alerts off `analyst_gate_failed` log events.
362
+ */
363
+ gateOn?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => boolean;
364
+ }
365
+ declare function traceAnalystOnRunComplete(opts: TraceAnalystHookOptions): RunCompleteHook;
366
+
367
+ interface TraceInsightTask {
368
+ id: string;
369
+ name: string;
370
+ prompt?: string;
371
+ difficulty?: string;
372
+ tags?: string[];
373
+ outcome?: string;
374
+ score?: number;
375
+ gaps?: string[];
376
+ }
377
+ interface TraceInsightSuite {
378
+ name: string;
379
+ collectionId?: string;
380
+ tasks: TraceInsightTask[];
381
+ }
382
+ interface TraceInsightFinding {
383
+ kind: string;
384
+ severity?: string;
385
+ taskIds: string[];
386
+ evidence?: string;
387
+ proposedFixClass?: string;
388
+ }
389
+ interface TraceInsightQuestion {
390
+ id: string;
391
+ question: string;
392
+ why: string;
393
+ }
394
+ interface TraceInsightPanelRole {
395
+ id: string;
396
+ name: string;
397
+ responsibility: string;
398
+ }
399
+ interface TraceInsightPromptInput {
400
+ suite: TraceInsightSuite;
401
+ findings?: TraceInsightFinding[];
402
+ agent?: Record<string, unknown>;
403
+ totals?: Record<string, unknown>;
404
+ maxRepresentativeTraces?: number;
405
+ }
406
+ interface TraceInsightContext {
407
+ suite: TraceInsightSuite;
408
+ scope: string;
409
+ keywords: string[];
410
+ questions: TraceInsightQuestion[];
411
+ panel: TraceInsightPanelRole[];
412
+ findings: TraceInsightFinding[];
413
+ agent: Record<string, unknown> | null;
414
+ totals: Record<string, unknown> | null;
415
+ }
416
+ interface TraceInsightQualityGate {
417
+ id: string;
418
+ label: string;
419
+ passed: boolean;
420
+ severity: 'critical' | 'high' | 'medium' | 'low';
421
+ detail: string;
422
+ }
423
+ interface TraceInsightReadiness {
424
+ score: number;
425
+ grade: 'external-ready' | 'internal-review' | 'raw-analysis';
426
+ gates: TraceInsightQualityGate[];
427
+ }
428
+ declare function tokenizeDomainWords(value: string): string[];
429
+ declare function inferDomainKeywords(suite: TraceInsightSuite): string[];
430
+ declare function domainEvidencePattern(keywords: string[]): RegExp;
431
+ declare function describeTraceInsightScope(suite: TraceInsightSuite): string;
432
+ declare function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[];
433
+ declare function buildTraceInsightContext(input: TraceInsightPromptInput): TraceInsightContext;
434
+ declare function scoreTraceInsightReadiness(context: TraceInsightContext): TraceInsightReadiness;
435
+ declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
436
+ declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
437
+
438
+ /** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
439
+ declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n ```js\n await final(\"Format the per-item verdict report from the evidence below.\", {\n findings: [\n { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n ...all items\n ],\n systemic_summary: '3 sentences I wrote based on the evidence above'\n });\n ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. `spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
440
+ declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
441
+ /** Subagent prompt for focused trace-inspection subtasks. */
442
+ declare const TRACE_ANALYST_SUBAGENT_DESCRIPTION = "You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.\n\nCite trace ids and span ids for every claim. Do NOT invent ids.";
443
+
558
444
  /**
559
445
  * `OtlpFileTraceStore` — read-only OTLP-JSONL trace store for the
560
446
  * trace-analyst.
@@ -645,14 +531,14 @@ declare class OtlpFileTraceStore implements TraceAnalysisStore {
645
531
  private buildOversizedSummary;
646
532
  private scanSpanForMatches;
647
533
  }
648
- declare class TraceFileMissingError extends Error {
534
+ declare class TraceFileMissingError extends NotFoundError {
649
535
  constructor(path: string);
650
536
  }
651
- declare class TraceNotFoundError extends Error {
537
+ declare class TraceNotFoundError extends NotFoundError {
652
538
  readonly trace_id: string;
653
539
  constructor(trace_id: string);
654
540
  }
655
- declare class SpanNotFoundError extends Error {
541
+ declare class SpanNotFoundError extends NotFoundError {
656
542
  readonly trace_id: string;
657
543
  readonly span_id: string;
658
544
  constructor(trace_id: string, span_id: string);
@@ -700,135 +586,4 @@ declare function traceAnalystFunctionGroup(opts: BuildTraceAnalystToolsOpts): {
700
586
  functions: AxFunction[];
701
587
  };
702
588
 
703
- /**
704
- * Trace-analyst auto-execution hook.
705
- *
706
- * Wires `analyzeTraces` into a `TraceEmitter`'s `onRunComplete` so a
707
- * direct matrix run produces an analysis artifact without an out-of-band
708
- * step. Designed for the case where the consumer reports "the analyst
709
- * never ran" — the cause is almost always orchestration, not the analyst.
710
- *
711
- * Usage:
712
- *
713
- * const emitter = new TraceEmitter(store, {
714
- * onRunComplete: [traceAnalystOnRunComplete({ analyze: opts, save })],
715
- * })
716
- *
717
- * Hooks are best-effort by default — they never crash the underlying run.
718
- * The caller decides whether to gate the run on the analysis result via
719
- * the `gateOn` callback.
720
- */
721
-
722
- interface TraceAnalystHookOptions {
723
- /**
724
- * Options forwarded to `analyzeTraces`. The hook supplies the question
725
- * if you don't pass one — defaulting to a launch-grade prompt that asks
726
- * for failure modes, surprising findings, and a recommendation.
727
- */
728
- analyze: Omit<AnalyzeTracesOptions, 'source'> & {
729
- source?: AnalyzeTracesOptions['source'];
730
- };
731
- /**
732
- * Override the question. The default is intentionally generic:
733
- * "Summarise what happened in this run, surface any failure modes,
734
- * surprising findings, or evidence the verdict is wrong."
735
- */
736
- question?: string;
737
- /**
738
- * Persist the result. The hook calls this with the analysis output and
739
- * the run context. Common implementations write to a TraceAnalysisStore
740
- * or append to a per-run JSONL.
741
- */
742
- save?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => Promise<void>;
743
- /**
744
- * Predicate gating execution per run. Default: every completed run.
745
- * Use to skip aborted runs, debug runs, or runs without LLM activity.
746
- */
747
- shouldRun?: (ctx: RunCompleteHookContext) => boolean;
748
- /**
749
- * Optional gate: if set and returns false, the hook records the failure
750
- * as a log event on the run instead of staying quiet. The caller can
751
- * then trigger downstream alerts off `analyst_gate_failed` log events.
752
- */
753
- gateOn?: (result: AnalyzeTracesResult, ctx: RunCompleteHookContext) => boolean;
754
- }
755
- declare function traceAnalystOnRunComplete(opts: TraceAnalystHookOptions): RunCompleteHook;
756
-
757
- /** Ax RLM prompt for bounded trace discovery and evidence-backed analysis. */
758
- declare const TRACE_ANALYST_ACTOR_DESCRIPTION = "You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the `traces` namespace.\n\nDISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol \u2014 follow exactly:\n\n1. ALWAYS call `traces.getDatasetOverview({})` FIRST without a regex_pattern. The result tells you total_traces, raw_jsonl_bytes, services, agents, models, and sample_trace_ids (real ids \u2014 never fabricate one).\n\n2. Use raw_jsonl_bytes to gauge how expensive raw scans will be. `filters.regex_pattern` is the one scan-heavy filter on getDatasetOverview / queryTraces / countTraces \u2014 narrow with indexed fields (has_errors, model_names, service_names, agent_names, time bounds) BEFORE adding a regex on a large dataset.\n\n3. To list more traces than the sample, call `traces.queryTraces({ filters?, limit, offset? })`. Each summary carries raw_jsonl_bytes \u2014 use it to choose between viewTrace and searchTrace BEFORE calling either.\n\n4. Per-trace inspection:\n - SMALL trace (raw_jsonl_bytes well under 150_000): call `traces.viewTrace({ trace_id })`. Returns all spans. Per-attribute payloads are head-capped at ~4KB; large `input.value` / `output.value` / `llm.input_messages` will show a `[trace-analyst truncated: N bytes]` marker.\n - LARGE trace (raw_jsonl_bytes near or above 150_000, or you saw an `oversized` response): use `traces.searchTrace({ trace_id, regex_pattern })` to get bounded SpanMatchRecords (span metadata + matched text + surrounding context). Then call `traces.viewSpans({ trace_id, span_ids: [...] })` for surgical reads (~16KB cap, 4\u00D7 higher than discovery), or `traces.searchSpan({ trace_id, span_id, regex_pattern })` for one large span. Stays bounded regardless of trace size.\n - Useful regex patterns: `STATUS_CODE_ERROR` (failures), tool names like `grep` or `view_trace`, error strings like `MaxTurnsExceeded`, model names, attribute keys.\n\n5. ONLY call viewTrace / viewSpans / searchTrace / searchSpan with trace/span ids you have already seen in sample_trace_ids, a queryTraces page, or a previous search result. Never invent ids.\n\n5a. **Result-shape contract** \u2014 searchTrace and searchSpan return `{ trace_id, hits, total_matches, has_more }`. Iterate `result.hits` (NOT result.matches). Each hit has `{ span_id, span_name, span_kind, attribute_path, matched_text, context_before, context_after, match_offset }`. viewTrace returns `{ trace_id, spans }` (or `oversized`). viewSpans returns `{ trace_id, spans, missing_span_ids, truncated_attribute_count }`. Never assume a field name \u2014 log the result shape first if unsure.\n\n6. If viewTrace returns an `oversized` summary instead of `spans`, DO NOT retry the same call. Read the summary's top_span_names, span_count, span_response_bytes_max, error_span_count to plan a follow-up: switch to searchTrace (or searchSpan for one large span), then viewSpans on a smaller, surgical span_ids set.\n\n7. If searchTrace or searchSpan returns has_more=true, REFINE the regex to be more specific rather than blindly raising max_matches.\n\n8. If a tool errors (invalid regex, range error), STOP and reconsider \u2014 don't retry with a guessed id or argument. Use the discovery tools above to recover.\n\n9. If a ~4KB-truncated payload from viewTrace / searchTrace matters for your answer, first try viewSpans on that span id (~16KB cap). If a 16KB-truncated payload from viewSpans still matters, narrow further with searchSpan against a more specific regex rather than asking for the full payload again.\n\n10. If maxDepth > 0 and the question splits into independent semantic branches, delegate well-defined subtasks to subagents using `await llmQuery(...)`. Pass narrow context and a focused query. Examples:\n\n const reviews = await llmQuery([\n { query: 'Drill into trace abc123 \u2014 what tool calls preceded the failure?', context: { trace_id: 'abc123' } },\n { query: 'Drill into trace def456 \u2014 same failure mode?', context: { trace_id: 'def456' } },\n ]);\n\nOBSERVABILITY rules:\n- Each non-final actor turn must emit at least one `console.log(...)` for evidence. Up to 3 logs per turn is fine when correlating multiple data sources (e.g. one log for findings list, one for source-file content, one for derived analysis).\n- Do NOT combine `console.log` with `final(...)` or `askClarification(...)` in the same turn \u2014 finish gathering data first, then call final on its own turn.\n- Reuse runtime variables across turns; don't recompute.\n- When done, call `await final(answer)` with the fully-formed report. The responder rewrites the answer into output fields; if you only pass a vague summary string the responder has nothing concrete to format.\n\nCRITICAL \u2014 `final()` payload contract for evidence-grounded analysis tasks:\n- Pass a STRUCTURED object as the second arg with the actual data the responder needs to format the answer. Do NOT pass abstract instructions; pass evidence.\n- Example for per-item verdict tasks:\n ```js\n await final(\"Format the per-item verdict report from the evidence below.\", {\n findings: [\n { id: 'sub-1-finding-1', claim: '...', verdict: 'TRUE-POSITIVE', evidence: 'lines 42-45 of contracts/X.sol show ...' },\n ...all items\n ],\n systemic_summary: '3 sentences I wrote based on the evidence above'\n });\n ```\n- Calling `final(\"answer\", {})` with no evidence is a failure mode \u2014 the responder will hallucinate or echo back the field names. Always include the gathered data.\n- Premature final after a single viewSpans call is INSUFFICIENT for per-finding analysis tasks. Read the requested attributes (e.g. `spans[i].attributes['redteam.finding.title']`), and for each one perform the requested cross-reference (e.g. read the source SPAN's `attributes['source.content']`).\n\nOUTPUT contract \u2014 your final answer must include:\n- A clear prose conclusion answering the user's question.\n- Trace ids and span ids cited as evidence for each claim.\n- Failure modes named in the user's domain language, with frequency and concrete examples.\n\nDo NOT invent trace ids, span ids, error messages, or model names. Every fact must be traceable to a tool result.";
759
- declare const TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION = "trace-analyst-actor-v5-2026-05-06";
760
- /** Subagent prompt for focused trace-inspection subtasks. */
761
- declare const TRACE_ANALYST_SUBAGENT_DESCRIPTION = "You are a trace-analyst subagent. Your parent has delegated a focused trace-inspection question. Use the same DISCOVERY \u2192 NARROW \u2192 DEEP-READ protocol but stay tightly scoped: do exactly what was asked, return a concise compact answer, do NOT spawn further subagents unless the parent's question is genuinely multi-branch.\n\nCite trace ids and span ids for every claim. Do NOT invent ids.";
762
-
763
- interface TraceInsightTask {
764
- id: string;
765
- name: string;
766
- prompt?: string;
767
- difficulty?: string;
768
- tags?: string[];
769
- outcome?: string;
770
- score?: number;
771
- gaps?: string[];
772
- }
773
- interface TraceInsightSuite {
774
- name: string;
775
- collectionId?: string;
776
- tasks: TraceInsightTask[];
777
- }
778
- interface TraceInsightFinding {
779
- kind: string;
780
- severity?: string;
781
- taskIds: string[];
782
- evidence?: string;
783
- proposedFixClass?: string;
784
- }
785
- interface TraceInsightQuestion {
786
- id: string;
787
- question: string;
788
- why: string;
789
- }
790
- interface TraceInsightPanelRole {
791
- id: string;
792
- name: string;
793
- responsibility: string;
794
- }
795
- interface TraceInsightPromptInput {
796
- suite: TraceInsightSuite;
797
- findings?: TraceInsightFinding[];
798
- agent?: Record<string, unknown>;
799
- totals?: Record<string, unknown>;
800
- maxRepresentativeTraces?: number;
801
- }
802
- interface TraceInsightContext {
803
- suite: TraceInsightSuite;
804
- scope: string;
805
- keywords: string[];
806
- questions: TraceInsightQuestion[];
807
- panel: TraceInsightPanelRole[];
808
- findings: TraceInsightFinding[];
809
- agent: Record<string, unknown> | null;
810
- totals: Record<string, unknown> | null;
811
- }
812
- interface TraceInsightQualityGate {
813
- id: string;
814
- label: string;
815
- passed: boolean;
816
- severity: 'critical' | 'high' | 'medium' | 'low';
817
- detail: string;
818
- }
819
- interface TraceInsightReadiness {
820
- score: number;
821
- grade: 'external-ready' | 'internal-review' | 'raw-analysis';
822
- gates: TraceInsightQualityGate[];
823
- }
824
- declare function tokenizeDomainWords(value: string): string[];
825
- declare function inferDomainKeywords(suite: TraceInsightSuite): string[];
826
- declare function domainEvidencePattern(keywords: string[]): RegExp;
827
- declare function describeTraceInsightScope(suite: TraceInsightSuite): string;
828
- declare function planTraceInsightQuestions(input: TraceInsightPromptInput): TraceInsightQuestion[];
829
- declare function buildTraceInsightContext(input: TraceInsightPromptInput): TraceInsightContext;
830
- declare function scoreTraceInsightReadiness(context: TraceInsightContext): TraceInsightReadiness;
831
- declare function defaultTraceInsightPanel(): TraceInsightPanelRole[];
832
- declare function buildTraceInsightPrompt(input: TraceInsightPromptInput): string;
833
-
834
- export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, FailureClass, JudgeSpan, LlmSpan, OTEL_AGENT_EVAL_SCOPE, type OtlpExport, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type OtlpResourceSpans, type OtlpSpan, type QueryTracesPage, REDACTION_VERSION, RawProviderEvent, RawProviderSink, type RedactionReport, type RedactionRule, ReplayCache, type ReplayCacheEntry, ReplayCacheMissError, type ReplayCacheStats, type ReplayFetchOptions, Run, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, ToolSpan, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, TraceStore, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, llmSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete };
589
+ export { type AnalyzeTracesInput, type AnalyzeTracesOptions, type AnalyzeTracesResult, type AnalyzeTracesTurnSnapshot, DEFAULT_TRACE_ANALYST_BUDGETS, type DatasetOverview, OtlpFileTraceStore, type OtlpFileTraceStoreOptions, type QueryTracesPage, RunCompleteHook, RunCompleteHookContext, type SearchSpanResult, type SearchTraceResult, type SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, type TraceAnalysisStore, type TraceAnalystByteBudgets, type TraceAnalystFilters, type TraceAnalystHookOptions, type TraceAnalystSpan, type TraceAnalystSpanKind, type TraceAnalystSpanStatus, type TraceAnalystTraceSummary, TraceFileMissingError, type TraceInsightContext, type TraceInsightFinding, type TraceInsightPanelRole, type TraceInsightPromptInput, type TraceInsightQualityGate, type TraceInsightQuestion, type TraceInsightReadiness, type TraceInsightSuite, type TraceInsightTask, TraceNotFoundError, type ViewSpansResult, type ViewTraceOversized, type ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, inferDomainKeywords, planTraceInsightQuestions, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete };