@tangle-network/agent-eval 0.48.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +7 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/{traceai.d.ts → otel.d.ts} +29 -29
  5. package/dist/adapters/{traceai.js → otel.js} +9 -5
  6. package/dist/adapters/otel.js.map +1 -0
  7. package/dist/campaign/index.d.ts +3 -3
  8. package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
  9. package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
  10. package/dist/chunk-EGIPWXHL.js.map +1 -0
  11. package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
  12. package/dist/chunk-FQK2CCIM.js.map +1 -0
  13. package/dist/chunk-MAZ26DC7.js +99 -0
  14. package/dist/chunk-MAZ26DC7.js.map +1 -0
  15. package/dist/chunk-SHTXZ4O2.js +113 -0
  16. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  17. package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
  18. package/dist/contract/index.d.ts +206 -9
  19. package/dist/contract/index.js +751 -3
  20. package/dist/contract/index.js.map +1 -1
  21. package/dist/governance/index.d.ts +1 -1
  22. package/dist/hosted/index.d.ts +8 -192
  23. package/dist/hosted/index.js +1 -1
  24. package/dist/index-BRxz6qov.d.ts +409 -0
  25. package/dist/index.d.ts +18 -462
  26. package/dist/index.js +14 -106
  27. package/dist/index.js.map +1 -1
  28. package/dist/meta-eval/index.d.ts +3 -3
  29. package/dist/openapi.json +1 -1
  30. package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
  31. package/dist/registry-8KAs18kY.d.ts +457 -0
  32. package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
  33. package/dist/reporting.d.ts +6 -4
  34. package/dist/reporting.js +6 -4
  35. package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
  36. package/dist/rl.d.ts +9 -8
  37. package/dist/rl.js +3 -2
  38. package/dist/rl.js.map +1 -1
  39. package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  40. package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
  41. package/dist/sequential-5iSVfzl2.d.ts +139 -0
  42. package/dist/store-CJbzDxZ2.d.ts +220 -0
  43. package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
  44. package/dist/traces.d.ts +3 -220
  45. package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
  46. package/dist/types-DhqpAi_z.d.ts +296 -0
  47. package/docs/adapters-observability.md +3 -3
  48. package/package.json +5 -5
  49. package/dist/adapters/traceai.js.map +0 -1
  50. package/dist/chunk-MNL6LXGQ.js.map +0 -1
  51. package/dist/chunk-OYI6RZJK.js.map +0 -1
  52. /package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
  53. /package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
  54. /package/docs/design/{substrate-gaps-2026-05-27.md → substrate-gaps.md} +0 -0
@@ -1,7 +1,7 @@
1
1
  import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
2
- import { b as OutcomeFilter, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
3
- export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
4
- export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-CJ08tGwq.js';
2
+ import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
3
+ export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
+ export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-ByZEC3BX.js';
5
5
  import '../run-record-BGY6bHRh.js';
6
6
  import '../errors-mje_cKOs.js';
7
7
 
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.48.0",
5
+ "version": "0.50.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -60,4 +60,4 @@ declare class FileSystemOutcomeStore implements OutcomeStore {
60
60
  list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
61
61
  }
62
62
 
63
- export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeStore as O, type FileSystemOutcomeStoreOptions as a, type OutcomeFilter as b };
63
+ export { type DeploymentOutcome as D, FileSystemOutcomeStore as F, InMemoryOutcomeStore as I, type OutcomeStore as O, type OutcomeFilter as a, type FileSystemOutcomeStoreOptions as b };
@@ -0,0 +1,457 @@
1
+ import { b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
2
+ import { R as RunRecord } from './run-record-BGY6bHRh.js';
3
+ import { T as TraceAnalysisStore } from './store-CJbzDxZ2.js';
4
+ import { J as JudgeInput } from './types-DhqpAi_z.js';
5
+
6
+ /**
7
+ * ChatClient — the single LLM abstraction analysts call.
8
+ *
9
+ * agent-eval already ships an `LlmClient` (OpenAI-compatible, retry,
10
+ * graceful JSON-schema degrade) and judges that talk to `TCloud`. Two
11
+ * mixed patterns force every analyst author to pick a transport, which
12
+ * couples analyst code to runtime concerns (cli-bridge vs router vs
13
+ * sandbox-sdk) it shouldn't know about.
14
+ *
15
+ * `ChatClient` is one interface every analyst takes via `AnalystContext.chat`.
16
+ * The operator decides at the registry boundary which transport binds
17
+ * to it. Analyst code stays transport-agnostic; swapping production
18
+ * (sandbox-sdk) for local dev (cli-bridge) or tests (mock) is a one-
19
+ * line factory call.
20
+ *
21
+ * Designed to coexist: existing `LlmClient` callers and existing
22
+ * `TCloud`-based judges keep working untouched. New analyst code uses
23
+ * `ChatClient`. When old call sites migrate, they pick up budgeting,
24
+ * cancellation, and unified telemetry for free.
25
+ */
26
+
27
+ /**
28
+ * Unified chat interface. Mirrors LlmCallRequest/Result so the OpenAI-
29
+ * compatible mental model stays. Two methods: a one-shot `chat()` and
30
+ * a `streamChat()` for future agentic loops (not yet exposed).
31
+ */
32
+ interface ChatClient {
33
+ /** Display name of the bound transport — included in telemetry. */
34
+ readonly transport: ChatTransport;
35
+ /** Default model when caller omits — operators bind this per environment. */
36
+ readonly defaultModel?: string;
37
+ chat(req: ChatRequest, opts?: ChatCallOpts): Promise<ChatResponse>;
38
+ }
39
+ type ChatTransport = 'router' | 'sandbox-sdk' | 'cli-bridge' | 'direct-provider' | 'mock';
40
+ interface ChatRequest extends Omit<LlmCallRequest, 'model'> {
41
+ /** Optional — falls back to ChatClient.defaultModel. */
42
+ model?: string;
43
+ }
44
+ type ChatResponse = LlmCallResult;
45
+ interface ChatCallOpts {
46
+ /** Cancel the in-flight request. */
47
+ signal?: AbortSignal;
48
+ /** Hard USD ceiling for this single call (informational; the underlying transport may not enforce it). */
49
+ maxCostUsd?: number;
50
+ /** Correlation tag carried into request headers when the transport allows. */
51
+ correlationId?: string;
52
+ }
53
+ type CreateChatClientOpts = RouterTransportOpts | CliBridgeTransportOpts | DirectProviderTransportOpts | SandboxSdkTransportOpts | MockTransportOpts;
54
+ interface BaseTransportOpts {
55
+ defaultModel?: string;
56
+ }
57
+ interface RouterTransportOpts extends BaseTransportOpts {
58
+ transport: 'router';
59
+ baseUrl?: string;
60
+ apiKey: string;
61
+ }
62
+ interface CliBridgeTransportOpts extends BaseTransportOpts {
63
+ transport: 'cli-bridge';
64
+ baseUrl?: string;
65
+ bearer?: string;
66
+ }
67
+ interface DirectProviderTransportOpts extends BaseTransportOpts {
68
+ transport: 'direct-provider';
69
+ baseUrl: string;
70
+ apiKey: string;
71
+ }
72
+ /**
73
+ * Sandbox-SDK transport. Provided as a thin pass-through: the caller
74
+ * supplies a callable that mimics LlmClient.chat() against an already-
75
+ * configured Sandbox handle. We don't import the SDK here to keep
76
+ * agent-eval dep-free of @tangle-network/sandbox.
77
+ */
78
+ interface SandboxSdkTransportOpts extends BaseTransportOpts {
79
+ transport: 'sandbox-sdk';
80
+ chat: (req: ChatRequest, opts?: ChatCallOpts) => Promise<ChatResponse>;
81
+ }
82
+ /**
83
+ * Mock transport for tests. The handler receives the request and returns
84
+ * whatever the test wants. No retries, no JSON-schema degrade.
85
+ */
86
+ interface MockTransportOpts extends BaseTransportOpts {
87
+ transport: 'mock';
88
+ handler: (req: ChatRequest, opts?: ChatCallOpts) => Promise<ChatResponse>;
89
+ }
90
+ /**
91
+ * Build a ChatClient bound to a specific transport. The returned client
92
+ * is safe to share across analysts in a single registry run.
93
+ */
94
+ declare function createChatClient(opts: CreateChatClientOpts): ChatClient;
95
+
96
+ /**
97
+ * Analyst contract — the missing orchestration layer over agent-eval's
98
+ * existing analyzers (analyzeTraces, MultiLayerVerifier, RunCritic,
99
+ * SemanticConceptJudge, JudgeFn, ...).
100
+ *
101
+ * Each existing primitive returns its own output shape. The Analyst
102
+ * contract is the single envelope every primitive lifts into, so a
103
+ * registry can run N analysts against a run and a single renderer can
104
+ * compose findings without knowing which analyzer produced them.
105
+ *
106
+ * The contract is intentionally domain-agnostic: nothing here knows
107
+ * about code, voice, RAG, or any particular agent stack. Analysts
108
+ * declare what INPUT KIND they need (a trace store, an artifact dir,
109
+ * a RunRecord, a JudgeInput, or `custom`), and the registry routes
110
+ * the matching input from `AnalystRunInputs`.
111
+ */
112
+
113
+ /**
114
+ * Unified envelope every analyst emits. Schema-versioned so renderers
115
+ * and time-series diffs survive future field additions.
116
+ */
117
+ interface AnalystFinding {
118
+ schema_version: '1.0.0';
119
+ /**
120
+ * Stable hash over identity-defining fields (analyst_id + canonical
121
+ * claim + area + optional subject). Two findings from two runs that
122
+ * "are the same finding" share this id — that's what `diffFindings`
123
+ * uses to compute appeared/disappeared sets across runs.
124
+ */
125
+ finding_id: string;
126
+ analyst_id: string;
127
+ produced_at: string;
128
+ severity: AnalystSeverity;
129
+ /**
130
+ * Coarse classification. Renderers group by this. Free-form so
131
+ * domain-specific analysts can introduce categories without a
132
+ * schema change ('agent-reasoning', 'verification', 'cost',
133
+ * 'tool-use', 'safety', 'latency', 'data-quality', ...).
134
+ */
135
+ area: string;
136
+ claim: string;
137
+ rationale?: string;
138
+ evidence_refs: EvidenceRef[];
139
+ recommended_action?: string;
140
+ validation_plan?: string;
141
+ /** 0..1 — the analyst's own confidence. Not calibrated across analysts. */
142
+ confidence: number;
143
+ /**
144
+ * Optional subject the finding is about — leaf id, agent id, request
145
+ * id. Included in finding_id when present so per-subject findings
146
+ * diff cleanly across runs.
147
+ */
148
+ subject?: string;
149
+ /** Analyst-private extras; renderers ignore unless they know the analyst. */
150
+ metadata?: Record<string, unknown>;
151
+ }
152
+ type AnalystSeverity = 'critical' | 'high' | 'medium' | 'low' | 'info';
153
+ interface EvidenceRef {
154
+ /**
155
+ * Where the evidence lives. `span` and `event` refer to OTLP trace
156
+ * elements; `artifact` to a file inside the run's artifact tree;
157
+ * `finding` to another AnalystFinding (cross-analyst chaining);
158
+ * `metric` to a named scalar reading the renderer knows how to read.
159
+ */
160
+ kind: 'span' | 'event' | 'artifact' | 'finding' | 'metric';
161
+ uri: string;
162
+ excerpt?: string;
163
+ }
164
+ /**
165
+ * The discriminator the registry uses to pass the right input.
166
+ * `custom` is the escape hatch — analysts that need something else
167
+ * (e.g. an embedding cache, a partner SDK handle) read it from
168
+ * `AnalystRunInputs.custom[<analyst id>]`.
169
+ */
170
+ type AnalystInputKind = 'trace-store' | 'artifact-dir' | 'run-record' | 'judge-input' | 'custom';
171
+ interface AnalystCost {
172
+ /** `deterministic` analysts MUST NOT call the LLM. */
173
+ kind: 'deterministic' | 'llm';
174
+ /** Optional declared upper bound; the registry can enforce a budget. */
175
+ est_usd_per_run?: number;
176
+ /** Models the analyst expects to use (informational). */
177
+ models?: string[];
178
+ }
179
+ interface AnalystRequirements {
180
+ /** Min number of shots / samples the analyst needs to produce signal. */
181
+ min_shots?: number;
182
+ /** Capabilities the runtime must supply (e.g. ['network', 'gpu']). */
183
+ capabilities?: string[];
184
+ }
185
+ /**
186
+ * What's passed to every analyst call. The registry resolves which
187
+ * field the analyst's `inputKind` selects and asserts it's present.
188
+ */
189
+ interface AnalystRunInputs {
190
+ traceStore?: TraceAnalysisStore;
191
+ artifactDir?: string;
192
+ runRecord?: RunRecord;
193
+ judgeInput?: JudgeInput;
194
+ /** Keyed by analyst id; populated by callers that registered custom analysts. */
195
+ custom?: Record<string, unknown>;
196
+ }
197
+ interface AnalystContext {
198
+ runId: string;
199
+ /** Stable correlation id so logs from a single registry.run() share a tag. */
200
+ correlationId: string;
201
+ /** Wall-clock deadline (epoch ms). Analysts SHOULD honor for graceful cancel. */
202
+ deadlineMs?: number;
203
+ /** Per-analyst USD budget. Analysts MAY check before issuing LLM calls. */
204
+ budgetUsd?: number;
205
+ /**
206
+ * Shared chat client. Analysts that call an LLM go through this so
207
+ * the operator picks transport (sandbox-sdk | router | cli-bridge |
208
+ * direct-provider | mock) at the registry boundary without touching
209
+ * analyst code.
210
+ */
211
+ chat?: ChatClient;
212
+ /**
213
+ * Findings from a prior run the operator wants the analyst to see as
214
+ * retrieval context. Kinds that take advantage of cross-run memory
215
+ * (failure-mode "I saw this cluster last run", knowledge-gap "the wiki
216
+ * page I asked for is still missing") render these into the actor's
217
+ * working set. Filtering is the operator's job: pass the slice that
218
+ * matches the analyst's id, or pass everything and let the kind
219
+ * filter. Empty / absent means no cross-run context.
220
+ */
221
+ priorFindings?: ReadonlyArray<AnalystFinding>;
222
+ /** Free-form runtime tags (env, host, op). Findings can echo these into metadata. */
223
+ tags?: Record<string, string>;
224
+ /** Logger callback — analysts SHOULD prefer this over console.* for testability. */
225
+ log?: (msg: string, fields?: Record<string, unknown>) => void;
226
+ /** Optional abort signal. Analysts SHOULD pass it through to LLM calls. */
227
+ signal?: AbortSignal;
228
+ }
229
+ /**
230
+ * The minimal contract. Concrete analysts can refine `TInput` so
231
+ * implementations stay type-safe (e.g. a trace analyst's `TInput` is
232
+ * `TraceAnalysisStore`); the registry passes the right field from
233
+ * `AnalystRunInputs` based on `inputKind`.
234
+ */
235
+ interface Analyst<TInput = unknown> {
236
+ /** Stable identifier — appears in finding_id, telemetry, and registry exclusion lists. */
237
+ readonly id: string;
238
+ /** Human-readable. One sentence. */
239
+ readonly description: string;
240
+ readonly inputKind: AnalystInputKind;
241
+ readonly cost: AnalystCost;
242
+ readonly requires?: AnalystRequirements;
243
+ /** Bump on breaking changes to claim wording or area so old finding_ids don't collide. */
244
+ readonly version: string;
245
+ analyze(input: TInput, ctx: AnalystContext): Promise<AnalystFinding[]>;
246
+ }
247
+ /**
248
+ * Compute the stable finding_id from the identity-defining fields.
249
+ * Default implementation hashes {analyst_id, area, subject, normalized claim}.
250
+ * Analysts that emit findings whose claim text varies per run (timestamps,
251
+ * counts) SHOULD either: (a) pass an explicit `id_basis` to fix the hash,
252
+ * or (b) move the variable part into `rationale`/`metadata` and keep the
253
+ * `claim` static.
254
+ */
255
+ declare function computeFindingId(input: {
256
+ analyst_id: string;
257
+ area: string;
258
+ subject?: string;
259
+ claim: string;
260
+ /** Override the claim for hashing — use when the displayed claim has run-specific bits. */
261
+ id_basis?: string;
262
+ }): string;
263
+ /**
264
+ * Convenience factory: produce a fully-formed AnalystFinding with the
265
+ * id computed automatically. Analyst code stays terse.
266
+ */
267
+ declare function makeFinding(init: Omit<AnalystFinding, 'schema_version' | 'finding_id' | 'produced_at'> & {
268
+ id_basis?: string;
269
+ produced_at?: string;
270
+ }): AnalystFinding;
271
+ interface AnalystRunSummary {
272
+ analyst_id: string;
273
+ status: 'ok' | 'skipped' | 'failed';
274
+ /** Why skipped — missing input, budget exceeded, capability unmet. */
275
+ reason?: string;
276
+ findings_count: number;
277
+ latency_ms: number;
278
+ cost_usd: number;
279
+ /** When `status='failed'`: the error class + message, never the full stack. */
280
+ error?: {
281
+ class: string;
282
+ message: string;
283
+ };
284
+ }
285
+ interface AnalystRunResult {
286
+ run_id: string;
287
+ correlation_id: string;
288
+ started_at: string;
289
+ ended_at: string;
290
+ findings: AnalystFinding[];
291
+ per_analyst: AnalystRunSummary[];
292
+ /** Total LLM cost in USD across all analysts in this registry.run(). */
293
+ total_cost_usd: number;
294
+ }
295
+ /**
296
+ * Events emitted by `AnalystRegistry.runStream(...)` in real time as
297
+ * the registry executes. UIs subscribe via `for await (const ev of
298
+ * registry.runStream(...))`; `registry.run(...)` is a thin collector
299
+ * over the same stream, so the two surfaces share their invariants.
300
+ *
301
+ * Per-finding events are intentionally omitted — analyzers are batch
302
+ * operations (an Ax actor returns the full `findings:json[]` at the
303
+ * end of the responder), so streaming inside one analyst would only
304
+ * emit partial JSON consumers can't render. The `analyst-completed` event
305
+ * is the right granularity; subscribers wanting per-finding rendering
306
+ * iterate `event.findings` themselves.
307
+ */
308
+ type AnalystRunEvent = {
309
+ type: 'run-started';
310
+ run_id: string;
311
+ correlation_id: string;
312
+ started_at: string;
313
+ /** The ordered list of analyst ids the registry will run. */
314
+ analyst_ids: ReadonlyArray<string>;
315
+ } | {
316
+ type: 'analyst-skipped';
317
+ summary: AnalystRunSummary;
318
+ } | {
319
+ type: 'analyst-started';
320
+ analyst_id: string;
321
+ started_at: string;
322
+ } | {
323
+ type: 'analyst-completed';
324
+ /** `summary.status` is `'ok'` for clean completion or `'failed'` when the analyst threw. */
325
+ summary: AnalystRunSummary;
326
+ findings: ReadonlyArray<AnalystFinding>;
327
+ } | {
328
+ type: 'run-completed';
329
+ result: AnalystRunResult;
330
+ };
331
+
332
+ /**
333
+ * AnalystRegistry — orchestrate N analysts against one run.
334
+ *
335
+ * Owns three responsibilities and only three:
336
+ * 1. Registration — ids must be unique; bad registrations fail loudly
337
+ * at register-time, not run-time.
338
+ * 2. Routing — each analyst declares its `inputKind`; the registry
339
+ * picks the matching field from AnalystRunInputs and skips the
340
+ * analyst with a logged reason if it's missing.
341
+ * 3. Isolation — one analyst's exception MUST NOT stop other analysts.
342
+ * Failed analysts produce zero findings + a 'failed' summary row.
343
+ *
344
+ * Cross-cutting concerns (telemetry, error → finding conversion, cost
345
+ * ingestion, storage rotation) live in `AnalystHooks`. Budget shaping
346
+ * (equal split vs weighted vs custom) lives in `BudgetPolicy`. Both
347
+ * have sensible defaults; consumers override only what they need.
348
+ */
349
+
350
+ interface AnalystHooks {
351
+ /** Before analyze() — last chance to mutate ctx (e.g. inject tags, override budget). */
352
+ onBeforeAnalyze?(args: {
353
+ analyst: Analyst;
354
+ ctx: AnalystContext;
355
+ runId: string;
356
+ }): void | Promise<void>;
357
+ /** After every analyst (ok | failed | skipped). Use for telemetry, ingestion, rotation. */
358
+ onAfterAnalyze?(args: {
359
+ analyst: Analyst;
360
+ summary: AnalystRunSummary;
361
+ findings: AnalystFinding[];
362
+ runId: string;
363
+ }): void | Promise<void>;
364
+ /**
365
+ * On analyst exception. Hook MAY return findings to convert the
366
+ * error into structured findings; the summary still reports 'failed'.
367
+ * Return void to keep the default empty-findings behavior.
368
+ */
369
+ onError?(args: {
370
+ analyst: Analyst;
371
+ error: Error;
372
+ runId: string;
373
+ }): AnalystFinding[] | undefined | Promise<AnalystFinding[] | undefined>;
374
+ /** Once after registry.run() completes. Use for final aggregation, persistence. */
375
+ onComplete?(args: {
376
+ result: AnalystRunResult;
377
+ }): void | Promise<void>;
378
+ }
379
+ interface BudgetPolicy {
380
+ /** Overall USD cap across the registry.run(). */
381
+ totalUsd?: number;
382
+ /** Per-analyst weight for the default allocator. Missing ids get weight 1. */
383
+ weights?: Record<string, number>;
384
+ /**
385
+ * Custom allocator — receives the analyst, remaining/total budget, and
386
+ * the count of analysts that will run. Returns the per-analyst budget
387
+ * (or undefined to leave it uncapped). Overrides weights when set.
388
+ */
389
+ allocate?: (args: {
390
+ analyst: Analyst;
391
+ totalUsd: number | undefined;
392
+ remainingUsd: number | undefined;
393
+ runningCount: number;
394
+ }) => number | undefined;
395
+ }
396
+ interface AnalystRegistryOptions {
397
+ /** Shared chat client passed to every LLM analyst via AnalystContext. */
398
+ chat?: ChatClient;
399
+ /** Logger callback. Defaults to a no-op. */
400
+ log?: (msg: string, fields?: Record<string, unknown>) => void;
401
+ /** Hooks invoked around analyze() — observability + customization seam. */
402
+ hooks?: AnalystHooks;
403
+ /** Default budget when run() doesn't override. */
404
+ defaultBudget?: BudgetPolicy;
405
+ }
406
+ interface RegistryRunOpts {
407
+ /** Restrict to a subset of registered analysts by id. */
408
+ only?: string[];
409
+ /** Skip these analysts even if registered. Useful for cheap iteration. */
410
+ skip?: string[];
411
+ /** Budget policy — totalUsd + optional weights/allocator. Falls back to options.defaultBudget. */
412
+ budget?: BudgetPolicy;
413
+ /** Wall-clock cap. Analysts SHOULD honor `ctx.deadlineMs`. */
414
+ timeoutMs?: number;
415
+ /** Abort signal — forwarded into every analyst's context. */
416
+ signal?: AbortSignal;
417
+ /** Tags echoed into AnalystContext.tags — useful for tracking environment/version in findings. */
418
+ tags?: Record<string, string>;
419
+ /**
420
+ * Prior-run findings made available as retrieval context to every
421
+ * analyst via `ctx.priorFindings`. The registry forwards the slice
422
+ * whose `analyst_id` matches each registered analyst so a kind sees
423
+ * only its own history. Pass `{ '*': findings }` to broadcast to
424
+ * every analyst (useful for cross-kind chaining where the improvement
425
+ * analyst consumes upstream failure findings).
426
+ */
427
+ priorFindings?: ReadonlyArray<AnalystFinding> | Record<string, ReadonlyArray<AnalystFinding>>;
428
+ }
429
+ declare class AnalystRegistry {
430
+ private readonly analysts;
431
+ private readonly options;
432
+ constructor(options?: AnalystRegistryOptions);
433
+ register(analyst: Analyst): void;
434
+ list(): ReadonlyArray<{
435
+ id: string;
436
+ description: string;
437
+ version: string;
438
+ cost: Analyst['cost'];
439
+ }>;
440
+ run(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): Promise<AnalystRunResult>;
441
+ /**
442
+ * Streaming counterpart to `run()`. Emits `AnalystRunEvent` values
443
+ * in real time — `run-started`, then per-analyst `analyst-skipped` /
444
+ * `analyst-started` / `analyst-completed`, then a terminal `run-completed` whose
445
+ * payload is the full `AnalystRunResult`. UIs use this to render
446
+ * progress; persistence consumers use `run()` and read the result.
447
+ *
448
+ * Hooks (`onBeforeAnalyze` / `onAfterAnalyze` / `onError` /
449
+ * `onComplete`) fire as before — streaming is additive, not a hook
450
+ * replacement.
451
+ */
452
+ runStream(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): AsyncGenerator<AnalystRunEvent, void, void>;
453
+ private selectAnalysts;
454
+ private routeInput;
455
+ }
456
+
457
+ export { AnalystRegistry as A, type BudgetPolicy as B, type ChatCallOpts as C, type DirectProviderTransportOpts as D, type EvidenceRef as E, type MockTransportOpts as M, type RegistryRunOpts as R, type SandboxSdkTransportOpts as S, type Analyst as a, type AnalystSeverity as b, type AnalystFinding as c, type AnalystCost as d, type AnalystContext as e, type AnalystHooks as f, type AnalystInputKind as g, type AnalystRegistryOptions as h, type AnalystRequirements as i, type AnalystRunEvent as j, type AnalystRunInputs as k, type AnalystRunResult as l, type AnalystRunSummary as m, type ChatClient as n, type ChatRequest as o, type ChatResponse as p, type ChatTransport as q, type CliBridgeTransportOpts as r, type CreateChatClientOpts as s, type RouterTransportOpts as t, computeFindingId as u, createChatClient as v, makeFinding as w };