@tangle-network/agent-eval 0.27.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +87 -0
  2. package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/chunk-UW4NOOZI.js +1561 -0
  5. package/dist/chunk-UW4NOOZI.js.map +1 -0
  6. package/dist/{control-BT4qnXiS.d.ts → control-rJhEDdpy.d.ts} +4 -4
  7. package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-BRdQ0wrx.d.ts} +2 -2
  8. package/dist/control.d.ts +5 -5
  9. package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
  10. package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-D1NZKqYu.d.ts} +1 -1
  11. package/dist/{feedback-trajectory-D1aGKusy.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
  12. package/dist/governance/index.d.ts +2 -2
  13. package/dist/{index-BhLlu-qO.d.ts → index-Cgt3DKXr.d.ts} +1 -1
  14. package/dist/index.d.ts +1190 -335
  15. package/dist/index.js +1580 -489
  16. package/dist/index.js.map +1 -1
  17. package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
  18. package/dist/knowledge/index.d.ts +3 -3
  19. package/dist/meta-eval/index.d.ts +1 -1
  20. package/dist/{multi-layer-verifier-U-c8ge1k.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +1 -1
  21. package/dist/openapi.json +1 -1
  22. package/dist/optimization.d.ts +8 -8
  23. package/dist/pipelines/index.d.ts +6 -6
  24. package/dist/prm/index.d.ts +4 -4
  25. package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
  26. package/dist/{release-report-CCQqnK46.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
  27. package/dist/replay-BX5Fm8en.d.ts +529 -0
  28. package/dist/reporting.d.ts +4 -4
  29. package/dist/{researcher-G81CWc0q.d.ts → researcher-ClDX3KZx.d.ts} +5 -5
  30. package/dist/rl.d.ts +8 -8
  31. package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
  32. package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
  33. package/dist/{summary-report-Dl4akLKX.d.ts → summary-report-jrSGb2xZ.d.ts} +1 -1
  34. package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
  35. package/dist/traces.d.ts +9 -311
  36. package/dist/traces.js +15 -986
  37. package/dist/traces.js.map +1 -1
  38. package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
  39. package/dist/wire/index.d.ts +4 -4
  40. package/package.json +1 -1
  41. package/dist/chunk-4U4BKCXK.js +0 -569
  42. package/dist/chunk-4U4BKCXK.js.map +0 -1
  43. package/dist/replay-D7z0J43-.d.ts +0 -225
@@ -1,5 +1,5 @@
1
1
  import { C as CaptureIntegrityError } from './errors-BZ9sTdz7.js';
2
- import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
+ import { T as TraceStore } from './store-BP5be6s7.js';
3
3
 
4
4
  /**
5
5
  * RawProviderSink — first-class persistence for the actual HTTP-level
@@ -208,4 +208,4 @@ declare function assertRunCaptured(store: TraceStore, runId: string, expectation
208
208
  /** Strict mode: throws `RunIntegrityError` when the report isn't ok. */
209
209
  declare function throwIfRunIncomplete(report: RunIntegrityReport): void;
210
210
 
211
- export { FileSystemRawProviderSink as F, InMemoryRawProviderSink as I, NoopRawProviderSink as N, type ProviderRedactor as P, type RawProviderSink as R, type RunIntegrityExpectations as a, type RunIntegrityReport as b, type RawProviderEvent as c, type FileSystemRawProviderSinkOptions as d, type InMemoryRawProviderSinkOptions as e, type RawProviderDirection as f, type RawProviderSinkFilter as g, RunIntegrityError as h, type RunIntegrityIssue as i, type RunIntegrityIssueCode as j, assertRunCaptured as k, defaultProviderRedactor as l, providerFromBaseUrl as p, throwIfRunIncomplete as t };
211
+ export { FileSystemRawProviderSink as F, InMemoryRawProviderSink as I, NoopRawProviderSink as N, type ProviderRedactor as P, type RawProviderSink as R, type RunIntegrityExpectations as a, type RunIntegrityReport as b, type FileSystemRawProviderSinkOptions as c, type InMemoryRawProviderSinkOptions as d, type RawProviderDirection as e, type RawProviderEvent as f, type RawProviderSinkFilter as g, RunIntegrityError as h, type RunIntegrityIssue as i, type RunIntegrityIssueCode as j, assertRunCaptured as k, defaultProviderRedactor as l, providerFromBaseUrl as p, throwIfRunIncomplete as t };
@@ -1,6 +1,6 @@
1
- import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BZ_lVLYW.js';
2
- import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
- import '../store-Db2Bv8Cf.js';
1
+ import { j as ControlSeverity, C as ControlEvalResult } from '../control-runtime-BRdQ0wrx.js';
2
+ import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
3
+ import '../store-BP5be6s7.js';
4
4
 
5
5
  type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
6
6
  type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
@@ -1,4 +1,4 @@
1
- import { R as Run, T as TraceStore } from '../store-Db2Bv8Cf.js';
1
+ import { R as Run, T as TraceStore } from '../store-BP5be6s7.js';
2
2
  import { a as OutcomeFilter, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
3
3
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore } from '../outcome-store-D6KWmYvj.js';
4
4
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from '../rubric-predictive-validity-C0uDYwG6.js';
@@ -138,4 +138,4 @@ declare class MultiLayerVerifier<Env = unknown> {
138
138
  run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
139
139
  }
140
140
 
141
- export { type Finding as F, type Layer as L, MultiLayerVerifier as M, type Severity as S, type VerificationReport as V, type LayerResult as a, type VerifyContext as b, type LayerStatus as c, type VerifyOptions as d, gradeSemanticStatus as g };
141
+ export { type Finding as F, type Layer as L, MultiLayerVerifier as M, type Severity as S, type VerificationReport as V, type VerifyOptions as a, type LayerResult as b, type VerifyContext as c, type LayerStatus as d, gradeSemanticStatus as g };
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.27.2",
5
+ "version": "0.29.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -1,11 +1,11 @@
1
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-G81CWc0q.js';
2
- export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-D1aGKusy.js';
3
- export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-Dl4akLKX.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-ClDX3KZx.js';
2
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-j0nJFgC6.js';
3
+ export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-jrSGb2xZ.js';
4
4
  import './errors-BZ9sTdz7.js';
5
- import './integrity-DK2EBVZC.js';
6
- import './store-Db2Bv8Cf.js';
5
+ import './integrity-BAxLGJ9I.js';
6
+ import './store-BP5be6s7.js';
7
7
  import './run-record-CqzahIbx.js';
8
- import './emitter-DP_cSSiw.js';
9
- import './control-runtime-BZ_lVLYW.js';
8
+ import './emitter-BqjeOvJh.js';
9
+ import './control-runtime-BRdQ0wrx.js';
10
10
  import './dataset-CiK_3LDr.js';
11
- import './failure-cluster-Cw65_5FY.js';
11
+ import './failure-cluster-D1NZKqYu.js';
@@ -1,9 +1,9 @@
1
- import { g as BudgetSpec, T as TraceStore, h as RunFilter, R as Run, a as ToolSpan } from '../store-Db2Bv8Cf.js';
2
- export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-Cw65_5FY.js';
3
- import { a as TrajectoryStep } from '../trajectory-CnoBo-JY.js';
4
- import { B as BaselineOptions, a as BaselineReport } from '../baseline-4R5deP0N.js';
5
- export { c as computeToolUseMetrics } from '../baseline-4R5deP0N.js';
6
- import { l as llmSpans } from '../query-DODUYdPg.js';
1
+ import { g as BudgetSpec, T as TraceStore, l as RunFilter, R as Run, a as ToolSpan } from '../store-BP5be6s7.js';
2
+ export { a as FailureCluster, F as FailureClusterReport, f as failureClusterView } from '../failure-cluster-D1NZKqYu.js';
3
+ import { a as TrajectoryStep } from '../trajectory-BFmveYZt.js';
4
+ import { B as BaselineOptions, a as BaselineReport } from '../baseline-BwdCXUS8.js';
5
+ export { c as computeToolUseMetrics } from '../baseline-BwdCXUS8.js';
6
+ import { l as llmSpans } from '../query-BFDT0kX_.js';
7
7
 
8
8
  /**
9
9
  * BudgetBreachView — aggregates breach events across the corpus.
@@ -1,7 +1,7 @@
1
- import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
2
- export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
3
- import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
4
- import '../trajectory-CnoBo-JY.js';
1
+ import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-DgSqjqqj.js';
2
+ export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-DgSqjqqj.js';
3
+ import { S as Span, T as TraceStore } from '../store-BP5be6s7.js';
4
+ import '../trajectory-BFmveYZt.js';
5
5
 
6
6
  /**
7
7
  * Export PRM-graded traces as training data for downstream reward-model
@@ -1,4 +1,4 @@
1
- import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
1
+ import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-BP5be6s7.js';
2
2
 
3
3
  /**
4
4
  * Typed query helpers over TraceStore.
@@ -1,5 +1,5 @@
1
1
  import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
2
- import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-Dl4akLKX.js';
2
+ import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-jrSGb2xZ.js';
3
3
  import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
4
4
 
5
5
  /**
@@ -0,0 +1,529 @@
1
+ import { T as TraceStore } from './store-BP5be6s7.js';
2
+ import { AxAIService } from '@ax-llm/ax';
3
+ import { R as ReplayError } from './errors-BZ9sTdz7.js';
4
+ import { R as RawProviderSink, f as RawProviderEvent } from './integrity-BAxLGJ9I.js';
5
+
6
+ /**
7
+ * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
8
+ * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
9
+ *
10
+ * Wire format only. We do NOT depend on the @opentelemetry SDK — that
11
+ * would drag in polyfills incompatible with Workers/Edge. Consumers
12
+ * push the JSON to their collector of choice via HTTP.
13
+ *
14
+ * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
15
+ */
16
+
17
+ declare const OTEL_AGENT_EVAL_SCOPE: {
18
+ name: string;
19
+ version: string;
20
+ };
21
+ interface OtlpSpan {
22
+ traceId: string;
23
+ spanId: string;
24
+ parentSpanId?: string;
25
+ name: string;
26
+ kind: number;
27
+ startTimeUnixNano: string;
28
+ endTimeUnixNano: string;
29
+ attributes: Array<{
30
+ key: string;
31
+ value: {
32
+ stringValue?: string;
33
+ intValue?: string;
34
+ doubleValue?: number;
35
+ boolValue?: boolean;
36
+ };
37
+ }>;
38
+ events?: Array<{
39
+ timeUnixNano: string;
40
+ name: string;
41
+ attributes?: OtlpSpan['attributes'];
42
+ }>;
43
+ status?: {
44
+ code: number;
45
+ message?: string;
46
+ };
47
+ }
48
+ interface OtlpResourceSpans {
49
+ resource: {
50
+ attributes: OtlpSpan['attributes'];
51
+ };
52
+ scopeSpans: Array<{
53
+ scope: typeof OTEL_AGENT_EVAL_SCOPE;
54
+ spans: OtlpSpan[];
55
+ }>;
56
+ }
57
+ interface OtlpExport {
58
+ resourceSpans: OtlpResourceSpans[];
59
+ }
60
+ /** Export a single run's spans + events in OTLP/JSON. */
61
+ declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
62
+
63
+ /**
64
+ * Redaction — remove PII / secrets from trace payloads before persist.
65
+ *
66
+ * Pre-persistence rules mean raw traces in storage are already scrubbed.
67
+ * Unredacted variants (for debugging / post-mortems) live in a separate
68
+ * storage layer with stricter access controls; this module only covers
69
+ * the default scrub-then-persist path.
70
+ *
71
+ * Rules compose: pass an array of `RedactionRule`, each is applied in
72
+ * order. Strings that match get replaced with a tagged sentinel so the
73
+ * eval framework can count how many redactions happened per run
74
+ * (surfaced via `redaction_applied` events).
75
+ */
76
+ interface RedactionRule {
77
+ id: string;
78
+ pattern: RegExp;
79
+ /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
80
+ replacement?: string;
81
+ }
82
+ interface RedactionReport {
83
+ redactionCount: number;
84
+ byRule: Record<string, number>;
85
+ }
86
+ /** OWASP / common-sense defaults — extend per-domain. */
87
+ declare const DEFAULT_REDACTION_RULES: RedactionRule[];
88
+ declare const REDACTION_VERSION = "1.0.0";
89
+ /**
90
+ * Redact a single string. Returns the new string and a per-rule count of
91
+ * how many substitutions fired.
92
+ */
93
+ declare function redactString(input: string, rules?: RedactionRule[]): {
94
+ output: string;
95
+ report: RedactionReport;
96
+ };
97
+ /**
98
+ * Walk a JSON-ish value applying `redactString` to every string leaf.
99
+ * Arrays and plain objects are recursed; other types pass through
100
+ * untouched. Circular references throw — traces should be tree-shaped.
101
+ */
102
+ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
103
+ value: unknown;
104
+ report: RedactionReport;
105
+ };
106
+
107
+ /**
108
+ * Shared types for the trace-analyst module.
109
+ *
110
+ * Wire format. The store interface speaks `OtlpSpanLike` rows — one JSONL
111
+ * line per span, OTLP-shaped. We do NOT depend on a specific tracing
112
+ * vendor at the type level. Adapter
113
+ * layers map upstream shapes onto this interface.
114
+ *
115
+ * Design constraint. Every read operation that can return arbitrary
116
+ * payload must carry a byte budget so the agent's tool result stays
117
+ * bounded regardless of input trace size. Oversized responses
118
+ * substitute a deterministic summary instead of bytes — see
119
+ * `ViewTraceOversized`.
120
+ */
121
+ /** OTLP span kind (subset we actually use). */
122
+ type TraceAnalystSpanKind = 'AGENT' | 'LLM' | 'TOOL' | 'CHAIN' | 'GUARDRAIL' | 'SPAN' | 'UNKNOWN';
123
+ type TraceAnalystSpanStatus = 'OK' | 'ERROR' | 'UNSET';
124
+ /** Subset of OTLP span fields the analyst exposes to the agent. The
125
+ * store's job is to project upstream's full span shape down to this
126
+ * view — the analyst never sees vendor extensions directly. */
127
+ interface TraceAnalystSpan {
128
+ trace_id: string;
129
+ span_id: string;
130
+ parent_span_id: string | null;
131
+ name: string;
132
+ kind: TraceAnalystSpanKind;
133
+ start_time: string;
134
+ end_time: string;
135
+ duration_ms: number;
136
+ status: TraceAnalystSpanStatus;
137
+ status_message?: string;
138
+ service_name: string | null;
139
+ agent_name: string | null;
140
+ model_name: string | null;
141
+ tool_name: string | null;
142
+ /** Raw JSON-serialisable attribute map. May contain large strings;
143
+ * callers must respect the per-attribute byte cap. */
144
+ attributes: Record<string, unknown>;
145
+ }
146
+ interface TraceAnalystTraceSummary {
147
+ trace_id: string;
148
+ service_name: string | null;
149
+ agent_name: string | null;
150
+ span_count: number;
151
+ has_errors: boolean;
152
+ start_time: string;
153
+ end_time: string;
154
+ duration_ms: number;
155
+ raw_jsonl_bytes: number;
156
+ models: string[];
157
+ tools: string[];
158
+ }
159
+ interface TraceAnalystFilters {
160
+ /** Restrict to traces that contain at least one error span. */
161
+ has_errors?: boolean;
162
+ /** Match if any span's `service.name` is in this list. */
163
+ service_names?: string[];
164
+ /** Match if any span's `agent.name` is in this list. */
165
+ agent_names?: string[];
166
+ /** Match if any LLM span's `llm.model_name` is in this list. */
167
+ model_names?: string[];
168
+ /** Match if any tool span's `tool.name` is in this list. */
169
+ tool_names?: string[];
170
+ /** ISO-8601 lower bound on the trace's earliest start time. */
171
+ start_time_after?: string;
172
+ /** ISO-8601 upper bound on the trace's earliest start time. */
173
+ start_time_before?: string;
174
+ /** Single regex applied to raw JSONL bytes for the trace. Opt-in;
175
+ * expensive on large datasets. Use the indexed filters above first. */
176
+ regex_pattern?: string;
177
+ }
178
+ interface DatasetOverview {
179
+ total_traces: number;
180
+ raw_jsonl_bytes: number;
181
+ services: string[];
182
+ agents: string[];
183
+ models: string[];
184
+ tool_names: string[];
185
+ /** Up to 20 real trace ids the agent may pass to view/search tools. */
186
+ sample_trace_ids: string[];
187
+ errors: {
188
+ trace_count: number;
189
+ span_count: number;
190
+ };
191
+ time_range: {
192
+ earliest: string;
193
+ latest: string;
194
+ } | null;
195
+ }
196
+ interface QueryTracesPage {
197
+ traces: TraceAnalystTraceSummary[];
198
+ total: number;
199
+ has_more: boolean;
200
+ }
201
+ /** Full-trace view. When the response would exceed the per-call byte
202
+ * budget, `oversized` is populated INSTEAD of `spans` so the agent
203
+ * knows to switch to `searchTrace` / `viewSpans`. */
204
+ interface ViewTraceResult {
205
+ trace_id: string;
206
+ spans?: TraceAnalystSpan[];
207
+ oversized?: ViewTraceOversized;
208
+ }
209
+ interface ViewTraceOversized {
210
+ span_count: number;
211
+ /** Names with their counts, sorted desc. Capped at 20 entries. */
212
+ top_span_names: Array<[string, number]>;
213
+ /** Largest single span body (bytes after attribute-cap projection). */
214
+ span_response_bytes_max: number;
215
+ error_span_count: number;
216
+ }
217
+ interface ViewSpansResult {
218
+ trace_id: string;
219
+ spans: TraceAnalystSpan[];
220
+ /** Number of requested span ids that were not found in the trace. */
221
+ missing_span_ids: string[];
222
+ /** Number of attribute fields truncated to fit the per-attribute cap. */
223
+ truncated_attribute_count: number;
224
+ }
225
+ interface SpanMatchRecord {
226
+ trace_id: string;
227
+ span_id: string;
228
+ span_name: string;
229
+ span_kind: TraceAnalystSpanKind;
230
+ /** JSON pointer-style path to the matched value, e.g.
231
+ * `attributes."llm.input_messages"[2].content`. */
232
+ attribute_path: string;
233
+ matched_text: string;
234
+ context_before: string;
235
+ context_after: string;
236
+ match_offset: number;
237
+ }
238
+ interface SearchTraceResult {
239
+ trace_id: string;
240
+ hits: SpanMatchRecord[];
241
+ total_matches: number;
242
+ has_more: boolean;
243
+ }
244
+ interface SearchSpanResult {
245
+ trace_id: string;
246
+ span_id: string;
247
+ hits: SpanMatchRecord[];
248
+ total_matches: number;
249
+ has_more: boolean;
250
+ }
251
+ /** Tunable byte budgets for bounded RLM tool output. */
252
+ interface TraceAnalystByteBudgets {
253
+ /** Max bytes any single tool response may emit. Hard ceiling enforced
254
+ * by the store; oversized → summary. Default 150_000. */
255
+ perCallByteCeiling: number;
256
+ /** Per-attribute string truncation cap on `viewTrace` (discovery scan).
257
+ * Default 4096. */
258
+ perAttributeViewBudget: number;
259
+ /** Per-attribute string truncation cap on `viewSpans` (surgical reads).
260
+ * Default 16384. */
261
+ perAttributeSpanBudget: number;
262
+ /** Per-attribute cap on a single match record's `matched_text` and
263
+ * context window. Default 1024. */
264
+ perMatchTextBudget: number;
265
+ }
266
+ declare const DEFAULT_TRACE_ANALYST_BUDGETS: TraceAnalystByteBudgets;
267
+ /** Marker substituted in place of truncated string payloads. Callers
268
+ * parsing tool output can detect it deterministically. */
269
+ declare const TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
270
+
271
+ /**
272
+ * `TraceAnalysisStore` — read-side interface the trace-analyst calls
273
+ * through. Six operations, all bounded:
274
+ *
275
+ * - `getOverview(filters?)` — dataset rollup + sample trace ids.
276
+ * - `queryTraces(filters?, limit, offset)` — paginated summaries.
277
+ * - `countTraces(filters?)` — cheap count without materialisation.
278
+ * - `viewTrace(trace_id, perAttrCap)` — full span list, oversized → summary.
279
+ * - `viewSpans(trace_id, span_ids, perAttrCap)` — surgical span fetch.
280
+ * - `searchTrace(trace_id, regex, max_matches)` — bounded regex hits.
281
+ * - `searchSpan(trace_id, span_id, regex, max_matches)` — single-span search.
282
+ *
283
+ * Multiple implementations ship in the core (`OtlpFileTraceStore`).
284
+ * Downstream callers can supply their own — e.g. a DuckDB-backed
285
+ * adapter or an in-memory adapter for tests — by implementing this
286
+ * interface.
287
+ *
288
+ * Filters compose with AND semantics. Empty/undefined fields impose
289
+ * no constraint. `regex_pattern` is the only opt-in raw-bytes scan —
290
+ * implementations may skip it via `count`/`overview` when not set.
291
+ */
292
+
293
+ interface TraceAnalysisStore {
294
+ getOverview(filters?: TraceAnalystFilters): Promise<DatasetOverview>;
295
+ queryTraces(opts: {
296
+ filters?: TraceAnalystFilters;
297
+ limit: number;
298
+ offset?: number;
299
+ }): Promise<QueryTracesPage>;
300
+ countTraces(filters?: TraceAnalystFilters): Promise<number>;
301
+ viewTrace(opts: {
302
+ trace_id: string;
303
+ /** Override per-attribute byte cap. Defaults to discovery budget. */
304
+ per_attribute_byte_cap?: number;
305
+ }): Promise<ViewTraceResult>;
306
+ viewSpans(opts: {
307
+ trace_id: string;
308
+ span_ids: readonly string[];
309
+ /** Override per-attribute byte cap. Defaults to surgical budget. */
310
+ per_attribute_byte_cap?: number;
311
+ }): Promise<ViewSpansResult>;
312
+ searchTrace(opts: {
313
+ trace_id: string;
314
+ regex_pattern: string;
315
+ /** Hard cap on matches returned. Default 50. */
316
+ max_matches?: number;
317
+ }): Promise<SearchTraceResult>;
318
+ searchSpan(opts: {
319
+ trace_id: string;
320
+ span_id: string;
321
+ regex_pattern: string;
322
+ max_matches?: number;
323
+ }): Promise<SearchSpanResult>;
324
+ }
325
+
326
+ interface AnalyzeTracesInput {
327
+ /** The user-facing question. Domain framing belongs here, not in the
328
+ * actor description. */
329
+ question: string;
330
+ }
331
+ interface AnalyzeTracesResult {
332
+ /** The responder's prose answer. */
333
+ answer: string;
334
+ /** Bulleted findings extracted from the responder's structured output. */
335
+ findings: string[];
336
+ /** Per-actor-turn snapshots captured via `actorTurnCallback`. */
337
+ turns: AnalyzeTracesTurnSnapshot[];
338
+ /** Total turns the actor took. */
339
+ turnCount: number;
340
+ /** Token usage by role. */
341
+ usage: TraceAnalystUsage;
342
+ /** Full system + assistant + tool message log by role. */
343
+ chatLog: TraceAnalystChatLog;
344
+ /** Prompt version that produced this run. */
345
+ actorPromptVersion: string;
346
+ }
347
+ interface TraceAnalystUsage {
348
+ actor: TraceAnalystUsageEntry[];
349
+ responder: TraceAnalystUsageEntry[];
350
+ }
351
+ interface TraceAnalystUsageEntry {
352
+ [key: string]: unknown;
353
+ }
354
+ interface TraceAnalystChatLog {
355
+ actor: TraceAnalystChatMessage[];
356
+ responder: TraceAnalystChatMessage[];
357
+ }
358
+ interface TraceAnalystChatMessage {
359
+ [key: string]: unknown;
360
+ }
361
+ interface AnalyzeTracesTurnSnapshot {
362
+ turn: number;
363
+ isError: boolean;
364
+ /** The JS code the actor produced for this turn. */
365
+ code: string;
366
+ /** The formatted action-log entry the actor sees on the next turn. */
367
+ output: string;
368
+ /** Provider thought (when `actorOptions.showThoughts` is true and the
369
+ * provider returns it). */
370
+ thought?: string;
371
+ }
372
+ interface AnalyzeTracesOptions {
373
+ /** Trace data source. Pass either an OTLP-JSONL path or a custom store. */
374
+ source: string | TraceAnalysisStore;
375
+ /** Caller-provided AxAIService. */
376
+ ai: AxAIService;
377
+ /** Model id forwarded to actor + responder. */
378
+ model?: string;
379
+ /** Recursion depth. 0 = no sub-agent dispatch. Default 1. */
380
+ maxDepth?: number;
381
+ /** Maximum actor turns. Default 12. */
382
+ maxTurns?: number;
383
+ /** Maximum parallel sub-agent calls in batched llmQuery. Default 2. */
384
+ maxParallelSubagents?: number;
385
+ /** Override the actor description. */
386
+ actorDescription?: string;
387
+ /** Override the subagent description. */
388
+ subagentDescription?: string;
389
+ /** Per-turn observability hook. */
390
+ onTurn?: (turn: AnalyzeTracesTurnSnapshot) => void | Promise<void>;
391
+ /** Override max runtime characters per turn. Default 6000. */
392
+ maxRuntimeChars?: number;
393
+ /** When set, every turn's snapshot is appended to this JSONL file
394
+ * immediately. If the analyst crashes mid-loop (provider 503,
395
+ * network error, validator reject) the partial reasoning is still
396
+ * on disk. Replay the file with the responder afterward to recover
397
+ * evidence. */
398
+ progressLogPath?: string;
399
+ }
400
+ /**
401
+ * Run the trace analyst.
402
+ *
403
+ * Throws:
404
+ * - `TraceFileMissingError` if `source` is a path and doesn't exist.
405
+ * - `AxAgentClarificationError` if the analyst asks for clarification.
406
+ * - Provider errors (auth, rate limits) propagate from the AI service.
407
+ */
408
+ declare function analyzeTraces(input: AnalyzeTracesInput, options: AnalyzeTracesOptions): Promise<AnalyzeTracesResult>;
409
+
410
+ /**
411
+ * Replay-from-raw-events — turn every captured campaign run into a
412
+ * re-runnable artifact.
413
+ *
414
+ * `RawProviderSink` captures every provider HTTP envelope; `runEvalCampaign`
415
+ * makes that capture the default. Together they make every past run a
416
+ * complete fingerprint of what happened on the wire — enough to replay
417
+ * the run without burning new LLM cost.
418
+ *
419
+ * Three use cases this primitive enables:
420
+ *
421
+ * 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
422
+ * to last week's runs without re-calling any LLM. The cost of trying
423
+ * a new rubric drops from "another full sweep" to a CPU-bound replay.
424
+ * 2. **Determinism audits** — replay the same campaign and verify the
425
+ * raw responses match byte-for-byte. Any drift is a non-determinism
426
+ * bug (in the harness, the prompt builder, the sandbox, …).
427
+ * 3. **Free judge calibration** — run two judges on identical responses
428
+ * and measure inter-judge agreement without doubling LLM spend.
429
+ *
430
+ * The interface is deliberately fetch-shaped. Inject `createReplayFetch`
431
+ * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
432
+ * from the cache instead of calling the network. No new code path through
433
+ * the LLM client is needed; the cache hit is invisible to the runner.
434
+ */
435
+
436
+ declare class ReplayCacheMissError extends ReplayError {
437
+ readonly url: string;
438
+ readonly requestKey: string;
439
+ constructor(url: string, requestKey: string, message?: string);
440
+ }
441
+ interface ReplayCacheEntry {
442
+ request: RawProviderEvent;
443
+ response: RawProviderEvent;
444
+ }
445
+ interface ReplayCacheStats {
446
+ total: number;
447
+ byProvider: Record<string, number>;
448
+ byModel: Record<string, number>;
449
+ /** Spans for which we have a request but no response (run aborted mid-call). */
450
+ orphanRequests: number;
451
+ }
452
+ /**
453
+ * In-memory deterministic cache of (request → response) keyed on a stable
454
+ * hash of the request body. Built from a `RawProviderSink` containing
455
+ * paired `request` and `response` events from a previous run.
456
+ *
457
+ * The cache is the source of truth for replay; `createReplayFetch` is a
458
+ * thin wrapper that reads from it.
459
+ */
460
+ declare class ReplayCache {
461
+ private byKey;
462
+ private orphans;
463
+ private byProvider;
464
+ private byModel;
465
+ /**
466
+ * Build a cache from a sink's events. The sink must implement `list()`.
467
+ * Filter by `runId` / `spanId` to scope to a specific replay.
468
+ */
469
+ static fromSink(sink: RawProviderSink, filter?: {
470
+ runId?: string;
471
+ spanId?: string;
472
+ }): Promise<ReplayCache>;
473
+ /** Build a cache from an in-memory event list. */
474
+ static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
475
+ /** Number of cacheable (request, response) pairs in the cache. */
476
+ size(): number;
477
+ stats(): ReplayCacheStats;
478
+ /** Iterate every cached `(request, response)` pair in insertion order. */
479
+ entries(): IterableIterator<ReplayCacheEntry>;
480
+ /**
481
+ * Look up a cached response by hashing the (model, messages, temperature,
482
+ * maxTokens, response_format) shape. Returns `undefined` on miss; the
483
+ * caller decides whether to throw, fall back to the network, or skip.
484
+ */
485
+ lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
486
+ }
487
+ interface ReplayFetchOptions {
488
+ /**
489
+ * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
490
+ * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
491
+ * still complete; `'fail-closed'` returns a synthetic 599 response so the
492
+ * call site sees a non-retriable failure.
493
+ */
494
+ onMiss?: 'throw' | 'fallback' | 'fail-closed';
495
+ fallbackFetch?: typeof fetch;
496
+ /** Optional callback fired once per replayed call (for telemetry / counters). */
497
+ onHit?: (info: {
498
+ url: string;
499
+ provider: string;
500
+ model: string;
501
+ }) => void;
502
+ /** Optional callback fired on cache miss before the `onMiss` policy applies. */
503
+ onMissNotify?: (info: {
504
+ url: string;
505
+ requestBody: unknown;
506
+ }) => void;
507
+ }
508
+ /**
509
+ * Build a `fetch`-shaped function that serves cached responses out of a
510
+ * `ReplayCache` for any URL ending in `/chat/completions`. Pass through
511
+ * `LlmClientOptions.fetch` and `callLlm` becomes free.
512
+ *
513
+ * Non-`/chat/completions` URLs are passed straight to the fallback fetch
514
+ * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
515
+ * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
516
+ * `fetch` and shouldn't be intercepted.
517
+ */
518
+ declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
519
+ /**
520
+ * Convenience iterator over `(request, response)` pairs in a sink — for
521
+ * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
522
+ * runs purely in-process over cached LLM outputs.
523
+ */
524
+ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
525
+ runId?: string;
526
+ spanId?: string;
527
+ }): AsyncGenerator<ReplayCacheEntry>;
528
+
529
+ export { type AnalyzeTracesOptions as A, analyzeTraces as B, createReplayFetch as C, type DatasetOverview as D, exportRunAsOtlp as E, iterateRawCalls as F, redactString as G, redactValue as H, OTEL_AGENT_EVAL_SCOPE as O, type QueryTracesPage as Q, REDACTION_VERSION as R, type SearchTraceResult as S, type TraceAnalysisStore as T, type ViewTraceResult as V, type AnalyzeTracesResult as a, type TraceAnalystFilters as b, type ViewSpansResult as c, type SearchSpanResult as d, type AnalyzeTracesInput as e, type AnalyzeTracesTurnSnapshot as f, DEFAULT_REDACTION_RULES as g, DEFAULT_TRACE_ANALYST_BUDGETS as h, type OtlpExport as i, type OtlpResourceSpans as j, type OtlpSpan as k, type RedactionReport as l, type RedactionRule as m, ReplayCache as n, type ReplayCacheEntry as o, ReplayCacheMissError as p, type ReplayCacheStats as q, type ReplayFetchOptions as r, type SpanMatchRecord as s, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX as t, type TraceAnalystByteBudgets as u, type TraceAnalystSpan as v, type TraceAnalystSpanKind as w, type TraceAnalystSpanStatus as x, type TraceAnalystTraceSummary as y, type ViewTraceOversized as z };