observability-toolkit 1.8.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +126 -5
- package/dist/backends/index.d.ts +163 -0
- package/dist/backends/index.d.ts.map +1 -1
- package/dist/backends/index.js +57 -0
- package/dist/backends/index.js.map +1 -1
- package/dist/backends/index.test.js +55 -1
- package/dist/backends/index.test.js.map +1 -1
- package/dist/backends/local-jsonl.d.ts +30 -0
- package/dist/backends/local-jsonl.d.ts.map +1 -1
- package/dist/backends/local-jsonl.js +912 -550
- package/dist/backends/local-jsonl.js.map +1 -1
- package/dist/backends/signoz-api-rate-limiter.test.js +2 -1
- package/dist/backends/signoz-api-rate-limiter.test.js.map +1 -1
- package/dist/backends/signoz-api.d.ts +16 -2
- package/dist/backends/signoz-api.d.ts.map +1 -1
- package/dist/backends/signoz-api.js +650 -534
- package/dist/backends/signoz-api.js.map +1 -1
- package/dist/backends/signoz-api.test.js +6 -5
- package/dist/backends/signoz-api.test.js.map +1 -1
- package/dist/lib/agent-as-judge.d.ts +388 -0
- package/dist/lib/agent-as-judge.d.ts.map +1 -0
- package/dist/lib/agent-as-judge.js +740 -0
- package/dist/lib/agent-as-judge.js.map +1 -0
- package/dist/lib/agent-as-judge.test.d.ts +5 -0
- package/dist/lib/agent-as-judge.test.d.ts.map +1 -0
- package/dist/lib/agent-as-judge.test.js +816 -0
- package/dist/lib/agent-as-judge.test.js.map +1 -0
- package/dist/lib/cache.d.ts +15 -2
- package/dist/lib/cache.d.ts.map +1 -1
- package/dist/lib/cache.js +16 -2
- package/dist/lib/cache.js.map +1 -1
- package/dist/lib/circuit-breaker.d.ts +18 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -1
- package/dist/lib/circuit-breaker.js +41 -8
- package/dist/lib/circuit-breaker.js.map +1 -1
- package/dist/lib/confident-export.d.ts +101 -0
- package/dist/lib/confident-export.d.ts.map +1 -0
- package/dist/lib/confident-export.js +393 -0
- package/dist/lib/confident-export.js.map +1 -0
- package/dist/lib/confident-export.test.d.ts +7 -0
- package/dist/lib/confident-export.test.d.ts.map +1 -0
- package/dist/lib/confident-export.test.js +835 -0
- package/dist/lib/confident-export.test.js.map +1 -0
- package/dist/lib/constants.d.ts +75 -0
- package/dist/lib/constants.d.ts.map +1 -1
- package/dist/lib/constants.js +104 -1
- package/dist/lib/constants.js.map +1 -1
- package/dist/lib/datadog-export.d.ts +156 -0
- package/dist/lib/datadog-export.d.ts.map +1 -0
- package/dist/lib/datadog-export.js +464 -0
- package/dist/lib/datadog-export.js.map +1 -0
- package/dist/lib/datadog-export.test.d.ts +14 -0
- package/dist/lib/datadog-export.test.d.ts.map +1 -0
- package/dist/lib/datadog-export.test.js +890 -0
- package/dist/lib/datadog-export.test.js.map +1 -0
- package/dist/lib/evaluation-hooks.d.ts +49 -0
- package/dist/lib/evaluation-hooks.d.ts.map +1 -0
- package/dist/lib/evaluation-hooks.js +488 -0
- package/dist/lib/evaluation-hooks.js.map +1 -0
- package/dist/lib/evaluation-hooks.test.d.ts +8 -0
- package/dist/lib/evaluation-hooks.test.d.ts.map +1 -0
- package/dist/lib/evaluation-hooks.test.js +624 -0
- package/dist/lib/evaluation-hooks.test.js.map +1 -0
- package/dist/lib/export-utils.d.ts +99 -0
- package/dist/lib/export-utils.d.ts.map +1 -0
- package/dist/lib/export-utils.js +238 -0
- package/dist/lib/export-utils.js.map +1 -0
- package/dist/lib/export-utils.test.d.ts +5 -0
- package/dist/lib/export-utils.test.d.ts.map +1 -0
- package/dist/lib/export-utils.test.js +193 -0
- package/dist/lib/export-utils.test.js.map +1 -0
- package/dist/lib/file-utils.d.ts +17 -2
- package/dist/lib/file-utils.d.ts.map +1 -1
- package/dist/lib/file-utils.js +24 -5
- package/dist/lib/file-utils.js.map +1 -1
- package/dist/lib/file-utils.test.js +30 -0
- package/dist/lib/file-utils.test.js.map +1 -1
- package/dist/lib/histogram.d.ts +119 -0
- package/dist/lib/histogram.d.ts.map +1 -0
- package/dist/lib/histogram.js +202 -0
- package/dist/lib/histogram.js.map +1 -0
- package/dist/lib/histogram.test.d.ts +5 -0
- package/dist/lib/histogram.test.d.ts.map +1 -0
- package/dist/lib/histogram.test.js +381 -0
- package/dist/lib/histogram.test.js.map +1 -0
- package/dist/lib/instrumentation.d.ts +153 -0
- package/dist/lib/instrumentation.d.ts.map +1 -0
- package/dist/lib/instrumentation.integration.test.d.ts +2 -0
- package/dist/lib/instrumentation.integration.test.d.ts.map +1 -0
- package/dist/lib/instrumentation.integration.test.js +589 -0
- package/dist/lib/instrumentation.integration.test.js.map +1 -0
- package/dist/lib/instrumentation.js +520 -0
- package/dist/lib/instrumentation.js.map +1 -0
- package/dist/lib/instrumentation.test.d.ts +2 -0
- package/dist/lib/instrumentation.test.d.ts.map +1 -0
- package/dist/lib/instrumentation.test.js +821 -0
- package/dist/lib/instrumentation.test.js.map +1 -0
- package/dist/lib/langfuse-export.d.ts +125 -0
- package/dist/lib/langfuse-export.d.ts.map +1 -0
- package/dist/lib/langfuse-export.js +367 -0
- package/dist/lib/langfuse-export.js.map +1 -0
- package/dist/lib/langfuse-export.test.d.ts +7 -0
- package/dist/lib/langfuse-export.test.d.ts.map +1 -0
- package/dist/lib/langfuse-export.test.js +1007 -0
- package/dist/lib/langfuse-export.test.js.map +1 -0
- package/dist/lib/llm-as-judge.d.ts +657 -0
- package/dist/lib/llm-as-judge.d.ts.map +1 -0
- package/dist/lib/llm-as-judge.js +1397 -0
- package/dist/lib/llm-as-judge.js.map +1 -0
- package/dist/lib/llm-as-judge.test.d.ts +2 -0
- package/dist/lib/llm-as-judge.test.d.ts.map +1 -0
- package/dist/lib/llm-as-judge.test.js +2409 -0
- package/dist/lib/llm-as-judge.test.js.map +1 -0
- package/dist/lib/logger.d.ts +1 -1
- package/dist/lib/logger.d.ts.map +1 -1
- package/dist/lib/logger.js.map +1 -1
- package/dist/lib/metrics.d.ts +62 -0
- package/dist/lib/metrics.d.ts.map +1 -0
- package/dist/lib/metrics.js +166 -0
- package/dist/lib/metrics.js.map +1 -0
- package/dist/lib/metrics.test.d.ts +5 -0
- package/dist/lib/metrics.test.d.ts.map +1 -0
- package/dist/lib/metrics.test.js +189 -0
- package/dist/lib/metrics.test.js.map +1 -0
- package/dist/lib/parse-stats.d.ts +119 -0
- package/dist/lib/parse-stats.d.ts.map +1 -0
- package/dist/lib/parse-stats.js +206 -0
- package/dist/lib/parse-stats.js.map +1 -0
- package/dist/lib/parse-stats.test.d.ts +5 -0
- package/dist/lib/parse-stats.test.d.ts.map +1 -0
- package/dist/lib/parse-stats.test.js +283 -0
- package/dist/lib/parse-stats.test.js.map +1 -0
- package/dist/lib/phoenix-export.d.ts +109 -0
- package/dist/lib/phoenix-export.d.ts.map +1 -0
- package/dist/lib/phoenix-export.js +429 -0
- package/dist/lib/phoenix-export.js.map +1 -0
- package/dist/lib/phoenix-export.test.d.ts +11 -0
- package/dist/lib/phoenix-export.test.d.ts.map +1 -0
- package/dist/lib/phoenix-export.test.js +725 -0
- package/dist/lib/phoenix-export.test.js.map +1 -0
- package/dist/lib/server-utils.d.ts +6 -1
- package/dist/lib/server-utils.d.ts.map +1 -1
- package/dist/lib/server-utils.js +9 -1
- package/dist/lib/server-utils.js.map +1 -1
- package/dist/lib/shared-schemas.d.ts +6 -0
- package/dist/lib/shared-schemas.d.ts.map +1 -1
- package/dist/lib/shared-schemas.js +11 -4
- package/dist/lib/shared-schemas.js.map +1 -1
- package/dist/lib/verification-events.d.ts +100 -0
- package/dist/lib/verification-events.d.ts.map +1 -0
- package/dist/lib/verification-events.js +162 -0
- package/dist/lib/verification-events.js.map +1 -0
- package/dist/lib/verification-events.test.d.ts +5 -0
- package/dist/lib/verification-events.test.d.ts.map +1 -0
- package/dist/lib/verification-events.test.js +193 -0
- package/dist/lib/verification-events.test.js.map +1 -0
- package/dist/server.d.ts +5 -0
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +77 -21
- package/dist/server.js.map +1 -1
- package/dist/tools/context-stats.d.ts.map +1 -1
- package/dist/tools/context-stats.js +6 -8
- package/dist/tools/context-stats.js.map +1 -1
- package/dist/tools/export-confident.d.ts +145 -0
- package/dist/tools/export-confident.d.ts.map +1 -0
- package/dist/tools/export-confident.js +134 -0
- package/dist/tools/export-confident.js.map +1 -0
- package/dist/tools/export-confident.test.d.ts +7 -0
- package/dist/tools/export-confident.test.d.ts.map +1 -0
- package/dist/tools/export-confident.test.js +332 -0
- package/dist/tools/export-confident.test.js.map +1 -0
- package/dist/tools/export-datadog.d.ts +160 -0
- package/dist/tools/export-datadog.d.ts.map +1 -0
- package/dist/tools/export-datadog.js +160 -0
- package/dist/tools/export-datadog.js.map +1 -0
- package/dist/tools/export-datadog.test.d.ts +8 -0
- package/dist/tools/export-datadog.test.d.ts.map +1 -0
- package/dist/tools/export-datadog.test.js +419 -0
- package/dist/tools/export-datadog.test.js.map +1 -0
- package/dist/tools/export-langfuse.d.ts +137 -0
- package/dist/tools/export-langfuse.d.ts.map +1 -0
- package/dist/tools/export-langfuse.js +131 -0
- package/dist/tools/export-langfuse.js.map +1 -0
- package/dist/tools/export-langfuse.test.d.ts +7 -0
- package/dist/tools/export-langfuse.test.d.ts.map +1 -0
- package/dist/tools/export-langfuse.test.js +303 -0
- package/dist/tools/export-langfuse.test.js.map +1 -0
- package/dist/tools/export-phoenix.d.ts +145 -0
- package/dist/tools/export-phoenix.d.ts.map +1 -0
- package/dist/tools/export-phoenix.js +135 -0
- package/dist/tools/export-phoenix.js.map +1 -0
- package/dist/tools/export-phoenix.test.d.ts +7 -0
- package/dist/tools/export-phoenix.test.d.ts.map +1 -0
- package/dist/tools/export-phoenix.test.js +316 -0
- package/dist/tools/export-phoenix.test.js.map +1 -0
- package/dist/tools/health-check.d.ts +26 -0
- package/dist/tools/health-check.d.ts.map +1 -1
- package/dist/tools/health-check.js +36 -7
- package/dist/tools/health-check.js.map +1 -1
- package/dist/tools/index.d.ts +6 -0
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +6 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/inject-evaluations.d.ts +1315 -0
- package/dist/tools/inject-evaluations.d.ts.map +1 -0
- package/dist/tools/inject-evaluations.js +121 -0
- package/dist/tools/inject-evaluations.js.map +1 -0
- package/dist/tools/inject-evaluations.test.d.ts +5 -0
- package/dist/tools/inject-evaluations.test.d.ts.map +1 -0
- package/dist/tools/inject-evaluations.test.js +359 -0
- package/dist/tools/inject-evaluations.test.js.map +1 -0
- package/dist/tools/query-evaluations.d.ts +25 -4
- package/dist/tools/query-evaluations.d.ts.map +1 -1
- package/dist/tools/query-evaluations.js +10 -0
- package/dist/tools/query-evaluations.js.map +1 -1
- package/dist/tools/query-llm-events.js +2 -2
- package/dist/tools/query-llm-events.js.map +1 -1
- package/dist/tools/query-logs.d.ts +8 -8
- package/dist/tools/query-logs.js +3 -3
- package/dist/tools/query-logs.js.map +1 -1
- package/dist/tools/query-metrics.d.ts +4 -4
- package/dist/tools/query-metrics.js +2 -2
- package/dist/tools/query-metrics.js.map +1 -1
- package/dist/tools/query-traces.d.ts +8 -8
- package/dist/tools/query-verifications.d.ts +111 -0
- package/dist/tools/query-verifications.d.ts.map +1 -0
- package/dist/tools/query-verifications.js +101 -0
- package/dist/tools/query-verifications.js.map +1 -0
- package/dist/tools/query-verifications.test.d.ts +5 -0
- package/dist/tools/query-verifications.test.d.ts.map +1 -0
- package/dist/tools/query-verifications.test.js +156 -0
- package/dist/tools/query-verifications.test.js.map +1 -0
- package/dist/types/evaluation-hooks.d.ts +176 -0
- package/dist/types/evaluation-hooks.d.ts.map +1 -0
- package/dist/types/evaluation-hooks.js +49 -0
- package/dist/types/evaluation-hooks.js.map +1 -0
- package/package.json +10 -2
|
@@ -0,0 +1,740 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent-as-Judge Implementation
|
|
3
|
+
*
|
|
4
|
+
* Provides patterns and utilities for evaluating AI agents using autonomous
|
|
5
|
+
* judge agents with planning, tool use, memory, and multi-agent collaboration.
|
|
6
|
+
*
|
|
7
|
+
* @security
|
|
8
|
+
* - All user inputs are sanitized via llm-as-judge utilities
|
|
9
|
+
* - Tool execution should use sandbox isolation
|
|
10
|
+
* - Memory is bounded to prevent resource exhaustion
|
|
11
|
+
*
|
|
12
|
+
* @see https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/
|
|
13
|
+
* @see docs/quality/agent-as-judge.md
|
|
14
|
+
*/
|
|
15
|
+
import { MAX_STEP_SCORES, MAX_TOOL_VERIFICATIONS, MAX_STEP_ID_LENGTH, } from '../backends/index.js';
|
|
16
|
+
import { InputValidationError } from './input-validator.js';
|
|
17
|
+
// ============================================================================
|
|
18
|
+
// Constants
|
|
19
|
+
// ============================================================================
|
|
20
|
+
/** Maximum number of actions accepted in an evaluand's trajectory (enforced by validateEvaluand) */
|
|
21
|
+
export const MAX_TRAJECTORY_LENGTH = 1000;
|
|
22
|
+
/** Default timeout for agent evaluation steps (60 seconds) */
|
|
23
|
+
export const DEFAULT_AGENT_EVAL_TIMEOUT_MS = 60000;
|
|
24
|
+
/** Maximum concurrent specialist evaluators */
|
|
25
|
+
export const MAX_CONCURRENT_EVALUATORS = 10;
|
|
26
|
+
/** Maximum consensus rounds for multi-agent evaluation */
|
|
27
|
+
export const MAX_CONSENSUS_ROUNDS = 5;
|
|
28
|
+
/** Default variance threshold for consensus convergence */
|
|
29
|
+
export const DEFAULT_CONVERGENCE_THRESHOLD = 0.1;
|
|
30
|
+
/**
|
|
31
|
+
* Default threshold for early termination in procedural evaluation.
|
|
32
|
+
* If a critical stage scores below this threshold, evaluation terminates early.
|
|
33
|
+
*/
|
|
34
|
+
export const DEFAULT_EARLY_TERMINATION_THRESHOLD = 0.3;
|
|
35
|
+
// ============================================================================
|
|
36
|
+
// Timeout Protection (H6)
|
|
37
|
+
// ============================================================================
|
|
38
|
+
/**
 * Raised when an agent evaluation does not settle within its time budget.
 *
 * The budget that was exceeded is exposed as `timeoutMs` so callers can
 * report it or use it when deciding how to retry.
 */
export class AgentEvalTimeoutError extends Error {
    /** Time budget, in milliseconds, that the evaluation exceeded. */
    timeoutMs;
    /**
     * @param timeoutMs - The timeout (in milliseconds) that elapsed
     */
    constructor(timeoutMs) {
        const message = `Agent evaluation timed out after ${timeoutMs}ms`;
        super(message);
        this.name = 'AgentEvalTimeoutError';
        this.timeoutMs = timeoutMs;
    }
}
|
|
49
|
+
/**
 * Execute an async function with timeout protection.
 *
 * The call to `fn` is raced against a timer. Whichever settles first decides
 * the outcome, and the timer is always cleared afterwards so it cannot keep
 * the event loop alive. Note that the work started by `fn` is not cancelled
 * on timeout; only the wait for it is abandoned.
 *
 * @param fn - Async function to execute
 * @param timeoutMs - Timeout in milliseconds (default: DEFAULT_AGENT_EVAL_TIMEOUT_MS)
 * @returns Result of the function
 * @throws {AgentEvalTimeoutError} If the timer fires before `fn` settles
 * @throws Whatever `fn` throws or rejects with, unchanged
 */
export async function withAgentTimeout(fn, timeoutMs = DEFAULT_AGENT_EVAL_TIMEOUT_MS) {
    let timeoutId;
    const timeoutPromise = new Promise((_resolve, reject) => {
        timeoutId = setTimeout(() => {
            reject(new AgentEvalTimeoutError(timeoutMs));
        }, timeoutMs);
    });
    try {
        // fn() is invoked inside the try so that a synchronous throw still
        // reaches the cleanup below.
        return await Promise.race([fn(), timeoutPromise]);
    }
    finally {
        // Single cleanup point for success, failure, and timeout alike.
        if (timeoutId !== undefined) {
            clearTimeout(timeoutId);
        }
    }
}
|
|
76
|
+
// ============================================================================
|
|
77
|
+
// Validation Utilities
|
|
78
|
+
// ============================================================================
|
|
79
|
+
/**
 * Validate an Evaluand object.
 *
 * Checks performed, in order:
 * - the evaluand itself is a non-null object
 * - `input` is a non-blank string
 * - `output` is a non-blank string
 * - `actions`, when present, has at most MAX_TRAJECTORY_LENGTH entries
 *
 * Malformed values (a null evaluand, or a non-string `input`/`output`) are
 * reported as InputValidationError rather than surfacing as a raw TypeError
 * from calling `.trim()` on a non-string.
 *
 * @param evaluand - Object to validate
 * @throws {InputValidationError} If validation fails
 */
export function validateEvaluand(evaluand) {
    if (evaluand === null || typeof evaluand !== 'object') {
        throw new InputValidationError('Evaluand is required and must be an object', 'evaluand', 'required');
    }
    const { input, output, actions } = evaluand;
    // Truthy non-strings are a type problem; falsy values are treated as missing.
    if (input && typeof input !== 'string') {
        throw new InputValidationError('Evaluand input must be a string', 'input', 'type');
    }
    if (!input || input.trim().length === 0) {
        throw new InputValidationError('Evaluand input is required', 'input', 'required');
    }
    if (output && typeof output !== 'string') {
        throw new InputValidationError('Evaluand output must be a string', 'output', 'type');
    }
    if (!output || output.trim().length === 0) {
        throw new InputValidationError('Evaluand output is required', 'output', 'required');
    }
    if (actions && actions.length > MAX_TRAJECTORY_LENGTH) {
        throw new InputValidationError(`Actions array exceeds ${MAX_TRAJECTORY_LENGTH} limit`, 'actions', 'maxLength');
    }
}
|
|
94
|
+
/**
 * Check that a StepScore has a well-formed step identifier and score.
 *
 * - A string `step` may be at most MAX_STEP_ID_LENGTH characters long.
 * - A numeric `step` must be a non-negative integer.
 * - Any other `step` type is rejected.
 * - `score` must be a finite number within [0, 1].
 *
 * @param stepScore - Step score to validate
 * @throws {InputValidationError} If validation fails
 */
export function validateStepScore(stepScore) {
    const { step, score } = stepScore;
    switch (typeof step) {
        case 'string':
            if (step.length > MAX_STEP_ID_LENGTH) {
                throw new InputValidationError(`Step identifier exceeds ${MAX_STEP_ID_LENGTH} characters`, 'step', 'maxLength');
            }
            break;
        case 'number':
            // NaN and Infinity fail Number.isInteger, so they are rejected here too.
            if (!(Number.isInteger(step) && step >= 0)) {
                throw new InputValidationError('Step index must be a non-negative integer', 'step', 'type');
            }
            break;
        default:
            throw new InputValidationError('Step must be a string or number', 'step', 'type');
    }
    // Number.isFinite is false for non-numbers, so this also rejects wrong types.
    const scoreInRange = Number.isFinite(score) && score >= 0 && score <= 1;
    if (!scoreInRange) {
        throw new InputValidationError('Score must be in range [0, 1]', 'score', 'range');
    }
}
|
|
118
|
+
/**
 * Validate a ToolVerification object.
 *
 * Checks performed, in order:
 * - the verification itself is a non-null object
 * - `toolName` is a non-blank string
 * - `toolCorrect` and `argsCorrect` are booleans
 * - `score` is a finite number within [0, 1]
 *
 * A null verification or a non-string `toolName` is reported as
 * InputValidationError instead of crashing with a TypeError from `.trim()`,
 * matching how the boolean fields are type-checked.
 *
 * @param verification - Tool verification to validate
 * @throws {InputValidationError} If validation fails
 */
export function validateToolVerification(verification) {
    if (verification === null || typeof verification !== 'object') {
        throw new InputValidationError('Verification is required and must be an object', 'verification', 'required');
    }
    const { toolName, toolCorrect, argsCorrect, score } = verification;
    // Truthy non-strings are a type problem; falsy values are treated as missing.
    if (toolName && typeof toolName !== 'string') {
        throw new InputValidationError('toolName must be a string', 'toolName', 'type');
    }
    if (!toolName || toolName.trim().length === 0) {
        throw new InputValidationError('Tool name is required', 'toolName', 'required');
    }
    if (typeof toolCorrect !== 'boolean') {
        throw new InputValidationError('toolCorrect must be a boolean', 'toolCorrect', 'type');
    }
    if (typeof argsCorrect !== 'boolean') {
        throw new InputValidationError('argsCorrect must be a boolean', 'argsCorrect', 'type');
    }
    if (!Number.isFinite(score) || score < 0 || score > 1) {
        throw new InputValidationError('Score must be in range [0, 1]', 'score', 'range');
    }
}
|
|
136
|
+
// ============================================================================
|
|
137
|
+
// Tool Verification Utilities
|
|
138
|
+
// ============================================================================
|
|
139
|
+
/**
 * Compare one agent tool call with what was expected and grade it.
 *
 * The grade is a weighted pass/fail over up to three components, normalized
 * by the weights that actually took part:
 * - Tool selection: weight 0.4, always included. With no `expectedTool`
 *   the selection counts as correct.
 * - Arguments: weight 0.3, included when `expectedArgs` is not undefined.
 * - Result: weight 0.3, included when `expectedResult` is not undefined.
 *
 * The result compared (and reported as evidence) is `actualResult` when it is
 * neither null nor undefined, otherwise `action.result`.
 *
 * @param action - The agent action containing the tool call
 * @param expectedTool - Expected tool name (optional)
 * @param expectedArgs - Expected arguments (optional)
 * @param actualResult - Actual result to compare against expected (optional)
 * @param expectedResult - Expected result for comparison (optional)
 * @returns ToolVerification with per-component flags, the normalized score,
 *   and an `evidence` record of actual versus expected values
 * @throws {InputValidationError} If action is null, undefined, or not an object
 *
 * @example
 * ```typescript
 * const verification = verifyToolCall(
 *   { type: 'tool_call', tool: 'search', arguments: { query: 'test' } },
 *   'search',
 *   { query: 'test' }
 * );
 * console.log(verification.score); // 1.0 (all correct)
 * ```
 */
export function verifyToolCall(action, expectedTool, expectedArgs, actualResult, expectedResult) {
    // H1: Validate action parameter
    const actionIsObject = Boolean(action) && typeof action === 'object';
    if (!actionIsObject) {
        throw new InputValidationError('Action is required and must be an object', 'action', 'required');
    }
    const observedResult = actualResult ?? action.result;
    const argsAreGraded = expectedArgs !== undefined;
    const resultIsGraded = expectedResult !== undefined;
    let toolCorrect = true;
    if (expectedTool) {
        toolCorrect = action.tool === expectedTool;
    }
    // NOTE(review): comparison is gated on truthiness while grading is gated on
    // `!== undefined`, so a null expectedArgs is graded as a match without
    // being compared.
    let argsCorrect = true;
    if (expectedArgs) {
        argsCorrect = deepEquals(action.arguments, expectedArgs);
    }
    let resultCorrect;
    if (resultIsGraded) {
        resultCorrect = deepEquals(observedResult, expectedResult);
    }
    // Components that take part in the grade, in accumulation order.
    const graded = [{ weight: 0.4, passed: toolCorrect }];
    if (argsAreGraded) {
        graded.push({ weight: 0.3, passed: argsCorrect });
    }
    if (resultIsGraded) {
        graded.push({ weight: 0.3, passed: resultCorrect });
    }
    let earned = 0;
    let available = 0;
    for (const { weight, passed } of graded) {
        if (passed) {
            earned += weight;
        }
        available += weight;
    }
    // Tool selection is always graded, so `available` is at least 0.4 and the
    // division is always defined.
    const normalizedScore = earned / available;
    return {
        toolName: action.tool || 'unknown',
        toolCallId: action.toolCallId,
        toolCorrect,
        argsCorrect,
        resultCorrect,
        score: normalizedScore,
        expectedTool,
        evidence: {
            actualTool: action.tool,
            actualArgs: action.arguments,
            actualResult: observedResult,
            expectedTool,
            expectedArgs,
            expectedResult,
        },
    };
}
|
|
211
|
+
/**
 * Grade every tool call found in an agent trajectory.
 *
 * Only actions of type 'tool_call' that name a tool are graded; all others
 * are skipped. An expectation is looked up first by the action's position in
 * `actions` and then, if none is found, by its `toolCallId`. Grading stops
 * once MAX_TOOL_VERIFICATIONS results have been collected.
 *
 * @param actions - List of agent actions
 * @param expectedTools - Map of expected tool calls by index or toolCallId
 * @returns Array of ToolVerification results, in trajectory order
 */
export function verifyToolCalls(actions, expectedTools) {
    const results = [];
    for (let index = 0; index < actions.length && results.length < MAX_TOOL_VERIFICATIONS; index += 1) {
        const candidate = actions[index];
        const isGradableToolCall = candidate.type === 'tool_call' && Boolean(candidate.tool);
        if (isGradableToolCall) {
            // Positional expectations win; toolCallId is only the fallback.
            let expectation = expectedTools?.get(index);
            if (expectation === undefined || expectation === null) {
                expectation = candidate.toolCallId
                    ? expectedTools?.get(candidate.toolCallId)
                    : undefined;
            }
            results.push(verifyToolCall(candidate, expectation?.tool, expectation?.args, undefined, expectation?.result));
        }
    }
    return results;
}
|
|
232
|
+
// ============================================================================
|
|
233
|
+
// Step Scoring Utilities
|
|
234
|
+
// ============================================================================
|
|
235
|
+
/**
 * Build a StepScore for one action of the agent trajectory.
 *
 * The judge's score is clamped into [0, 1], and the action's type, tool and
 * reasoning are attached as evidence. The finished object is passed through
 * validateStepScore before being returned.
 *
 * @param action - The agent action to score
 * @param stepIndex - Index of this step (non-negative integer)
 * @param evaluation - Score and explanation from LLM judge
 * @returns StepScore result with score clamped to [0, 1]
 * @throws {InputValidationError} If evaluation.score is not a finite number
 * @throws {InputValidationError} If stepIndex is invalid (validated via validateStepScore)
 *
 * @example
 * ```typescript
 * const step = scoreStep(
 *   { type: 'tool_call', tool: 'search' },
 *   0,
 *   { score: 0.85, explanation: 'Correct tool selection' }
 * );
 * ```
 */
export function scoreStep(action, stepIndex, evaluation) {
    const { score: rawScore, explanation } = evaluation;
    // H4: reject NaN, Infinity and non-numbers before clamping, since clamping
    // would otherwise hide them (Infinity) or pass them through (NaN).
    const isUsableScore = typeof rawScore === 'number' && Number.isFinite(rawScore);
    if (!isUsableScore) {
        throw new InputValidationError('Evaluation score must be a finite number', 'score', 'type');
    }
    const { type: actionType, tool, reasoning } = action;
    const result = {
        step: stepIndex,
        score: Math.min(1, Math.max(0, rawScore)),
        explanation,
        evidence: { actionType, tool, reasoning },
    };
    validateStepScore(result);
    return result;
}
|
|
272
|
+
/**
 * Calculate aggregate step scores from individual scores.
 *
 * @param stepScores - Array of step scores
 * @param aggregation - Aggregation method ('average', 'weighted', 'min')
 * @param weights - Weights for weighted aggregation (must match stepScores length, finite non-negative)
 * @returns Aggregated score (0-1)
 *   - Empty array returns 1 (no steps to fail = vacuously perfect); this
 *     check runs before `aggregation` is inspected
 *   - 'weighted' returns 0 when all weights are 0
 * @throws {Error} If aggregation is 'weighted' but weights array is missing or wrong length
 * @throws {Error} If any weight is negative, NaN, or Infinity
 * @throws {Error} If aggregation is not one of the supported methods
 *
 * @example
 * ```typescript
 * // Simple average
 * const avg = aggregateStepScores(steps, 'average');
 *
 * // Weighted (emphasize later steps)
 * const weighted = aggregateStepScores(steps, 'weighted', [1, 2, 3]);
 *
 * // Minimum score (most strict)
 * const min = aggregateStepScores(steps, 'min');
 *
 * // Empty array returns 1 (vacuously true)
 * aggregateStepScores([], 'average'); // => 1
 * ```
 */
export function aggregateStepScores(stepScores, aggregation = 'average', weights) {
    // L7: Empty array returns 1 (vacuously perfect - no steps to fail)
    if (stepScores.length === 0) {
        return 1;
    }
    switch (aggregation) {
        case 'average':
            return stepScores.reduce((sum, s) => sum + s.score, 0) / stepScores.length;
        case 'weighted': {
            if (!weights || weights.length !== stepScores.length) {
                throw new Error('Weights required for weighted aggregation');
            }
            // L8/M1: Validate weights are finite non-negative numbers
            for (let i = 0; i < weights.length; i++) {
                if (weights[i] < 0 || !Number.isFinite(weights[i])) {
                    throw new Error(`Invalid weight at index ${i}: ${weights[i]}. Weights must be finite non-negative numbers.`);
                }
            }
            const totalWeight = weights.reduce((sum, w) => sum + w, 0);
            // All-zero weights would divide by zero; treat as a zero score.
            if (totalWeight === 0) {
                return 0;
            }
            return stepScores.reduce((sum, s, i) => sum + s.score * weights[i], 0) / totalWeight;
        }
        case 'min':
            // Folded rather than spread into Math.min so that very long
            // arrays cannot exceed the engine's argument-count limit.
            return stepScores.reduce((lowest, s) => Math.min(lowest, s.score), Infinity);
        default:
            // Fail loudly instead of silently returning undefined.
            throw new Error(`Unknown aggregation method: ${String(aggregation)}. Expected 'average', 'weighted', or 'min'.`);
    }
}
|
|
324
|
+
// ============================================================================
|
|
325
|
+
// Trajectory Analysis Utilities
|
|
326
|
+
// ============================================================================
|
|
327
|
+
/**
 * Summarize how efficiently an agent worked through its trajectory.
 *
 * Reported metrics:
 * - `length`: total number of actions (0 when the evaluand has none)
 * - `toolCallCount`: actions whose type is 'tool_call'
 * - `uniqueTools`: distinct truthy tool names among those tool calls
 * - `efficiencyRatio`: `optimalLength / length` capped at 1, or 1 when no
 *   positive `optimalLength` is given
 * - `redundantActions`: tool calls whose tool and arguments serialize to the
 *   same JSON as an earlier tool call
 *
 * Redundancy is detected by comparing JSON.stringify output. Tool calls that
 * cannot be serialized (for example, circular arguments) are left out of the
 * redundancy count. For large argument objects, a hash-based comparison may
 * perform better.
 *
 * @param evaluand - The evaluand with actions
 * @param optimalLength - Expected optimal trajectory length (optional)
 * @returns Efficiency metrics
 */
export function analyzeTrajectory(evaluand, optimalLength) {
    const steps = evaluand.actions || [];
    const invocations = steps.filter((step) => step.type === 'tool_call');
    const toolNames = new Set();
    const fingerprints = new Set();
    let redundantActions = 0;
    for (const call of invocations) {
        if (call.tool) {
            toolNames.add(call.tool);
        }
        let fingerprint;
        try {
            fingerprint = JSON.stringify({ tool: call.tool, args: call.arguments });
        }
        catch {
            // M2: not serializable, so this call is left out of redundancy detection.
            continue;
        }
        if (fingerprints.has(fingerprint)) {
            redundantActions += 1;
        }
        else {
            fingerprints.add(fingerprint);
        }
    }
    // Without a known positive optimum the trajectory is treated as fully efficient.
    let efficiencyRatio = 1;
    if (optimalLength && optimalLength > 0) {
        efficiencyRatio = Math.min(1, optimalLength / Math.max(steps.length, 1));
    }
    return {
        length: steps.length,
        toolCallCount: invocations.length,
        uniqueTools: toolNames.size,
        efficiencyRatio,
        redundantActions,
    };
}
|
|
372
|
+
// ============================================================================
|
|
373
|
+
// Multi-Agent Consensus Utilities
|
|
374
|
+
// ============================================================================
|
|
375
|
+
/**
 * Compute the sample variance of a list of scores.
 *
 * The sum of squared deviations from the mean is divided by (n - 1)
 * (Bessel's correction) rather than n.
 *
 * @param scores - Array of score values
 * @returns Sample variance; 0 when there are fewer than two scores, since
 *   no spread can be measured
 */
export function calculateVariance(scores) {
    const count = scores.length;
    if (count <= 1) {
        return 0;
    }
    let total = 0;
    scores.forEach((value) => {
        total += value;
    });
    const mean = total / count;
    let squaredDeviationSum = 0;
    scores.forEach((value) => {
        squaredDeviationSum += (value - mean) ** 2;
    });
    // Bessel's correction: divide by (n - 1) for sample variance.
    return squaredDeviationSum / (count - 1);
}
|
|
392
|
+
/**
 * Compute the median of a list of scores, used as the final consensus value.
 *
 * The input array is not modified; a sorted copy is used. For an even number
 * of scores the median is the mean of the two middle values.
 *
 * @param scores - Array of score values
 * @returns The median, or 0 for an empty array
 */
export function calculateMedian(scores) {
    if (scores.length === 0) {
        return 0;
    }
    const ordered = [...scores];
    // Explicit numeric comparator: the default sort compares as strings.
    ordered.sort((left, right) => left - right);
    const middle = Math.floor(ordered.length / 2);
    const hasSingleMiddle = ordered.length % 2 !== 0;
    if (hasSingleMiddle) {
        return ordered[middle];
    }
    return (ordered[middle - 1] + ordered[middle]) / 2;
}
|
|
404
|
+
/**
 * Run multi-agent consensus evaluation.
 *
 * In each round every judge is invoked concurrently through
 * Promise.allSettled, so a rejected judge does not abort the round; at least
 * one judge must fulfil per round. A returned score is accepted only when it
 * is a finite number in [0, 1]. Accepted scores are appended to that judge's
 * history, and only accepted scores take part in the convergence check.
 * Rounds stop early once the sample variance of the round's accepted scores
 * falls below config.convergenceThreshold.
 *
 * @param evaluand - Subject to evaluate (checked with validateEvaluand)
 * @param judges - Array of judges; each exposes `id` and an async
 *                 `evaluate(evaluand, scores)` that resolves to a score
 * @param config - Consensus configuration (`rounds`, `convergenceThreshold`);
 *                 `rounds` is capped at MAX_CONSENSUS_ROUNDS
 * @returns ConsensusResult with `finalScore` (median of each judge's most
 *          recent accepted score, 0 if none), `convergenceRound`,
 *          `judgeScores` (Map of judge id to accepted scores) and `converged`
 * @throws {InputValidationError} If judges array exceeds MAX_CONCURRENT_EVALUATORS
 * @throws {InputValidationError} If judges array is empty
 * @throws {InputValidationError} If config.rounds is less than 1 or not a number
 * @throws {Error} If all judges fail in any round
 */
export async function collectiveConsensus(evaluand, judges, config) {
    validateEvaluand(evaluand);
    // H2: Validate judge count to prevent unbounded memory growth
    if (judges.length > MAX_CONCURRENT_EVALUATORS) {
        throw new InputValidationError(`Number of judges (${judges.length}) exceeds ${MAX_CONCURRENT_EVALUATORS} limit`, 'judges', 'maxLength');
    }
    if (judges.length === 0) {
        throw new InputValidationError('At least one judge is required', 'judges', 'required');
    }
    // M9: Validate rounds is positive. The negated `>=` form also rejects
    // NaN/undefined, which `rounds < 1` would let through and which would
    // silently run zero rounds.
    if (!(config.rounds >= 1)) {
        throw new InputValidationError('Rounds must be at least 1', 'rounds', 'range');
    }
    const rounds = Math.min(config.rounds, MAX_CONSENSUS_ROUNDS);
    const scores = new Map();
    // Initialize score arrays
    for (const judge of judges) {
        scores.set(judge.id, []);
    }
    // H4/M1: a score is usable only when finite AND within [0, 1]
    const isValidScore = (score) => Number.isFinite(score) && score >= 0 && score <= 1;
    let convergenceRound = rounds;
    let converged = false;
    for (let round = 0; round < rounds; round++) {
        // H3: Use Promise.allSettled for graceful degradation
        const roundResults = await Promise.allSettled(judges.map(async (judge) => {
            const score = await judge.evaluate(evaluand, scores);
            return { id: judge.id, score };
        }));
        // Filter successful results
        const successfulResults = roundResults
            .filter((r) => r.status === 'fulfilled')
            .map(r => r.value);
        // H3: Require at least one successful judge
        if (successfulResults.length === 0) {
            throw new Error(`All judge evaluations failed in round ${round + 1}`);
        }
        // Store accepted scores and collect the same set for the convergence
        // check, so a rejected (out-of-range) score can never declare consensus.
        const currentScores = [];
        for (const { id, score } of successfulResults) {
            if (isValidScore(score)) {
                scores.get(id).push(score);
                currentScores.push(score);
            }
        }
        // Check convergence (only using accepted scores from this round)
        if (currentScores.length > 0 && calculateVariance(currentScores) < config.convergenceThreshold) {
            convergenceRound = round + 1;
            converged = true;
            break;
        }
    }
    // Final score is the median of each judge's most recent accepted score.
    // A judge that failed in the last round contributes its latest earlier score.
    const lastRoundScores = [];
    for (const judge of judges) {
        const judgeScores = scores.get(judge.id);
        if (judgeScores.length > 0) {
            lastRoundScores.push(judgeScores[judgeScores.length - 1]);
        }
    }
    return {
        finalScore: lastRoundScores.length > 0 ? calculateMedian(lastRoundScores) : 0,
        convergenceRound,
        judgeScores: scores,
        converged,
    };
}
|
|
485
|
+
// ============================================================================
|
|
486
|
+
// Abstract Agent Judge Base Class
|
|
487
|
+
// ============================================================================
|
|
488
|
+
/** Maximum entries in judge memory before LRU eviction */
const MAX_JUDGE_MEMORY_SIZE = 1000;
/**
 * Abstract base class for Agent-as-Judge implementations.
 *
 * Provides a bounded LRU memory and conversion of an AgentEvalResult into an
 * OTel-compatible EvaluationResult. Subclasses implement the evaluation
 * strategy and are expected to define `name`, which toEvaluationResult
 * reports as the evaluator.
 */
export class AgentJudge {
    /** Key/value store; Map insertion order doubles as recency order (oldest first). */
    memory = new Map();
    /**
     * Store intermediate state in memory with LRU eviction.
     * @security Memory is bounded to MAX_JUDGE_MEMORY_SIZE entries
     *
     * @param key - Memory key
     * @param value - Value to associate with the key
     */
    storeInMemory(key, value) {
        // H5: Proper LRU - drop any existing entry so the re-insert below moves
        // it to the end. Map.delete is a no-op when the key is absent.
        this.memory.delete(key);
        // Bound memory size to prevent exhaustion
        if (this.memory.size >= MAX_JUDGE_MEMORY_SIZE) {
            // Evict least recently used (first entry in insertion order).
            // Test the iterator's `done` flag rather than the key's truthiness,
            // otherwise a falsy key ('' or 0) would never be evicted and the
            // map would grow past its bound.
            const oldest = this.memory.keys().next();
            if (!oldest.done) {
                this.memory.delete(oldest.value);
            }
        }
        this.memory.set(key, value);
    }
    /**
     * Retrieve state from memory with LRU update.
     * @note Access moves item to end (most recently used)
     *
     * @param key - Memory key
     * @returns The stored value, or undefined when the key is absent
     */
    getFromMemory(key) {
        // Use has() so entries whose stored value is undefined are refreshed too.
        if (!this.memory.has(key)) {
            return undefined;
        }
        const value = this.memory.get(key);
        // H5: Re-insert to move to end (most recently used)
        this.memory.delete(key);
        this.memory.set(key, value);
        return value;
    }
    /**
     * Clear all memory.
     */
    clearMemory() {
        this.memory.clear();
    }
    /**
     * Convert AgentEvalResult to OTel-compatible EvaluationResult.
     *
     * stepScores and toolVerifications are truncated to MAX_STEP_SCORES and
     * MAX_TOOL_VERIFICATIONS entries respectively.
     *
     * @param result - Agent evaluation result to convert
     * @param evaluand - Evaluated subject; supplies agentId and agentName
     * @param evaluatorType - Evaluator type label (defaults to 'llm')
     * @returns EvaluationResult-shaped object
     */
    toEvaluationResult(result, evaluand, evaluatorType = 'llm') {
        return {
            scoreValue: result.overallScore,
            explanation: result.explanation,
            evaluator: this.name,
            evaluatorType,
            agentId: evaluand.agentId,
            agentName: evaluand.agentName,
            stepScores: result.stepScores.slice(0, MAX_STEP_SCORES),
            toolVerifications: result.toolVerifications.slice(0, MAX_TOOL_VERIFICATIONS),
            trajectoryLength: result.trajectoryLength,
        };
    }
}
|
|
552
|
+
/**
 * Procedural Agent Judge - runs a fixed, ordered pipeline of stages.
 *
 * Every stage receives the evaluand plus a context object holding the results
 * of the stages that ran before it (keyed by stage name). One stage may be
 * designated as a gate: if it scores below DEFAULT_EARLY_TERMINATION_THRESHOLD
 * the pipeline stops and the overall score is 0.
 */
export class ProceduralJudge extends AgentJudge {
    stages;
    earlyTerminationOn;
    name = 'procedural-agent-judge';
    /**
     * Build a ProceduralJudge from an ordered list of stages.
     *
     * @param stages - Stages to run in order; each needs a non-blank `name` and an `evaluate` function
     * @param earlyTerminationOn - Optional name of the stage whose failure ends the pipeline early
     * @throws {InputValidationError} If stages is missing or empty
     * @throws {InputValidationError} If a stage name is missing, empty or whitespace-only
     * @throws {InputValidationError} If a stage lacks an evaluate function
     * @throws {InputValidationError} If earlyTerminationOn names no configured stage
     */
    constructor(stages, earlyTerminationOn) {
        super();
        this.stages = stages;
        this.earlyTerminationOn = earlyTerminationOn;
        // M8: the pipeline needs at least one stage
        const hasStages = Boolean(stages) && stages.length !== 0;
        if (!hasStages) {
            throw new InputValidationError('ProceduralJudge requires at least one stage', 'stages', 'required');
        }
        // Per-stage checks: usable name first, then a callable evaluate
        for (let index = 0; index < stages.length; index += 1) {
            const candidate = stages[index];
            const nameIsBlank = !candidate.name || candidate.name.trim().length === 0;
            if (nameIsBlank) {
                throw new InputValidationError(`Stage ${index} has invalid name "${candidate.name ?? 'undefined'}" - must be non-empty and not just whitespace`, 'stages', 'required');
            }
            const hasEvaluator = typeof candidate.evaluate === 'function';
            if (!hasEvaluator) {
                throw new InputValidationError(`Stage ${index} (${candidate.name}) must have an evaluate function`, 'stages', 'type');
            }
        }
        // The gate stage is optional; when given it must match a stage name
        if (earlyTerminationOn === undefined) {
            return;
        }
        const knownNames = stages.map(entry => entry.name);
        if (knownNames.includes(earlyTerminationOn)) {
            return;
        }
        throw new InputValidationError(`earlyTerminationOn stage '${earlyTerminationOn}' not found in stages: ${knownNames.join(', ')}`, 'earlyTerminationOn', 'invalid');
    }
    /**
     * Run every stage in order and aggregate the step scores.
     *
     * @param evaluand - Subject to evaluate
     * @returns Agent evaluation result (overall score, step scores, tool
     *          verifications, trajectory length, explanation, feedback)
     */
    async evaluate(evaluand) {
        const runPipeline = async () => {
            validateEvaluand(evaluand);
            const context = {};
            const stepScores = [];
            const trajectory = analyzeTrajectory(evaluand);
            for (let index = 0; index < this.stages.length; index += 1) {
                const stage = this.stages[index];
                const outcome = await stage.evaluate(evaluand, context);
                stepScores.push(scoreStep({ type: 'evaluation_stage', reasoning: stage.name }, index, outcome));
                // Expose this stage's outcome to the stages that follow
                context[stage.name] = outcome;
                // L10: stop immediately when the gate stage scores below the threshold
                const isGateStage = this.earlyTerminationOn === stage.name;
                if (isGateStage && outcome.score < DEFAULT_EARLY_TERMINATION_THRESHOLD) {
                    return {
                        overallScore: 0,
                        stepScores,
                        toolVerifications: verifyToolCalls(evaluand.actions || []),
                        trajectoryLength: trajectory.length,
                        explanation: `Early termination: ${stage.name} failed with score ${outcome.score}`,
                        actionableFeedback: [outcome.explanation],
                    };
                }
            }
            const overallScore = aggregateStepScores(stepScores);
            const toolVerifications = verifyToolCalls(evaluand.actions || []);
            // Feedback is gathered from every step scoring under 0.7
            const weakSteps = stepScores.filter(entry => entry.score < 0.7);
            const actionableFeedback = weakSteps.map(entry => entry.explanation || `Step ${entry.step} needs improvement`);
            return {
                overallScore,
                stepScores,
                toolVerifications,
                trajectoryLength: trajectory.length,
                explanation: `Procedural evaluation completed ${this.stages.length} stages`,
                actionableFeedback,
            };
        };
        // M3: the whole pipeline runs under the agent timeout to avoid indefinite hangs
        return withAgentTimeout(runPipeline);
    }
}
|
|
636
|
+
/**
 * Reactive Agent Judge - Adaptive evaluation based on content.
 *
 * A router picks specialist names for the evaluand. Each name that resolves
 * to a registered specialist is run in order, its result is scored as a step
 * and cached in judge memory under `eval_<name>`. When a result sets
 * `needsDeepDive` and a deep-dive specialist is registered under the same
 * name, that deep dive is run and scored as an additional step.
 */
export class ReactiveJudge extends AgentJudge {
    router;
    specialists;
    deepDiveSpecialists;
    name = 'reactive-agent-judge';
    /**
     * @param router - Function called with the evaluand; its awaited result is
     *                 the ordered list of specialist names to run
     * @param specialists - Lookup with `get(name)` returning a specialist
     *                      function (presumably a Map - confirm against callers)
     * @param deepDiveSpecialists - Optional lookup with `get(name)` returning a
     *                              deep-dive function for that specialist
     */
    constructor(router, specialists, deepDiveSpecialists) {
        super();
        this.router = router;
        this.specialists = specialists;
        this.deepDiveSpecialists = deepDiveSpecialists;
    }
    /**
     * Route the evaluand to specialists and aggregate their step scores.
     *
     * Routed names with no registered specialist are skipped. The explanation
     * reports the number of routed names, including any that were skipped.
     *
     * @param evaluand - Subject to evaluate
     * @returns Agent evaluation result (overall score, step scores, tool
     *          verifications, trajectory length, explanation, feedback)
     */
    async evaluate(evaluand) {
        // M3: Wrap entire evaluation in timeout to prevent indefinite hangs
        return withAgentTimeout(async () => {
            validateEvaluand(evaluand);
            // Route to relevant specialists
            const relevantSpecialists = await this.router(evaluand);
            const trajectory = analyzeTrajectory(evaluand);
            const stepScores = [];
            for (let i = 0; i < relevantSpecialists.length; i++) {
                const specialistName = relevantSpecialists[i];
                const specialist = this.specialists.get(specialistName);
                if (!specialist)
                    continue;
                const result = await specialist(evaluand);
                // Index each step by its position in stepScores. Using the routing
                // index here would collide with deep-dive steps, which are already
                // indexed by stepScores.length.
                const stepScore = scoreStep({ type: 'specialist_evaluation', reasoning: specialistName }, stepScores.length, result);
                stepScores.push(stepScore);
                this.storeInMemory(`eval_${specialistName}`, result);
                // Trigger deep dive if needed
                if (result.needsDeepDive && this.deepDiveSpecialists) {
                    const deepDive = this.deepDiveSpecialists.get(specialistName);
                    if (deepDive) {
                        const deepResult = await deepDive(evaluand);
                        const deepStepScore = scoreStep({ type: 'deep_dive_evaluation', reasoning: `${specialistName}_deep` }, stepScores.length, deepResult);
                        stepScores.push(deepStepScore);
                    }
                }
            }
            return {
                overallScore: aggregateStepScores(stepScores),
                stepScores,
                toolVerifications: verifyToolCalls(evaluand.actions || []),
                trajectoryLength: trajectory.length,
                explanation: `Reactive evaluation engaged ${relevantSpecialists.length} specialists`,
                // Feedback comes from every step scoring under 0.7
                actionableFeedback: stepScores
                    .filter(s => s.score < 0.7)
                    .map(s => s.explanation || `${s.step} needs improvement`),
            };
        });
    }
}
|
|
693
|
+
// ============================================================================
|
|
694
|
+
// Helper Functions
|
|
695
|
+
// ============================================================================
|
|
696
|
+
/**
 * Deep equality comparison for objects.
 * Used for comparing tool arguments and results.
 *
 * Primitives and functions are compared with `===`. Arrays must match in
 * length and element-wise. Plain objects must have the same set of own
 * enumerable string keys with deeply equal values; other own state (for
 * example a Date's timestamp or Map entries) is not inspected.
 *
 * M3: Includes circular reference protection. The WeakSets hold only the
 * objects on the current recursion path, so a genuine cycle is reported as
 * unequal while an object that is merely referenced more than once is still
 * compared normally.
 *
 * @param a - First value to compare
 * @param b - Second value to compare
 * @param seenA - WeakSet of objects from 'a' currently being visited (for cycle detection)
 * @param seenB - WeakSet of objects from 'b' currently being visited (for cycle detection)
 * @returns True if values are deeply equal
 */
function deepEquals(a, b, seenA = new WeakSet(), seenB = new WeakSet()) {
    if (a === b)
        return true;
    // Past this point a !== b, so a null on either side or any non-object
    // operand (primitive or function) means the values differ.
    if (a === null || b === null)
        return false;
    if (typeof a !== typeof b)
        return false;
    if (typeof a !== 'object')
        return false;
    // M3: Circular reference protection
    if (seenA.has(a) || seenB.has(b)) {
        // Re-entering an object that is still being compared means a cycle;
        // consider unequal to be safe (matching circular structures is
        // complex and rarely needed).
        return false;
    }
    seenA.add(a);
    seenB.add(b);
    try {
        const aIsArray = Array.isArray(a);
        if (aIsArray !== Array.isArray(b))
            return false;
        if (aIsArray) {
            if (a.length !== b.length)
                return false;
            return a.every((val, i) => deepEquals(val, b[i], seenA, seenB));
        }
        const aKeys = Object.keys(a);
        if (aKeys.length !== Object.keys(b).length)
            return false;
        // Require the key on b as well: equal key counts alone would let
        // { a: undefined } match { b: undefined }.
        return aKeys.every(key => Object.prototype.hasOwnProperty.call(b, key)
            && deepEquals(a[key], b[key], seenA, seenB));
    }
    finally {
        // Unwind the path so sibling branches may revisit shared objects.
        seenA.delete(a);
        seenB.delete(b);
    }
}
|
|
740
|
+
//# sourceMappingURL=agent-as-judge.js.map
|