observability-toolkit 1.8.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/README.md +126 -5
  2. package/dist/backends/index.d.ts +163 -0
  3. package/dist/backends/index.d.ts.map +1 -1
  4. package/dist/backends/index.js +57 -0
  5. package/dist/backends/index.js.map +1 -1
  6. package/dist/backends/index.test.js +55 -1
  7. package/dist/backends/index.test.js.map +1 -1
  8. package/dist/backends/local-jsonl.d.ts +30 -0
  9. package/dist/backends/local-jsonl.d.ts.map +1 -1
  10. package/dist/backends/local-jsonl.js +912 -550
  11. package/dist/backends/local-jsonl.js.map +1 -1
  12. package/dist/backends/signoz-api-rate-limiter.test.js +2 -1
  13. package/dist/backends/signoz-api-rate-limiter.test.js.map +1 -1
  14. package/dist/backends/signoz-api.d.ts +16 -2
  15. package/dist/backends/signoz-api.d.ts.map +1 -1
  16. package/dist/backends/signoz-api.js +650 -534
  17. package/dist/backends/signoz-api.js.map +1 -1
  18. package/dist/backends/signoz-api.test.js +6 -5
  19. package/dist/backends/signoz-api.test.js.map +1 -1
  20. package/dist/lib/agent-as-judge.d.ts +388 -0
  21. package/dist/lib/agent-as-judge.d.ts.map +1 -0
  22. package/dist/lib/agent-as-judge.js +740 -0
  23. package/dist/lib/agent-as-judge.js.map +1 -0
  24. package/dist/lib/agent-as-judge.test.d.ts +5 -0
  25. package/dist/lib/agent-as-judge.test.d.ts.map +1 -0
  26. package/dist/lib/agent-as-judge.test.js +816 -0
  27. package/dist/lib/agent-as-judge.test.js.map +1 -0
  28. package/dist/lib/cache.d.ts +15 -2
  29. package/dist/lib/cache.d.ts.map +1 -1
  30. package/dist/lib/cache.js +16 -2
  31. package/dist/lib/cache.js.map +1 -1
  32. package/dist/lib/circuit-breaker.d.ts +18 -0
  33. package/dist/lib/circuit-breaker.d.ts.map +1 -1
  34. package/dist/lib/circuit-breaker.js +41 -8
  35. package/dist/lib/circuit-breaker.js.map +1 -1
  36. package/dist/lib/confident-export.d.ts +101 -0
  37. package/dist/lib/confident-export.d.ts.map +1 -0
  38. package/dist/lib/confident-export.js +393 -0
  39. package/dist/lib/confident-export.js.map +1 -0
  40. package/dist/lib/confident-export.test.d.ts +7 -0
  41. package/dist/lib/confident-export.test.d.ts.map +1 -0
  42. package/dist/lib/confident-export.test.js +835 -0
  43. package/dist/lib/confident-export.test.js.map +1 -0
  44. package/dist/lib/constants.d.ts +75 -0
  45. package/dist/lib/constants.d.ts.map +1 -1
  46. package/dist/lib/constants.js +104 -1
  47. package/dist/lib/constants.js.map +1 -1
  48. package/dist/lib/datadog-export.d.ts +156 -0
  49. package/dist/lib/datadog-export.d.ts.map +1 -0
  50. package/dist/lib/datadog-export.js +464 -0
  51. package/dist/lib/datadog-export.js.map +1 -0
  52. package/dist/lib/datadog-export.test.d.ts +14 -0
  53. package/dist/lib/datadog-export.test.d.ts.map +1 -0
  54. package/dist/lib/datadog-export.test.js +890 -0
  55. package/dist/lib/datadog-export.test.js.map +1 -0
  56. package/dist/lib/evaluation-hooks.d.ts +49 -0
  57. package/dist/lib/evaluation-hooks.d.ts.map +1 -0
  58. package/dist/lib/evaluation-hooks.js +488 -0
  59. package/dist/lib/evaluation-hooks.js.map +1 -0
  60. package/dist/lib/evaluation-hooks.test.d.ts +8 -0
  61. package/dist/lib/evaluation-hooks.test.d.ts.map +1 -0
  62. package/dist/lib/evaluation-hooks.test.js +624 -0
  63. package/dist/lib/evaluation-hooks.test.js.map +1 -0
  64. package/dist/lib/export-utils.d.ts +99 -0
  65. package/dist/lib/export-utils.d.ts.map +1 -0
  66. package/dist/lib/export-utils.js +238 -0
  67. package/dist/lib/export-utils.js.map +1 -0
  68. package/dist/lib/export-utils.test.d.ts +5 -0
  69. package/dist/lib/export-utils.test.d.ts.map +1 -0
  70. package/dist/lib/export-utils.test.js +193 -0
  71. package/dist/lib/export-utils.test.js.map +1 -0
  72. package/dist/lib/file-utils.d.ts +17 -2
  73. package/dist/lib/file-utils.d.ts.map +1 -1
  74. package/dist/lib/file-utils.js +24 -5
  75. package/dist/lib/file-utils.js.map +1 -1
  76. package/dist/lib/file-utils.test.js +30 -0
  77. package/dist/lib/file-utils.test.js.map +1 -1
  78. package/dist/lib/histogram.d.ts +119 -0
  79. package/dist/lib/histogram.d.ts.map +1 -0
  80. package/dist/lib/histogram.js +202 -0
  81. package/dist/lib/histogram.js.map +1 -0
  82. package/dist/lib/histogram.test.d.ts +5 -0
  83. package/dist/lib/histogram.test.d.ts.map +1 -0
  84. package/dist/lib/histogram.test.js +381 -0
  85. package/dist/lib/histogram.test.js.map +1 -0
  86. package/dist/lib/instrumentation.d.ts +153 -0
  87. package/dist/lib/instrumentation.d.ts.map +1 -0
  88. package/dist/lib/instrumentation.integration.test.d.ts +2 -0
  89. package/dist/lib/instrumentation.integration.test.d.ts.map +1 -0
  90. package/dist/lib/instrumentation.integration.test.js +589 -0
  91. package/dist/lib/instrumentation.integration.test.js.map +1 -0
  92. package/dist/lib/instrumentation.js +520 -0
  93. package/dist/lib/instrumentation.js.map +1 -0
  94. package/dist/lib/instrumentation.test.d.ts +2 -0
  95. package/dist/lib/instrumentation.test.d.ts.map +1 -0
  96. package/dist/lib/instrumentation.test.js +821 -0
  97. package/dist/lib/instrumentation.test.js.map +1 -0
  98. package/dist/lib/langfuse-export.d.ts +125 -0
  99. package/dist/lib/langfuse-export.d.ts.map +1 -0
  100. package/dist/lib/langfuse-export.js +367 -0
  101. package/dist/lib/langfuse-export.js.map +1 -0
  102. package/dist/lib/langfuse-export.test.d.ts +7 -0
  103. package/dist/lib/langfuse-export.test.d.ts.map +1 -0
  104. package/dist/lib/langfuse-export.test.js +1007 -0
  105. package/dist/lib/langfuse-export.test.js.map +1 -0
  106. package/dist/lib/llm-as-judge.d.ts +657 -0
  107. package/dist/lib/llm-as-judge.d.ts.map +1 -0
  108. package/dist/lib/llm-as-judge.js +1397 -0
  109. package/dist/lib/llm-as-judge.js.map +1 -0
  110. package/dist/lib/llm-as-judge.test.d.ts +2 -0
  111. package/dist/lib/llm-as-judge.test.d.ts.map +1 -0
  112. package/dist/lib/llm-as-judge.test.js +2409 -0
  113. package/dist/lib/llm-as-judge.test.js.map +1 -0
  114. package/dist/lib/logger.d.ts +1 -1
  115. package/dist/lib/logger.d.ts.map +1 -1
  116. package/dist/lib/logger.js.map +1 -1
  117. package/dist/lib/metrics.d.ts +62 -0
  118. package/dist/lib/metrics.d.ts.map +1 -0
  119. package/dist/lib/metrics.js +166 -0
  120. package/dist/lib/metrics.js.map +1 -0
  121. package/dist/lib/metrics.test.d.ts +5 -0
  122. package/dist/lib/metrics.test.d.ts.map +1 -0
  123. package/dist/lib/metrics.test.js +189 -0
  124. package/dist/lib/metrics.test.js.map +1 -0
  125. package/dist/lib/parse-stats.d.ts +119 -0
  126. package/dist/lib/parse-stats.d.ts.map +1 -0
  127. package/dist/lib/parse-stats.js +206 -0
  128. package/dist/lib/parse-stats.js.map +1 -0
  129. package/dist/lib/parse-stats.test.d.ts +5 -0
  130. package/dist/lib/parse-stats.test.d.ts.map +1 -0
  131. package/dist/lib/parse-stats.test.js +283 -0
  132. package/dist/lib/parse-stats.test.js.map +1 -0
  133. package/dist/lib/phoenix-export.d.ts +109 -0
  134. package/dist/lib/phoenix-export.d.ts.map +1 -0
  135. package/dist/lib/phoenix-export.js +429 -0
  136. package/dist/lib/phoenix-export.js.map +1 -0
  137. package/dist/lib/phoenix-export.test.d.ts +11 -0
  138. package/dist/lib/phoenix-export.test.d.ts.map +1 -0
  139. package/dist/lib/phoenix-export.test.js +725 -0
  140. package/dist/lib/phoenix-export.test.js.map +1 -0
  141. package/dist/lib/server-utils.d.ts +6 -1
  142. package/dist/lib/server-utils.d.ts.map +1 -1
  143. package/dist/lib/server-utils.js +9 -1
  144. package/dist/lib/server-utils.js.map +1 -1
  145. package/dist/lib/shared-schemas.d.ts +6 -0
  146. package/dist/lib/shared-schemas.d.ts.map +1 -1
  147. package/dist/lib/shared-schemas.js +11 -4
  148. package/dist/lib/shared-schemas.js.map +1 -1
  149. package/dist/lib/verification-events.d.ts +100 -0
  150. package/dist/lib/verification-events.d.ts.map +1 -0
  151. package/dist/lib/verification-events.js +162 -0
  152. package/dist/lib/verification-events.js.map +1 -0
  153. package/dist/lib/verification-events.test.d.ts +5 -0
  154. package/dist/lib/verification-events.test.d.ts.map +1 -0
  155. package/dist/lib/verification-events.test.js +193 -0
  156. package/dist/lib/verification-events.test.js.map +1 -0
  157. package/dist/server.d.ts +5 -0
  158. package/dist/server.d.ts.map +1 -1
  159. package/dist/server.js +77 -21
  160. package/dist/server.js.map +1 -1
  161. package/dist/tools/context-stats.d.ts.map +1 -1
  162. package/dist/tools/context-stats.js +6 -8
  163. package/dist/tools/context-stats.js.map +1 -1
  164. package/dist/tools/export-confident.d.ts +145 -0
  165. package/dist/tools/export-confident.d.ts.map +1 -0
  166. package/dist/tools/export-confident.js +134 -0
  167. package/dist/tools/export-confident.js.map +1 -0
  168. package/dist/tools/export-confident.test.d.ts +7 -0
  169. package/dist/tools/export-confident.test.d.ts.map +1 -0
  170. package/dist/tools/export-confident.test.js +332 -0
  171. package/dist/tools/export-confident.test.js.map +1 -0
  172. package/dist/tools/export-datadog.d.ts +160 -0
  173. package/dist/tools/export-datadog.d.ts.map +1 -0
  174. package/dist/tools/export-datadog.js +160 -0
  175. package/dist/tools/export-datadog.js.map +1 -0
  176. package/dist/tools/export-datadog.test.d.ts +8 -0
  177. package/dist/tools/export-datadog.test.d.ts.map +1 -0
  178. package/dist/tools/export-datadog.test.js +419 -0
  179. package/dist/tools/export-datadog.test.js.map +1 -0
  180. package/dist/tools/export-langfuse.d.ts +137 -0
  181. package/dist/tools/export-langfuse.d.ts.map +1 -0
  182. package/dist/tools/export-langfuse.js +131 -0
  183. package/dist/tools/export-langfuse.js.map +1 -0
  184. package/dist/tools/export-langfuse.test.d.ts +7 -0
  185. package/dist/tools/export-langfuse.test.d.ts.map +1 -0
  186. package/dist/tools/export-langfuse.test.js +303 -0
  187. package/dist/tools/export-langfuse.test.js.map +1 -0
  188. package/dist/tools/export-phoenix.d.ts +145 -0
  189. package/dist/tools/export-phoenix.d.ts.map +1 -0
  190. package/dist/tools/export-phoenix.js +135 -0
  191. package/dist/tools/export-phoenix.js.map +1 -0
  192. package/dist/tools/export-phoenix.test.d.ts +7 -0
  193. package/dist/tools/export-phoenix.test.d.ts.map +1 -0
  194. package/dist/tools/export-phoenix.test.js +316 -0
  195. package/dist/tools/export-phoenix.test.js.map +1 -0
  196. package/dist/tools/health-check.d.ts +26 -0
  197. package/dist/tools/health-check.d.ts.map +1 -1
  198. package/dist/tools/health-check.js +36 -7
  199. package/dist/tools/health-check.js.map +1 -1
  200. package/dist/tools/index.d.ts +6 -0
  201. package/dist/tools/index.d.ts.map +1 -1
  202. package/dist/tools/index.js +6 -0
  203. package/dist/tools/index.js.map +1 -1
  204. package/dist/tools/inject-evaluations.d.ts +1315 -0
  205. package/dist/tools/inject-evaluations.d.ts.map +1 -0
  206. package/dist/tools/inject-evaluations.js +121 -0
  207. package/dist/tools/inject-evaluations.js.map +1 -0
  208. package/dist/tools/inject-evaluations.test.d.ts +5 -0
  209. package/dist/tools/inject-evaluations.test.d.ts.map +1 -0
  210. package/dist/tools/inject-evaluations.test.js +359 -0
  211. package/dist/tools/inject-evaluations.test.js.map +1 -0
  212. package/dist/tools/query-evaluations.d.ts +25 -4
  213. package/dist/tools/query-evaluations.d.ts.map +1 -1
  214. package/dist/tools/query-evaluations.js +10 -0
  215. package/dist/tools/query-evaluations.js.map +1 -1
  216. package/dist/tools/query-llm-events.js +2 -2
  217. package/dist/tools/query-llm-events.js.map +1 -1
  218. package/dist/tools/query-logs.d.ts +8 -8
  219. package/dist/tools/query-logs.js +3 -3
  220. package/dist/tools/query-logs.js.map +1 -1
  221. package/dist/tools/query-metrics.d.ts +4 -4
  222. package/dist/tools/query-metrics.js +2 -2
  223. package/dist/tools/query-metrics.js.map +1 -1
  224. package/dist/tools/query-traces.d.ts +8 -8
  225. package/dist/tools/query-verifications.d.ts +111 -0
  226. package/dist/tools/query-verifications.d.ts.map +1 -0
  227. package/dist/tools/query-verifications.js +101 -0
  228. package/dist/tools/query-verifications.js.map +1 -0
  229. package/dist/tools/query-verifications.test.d.ts +5 -0
  230. package/dist/tools/query-verifications.test.d.ts.map +1 -0
  231. package/dist/tools/query-verifications.test.js +156 -0
  232. package/dist/tools/query-verifications.test.js.map +1 -0
  233. package/dist/types/evaluation-hooks.d.ts +176 -0
  234. package/dist/types/evaluation-hooks.d.ts.map +1 -0
  235. package/dist/types/evaluation-hooks.js +49 -0
  236. package/dist/types/evaluation-hooks.js.map +1 -0
  237. package/package.json +10 -2
@@ -0,0 +1,740 @@
1
+ /**
2
+ * Agent-as-Judge Implementation
3
+ *
4
+ * Provides patterns and utilities for evaluating AI agents using autonomous
5
+ * judge agents with planning, tool use, memory, and multi-agent collaboration.
6
+ *
7
+ * @security
8
+ * - All user inputs are sanitized via llm-as-judge utilities
9
+ * - Tool execution should use sandbox isolation
10
+ * - Memory is bounded to prevent resource exhaustion
11
+ *
12
+ * @see https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/
13
+ * @see docs/quality/agent-as-judge.md
14
+ */
15
+ import { MAX_STEP_SCORES, MAX_TOOL_VERIFICATIONS, MAX_STEP_ID_LENGTH, } from '../backends/index.js';
16
+ import { InputValidationError } from './input-validator.js';
17
+ // ============================================================================
18
+ // Constants
19
+ // ============================================================================
20
+ /** Maximum number of actions validateEvaluand accepts in an evaluand's trajectory */
21
+ export const MAX_TRAJECTORY_LENGTH = 1000;
22
+ /** Default timeout applied by withAgentTimeout, in milliseconds (60 seconds) */
23
+ export const DEFAULT_AGENT_EVAL_TIMEOUT_MS = 60000;
24
+ /** Maximum concurrent specialist evaluators; collectiveConsensus rejects longer judge lists */
25
+ export const MAX_CONCURRENT_EVALUATORS = 10;
26
+ /** Maximum consensus rounds for multi-agent evaluation; collectiveConsensus caps config.rounds at this value */
27
+ export const MAX_CONSENSUS_ROUNDS = 5;
28
+ /** Default variance threshold for consensus convergence */
29
+ export const DEFAULT_CONVERGENCE_THRESHOLD = 0.1;
30
+ /**
31
+ * Default threshold for early termination in procedural evaluation.
32
+ * If a critical stage scores below this threshold, evaluation terminates early.
33
+ */
34
+ export const DEFAULT_EARLY_TERMINATION_THRESHOLD = 0.3;
35
+ // ============================================================================
36
+ // Timeout Protection (H6)
37
+ // ============================================================================
/**
 * Raised when an agent evaluation does not finish within its time budget.
 *
 * The exceeded budget is kept on the instance so callers can report it or
 * decide how to retry.
 */
export class AgentEvalTimeoutError extends Error {
    /** Budget, in milliseconds, that the evaluation exceeded. */
    timeoutMs;
    /** Stable error name for `error.name` checks and log output. */
    name = 'AgentEvalTimeoutError';
    /**
     * @param timeoutMs - Timeout budget in milliseconds that was exceeded
     */
    constructor(timeoutMs) {
        super(`Agent evaluation timed out after ${timeoutMs}ms`);
        this.timeoutMs = timeoutMs;
    }
}
/**
 * Run `fn` and race it against a timer.
 *
 * Whichever settles first decides the outcome: the value or rejection of
 * `fn`, or an AgentEvalTimeoutError once `timeoutMs` elapses. The timer is
 * always cleared before returning, so a finished call leaves no pending
 * timer behind. `fn` itself is not cancelled when the timer wins.
 *
 * @param fn - Async function to execute
 * @param timeoutMs - Timeout in milliseconds (default: DEFAULT_AGENT_EVAL_TIMEOUT_MS)
 * @returns Whatever `fn` resolves to
 * @throws {AgentEvalTimeoutError} If the timer fires before `fn` settles
 */
export async function withAgentTimeout(fn, timeoutMs = DEFAULT_AGENT_EVAL_TIMEOUT_MS) {
    let timer;
    const deadline = new Promise((_resolve, reject) => {
        timer = setTimeout(() => {
            reject(new AgentEvalTimeoutError(timeoutMs));
        }, timeoutMs);
    });
    try {
        // `fn()` is invoked inside the try so a synchronous throw also clears the timer.
        return await Promise.race([fn(), deadline]);
    }
    finally {
        clearTimeout(timer);
    }
}
76
+ // ============================================================================
77
+ // Validation Utilities
78
+ // ============================================================================
/**
 * Validate an Evaluand object.
 *
 * Checks, in order: the evaluand is an object, `input` and `output` are
 * non-blank strings, and `actions` (when present) holds at most
 * MAX_TRAJECTORY_LENGTH entries.
 *
 * Non-string `input`/`output` values and a missing evaluand are reported as
 * InputValidationError rather than surfacing as a TypeError from `.trim()`
 * or property access.
 *
 * @param evaluand - Object with `input`, `output` and optional `actions`
 * @throws {InputValidationError} If validation fails
 */
export function validateEvaluand(evaluand) {
    if (evaluand === null || typeof evaluand !== 'object') {
        throw new InputValidationError('Evaluand is required and must be an object', 'evaluand', 'required');
    }
    if (typeof evaluand.input !== 'string' || evaluand.input.trim().length === 0) {
        throw new InputValidationError('Evaluand input is required', 'input', 'required');
    }
    if (typeof evaluand.output !== 'string' || evaluand.output.trim().length === 0) {
        throw new InputValidationError('Evaluand output is required', 'output', 'required');
    }
    if (evaluand.actions && evaluand.actions.length > MAX_TRAJECTORY_LENGTH) {
        throw new InputValidationError(`Actions array exceeds ${MAX_TRAJECTORY_LENGTH} limit`, 'actions', 'maxLength');
    }
}
/**
 * Check that a StepScore has a usable step identifier and a score in [0, 1].
 *
 * The step may be a string no longer than MAX_STEP_ID_LENGTH characters, or
 * a non-negative integer index. Any other type is rejected.
 *
 * @param stepScore - Object carrying `step` and `score`
 * @throws {InputValidationError} If validation fails
 */
export function validateStepScore(stepScore) {
    const step = stepScore.step;
    switch (typeof step) {
        case 'string':
            if (step.length > MAX_STEP_ID_LENGTH) {
                throw new InputValidationError(`Step identifier exceeds ${MAX_STEP_ID_LENGTH} characters`, 'step', 'maxLength');
            }
            break;
        case 'number':
            // NaN and fractional values fail the integer test; negatives fail the sign test.
            if (!(Number.isInteger(step) && step >= 0)) {
                throw new InputValidationError('Step index must be a non-negative integer', 'step', 'type');
            }
            break;
        default:
            throw new InputValidationError('Step must be a string or number', 'step', 'type');
    }
    const score = stepScore.score;
    const withinUnitInterval = Number.isFinite(score) && score >= 0 && score <= 1;
    if (!withinUnitInterval) {
        throw new InputValidationError('Score must be in range [0, 1]', 'score', 'range');
    }
}
/**
 * Validate a ToolVerification object.
 *
 * Requires a non-blank string `toolName`, boolean `toolCorrect` and
 * `argsCorrect` flags, and a finite `score` in [0, 1].
 *
 * A missing verification or a non-string `toolName` is reported as
 * InputValidationError rather than surfacing as a TypeError from property
 * access or `.trim()`.
 *
 * @param verification - Object carrying `toolName`, `toolCorrect`, `argsCorrect`, `score`
 * @throws {InputValidationError} If validation fails
 */
export function validateToolVerification(verification) {
    if (verification === null || typeof verification !== 'object') {
        throw new InputValidationError('Tool verification is required and must be an object', 'verification', 'required');
    }
    if (typeof verification.toolName !== 'string' || verification.toolName.trim().length === 0) {
        throw new InputValidationError('Tool name is required', 'toolName', 'required');
    }
    if (typeof verification.toolCorrect !== 'boolean') {
        throw new InputValidationError('toolCorrect must be a boolean', 'toolCorrect', 'type');
    }
    if (typeof verification.argsCorrect !== 'boolean') {
        throw new InputValidationError('argsCorrect must be a boolean', 'argsCorrect', 'type');
    }
    if (!Number.isFinite(verification.score) || verification.score < 0 || verification.score > 1) {
        throw new InputValidationError('Score must be in range [0, 1]', 'score', 'range');
    }
}
136
+ // ============================================================================
137
+ // Tool Verification Utilities
138
+ // ============================================================================
/**
 * Verify a single tool call against expected behavior.
 *
 * An expectation counts as "provided" when it is not `undefined`. `null` is
 * therefore a real expected value for both arguments and result, and is
 * compared with deepEquals like any other value. The tool-name expectation
 * is the exception: an empty or missing `expectedTool` means "any tool".
 *
 * @param action - The agent action containing the tool call
 * @param expectedTool - Expected tool name (optional)
 * @param expectedArgs - Expected arguments (optional)
 * @param actualResult - Actual result to compare against expected (optional);
 *   falls back to `action.result` when null or undefined
 * @param expectedResult - Expected result for comparison (optional)
 * @returns ToolVerification result with weighted score, normalized by the
 *   weights that were actually in play:
 *   - Tool selection: 40% weight (always counted)
 *   - Arguments: 30% weight (if expectedArgs provided)
 *   - Result: 30% weight (if expectedResult provided)
 * @throws {InputValidationError} If action is null, undefined, or not an object
 *
 * @example
 * ```typescript
 * const verification = verifyToolCall(
 *   { type: 'tool_call', tool: 'search', arguments: { query: 'test' } },
 *   'search',
 *   { query: 'test' }
 * );
 * console.log(verification.score); // 1.0 (all correct)
 * ```
 */
export function verifyToolCall(action, expectedTool, expectedArgs, actualResult, expectedResult) {
    // H1: Validate action parameter
    if (!action || typeof action !== 'object') {
        throw new InputValidationError('Action is required and must be an object', 'action', 'required');
    }
    // The same "provided" test drives both the correctness flags and the
    // weights, so a flag can never claim success for a comparison that was
    // skipped while its weight is still counted.
    const hasExpectedArgs = expectedArgs !== undefined;
    const hasExpectedResult = expectedResult !== undefined;
    const observedResult = actualResult ?? action.result;
    const toolCorrect = expectedTool ? action.tool === expectedTool : true;
    const argsCorrect = hasExpectedArgs
        ? deepEquals(action.arguments, expectedArgs)
        : true;
    const resultCorrect = hasExpectedResult
        ? deepEquals(observedResult, expectedResult)
        : undefined;
    // Tool selection: 40% weight, always part of the score
    let score = toolCorrect ? 0.4 : 0;
    let weights = 0.4;
    // Arguments: 30% weight
    if (hasExpectedArgs) {
        score += (argsCorrect ? 0.3 : 0);
        weights += 0.3;
    }
    // Result: 30% weight (only if expected result provided)
    if (hasExpectedResult) {
        score += (resultCorrect ? 0.3 : 0);
        weights += 0.3;
    }
    return {
        toolName: action.tool || 'unknown',
        toolCallId: action.toolCallId,
        toolCorrect,
        argsCorrect,
        resultCorrect,
        // weights is at least 0.4, so the division is always defined
        score: score / weights,
        expectedTool,
        evidence: {
            actualTool: action.tool,
            actualArgs: action.arguments,
            actualResult: observedResult,
            expectedTool,
            expectedArgs,
            expectedResult,
        },
    };
}
/**
 * Run verifyToolCall over every tool call in an agent trajectory.
 *
 * Actions that are not of type 'tool_call', or that carry no tool name, are
 * skipped. Collection stops once MAX_TOOL_VERIFICATIONS results exist.
 * An expectation registered under the action's index wins over one
 * registered under its toolCallId.
 *
 * @param actions - List of agent actions
 * @param expectedTools - Map of expected tool calls by index or toolCallId
 * @returns Array of ToolVerification results
 */
export function verifyToolCalls(actions, expectedTools) {
    const results = [];
    for (let index = 0; index < actions.length; index += 1) {
        if (results.length >= MAX_TOOL_VERIFICATIONS) {
            break;
        }
        const candidate = actions[index];
        const isNamedToolCall = candidate.type === 'tool_call' && Boolean(candidate.tool);
        if (!isNamedToolCall) {
            continue;
        }
        // Positional expectation first, then fall back to the call id.
        let expectation = expectedTools?.get(index);
        expectation ??= candidate.toolCallId
            ? expectedTools?.get(candidate.toolCallId)
            : undefined;
        results.push(verifyToolCall(candidate, expectation?.tool, expectation?.args, undefined, expectation?.result));
    }
    return results;
}
232
+ // ============================================================================
233
+ // Step Scoring Utilities
234
+ // ============================================================================
/**
 * Build the StepScore for one step of an agent trajectory.
 *
 * The judge's score is clamped into [0, 1]. The resulting record is then
 * passed through validateStepScore before it is returned.
 *
 * @param action - The agent action to score
 * @param stepIndex - Index of this step (non-negative integer)
 * @param evaluation - Score and explanation from LLM judge
 * @returns StepScore result with score clamped to [0, 1]
 * @throws {InputValidationError} If evaluation.score is not a finite number
 * @throws {InputValidationError} If stepIndex is invalid (validated via validateStepScore)
 *
 * @example
 * ```typescript
 * const step = scoreStep(
 *   { type: 'tool_call', tool: 'search' },
 *   0,
 *   { score: 0.85, explanation: 'Correct tool selection' }
 * );
 * ```
 */
export function scoreStep(action, stepIndex, evaluation) {
    const rawScore = evaluation.score;
    // H4: reject non-numbers, NaN and infinities before clamping
    const isUsableScore = typeof rawScore === 'number' && Number.isFinite(rawScore);
    if (!isUsableScore) {
        throw new InputValidationError('Evaluation score must be a finite number', 'score', 'type');
    }
    const clampedScore = Math.max(0, Math.min(1, rawScore));
    const evidence = {
        actionType: action.type,
        tool: action.tool,
        reasoning: action.reasoning,
    };
    const result = {
        step: stepIndex,
        score: clampedScore,
        explanation: evaluation.explanation,
        evidence,
    };
    validateStepScore(result);
    return result;
}
/**
 * Calculate aggregate step scores from individual scores.
 *
 * @param stepScores - Array of step scores
 * @param aggregation - Aggregation method ('average', 'weighted', 'min')
 * @param weights - Weights for weighted aggregation (must match stepScores length, finite non-negative)
 * @returns Aggregated score (0-1)
 *   - Empty array returns 1 (no steps to fail = vacuously perfect); this
 *     check runs before the aggregation method is inspected
 *   - Weighted aggregation with all-zero weights returns 0
 * @throws {Error} If aggregation is 'weighted' but weights array is missing or wrong length
 * @throws {Error} If any weight is negative, NaN, or Infinity
 * @throws {Error} If aggregation is not one of the supported methods
 *
 * @example
 * ```typescript
 * // Simple average
 * const avg = aggregateStepScores(steps, 'average');
 *
 * // Weighted (emphasize later steps)
 * const weighted = aggregateStepScores(steps, 'weighted', [1, 2, 3]);
 *
 * // Minimum score (most strict)
 * const min = aggregateStepScores(steps, 'min');
 *
 * // Empty array returns 1 (vacuously true)
 * aggregateStepScores([], 'average'); // => 1
 * ```
 */
export function aggregateStepScores(stepScores, aggregation = 'average', weights) {
    // L7: Empty array returns 1 (vacuously perfect - no steps to fail)
    if (stepScores.length === 0) {
        return 1;
    }
    switch (aggregation) {
        case 'average':
            return stepScores.reduce((sum, s) => sum + s.score, 0) / stepScores.length;
        case 'weighted': {
            if (!weights || weights.length !== stepScores.length) {
                throw new Error('Weights required for weighted aggregation');
            }
            // L8/M1: Validate weights are finite non-negative numbers
            for (let i = 0; i < weights.length; i++) {
                if (weights[i] < 0 || !Number.isFinite(weights[i])) {
                    throw new Error(`Invalid weight at index ${i}: ${weights[i]}. Weights must be finite non-negative numbers.`);
                }
            }
            const totalWeight = weights.reduce((sum, w) => sum + w, 0);
            if (totalWeight === 0) {
                return 0;
            }
            return stepScores.reduce((sum, s, i) => sum + s.score * weights[i], 0) / totalWeight;
        }
        case 'min':
            // Fold instead of Math.min(...scores): spreading a very large array
            // into call arguments can exceed the engine's argument limit.
            return stepScores.reduce((lowest, s) => Math.min(lowest, s.score), Infinity);
        default:
            // Fail loudly rather than returning undefined as a "score".
            throw new Error(`Unknown aggregation method: ${String(aggregation)}`);
    }
}
324
+ // ============================================================================
325
+ // Trajectory Analysis Utilities
326
+ // ============================================================================
/**
 * Summarize how efficiently an agent moved through its trajectory.
 *
 * Redundancy is detected by comparing the JSON serialization of each tool
 * call's `{ tool, args }` pair. Calls whose arguments cannot be serialized
 * (for example circular structures) are left out of the redundancy count.
 * For large argument objects a hash-based comparison may perform better.
 *
 * @param evaluand - The evaluand with actions
 * @param optimalLength - Expected optimal trajectory length (optional)
 * @returns Efficiency metrics: total length, tool call count, number of
 *   distinct tool names, efficiency ratio (capped at 1; 1 when no positive
 *   optimal length is given) and the number of repeated tool calls
 */
export function analyzeTrajectory(evaluand, optimalLength) {
    const trajectory = evaluand.actions || [];
    const toolInvocations = trajectory.filter(step => step.type === 'tool_call');
    // Distinct tool names, ignoring calls that carry no name
    const distinctToolNames = new Set();
    for (const call of toolInvocations) {
        if (call.tool) {
            distinctToolNames.add(call.tool);
        }
    }
    // Efficiency ratio is only meaningful when a positive optimum is known
    let efficiencyRatio = 1;
    if (optimalLength && optimalLength > 0) {
        efficiencyRatio = Math.min(1, optimalLength / Math.max(trajectory.length, 1));
    }
    // A call is redundant when the same tool was already invoked with the same args
    const fingerprints = new Set();
    let redundantActions = 0;
    for (const call of toolInvocations) {
        let fingerprint;
        try {
            fingerprint = JSON.stringify({ tool: call.tool, args: call.arguments });
        }
        catch {
            // M2: non-serializable arguments - skip this call for redundancy detection
            continue;
        }
        if (fingerprints.has(fingerprint)) {
            redundantActions += 1;
        }
        else {
            fingerprints.add(fingerprint);
        }
    }
    return {
        length: trajectory.length,
        toolCallCount: toolInvocations.length,
        uniqueTools: distinctToolNames.size,
        efficiencyRatio,
        redundantActions,
    };
}
372
+ // ============================================================================
373
+ // Multi-Agent Consensus Utilities
374
+ // ============================================================================
/**
 * Sample variance of a list of scores, used for convergence detection.
 *
 * Divides the summed squared deviations by (n - 1) (Bessel's correction).
 *
 * @param scores - Array of score values
 * @returns Sample variance (0 for empty or single-element arrays)
 */
export function calculateVariance(scores) {
    const count = scores.length;
    // Fewer than two samples: no spread can be measured
    if (count < 2) {
        return 0;
    }
    const mean = scores.reduce((total, value) => total + value, 0) / count;
    const sumOfSquaredDeviations = scores.reduce((total, value) => total + (value - mean) ** 2, 0);
    return sumOfSquaredDeviations / (count - 1);
}
/**
 * Median of a list of scores, used as the final consensus value.
 *
 * Works on a sorted copy, so the caller's array is left untouched.
 *
 * @param scores - Array of score values
 * @returns The middle value, the mean of the two middle values for an
 *   even-length list, or 0 for an empty list
 */
export function calculateMedian(scores) {
    const count = scores.length;
    if (count === 0) {
        return 0;
    }
    const ascending = Array.from(scores).sort((left, right) => left - right);
    const upperMiddle = Math.trunc(count / 2);
    if (count % 2 !== 0) {
        return ascending[upperMiddle];
    }
    return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
/**
 * Run multi-agent consensus evaluation.
 *
 * Each round calls every judge's `evaluate(evaluand, scores)` concurrently.
 * Promise.allSettled is used for graceful degradation - partial failures
 * don't abort the entire consensus. At least one judge must succeed per round.
 *
 * Only finite scores in [0, 1] are recorded, and only those recorded scores
 * take part in the convergence check. The run stops early once the sample
 * variance of a round's recorded scores drops below
 * `config.convergenceThreshold`.
 *
 * The final score is the median of each judge's most recently recorded
 * score. A judge that failed or returned an invalid score in the last round
 * therefore contributes its score from an earlier round, if it has one.
 *
 * @param evaluand - Subject to evaluate
 * @param judges - Array of judges, each with an `id` and an async `evaluate` function
 * @param config - Consensus configuration (`rounds`, `convergenceThreshold`)
 * @returns ConsensusResult with `finalScore`, `convergenceRound`,
 *   `judgeScores` (Map of judge id to recorded scores) and `converged`
 * @throws {InputValidationError} If judges array is empty or exceeds MAX_CONCURRENT_EVALUATORS
 * @throws {InputValidationError} If config.rounds is missing, NaN, or less than 1
 * @throws {Error} If all judges fail in any round
 */
export async function collectiveConsensus(evaluand, judges, config) {
    validateEvaluand(evaluand);
    // H2: Validate judge count to prevent unbounded memory growth
    if (judges.length > MAX_CONCURRENT_EVALUATORS) {
        throw new InputValidationError(`Number of judges (${judges.length}) exceeds ${MAX_CONCURRENT_EVALUATORS} limit`, 'judges', 'maxLength');
    }
    if (judges.length === 0) {
        throw new InputValidationError('At least one judge is required', 'judges', 'required');
    }
    // M9: Validate rounds is positive. Written as a negated `>=` so that NaN
    // and undefined are rejected too; `rounds < 1` is false for both, which
    // would let the call proceed with zero rounds and a NaN convergenceRound.
    if (!(config.rounds >= 1)) {
        throw new InputValidationError('Rounds must be at least 1', 'rounds', 'range');
    }
    const rounds = Math.min(config.rounds, MAX_CONSENSUS_ROUNDS);
    const scores = new Map();
    // Initialize score arrays
    for (const judge of judges) {
        scores.set(judge.id, []);
    }
    let convergenceRound = rounds;
    let converged = false;
    for (let round = 0; round < rounds; round++) {
        // H3: Use Promise.allSettled for graceful degradation
        const roundResults = await Promise.allSettled(judges.map(async (judge) => {
            const score = await judge.evaluate(evaluand, scores);
            return { id: judge.id, score };
        }));
        // Filter successful results
        const successfulResults = roundResults
            .filter((r) => r.status === 'fulfilled')
            .map(r => r.value);
        // H3: Require at least one successful judge
        if (successfulResults.length === 0) {
            throw new Error(`All judge evaluations failed in round ${round + 1}`);
        }
        // H4/M1: Record only finite scores in [0, 1], and remember them for the
        // convergence check so rejected values cannot influence it.
        const recordedThisRound = [];
        for (const { id, score } of successfulResults) {
            if (Number.isFinite(score) && score >= 0 && score <= 1) {
                scores.get(id).push(score);
                recordedThisRound.push(score);
            }
        }
        // Check convergence using exactly the scores that were recorded
        if (recordedThisRound.length > 0 && calculateVariance(recordedThisRound) < config.convergenceThreshold) {
            convergenceRound = round + 1;
            converged = true;
            break;
        }
    }
    // Final score is the median of each judge's latest recorded score
    const latestScores = [];
    for (const judge of judges) {
        const history = scores.get(judge.id);
        if (history.length > 0) {
            latestScores.push(history[history.length - 1]);
        }
    }
    return {
        finalScore: latestScores.length > 0 ? calculateMedian(latestScores) : 0,
        convergenceRound,
        judgeScores: scores,
        converged,
    };
}
485
+ // ============================================================================
486
+ // Abstract Agent Judge Base Class
487
+ // ============================================================================
488
/** Maximum entries in judge memory before LRU eviction */
const MAX_JUDGE_MEMORY_SIZE = 1000;
/**
 * Abstract base class for Agent-as-Judge implementations.
 *
 * Provides common infrastructure for memory, tool access, and evaluation flow.
 * Subclasses implement specific evaluation strategies (procedural, reactive, self-evolving)
 * and supply the `name` that toEvaluationResult() reports as the evaluator.
 */
export class AgentJudge {
    /**
     * Key/value store whose iteration order doubles as recency order:
     * the first key is the least recently used, the last the most recent.
     */
    memory = new Map();
    /**
     * Store intermediate state in memory with LRU eviction.
     *
     * @param key - Entry key; any value usable as a Map key, including '' and 0
     * @param value - Value to remember
     * @security Memory is bounded to MAX_JUDGE_MEMORY_SIZE entries
     */
    storeInMemory(key, value) {
        // H5: Proper LRU - drop any existing entry so the set() below re-inserts it at the end
        this.memory.delete(key);
        // Bound memory size to prevent exhaustion
        if (this.memory.size >= MAX_JUDGE_MEMORY_SIZE) {
            // Evict least recently used (first entry in insertion order). The
            // iterator's `done` flag is checked rather than the key's
            // truthiness so that falsy keys such as '' or 0 are evicted too.
            const oldest = this.memory.keys().next();
            if (!oldest.done) {
                this.memory.delete(oldest.value);
            }
        }
        this.memory.set(key, value);
    }
    /**
     * Retrieve state from memory with LRU update.
     *
     * @param key - Entry key
     * @returns The stored value, or undefined when the key is absent
     * @note Access moves item to end (most recently used)
     */
    getFromMemory(key) {
        // has() rather than a value check, so entries holding undefined are refreshed as well
        if (!this.memory.has(key)) {
            return undefined;
        }
        const value = this.memory.get(key);
        // H5: Re-insert to move to end (most recently used)
        this.memory.delete(key);
        this.memory.set(key, value);
        return value;
    }
    /**
     * Clear all memory.
     */
    clearMemory() {
        this.memory.clear();
    }
    /**
     * Convert AgentEvalResult to OTel-compatible EvaluationResult.
     *
     * @param result - Agent evaluation result to convert
     * @param evaluand - Evaluated subject; supplies agentId and agentName
     * @param evaluatorType - Evaluator type label, defaults to 'llm'
     * @returns Flattened result with stepScores and toolVerifications
     *          truncated to MAX_STEP_SCORES / MAX_TOOL_VERIFICATIONS entries
     */
    toEvaluationResult(result, evaluand, evaluatorType = 'llm') {
        return {
            scoreValue: result.overallScore,
            explanation: result.explanation,
            evaluator: this.name,
            evaluatorType,
            agentId: evaluand.agentId,
            agentName: evaluand.agentName,
            stepScores: result.stepScores.slice(0, MAX_STEP_SCORES),
            toolVerifications: result.toolVerifications.slice(0, MAX_TOOL_VERIFICATIONS),
            trajectoryLength: result.trajectoryLength,
        };
    }
}
552
/**
 * Procedural Agent Judge - Fixed evaluation pipeline.
 *
 * Executes a predefined sequence of evaluation stages.
 * Best for domain-specific evaluations with known criteria.
 */
export class ProceduralJudge extends AgentJudge {
    stages;
    earlyTerminationOn;
    name = 'procedural-agent-judge';
    /**
     * Create a ProceduralJudge with a sequence of evaluation stages.
     *
     * @param stages - Array of evaluation stages to execute in order
     * @param earlyTerminationOn - Optional stage name that triggers early termination on failure
     * @throws {InputValidationError} If stages array is empty
     * @throws {InputValidationError} If any stage is missing, or has an empty, whitespace-only or non-string name
     * @throws {InputValidationError} If any stage has missing evaluate function
     * @throws {InputValidationError} If earlyTerminationOn references non-existent stage
     */
    constructor(stages, earlyTerminationOn) {
        super();
        this.stages = stages;
        this.earlyTerminationOn = earlyTerminationOn;
        // M8: Validate stages array
        if (!stages || stages.length === 0) {
            throw new InputValidationError('ProceduralJudge requires at least one stage', 'stages', 'required');
        }
        // Validate each stage
        for (let i = 0; i < stages.length; i++) {
            const stage = stages[i];
            // The typeof guard keeps a null entry or a non-string name from
            // surfacing as a raw TypeError out of `.trim()`; every malformed
            // stage is reported as an InputValidationError instead.
            if (!stage || typeof stage.name !== 'string' || stage.name.trim().length === 0) {
                throw new InputValidationError(`Stage ${i} has invalid name "${stage?.name ?? 'undefined'}" - must be non-empty and not just whitespace`, 'stages', 'required');
            }
            if (typeof stage.evaluate !== 'function') {
                throw new InputValidationError(`Stage ${i} (${stage.name}) must have an evaluate function`, 'stages', 'type');
            }
        }
        // Validate earlyTerminationOn if provided
        if (earlyTerminationOn !== undefined) {
            const stageNames = stages.map(s => s.name);
            if (!stageNames.includes(earlyTerminationOn)) {
                throw new InputValidationError(`earlyTerminationOn stage '${earlyTerminationOn}' not found in stages: ${stageNames.join(', ')}`, 'earlyTerminationOn', 'invalid');
            }
        }
    }
    /**
     * Run every stage in order and aggregate the per-stage scores.
     *
     * Each stage receives the evaluand plus a context object holding the
     * results of the stages that ran before it, keyed by stage name.
     *
     * @param evaluand - Subject to evaluate
     * @returns Evaluation result; overallScore is 0 when the earlyTerminationOn
     *          stage scores below DEFAULT_EARLY_TERMINATION_THRESHOLD
     */
    async evaluate(evaluand) {
        // M3: Wrap entire evaluation in timeout to prevent indefinite hangs
        return withAgentTimeout(async () => {
            validateEvaluand(evaluand);
            const context = {};
            const stepScores = [];
            const trajectory = analyzeTrajectory(evaluand);
            for (let i = 0; i < this.stages.length; i++) {
                const stage = this.stages[i];
                const result = await stage.evaluate(evaluand, context);
                const stepScore = scoreStep({ type: 'evaluation_stage', reasoning: stage.name }, i, result);
                stepScores.push(stepScore);
                // Make this stage's result visible to the stages that follow
                context[stage.name] = result;
                // L10: Early termination if configured and stage fails below threshold
                if (this.earlyTerminationOn === stage.name && result.score < DEFAULT_EARLY_TERMINATION_THRESHOLD) {
                    return {
                        overallScore: 0,
                        stepScores,
                        toolVerifications: verifyToolCalls(evaluand.actions || []),
                        trajectoryLength: trajectory.length,
                        explanation: `Early termination: ${stage.name} failed with score ${result.score}`,
                        actionableFeedback: [result.explanation],
                    };
                }
            }
            return {
                overallScore: aggregateStepScores(stepScores),
                stepScores,
                toolVerifications: verifyToolCalls(evaluand.actions || []),
                trajectoryLength: trajectory.length,
                explanation: `Procedural evaluation completed ${this.stages.length} stages`,
                // Only steps scoring below 0.7 produce feedback
                actionableFeedback: stepScores
                    .filter(s => s.score < 0.7)
                    .map(s => s.explanation || `Step ${s.step} needs improvement`),
            };
        });
    }
}
636
/**
 * Reactive Agent Judge - Adaptive evaluation based on content.
 *
 * Routes evaluation to appropriate specialists based on initial analysis.
 * Supports deep-dive evaluation when initial analysis indicates issues.
 */
export class ReactiveJudge extends AgentJudge {
    router;
    specialists;
    deepDiveSpecialists;
    name = 'reactive-agent-judge';
    /**
     * @param router - Function called with the evaluand; resolves to the ordered list of specialist names to run
     * @param specialists - Lookup (via `.get(name)`) from specialist name to its evaluation function
     * @param deepDiveSpecialists - Optional lookup (via `.get(name)`) from specialist name to a follow-up evaluation function
     */
    constructor(router, specialists, deepDiveSpecialists) {
        super();
        this.router = router;
        this.specialists = specialists;
        this.deepDiveSpecialists = deepDiveSpecialists;
    }
    /**
     * Run the specialists chosen by the router and aggregate their scores.
     *
     * Names returned by the router that have no registered specialist are
     * skipped. Each specialist result is stored in memory under
     * `eval_<name>`; a result with `needsDeepDive` set additionally runs the
     * matching deep-dive specialist, when one is registered.
     *
     * @param evaluand - Subject to evaluate
     * @returns Evaluation result aggregated over all specialist and deep-dive steps
     */
    async evaluate(evaluand) {
        // M3: Wrap entire evaluation in timeout to prevent indefinite hangs
        return withAgentTimeout(async () => {
            validateEvaluand(evaluand);
            // Route to relevant specialists
            const relevantSpecialists = await this.router(evaluand);
            const trajectory = analyzeTrajectory(evaluand);
            const stepScores = [];
            for (const specialistName of relevantSpecialists) {
                const specialist = this.specialists.get(specialistName);
                if (!specialist)
                    continue;
                const result = await specialist(evaluand);
                // Step indices come from stepScores.length for specialist and
                // deep-dive steps alike, so they stay unique and contiguous
                // even after a deep dive or a skipped specialist.
                const stepScore = scoreStep({ type: 'specialist_evaluation', reasoning: specialistName }, stepScores.length, result);
                stepScores.push(stepScore);
                this.storeInMemory(`eval_${specialistName}`, result);
                // Trigger deep dive if needed
                if (result.needsDeepDive && this.deepDiveSpecialists) {
                    const deepDive = this.deepDiveSpecialists.get(specialistName);
                    if (deepDive) {
                        const deepResult = await deepDive(evaluand);
                        const deepStepScore = scoreStep({ type: 'deep_dive_evaluation', reasoning: `${specialistName}_deep` }, stepScores.length, deepResult);
                        stepScores.push(deepStepScore);
                    }
                }
            }
            return {
                overallScore: aggregateStepScores(stepScores),
                stepScores,
                toolVerifications: verifyToolCalls(evaluand.actions || []),
                trajectoryLength: trajectory.length,
                explanation: `Reactive evaluation engaged ${relevantSpecialists.length} specialists`,
                // Only steps scoring below 0.7 produce feedback; the fallback
                // wording matches the one ProceduralJudge emits.
                actionableFeedback: stepScores
                    .filter(s => s.score < 0.7)
                    .map(s => s.explanation || `Step ${s.step} needs improvement`),
            };
        });
    }
}
693
+ // ============================================================================
694
+ // Helper Functions
695
+ // ============================================================================
696
/**
 * Deep equality comparison for objects.
 * Used for comparing tool arguments and results.
 *
 * Primitives are compared with `===`. Arrays must match in length and
 * element-wise; plain objects must have the same own enumerable keys with
 * deeply equal values. An array never equals a non-array object.
 *
 * M3: Includes circular reference protection using WeakSet tracking. Only
 * objects on the current descent path are tracked, so the same object may
 * legitimately appear several times in a structure without being mistaken
 * for a cycle.
 *
 * @param a - First value to compare
 * @param b - Second value to compare
 * @param seenA - WeakSet tracking objects from 'a' currently being visited (for cycle detection)
 * @param seenB - WeakSet tracking objects from 'b' currently being visited (for cycle detection)
 * @returns True if values are deeply equal; false for structures containing a real cycle
 */
function deepEquals(a, b, seenA = new WeakSet(), seenB = new WeakSet()) {
    if (a === b)
        return true;
    // Past this point a !== b, so null on either side or any non-object pair cannot match
    if (a === null || b === null)
        return false;
    if (typeof a !== typeof b || typeof a !== 'object')
        return false;
    // M3: Circular reference protection
    if (seenA.has(a) || seenB.has(b)) {
        // Re-entering an object that is still being compared means a cycle;
        // consider unequal to be safe (matching circular structures is
        // complex and rarely needed)
        return false;
    }
    seenA.add(a);
    seenB.add(b);
    try {
        const aIsArray = Array.isArray(a);
        if (aIsArray !== Array.isArray(b))
            return false;
        if (aIsArray) {
            return a.length === b.length
                && a.every((val, i) => deepEquals(val, b[i], seenA, seenB));
        }
        const aKeys = Object.keys(a);
        if (aKeys.length !== Object.keys(b).length)
            return false;
        // The own-property check stops a missing key from being treated as a
        // match for a key whose value is explicitly undefined
        return aKeys.every(key => Object.prototype.hasOwnProperty.call(b, key)
            && deepEquals(a[key], b[key], seenA, seenB));
    }
    finally {
        // Unwind the path markers so repeated (non-circular) references to
        // the same object elsewhere in the structure can still be compared
        seenA.delete(a);
        seenB.delete(b);
    }
}
740
+ //# sourceMappingURL=agent-as-judge.js.map