npm - observability-toolkit - Versions diffs - 1.8.4 → 2.0.0 - Mend

observability-toolkit 1.8.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (364)

package/README.md +126 -5
package/dist/backends/index.d.ts +163 -0
package/dist/backends/index.d.ts.map +1 -1
package/dist/backends/index.js +57 -0
package/dist/backends/index.js.map +1 -1
package/dist/backends/index.test.js +55 -1
package/dist/backends/index.test.js.map +1 -1
package/dist/backends/local-jsonl-boolean-search.test.js +8 -8
package/dist/backends/local-jsonl-boolean-search.test.js.map +1 -1
package/dist/backends/local-jsonl-cache.test.d.ts +2 -0
package/dist/backends/local-jsonl-cache.test.d.ts.map +1 -0
package/dist/backends/local-jsonl-cache.test.js +295 -0
package/dist/backends/local-jsonl-cache.test.js.map +1 -0
package/dist/backends/local-jsonl-circuit-breaker.test.d.ts +2 -0
package/dist/backends/local-jsonl-circuit-breaker.test.d.ts.map +1 -0
package/dist/backends/local-jsonl-circuit-breaker.test.js +180 -0
package/dist/backends/local-jsonl-circuit-breaker.test.js.map +1 -0
package/dist/backends/local-jsonl-export.test.d.ts +2 -0
package/dist/backends/local-jsonl-export.test.d.ts.map +1 -0
package/dist/backends/local-jsonl-export.test.js +704 -0
package/dist/backends/local-jsonl-export.test.js.map +1 -0
package/dist/backends/local-jsonl-index.test.d.ts +2 -0
package/dist/backends/local-jsonl-index.test.d.ts.map +1 -0
package/dist/backends/local-jsonl-index.test.js +554 -0
package/dist/backends/local-jsonl-index.test.js.map +1 -0
package/dist/backends/local-jsonl-logs.test.js +52 -43
package/dist/backends/local-jsonl-logs.test.js.map +1 -1
package/dist/backends/local-jsonl-metrics.test.d.ts +2 -0
package/dist/backends/local-jsonl-metrics.test.d.ts.map +1 -0
package/dist/backends/local-jsonl-metrics.test.js +876 -0
package/dist/backends/local-jsonl-metrics.test.js.map +1 -0
package/dist/backends/local-jsonl-traces.test.js +89 -83
package/dist/backends/local-jsonl-traces.test.js.map +1 -1
package/dist/backends/local-jsonl.d.ts +39 -0
package/dist/backends/local-jsonl.d.ts.map +1 -1
package/dist/backends/local-jsonl.js +975 -492
package/dist/backends/local-jsonl.js.map +1 -1
package/dist/backends/signoz-api-circuit-breaker.test.d.ts +6 -0
package/dist/backends/signoz-api-circuit-breaker.test.d.ts.map +1 -0
package/dist/backends/signoz-api-circuit-breaker.test.js +548 -0
package/dist/backends/signoz-api-circuit-breaker.test.js.map +1 -0
package/dist/backends/signoz-api-rate-limiter.test.d.ts +6 -0
package/dist/backends/signoz-api-rate-limiter.test.d.ts.map +1 -0
package/dist/backends/signoz-api-rate-limiter.test.js +390 -0
package/dist/backends/signoz-api-rate-limiter.test.js.map +1 -0
package/dist/backends/signoz-api-ssrf.test.d.ts +6 -0
package/dist/backends/signoz-api-ssrf.test.d.ts.map +1 -0
package/dist/backends/signoz-api-ssrf.test.js +216 -0
package/dist/backends/signoz-api-ssrf.test.js.map +1 -0
package/dist/backends/signoz-api-test-helpers.d.ts +80 -0
package/dist/backends/signoz-api-test-helpers.d.ts.map +1 -0
package/dist/backends/signoz-api-test-helpers.js +79 -0
package/dist/backends/signoz-api-test-helpers.js.map +1 -0
package/dist/backends/signoz-api.d.ts +31 -1
package/dist/backends/signoz-api.d.ts.map +1 -1
package/dist/backends/signoz-api.js +717 -539
package/dist/backends/signoz-api.js.map +1 -1
package/dist/backends/signoz-api.test.d.ts +9 -0
package/dist/backends/signoz-api.test.d.ts.map +1 -1
package/dist/backends/signoz-api.test.js +20 -1032
package/dist/backends/signoz-api.test.js.map +1 -1
package/dist/lib/agent-as-judge.d.ts +388 -0
package/dist/lib/agent-as-judge.d.ts.map +1 -0
package/dist/lib/agent-as-judge.js +740 -0
package/dist/lib/agent-as-judge.js.map +1 -0
package/dist/lib/agent-as-judge.test.d.ts +5 -0
package/dist/lib/agent-as-judge.test.d.ts.map +1 -0
package/dist/lib/agent-as-judge.test.js +816 -0
package/dist/lib/agent-as-judge.test.js.map +1 -0
package/dist/lib/cache.d.ts +61 -2
package/dist/lib/cache.d.ts.map +1 -1
package/dist/lib/cache.js +54 -3
package/dist/lib/cache.js.map +1 -1
package/dist/lib/circuit-breaker.d.ts +101 -0
package/dist/lib/circuit-breaker.d.ts.map +1 -0
package/dist/lib/circuit-breaker.js +158 -0
package/dist/lib/circuit-breaker.js.map +1 -0
package/dist/lib/circuit-breaker.test.d.ts +2 -0
package/dist/lib/circuit-breaker.test.d.ts.map +1 -0
package/dist/lib/circuit-breaker.test.js +263 -0
package/dist/lib/circuit-breaker.test.js.map +1 -0
package/dist/lib/confident-export.d.ts +101 -0
package/dist/lib/confident-export.d.ts.map +1 -0
package/dist/lib/confident-export.js +393 -0
package/dist/lib/confident-export.js.map +1 -0
package/dist/lib/confident-export.test.d.ts +7 -0
package/dist/lib/confident-export.test.d.ts.map +1 -0
package/dist/lib/confident-export.test.js +835 -0
package/dist/lib/confident-export.test.js.map +1 -0
package/dist/lib/constants-symlink.test.d.ts +12 -0
package/dist/lib/constants-symlink.test.d.ts.map +1 -0
package/dist/lib/constants-symlink.test.js +357 -0
package/dist/lib/constants-symlink.test.js.map +1 -0
package/dist/lib/constants.d.ts +75 -0
package/dist/lib/constants.d.ts.map +1 -1
package/dist/lib/constants.js +104 -1
package/dist/lib/constants.js.map +1 -1
package/dist/lib/datadog-export.d.ts +156 -0
package/dist/lib/datadog-export.d.ts.map +1 -0
package/dist/lib/datadog-export.js +464 -0
package/dist/lib/datadog-export.js.map +1 -0
package/dist/lib/datadog-export.test.d.ts +14 -0
package/dist/lib/datadog-export.test.d.ts.map +1 -0
package/dist/lib/datadog-export.test.js +890 -0
package/dist/lib/datadog-export.test.js.map +1 -0
package/dist/lib/edge-cases.test.js +17 -17
package/dist/lib/edge-cases.test.js.map +1 -1
package/dist/lib/error-sanitizer.d.ts.map +1 -1
package/dist/lib/error-sanitizer.js +29 -3
package/dist/lib/error-sanitizer.js.map +1 -1
package/dist/lib/error-sanitizer.test.js +159 -0
package/dist/lib/error-sanitizer.test.js.map +1 -1
package/dist/lib/error-types.d.ts +54 -0
package/dist/lib/error-types.d.ts.map +1 -0
package/dist/lib/error-types.js +154 -0
package/dist/lib/error-types.js.map +1 -0
package/dist/lib/error-types.test.d.ts +2 -0
package/dist/lib/error-types.test.d.ts.map +1 -0
package/dist/lib/error-types.test.js +196 -0
package/dist/lib/error-types.test.js.map +1 -0
package/dist/lib/evaluation-hooks.d.ts +49 -0
package/dist/lib/evaluation-hooks.d.ts.map +1 -0
package/dist/lib/evaluation-hooks.js +488 -0
package/dist/lib/evaluation-hooks.js.map +1 -0
package/dist/lib/evaluation-hooks.test.d.ts +8 -0
package/dist/lib/evaluation-hooks.test.d.ts.map +1 -0
package/dist/lib/evaluation-hooks.test.js +624 -0
package/dist/lib/evaluation-hooks.test.js.map +1 -0
package/dist/lib/export-utils.d.ts +99 -0
package/dist/lib/export-utils.d.ts.map +1 -0
package/dist/lib/export-utils.js +238 -0
package/dist/lib/export-utils.js.map +1 -0
package/dist/lib/export-utils.test.d.ts +5 -0
package/dist/lib/export-utils.test.d.ts.map +1 -0
package/dist/lib/export-utils.test.js +193 -0
package/dist/lib/export-utils.test.js.map +1 -0
package/dist/lib/file-utils.d.ts +17 -2
package/dist/lib/file-utils.d.ts.map +1 -1
package/dist/lib/file-utils.js +24 -5
package/dist/lib/file-utils.js.map +1 -1
package/dist/lib/file-utils.test.js +30 -0
package/dist/lib/file-utils.test.js.map +1 -1
package/dist/lib/histogram.d.ts +119 -0
package/dist/lib/histogram.d.ts.map +1 -0
package/dist/lib/histogram.js +202 -0
package/dist/lib/histogram.js.map +1 -0
package/dist/lib/histogram.test.d.ts +5 -0
package/dist/lib/histogram.test.d.ts.map +1 -0
package/dist/lib/histogram.test.js +381 -0
package/dist/lib/histogram.test.js.map +1 -0
package/dist/lib/indexer.test.js +27 -27
package/dist/lib/indexer.test.js.map +1 -1
package/dist/lib/input-validator.d.ts +12 -0
package/dist/lib/input-validator.d.ts.map +1 -1
package/dist/lib/input-validator.fuzz.test.d.ts +12 -0
package/dist/lib/input-validator.fuzz.test.d.ts.map +1 -0
package/dist/lib/input-validator.fuzz.test.js +290 -0
package/dist/lib/input-validator.fuzz.test.js.map +1 -0
package/dist/lib/input-validator.js +57 -3
package/dist/lib/input-validator.js.map +1 -1
package/dist/lib/input-validator.test.js +129 -1
package/dist/lib/input-validator.test.js.map +1 -1
package/dist/lib/instrumentation.d.ts +153 -0
package/dist/lib/instrumentation.d.ts.map +1 -0
package/dist/lib/instrumentation.integration.test.d.ts +2 -0
package/dist/lib/instrumentation.integration.test.d.ts.map +1 -0
package/dist/lib/instrumentation.integration.test.js +589 -0
package/dist/lib/instrumentation.integration.test.js.map +1 -0
package/dist/lib/instrumentation.js +520 -0
package/dist/lib/instrumentation.js.map +1 -0
package/dist/lib/instrumentation.test.d.ts +2 -0
package/dist/lib/instrumentation.test.d.ts.map +1 -0
package/dist/lib/instrumentation.test.js +821 -0
package/dist/lib/instrumentation.test.js.map +1 -0
package/dist/lib/langfuse-export.d.ts +125 -0
package/dist/lib/langfuse-export.d.ts.map +1 -0
package/dist/lib/langfuse-export.js +367 -0
package/dist/lib/langfuse-export.js.map +1 -0
package/dist/lib/langfuse-export.test.d.ts +7 -0
package/dist/lib/langfuse-export.test.d.ts.map +1 -0
package/dist/lib/langfuse-export.test.js +1007 -0
package/dist/lib/langfuse-export.test.js.map +1 -0
package/dist/lib/llm-as-judge.d.ts +657 -0
package/dist/lib/llm-as-judge.d.ts.map +1 -0
package/dist/lib/llm-as-judge.js +1397 -0
package/dist/lib/llm-as-judge.js.map +1 -0
package/dist/lib/llm-as-judge.test.d.ts +2 -0
package/dist/lib/llm-as-judge.test.d.ts.map +1 -0
package/dist/lib/llm-as-judge.test.js +2409 -0
package/dist/lib/llm-as-judge.test.js.map +1 -0
package/dist/lib/logger.d.ts +46 -0
package/dist/lib/logger.d.ts.map +1 -0
package/dist/lib/logger.js +81 -0
package/dist/lib/logger.js.map +1 -0
package/dist/lib/logger.test.d.ts +2 -0
package/dist/lib/logger.test.d.ts.map +1 -0
package/dist/lib/logger.test.js +122 -0
package/dist/lib/logger.test.js.map +1 -0
package/dist/lib/metrics.d.ts +62 -0
package/dist/lib/metrics.d.ts.map +1 -0
package/dist/lib/metrics.js +166 -0
package/dist/lib/metrics.js.map +1 -0
package/dist/lib/metrics.test.d.ts +5 -0
package/dist/lib/metrics.test.d.ts.map +1 -0
package/dist/lib/metrics.test.js +189 -0
package/dist/lib/metrics.test.js.map +1 -0
package/dist/lib/parse-stats.d.ts +119 -0
package/dist/lib/parse-stats.d.ts.map +1 -0
package/dist/lib/parse-stats.js +206 -0
package/dist/lib/parse-stats.js.map +1 -0
package/dist/lib/parse-stats.test.d.ts +5 -0
package/dist/lib/parse-stats.test.d.ts.map +1 -0
package/dist/lib/parse-stats.test.js +283 -0
package/dist/lib/parse-stats.test.js.map +1 -0
package/dist/lib/phoenix-export.d.ts +109 -0
package/dist/lib/phoenix-export.d.ts.map +1 -0
package/dist/lib/phoenix-export.js +429 -0
package/dist/lib/phoenix-export.js.map +1 -0
package/dist/lib/phoenix-export.test.d.ts +11 -0
package/dist/lib/phoenix-export.test.d.ts.map +1 -0
package/dist/lib/phoenix-export.test.js +725 -0
package/dist/lib/phoenix-export.test.js.map +1 -0
package/dist/lib/server-utils.d.ts +14 -1
package/dist/lib/server-utils.d.ts.map +1 -1
package/dist/lib/server-utils.js +43 -3
package/dist/lib/server-utils.js.map +1 -1
package/dist/lib/shared-schemas.d.ts +28 -0
package/dist/lib/shared-schemas.d.ts.map +1 -1
package/dist/lib/shared-schemas.js +33 -4
package/dist/lib/shared-schemas.js.map +1 -1
package/dist/lib/toon-encoder.d.ts +7 -2
package/dist/lib/toon-encoder.d.ts.map +1 -1
package/dist/lib/toon-encoder.js +21 -6
package/dist/lib/toon-encoder.js.map +1 -1
package/dist/lib/toon-encoder.test.d.ts +5 -0
package/dist/lib/toon-encoder.test.d.ts.map +1 -0
package/dist/lib/toon-encoder.test.js +85 -0
package/dist/lib/toon-encoder.test.js.map +1 -0
package/dist/lib/verification-events.d.ts +100 -0
package/dist/lib/verification-events.d.ts.map +1 -0
package/dist/lib/verification-events.js +162 -0
package/dist/lib/verification-events.js.map +1 -0
package/dist/lib/verification-events.test.d.ts +5 -0
package/dist/lib/verification-events.test.d.ts.map +1 -0
package/dist/lib/verification-events.test.js +193 -0
package/dist/lib/verification-events.test.js.map +1 -0
package/dist/server.d.ts +5 -0
package/dist/server.d.ts.map +1 -1
package/dist/server.js +79 -21
package/dist/server.js.map +1 -1
package/dist/server.test.js +30 -0
package/dist/server.test.js.map +1 -1
package/dist/test-helpers/env-utils.d.ts +22 -0
package/dist/test-helpers/env-utils.d.ts.map +1 -1
package/dist/test-helpers/env-utils.js +38 -0
package/dist/test-helpers/env-utils.js.map +1 -1
package/dist/test-helpers/fuzz-generators.d.ts +58 -0
package/dist/test-helpers/fuzz-generators.d.ts.map +1 -0
package/dist/test-helpers/fuzz-generators.js +216 -0
package/dist/test-helpers/fuzz-generators.js.map +1 -0
package/dist/test-helpers/index.d.ts +1 -0
package/dist/test-helpers/index.d.ts.map +1 -1
package/dist/test-helpers/index.js +2 -0
package/dist/test-helpers/index.js.map +1 -1
package/dist/test-helpers/memfs-utils.d.ts +181 -0
package/dist/test-helpers/memfs-utils.d.ts.map +1 -0
package/dist/test-helpers/memfs-utils.js +292 -0
package/dist/test-helpers/memfs-utils.js.map +1 -0
package/dist/test-helpers/memfs-utils.test.d.ts +5 -0
package/dist/test-helpers/memfs-utils.test.d.ts.map +1 -0
package/dist/test-helpers/memfs-utils.test.js +338 -0
package/dist/test-helpers/memfs-utils.test.js.map +1 -0
package/dist/test-helpers/race-condition-helpers.d.ts +85 -0
package/dist/test-helpers/race-condition-helpers.d.ts.map +1 -0
package/dist/test-helpers/race-condition-helpers.js +279 -0
package/dist/test-helpers/race-condition-helpers.js.map +1 -0
package/dist/test-helpers/test-data-builders.d.ts +40 -3
package/dist/test-helpers/test-data-builders.d.ts.map +1 -1
package/dist/test-helpers/test-data-builders.js +54 -5
package/dist/test-helpers/test-data-builders.js.map +1 -1
package/dist/test-helpers/tool-validators.d.ts.map +1 -1
package/dist/test-helpers/tool-validators.js +16 -1
package/dist/test-helpers/tool-validators.js.map +1 -1
package/dist/tools/context-stats.d.ts.map +1 -1
package/dist/tools/context-stats.js +6 -8
package/dist/tools/context-stats.js.map +1 -1
package/dist/tools/export-confident.d.ts +145 -0
package/dist/tools/export-confident.d.ts.map +1 -0
package/dist/tools/export-confident.js +134 -0
package/dist/tools/export-confident.js.map +1 -0
package/dist/tools/export-confident.test.d.ts +7 -0
package/dist/tools/export-confident.test.d.ts.map +1 -0
package/dist/tools/export-confident.test.js +332 -0
package/dist/tools/export-confident.test.js.map +1 -0
package/dist/tools/export-datadog.d.ts +160 -0
package/dist/tools/export-datadog.d.ts.map +1 -0
package/dist/tools/export-datadog.js +160 -0
package/dist/tools/export-datadog.js.map +1 -0
package/dist/tools/export-datadog.test.d.ts +8 -0
package/dist/tools/export-datadog.test.d.ts.map +1 -0
package/dist/tools/export-datadog.test.js +419 -0
package/dist/tools/export-datadog.test.js.map +1 -0
package/dist/tools/export-langfuse.d.ts +137 -0
package/dist/tools/export-langfuse.d.ts.map +1 -0
package/dist/tools/export-langfuse.js +131 -0
package/dist/tools/export-langfuse.js.map +1 -0
package/dist/tools/export-langfuse.test.d.ts +7 -0
package/dist/tools/export-langfuse.test.d.ts.map +1 -0
package/dist/tools/export-langfuse.test.js +303 -0
package/dist/tools/export-langfuse.test.js.map +1 -0
package/dist/tools/export-phoenix.d.ts +145 -0
package/dist/tools/export-phoenix.d.ts.map +1 -0
package/dist/tools/export-phoenix.js +135 -0
package/dist/tools/export-phoenix.js.map +1 -0
package/dist/tools/export-phoenix.test.d.ts +7 -0
package/dist/tools/export-phoenix.test.d.ts.map +1 -0
package/dist/tools/export-phoenix.test.js +316 -0
package/dist/tools/export-phoenix.test.js.map +1 -0
package/dist/tools/health-check.d.ts +26 -0
package/dist/tools/health-check.d.ts.map +1 -1
package/dist/tools/health-check.js +36 -7
package/dist/tools/health-check.js.map +1 -1
package/dist/tools/index.d.ts +6 -0
package/dist/tools/index.d.ts.map +1 -1
package/dist/tools/index.js +6 -0
package/dist/tools/index.js.map +1 -1
package/dist/tools/inject-evaluations.d.ts +1315 -0
package/dist/tools/inject-evaluations.d.ts.map +1 -0
package/dist/tools/inject-evaluations.js +121 -0
package/dist/tools/inject-evaluations.js.map +1 -0
package/dist/tools/inject-evaluations.test.d.ts +5 -0
package/dist/tools/inject-evaluations.test.d.ts.map +1 -0
package/dist/tools/inject-evaluations.test.js +359 -0
package/dist/tools/inject-evaluations.test.js.map +1 -0
package/dist/tools/query-evaluations.d.ts +25 -4
package/dist/tools/query-evaluations.d.ts.map +1 -1
package/dist/tools/query-evaluations.js +26 -2
package/dist/tools/query-evaluations.js.map +1 -1
package/dist/tools/query-evaluations.test.js +53 -46
package/dist/tools/query-evaluations.test.js.map +1 -1
package/dist/tools/query-llm-events.js +2 -2
package/dist/tools/query-llm-events.js.map +1 -1
package/dist/tools/query-llm-events.test.js +6 -3
package/dist/tools/query-llm-events.test.js.map +1 -1
package/dist/tools/query-logs.d.ts +8 -8
package/dist/tools/query-logs.js +3 -3
package/dist/tools/query-logs.js.map +1 -1
package/dist/tools/query-metrics.d.ts +4 -4
package/dist/tools/query-metrics.js +2 -2
package/dist/tools/query-metrics.js.map +1 -1
package/dist/tools/query-traces.d.ts +8 -8
package/dist/tools/query-verifications.d.ts +111 -0
package/dist/tools/query-verifications.d.ts.map +1 -0
package/dist/tools/query-verifications.js +101 -0
package/dist/tools/query-verifications.js.map +1 -0
package/dist/tools/query-verifications.test.d.ts +5 -0
package/dist/tools/query-verifications.test.d.ts.map +1 -0
package/dist/tools/query-verifications.test.js +156 -0
package/dist/tools/query-verifications.test.js.map +1 -0
package/dist/types/evaluation-hooks.d.ts +176 -0
package/dist/types/evaluation-hooks.d.ts.map +1 -0
package/dist/types/evaluation-hooks.js +49 -0
package/dist/types/evaluation-hooks.js.map +1 -0
package/package.json +11 -2

package/dist/lib/agent-as-judge.test.js ADDED

@@ -0,0 +1,816 @@
+/**
+ * Tests for Agent-as-Judge Implementation
+ */
+import { describe, it, beforeEach } from 'node:test';
+import assert from 'node:assert/strict';
+import {
+// Constants
+MAX_TRAJECTORY_LENGTH, MAX_CONCURRENT_EVALUATORS, MAX_CONSENSUS_ROUNDS, DEFAULT_CONVERGENCE_THRESHOLD,
+// Error classes
+AgentEvalTimeoutError,
+// Utilities
+withAgentTimeout, validateEvaluand, validateStepScore, validateToolVerification, verifyToolCall, verifyToolCalls, scoreStep, aggregateStepScores, analyzeTrajectory, calculateVariance, calculateMedian, collectiveConsensus,
+// Classes
+ProceduralJudge, ReactiveJudge, } from './agent-as-judge.js';
+import { InputValidationError } from './input-validator.js';
+describe('agent-as-judge', () => {
+    // ============================================================================
+    // Timeout Protection Tests
+    // ============================================================================
+    describe('withAgentTimeout', () => {
+        it('should return result when function completes in time', async () => {
+            const result = await withAgentTimeout(() => Promise.resolve(42), 1000);
+            assert.equal(result, 42);
+        });
+        it('should throw AgentEvalTimeoutError when function times out', async () => {
+            await assert.rejects(withAgentTimeout(() => new Promise((resolve) => setTimeout(resolve, 200)), 50), AgentEvalTimeoutError);
+        });
+        it('should include timeout duration in error', async () => {
+            try {
+                await withAgentTimeout(() => new Promise((resolve) => setTimeout(resolve, 200)), 50);
+                assert.fail('Should have thrown');
+            }
+            catch (error) {
+                assert.ok(error instanceof AgentEvalTimeoutError);
+                assert.equal(error.timeoutMs, 50);
+                assert.ok(error.message.includes('50'));
+            }
+        });
+        it('should propagate function errors', async () => {
+            await assert.rejects(withAgentTimeout(() => Promise.reject(new Error('Test error')), 1000), { message: 'Test error' });
+        });
+    });
+    // ============================================================================
+    // Validation Tests
+    // ============================================================================
+    describe('validateEvaluand', () => {
+        it('should pass for valid evaluand', () => {
+            assert.doesNotThrow(() => validateEvaluand({ input: 'test input', output: 'test output' }));
+        });
+        it('should throw for empty input', () => {
+            assert.throws(() => validateEvaluand({ input: '', output: 'test' }), InputValidationError);
+        });
+        it('should throw for whitespace-only input', () => {
+            assert.throws(() => validateEvaluand({ input: '   ', output: 'test' }), InputValidationError);
+        });
+        it('should throw for empty output', () => {
+            assert.throws(() => validateEvaluand({ input: 'test', output: '' }), InputValidationError);
+        });
+        it('should throw for actions exceeding MAX_TRAJECTORY_LENGTH', () => {
+            const actions = Array(MAX_TRAJECTORY_LENGTH + 1).fill({
+                type: 'tool_call',
+                tool: 'test',
+            });
+            assert.throws(() => validateEvaluand({ input: 'test', output: 'test', actions }), InputValidationError);
+        });
+        it('should pass for actions at MAX_TRAJECTORY_LENGTH', () => {
+            const actions = Array(MAX_TRAJECTORY_LENGTH).fill({
+                type: 'tool_call',
+                tool: 'test',
+            });
+            assert.doesNotThrow(() => validateEvaluand({ input: 'test', output: 'test', actions }));
+        });
+    });
+    describe('validateStepScore', () => {
+        it('should pass for valid step score with number step', () => {
+            assert.doesNotThrow(() => validateStepScore({ step: 0, score: 0.5 }));
+        });
+        it('should pass for valid step score with string step', () => {
+            assert.doesNotThrow(() => validateStepScore({ step: 'step_1', score: 0.5 }));
+        });
+        it('should throw for step string exceeding 256 characters', () => {
+            assert.throws(() => validateStepScore({ step: 'a'.repeat(257), score: 0.5 }), InputValidationError);
+        });
+        it('should throw for negative step index', () => {
+            assert.throws(() => validateStepScore({ step: -1, score: 0.5 }), InputValidationError);
+        });
+        it('should throw for non-integer step index', () => {
+            assert.throws(() => validateStepScore({ step: 1.5, score: 0.5 }), InputValidationError);
+        });
+        it('should throw for score below 0', () => {
+            assert.throws(() => validateStepScore({ step: 0, score: -0.1 }), InputValidationError);
+        });
+        it('should throw for score above 1', () => {
+            assert.throws(() => validateStepScore({ step: 0, score: 1.1 }), InputValidationError);
+        });
+        it('should throw for NaN score', () => {
+            assert.throws(() => validateStepScore({ step: 0, score: NaN }), InputValidationError);
+        });
+        it('should throw for Infinity score', () => {
+            assert.throws(() => validateStepScore({ step: 0, score: Infinity }), InputValidationError);
+        });
+    });
+    describe('validateToolVerification', () => {
+        it('should pass for valid verification', () => {
+            assert.doesNotThrow(() => validateToolVerification({
+                toolName: 'search',
+                toolCorrect: true,
+                argsCorrect: true,
+                score: 0.8,
+            }));
+        });
+        it('should throw for empty tool name', () => {
+            assert.throws(() => validateToolVerification({
+                toolName: '',
+                toolCorrect: true,
+                argsCorrect: true,
+                score: 0.8,
+            }), InputValidationError);
+        });
+        it('should throw for non-boolean toolCorrect', () => {
+            assert.throws(() => validateToolVerification({
+                toolName: 'test',
+                toolCorrect: 'yes',
+                argsCorrect: true,
+                score: 0.8,
+            }), InputValidationError);
+        });
+        it('should throw for invalid score', () => {
+            assert.throws(() => validateToolVerification({
+                toolName: 'test',
+                toolCorrect: true,
+                argsCorrect: true,
+                score: 1.5,
+            }), InputValidationError);
+        });
+    });
+    // ============================================================================
+    // Tool Verification Tests
+    // ============================================================================
+    describe('verifyToolCall', () => {
+        it('should throw for null action (H1)', () => {
+            assert.throws(() => verifyToolCall(null), InputValidationError);
+        });
+        it('should throw for non-object action (H1)', () => {
+            assert.throws(() => verifyToolCall('string'), InputValidationError);
+        });
+        it('should verify correct tool selection', () => {
+            const action = { type: 'tool_call', tool: 'search' };
+            const result = verifyToolCall(action, 'search');
+            assert.equal(result.toolCorrect, true);
+            assert.equal(result.score, 1.0);
+        });
+        it('should detect incorrect tool selection', () => {
+            const action = { type: 'tool_call', tool: 'read' };
+            const result = verifyToolCall(action, 'search');
+            assert.equal(result.toolCorrect, false);
+            assert.equal(result.score, 0);
+        });
+        it('should verify correct arguments', () => {
+            const action = {
+                type: 'tool_call',
+                tool: 'search',
+                arguments: { query: 'test' },
+            };
+            const result = verifyToolCall(action, 'search', { query: 'test' });
+            assert.equal(result.argsCorrect, true);
+            assert.ok(result.score > 0.9); // tool + args correct
+        });
+        it('should detect incorrect arguments', () => {
+            const action = {
+                type: 'tool_call',
+                tool: 'search',
+                arguments: { query: 'wrong' },
+            };
+            const result = verifyToolCall(action, 'search', { query: 'test' });
+            assert.equal(result.argsCorrect, false);
+        });
+        it('should verify result correctness when provided', () => {
+            const action = {
+                type: 'tool_call',
+                tool: 'calc',
+                result: 42,
+            };
+            const result = verifyToolCall(action, 'calc', undefined, undefined, 42);
+            assert.equal(result.resultCorrect, true);
+        });
+        it('should include evidence in result', () => {
+            const action = {
+                type: 'tool_call',
+                tool: 'test',
+                arguments: { foo: 'bar' },
+            };
+            const result = verifyToolCall(action, 'expected', { baz: 'qux' });
+            assert.ok(result.evidence);
+            assert.deepEqual(result.evidence.actualTool, 'test');
+            assert.deepEqual(result.evidence.expectedTool, 'expected');
+        });
+    });
+    describe('verifyToolCalls', () => {
+        it('should verify multiple tool calls', () => {
+            const actions = [
+                { type: 'tool_call', tool: 'search', toolCallId: 'call_1' },
+                { type: 'reasoning', reasoning: 'thinking...' },
+                { type: 'tool_call', tool: 'read', toolCallId: 'call_2' },
+            ];
+            const expected = new Map([
+                ['call_1', { tool: 'search' }],
+                ['call_2', { tool: 'read' }],
+            ]);
+            const results = verifyToolCalls(actions, expected);
+            assert.equal(results.length, 2); // Only tool_call actions
+            assert.equal(results[0].toolCorrect, true);
+            assert.equal(results[1].toolCorrect, true);
+        });
+        it('should skip non-tool actions', () => {
+            const actions = [
+                { type: 'reasoning', reasoning: 'thinking' },
+                { type: 'response', reasoning: 'responding' },
+            ];
+            const results = verifyToolCalls(actions);
+            assert.equal(results.length, 0);
+        });
+        it('should respect MAX_TOOL_VERIFICATIONS limit', () => {
+            const actions = Array(1000).fill({
+                type: 'tool_call',
+                tool: 'test',
+            });
+            const results = verifyToolCalls(actions);
+            assert.ok(results.length <= 500); // MAX_TOOL_VERIFICATIONS
+        });
+    });
+    // ============================================================================
+    // Step Scoring Tests
+    // ============================================================================
+    describe('scoreStep', () => {
+        it('should create valid step score', () => {
+            const action = { type: 'tool_call', tool: 'search' };
+            const result = scoreStep(action, 0, { score: 0.8, explanation: 'Good' });
+            assert.equal(result.step, 0);
+            assert.equal(result.score, 0.8);
+            assert.equal(result.explanation, 'Good');
+        });
+        it('should clamp score to [0, 1]', () => {
+            const action = { type: 'tool_call', tool: 'test' };
+            const high = scoreStep(action, 0, { score: 1.5 });
+            assert.equal(high.score, 1);
+            const low = scoreStep(action, 0, { score: -0.5 });
+            assert.equal(low.score, 0);
+        });
+        it('should throw for NaN score (H4)', () => {
+            const action = { type: 'tool_call', tool: 'test' };
+            assert.throws(() => scoreStep(action, 0, { score: NaN }), InputValidationError);
+        });
+        it('should throw for string score (H4)', () => {
+            const action = { type: 'tool_call', tool: 'test' };
+            assert.throws(() => scoreStep(action, 0, { score: '0.5' }), InputValidationError);
+        });
+        it('should include action metadata in evidence', () => {
+            const action = {
+                type: 'tool_call',
+                tool: 'search',
+                reasoning: 'searching for data',
+            };
+            const result = scoreStep(action, 0, { score: 0.9 });
+            const evidence = result.evidence;
+            assert.equal(evidence.actionType, 'tool_call');
+            assert.equal(evidence.tool, 'search');
+            assert.equal(evidence.reasoning, 'searching for data');
+        });
+    });
+    describe('aggregateStepScores', () => {
+        it('should return 1 for empty array', () => {
+            const result = aggregateStepScores([]);
+            assert.equal(result, 1);
+        });
+        it('should calculate average correctly', () => {
+            const scores = [
+                { step: 0, score: 0.8 },
+                { step: 1, score: 0.6 },
+                { step: 2, score: 1.0 },
+            ];
+            const result = aggregateStepScores(scores, 'average');
+            // Use approximate comparison for floating point
+            assert.ok(Math.abs(result - 0.8) < 0.0001);
+        });
+        it('should calculate weighted average correctly', () => {
+            const scores = [
+                { step: 0, score: 1.0 },
+                { step: 1, score: 0.0 },
+            ];
+            const result = aggregateStepScores(scores, 'weighted', [3, 1]);
+            assert.equal(result, 0.75);
+        });
+        it('should throw for weighted without weights', () => {
+            const scores = [{ step: 0, score: 0.5 }];
+            assert.throws(() => aggregateStepScores(scores, 'weighted'), Error);
+        });
+        it('should throw for mismatched weights length', () => {
+            const scores = [
+                { step: 0, score: 0.5 },
+                { step: 1, score: 0.5 },
+            ];
+            assert.throws(() => aggregateStepScores(scores, 'weighted', [1]), Error);
+        });
+        it('should return 0 for all-zero weights', () => {
+            const scores = [{ step: 0, score: 0.8 }];
+            const result = aggregateStepScores(scores, 'weighted', [0]);
+            assert.equal(result, 0);
+        });
+        it('should throw on negative weights (L8)', () => {
+            const scores = [
+                { step: 0, score: 0.5 },
+                { step: 1, score: 0.8 },
+            ];
+            assert.throws(() => aggregateStepScores(scores, 'weighted', [1, -1]), /Invalid weight at index 1: -1\. Weights must be finite non-negative numbers/);
+        });
+        it('should throw on negative weight at index 0', () => {
+            const scores = [
+                { step: 0, score: 0.5 },
+                { step: 1, score: 0.8 },
+            ];
+            assert.throws(() => aggregateStepScores(scores, 'weighted', [-1, 1]), /Invalid weight at index 0: -1\. Weights must be finite non-negative numbers/);
+        });
+        it('should throw on NaN weight (M1)', () => {
+            const scores = [
+                { step: 0, score: 0.5 },
+                { step: 1, score: 0.8 },
+            ];
+            assert.throws(() => aggregateStepScores(scores, 'weighted', [1, NaN]), /Invalid weight at index 1: NaN\. Weights must be finite non-negative numbers/);
+        });
+        it('should throw on Infinity weight', () => {
+            const scores = [
+                { step: 0, score: 0.5 },
+                { step: 1, score: 0.8 },
+            ];
+            assert.throws(() => aggregateStepScores(scores, 'weighted', [1, Infinity]), /Invalid weight at index 1: Infinity\. Weights must be finite non-negative numbers/);
+        });
+        it('should throw on -Infinity weight', () => {
+            const scores = [
+                { step: 0, score: 0.5 },
+                { step: 1, score: 0.8 },
+            ];
+            assert.throws(() => aggregateStepScores(scores, 'weighted', [-Infinity, 1]), /Invalid weight at index 0: -Infinity\. Weights must be finite non-negative numbers/);
+        });
+        it('should calculate min correctly', () => {
+            const scores = [
+                { step: 0, score: 0.9 },
+                { step: 1, score: 0.3 },
+                { step: 2, score: 0.7 },
+            ];
+            const result = aggregateStepScores(scores, 'min');
+            assert.equal(result, 0.3);
+        });
+    });
+    // ============================================================================
+    // Trajectory Analysis Tests
+    // ============================================================================
+    describe('analyzeTrajectory', () => {
+        it('should calculate basic metrics', () => {
+            const evaluand = {
+                input: 'test',
+                output: 'result',
+                actions: [
+                    { type: 'tool_call', tool: 'search' },
+                    { type: 'reasoning', reasoning: 'thinking' },
+                    { type: 'tool_call', tool: 'read' },
+                ],
+            };
+            const result = analyzeTrajectory(evaluand);
+            assert.equal(result.length, 3);
+            assert.equal(result.toolCallCount, 2);
+            assert.equal(result.uniqueTools, 2);
+        });
+        it('should handle empty actions', () => {
+            const evaluand = { input: 'test', output: 'result' };
+            const result = analyzeTrajectory(evaluand);
+            assert.equal(result.length, 0);
+            assert.equal(result.toolCallCount, 0);
+        });
+        it('should calculate efficiency ratio', () => {
+            const evaluand = {
+                input: 'test',
+                output: 'result',
+                actions: Array(10).fill({ type: 'tool_call', tool: 'test' }),
+            };
+            const result = analyzeTrajectory(evaluand, 5);
+            assert.equal(result.efficiencyRatio, 0.5); // 5/10
+        });
+        it('should cap efficiency ratio at 1', () => {
+            const evaluand = {
+                input: 'test',
+                output: 'result',
+                actions: [{ type: 'tool_call', tool: 'test' }],
+            };
+            const result = analyzeTrajectory(evaluand, 10);
+            assert.equal(result.efficiencyRatio, 1);
+        });
+        it('should detect redundant actions', () => {
+            const evaluand = {
+                input: 'test',
+                output: 'result',
+                actions: [
+                    { type: 'tool_call', tool: 'search', arguments: { q: 'a' } },
+                    { type: 'tool_call', tool: 'search', arguments: { q: 'a' } }, // Duplicate
+                    { type: 'tool_call', tool: 'search', arguments: { q: 'b' } }, // Different args
+                ],
+            };
+            const result = analyzeTrajectory(evaluand);
+            assert.equal(result.redundantActions, 1);
+        });
+    });
+    // ============================================================================
+    // Statistical Functions Tests
+    // ============================================================================
+    describe('calculateVariance', () => {
+        it('should return 0 for empty array', () => {
+            assert.equal(calculateVariance([]), 0);
+        });
+        it('should return 0 for single value', () => {
+            assert.equal(calculateVariance([5]), 0);
+        });
+        it('should return 0 for identical values', () => {
+            assert.equal(calculateVariance([3, 3, 3, 3]), 0);
+        });
+        it('should calculate sample variance with Bessel correction (M7)', () => {
+            // Values: [2, 4, 4, 4, 5, 5, 7, 9], n=8, mean = 5
+            // Sum of squared diffs = (2-5)^2 + (4-5)^2*3 + (5-5)^2*2 + (7-5)^2 + (9-5)^2
+            //                      = 9 + 3 + 0 + 4 + 16 = 32
+            // Sample variance (Bessel's correction) = 32 / (n-1) = 32/7 ≈ 4.571
+            const result = calculateVariance([2, 4, 4, 4, 5, 5, 7, 9]);
+            assert.ok(Math.abs(result - 32 / 7) < 0.001, `Expected ~4.571, got ${result}`);
+        });
+        it('should calculate variance for two values', () => {
+            // Values: [2, 6], mean = 4
+            // Sum of squared diffs = (2-4)^2 + (6-4)^2 = 4 + 4 = 8
+            // Sample variance = 8 / (2-1) = 8
+            const result = calculateVariance([2, 6]);
+            assert.equal(result, 8);
+        });
+    });
+    describe('calculateMedian', () => {
+        it('should return 0 for empty array', () => {
+            assert.equal(calculateMedian([]), 0);
+        });
+        it('should return value for single element', () => {
+            assert.equal(calculateMedian([5]), 5);
+        });
+        it('should calculate median for odd count', () => {
+            assert.equal(calculateMedian([1, 3, 5]), 3);
+        });
+        it('should calculate median for even count', () => {
+            assert.equal(calculateMedian([1, 2, 3, 4]), 2.5);
+        });
+        it('should handle unsorted input', () => {
+            assert.equal(calculateMedian([5, 1, 3]), 3);
+        });
+    });
+    // ============================================================================
+    // Consensus Tests
+    // ============================================================================
+    describe('collectiveConsensus', () => {
+        const validEvaluand = { input: 'test', output: 'result' };
+        const config = {
+            rounds: 3,
+            convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
+        };
+        it('should throw for invalid evaluand', async () => {
+            await assert.rejects(collectiveConsensus({ input: '', output: 'test' }, [], config), InputValidationError);
+        });
+        it('should throw for too many judges (H2)', async () => {
+            const judges = Array(MAX_CONCURRENT_EVALUATORS + 1).fill({
+                id: 'judge',
+                evaluate: async () => 0.5,
+            });
+            await assert.rejects(collectiveConsensus(validEvaluand, judges, config), InputValidationError);
+        });
+        it('should throw for empty judges array', async () => {
+            await assert.rejects(collectiveConsensus(validEvaluand, [], config), InputValidationError);
+        });
+        it('should throw for rounds < 1 (M9)', async () => {
+            const judges = [{ id: 'j1', evaluate: async () => 0.5 }];
+            await assert.rejects(collectiveConsensus(validEvaluand, judges, { ...config, rounds: 0 }), InputValidationError);
+        });
+        it('should reach consensus with agreeing judges', async () => {
+            const judges = [
+                { id: 'j1', evaluate: async () => 0.8 },
+                { id: 'j2', evaluate: async () => 0.82 },
+                { id: 'j3', evaluate: async () => 0.79 },
+            ];
+            const result = await collectiveConsensus(validEvaluand, judges, config);
+            assert.ok(result.converged);
+            assert.ok(result.finalScore > 0.7);
+        });
+        it('should handle disagreeing judges', async () => {
+            const judges = [
+                { id: 'j1', evaluate: async () => 0.2 },
+                { id: 'j2', evaluate: async () => 0.8 },
+            ];
+            const result = await collectiveConsensus(validEvaluand, judges, {
+                rounds: 2,
+                convergenceThreshold: 0.01, // Very tight
+            });
+            assert.equal(result.converged, false);
+        });
+        it('should handle judge failures gracefully (H3)', async () => {
+            const judges = [
+                { id: 'j1', evaluate: async () => 0.8 },
+                {
+                    id: 'j2',
+                    evaluate: async () => {
+                        throw new Error('Judge failed');
+                    },
+                },
+                { id: 'j3', evaluate: async () => 0.75 },
+            ];
+            const result = await collectiveConsensus(validEvaluand, judges, config);
+            // Should succeed with 2 working judges
+            assert.ok(result.finalScore > 0);
+        });
+        it('should throw when all judges fail (H3)', async () => {
+            const judges = [
+                {
+                    id: 'j1',
+                    evaluate: async () => {
+                        throw new Error('Failed 1');
+                    },
+                },
+                {
+                    id: 'j2',
+                    evaluate: async () => {
+                        throw new Error('Failed 2');
+                    },
+                },
+            ];
+            await assert.rejects(collectiveConsensus(validEvaluand, judges, config), /All judge evaluations failed/);
+        });
+        it('should respect MAX_CONSENSUS_ROUNDS', async () => {
+            let callCount = 0;
+            const judges = [
+                {
+                    id: 'j1',
+                    evaluate: async () => {
+                        callCount++;
+                        return 0.5;
+                    },
+                },
+            ];
+            await collectiveConsensus(validEvaluand, judges, {
+                rounds: 100, // Exceeds max
+                convergenceThreshold: 0.0001,
+            });
+            assert.ok(callCount <= MAX_CONSENSUS_ROUNDS);
+        });
+    });
+    // ============================================================================
+    // ProceduralJudge Tests
+    // ============================================================================
+    describe('ProceduralJudge', () => {
+        it('should execute all stages in order', async () => {
+            const executionOrder = [];
+            const stages = [
+                {
+                    name: 'stage1',
+                    evaluate: async () => {
+                        executionOrder.push('stage1');
+                        return { score: 0.8, explanation: 'Good' };
+                    },
+                },
+                {
+                    name: 'stage2',
+                    evaluate: async () => {
+                        executionOrder.push('stage2');
+                        return { score: 0.9, explanation: 'Great' };
+                    },
+                },
+            ];
+            const judge = new ProceduralJudge(stages);
+            const result = await judge.evaluate({
+                input: 'test',
+                output: 'result',
+            });
+            assert.deepEqual(executionOrder, ['stage1', 'stage2']);
+            assert.equal(result.stepScores.length, 2);
+        });
+        it('should support early termination', async () => {
+            const executionOrder = [];
+            const stages = [
+                {
+                    name: 'safety',
+                    evaluate: async () => {
+                        executionOrder.push('safety');
+                        return { score: 0.2, explanation: 'Failed safety' };
+                    },
+                },
+                {
+                    name: 'quality',
+                    evaluate: async () => {
+                        executionOrder.push('quality');
+                        return { score: 0.9, explanation: 'Good' };
+                    },
+                },
+            ];
+            const judge = new ProceduralJudge(stages, 'safety');
+            const result = await judge.evaluate({
+                input: 'test',
+                output: 'result',
+            });
+            assert.deepEqual(executionOrder, ['safety']);
+            assert.equal(result.overallScore, 0);
+            assert.ok(result.explanation.includes('Early termination'));
+        });
+        it('should pass context between stages', async () => {
+            let stage2Context = {};
+            const stages = [
+                {
+                    name: 'stage1',
+                    evaluate: async (_, ctx) => {
+                        ctx['data'] = 'from_stage1';
+                        return { score: 0.8, explanation: 'OK' };
+                    },
+                },
+                {
+                    name: 'stage2',
+                    evaluate: async (_, ctx) => {
+                        stage2Context = { ...ctx };
+                        return { score: 0.9, explanation: 'OK' };
+                    },
+                },
+            ];
+            const judge = new ProceduralJudge(stages);
+            await judge.evaluate({ input: 'test', output: 'result' });
+            assert.ok(stage2Context['stage1']);
+        });
+        it('should validate evaluand', async () => {
+            const judge = new ProceduralJudge([
+                { name: 'test', evaluate: async () => ({ score: 1, explanation: '' }) },
+            ]);
+            await assert.rejects(judge.evaluate({ input: '', output: 'test' }), InputValidationError);
+        });
+        describe('constructor validation (M8)', () => {
+            it('should throw on empty stages array', () => {
+                assert.throws(() => new ProceduralJudge([]), InputValidationError);
+            });
+            it('should throw on stage with empty name', () => {
+                assert.throws(() => new ProceduralJudge([
+                    { name: '', evaluate: async () => ({ score: 1, explanation: '' }) },
+                ]), InputValidationError);
+            });
+            it('should throw on stage with whitespace-only name', () => {
+                assert.throws(() => new ProceduralJudge([
+                    { name: '   ', evaluate: async () => ({ score: 1, explanation: '' }) },
+                ]), InputValidationError);
+            });
+            it('should throw on stage without evaluate function', () => {
+                assert.throws(
+                // @ts-expect-error - testing runtime validation
+                () => new ProceduralJudge([{ name: 'test' }]), InputValidationError);
+            });
+            it('should throw on invalid earlyTerminationOn stage name', () => {
+                assert.throws(() => new ProceduralJudge([{ name: 'stage1', evaluate: async () => ({ score: 1, explanation: '' }) }], 'nonexistent'), InputValidationError);
+            });
+            it('should accept valid stages with earlyTerminationOn', () => {
+                const judge = new ProceduralJudge([{ name: 'safety', evaluate: async () => ({ score: 1, explanation: '' }) }], 'safety');
+                assert.ok(judge);
+            });
+        });
+    });
+    // ============================================================================
+    // ReactiveJudge Tests
+    // ============================================================================
+    describe('ReactiveJudge', () => {
+        it('should route to selected specialists', async () => {
+            const executedSpecialists = [];
+            const router = async () => ['quality', 'safety'];
+            const specialists = new Map([
+                [
+                    'quality',
+                    async () => {
+                        executedSpecialists.push('quality');
+                        return { score: 0.9, explanation: 'Good quality' };
+                    },
+                ],
+                [
+                    'safety',
+                    async () => {
+                        executedSpecialists.push('safety');
+                        return { score: 1.0, explanation: 'Safe' };
+                    },
+                ],
+                [
+                    'style',
+                    async () => {
+                        executedSpecialists.push('style');
+                        return { score: 0.8, explanation: 'OK style' };
+                    },
+                ],
+            ]);
+            const judge = new ReactiveJudge(router, specialists);
+            const result = await judge.evaluate({
+                input: 'test',
+                output: 'result',
+            });
+            assert.deepEqual(executedSpecialists, ['quality', 'safety']);
+            assert.equal(result.stepScores.length, 2);
+        });
+        it('should trigger deep dive when needed', async () => {
+            const deepDiveTriggered = { value: false };
+            const router = async () => ['quality'];
+            const specialists = new Map([
+                [
+                    'quality',
+                    async () => ({
+                        score: 0.5,
+                        explanation: 'Needs investigation',
+                        needsDeepDive: true,
+                    }),
+                ],
+            ]);
+            const deepDiveSpecialists = new Map([
+                [
+                    'quality',
+                    async () => {
+                        deepDiveTriggered.value = true;
+                        return { score: 0.4, explanation: 'Deep analysis' };
+                    },
+                ],
+            ]);
+            const judge = new ReactiveJudge(router, specialists, deepDiveSpecialists);
+            const result = await judge.evaluate({
+                input: 'test',
+                output: 'result',
+            });
+            assert.equal(deepDiveTriggered.value, true);
+            assert.equal(result.stepScores.length, 2); // Regular + deep dive
+        });
+        it('should skip missing specialists', async () => {
+            const router = async () => ['missing', 'existing'];
+            const specialists = new Map([
+                ['existing', async () => ({ score: 0.8, explanation: 'OK' })],
+            ]);
+            const judge = new ReactiveJudge(router, specialists);
+            const result = await judge.evaluate({
+                input: 'test',
+                output: 'result',
+            });
+            assert.equal(result.stepScores.length, 1);
+        });
+        it('should validate evaluand', async () => {
+            const judge = new ReactiveJudge(async () => [], new Map());
+            await assert.rejects(judge.evaluate({ input: '', output: 'test' }), InputValidationError);
+        });
+    });
+    // ============================================================================
+    // AgentJudge Memory Tests (H5)
+    // ============================================================================
+    describe('AgentJudge memory management', () => {
+        class TestJudge extends ProceduralJudge {
+            // Expose protected methods for testing
+            testStore(key, value) {
+                this.storeInMemory(key, value);
+            }
+            testGet(key) {
+                return this.getFromMemory(key);
+            }
+            getMemorySize() {
+                return this.memory.size;
+            }
+        }
+        let judge;
+        beforeEach(() => {
+            judge = new TestJudge([
+                { name: 'test', evaluate: async () => ({ score: 1, explanation: '' }) },
+            ]);
+        });
+        it('should store and retrieve values', () => {
+            judge.testStore('key1', 'value1');
+            assert.equal(judge.testGet('key1'), 'value1');
+        });
+        it('should implement LRU - reading moves to end (H5)', () => {
+            // Fill nearly to capacity
+            for (let i = 0; i < 997; i++) {
+                judge.testStore(`fill_${i}`, i);
+            }
+            // Store 3 more items we care about
+            judge.testStore('key1', 'value1');
+            judge.testStore('key2', 'value2');
+            judge.testStore('key3', 'value3');
+            // Memory is now at 1000 - order is fill_0...fill_996, key1, key2, key3
+            // Read key1 - should move it to end (after key3)
+            judge.testGet('key1');
+            // Order is now: fill_0...fill_996, key2, key3, key1
+            // Store 1 more to trigger eviction of fill_0
+            judge.testStore('new_item', 'new');
+            // Order: fill_1...fill_996, key2, key3, key1, new_item
+            // key1 should still exist (was accessed recently), key2/key3 should exist
+            assert.equal(judge.testGet('key1'), 'value1');
+            assert.equal(judge.testGet('key2'), 'value2');
+            assert.equal(judge.testGet('key3'), 'value3');
+            // fill_0 should be evicted
+            assert.equal(judge.testGet('fill_0'), undefined);
+        });
+        it('should evict oldest on overflow', () => {
+            // Fill to capacity
+            for (let i = 0; i < 1000; i++) {
+                judge.testStore(`key_${i}`, i);
+            }
+            assert.equal(judge.getMemorySize(), 1000);
+            // Add one more
+            judge.testStore('new_key', 'new_value');
+            assert.equal(judge.getMemorySize(), 1000);
+            assert.equal(judge.testGet('key_0'), undefined); // First one evicted
+            assert.equal(judge.testGet('new_key'), 'new_value');
+        });
+    });
+});
+//# sourceMappingURL=agent-as-judge.test.js.map