observability-toolkit 1.8.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +126 -5
- package/dist/backends/index.d.ts +163 -0
- package/dist/backends/index.d.ts.map +1 -1
- package/dist/backends/index.js +57 -0
- package/dist/backends/index.js.map +1 -1
- package/dist/backends/index.test.js +55 -1
- package/dist/backends/index.test.js.map +1 -1
- package/dist/backends/local-jsonl-boolean-search.test.js +8 -8
- package/dist/backends/local-jsonl-boolean-search.test.js.map +1 -1
- package/dist/backends/local-jsonl-cache.test.d.ts +2 -0
- package/dist/backends/local-jsonl-cache.test.d.ts.map +1 -0
- package/dist/backends/local-jsonl-cache.test.js +295 -0
- package/dist/backends/local-jsonl-cache.test.js.map +1 -0
- package/dist/backends/local-jsonl-circuit-breaker.test.d.ts +2 -0
- package/dist/backends/local-jsonl-circuit-breaker.test.d.ts.map +1 -0
- package/dist/backends/local-jsonl-circuit-breaker.test.js +180 -0
- package/dist/backends/local-jsonl-circuit-breaker.test.js.map +1 -0
- package/dist/backends/local-jsonl-export.test.d.ts +2 -0
- package/dist/backends/local-jsonl-export.test.d.ts.map +1 -0
- package/dist/backends/local-jsonl-export.test.js +704 -0
- package/dist/backends/local-jsonl-export.test.js.map +1 -0
- package/dist/backends/local-jsonl-index.test.d.ts +2 -0
- package/dist/backends/local-jsonl-index.test.d.ts.map +1 -0
- package/dist/backends/local-jsonl-index.test.js +554 -0
- package/dist/backends/local-jsonl-index.test.js.map +1 -0
- package/dist/backends/local-jsonl-logs.test.js +52 -43
- package/dist/backends/local-jsonl-logs.test.js.map +1 -1
- package/dist/backends/local-jsonl-metrics.test.d.ts +2 -0
- package/dist/backends/local-jsonl-metrics.test.d.ts.map +1 -0
- package/dist/backends/local-jsonl-metrics.test.js +876 -0
- package/dist/backends/local-jsonl-metrics.test.js.map +1 -0
- package/dist/backends/local-jsonl-traces.test.js +89 -83
- package/dist/backends/local-jsonl-traces.test.js.map +1 -1
- package/dist/backends/local-jsonl.d.ts +39 -0
- package/dist/backends/local-jsonl.d.ts.map +1 -1
- package/dist/backends/local-jsonl.js +975 -492
- package/dist/backends/local-jsonl.js.map +1 -1
- package/dist/backends/signoz-api-circuit-breaker.test.d.ts +6 -0
- package/dist/backends/signoz-api-circuit-breaker.test.d.ts.map +1 -0
- package/dist/backends/signoz-api-circuit-breaker.test.js +548 -0
- package/dist/backends/signoz-api-circuit-breaker.test.js.map +1 -0
- package/dist/backends/signoz-api-rate-limiter.test.d.ts +6 -0
- package/dist/backends/signoz-api-rate-limiter.test.d.ts.map +1 -0
- package/dist/backends/signoz-api-rate-limiter.test.js +390 -0
- package/dist/backends/signoz-api-rate-limiter.test.js.map +1 -0
- package/dist/backends/signoz-api-ssrf.test.d.ts +6 -0
- package/dist/backends/signoz-api-ssrf.test.d.ts.map +1 -0
- package/dist/backends/signoz-api-ssrf.test.js +216 -0
- package/dist/backends/signoz-api-ssrf.test.js.map +1 -0
- package/dist/backends/signoz-api-test-helpers.d.ts +80 -0
- package/dist/backends/signoz-api-test-helpers.d.ts.map +1 -0
- package/dist/backends/signoz-api-test-helpers.js +79 -0
- package/dist/backends/signoz-api-test-helpers.js.map +1 -0
- package/dist/backends/signoz-api.d.ts +31 -1
- package/dist/backends/signoz-api.d.ts.map +1 -1
- package/dist/backends/signoz-api.js +717 -539
- package/dist/backends/signoz-api.js.map +1 -1
- package/dist/backends/signoz-api.test.d.ts +9 -0
- package/dist/backends/signoz-api.test.d.ts.map +1 -1
- package/dist/backends/signoz-api.test.js +20 -1032
- package/dist/backends/signoz-api.test.js.map +1 -1
- package/dist/lib/agent-as-judge.d.ts +388 -0
- package/dist/lib/agent-as-judge.d.ts.map +1 -0
- package/dist/lib/agent-as-judge.js +740 -0
- package/dist/lib/agent-as-judge.js.map +1 -0
- package/dist/lib/agent-as-judge.test.d.ts +5 -0
- package/dist/lib/agent-as-judge.test.d.ts.map +1 -0
- package/dist/lib/agent-as-judge.test.js +816 -0
- package/dist/lib/agent-as-judge.test.js.map +1 -0
- package/dist/lib/cache.d.ts +61 -2
- package/dist/lib/cache.d.ts.map +1 -1
- package/dist/lib/cache.js +54 -3
- package/dist/lib/cache.js.map +1 -1
- package/dist/lib/circuit-breaker.d.ts +101 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +158 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/circuit-breaker.test.d.ts +2 -0
- package/dist/lib/circuit-breaker.test.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.test.js +263 -0
- package/dist/lib/circuit-breaker.test.js.map +1 -0
- package/dist/lib/confident-export.d.ts +101 -0
- package/dist/lib/confident-export.d.ts.map +1 -0
- package/dist/lib/confident-export.js +393 -0
- package/dist/lib/confident-export.js.map +1 -0
- package/dist/lib/confident-export.test.d.ts +7 -0
- package/dist/lib/confident-export.test.d.ts.map +1 -0
- package/dist/lib/confident-export.test.js +835 -0
- package/dist/lib/confident-export.test.js.map +1 -0
- package/dist/lib/constants-symlink.test.d.ts +12 -0
- package/dist/lib/constants-symlink.test.d.ts.map +1 -0
- package/dist/lib/constants-symlink.test.js +357 -0
- package/dist/lib/constants-symlink.test.js.map +1 -0
- package/dist/lib/constants.d.ts +75 -0
- package/dist/lib/constants.d.ts.map +1 -1
- package/dist/lib/constants.js +104 -1
- package/dist/lib/constants.js.map +1 -1
- package/dist/lib/datadog-export.d.ts +156 -0
- package/dist/lib/datadog-export.d.ts.map +1 -0
- package/dist/lib/datadog-export.js +464 -0
- package/dist/lib/datadog-export.js.map +1 -0
- package/dist/lib/datadog-export.test.d.ts +14 -0
- package/dist/lib/datadog-export.test.d.ts.map +1 -0
- package/dist/lib/datadog-export.test.js +890 -0
- package/dist/lib/datadog-export.test.js.map +1 -0
- package/dist/lib/edge-cases.test.js +17 -17
- package/dist/lib/edge-cases.test.js.map +1 -1
- package/dist/lib/error-sanitizer.d.ts.map +1 -1
- package/dist/lib/error-sanitizer.js +29 -3
- package/dist/lib/error-sanitizer.js.map +1 -1
- package/dist/lib/error-sanitizer.test.js +159 -0
- package/dist/lib/error-sanitizer.test.js.map +1 -1
- package/dist/lib/error-types.d.ts +54 -0
- package/dist/lib/error-types.d.ts.map +1 -0
- package/dist/lib/error-types.js +154 -0
- package/dist/lib/error-types.js.map +1 -0
- package/dist/lib/error-types.test.d.ts +2 -0
- package/dist/lib/error-types.test.d.ts.map +1 -0
- package/dist/lib/error-types.test.js +196 -0
- package/dist/lib/error-types.test.js.map +1 -0
- package/dist/lib/evaluation-hooks.d.ts +49 -0
- package/dist/lib/evaluation-hooks.d.ts.map +1 -0
- package/dist/lib/evaluation-hooks.js +488 -0
- package/dist/lib/evaluation-hooks.js.map +1 -0
- package/dist/lib/evaluation-hooks.test.d.ts +8 -0
- package/dist/lib/evaluation-hooks.test.d.ts.map +1 -0
- package/dist/lib/evaluation-hooks.test.js +624 -0
- package/dist/lib/evaluation-hooks.test.js.map +1 -0
- package/dist/lib/export-utils.d.ts +99 -0
- package/dist/lib/export-utils.d.ts.map +1 -0
- package/dist/lib/export-utils.js +238 -0
- package/dist/lib/export-utils.js.map +1 -0
- package/dist/lib/export-utils.test.d.ts +5 -0
- package/dist/lib/export-utils.test.d.ts.map +1 -0
- package/dist/lib/export-utils.test.js +193 -0
- package/dist/lib/export-utils.test.js.map +1 -0
- package/dist/lib/file-utils.d.ts +17 -2
- package/dist/lib/file-utils.d.ts.map +1 -1
- package/dist/lib/file-utils.js +24 -5
- package/dist/lib/file-utils.js.map +1 -1
- package/dist/lib/file-utils.test.js +30 -0
- package/dist/lib/file-utils.test.js.map +1 -1
- package/dist/lib/histogram.d.ts +119 -0
- package/dist/lib/histogram.d.ts.map +1 -0
- package/dist/lib/histogram.js +202 -0
- package/dist/lib/histogram.js.map +1 -0
- package/dist/lib/histogram.test.d.ts +5 -0
- package/dist/lib/histogram.test.d.ts.map +1 -0
- package/dist/lib/histogram.test.js +381 -0
- package/dist/lib/histogram.test.js.map +1 -0
- package/dist/lib/indexer.test.js +27 -27
- package/dist/lib/indexer.test.js.map +1 -1
- package/dist/lib/input-validator.d.ts +12 -0
- package/dist/lib/input-validator.d.ts.map +1 -1
- package/dist/lib/input-validator.fuzz.test.d.ts +12 -0
- package/dist/lib/input-validator.fuzz.test.d.ts.map +1 -0
- package/dist/lib/input-validator.fuzz.test.js +290 -0
- package/dist/lib/input-validator.fuzz.test.js.map +1 -0
- package/dist/lib/input-validator.js +57 -3
- package/dist/lib/input-validator.js.map +1 -1
- package/dist/lib/input-validator.test.js +129 -1
- package/dist/lib/input-validator.test.js.map +1 -1
- package/dist/lib/instrumentation.d.ts +153 -0
- package/dist/lib/instrumentation.d.ts.map +1 -0
- package/dist/lib/instrumentation.integration.test.d.ts +2 -0
- package/dist/lib/instrumentation.integration.test.d.ts.map +1 -0
- package/dist/lib/instrumentation.integration.test.js +589 -0
- package/dist/lib/instrumentation.integration.test.js.map +1 -0
- package/dist/lib/instrumentation.js +520 -0
- package/dist/lib/instrumentation.js.map +1 -0
- package/dist/lib/instrumentation.test.d.ts +2 -0
- package/dist/lib/instrumentation.test.d.ts.map +1 -0
- package/dist/lib/instrumentation.test.js +821 -0
- package/dist/lib/instrumentation.test.js.map +1 -0
- package/dist/lib/langfuse-export.d.ts +125 -0
- package/dist/lib/langfuse-export.d.ts.map +1 -0
- package/dist/lib/langfuse-export.js +367 -0
- package/dist/lib/langfuse-export.js.map +1 -0
- package/dist/lib/langfuse-export.test.d.ts +7 -0
- package/dist/lib/langfuse-export.test.d.ts.map +1 -0
- package/dist/lib/langfuse-export.test.js +1007 -0
- package/dist/lib/langfuse-export.test.js.map +1 -0
- package/dist/lib/llm-as-judge.d.ts +657 -0
- package/dist/lib/llm-as-judge.d.ts.map +1 -0
- package/dist/lib/llm-as-judge.js +1397 -0
- package/dist/lib/llm-as-judge.js.map +1 -0
- package/dist/lib/llm-as-judge.test.d.ts +2 -0
- package/dist/lib/llm-as-judge.test.d.ts.map +1 -0
- package/dist/lib/llm-as-judge.test.js +2409 -0
- package/dist/lib/llm-as-judge.test.js.map +1 -0
- package/dist/lib/logger.d.ts +46 -0
- package/dist/lib/logger.d.ts.map +1 -0
- package/dist/lib/logger.js +81 -0
- package/dist/lib/logger.js.map +1 -0
- package/dist/lib/logger.test.d.ts +2 -0
- package/dist/lib/logger.test.d.ts.map +1 -0
- package/dist/lib/logger.test.js +122 -0
- package/dist/lib/logger.test.js.map +1 -0
- package/dist/lib/metrics.d.ts +62 -0
- package/dist/lib/metrics.d.ts.map +1 -0
- package/dist/lib/metrics.js +166 -0
- package/dist/lib/metrics.js.map +1 -0
- package/dist/lib/metrics.test.d.ts +5 -0
- package/dist/lib/metrics.test.d.ts.map +1 -0
- package/dist/lib/metrics.test.js +189 -0
- package/dist/lib/metrics.test.js.map +1 -0
- package/dist/lib/parse-stats.d.ts +119 -0
- package/dist/lib/parse-stats.d.ts.map +1 -0
- package/dist/lib/parse-stats.js +206 -0
- package/dist/lib/parse-stats.js.map +1 -0
- package/dist/lib/parse-stats.test.d.ts +5 -0
- package/dist/lib/parse-stats.test.d.ts.map +1 -0
- package/dist/lib/parse-stats.test.js +283 -0
- package/dist/lib/parse-stats.test.js.map +1 -0
- package/dist/lib/phoenix-export.d.ts +109 -0
- package/dist/lib/phoenix-export.d.ts.map +1 -0
- package/dist/lib/phoenix-export.js +429 -0
- package/dist/lib/phoenix-export.js.map +1 -0
- package/dist/lib/phoenix-export.test.d.ts +11 -0
- package/dist/lib/phoenix-export.test.d.ts.map +1 -0
- package/dist/lib/phoenix-export.test.js +725 -0
- package/dist/lib/phoenix-export.test.js.map +1 -0
- package/dist/lib/server-utils.d.ts +14 -1
- package/dist/lib/server-utils.d.ts.map +1 -1
- package/dist/lib/server-utils.js +43 -3
- package/dist/lib/server-utils.js.map +1 -1
- package/dist/lib/shared-schemas.d.ts +28 -0
- package/dist/lib/shared-schemas.d.ts.map +1 -1
- package/dist/lib/shared-schemas.js +33 -4
- package/dist/lib/shared-schemas.js.map +1 -1
- package/dist/lib/toon-encoder.d.ts +7 -2
- package/dist/lib/toon-encoder.d.ts.map +1 -1
- package/dist/lib/toon-encoder.js +21 -6
- package/dist/lib/toon-encoder.js.map +1 -1
- package/dist/lib/toon-encoder.test.d.ts +5 -0
- package/dist/lib/toon-encoder.test.d.ts.map +1 -0
- package/dist/lib/toon-encoder.test.js +85 -0
- package/dist/lib/toon-encoder.test.js.map +1 -0
- package/dist/lib/verification-events.d.ts +100 -0
- package/dist/lib/verification-events.d.ts.map +1 -0
- package/dist/lib/verification-events.js +162 -0
- package/dist/lib/verification-events.js.map +1 -0
- package/dist/lib/verification-events.test.d.ts +5 -0
- package/dist/lib/verification-events.test.d.ts.map +1 -0
- package/dist/lib/verification-events.test.js +193 -0
- package/dist/lib/verification-events.test.js.map +1 -0
- package/dist/server.d.ts +5 -0
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +79 -21
- package/dist/server.js.map +1 -1
- package/dist/server.test.js +30 -0
- package/dist/server.test.js.map +1 -1
- package/dist/test-helpers/env-utils.d.ts +22 -0
- package/dist/test-helpers/env-utils.d.ts.map +1 -1
- package/dist/test-helpers/env-utils.js +38 -0
- package/dist/test-helpers/env-utils.js.map +1 -1
- package/dist/test-helpers/fuzz-generators.d.ts +58 -0
- package/dist/test-helpers/fuzz-generators.d.ts.map +1 -0
- package/dist/test-helpers/fuzz-generators.js +216 -0
- package/dist/test-helpers/fuzz-generators.js.map +1 -0
- package/dist/test-helpers/index.d.ts +1 -0
- package/dist/test-helpers/index.d.ts.map +1 -1
- package/dist/test-helpers/index.js +2 -0
- package/dist/test-helpers/index.js.map +1 -1
- package/dist/test-helpers/memfs-utils.d.ts +181 -0
- package/dist/test-helpers/memfs-utils.d.ts.map +1 -0
- package/dist/test-helpers/memfs-utils.js +292 -0
- package/dist/test-helpers/memfs-utils.js.map +1 -0
- package/dist/test-helpers/memfs-utils.test.d.ts +5 -0
- package/dist/test-helpers/memfs-utils.test.d.ts.map +1 -0
- package/dist/test-helpers/memfs-utils.test.js +338 -0
- package/dist/test-helpers/memfs-utils.test.js.map +1 -0
- package/dist/test-helpers/race-condition-helpers.d.ts +85 -0
- package/dist/test-helpers/race-condition-helpers.d.ts.map +1 -0
- package/dist/test-helpers/race-condition-helpers.js +279 -0
- package/dist/test-helpers/race-condition-helpers.js.map +1 -0
- package/dist/test-helpers/test-data-builders.d.ts +40 -3
- package/dist/test-helpers/test-data-builders.d.ts.map +1 -1
- package/dist/test-helpers/test-data-builders.js +54 -5
- package/dist/test-helpers/test-data-builders.js.map +1 -1
- package/dist/test-helpers/tool-validators.d.ts.map +1 -1
- package/dist/test-helpers/tool-validators.js +16 -1
- package/dist/test-helpers/tool-validators.js.map +1 -1
- package/dist/tools/context-stats.d.ts.map +1 -1
- package/dist/tools/context-stats.js +6 -8
- package/dist/tools/context-stats.js.map +1 -1
- package/dist/tools/export-confident.d.ts +145 -0
- package/dist/tools/export-confident.d.ts.map +1 -0
- package/dist/tools/export-confident.js +134 -0
- package/dist/tools/export-confident.js.map +1 -0
- package/dist/tools/export-confident.test.d.ts +7 -0
- package/dist/tools/export-confident.test.d.ts.map +1 -0
- package/dist/tools/export-confident.test.js +332 -0
- package/dist/tools/export-confident.test.js.map +1 -0
- package/dist/tools/export-datadog.d.ts +160 -0
- package/dist/tools/export-datadog.d.ts.map +1 -0
- package/dist/tools/export-datadog.js +160 -0
- package/dist/tools/export-datadog.js.map +1 -0
- package/dist/tools/export-datadog.test.d.ts +8 -0
- package/dist/tools/export-datadog.test.d.ts.map +1 -0
- package/dist/tools/export-datadog.test.js +419 -0
- package/dist/tools/export-datadog.test.js.map +1 -0
- package/dist/tools/export-langfuse.d.ts +137 -0
- package/dist/tools/export-langfuse.d.ts.map +1 -0
- package/dist/tools/export-langfuse.js +131 -0
- package/dist/tools/export-langfuse.js.map +1 -0
- package/dist/tools/export-langfuse.test.d.ts +7 -0
- package/dist/tools/export-langfuse.test.d.ts.map +1 -0
- package/dist/tools/export-langfuse.test.js +303 -0
- package/dist/tools/export-langfuse.test.js.map +1 -0
- package/dist/tools/export-phoenix.d.ts +145 -0
- package/dist/tools/export-phoenix.d.ts.map +1 -0
- package/dist/tools/export-phoenix.js +135 -0
- package/dist/tools/export-phoenix.js.map +1 -0
- package/dist/tools/export-phoenix.test.d.ts +7 -0
- package/dist/tools/export-phoenix.test.d.ts.map +1 -0
- package/dist/tools/export-phoenix.test.js +316 -0
- package/dist/tools/export-phoenix.test.js.map +1 -0
- package/dist/tools/health-check.d.ts +26 -0
- package/dist/tools/health-check.d.ts.map +1 -1
- package/dist/tools/health-check.js +36 -7
- package/dist/tools/health-check.js.map +1 -1
- package/dist/tools/index.d.ts +6 -0
- package/dist/tools/index.d.ts.map +1 -1
- package/dist/tools/index.js +6 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/inject-evaluations.d.ts +1315 -0
- package/dist/tools/inject-evaluations.d.ts.map +1 -0
- package/dist/tools/inject-evaluations.js +121 -0
- package/dist/tools/inject-evaluations.js.map +1 -0
- package/dist/tools/inject-evaluations.test.d.ts +5 -0
- package/dist/tools/inject-evaluations.test.d.ts.map +1 -0
- package/dist/tools/inject-evaluations.test.js +359 -0
- package/dist/tools/inject-evaluations.test.js.map +1 -0
- package/dist/tools/query-evaluations.d.ts +25 -4
- package/dist/tools/query-evaluations.d.ts.map +1 -1
- package/dist/tools/query-evaluations.js +26 -2
- package/dist/tools/query-evaluations.js.map +1 -1
- package/dist/tools/query-evaluations.test.js +53 -46
- package/dist/tools/query-evaluations.test.js.map +1 -1
- package/dist/tools/query-llm-events.js +2 -2
- package/dist/tools/query-llm-events.js.map +1 -1
- package/dist/tools/query-llm-events.test.js +6 -3
- package/dist/tools/query-llm-events.test.js.map +1 -1
- package/dist/tools/query-logs.d.ts +8 -8
- package/dist/tools/query-logs.js +3 -3
- package/dist/tools/query-logs.js.map +1 -1
- package/dist/tools/query-metrics.d.ts +4 -4
- package/dist/tools/query-metrics.js +2 -2
- package/dist/tools/query-metrics.js.map +1 -1
- package/dist/tools/query-traces.d.ts +8 -8
- package/dist/tools/query-verifications.d.ts +111 -0
- package/dist/tools/query-verifications.d.ts.map +1 -0
- package/dist/tools/query-verifications.js +101 -0
- package/dist/tools/query-verifications.js.map +1 -0
- package/dist/tools/query-verifications.test.d.ts +5 -0
- package/dist/tools/query-verifications.test.d.ts.map +1 -0
- package/dist/tools/query-verifications.test.js +156 -0
- package/dist/tools/query-verifications.test.js.map +1 -0
- package/dist/types/evaluation-hooks.d.ts +176 -0
- package/dist/types/evaluation-hooks.d.ts.map +1 -0
- package/dist/types/evaluation-hooks.js +49 -0
- package/dist/types/evaluation-hooks.js.map +1 -0
- package/package.json +11 -2
|
@@ -0,0 +1,1397 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-as-Judge Implementation
|
|
3
|
+
*
|
|
4
|
+
* Provides patterns and utilities for using LLMs to evaluate LLM outputs.
|
|
5
|
+
* Implements G-Eval, QAG, and production-ready evaluation patterns per
|
|
6
|
+
* industry best practices and OTel GenAI semantic conventions.
|
|
7
|
+
*
|
|
8
|
+
* @security
|
|
9
|
+
* - All user inputs are sanitized for prompt injection protection
|
|
10
|
+
* - LLM calls have timeout protection (default 30s)
|
|
11
|
+
* - Input sizes are validated to prevent resource exhaustion
|
|
12
|
+
* - JSON parsing has depth limits to prevent DoS
|
|
13
|
+
*
|
|
14
|
+
* @security Known Limitations
|
|
15
|
+
* - Script homoglyphs (Cyrillic, Greek characters visually similar to Latin)
|
|
16
|
+
* are NOT currently filtered in all cases. The confusables library provides
|
|
17
|
+
* partial coverage but may miss some Unicode TR39 edge cases.
|
|
18
|
+
* Example: Cyrillic "а" (U+0430) looks identical to Latin "a".
|
|
19
|
+
* This is a known gap tracked for future enhancement.
|
|
20
|
+
*
|
|
21
|
+
* @see https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/
|
|
22
|
+
*/
|
|
23
|
+
import { InputValidationError } from './input-validator.js';
|
|
24
|
+
import { HttpStatus } from './constants.js';
|
|
25
|
+
import { remove as removeConfusables } from 'confusables';
|
|
26
|
+
import sbd from 'sbd';
|
|
27
|
+
// ============================================================================
|
|
28
|
+
// Typed Error Classes
|
|
29
|
+
// ============================================================================
|
|
30
|
+
/**
 * Error type representing a detected prompt-injection attempt.
 *
 * Per the module's design note, sanitizeForPrompt() does NOT throw this
 * error: injection patterns are replaced with '[filtered]' markers so that
 * evaluation can continue on the sanitized text. The class is exported so
 * that external code can type-check against it, or throw it when stricter
 * fail-fast handling is wanted.
 *
 * @example
 * // Stricter handling in consumer code:
 * if (detectsInjection(input)) {
 *     throw new PromptInjectionError('Injection detected in user input');
 * }
 */
export class PromptInjectionError extends Error {
    /** Own `name` property so `err.name` identifies this error type. */
    name = 'PromptInjectionError';

    /**
     * @param message - Human-readable description of the detection
     */
    constructor(message) {
        // Only the message is forwarded; extra arguments are ignored.
        super(message);
    }
}
|
|
50
|
+
/**
 * Error raised when an LLM call does not finish within its time budget.
 *
 * The module's notes state that withTimeout() throws this when the wrapped
 * function fails to complete in the configured duration. Catch it to fall
 * back to a cached/default result or to drive retry logic.
 *
 * @example
 * try {
 *     await withTimeout(() => llmCall(), 5000);
 * } catch (e) {
 *     if (e instanceof LLMTimeoutError) {
 *         // Timed out - use a cached result or return a default
 *     }
 * }
 *
 * @see withTimeout
 */
export class LLMTimeoutError extends Error {
    /** Own `name` property so `err.name` identifies this error type. */
    name = 'LLMTimeoutError';

    /**
     * @param timeoutMs - Timeout that was exceeded, in milliseconds; it is
     *   interpolated into the error message as-is
     */
    constructor(timeoutMs) {
        const description = `LLM call timed out after ${timeoutMs}ms`;
        super(description);
    }
}
|
|
74
|
+
/**
 * Error raised when a score cannot be extracted or normalized.
 *
 * The module's notes state that normalizeWithLogprobs() throws this when an
 * LLM response cannot be parsed into a valid score, or when probability
 * weighting fails.
 *
 * @example
 * try {
 *     const score = normalizeWithLogprobs(response, [1, 2, 3, 4, 5]);
 * } catch (e) {
 *     if (e instanceof ScoreNormalizationError) {
 *         // Fall back to regex-based extraction
 *     }
 * }
 *
 * @see normalizeWithLogprobs
 * @see extractScoreFromText
 */
export class ScoreNormalizationError extends Error {
    /** Own `name` property so `err.name` identifies this error type. */
    name = 'ScoreNormalizationError';

    /**
     * @param message - Human-readable description of the failure
     */
    constructor(message) {
        // Only the message is forwarded; extra arguments are ignored.
        super(message);
    }
}
|
|
98
|
+
// ============================================================================
// Security Constants
// ============================================================================
/** Upper bound on total input size: 64 KiB, expressed in bytes. */
export const MAX_INPUT_SIZE_BYTES = 64 * 1024;
/** Upper bound on the length of a single text field (10,000; described as ~10KB). */
export const MAX_TEXT_LENGTH = 10 * 1000;
/** Upper bound on the number of entries in a context array. */
export const MAX_CONTEXT_ITEMS = 20;
/** Upper bound on the number of statements processed by the QAG pattern. */
export const MAX_STATEMENTS = 20;
/** Default LLM call timeout: 30 seconds, expressed in milliseconds. */
export const DEFAULT_LLM_TIMEOUT_MS = 30 * 1000;
/** Upper bound on JSON nesting depth. */
export const MAX_JSON_DEPTH = 5;
|
|
113
|
+
/**
 * Current log level for the module, read once from the
 * LLM_JUDGE_LOG_LEVEL environment variable when the module loads.
 * An unset or empty variable yields 'warn'.
 *
 * Recognised values, from most to least verbose:
 * - 'debug': All logs including verbose debugging info
 * - 'info': Informational messages and above
 * - 'warn': Warnings and errors only (default)
 * - 'error': Only error messages
 * - 'silent': No logging output
 *
 * The raw value is exported unchanged; shouldLog() is responsible for
 * interpreting it (case-insensitively, with a safe fallback).
 */
export const LOG_LEVEL = process.env.LLM_JUDGE_LOG_LEVEL || 'warn';
/**
 * Decide whether a message at `level` should be emitted.
 *
 * The configured threshold is matched case-insensitively and with
 * surrounding whitespace ignored. An unrecognised threshold (for example a
 * typo in LLM_JUDGE_LOG_LEVEL) is treated as the default 'warn'. Previously
 * an unrecognised value resolved to index -1, which enabled every level,
 * including 'debug'.
 *
 * @param level - Level of the message being considered
 * @param configuredLevel - Threshold to compare against; defaults to the
 *   module-wide LOG_LEVEL. Exposed mainly so the logic can be tested.
 * @returns True if the level should be logged; always false when the
 *   threshold is 'silent' or when `level` is not a recognised level
 */
function shouldLog(level, configuredLevel = LOG_LEVEL) {
    const levels = ['debug', 'info', 'warn', 'error', 'silent'];
    const requested = levels.indexOf(String(configuredLevel).trim().toLowerCase());
    // Unknown configuration must not silently turn on verbose logging.
    const thresholdIndex = requested === -1 ? levels.indexOf('warn') : requested;
    if (levels[thresholdIndex] === 'silent') {
        return false;
    }
    // An unknown `level` has index -1 and therefore never passes the threshold.
    return levels.indexOf(level) >= thresholdIndex;
}
|
|
134
|
+
// ============================================================================
// G-Eval Score Range Constants
// ============================================================================
/** Lowest score G-Eval can assign (inclusive). */
export const G_EVAL_MIN_SCORE = 1;
/** Highest score G-Eval can assign (inclusive). */
export const G_EVAL_MAX_SCORE = 5;
/** Every valid G-Eval score, ascending from the minimum to the maximum. */
export const G_EVAL_VALID_SCORES = Array.from(
    { length: G_EVAL_MAX_SCORE - G_EVAL_MIN_SCORE + 1 },
    (_unused, offset) => G_EVAL_MIN_SCORE + offset,
);
/** Width of the G-Eval score range (max - min), used for normalization. */
export const G_EVAL_SCORE_RANGE = G_EVAL_MAX_SCORE - G_EVAL_MIN_SCORE;
/**
 * Middle-of-the-range G-Eval score offered as a fallback value.
 * @note Exported for library consumers implementing their own fallback
 * logic. According to the module's notes, the core gEval() function throws
 * instead of substituting a fallback, so that failures are handled
 * explicitly.
 * @example
 * ```typescript
 * // Custom fallback in consumer code
 * const score = parseResult(response) ?? G_EVAL_DEFAULT_SCORE;
 * ```
 */
export const G_EVAL_DEFAULT_SCORE = 3;
|
|
157
|
+
// ============================================================================
// LLM Configuration Constants
// ============================================================================
/** Temperature for deterministic LLM calls such as extraction and answering. */
export const LLM_TEMPERATURE_DETERMINISTIC = 0;
/** Temperature for evaluation LLM calls, where slight variation is allowed. */
export const LLM_TEMPERATURE_EVALUATION = 0.1;
/** Fewest evaluation steps G-Eval should generate. */
export const G_EVAL_MIN_STEPS = 3;
/** Most evaluation steps G-Eval should generate. */
export const G_EVAL_MAX_STEPS = 5;
// ============================================================================
// QAG Pattern Constants
// ============================================================================
/** Shortest statement length accepted as valid during QAG extraction. */
export const MIN_STATEMENT_LENGTH = 10;
|
|
173
|
+
// ============================================================================
// Retry and Circuit Breaker Constants
// ============================================================================
/** Default number of retry attempts used by evaluateWithRetry. */
export const DEFAULT_MAX_RETRIES = 3;
/** Default failure count threshold for the circuit breaker. */
export const DEFAULT_CIRCUIT_BREAKER_THRESHOLD = 5;
/** Default circuit breaker reset timeout: 30 seconds, in milliseconds. */
export const DEFAULT_CIRCUIT_BREAKER_RESET_MS = 30 * 1000;
/** Base delay for exponential backoff: 1 second, in milliseconds. */
export const BACKOFF_BASE_MS = 1 * 1000;
// ============================================================================
// Score Validation Constants
// ============================================================================
/** Lower bound of a valid normalized score. */
export const NORMALIZED_SCORE_MIN = 0;
/** Upper bound of a valid normalized score. */
export const NORMALIZED_SCORE_MAX = 1;
|
|
191
|
+
/**
 * Lookup table from evaluation result field names to the OTel attribute
 * keys under which evaluation events record them.
 */
export const EVALUATION_OTEL_ATTRIBUTES = {
    // What was evaluated and the outcome
    evaluationName: 'gen_ai.evaluation.name',
    scoreValue: 'gen_ai.evaluation.score.value',
    scoreLabel: 'gen_ai.evaluation.score.label',
    explanation: 'gen_ai.evaluation.explanation',

    // Failure information
    errorType: 'error.type',

    // Cost of running the evaluation
    durationMs: 'gen_ai.evaluation.duration',
    inputTokens: 'gen_ai.usage.input_tokens',
    outputTokens: 'gen_ai.usage.output_tokens',
};
|
|
204
|
+
// ============================================================================
// Security Utilities
// ============================================================================
/**
 * Regular expressions that flag likely prompt-injection phrases.
 *
 * Every entry is created with the global and case-insensitive flags ('gi').
 * Array order is significant: entries are kept in their original sequence
 * because they are later combined by alternation.
 *
 * @security Optional groups are non-capturing `(?:...)` and no pattern nests
 * one quantifier inside another, which keeps matching cost predictable on
 * adversarial input.
 */
const PROMPT_INJECTION_PATTERNS = [
    String.raw`ignore\s+(?:all\s+)?previous\s+instructions`,
    String.raw`system\s+prompt`,
    String.raw`you\s+are\s+now`,
    String.raw`forget\s+everything`,
    String.raw`disregard\s+(?:all\s+)?(?:previous|prior)`,
    String.raw`new\s+instructions?:`,
    String.raw`override\s+(?:system|instructions)`,
    String.raw`act\s+as\s+(?:if\s+)?(?:you\s+are|an?)\s`,
    String.raw`pretend\s+(?:you\s+are|to\s+be)`,
    String.raw`jailbreak`,
    // "Do Anything Now" prompt; case-insensitive so "dan" / "Dan" also match
    String.raw`\bDAN\b`,
    String.raw`developer\s+mode`,
    String.raw`ignore\s+safety`,
    String.raw`bypass\s+(?:filter|restriction|rule)`,
].map((patternSource) => new RegExp(patternSource, 'gi'));
|
|
231
|
+
/**
 * Merge several regular expressions into one by alternation.
 *
 * Each pattern's `source` is wrapped in its own non-capturing group and the
 * groups are joined with `|`, so a single pass over the text can stand in
 * for one pass per pattern. The flags of the individual patterns are not
 * carried over: the result is always built with 'gi'.
 *
 * @param patterns - Regex patterns to combine (must be non-empty)
 * @returns One regex matching any of the inputs (global + case-insensitive)
 * @throws {Error} If the patterns array is empty
 */
function compilePatterns(patterns) {
    const isEmpty = patterns.length === 0;
    if (isEmpty) {
        throw new Error('Cannot compile empty patterns array');
    }
    const alternatives = [];
    for (const pattern of patterns) {
        alternatives.push(`(?:${pattern.source})`);
    }
    const combinedSource = alternatives.join('|');
    return new RegExp(combinedSource, 'gi');
}
|
|
248
|
+
/**
 * Injection-detection regex, built once at module load by combining every
 * entry of PROMPT_INJECTION_PATTERNS into a single alternation.
 *
 * @performance Lets callers scan a text with one regex instead of looping
 * over each injection pattern separately.
 *
 * NOTE: compilePatterns() creates this regex with the global flag, so
 * `test()` / `exec()` advance `lastIndex` between calls on this shared
 * instance. Reset `lastIndex` to 0 before such calls, or use
 * `String.prototype.replace` / `match`, which do not depend on prior state.
 */
const COMPILED_INJECTION_PATTERN = compilePatterns(PROMPT_INJECTION_PATTERNS);
|
|
256
|
+
/**
 * Normalize text for prompt injection detection.
 *
 * The steps run in this order:
 * 1. `removeConfusables(text)` - imported helper; per the project's notes it
 *    maps look-alike characters to their Latin equivalents (Unicode TR39).
 * 2. NFKC normalization - folds compatibility forms (e.g. full-width letters)
 *    into their canonical composed equivalents.
 * 3. Removal of zero-width / invisible characters, so that input such as
 *    "ign\u2060ore" cannot split a keyword.
 * 4. Curly single and double quotes are replaced with their ASCII forms.
 * 5. Lower-casing.
 *
 * The quote classes are written with Unicode escapes on purpose: a class that
 * contains only ASCII quotes would replace a quote with itself and silently
 * do nothing.
 *
 * @param text - Text to normalize
 * @returns Normalized, lower-cased text intended for pattern matching only
 * @see https://unicode.org/reports/tr39/#Confusable_Detection
 */
function normalizeForDetection(text) {
    // Step 1: map homoglyphs first so later steps see Latin text
    const normalized = removeConfusables(text);
    return normalized
        .normalize('NFKC') // Step 2: compatibility composition
        .replace(/[\u200B-\u200D\u2060\u180E\uFEFF\u034F\uFE00-\uFE0F]/g, '') // Step 3: strip invisible chars
        .replace(/[\u2018-\u201B]/g, "'") // Step 4: curly single quotes -> '
        .replace(/[\u201C-\u201F]/g, '"') //         curly double quotes -> "
        .toLowerCase();
}
|
|
279
|
+
/**
 * Invisible characters stripped before detection and from output:
 * U+200B-U+200D, U+2060, U+180E, U+FEFF, U+034F and the variation
 * selectors U+FE00-U+FE0F.
 */
const ZERO_WIDTH_CHARS_REGEX = /[\u200B-\u200D\u2060\u180E\uFEFF\u034F\uFE00-\uFE0F]/g;
/**
 * Curly single quotes (U+2018-U+201B), to be replaced with an ASCII apostrophe.
 * Written with escapes so the class cannot degrade into plain ASCII quotes,
 * which would make the replacement a no-op.
 */
const SMART_SINGLE_QUOTES_REGEX = /[\u2018-\u201B]/g;
/**
 * Curly double quotes (U+201C-U+201F), to be replaced with an ASCII double quote.
 */
const SMART_DOUBLE_QUOTES_REGEX = /[\u201C-\u201F]/g;
/**
 * Blank-line delimiter regex for section injection prevention.
 *
 * Matches a whole run of two or more line breaks (LF or CRLF) so that the
 * callers' replacement `'\n \n'` never leaves a blank line behind. Matching
 * exactly two newlines at a time would turn "\n\n\n" into "\n \n\n", which
 * still contains a blank line. Longer runs are collapsed to a single
 * escaped break.
 */
const DOUBLE_NEWLINE_REGEX = /(?:\r?\n){2,}/g;
/**
 * Section keyword regex for prompt delimiter escaping: a newline directly
 * followed by one of the prompt's section labels and a colon. The label is
 * captured in group 1 so callers can re-emit it with an indent.
 */
const SECTION_KEYWORD_REGEX = /\n(Output|Input|Context|Expected Output|Criteria|Score):/gi;
|
|
289
|
+
/**
 * Prepare a list of context strings for inclusion in a prompt.
 *
 * Only the first MAX_CONTEXT_ITEMS entries are kept; each kept entry is
 * passed through sanitizeForPrompt with its default length limit.
 *
 * @param context - Array of context strings to sanitize
 * @returns New array with at most MAX_CONTEXT_ITEMS sanitized strings
 * @security Every retained item goes through prompt injection filtering
 */
export function sanitizeContextArray(context) {
    const retained = context.slice(0, MAX_CONTEXT_ITEMS);
    return retained.map((item) => {
        return sanitizeForPrompt(item);
    });
}
|
|
300
|
+
/**
 * Sanitize text for safe inclusion in prompts.
 *
 * Processing order:
 * 1. The input is cut to `maxLength` characters.
 * 2. Zero-width characters are removed from the text that may be returned.
 * 3. A separate "detection" copy is built: removeConfusables(), NFKC
 *    normalization, zero-width removal and quote normalization.
 * 4. The pre-compiled injection pattern is tested against the detection copy.
 *
 * Outcome:
 * - No match: the truncated, zero-width-free ORIGINAL text is returned, so
 *   legitimate non-Latin text is preserved.
 * - Match: the NORMALIZED detection copy is returned with every match
 *   replaced by '[filtered]' (the original spelling is intentionally lost).
 *
 * In both cases blank lines and "\n<Section>:" markers are escaped so the
 * text cannot open a new prompt section.
 *
 * @param text - Text to sanitize
 * @param maxLength - Maximum allowed length (default: MAX_TEXT_LENGTH)
 * @returns Sanitized text with injection patterns replaced by '[filtered]'
 * @security Removes zero-width characters and neutralizes prompt injection patterns
 */
export function sanitizeForPrompt(text, maxLength = MAX_TEXT_LENGTH) {
    // Shared final step: neutralize prompt section delimiters
    const escapeDelimiters = (value) => value
        .replace(DOUBLE_NEWLINE_REGEX, '\n \n')
        .replace(SECTION_KEYWORD_REGEX, '\n $1:');
    // Truncate first, then drop invisible characters that could split keywords
    const visibleText = text.slice(0, maxLength).replace(ZERO_WIDTH_CHARS_REGEX, '');
    // Detection-only copy; never returned unless an injection is found
    const detectionText = removeConfusables(visibleText)
        .normalize('NFKC')
        .replace(ZERO_WIDTH_CHARS_REGEX, '')
        .replace(SMART_SINGLE_QUOTES_REGEX, "'")
        .replace(SMART_DOUBLE_QUOTES_REGEX, '"');
    const injectionFound = COMPILED_INJECTION_PATTERN.test(detectionText);
    // The pattern is global: test() moved lastIndex, so rewind it
    COMPILED_INJECTION_PATTERN.lastIndex = 0;
    if (!injectionFound) {
        return escapeDelimiters(visibleText);
    }
    const filtered = detectionText.replace(COMPILED_INJECTION_PATTERN, '[filtered]');
    return escapeDelimiters(filtered);
}
|
|
351
|
+
/**
 * Build a sanitizer that recognizes extra injection patterns.
 *
 * The returned function behaves like sanitizeForPrompt, except that its
 * detection regex is compiled from the default PROMPT_INJECTION_PATTERNS
 * followed by `additionalPatterns`. Validation and compilation happen once,
 * when the factory is called. Note that compilePatterns keeps only each
 * pattern's source; the combined regex always uses the 'gi' flags.
 *
 * @param additionalPatterns - Additional regex patterns to detect as prompt injection
 * @returns Sanitizer function with signature (text: string, maxLength?: number) => string
 *   that truncates to maxLength, applies all default patterns plus additionalPatterns,
 *   and returns sanitized text with '[filtered]' markers for detected injections.
 * @throws {InputValidationError} If additionalPatterns contains non-RegExp items
 *
 * @example
 * ```typescript
 * const customSanitizer = createSanitizer([
 *   /my\s+custom\s+attack\s+pattern/gi,
 *   /another\s+pattern/gi,
 * ]);
 * const sanitized = customSanitizer(userInput);
 * const truncated = customSanitizer(longInput, 1000); // Override maxLength per-call
 * ```
 *
 * @security Custom patterns MUST avoid ReDoS vulnerabilities:
 * - Avoid nested quantifiers like (a+)+ or (a*)*
 * - Prefer non-capturing groups (?:...) when grouping is not needed
 * - Check patterns with a tool such as safe-regex before deployment
 * - Example vulnerable: /^(a+)+$/ with input "aaaaaaaaaaaaaaaaX"
 * - Example safe: /^a+$/ or /^(?:a+)$/
 */
export function createSanitizer(additionalPatterns = []) {
    // Reject anything that is not a RegExp before compiling
    for (let index = 0; index < additionalPatterns.length; index += 1) {
        const candidate = additionalPatterns[index];
        if (candidate instanceof RegExp) {
            continue;
        }
        throw new InputValidationError(`additionalPatterns[${index}] must be a RegExp, got ${typeof candidate}`, 'additionalPatterns', 'type');
    }
    // One combined regex, built once per sanitizer
    const compiledPattern = compilePatterns([...PROMPT_INJECTION_PATTERNS, ...additionalPatterns]);
    const escapeDelimiters = (value) => value
        .replace(DOUBLE_NEWLINE_REGEX, '\n \n')
        .replace(SECTION_KEYWORD_REGEX, '\n $1:');
    return (text, maxLength = MAX_TEXT_LENGTH) => {
        // Truncate, then strip zero-width characters from the returnable text
        const visibleText = text.slice(0, maxLength).replace(ZERO_WIDTH_CHARS_REGEX, '');
        // Normalized copy used for detection only
        const detectionText = removeConfusables(visibleText)
            .normalize('NFKC')
            .replace(ZERO_WIDTH_CHARS_REGEX, '')
            .replace(SMART_SINGLE_QUOTES_REGEX, "'")
            .replace(SMART_DOUBLE_QUOTES_REGEX, '"');
        const injectionFound = compiledPattern.test(detectionText);
        compiledPattern.lastIndex = 0; // global regex: rewind after test()
        if (!injectionFound) {
            // Nothing suspicious: keep the original characters
            return escapeDelimiters(visibleText);
        }
        // Suspicious: return the normalized copy with matches masked
        return escapeDelimiters(detectionText.replace(compiledPattern, '[filtered]'));
    };
}
|
|
418
|
+
/**
 * Validate test case input sizes against security limits.
 *
 * Checks, in order: input length, output length, number of context items,
 * type and length of each context item, expected-output length, and finally
 * the combined size of all text fields.
 *
 * Per-field limits are expressed in characters (`.length`). The combined
 * limit is expressed in bytes, so it is measured as UTF-8 byte length;
 * counting characters there would undercount any non-ASCII text.
 *
 * @param testCase - Test case to validate (input, output, optional context
 *   array and optional expectedOutput)
 * @returns void
 * @throws {InputValidationError} If any field exceeds MAX_TEXT_LENGTH
 * @throws {InputValidationError} If context exceeds MAX_CONTEXT_ITEMS
 * @throws {InputValidationError} If a context item is not a string
 * @throws {InputValidationError} If total size exceeds MAX_INPUT_SIZE_BYTES
 * @security Prevents resource exhaustion by enforcing size limits
 */
export function validateTestCase(testCase) {
    if (testCase.input.length > MAX_TEXT_LENGTH) {
        throw new InputValidationError(`Input exceeds ${MAX_TEXT_LENGTH} character limit`, 'input', 'maxLength');
    }
    if (testCase.output.length > MAX_TEXT_LENGTH) {
        throw new InputValidationError(`Output exceeds ${MAX_TEXT_LENGTH} character limit`, 'output', 'maxLength');
    }
    if (testCase.context && testCase.context.length > MAX_CONTEXT_ITEMS) {
        throw new InputValidationError(`Context exceeds ${MAX_CONTEXT_ITEMS} items limit`, 'context', 'maxLength');
    }
    // Validate individual context item types and sizes
    if (testCase.context) {
        for (let i = 0; i < testCase.context.length; i++) {
            const item = testCase.context[i];
            // Context items must be strings; later steps call string methods on them
            if (typeof item !== 'string') {
                throw new InputValidationError(`Context item ${i} must be a string, got ${typeof item}`, 'context', 'type');
            }
            if (item.length > MAX_TEXT_LENGTH) {
                throw new InputValidationError(`Context item ${i} exceeds ${MAX_TEXT_LENGTH} character limit`, 'context', 'maxLength');
            }
        }
    }
    if (testCase.expectedOutput && testCase.expectedOutput.length > MAX_TEXT_LENGTH) {
        throw new InputValidationError(`Expected output exceeds ${MAX_TEXT_LENGTH} character limit`, 'expectedOutput', 'maxLength');
    }
    // Individual fields may pass while their sum exceeds MAX_INPUT_SIZE_BYTES.
    // Measure real UTF-8 bytes: one character can occupy up to four bytes.
    const utf8Size = (value) => Buffer.byteLength(value, 'utf8');
    let totalBytes = utf8Size(testCase.input) + utf8Size(testCase.output);
    if (testCase.context) {
        for (const item of testCase.context) {
            totalBytes += utf8Size(item);
        }
    }
    if (testCase.expectedOutput) {
        totalBytes += utf8Size(testCase.expectedOutput);
    }
    if (totalBytes > MAX_INPUT_SIZE_BYTES) {
        throw new InputValidationError(`Total test case size ${totalBytes} exceeds ${MAX_INPUT_SIZE_BYTES} bytes`, 'testCase', 'maxSize');
    }
}
|
|
468
|
+
/**
 * Safe JSON parsing with size and depth limits to prevent DoS attacks.
 *
 * The size guard runs before parsing and compares the UTF-8 byte length of
 * the text with MAX_INPUT_SIZE_BYTES (a byte limit, so characters are not
 * counted). The depth guard runs after parsing: the root value is at depth 0,
 * every array element or object property value is one level deeper, and any
 * value found at a depth greater than `maxDepth` is rejected.
 *
 * @param text - JSON text to parse
 * @param maxDepth - Maximum nesting depth (default: MAX_JSON_DEPTH)
 * @returns Parsed value
 * @throws {Error} If the text is too large or the parsed value is too deeply nested
 * @throws {SyntaxError} If the text is not valid JSON (raised by JSON.parse)
 */
export function safeJSONParse(text, maxDepth = MAX_JSON_DEPTH) {
    // Limit size in bytes; String() mirrors the coercion JSON.parse applies
    if (Buffer.byteLength(String(text), 'utf8') > MAX_INPUT_SIZE_BYTES) {
        throw new Error('JSON response too large');
    }
    const parsed = JSON.parse(text);
    // Walk the parsed value; recursion stops at maxDepth + 1 because it throws there
    const checkDepth = (value, depth = 0) => {
        if (depth > maxDepth) {
            throw new Error('JSON nesting too deep');
        }
        if (typeof value !== 'object' || value === null) {
            return; // primitives have no children
        }
        if (Array.isArray(value)) {
            for (const element of value) {
                checkDepth(element, depth + 1);
            }
            return;
        }
        for (const key in value) {
            if (Object.prototype.hasOwnProperty.call(value, key)) {
                checkDepth(value[key], depth + 1);
            }
        }
    };
    checkDepth(parsed);
    return parsed;
}
|
|
505
|
+
/**
 * Run an async function, rejecting if it does not settle in time.
 *
 * `fn` is invoked with an AbortSignal and raced against a timer:
 * - `fn` settles first with a value: the timer is cleared and the value is
 *   returned; the signal is left un-aborted.
 * - `fn` rejects (or throws) first: the timer is cleared, the signal is
 *   aborted, and the same error is re-thrown.
 * - The timer fires first: the signal is aborted and the call rejects with
 *   LLMTimeoutError(timeoutMs). `fn` itself is not forcibly stopped; it only
 *   observes the abort through the signal it was given.
 *
 * @param fn - Async function to execute, receives AbortSignal for cancellation
 * @param timeoutMs - Timeout in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Result of the function
 * @throws {LLMTimeoutError} If function times out
 */
export async function withTimeout(fn, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const controller = new AbortController();
    const { signal } = controller;
    let timer;
    const stopTimer = () => {
        if (timer === undefined) {
            return;
        }
        clearTimeout(timer);
        timer = undefined;
    };
    const deadline = new Promise((_resolve, reject) => {
        timer = setTimeout(() => {
            // Skip if something already aborted the signal
            if (signal.aborted) {
                return;
            }
            controller.abort(); // tell fn the deadline has passed
            stopTimer();
            reject(new LLMTimeoutError(timeoutMs));
        }, timeoutMs);
    });
    let outcome;
    try {
        outcome = await Promise.race([fn(signal), deadline]);
    }
    catch (error) {
        stopTimer();
        // A timeout has already aborted; a failure inside fn has not
        if (!signal.aborted) {
            controller.abort();
        }
        throw error;
    }
    // Success path: release the timer, leave the signal un-aborted
    stopTimer();
    return outcome;
}
|
|
551
|
+
// ============================================================================
|
|
552
|
+
// G-Eval Pattern Helpers
|
|
553
|
+
// ============================================================================
|
|
554
|
+
/**
 * Build evaluation prompt from config and test case.
 * Used by G-Eval pattern for structured evaluation prompts.
 *
 * The prompt always starts with the evaluator name, the criteria and the
 * evaluation steps. The "Input", "Output", "Context" and "Expected Output"
 * sections are appended only when listed in `config.evaluationParams` (and,
 * for the optional fields, when the test case provides them). It ends with
 * the scoring instruction.
 *
 * @param config - G-Eval configuration with name, criteria and evaluationParams
 * @param testCase - Test case containing input, output, and optional context
 * @param steps - Evaluation steps generated by chain-of-thought
 * @returns Formatted prompt string for the judge model
 * @security Test-case fields are sanitized for prompt injection. `config.name`,
 *   `config.criteria` and `steps` are inserted as given.
 */
export function buildEvalPrompt(config, testCase, steps) {
    const parts = [
        `You are evaluating: ${config.name}`,
        `\nCriteria: ${config.criteria}`,
        `\nEvaluation Steps:\n${steps}`,
    ];
    if (config.evaluationParams.includes('input')) {
        parts.push(`\nInput: ${sanitizeForPrompt(testCase.input)}`);
    }
    if (config.evaluationParams.includes('output')) {
        parts.push(`\nOutput: ${sanitizeForPrompt(testCase.output)}`);
    }
    if (config.evaluationParams.includes('context') && testCase.context) {
        // Items are sanitized one by one, but joining them with '\n' puts the
        // START of every later item at the beginning of a line. An item that
        // begins with "Output:" or "Score:" would then look like a new prompt
        // section, and an item ending in '\n' would create a blank line.
        // Re-apply the delimiter escaping to the joined text.
        const joinedContext = sanitizeContextArray(testCase.context)
            .join('\n')
            .replace(DOUBLE_NEWLINE_REGEX, '\n \n')
            .replace(SECTION_KEYWORD_REGEX, '\n $1:');
        parts.push(`\nContext: ${joinedContext}`);
    }
    if (config.evaluationParams.includes('expectedOutput') && testCase.expectedOutput) {
        parts.push(`\nExpected Output: ${sanitizeForPrompt(testCase.expectedOutput)}`);
    }
    parts.push(`\nProvide a score from ${G_EVAL_MIN_SCORE}-${G_EVAL_MAX_SCORE} and explain your reasoning.`);
    return parts.join('');
}
|
|
586
|
+
/**
 * Normalize score using token log probabilities.
 *
 * For every logprob entry whose trimmed token parses (base 10) to one of
 * `validScores`, `exp(logprob)` is added to that score's probability mass.
 * The result is the probability-weighted average of the scores that
 * received mass.
 *
 * Malformed entries are skipped instead of corrupting the result: entries
 * that are null/undefined, whose token is not a string, or whose logprob is
 * not a number (or is NaN). Without this guard a `null` logprob would count
 * as probability 1 (`Math.exp(null) === 1`) and a NaN would make the whole
 * result NaN.
 *
 * @param logprobs - Token log probabilities from LLM response ({ token, logprob } entries)
 * @param validScores - Valid score values (e.g., [1, 2, 3, 4, 5])
 * @returns Normalized score as weighted average
 * @throws {ScoreNormalizationError} When validScores contains a non-finite value
 * @throws {ScoreNormalizationError} When no valid score tokens found in logprobs
 */
export function normalizeWithLogprobs(logprobs, validScores) {
    // Validate validScores contains only finite numbers
    if (!validScores.every((s) => typeof s === 'number' && Number.isFinite(s))) {
        throw new ScoreNormalizationError('validScores must contain only finite numbers');
    }
    // Probability mass per score; doubles as the membership set
    const scoreProbs = new Map();
    for (const score of validScores) {
        scoreProbs.set(score, 0);
    }
    for (const entry of logprobs) {
        if (entry === null || entry === undefined) {
            continue;
        }
        const { token, logprob } = entry;
        if (typeof token !== 'string' || typeof logprob !== 'number' || Number.isNaN(logprob)) {
            continue;
        }
        const scoreValue = Number.parseInt(token.trim(), 10);
        if (!scoreProbs.has(scoreValue)) {
            continue;
        }
        scoreProbs.set(scoreValue, scoreProbs.get(scoreValue) + Math.exp(logprob));
    }
    let weightedSum = 0;
    let totalProb = 0;
    for (const [score, prob] of scoreProbs) {
        weightedSum += score * prob;
        totalProb += prob;
    }
    if (totalProb === 0) {
        throw new ScoreNormalizationError('No valid score tokens found in logprobs - cannot normalize');
    }
    return weightedSum / totalProb;
}
|
|
622
|
+
// ============================================================================
|
|
623
|
+
// Score Extraction Pattern Constants
|
|
624
|
+
// ============================================================================
|
|
625
|
+
/**
 * Explicit score declaration: the word "score", then one or more colons or
 * whitespace characters, then a single digit 1-5 (captured in group 1).
 * Case-insensitive.
 * @example "Score: 4" → captures "4"
 * @example "score 3" → captures "3"
 */
const EXPLICIT_SCORE_PATTERN = /\bscore[:\s]+([1-5])\b/i;
/**
 * Rating declaration: same shape as the explicit score pattern but with the
 * word "rating". Case-insensitive; digit captured in group 1.
 * @example "Rating: 4" → captures "4"
 */
const RATING_PATTERN = /\brating[:\s]+([1-5])\b/i;
/**
 * Fraction of five: a digit 1-5 followed by "out of" or "/" and then 5.
 * Case-insensitive; the numerator is captured in group 1.
 * @example "4 out of 5" → captures "4"
 * @example "3/5" → captures "3"
 */
const FRACTION_PATTERN = /\b([1-5])\s*(?:out of|\/)\s*5\b/i;
/**
 * A digit 1-5 that is alone on a line (surrounding whitespace allowed).
 * Uses the multiline flag so any line of the text can match.
 * Digit captured in group 1.
 * @example " 3 " (with newlines) → captures "3"
 */
const STANDALONE_DIGIT_PATTERN = /^\s*([1-5])\s*$/m;
/**
 * Score extraction patterns, tried in this order (most specific first).
 * All of them capture a single digit in the range 1-5 in group 1.
 */
const SCORE_PATTERNS = [
    EXPLICIT_SCORE_PATTERN,
    RATING_PATTERN,
    FRACTION_PATTERN,
    STANDALONE_DIGIT_PATTERN,
];
/** Number of trailing characters inspected by the fallback digit search */
const SCORE_FALLBACK_WINDOW = 100;
/**
 * Extract score from LLM response text.
 *
 * Each entry of SCORE_PATTERNS is matched against the whole text in order;
 * the first pattern that matches determines the score (its first match in
 * the text). If none matches, the last standalone digit 1-5 within the final
 * SCORE_FALLBACK_WINDOW characters is used.
 *
 * @param text - LLM response text
 * @returns Extracted integer score in the range 1-5
 * @throws {ScoreNormalizationError} If no valid score found
 */
export function extractScoreFromText(text) {
    // Pass 1: structured declarations anywhere in the text
    for (let i = 0; i < SCORE_PATTERNS.length; i += 1) {
        const found = text.match(SCORE_PATTERNS[i]);
        if (found !== null) {
            return Number.parseInt(found[1], 10);
        }
    }
    // Pass 2: last bare digit near the end, where a final verdict usually sits
    const candidates = text.slice(-SCORE_FALLBACK_WINDOW).match(/\b([1-5])\b/g);
    if (candidates === null || candidates.length === 0) {
        // Fail loudly instead of inventing a default score
        throw new ScoreNormalizationError('No valid score found in LLM response');
    }
    return Number.parseInt(candidates[candidates.length - 1], 10);
}
|
|
696
|
+
/**
 * G-Eval: chain-of-thought evaluation with optional log-probability scoring.
 *
 * Flow:
 * 1. The test case is size-validated.
 * 2. The judge is asked to produce evaluation steps for `config.criteria`.
 * 3. The judge is asked to score the test case using those steps, with
 *    logprobs requested.
 * 4. The raw 1-5 score comes from the logprobs when the response carries
 *    any; if that normalization throws, or no logprobs are present, it is
 *    extracted from the response text instead.
 * 5. The raw score is rescaled with (raw - G_EVAL_MIN_SCORE) / G_EVAL_SCORE_RANGE
 *    and checked with isValidScore.
 *
 * Both judge calls use `config.temperature`, falling back to
 * LLM_TEMPERATURE_EVALUATION, and each is wrapped in withTimeout. The abort
 * signal supplied by withTimeout is not forwarded to `llm.generate`.
 *
 * @param llm - LLM provider for judge calls
 * @param config - G-Eval configuration with name, criteria, and evaluation parameters
 * @param testCase - Test case to evaluate
 * @param timeoutMs - Timeout for each LLM call in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Evaluation result with normalized score, reason, and raw response
 *   (reason and rawResponse are both the judge's response text)
 * @throws {InputValidationError} If test case exceeds size limits
 * @throws {Error} If an LLM call times out or the score is invalid
 * @security Input validation, prompt injection protection, and timeout protection
 */
export async function gEval(llm, config, testCase, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    validateTestCase(testCase);
    // Temperature is resolved at call time, once per judge request
    const judgeOptions = (extra) => ({
        temperature: config.temperature ?? LLM_TEMPERATURE_EVALUATION,
        ...extra,
    });
    // Step 1: ask the judge for evaluation steps (chain of thought)
    const stepsPrompt = [
        '',
        `Given the criteria: ${config.criteria}`,
        'Generate detailed evaluation steps to assess this criterion.',
        `List ${G_EVAL_MIN_STEPS}-${G_EVAL_MAX_STEPS} specific steps the evaluator should follow.`,
        '',
    ].join('\n');
    const stepsResponse = await withTimeout((_signal) => llm.generate(stepsPrompt, judgeOptions({})), timeoutMs);
    // Step 2: score the test case using the generated steps
    const evalPrompt = buildEvalPrompt(config, testCase, stepsResponse.text);
    const response = await withTimeout((_signal) => llm.generate(evalPrompt, judgeOptions({ logprobs: true })), timeoutMs);
    // Step 3: prefer the logprob-weighted score, fall back to text parsing
    const hasLogprobs = Boolean(response.logprobs && response.logprobs.length > 0);
    let rawScore;
    if (!hasLogprobs) {
        rawScore = extractScoreFromText(response.text);
    }
    else {
        try {
            rawScore = normalizeWithLogprobs(response.logprobs, [...G_EVAL_VALID_SCORES]);
        }
        catch {
            // Logprobs held no usable score tokens: parse the text instead
            rawScore = extractScoreFromText(response.text);
        }
    }
    // Step 4: rescale from the G-Eval range to a normalized score
    const normalizedScore = (rawScore - G_EVAL_MIN_SCORE) / G_EVAL_SCORE_RANGE;
    if (!isValidScore(normalizedScore)) {
        throw new Error(`Invalid normalized score: ${normalizedScore} (raw: ${rawScore})`);
    }
    const { text } = response;
    return {
        score: normalizedScore,
        reason: text,
        rawResponse: text,
    };
}
|
|
750
|
+
// ============================================================================
|
|
751
|
+
// QAG Pattern Helpers
|
|
752
|
+
// ============================================================================
|
|
753
|
+
/**
 * Extract atomic, independently verifiable statements from a text.
 *
 * The sanitized text is sent to the LLM with a request for a JSON array of
 * claims. If the reply parses (via safeJSONParse) to an array, its non-empty
 * string entries are returned. If parsing fails or the reply is not an
 * array, a warning is logged (when shouldLog('warn') allows it) and the
 * ORIGINAL `output` is split into sentences with `sbd`; sentences longer
 * than MIN_STATEMENT_LENGTH are returned.
 *
 * @param llm - LLM provider for extraction
 * @param output - Text to extract statements from
 * @param timeoutMs - Timeout for LLM call in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Array of statements (at most MAX_STATEMENTS)
 * @throws {Error} If LLM call times out
 * @security Output sanitized, safe JSON parsing with depth limits, result capped at MAX_STATEMENTS
 */
export async function extractStatements(llm, output, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const sanitizedOutput = sanitizeForPrompt(output);
    const prompt = [
        '',
        'Extract all factual claims from the following text as a JSON array of strings.',
        'Each claim should be a single, atomic statement that can be verified independently.',
        '',
        `Text: ${sanitizedOutput}`,
        '',
        'Return ONLY a JSON array, e.g.: ["claim 1", "claim 2", "claim 3"]',
        '',
    ].join('\n');
    const response = await withTimeout((_signal) => llm.generate(prompt, { temperature: LLM_TEMPERATURE_DETERMINISTIC }), timeoutMs);
    const isUsableStatement = (item) => typeof item === 'string' && item.trim().length > 0;
    try {
        const parsed = safeJSONParse(response.text);
        if (!Array.isArray(parsed)) {
            // Handled by the catch below, like a parse failure
            throw new Error(`Expected array from statement extraction, got ${typeof parsed}`);
        }
        // Keep non-empty strings only, capped at MAX_STATEMENTS
        return parsed.filter(isUsableStatement).slice(0, MAX_STATEMENTS);
    }
    catch (error) {
        // Not fatal: report it (subject to the log level) and fall through
        if (shouldLog('warn')) {
            let responsePreview = response.text;
            if (response.text.length > 200) {
                responsePreview = response.text.slice(0, 200) + '...';
            }
            console.warn('[llm-as-judge] Statement extraction JSON parse failed, using sentence fallback.', {
                error: error instanceof Error ? error.message : String(error),
                responsePreview,
                outputLength: output.length,
            });
        }
    }
    // Fallback: sentence boundary detection on the caller's original text
    const sentences = sbd.sentences(output);
    return sentences
        .map((sentence) => sentence.trim())
        .filter((sentence) => sentence.length > MIN_STATEMENT_LENGTH)
        .slice(0, MAX_STATEMENTS);
}
|
|
813
|
+
/**
 * Turn a statement into a yes/no question that can verify it.
 *
 * The statement is sanitized, embedded in a fixed instruction prompt, and
 * sent to the LLM at LLM_TEMPERATURE_DETERMINISTIC under a timeout. The
 * reply text is returned with surrounding whitespace removed.
 *
 * @param llm - LLM provider for question generation
 * @param statement - Statement to verify
 * @param timeoutMs - Timeout for LLM call in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Trimmed question text produced by the LLM
 * @throws {Error} If LLM call times out
 * @security Statement is sanitized for prompt injection before use
 */
export async function generateVerificationQuestion(llm, statement, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const prompt = [
        '',
        'Convert this statement into a yes/no question that can verify its accuracy:',
        '',
        `Statement: ${sanitizeForPrompt(statement)}`,
        '',
        'Return ONLY the question, nothing else.',
        '',
    ].join('\n');
    const askJudge = (_signal) => llm.generate(prompt, { temperature: LLM_TEMPERATURE_DETERMINISTIC });
    const { text } = await withTimeout(askJudge, timeoutMs);
    return text.trim();
}
|
|
835
|
+
/**
 * Answer verification question using provided context.
 *
 * The question and context are sanitized, the LLM is asked to reply with
 * "yes", "no" or "unknown", and the free-text reply is classified:
 * 1. A reply that begins with "unknown" is 'unknown', even if later words
 *    such as "true" or "no" appear in the explanation.
 * 2. Otherwise whole-word yes-like terms (yes, yeah, correct, true,
 *    affirmative) and no-like terms (no, nope, incorrect, false, negative)
 *    are searched; if both kinds occur, whichever appears first wins.
 * 3. If neither kind occurs, the answer is 'unknown'.
 *
 * @param llm - LLM provider for answering
 * @param question - Yes/no question to answer
 * @param context - Context documents to use for answering
 * @param timeoutMs - Timeout for LLM call in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns 'yes', 'no', or 'unknown' based on context
 * @throws {Error} If LLM call times out
 * @security Question and context are sanitized for prompt injection
 */
export async function answerQuestion(llm, question, context, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const sanitizedQuestion = sanitizeForPrompt(question);
    const sanitizedContext = sanitizeContextArray(context);
    const prompt = `
Based ONLY on the following context, answer the question with "yes", "no", or "unknown".

Context:
${sanitizedContext.join('\n\n')}

Question: ${sanitizedQuestion}

Answer (yes/no/unknown):
`;
    const response = await withTimeout((_signal) => llm.generate(prompt, { temperature: LLM_TEMPERATURE_DETERMINISTIC }), timeoutMs);
    const normalized = response.text.trim().toLowerCase();
    // An explicit leading "unknown" is the model's answer; do not let words in
    // the explanation that follows ("...whether this is true") override it.
    if (/^\W*unknown\b/.test(normalized)) {
        return 'unknown';
    }
    // Word boundaries avoid false positives like "yesterday" or "notwithstanding"
    const yesPos = normalized.search(/\b(yes|yeah|correct|true|affirmative)\b/);
    const noPos = normalized.search(/\b(no|nope|incorrect|false|negative)\b/);
    if (yesPos === -1 && noPos === -1) {
        return 'unknown';
    }
    if (noPos === -1) {
        return 'yes';
    }
    if (yesPos === -1) {
        return 'no';
    }
    // Both present: the term that appears first decides
    return yesPos < noPos ? 'yes' : 'no';
}
|
|
876
|
+
/**
 * QAG (Question-Answer Generation) faithfulness evaluation.
 *
 * The output is broken into atomic statements, each statement is turned into a
 * yes/no verification question, and each question is answered against the
 * context. The score is the share of answered questions whose answer is 'yes'.
 *
 * Both fan-out phases use Promise.allSettled, so an individual failed LLM call
 * is dropped from the calculation instead of aborting the evaluation. Only
 * when an entire phase produces nothing is an error thrown, because returning
 * 0 would wrongly read as "unfaithful" rather than "evaluation failed".
 *
 * @param llm - LLM provider for all operations
 * @param input - Original user input (unused but included for API consistency)
 * @param output - LLM output to evaluate for faithfulness
 * @param context - Context documents for verification
 * @param options - Optional configuration object
 * @param options.timeoutMs - Timeout for each LLM call (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Faithfulness score (0-1) as proportion of verified statements; 1 when no statements were extracted
 * @throws {Error} If every question-generation call fails, or every answer call fails
 * @security Timeout protection on all calls; graceful degradation on partial failures
 * @performance Makes up to 2N+1 LLM calls for N extracted statements:
 *   - 1 call to extract atomic statements from output
 *   - N parallel calls to generate verification questions (one per statement)
 *   - up to N parallel calls to answer the questions that were generated
 *   NOTE(review): the earlier notes state N is capped at MAX_STATEMENTS=20
 *   (41 calls, typically 10-30s); that cap lives in extractStatements, which is
 *   not visible from here - confirm.
 */
export async function qagEvaluate(llm, input, output, context, options) {
    const timeoutMs = options?.timeoutMs ?? DEFAULT_LLM_TIMEOUT_MS;
    const isFulfilled = (outcome) => outcome.status === 'fulfilled';
    const valueOf = (outcome) => outcome.value;
    // Phase 1: split the output into individual claims.
    const statements = await extractStatements(llm, output, timeoutMs);
    if (statements.length === 0) {
        // Nothing was claimed, so nothing can be unfaithful.
        return 1;
    }
    // Phase 2: one verification question per claim; failed generations are dropped.
    const questionOutcomes = await Promise.allSettled(statements.map((statement) => generateVerificationQuestion(llm, statement, timeoutMs)));
    const questions = questionOutcomes.filter(isFulfilled).map(valueOf);
    if (questions.length === 0) {
        throw new Error('QAG evaluation failed: no verification questions generated');
    }
    // Phase 3: answer every surviving question against the context.
    const answerOutcomes = await Promise.allSettled(questions.map((question) => answerQuestion(llm, question, context, timeoutMs)));
    const answers = answerOutcomes.filter(isFulfilled).map(valueOf);
    if (answers.length === 0) {
        throw new Error('QAG evaluation failed: no verification answers obtained');
    }
    // Phase 4: the score is the fraction of obtained answers that are 'yes'.
    const confirmed = answers.filter((answer) => answer === 'yes').length;
    return confirmed / answers.length;
}
|
|
939
|
+
// ============================================================================
|
|
940
|
+
// Bias Mitigation
|
|
941
|
+
// ============================================================================
|
|
942
|
+
/**
 * Winner labels a pairwise evaluate function is allowed to return:
 * 'A' (the output passed first), 'B' (the output passed second) or 'tie'.
 * validatePairwiseResult rejects any result whose `winner` is not in this list.
 */
const VALID_PAIRWISE_WINNERS = ['A', 'B', 'tie'];
|
|
944
|
+
/**
 * Validate that an evaluate function returned a valid pairwise result.
 * Runtime type guard to prevent type safety bypass from external functions.
 *
 * A result is valid when it is a non-null object whose `winner` property is a
 * string contained in VALID_PAIRWISE_WINNERS. Valid results return normally.
 *
 * @param result - Result from evaluate function to validate
 * @param ordering - Ordering label for error message ('AB' or 'BA')
 * @returns void
 * @throws {InputValidationError} If result is not a valid pairwise result object
 */
function validatePairwiseResult(result, ordering) {
    const isValid = Boolean(result) &&
        typeof result === 'object' &&
        typeof result.winner === 'string' &&
        VALID_PAIRWISE_WINNERS.includes(result.winner);
    if (isValid) {
        return;
    }
    // JSON.stringify throws on circular structures and BigInt values; fall back to a
    // type tag so the caller always receives the InputValidationError, never a TypeError.
    let rendered;
    try {
        rendered = JSON.stringify(result);
    }
    catch {
        rendered = Object.prototype.toString.call(result);
    }
    throw new InputValidationError(`Invalid evaluate result for ${ordering} ordering: expected { winner: 'A' | 'B' | 'tie' }, got ${rendered}`, 'evaluate', 'type');
}
|
|
960
|
+
/**
 * Pairwise comparison with position-bias mitigation.
 *
 * The evaluate function is run twice in parallel, once with the outputs in
 * (A, B) order and once in (B, A) order. A winner is reported only when both
 * runs agree after the swapped run is translated back to the original
 * labelling; any disagreement, or a tie in either run, yields 'tie'.
 *
 * @param evaluate - Evaluation function that compares two outputs; called as evaluate(input, first, second)
 * @param input - User input
 * @param outputA - First output to compare
 * @param outputB - Second output to compare
 * @returns Winner ('A', 'B', or 'tie')
 * @throws {Error} If evaluate function is not provided
 * @throws {InputValidationError} If input, outputA, or outputB is empty
 * @throws {InputValidationError} If any string exceeds MAX_TEXT_LENGTH
 * @throws {InputValidationError} If either evaluate call returns an invalid result
 */
export async function mitigatedPairwiseEval(evaluate, input, outputA, outputB) {
    if (typeof evaluate !== 'function') {
        throw new Error('mitigatedPairwiseEval requires an evaluate function');
    }
    // Checked in this order: input, outputA, outputB.
    const textFields = [
        { label: 'Input', field: 'input', value: input },
        { label: 'Output A', field: 'outputA', value: outputA },
        { label: 'Output B', field: 'outputB', value: outputB },
    ];
    // Pass 1: every text must be non-empty (whitespace-only counts as empty).
    for (const { label, field, value } of textFields) {
        if (!value || value.trim().length === 0) {
            throw new InputValidationError(`${label} cannot be empty`, field, 'required');
        }
    }
    // Pass 2: size limits, to prevent resource exhaustion.
    for (const { label, field, value } of textFields) {
        if (value.length > MAX_TEXT_LENGTH) {
            throw new InputValidationError(`${label} exceeds ${MAX_TEXT_LENGTH} character limit`, field, 'maxLength');
        }
    }
    // Run both orderings concurrently so position bias can be detected.
    const forwardRun = evaluate(input, outputA, outputB);
    const swappedRun = evaluate(input, outputB, outputA);
    const [forward, swapped] = await Promise.all([forwardRun, swappedRun]);
    // The evaluate function is external, so its results are checked at runtime.
    validatePairwiseResult(forward, 'AB');
    validatePairwiseResult(swapped, 'BA');
    // Translate the swapped run back to the original labelling:
    // its 'A' is our B, its 'B' is our A, and a tie stays a tie.
    let swappedWinner;
    switch (swapped.winner) {
        case 'A':
            swappedWinner = 'B';
            break;
        case 'B':
            swappedWinner = 'A';
            break;
        default:
            swappedWinner = 'tie';
    }
    // Only a win confirmed by both orderings counts.
    const runsAgree = forward.winner === swappedWinner;
    if (runsAgree && forward.winner !== 'tie') {
        return forward.winner;
    }
    return 'tie';
}
|
|
1024
|
+
/**
 * Multi-judge panel evaluation.
 * Runs every judge on the same test case in parallel and returns the median score.
 *
 * For an even number of judges the median is the mean of the two middle scores.
 * Judges are awaited with Promise.all, so a single rejecting judge rejects the
 * whole panel.
 *
 * @param evaluators - Array of evaluation functions for different models; each is called as evaluate(testCase)
 * @param testCase - Test case to evaluate
 * @returns Median score from all judges
 * @throws {Error} If evaluators is not an array or is empty
 * @throws {Error} If any judge returns a value that is not a finite number
 */
export async function panelEvaluation(evaluators, testCase) {
    if (!Array.isArray(evaluators) || evaluators.length === 0) {
        throw new Error('panelEvaluation requires at least one evaluator');
    }
    const scores = await Promise.all(evaluators.map((evaluate) => evaluate(testCase)));
    // A NaN or non-numeric score makes the numeric comparator below return NaN, which leaves
    // the sort order unspecified and can produce a plausible-looking but wrong median.
    const invalidIndex = scores.findIndex((score) => typeof score !== 'number' || !Number.isFinite(score));
    if (invalidIndex !== -1) {
        throw new Error(`panelEvaluation: evaluator at index ${invalidIndex} returned a non-finite score: ${String(scores[invalidIndex])}`);
    }
    // Sort a copy so the array of resolved scores is left untouched.
    const sorted = [...scores].sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    if (sorted.length % 2 === 0) {
        return (sorted[mid - 1] + sorted[mid]) / 2;
    }
    return sorted[mid];
}
|
|
1046
|
+
// ============================================================================
|
|
1047
|
+
// Production Utilities
|
|
1048
|
+
// ============================================================================
|
|
1049
|
+
/**
 * Check whether a value is a usable normalized score.
 *
 * A score is valid when it is of type number, is not NaN, and lies within
 * [NORMALIZED_SCORE_MIN, NORMALIZED_SCORE_MAX] inclusive (documented as the
 * range [0, 1]; the constants are defined elsewhere in this file).
 *
 * @param score - Score value to validate
 * @returns True if score is a number inside the inclusive range, false otherwise
 */
export function isValidScore(score) {
    // Anything that is not a real number is rejected before the range is consulted.
    if (typeof score !== 'number' || Number.isNaN(score)) {
        return false;
    }
    const aboveFloor = score >= NORMALIZED_SCORE_MIN;
    return aboveFloor && score <= NORMALIZED_SCORE_MAX;
}
|
|
1058
|
+
/**
 * Maximum exponent for backoff calculation.
 * Derived from: Math.floor(Math.log2(MAX_BACKOFF_MS / BACKOFF_BASE_MS))
 * = Math.floor(Math.log2(60000 / 1000)) = Math.floor(5.9) = 5
 * NOTE(review): the derivation assumes BACKOFF_BASE_MS is 1000 ms; that constant
 * is defined outside this section - confirm.
 * With that base, the exponential term tops out at 2^5 * 1000ms = 32 seconds, so
 * the MAX_BACKOFF_MS clamp only takes effect if the base delay is larger.
 */
const MAX_BACKOFF_EXPONENT = 5;
/** Upper bound for a single backoff delay, in milliseconds (60 seconds). */
const MAX_BACKOFF_MS = 60000;
|
|
1067
|
+
/**
 * Pause helper used between retry attempts.
 *
 * @param ms - Number of milliseconds to wait
 * @returns Promise that resolves (with no value) once the timer fires
 */
function delay(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
1073
|
+
/**
 * Evaluate with retry logic and exponential backoff.
 *
 * Retries a single evaluation function on failure. A result whose `score`
 * fails isValidScore is treated as a failure and retried as well. Use this for
 * simple retry scenarios where you want automatic backoff.
 *
 * Compare to {@link JudgeCircuitBreaker.evaluate}:
 * - `evaluateWithRetry`: Retries same operation with backoff (1s → 2s → 4s,
 *   assuming BACKOFF_BASE_MS is 1000 ms - it is defined elsewhere in this file)
 * - `JudgeCircuitBreaker.evaluate`: Fails fast when service is degraded, optional fallback
 *
 * These can be combined: wrap evaluateWithRetry inside circuit breaker for
 * retries on transient errors while circuit-breaking on sustained failures.
 *
 * @param evaluate - Evaluation function to retry on failure
 * @param testCase - Test case to evaluate
 * @param maxRetries - Maximum number of attempts in total (default: DEFAULT_MAX_RETRIES).
 *   With a value of 0 or less no attempt is made and an Error('No attempts made') is thrown.
 * @returns Evaluation result with retryCount indicating number of failed attempts
 * @throws {Error} If all attempts fail. The thrown error is the last error
 *   encountered. For non-Error thrown values, the error wraps the original value
 *   in `error.cause` for debugging context (ECMAScript 2022 Error.cause).
 *
 * @example
 * const result = await evaluateWithRetry(
 *   (tc) => gEval(tc, criteria, llmFn),
 *   testCase,
 *   3
 * );
 *
 * @example
 * // Accessing error cause for debugging
 * try {
 *   await evaluateWithRetry(evaluate, testCase, 3);
 * } catch (error) {
 *   console.error('Final error:', error.message);
 *   if (error.cause) {
 *     console.error('Original cause:', error.cause);
 *   }
 * }
 */
export async function evaluateWithRetry(evaluate, testCase, maxRetries = DEFAULT_MAX_RETRIES) {
    let lastError = new Error('No attempts made');
    let retryCount = 0;
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            const result = await evaluate(testCase);
            if (isValidScore(result.score)) {
                return { ...result, retryCount };
            }
            // Invalid score - treat as error and retry
            throw new Error(`Invalid score: ${result.score}`);
        }
        catch (error) {
            if (error instanceof Error) {
                lastError = error;
            }
            else {
                // Wrap non-Error throws, keeping the original value as `cause`.
                // Objects are rendered with JSON.stringify to avoid "[object Object]";
                // JSON.stringify itself throws on circular structures and BigInt values,
                // so fall back to a type tag instead of letting that abort the retry loop.
                let errorMessage;
                try {
                    errorMessage = typeof error === 'object' && error !== null ? JSON.stringify(error) : String(error);
                }
                catch {
                    errorMessage = Object.prototype.toString.call(error);
                }
                lastError = new Error(errorMessage, { cause: error });
            }
            retryCount++;
            // Don't wait after the last attempt
            if (attempt < maxRetries) {
                // Exponential backoff: BACKOFF_BASE_MS * 2^(attempt-1), with the exponent
                // capped at MAX_BACKOFF_EXPONENT and the delay capped at MAX_BACKOFF_MS.
                const cappedExponent = Math.min(attempt - 1, MAX_BACKOFF_EXPONENT);
                const backoffMs = Math.min(BACKOFF_BASE_MS * 2 ** cappedExponent, MAX_BACKOFF_MS);
                await delay(backoffMs);
            }
        }
    }
    throw lastError;
}
|
|
1145
|
+
/**
 * Circuit breaker for judge model failures.
 * Prevents cascading failures when judge model is unavailable.
 * Rate limit errors (429) are not counted toward the threshold.
 *
 * The circuit opens once `threshold` counted failures have accumulated without
 * an intervening success. While open, calls are short-circuited to the fallback
 * (or rejected). The first call made more than `resetTimeout` ms after the most
 * recent counted failure closes the circuit again and is executed normally.
 *
 * Compare to {@link evaluateWithRetry}:
 * - `evaluateWithRetry`: Retries same operation with backoff (1s → 2s → 4s)
 * - `JudgeCircuitBreaker.evaluate`: Fails fast when service is degraded, optional fallback
 */
export class JudgeCircuitBreaker {
    /** Number of counted failures at which the circuit opens. */
    threshold;
    /** Milliseconds after the most recent counted failure before an open circuit is reset. */
    resetTimeout;
    /** Counted failures since the last success or reset. */
    failures = 0;
    /** Date of the most recent counted failure, or null if there is none since the last reset. */
    lastFailure = null;
    /** True while the circuit is open and calls are being short-circuited. */
    isOpen = false;
    /**
     * Marks a timeout-triggered reset as in progress.
     * @note The reset path in evaluate() is fully synchronous, so in single-threaded
     * Node.js execution this flag is set and cleared within one call. For worker
     * threads or multi-process deployments, use external synchronization
     * (e.g., Redis-based distributed locks) instead of this in-memory flag.
     */
    resetting = false;
    /**
     * Count of closed-to-open transitions (for flapping detection).
     * @note This counter is unbounded for simplicity. For long-running services
     * requiring bounded counters, use external observability tools
     * (e.g., Prometheus counters) or implement a sliding window approach.
     */
    openCount = 0;
    /**
     * Count of times circuit has been reset (for flapping detection).
     * Incremented by every reset() call, whether manual or timeout-triggered.
     * @note Unbounded counter - see openCount for rationale.
     */
    resetCount = 0;
    /**
     * Create a new circuit breaker instance.
     *
     * @param threshold - Number of failures before circuit opens (default: DEFAULT_CIRCUIT_BREAKER_THRESHOLD)
     * @param resetTimeout - Time in ms before circuit resets (default: DEFAULT_CIRCUIT_BREAKER_RESET_MS)
     */
    constructor(threshold = DEFAULT_CIRCUIT_BREAKER_THRESHOLD, resetTimeout = DEFAULT_CIRCUIT_BREAKER_RESET_MS) {
        this.threshold = threshold;
        this.resetTimeout = resetTimeout;
    }
    /**
     * Check if circuit is open (failing).
     *
     * @returns True if circuit is open and blocking requests
     */
    get open() {
        return this.isOpen;
    }
    /**
     * Get current failure count.
     *
     * @returns Number of counted failures since the last success or reset (rate limits excluded)
     */
    get failureCount() {
        return this.failures;
    }
    /**
     * Get circuit breaker statistics for observability and flapping detection.
     *
     * @returns Object with openCount and resetCount for monitoring circuit health
     */
    get stats() {
        return { openCount: this.openCount, resetCount: this.resetCount };
    }
    /**
     * Reset the circuit breaker to closed state.
     * Clears the failure count, the last-failure timestamp and the resetting flag,
     * and increments resetCount.
     *
     * @returns void
     */
    reset() {
        this.isOpen = false;
        this.failures = 0;
        this.lastFailure = null;
        this.resetting = false;
        this.resetCount++;
    }
    /**
     * Check if an error should count toward circuit breaker threshold.
     *
     * Rate limiting is detected, in order, by error name, by HTTP status
     * (`statusCode` or `status`), by AWS-style `code`, and finally by message text.
     * Values that are not Error instances always count as failures.
     *
     * @param error - Error to check
     * @returns False for rate limit errors (transient), true for other errors
     */
    shouldCountAsFailure(error) {
        if (!(error instanceof Error)) {
            return true;
        }
        // Type-based checks (robust for typed providers)
        const rateLimitErrorNames = [
            'RateLimitError', // OpenAI SDK
            'ThrottlingException', // AWS Bedrock
            'TooManyRequestsError', // Generic
            'RateLimitExceeded', // Anthropic
        ];
        if (rateLimitErrorNames.includes(error.name)) {
            return false;
        }
        // HTTP status code check (if available on error object)
        const errorWithStatus = error;
        if (errorWithStatus.statusCode === HttpStatus.TOO_MANY_REQUESTS || errorWithStatus.status === HttpStatus.TOO_MANY_REQUESTS) {
            return false;
        }
        // Error code check (AWS-style errors)
        if (errorWithStatus.code === 'ThrottlingException' ||
            errorWithStatus.code === 'TooManyRequestsException' ||
            errorWithStatus.code === 'ProvisionedThroughputExceededException') {
            return false;
        }
        // Fallback to message pattern matching (last resort)
        // Use word boundary regex to avoid false positives
        if (error.message && typeof error.message === 'string') {
            const message = error.message.toLowerCase();
            // Note: \s+ intentionally matches any amount of whitespace to handle
            // variations like "too  many requests" from different providers
            if (/\brate[_\s-]?limit/i.test(message) ||
                /\b429\b/.test(message) ||
                /\bthrottl/i.test(message) ||
                /\btoo\s+many\s+requests\b/i.test(message)) {
                return false;
            }
        }
        return true;
    }
    /**
     * Execute evaluation with circuit breaker protection.
     *
     * @param evaluate - Primary evaluation function to execute
     * @param fallbackEvaluate - Optional fallback function when circuit is open
     * @returns Result from evaluate or fallbackEvaluate
     * @throws {Error} If circuit is open and no fallback provided
     * @throws {Error} If evaluation fails (error is re-thrown after recording)
     */
    async evaluate(evaluate, fallbackEvaluate) {
        // Close the circuit again once the cool-down since the last counted failure has passed.
        // Nothing is awaited between this check and the reset, so the sequence cannot be
        // interleaved with another call on the same event loop.
        if (this.isOpen && this.lastFailure && !this.resetting) {
            const elapsed = Date.now() - this.lastFailure.getTime();
            if (elapsed > this.resetTimeout) {
                this.resetting = true;
                this.reset();
            }
        }
        // If circuit is open, use fallback or throw
        if (this.isOpen) {
            if (fallbackEvaluate) {
                return fallbackEvaluate();
            }
            throw new Error('Circuit breaker open - evaluation temporarily unavailable');
        }
        try {
            const result = await evaluate();
            // Any success clears the failure streak.
            this.failures = 0;
            return result;
        }
        catch (error) {
            // Only count non-transient errors
            if (this.shouldCountAsFailure(error)) {
                this.failures++;
                this.lastFailure = new Date();
                // Count only the closed -> open transition. Calls that were already in
                // flight when the circuit opened may still fail afterwards; they must not
                // inflate openCount, which is used for flapping detection.
                if (this.failures >= this.threshold && !this.isOpen) {
                    this.isOpen = true;
                    this.openCount++;
                }
            }
            throw error;
        }
    }
}
|
|
1319
|
+
/**
 * Default canary test cases for judge pipeline health monitoring.
 *
 * Each case pairs a fixed input/output with the metric to evaluate and the
 * score bound (`min` and/or `max`) the judge is expected to respect.
 * Used as the default `canaries` argument of runCanaryEvaluations.
 */
export const DEFAULT_CANARY_CASES = [
    // Known-good answer: the judge should rate it at or above 0.9.
    {
        name: 'perfect_answer',
        input: 'What is 2+2?',
        output: '2+2 equals 4.',
        metric: 'relevance',
        expectedScore: { min: 0.9 },
        description: 'Simple factual answer should score high',
    },
    // Factually wrong answer: the judge should rate it at or below 0.3.
    {
        name: 'hallucination_detection',
        input: 'What is the capital of France?',
        output: 'The capital of France is Tokyo, a beautiful city in Asia.',
        metric: 'faithfulness',
        expectedScore: { max: 0.3 },
        description: 'Obvious hallucination should score low',
    },
    // Answer unrelated to the question: the judge should rate it at or below 0.2.
    {
        name: 'off_topic_detection',
        input: 'Explain quantum computing',
        output: 'I love pizza! It is delicious with pepperoni.',
        metric: 'relevance',
        expectedScore: { max: 0.2 },
        description: 'Completely off-topic should score very low',
    },
];
|
|
1348
|
+
/**
 * Run canary evaluations to monitor judge pipeline health.
 *
 * Canaries are evaluated one after another. For each canary the evaluate
 * function is called with `{ input, output }` and the canary's metric, and the
 * returned score is compared against the canary's expected bounds (both bounds
 * are enforced when both are defined). A score that fails isValidScore is
 * recorded as NaN with `passed: false` rather than aborting the run.
 *
 * @param evaluate - Evaluation function to test (takes test case and metric)
 * @param canaries - Canary test cases to run (defaults to DEFAULT_CANARY_CASES)
 * @returns Canary report: `{ timestamp, passed, results }`, where `passed` is true
 *   only if every canary passed and each result is
 *   `{ name, score, expected, passed, timestamp }`
 * @throws {Error} If evaluate is not a function
 * @throws {Error} If any canary lacks both expectedScore.min and expectedScore.max
 *   (including a canary with no expectedScore object at all)
 */
export async function runCanaryEvaluations(evaluate, canaries = DEFAULT_CANARY_CASES) {
    if (typeof evaluate !== 'function') {
        throw new Error('runCanaryEvaluations requires an evaluate function');
    }
    const results = [];
    for (const canary of canaries) {
        const expected = canary.expectedScore;
        // Optional chaining makes a canary without an expectedScore object raise the
        // documented Error below instead of an unrelated TypeError.
        if (expected?.min === undefined && expected?.max === undefined) {
            throw new Error(`Canary '${canary.name}' must define expectedScore.min or expectedScore.max`);
        }
        const score = await evaluate({ input: canary.input, output: canary.output }, canary.metric);
        // An unusable score is reported as a failed canary with score NaN.
        if (!isValidScore(score)) {
            results.push({
                name: canary.name,
                score: NaN,
                expected,
                passed: false,
                timestamp: new Date().toISOString(),
            });
            continue;
        }
        // At least one bound is defined (validated above); every defined bound must hold.
        const meetsMin = expected.min === undefined || score >= expected.min;
        const meetsMax = expected.max === undefined || score <= expected.max;
        results.push({
            name: canary.name,
            score,
            expected,
            passed: meetsMin && meetsMax,
            timestamp: new Date().toISOString(),
        });
    }
    return {
        timestamp: new Date().toISOString(),
        passed: results.every((result) => result.passed),
        results,
    };
}
|
|
1397
|
+
//# sourceMappingURL=llm-as-judge.js.map
|