observability-toolkit 1.8.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (364) hide show
  1. package/README.md +126 -5
  2. package/dist/backends/index.d.ts +163 -0
  3. package/dist/backends/index.d.ts.map +1 -1
  4. package/dist/backends/index.js +57 -0
  5. package/dist/backends/index.js.map +1 -1
  6. package/dist/backends/index.test.js +55 -1
  7. package/dist/backends/index.test.js.map +1 -1
  8. package/dist/backends/local-jsonl-boolean-search.test.js +8 -8
  9. package/dist/backends/local-jsonl-boolean-search.test.js.map +1 -1
  10. package/dist/backends/local-jsonl-cache.test.d.ts +2 -0
  11. package/dist/backends/local-jsonl-cache.test.d.ts.map +1 -0
  12. package/dist/backends/local-jsonl-cache.test.js +295 -0
  13. package/dist/backends/local-jsonl-cache.test.js.map +1 -0
  14. package/dist/backends/local-jsonl-circuit-breaker.test.d.ts +2 -0
  15. package/dist/backends/local-jsonl-circuit-breaker.test.d.ts.map +1 -0
  16. package/dist/backends/local-jsonl-circuit-breaker.test.js +180 -0
  17. package/dist/backends/local-jsonl-circuit-breaker.test.js.map +1 -0
  18. package/dist/backends/local-jsonl-export.test.d.ts +2 -0
  19. package/dist/backends/local-jsonl-export.test.d.ts.map +1 -0
  20. package/dist/backends/local-jsonl-export.test.js +704 -0
  21. package/dist/backends/local-jsonl-export.test.js.map +1 -0
  22. package/dist/backends/local-jsonl-index.test.d.ts +2 -0
  23. package/dist/backends/local-jsonl-index.test.d.ts.map +1 -0
  24. package/dist/backends/local-jsonl-index.test.js +554 -0
  25. package/dist/backends/local-jsonl-index.test.js.map +1 -0
  26. package/dist/backends/local-jsonl-logs.test.js +52 -43
  27. package/dist/backends/local-jsonl-logs.test.js.map +1 -1
  28. package/dist/backends/local-jsonl-metrics.test.d.ts +2 -0
  29. package/dist/backends/local-jsonl-metrics.test.d.ts.map +1 -0
  30. package/dist/backends/local-jsonl-metrics.test.js +876 -0
  31. package/dist/backends/local-jsonl-metrics.test.js.map +1 -0
  32. package/dist/backends/local-jsonl-traces.test.js +89 -83
  33. package/dist/backends/local-jsonl-traces.test.js.map +1 -1
  34. package/dist/backends/local-jsonl.d.ts +39 -0
  35. package/dist/backends/local-jsonl.d.ts.map +1 -1
  36. package/dist/backends/local-jsonl.js +975 -492
  37. package/dist/backends/local-jsonl.js.map +1 -1
  38. package/dist/backends/signoz-api-circuit-breaker.test.d.ts +6 -0
  39. package/dist/backends/signoz-api-circuit-breaker.test.d.ts.map +1 -0
  40. package/dist/backends/signoz-api-circuit-breaker.test.js +548 -0
  41. package/dist/backends/signoz-api-circuit-breaker.test.js.map +1 -0
  42. package/dist/backends/signoz-api-rate-limiter.test.d.ts +6 -0
  43. package/dist/backends/signoz-api-rate-limiter.test.d.ts.map +1 -0
  44. package/dist/backends/signoz-api-rate-limiter.test.js +390 -0
  45. package/dist/backends/signoz-api-rate-limiter.test.js.map +1 -0
  46. package/dist/backends/signoz-api-ssrf.test.d.ts +6 -0
  47. package/dist/backends/signoz-api-ssrf.test.d.ts.map +1 -0
  48. package/dist/backends/signoz-api-ssrf.test.js +216 -0
  49. package/dist/backends/signoz-api-ssrf.test.js.map +1 -0
  50. package/dist/backends/signoz-api-test-helpers.d.ts +80 -0
  51. package/dist/backends/signoz-api-test-helpers.d.ts.map +1 -0
  52. package/dist/backends/signoz-api-test-helpers.js +79 -0
  53. package/dist/backends/signoz-api-test-helpers.js.map +1 -0
  54. package/dist/backends/signoz-api.d.ts +31 -1
  55. package/dist/backends/signoz-api.d.ts.map +1 -1
  56. package/dist/backends/signoz-api.js +717 -539
  57. package/dist/backends/signoz-api.js.map +1 -1
  58. package/dist/backends/signoz-api.test.d.ts +9 -0
  59. package/dist/backends/signoz-api.test.d.ts.map +1 -1
  60. package/dist/backends/signoz-api.test.js +20 -1032
  61. package/dist/backends/signoz-api.test.js.map +1 -1
  62. package/dist/lib/agent-as-judge.d.ts +388 -0
  63. package/dist/lib/agent-as-judge.d.ts.map +1 -0
  64. package/dist/lib/agent-as-judge.js +740 -0
  65. package/dist/lib/agent-as-judge.js.map +1 -0
  66. package/dist/lib/agent-as-judge.test.d.ts +5 -0
  67. package/dist/lib/agent-as-judge.test.d.ts.map +1 -0
  68. package/dist/lib/agent-as-judge.test.js +816 -0
  69. package/dist/lib/agent-as-judge.test.js.map +1 -0
  70. package/dist/lib/cache.d.ts +61 -2
  71. package/dist/lib/cache.d.ts.map +1 -1
  72. package/dist/lib/cache.js +54 -3
  73. package/dist/lib/cache.js.map +1 -1
  74. package/dist/lib/circuit-breaker.d.ts +101 -0
  75. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  76. package/dist/lib/circuit-breaker.js +158 -0
  77. package/dist/lib/circuit-breaker.js.map +1 -0
  78. package/dist/lib/circuit-breaker.test.d.ts +2 -0
  79. package/dist/lib/circuit-breaker.test.d.ts.map +1 -0
  80. package/dist/lib/circuit-breaker.test.js +263 -0
  81. package/dist/lib/circuit-breaker.test.js.map +1 -0
  82. package/dist/lib/confident-export.d.ts +101 -0
  83. package/dist/lib/confident-export.d.ts.map +1 -0
  84. package/dist/lib/confident-export.js +393 -0
  85. package/dist/lib/confident-export.js.map +1 -0
  86. package/dist/lib/confident-export.test.d.ts +7 -0
  87. package/dist/lib/confident-export.test.d.ts.map +1 -0
  88. package/dist/lib/confident-export.test.js +835 -0
  89. package/dist/lib/confident-export.test.js.map +1 -0
  90. package/dist/lib/constants-symlink.test.d.ts +12 -0
  91. package/dist/lib/constants-symlink.test.d.ts.map +1 -0
  92. package/dist/lib/constants-symlink.test.js +357 -0
  93. package/dist/lib/constants-symlink.test.js.map +1 -0
  94. package/dist/lib/constants.d.ts +75 -0
  95. package/dist/lib/constants.d.ts.map +1 -1
  96. package/dist/lib/constants.js +104 -1
  97. package/dist/lib/constants.js.map +1 -1
  98. package/dist/lib/datadog-export.d.ts +156 -0
  99. package/dist/lib/datadog-export.d.ts.map +1 -0
  100. package/dist/lib/datadog-export.js +464 -0
  101. package/dist/lib/datadog-export.js.map +1 -0
  102. package/dist/lib/datadog-export.test.d.ts +14 -0
  103. package/dist/lib/datadog-export.test.d.ts.map +1 -0
  104. package/dist/lib/datadog-export.test.js +890 -0
  105. package/dist/lib/datadog-export.test.js.map +1 -0
  106. package/dist/lib/edge-cases.test.js +17 -17
  107. package/dist/lib/edge-cases.test.js.map +1 -1
  108. package/dist/lib/error-sanitizer.d.ts.map +1 -1
  109. package/dist/lib/error-sanitizer.js +29 -3
  110. package/dist/lib/error-sanitizer.js.map +1 -1
  111. package/dist/lib/error-sanitizer.test.js +159 -0
  112. package/dist/lib/error-sanitizer.test.js.map +1 -1
  113. package/dist/lib/error-types.d.ts +54 -0
  114. package/dist/lib/error-types.d.ts.map +1 -0
  115. package/dist/lib/error-types.js +154 -0
  116. package/dist/lib/error-types.js.map +1 -0
  117. package/dist/lib/error-types.test.d.ts +2 -0
  118. package/dist/lib/error-types.test.d.ts.map +1 -0
  119. package/dist/lib/error-types.test.js +196 -0
  120. package/dist/lib/error-types.test.js.map +1 -0
  121. package/dist/lib/evaluation-hooks.d.ts +49 -0
  122. package/dist/lib/evaluation-hooks.d.ts.map +1 -0
  123. package/dist/lib/evaluation-hooks.js +488 -0
  124. package/dist/lib/evaluation-hooks.js.map +1 -0
  125. package/dist/lib/evaluation-hooks.test.d.ts +8 -0
  126. package/dist/lib/evaluation-hooks.test.d.ts.map +1 -0
  127. package/dist/lib/evaluation-hooks.test.js +624 -0
  128. package/dist/lib/evaluation-hooks.test.js.map +1 -0
  129. package/dist/lib/export-utils.d.ts +99 -0
  130. package/dist/lib/export-utils.d.ts.map +1 -0
  131. package/dist/lib/export-utils.js +238 -0
  132. package/dist/lib/export-utils.js.map +1 -0
  133. package/dist/lib/export-utils.test.d.ts +5 -0
  134. package/dist/lib/export-utils.test.d.ts.map +1 -0
  135. package/dist/lib/export-utils.test.js +193 -0
  136. package/dist/lib/export-utils.test.js.map +1 -0
  137. package/dist/lib/file-utils.d.ts +17 -2
  138. package/dist/lib/file-utils.d.ts.map +1 -1
  139. package/dist/lib/file-utils.js +24 -5
  140. package/dist/lib/file-utils.js.map +1 -1
  141. package/dist/lib/file-utils.test.js +30 -0
  142. package/dist/lib/file-utils.test.js.map +1 -1
  143. package/dist/lib/histogram.d.ts +119 -0
  144. package/dist/lib/histogram.d.ts.map +1 -0
  145. package/dist/lib/histogram.js +202 -0
  146. package/dist/lib/histogram.js.map +1 -0
  147. package/dist/lib/histogram.test.d.ts +5 -0
  148. package/dist/lib/histogram.test.d.ts.map +1 -0
  149. package/dist/lib/histogram.test.js +381 -0
  150. package/dist/lib/histogram.test.js.map +1 -0
  151. package/dist/lib/indexer.test.js +27 -27
  152. package/dist/lib/indexer.test.js.map +1 -1
  153. package/dist/lib/input-validator.d.ts +12 -0
  154. package/dist/lib/input-validator.d.ts.map +1 -1
  155. package/dist/lib/input-validator.fuzz.test.d.ts +12 -0
  156. package/dist/lib/input-validator.fuzz.test.d.ts.map +1 -0
  157. package/dist/lib/input-validator.fuzz.test.js +290 -0
  158. package/dist/lib/input-validator.fuzz.test.js.map +1 -0
  159. package/dist/lib/input-validator.js +57 -3
  160. package/dist/lib/input-validator.js.map +1 -1
  161. package/dist/lib/input-validator.test.js +129 -1
  162. package/dist/lib/input-validator.test.js.map +1 -1
  163. package/dist/lib/instrumentation.d.ts +153 -0
  164. package/dist/lib/instrumentation.d.ts.map +1 -0
  165. package/dist/lib/instrumentation.integration.test.d.ts +2 -0
  166. package/dist/lib/instrumentation.integration.test.d.ts.map +1 -0
  167. package/dist/lib/instrumentation.integration.test.js +589 -0
  168. package/dist/lib/instrumentation.integration.test.js.map +1 -0
  169. package/dist/lib/instrumentation.js +520 -0
  170. package/dist/lib/instrumentation.js.map +1 -0
  171. package/dist/lib/instrumentation.test.d.ts +2 -0
  172. package/dist/lib/instrumentation.test.d.ts.map +1 -0
  173. package/dist/lib/instrumentation.test.js +821 -0
  174. package/dist/lib/instrumentation.test.js.map +1 -0
  175. package/dist/lib/langfuse-export.d.ts +125 -0
  176. package/dist/lib/langfuse-export.d.ts.map +1 -0
  177. package/dist/lib/langfuse-export.js +367 -0
  178. package/dist/lib/langfuse-export.js.map +1 -0
  179. package/dist/lib/langfuse-export.test.d.ts +7 -0
  180. package/dist/lib/langfuse-export.test.d.ts.map +1 -0
  181. package/dist/lib/langfuse-export.test.js +1007 -0
  182. package/dist/lib/langfuse-export.test.js.map +1 -0
  183. package/dist/lib/llm-as-judge.d.ts +657 -0
  184. package/dist/lib/llm-as-judge.d.ts.map +1 -0
  185. package/dist/lib/llm-as-judge.js +1397 -0
  186. package/dist/lib/llm-as-judge.js.map +1 -0
  187. package/dist/lib/llm-as-judge.test.d.ts +2 -0
  188. package/dist/lib/llm-as-judge.test.d.ts.map +1 -0
  189. package/dist/lib/llm-as-judge.test.js +2409 -0
  190. package/dist/lib/llm-as-judge.test.js.map +1 -0
  191. package/dist/lib/logger.d.ts +46 -0
  192. package/dist/lib/logger.d.ts.map +1 -0
  193. package/dist/lib/logger.js +81 -0
  194. package/dist/lib/logger.js.map +1 -0
  195. package/dist/lib/logger.test.d.ts +2 -0
  196. package/dist/lib/logger.test.d.ts.map +1 -0
  197. package/dist/lib/logger.test.js +122 -0
  198. package/dist/lib/logger.test.js.map +1 -0
  199. package/dist/lib/metrics.d.ts +62 -0
  200. package/dist/lib/metrics.d.ts.map +1 -0
  201. package/dist/lib/metrics.js +166 -0
  202. package/dist/lib/metrics.js.map +1 -0
  203. package/dist/lib/metrics.test.d.ts +5 -0
  204. package/dist/lib/metrics.test.d.ts.map +1 -0
  205. package/dist/lib/metrics.test.js +189 -0
  206. package/dist/lib/metrics.test.js.map +1 -0
  207. package/dist/lib/parse-stats.d.ts +119 -0
  208. package/dist/lib/parse-stats.d.ts.map +1 -0
  209. package/dist/lib/parse-stats.js +206 -0
  210. package/dist/lib/parse-stats.js.map +1 -0
  211. package/dist/lib/parse-stats.test.d.ts +5 -0
  212. package/dist/lib/parse-stats.test.d.ts.map +1 -0
  213. package/dist/lib/parse-stats.test.js +283 -0
  214. package/dist/lib/parse-stats.test.js.map +1 -0
  215. package/dist/lib/phoenix-export.d.ts +109 -0
  216. package/dist/lib/phoenix-export.d.ts.map +1 -0
  217. package/dist/lib/phoenix-export.js +429 -0
  218. package/dist/lib/phoenix-export.js.map +1 -0
  219. package/dist/lib/phoenix-export.test.d.ts +11 -0
  220. package/dist/lib/phoenix-export.test.d.ts.map +1 -0
  221. package/dist/lib/phoenix-export.test.js +725 -0
  222. package/dist/lib/phoenix-export.test.js.map +1 -0
  223. package/dist/lib/server-utils.d.ts +14 -1
  224. package/dist/lib/server-utils.d.ts.map +1 -1
  225. package/dist/lib/server-utils.js +43 -3
  226. package/dist/lib/server-utils.js.map +1 -1
  227. package/dist/lib/shared-schemas.d.ts +28 -0
  228. package/dist/lib/shared-schemas.d.ts.map +1 -1
  229. package/dist/lib/shared-schemas.js +33 -4
  230. package/dist/lib/shared-schemas.js.map +1 -1
  231. package/dist/lib/toon-encoder.d.ts +7 -2
  232. package/dist/lib/toon-encoder.d.ts.map +1 -1
  233. package/dist/lib/toon-encoder.js +21 -6
  234. package/dist/lib/toon-encoder.js.map +1 -1
  235. package/dist/lib/toon-encoder.test.d.ts +5 -0
  236. package/dist/lib/toon-encoder.test.d.ts.map +1 -0
  237. package/dist/lib/toon-encoder.test.js +85 -0
  238. package/dist/lib/toon-encoder.test.js.map +1 -0
  239. package/dist/lib/verification-events.d.ts +100 -0
  240. package/dist/lib/verification-events.d.ts.map +1 -0
  241. package/dist/lib/verification-events.js +162 -0
  242. package/dist/lib/verification-events.js.map +1 -0
  243. package/dist/lib/verification-events.test.d.ts +5 -0
  244. package/dist/lib/verification-events.test.d.ts.map +1 -0
  245. package/dist/lib/verification-events.test.js +193 -0
  246. package/dist/lib/verification-events.test.js.map +1 -0
  247. package/dist/server.d.ts +5 -0
  248. package/dist/server.d.ts.map +1 -1
  249. package/dist/server.js +79 -21
  250. package/dist/server.js.map +1 -1
  251. package/dist/server.test.js +30 -0
  252. package/dist/server.test.js.map +1 -1
  253. package/dist/test-helpers/env-utils.d.ts +22 -0
  254. package/dist/test-helpers/env-utils.d.ts.map +1 -1
  255. package/dist/test-helpers/env-utils.js +38 -0
  256. package/dist/test-helpers/env-utils.js.map +1 -1
  257. package/dist/test-helpers/fuzz-generators.d.ts +58 -0
  258. package/dist/test-helpers/fuzz-generators.d.ts.map +1 -0
  259. package/dist/test-helpers/fuzz-generators.js +216 -0
  260. package/dist/test-helpers/fuzz-generators.js.map +1 -0
  261. package/dist/test-helpers/index.d.ts +1 -0
  262. package/dist/test-helpers/index.d.ts.map +1 -1
  263. package/dist/test-helpers/index.js +2 -0
  264. package/dist/test-helpers/index.js.map +1 -1
  265. package/dist/test-helpers/memfs-utils.d.ts +181 -0
  266. package/dist/test-helpers/memfs-utils.d.ts.map +1 -0
  267. package/dist/test-helpers/memfs-utils.js +292 -0
  268. package/dist/test-helpers/memfs-utils.js.map +1 -0
  269. package/dist/test-helpers/memfs-utils.test.d.ts +5 -0
  270. package/dist/test-helpers/memfs-utils.test.d.ts.map +1 -0
  271. package/dist/test-helpers/memfs-utils.test.js +338 -0
  272. package/dist/test-helpers/memfs-utils.test.js.map +1 -0
  273. package/dist/test-helpers/race-condition-helpers.d.ts +85 -0
  274. package/dist/test-helpers/race-condition-helpers.d.ts.map +1 -0
  275. package/dist/test-helpers/race-condition-helpers.js +279 -0
  276. package/dist/test-helpers/race-condition-helpers.js.map +1 -0
  277. package/dist/test-helpers/test-data-builders.d.ts +40 -3
  278. package/dist/test-helpers/test-data-builders.d.ts.map +1 -1
  279. package/dist/test-helpers/test-data-builders.js +54 -5
  280. package/dist/test-helpers/test-data-builders.js.map +1 -1
  281. package/dist/test-helpers/tool-validators.d.ts.map +1 -1
  282. package/dist/test-helpers/tool-validators.js +16 -1
  283. package/dist/test-helpers/tool-validators.js.map +1 -1
  284. package/dist/tools/context-stats.d.ts.map +1 -1
  285. package/dist/tools/context-stats.js +6 -8
  286. package/dist/tools/context-stats.js.map +1 -1
  287. package/dist/tools/export-confident.d.ts +145 -0
  288. package/dist/tools/export-confident.d.ts.map +1 -0
  289. package/dist/tools/export-confident.js +134 -0
  290. package/dist/tools/export-confident.js.map +1 -0
  291. package/dist/tools/export-confident.test.d.ts +7 -0
  292. package/dist/tools/export-confident.test.d.ts.map +1 -0
  293. package/dist/tools/export-confident.test.js +332 -0
  294. package/dist/tools/export-confident.test.js.map +1 -0
  295. package/dist/tools/export-datadog.d.ts +160 -0
  296. package/dist/tools/export-datadog.d.ts.map +1 -0
  297. package/dist/tools/export-datadog.js +160 -0
  298. package/dist/tools/export-datadog.js.map +1 -0
  299. package/dist/tools/export-datadog.test.d.ts +8 -0
  300. package/dist/tools/export-datadog.test.d.ts.map +1 -0
  301. package/dist/tools/export-datadog.test.js +419 -0
  302. package/dist/tools/export-datadog.test.js.map +1 -0
  303. package/dist/tools/export-langfuse.d.ts +137 -0
  304. package/dist/tools/export-langfuse.d.ts.map +1 -0
  305. package/dist/tools/export-langfuse.js +131 -0
  306. package/dist/tools/export-langfuse.js.map +1 -0
  307. package/dist/tools/export-langfuse.test.d.ts +7 -0
  308. package/dist/tools/export-langfuse.test.d.ts.map +1 -0
  309. package/dist/tools/export-langfuse.test.js +303 -0
  310. package/dist/tools/export-langfuse.test.js.map +1 -0
  311. package/dist/tools/export-phoenix.d.ts +145 -0
  312. package/dist/tools/export-phoenix.d.ts.map +1 -0
  313. package/dist/tools/export-phoenix.js +135 -0
  314. package/dist/tools/export-phoenix.js.map +1 -0
  315. package/dist/tools/export-phoenix.test.d.ts +7 -0
  316. package/dist/tools/export-phoenix.test.d.ts.map +1 -0
  317. package/dist/tools/export-phoenix.test.js +316 -0
  318. package/dist/tools/export-phoenix.test.js.map +1 -0
  319. package/dist/tools/health-check.d.ts +26 -0
  320. package/dist/tools/health-check.d.ts.map +1 -1
  321. package/dist/tools/health-check.js +36 -7
  322. package/dist/tools/health-check.js.map +1 -1
  323. package/dist/tools/index.d.ts +6 -0
  324. package/dist/tools/index.d.ts.map +1 -1
  325. package/dist/tools/index.js +6 -0
  326. package/dist/tools/index.js.map +1 -1
  327. package/dist/tools/inject-evaluations.d.ts +1315 -0
  328. package/dist/tools/inject-evaluations.d.ts.map +1 -0
  329. package/dist/tools/inject-evaluations.js +121 -0
  330. package/dist/tools/inject-evaluations.js.map +1 -0
  331. package/dist/tools/inject-evaluations.test.d.ts +5 -0
  332. package/dist/tools/inject-evaluations.test.d.ts.map +1 -0
  333. package/dist/tools/inject-evaluations.test.js +359 -0
  334. package/dist/tools/inject-evaluations.test.js.map +1 -0
  335. package/dist/tools/query-evaluations.d.ts +25 -4
  336. package/dist/tools/query-evaluations.d.ts.map +1 -1
  337. package/dist/tools/query-evaluations.js +26 -2
  338. package/dist/tools/query-evaluations.js.map +1 -1
  339. package/dist/tools/query-evaluations.test.js +53 -46
  340. package/dist/tools/query-evaluations.test.js.map +1 -1
  341. package/dist/tools/query-llm-events.js +2 -2
  342. package/dist/tools/query-llm-events.js.map +1 -1
  343. package/dist/tools/query-llm-events.test.js +6 -3
  344. package/dist/tools/query-llm-events.test.js.map +1 -1
  345. package/dist/tools/query-logs.d.ts +8 -8
  346. package/dist/tools/query-logs.js +3 -3
  347. package/dist/tools/query-logs.js.map +1 -1
  348. package/dist/tools/query-metrics.d.ts +4 -4
  349. package/dist/tools/query-metrics.js +2 -2
  350. package/dist/tools/query-metrics.js.map +1 -1
  351. package/dist/tools/query-traces.d.ts +8 -8
  352. package/dist/tools/query-verifications.d.ts +111 -0
  353. package/dist/tools/query-verifications.d.ts.map +1 -0
  354. package/dist/tools/query-verifications.js +101 -0
  355. package/dist/tools/query-verifications.js.map +1 -0
  356. package/dist/tools/query-verifications.test.d.ts +5 -0
  357. package/dist/tools/query-verifications.test.d.ts.map +1 -0
  358. package/dist/tools/query-verifications.test.js +156 -0
  359. package/dist/tools/query-verifications.test.js.map +1 -0
  360. package/dist/types/evaluation-hooks.d.ts +176 -0
  361. package/dist/types/evaluation-hooks.d.ts.map +1 -0
  362. package/dist/types/evaluation-hooks.js +49 -0
  363. package/dist/types/evaluation-hooks.js.map +1 -0
  364. package/package.json +11 -2
@@ -0,0 +1,1397 @@
1
+ /**
2
+ * LLM-as-Judge Implementation
3
+ *
4
+ * Provides patterns and utilities for using LLMs to evaluate LLM outputs.
5
+ * Implements G-Eval, QAG, and production-ready evaluation patterns per
6
+ * industry best practices and OTel GenAI semantic conventions.
7
+ *
8
+ * @security
9
+ * - All user inputs are sanitized for prompt injection protection
10
+ * - LLM calls have timeout protection (default 30s)
11
+ * - Input sizes are validated to prevent resource exhaustion
12
+ * - JSON parsing has depth limits to prevent DoS
13
+ *
14
+ * @security Known Limitations
15
+ * - Script homoglyphs (Cyrillic, Greek characters visually similar to Latin)
16
+ * are NOT currently filtered in all cases. The confusables library provides
17
+ * partial coverage but may miss some Unicode TR39 edge cases.
18
+ * Example: Cyrillic "а" (U+0430) looks identical to Latin "a".
19
+ * This is a known gap tracked for future enhancement.
20
+ *
21
+ * @see https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/
22
+ */
23
+ import { InputValidationError } from './input-validator.js';
24
+ import { HttpStatus } from './constants.js';
25
+ import { remove as removeConfusables } from 'confusables';
26
+ import sbd from 'sbd';
27
+ // ============================================================================
28
+ // Typed Error Classes
29
+ // ============================================================================
/**
 * Error for prompt injection detection.
 *
 * NOTE: This class is exported for type-checking and external use but is
 * intentionally NOT thrown by sanitizeForPrompt(). The design decision is
 * to silently replace injection patterns with '[filtered]' markers rather
 * than fail-fast, allowing evaluation to proceed with sanitized input.
 *
 * @param {string} message - Human-readable description of the detection.
 * @param {{ cause?: unknown }} [options] - Standard Error options, forwarded
 *   unchanged to the Error constructor so callers can chain the original
 *   error via `cause`. Omitting it behaves exactly as before.
 *
 * @example
 * // External code can throw this for stricter handling:
 * if (detectsInjection(input)) {
 *   throw new PromptInjectionError('Injection detected in user input');
 * }
 */
export class PromptInjectionError extends Error {
    constructor(message, options) {
        super(message, options);
        this.name = 'PromptInjectionError';
    }
}
/**
 * Error thrown when an LLM call exceeds the configured timeout.
 *
 * Thrown by withTimeout() when the wrapped function does not complete
 * within the specified duration. Callers should catch this to implement
 * fallback behavior or retry logic.
 *
 * @param {number} timeoutMs - Timeout that elapsed, in milliseconds. It is
 *   embedded in the message and also exposed as the `timeoutMs` property so
 *   handlers do not have to parse the message text.
 * @param {{ cause?: unknown }} [options] - Standard Error options, forwarded
 *   unchanged to the Error constructor.
 *
 * @example
 * try {
 *   await withTimeout(() => llmCall(), 5000);
 * } catch (e) {
 *   if (e instanceof LLMTimeoutError) {
 *     // Handle timeout - use cached result or return default
 *   }
 * }
 *
 * @see withTimeout
 */
export class LLMTimeoutError extends Error {
    constructor(timeoutMs, options) {
        super(`LLM call timed out after ${timeoutMs}ms`, options);
        this.name = 'LLMTimeoutError';
        this.timeoutMs = timeoutMs;
    }
}
/**
 * Error thrown when score extraction or normalization fails.
 *
 * Thrown by normalizeWithLogprobs() when the LLM response cannot be
 * parsed into a valid score, or when probability weighting fails.
 *
 * @param {string} message - Human-readable description of the failure.
 * @param {{ cause?: unknown }} [options] - Standard Error options, forwarded
 *   unchanged to the Error constructor so the underlying parse error can be
 *   attached via `cause`.
 *
 * @example
 * try {
 *   const score = normalizeWithLogprobs(response, [1,2,3,4,5]);
 * } catch (e) {
 *   if (e instanceof ScoreNormalizationError) {
 *     // Fallback to regex-based extraction
 *   }
 * }
 *
 * @see normalizeWithLogprobs
 * @see extractScoreFromText
 */
export class ScoreNormalizationError extends Error {
    constructor(message, options) {
        super(message, options);
        this.name = 'ScoreNormalizationError';
    }
}
98
+ // ============================================================================
99
+ // Security Constants
100
+ // ============================================================================
101
+ /** Maximum input size in bytes (64KB) */
102
+ export const MAX_INPUT_SIZE_BYTES = 65536;
103
+ /** Maximum text length per field (10,000 characters; roughly 10KB for ASCII text) */
104
+ export const MAX_TEXT_LENGTH = 10000;
105
+ /** Maximum context array length */
106
+ export const MAX_CONTEXT_ITEMS = 20;
107
+ /** Maximum statements to process in QAG pattern */
108
+ export const MAX_STATEMENTS = 20;
109
+ /** Default timeout for LLM calls (30 seconds) */
110
+ export const DEFAULT_LLM_TIMEOUT_MS = 30000;
111
+ /** Maximum JSON nesting depth */
112
+ export const MAX_JSON_DEPTH = 5;
113
+ /**
114
+ * Current log level for the module.
115
+ * Controls verbosity of console output for production flexibility.
116
+ * - 'debug': All logs including verbose debugging info
117
+ * - 'info': Informational messages and above
118
+ * - 'warn': Warnings and errors only (default)
119
+ * - 'error': Only error messages
120
+ * - 'silent': No logging output
121
+ */
122
+ export const LOG_LEVEL = process.env.LLM_JUDGE_LOG_LEVEL || 'warn';
/**
 * Check if a log level should be output based on the configured level.
 *
 * Levels are ordered debug < info < warn < error < silent. A message is
 * logged when its level is at or above the configured one, and nothing is
 * logged when the configured level is 'silent'.
 *
 * The configured level is compared case-insensitively. An unrecognised value
 * (for example a typo in LLM_JUDGE_LOG_LEVEL) falls back to 'warn', the
 * documented default, instead of enabling every level.
 *
 * @param level - Level to check
 * @param [currentLevel=LOG_LEVEL] - Configured threshold; defaults to the
 *   module-level LOG_LEVEL
 * @returns True if the level should be logged
 */
function shouldLog(level, currentLevel = LOG_LEVEL) {
    const levels = ['debug', 'info', 'warn', 'error', 'silent'];
    const configured = String(currentLevel).toLowerCase();
    // indexOf() would yield -1 for an unknown threshold and let everything through.
    const effective = levels.includes(configured) ? configured : 'warn';
    if (effective === 'silent') {
        return false;
    }
    // An unknown message level has index -1 and is therefore never logged.
    return levels.indexOf(level) >= levels.indexOf(effective);
}
134
+ // ============================================================================
135
+ // G-Eval Score Range Constants
136
+ // ============================================================================
137
+ /** G-Eval minimum score (inclusive) */
138
+ export const G_EVAL_MIN_SCORE = 1;
139
+ /** G-Eval maximum score (inclusive) */
140
+ export const G_EVAL_MAX_SCORE = 5;
141
+ /** G-Eval valid score values array */
142
+ export const G_EVAL_VALID_SCORES = [1, 2, 3, 4, 5];
143
+ /** G-Eval score range for normalization (max - min) */
144
+ export const G_EVAL_SCORE_RANGE = G_EVAL_MAX_SCORE - G_EVAL_MIN_SCORE;
145
+ /**
146
+ * G-Eval default/middle score for fallback scenarios.
147
+ * @note This constant is exported for library consumers who need a sensible
148
+ * default when implementing custom fallback logic. The core gEval() function
149
+ * throws errors rather than using fallback values to ensure explicit handling.
150
+ * @example
151
+ * ```typescript
152
+ * // Custom fallback in consumer code
153
+ * const score = parseResult(response) ?? G_EVAL_DEFAULT_SCORE;
154
+ * ```
155
+ */
156
+ export const G_EVAL_DEFAULT_SCORE = 3;
157
+ // ============================================================================
158
+ // LLM Configuration Constants
159
+ // ============================================================================
160
+ /** Default temperature for deterministic LLM calls (e.g., extraction, answering) */
161
+ export const LLM_TEMPERATURE_DETERMINISTIC = 0;
162
+ /** Default temperature for evaluation LLM calls (slight variation allowed) */
163
+ export const LLM_TEMPERATURE_EVALUATION = 0.1;
164
+ /** Minimum evaluation steps to generate in G-Eval */
165
+ export const G_EVAL_MIN_STEPS = 3;
166
+ /** Maximum evaluation steps to generate in G-Eval */
167
+ export const G_EVAL_MAX_STEPS = 5;
168
+ // ============================================================================
169
+ // QAG Pattern Constants
170
+ // ============================================================================
171
+ /** Minimum statement length to be considered valid for QAG extraction */
172
+ export const MIN_STATEMENT_LENGTH = 10;
173
+ // ============================================================================
174
+ // Retry and Circuit Breaker Constants
175
+ // ============================================================================
176
+ /** Default maximum retry attempts for evaluateWithRetry */
177
+ export const DEFAULT_MAX_RETRIES = 3;
178
+ /** Default circuit breaker failure threshold */
179
+ export const DEFAULT_CIRCUIT_BREAKER_THRESHOLD = 5;
180
+ /** Default circuit breaker reset timeout in milliseconds */
181
+ export const DEFAULT_CIRCUIT_BREAKER_RESET_MS = 30000;
182
+ /** Base delay multiplier for exponential backoff (1 second) */
183
+ export const BACKOFF_BASE_MS = 1000;
184
+ // ============================================================================
185
+ // Score Validation Constants
186
+ // ============================================================================
187
+ /** Minimum valid normalized score */
188
+ export const NORMALIZED_SCORE_MIN = 0;
189
+ /** Maximum valid normalized score */
190
+ export const NORMALIZED_SCORE_MAX = 1;
191
+ /**
192
+ * OTel attribute mapping for evaluation events
193
+ */
194
+ export const EVALUATION_OTEL_ATTRIBUTES = {
195
+ evaluationName: 'gen_ai.evaluation.name',
196
+ scoreValue: 'gen_ai.evaluation.score.value',
197
+ scoreLabel: 'gen_ai.evaluation.score.label',
198
+ explanation: 'gen_ai.evaluation.explanation',
199
+ errorType: 'error.type',
200
+ durationMs: 'gen_ai.evaluation.duration',
201
+ inputTokens: 'gen_ai.usage.input_tokens',
202
+ outputTokens: 'gen_ai.usage.output_tokens',
203
+ };
204
+ // ============================================================================
205
+ // Security Utilities
206
+ // ============================================================================
207
+ /**
208
+ * Prompt injection detection patterns (case-insensitive, Unicode-normalized).
209
+ *
210
+ * @security These patterns use non-capturing groups (?:...) and avoid nested
211
+ * quantifiers that could cause catastrophic backtracking on adversarial input.
212
+ * For example, a capturing `\s+(all\s+)?` is written as `\s+(?:all\s+)?`: the
213
+ * optional group is non-capturing, and no group containing a quantifier is itself repeated.
214
+ */
215
+ const PROMPT_INJECTION_PATTERNS = [
216
+ /ignore\s+(?:all\s+)?previous\s+instructions/gi,
217
+ /system\s+prompt/gi,
218
+ /you\s+are\s+now/gi,
219
+ /forget\s+everything/gi,
220
+ /disregard\s+(?:all\s+)?(?:previous|prior)/gi,
221
+ /new\s+instructions?:/gi,
222
+ /override\s+(?:system|instructions)/gi,
223
+ /act\s+as\s+(?:if\s+)?(?:you\s+are|an?)\s/gi,
224
+ /pretend\s+(?:you\s+are|to\s+be)/gi,
225
+ /jailbreak/gi,
226
+ /\bDAN\b/gi, // "Do Anything Now" prompt - case-insensitive to catch "dan", "Dan" variants
227
+ /developer\s+mode/gi,
228
+ /ignore\s+safety/gi,
229
+ /bypass\s+(?:filter|restriction|rule)/gi,
230
+ ];
/**
 * Compiles an array of RegExp patterns into a single combined regex.
 *
 * Each pattern's `source` is wrapped in a non-capturing group and the groups
 * are joined with `|`, so callers evaluate one regex instead of looping over
 * every pattern.
 *
 * @param patterns - Array of RegExp objects to compile (must be non-empty)
 * @returns Combined regex with all patterns as alternations (global + case-insensitive)
 * @security Per-pattern flags are discarded; the result always uses 'gi' for
 *   consistent case-insensitive matching. Because of the `g` flag the result
 *   is stateful under `.test()`/`.exec()` (it tracks `lastIndex`).
 * @throws {Error} If patterns array is empty
 * @throws {TypeError} If patterns is not an array or contains a non-RegExp
 *   entry. Without this check a string entry would silently compile to the
 *   literal alternative `(?:undefined)`.
 */
function compilePatterns(patterns) {
    if (!Array.isArray(patterns)) {
        throw new TypeError('compilePatterns expects an array of RegExp patterns');
    }
    if (patterns.length === 0) {
        throw new Error('Cannot compile empty patterns array');
    }
    const sources = patterns.map((pattern, index) => {
        if (!(pattern instanceof RegExp)) {
            throw new TypeError(`Pattern at index ${index} is not a RegExp`);
        }
        return `(?:${pattern.source})`;
    });
    return new RegExp(sources.join('|'), 'gi');
}
248
+ /**
249
+ * Pre-compiled injection detection regex for O(n) performance.
250
+ * Combines all 14 injection patterns into single regex via alternation.
251
+ *
252
+ * @performance Single regex.test() instead of 14 individual pattern checks.
253
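+ * For 10KB text with 14 patterns, reduces from O(10KB*14) to O(10KB). NOTE: the regex is compiled with the `g` flag, so `.test()`/`.exec()` advance `lastIndex` between calls; reset `lastIndex` to 0 (or use `String.prototype.replace`) when reusing it.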
254
+ */
255
+ const COMPILED_INJECTION_PATTERN = compilePatterns(PROMPT_INJECTION_PATTERNS);
256
/**
 * Normalize text for prompt injection detection.
 * Handles Unicode homoglyphs and common obfuscation tricks.
 *
 * @security ORDERING:
 * 1. Homoglyph mapping first, via removeConfusables
 * 2. NFKC normalization: folds compatibility forms (e.g. full-width letters,
 *    ligatures) to their plain equivalents and then canonically composes
 * 3. Zero-width / variation-selector removal, so "ign\u2060ore" cannot
 *    slip past word-based patterns
 * 4. Typographic quotes are mapped to their ASCII counterparts
 * 5. Lower-casing
 *
 * The quote classes are spelled with \u escapes on purpose: a class that
 * contains only the ASCII quote itself replaces a character with itself
 * and normalizes nothing.
 *
 * @param text - Text to normalize
 * @returns Normalized, lower-cased text intended for pattern matching only
 * @see https://unicode.org/reports/tr39/#Confusable_Detection
 */
function normalizeForDetection(text) {
    // Step 1: map look-alike characters from other scripts to Latin
    const latinized = removeConfusables(text);
    return latinized
        .normalize('NFKC') // Step 2: compatibility folding + composition
        .replace(/[\u200B-\u200D\u2060\u180E\uFEFF\u034F\uFE00-\uFE0F]/g, '') // Step 3: invisible chars
        .replace(/[\u2018\u2019\u201A\u201B]/g, "'") // Step 4: typographic single quotes
        .replace(/[\u201C\u201D\u201E\u201F]/g, '"') // Step 4: typographic double quotes
        .toLowerCase();
}
279
/** Zero-width characters and variation selectors stripped from text before matching/output */
const ZERO_WIDTH_CHARS_REGEX = /[\u200B-\u200D\u2060\u180E\uFEFF\u034F\uFE00-\uFE0F]/g;
/**
 * Typographic single quotes (U+2018, U+2019, U+201A, U+201B).
 * Written as \u escapes so the class cannot silently degrade to the ASCII
 * apostrophe, which would make the normalization a no-op.
 */
const SMART_SINGLE_QUOTES_REGEX = /[\u2018\u2019\u201A\u201B]/g;
/**
 * Typographic double quotes (U+201C, U+201D, U+201E, U+201F).
 * Written as \u escapes for the same reason as the single-quote class.
 */
const SMART_DOUBLE_QUOTES_REGEX = /[\u201C\u201D\u201E\u201F]/g;
/** Double newline delimiter regex for section injection prevention */
const DOUBLE_NEWLINE_REGEX = /\n\n/g;
/** Section keyword at the start of a line, as used by the prompt templates */
const SECTION_KEYWORD_REGEX = /\n(Output|Input|Context|Expected Output|Criteria|Score):/gi;
289
/**
 * Prepare a list of context strings for inclusion in a prompt.
 *
 * Keeps at most the first MAX_CONTEXT_ITEMS entries and runs each of them
 * through sanitizeForPrompt with its default length limit.
 *
 * @param context - Array of context strings
 * @returns New array holding the sanitized entries (at most MAX_CONTEXT_ITEMS)
 */
export function sanitizeContextArray(context) {
    const limited = context.slice(0, MAX_CONTEXT_ITEMS);
    // Wrap the call so map's index argument is never passed as maxLength
    return limited.map((entry) => sanitizeForPrompt(entry));
}
300
/**
 * Sanitize text for safe inclusion in prompts.
 *
 * Processing steps:
 * 1. Truncate to `maxLength` and strip zero-width characters.
 * 2. Build a separate detection copy (confusables mapped, NFKC-normalized,
 *    typographic quotes folded) and test it against the compiled
 *    injection pattern.
 * 3. If an injection pattern matches, the detection copy is returned with
 *    every match replaced by '[filtered]'. Otherwise the truncated original
 *    is returned, so legitimate non-Latin text is preserved.
 * 4. In both cases prompt delimiters are escaped: a space is inserted after
 *    every newline that is directly followed by another newline, and
 *    section keywords at the start of a line are indented by one space.
 *
 * The newline escaping uses a lookahead so that runs of three or more
 * newlines are fully broken up. A non-overlapping replace of "\n\n" would
 * turn "\n\n\n" into "\n \n\n" and leave a blank-line delimiter behind.
 *
 * @param text - Text to sanitize
 * @param maxLength - Maximum allowed length (default: MAX_TEXT_LENGTH)
 * @returns Sanitized text with injection patterns replaced by '[filtered]'
 */
export function sanitizeForPrompt(text, maxLength = MAX_TEXT_LENGTH) {
    // Truncate first, then drop invisible characters that could split words
    const sanitized = text.slice(0, maxLength).replace(ZERO_WIDTH_CHARS_REGEX, '');
    // Detection copy only - never returned unless an injection is found
    const detectionText = removeConfusables(sanitized)
        .normalize('NFKC')
        .replace(ZERO_WIDTH_CHARS_REGEX, '')
        .replace(SMART_SINGLE_QUOTES_REGEX, "'")
        .replace(SMART_DOUBLE_QUOTES_REGEX, '"');
    const hasInjection = COMPILED_INJECTION_PATTERN.test(detectionText);
    // The compiled regex is global: test() moved lastIndex, so rewind it
    COMPILED_INJECTION_PATTERN.lastIndex = 0;
    // Data loss (normalized text) is accepted only when input is malicious
    const base = hasInjection
        ? detectionText.replace(COMPILED_INJECTION_PATTERN, '[filtered]')
        : sanitized;
    return base
        .replace(/\n(?=\n)/g, '\n ') // break up every run of consecutive newlines
        .replace(SECTION_KEYWORD_REGEX, '\n $1:');
}
351
/**
 * Create a sanitizer that applies the default injection patterns plus
 * caller-supplied ones.
 *
 * All patterns are merged once, at factory time, into a single global,
 * case-insensitive regex; flags on the supplied patterns are not preserved.
 * The returned function follows the same steps as sanitizeForPrompt:
 * truncate, strip zero-width characters, detect on a normalized copy,
 * replace matches with '[filtered]' when an injection is found, and escape
 * prompt delimiters.
 *
 * Newline escaping uses a lookahead so runs of three or more newlines are
 * fully broken up ("\n\n\n" must not leave a "\n\n" behind).
 *
 * @param additionalPatterns - Additional regex patterns to detect as prompt injection
 * @returns Sanitizer function with signature (text: string, maxLength?: number) => string
 * @throws {InputValidationError} If additionalPatterns is not an array or
 *   contains non-RegExp items
 *
 * @example
 * ```typescript
 * const customSanitizer = createSanitizer([
 *   /my\s+custom\s+attack\s+pattern/gi,
 *   /another\s+pattern/gi,
 * ]);
 * const sanitized = customSanitizer(userInput);
 * const truncated = customSanitizer(longInput, 1000); // Override maxLength per-call
 * ```
 *
 * @security Custom patterns MUST avoid ReDoS vulnerabilities:
 * - Avoid nested quantifiers like (a+)+ or (a*)*
 * - Test patterns with tools like safe-regex before deployment
 * - Example vulnerable: /^(a+)+$/ with input "aaaaaaaaaaaaaaaaX"
 * - Example safe: /^a+$/ or /^(?:a+)$/
 */
export function createSanitizer(additionalPatterns = []) {
    // Reject non-arrays up front so callers get the documented error type
    // instead of a TypeError from the spread below
    if (!Array.isArray(additionalPatterns)) {
        throw new InputValidationError(`additionalPatterns must be an array, got ${typeof additionalPatterns}`, 'additionalPatterns', 'type');
    }
    additionalPatterns.forEach((pattern, i) => {
        if (!(pattern instanceof RegExp)) {
            throw new InputValidationError(`additionalPatterns[${i}] must be a RegExp, got ${typeof pattern}`, 'additionalPatterns', 'type');
        }
    });
    // Compile once; the closure below reuses the same regex on every call
    const compiledPattern = compilePatterns([...PROMPT_INJECTION_PATTERNS, ...additionalPatterns]);
    return (text, maxLength = MAX_TEXT_LENGTH) => {
        // Truncate, then drop invisible characters
        const sanitized = text.slice(0, maxLength).replace(ZERO_WIDTH_CHARS_REGEX, '');
        // Normalized copy used for detection only
        const detectionText = removeConfusables(sanitized)
            .normalize('NFKC')
            .replace(ZERO_WIDTH_CHARS_REGEX, '')
            .replace(SMART_SINGLE_QUOTES_REGEX, "'")
            .replace(SMART_DOUBLE_QUOTES_REGEX, '"');
        const hasInjection = compiledPattern.test(detectionText);
        compiledPattern.lastIndex = 0; // global regex: rewind after test()
        const base = hasInjection
            ? detectionText.replace(compiledPattern, '[filtered]')
            : sanitized;
        return base
            .replace(/\n(?=\n)/g, '\n ') // break up every run of consecutive newlines
            .replace(SECTION_KEYWORD_REGEX, '\n $1:');
    };
}
418
/**
 * Validate test case input sizes against security limits.
 *
 * Checks, in order: input, output, number of context items, each context
 * item (type and length), expected output, and finally the combined size.
 * Per-field limits are measured in characters (string length); the combined
 * limit is measured in UTF-8 bytes, because it is compared against
 * MAX_INPUT_SIZE_BYTES and string length under-counts non-ASCII text.
 *
 * @param testCase - Test case to validate
 * @returns void
 * @throws {InputValidationError} If input or output is not a string
 * @throws {InputValidationError} If any field exceeds MAX_TEXT_LENGTH
 * @throws {InputValidationError} If context exceeds MAX_CONTEXT_ITEMS
 * @throws {InputValidationError} If a context item is not a string
 * @throws {InputValidationError} If total size exceeds MAX_INPUT_SIZE_BYTES
 */
export function validateTestCase(testCase) {
    const { input, output, context, expectedOutput } = testCase;
    // input/output get the same type check as context items, so a wrong
    // type surfaces as a validation error instead of a raw TypeError
    if (typeof input !== 'string') {
        throw new InputValidationError(`Input must be a string, got ${typeof input}`, 'input', 'type');
    }
    if (input.length > MAX_TEXT_LENGTH) {
        throw new InputValidationError(`Input exceeds ${MAX_TEXT_LENGTH} character limit`, 'input', 'maxLength');
    }
    if (typeof output !== 'string') {
        throw new InputValidationError(`Output must be a string, got ${typeof output}`, 'output', 'type');
    }
    if (output.length > MAX_TEXT_LENGTH) {
        throw new InputValidationError(`Output exceeds ${MAX_TEXT_LENGTH} character limit`, 'output', 'maxLength');
    }
    if (context && context.length > MAX_CONTEXT_ITEMS) {
        throw new InputValidationError(`Context exceeds ${MAX_CONTEXT_ITEMS} items limit`, 'context', 'maxLength');
    }
    // Validate individual context item types and sizes
    if (context) {
        for (let i = 0; i < context.length; i++) {
            const item = context[i];
            if (typeof item !== 'string') {
                throw new InputValidationError(`Context item ${i} must be a string, got ${typeof item}`, 'context', 'type');
            }
            if (item.length > MAX_TEXT_LENGTH) {
                throw new InputValidationError(`Context item ${i} exceeds ${MAX_TEXT_LENGTH} character limit`, 'context', 'maxLength');
            }
        }
    }
    if (expectedOutput && expectedOutput.length > MAX_TEXT_LENGTH) {
        throw new InputValidationError(`Expected output exceeds ${MAX_TEXT_LENGTH} character limit`, 'expectedOutput', 'maxLength');
    }
    // Individual fields may pass while the combined payload is still too
    // large. Measure real UTF-8 bytes: one character can take up to 3 bytes
    // (4 bytes for a surrogate pair), so .length is not a byte count.
    const encoder = new TextEncoder();
    const utf8Bytes = (value) => encoder.encode(value).length;
    let totalBytes = utf8Bytes(input) + utf8Bytes(output);
    if (context) {
        totalBytes += context.reduce((sum, item) => sum + utf8Bytes(item), 0);
    }
    if (expectedOutput) {
        totalBytes += utf8Bytes(expectedOutput);
    }
    if (totalBytes > MAX_INPUT_SIZE_BYTES) {
        throw new InputValidationError(`Total test case size ${totalBytes} exceeds ${MAX_INPUT_SIZE_BYTES} bytes`, 'testCase', 'maxSize');
    }
}
468
/**
 * Parse JSON text, rejecting oversized or overly nested documents.
 *
 * The text length is checked against MAX_INPUT_SIZE_BYTES before parsing.
 * After parsing, the value tree is walked; the root is depth 0 and each
 * array element or own object property is one level deeper. Any value
 * (container or primitive) found deeper than `maxDepth` aborts the walk.
 *
 * @param text - JSON text to parse
 * @param maxDepth - Maximum nesting depth (default: MAX_JSON_DEPTH)
 * @returns Parsed value
 * @throws {Error} If JSON is invalid, too large, or too deeply nested
 */
export function safeJSONParse(text, maxDepth = MAX_JSON_DEPTH) {
    if (text.length > MAX_INPUT_SIZE_BYTES) {
        throw new Error('JSON response too large');
    }
    const parsed = JSON.parse(text);
    // Recursion is bounded: it stops (by throwing) one level past maxDepth
    function assertDepth(node, level) {
        if (level > maxDepth) {
            throw new Error('JSON nesting too deep');
        }
        if (node === null || typeof node !== 'object') {
            return;
        }
        if (Array.isArray(node)) {
            node.forEach((child) => assertDepth(child, level + 1));
            return;
        }
        for (const key of Object.keys(node)) {
            assertDepth(node[key], level + 1);
        }
    }
    assertDepth(parsed, 0);
    return parsed;
}
505
/**
 * Run an async function, rejecting if it does not settle in time.
 *
 * `fn` receives an AbortSignal. The signal is aborted when the timeout
 * fires and also when `fn` itself fails; it is left un-aborted when `fn`
 * succeeds. The timer is always cleared once the race has settled.
 *
 * @param fn - Async function to execute, receives AbortSignal for cancellation
 * @param timeoutMs - Timeout in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Result of the function
 * @throws {LLMTimeoutError} If function times out
 */
export async function withTimeout(fn, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const controller = new AbortController();
    const { signal } = controller;
    let timer;
    const disarm = () => {
        if (timer === undefined) {
            return;
        }
        clearTimeout(timer);
        timer = undefined;
    };
    const deadline = new Promise((_resolve, reject) => {
        timer = setTimeout(() => {
            // Nothing to do if the signal was already aborted elsewhere
            if (signal.aborted) {
                return;
            }
            controller.abort(); // abort first so fn can observe the timeout
            disarm();
            reject(new LLMTimeoutError(timeoutMs));
        }, timeoutMs);
    });
    let outcome;
    try {
        outcome = await Promise.race([fn(signal), deadline]);
    }
    catch (error) {
        disarm();
        // A timeout has already aborted; only abort for failures of fn itself
        if (!signal.aborted) {
            controller.abort();
        }
        throw error;
    }
    // Success path: stop the timer, leave the signal un-aborted
    disarm();
    return outcome;
}
551
+ // ============================================================================
552
+ // G-Eval Pattern Helpers
553
+ // ============================================================================
554
/**
 * Assemble the judge prompt for a G-Eval run.
 *
 * The prompt always contains the evaluation name, the criteria and the
 * evaluation steps. Input, output, context and expected output are added
 * only when listed in `config.evaluationParams` (context and expected
 * output additionally require a truthy value on the test case). Test-case
 * fields pass through sanitizeForPrompt / sanitizeContextArray;
 * `config.name`, `config.criteria` and `steps` are inserted as given.
 *
 * @param config - G-Eval configuration with criteria and parameters
 * @param testCase - Test case containing input, output, and optional context
 * @param steps - Evaluation steps text
 * @returns Formatted prompt string for the judge model
 */
export function buildEvalPrompt(config, testCase, steps) {
    const wants = (param) => config.evaluationParams.includes(param);
    let prompt = `You are evaluating: ${config.name}`;
    prompt += `\nCriteria: ${config.criteria}`;
    prompt += `\nEvaluation Steps:\n${steps}`;
    if (wants('input')) {
        prompt += `\nInput: ${sanitizeForPrompt(testCase.input)}`;
    }
    if (wants('output')) {
        prompt += `\nOutput: ${sanitizeForPrompt(testCase.output)}`;
    }
    if (wants('context') && testCase.context) {
        const contextLines = sanitizeContextArray(testCase.context).join('\n');
        prompt += `\nContext: ${contextLines}`;
    }
    if (wants('expectedOutput') && testCase.expectedOutput) {
        prompt += `\nExpected Output: ${sanitizeForPrompt(testCase.expectedOutput)}`;
    }
    prompt += `\nProvide a score from ${G_EVAL_MIN_SCORE}-${G_EVAL_MAX_SCORE} and explain your reasoning.`;
    return prompt;
}
586
/**
 * Normalize score using token log probabilities.
 *
 * Every logprob entry whose token parses (parseInt, base 10, after trimming)
 * to one of `validScores` contributes exp(logprob) to that score's weight.
 * The result is the probability-weighted average of the scores.
 *
 * Entries are skipped, rather than poisoning the result, when the token is
 * not a string or when exp(logprob) is not a finite number (NaN or +Infinity
 * logprob). Without that guard a single bad entry would make the function
 * return NaN, because the "no tokens" check only catches a total of exactly 0.
 *
 * @param logprobs - Token log probabilities ({ token, logprob } entries)
 * @param validScores - Valid score values (e.g., [1, 2, 3, 4, 5])
 * @returns Normalized score as weighted average
 * @throws {ScoreNormalizationError} When validScores contains a non-finite value
 * @throws {ScoreNormalizationError} When no valid score tokens found in logprobs
 */
export function normalizeWithLogprobs(logprobs, validScores) {
    if (!validScores.every((s) => typeof s === 'number' && Number.isFinite(s))) {
        throw new ScoreNormalizationError('validScores must contain only finite numbers');
    }
    // Accumulated probability mass per valid score
    const scoreProbs = new Map(validScores.map((score) => [score, 0]));
    for (const { token, logprob } of logprobs) {
        if (typeof token !== 'string') {
            continue;
        }
        const scoreValue = Number.parseInt(token.trim(), 10);
        // Map lookup replaces the linear validScores.includes scan
        if (!scoreProbs.has(scoreValue)) {
            continue;
        }
        const prob = Math.exp(logprob);
        if (!Number.isFinite(prob)) {
            continue;
        }
        scoreProbs.set(scoreValue, scoreProbs.get(scoreValue) + prob);
    }
    let weightedSum = 0;
    let totalProb = 0;
    for (const [score, prob] of scoreProbs) {
        weightedSum += score * prob;
        totalProb += prob;
    }
    if (totalProb === 0) {
        throw new ScoreNormalizationError('No valid score tokens found in logprobs - cannot normalize');
    }
    return weightedSum / totalProb;
}
622
+ // ============================================================================
623
+ // Score Extraction Pattern Constants
624
+ // ============================================================================
625
/**
 * "Score: 4" / "score 3" - explicit score declaration.
 * Case-insensitive; group 1 is the digit 1-5.
 */
const EXPLICIT_SCORE_PATTERN = /\bscore[:\s]+([1-5])\b/i;
/**
 * "Rating: 4" - rating declaration.
 * Case-insensitive; group 1 is the digit 1-5.
 */
const RATING_PATTERN = /\brating[:\s]+([1-5])\b/i;
/**
 * "4 out of 5" / "4/5" - fractional form with denominator 5.
 * Case-insensitive; group 1 is the numerator digit 1-5.
 */
const FRACTION_PATTERN = /\b([1-5])\s*(?:out of|\/)\s*5\b/i;
/**
 * A digit 1-5 that is alone on its line (multiline mode).
 * Group 1 is the digit.
 */
const STANDALONE_DIGIT_PATTERN = /^\s*([1-5])\s*$/m;
/**
 * Score patterns, tried in this order: the most explicit wording first,
 * the bare digit last, to limit false positives.
 */
const SCORE_PATTERNS = [
    EXPLICIT_SCORE_PATTERN,
    RATING_PATTERN,
    FRACTION_PATTERN,
    STANDALONE_DIGIT_PATTERN,
];
/** Number of trailing characters inspected by the last-digit fallback */
const SCORE_FALLBACK_WINDOW = 100;
/**
 * Pull a 1-5 score out of free-form LLM response text.
 *
 * Each pattern in SCORE_PATTERNS is tried against the whole text and the
 * first pattern that matches decides the score. If none matches, the last
 * standalone digit 1-5 within the final SCORE_FALLBACK_WINDOW characters
 * is used.
 *
 * @param text - LLM response text
 * @returns Extracted score (integer 1-5)
 * @throws {ScoreNormalizationError} If no valid score found
 */
export function extractScoreFromText(text) {
    let captured;
    const matchedPattern = SCORE_PATTERNS.some((pattern) => {
        const found = text.match(pattern);
        if (!found) {
            return false;
        }
        captured = found[1];
        return true; // stop at the first pattern that hits
    });
    if (matchedPattern) {
        return parseInt(captured, 10);
    }
    // Fallback: only the tail is searched, so incidental digits earlier in
    // the prose are not picked up
    const tailDigits = text.slice(-SCORE_FALLBACK_WINDOW).match(/\b([1-5])\b/g);
    if (tailDigits && tailDigits.length > 0) {
        const lastDigit = tailDigits[tailDigits.length - 1];
        return parseInt(lastDigit, 10);
    }
    // Fail loudly instead of inventing a default score
    throw new ScoreNormalizationError('No valid score found in LLM response');
}
696
/**
 * G-Eval: chain-of-thought evaluation with optional logprob-weighted scoring.
 *
 * Flow:
 * 1. Validate the test case sizes.
 * 2. Ask the LLM to generate evaluation steps for `config.criteria`.
 * 3. Ask the LLM to evaluate the test case using those steps, requesting
 *    logprobs.
 * 4. Derive the raw score from logprobs when present; if that fails, or no
 *    logprobs were returned, extract it from the response text.
 * 5. Map the raw score onto 0-1 via (raw - G_EVAL_MIN_SCORE) / G_EVAL_SCORE_RANGE.
 *
 * Both LLM calls are wrapped in withTimeout; the abort signal is not
 * forwarded to `llm.generate`.
 *
 * @param llm - LLM provider for judge calls
 * @param config - G-Eval configuration with name, criteria, and evaluation parameters
 * @param testCase - Test case to evaluate
 * @param timeoutMs - Timeout for LLM calls in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Object with normalized `score`, `reason` and `rawResponse`
 * @throws {InputValidationError} If test case exceeds size limits
 * @throws {Error} If LLM call times out or returns invalid score
 */
export async function gEval(llm, config, testCase, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    validateTestCase(testCase);
    // Step 1: have the model spell out evaluation steps (chain of thought)
    const stepsPrompt = `
Given the criteria: ${config.criteria}
Generate detailed evaluation steps to assess this criterion.
List ${G_EVAL_MIN_STEPS}-${G_EVAL_MAX_STEPS} specific steps the evaluator should follow.
`;
    const stepsResponse = await withTimeout((_signal) => llm.generate(stepsPrompt, {
        temperature: config.temperature ?? LLM_TEMPERATURE_EVALUATION,
    }), timeoutMs);
    // Step 2: run the actual evaluation with the generated steps
    const evalPrompt = buildEvalPrompt(config, testCase, stepsResponse.text);
    const response = await withTimeout((_signal) => llm.generate(evalPrompt, {
        temperature: config.temperature ?? LLM_TEMPERATURE_EVALUATION,
        logprobs: true,
    }), timeoutMs);
    // Step 3: prefer logprob weighting, fall back to parsing the text
    const hasLogprobs = Boolean(response.logprobs && response.logprobs.length > 0);
    let rawScore;
    if (!hasLogprobs) {
        rawScore = extractScoreFromText(response.text);
    }
    else {
        try {
            rawScore = normalizeWithLogprobs(response.logprobs, [...G_EVAL_VALID_SCORES]);
        }
        catch {
            // Logprobs held no usable score tokens - use the text instead
            rawScore = extractScoreFromText(response.text);
        }
    }
    // Step 4: rescale from the G-Eval range to 0-1
    const normalizedScore = (rawScore - G_EVAL_MIN_SCORE) / G_EVAL_SCORE_RANGE;
    if (!isValidScore(normalizedScore)) {
        throw new Error(`Invalid normalized score: ${normalizedScore} (raw: ${rawScore})`);
    }
    const { text } = response;
    return {
        score: normalizedScore,
        reason: text,
        rawResponse: text,
    };
}
750
+ // ============================================================================
751
+ // QAG Pattern Helpers
752
+ // ============================================================================
753
/**
 * Extract atomic statements from LLM output.
 *
 * Asks the LLM for a JSON array of claims and parses the reply with
 * safeJSONParse. The raw reply is tried first; if it is wrapped in a
 * Markdown code fence, the fenced content is tried as well, since models
 * frequently fence JSON even when told to return JSON only. If no candidate
 * yields an array, a warning is logged (when enabled) and `output` is split
 * into sentences with sbd instead.
 *
 * @param llm - LLM provider for extraction
 * @param output - Text to extract statements from
 * @param timeoutMs - Timeout for LLM call in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Array of atomic statements (max MAX_STATEMENTS)
 * @throws {Error} If LLM call times out
 */
export async function extractStatements(llm, output, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const sanitizedOutput = sanitizeForPrompt(output);
    const prompt = `
Extract all factual claims from the following text as a JSON array of strings.
Each claim should be a single, atomic statement that can be verified independently.

Text: ${sanitizedOutput}

Return ONLY a JSON array, e.g.: ["claim 1", "claim 2", "claim 3"]
`;
    const response = await withTimeout((_signal) => llm.generate(prompt, { temperature: LLM_TEMPERATURE_DETERMINISTIC }), timeoutMs);
    // Candidate payloads: the raw reply, then the body of a surrounding
    // code fence. Plain string operations keep this linear in the reply size.
    const candidates = [response.text];
    const trimmedReply = response.text.trim();
    if (trimmedReply.length >= 6 && trimmedReply.startsWith('```') && trimmedReply.endsWith('```')) {
        const firstLineEnd = trimmedReply.indexOf('\n');
        if (firstLineEnd !== -1 && firstLineEnd < trimmedReply.length - 3) {
            // Drop the opening fence line (with optional language tag) and the closing fence
            candidates.push(trimmedReply.slice(firstLineEnd + 1, -3));
        }
    }
    let lastError;
    for (const candidate of candidates) {
        try {
            const parsed = safeJSONParse(candidate);
            if (!Array.isArray(parsed)) {
                throw new Error(`Expected array from statement extraction, got ${typeof parsed}`);
            }
            // Keep non-empty strings only, capped at MAX_STATEMENTS
            return parsed
                .filter((item) => typeof item === 'string' && item.trim().length > 0)
                .slice(0, MAX_STATEMENTS);
        }
        catch (error) {
            lastError = error;
        }
    }
    // All candidates failed - log context (honoring the log level) and fall back
    const errorMessage = lastError instanceof Error ? lastError.message : String(lastError);
    const responsePreview = response.text.length > 200
        ? response.text.slice(0, 200) + '...'
        : response.text;
    if (shouldLog('warn')) {
        console.warn('[llm-as-judge] Statement extraction JSON parse failed, using sentence fallback.', {
            error: errorMessage,
            responsePreview,
            outputLength: output.length,
        });
    }
    // Sentence boundary detection on the original output; short fragments
    // (length <= MIN_STATEMENT_LENGTH) are dropped
    return sbd.sentences(output)
        .map(s => s.trim())
        .filter(s => s.length > MIN_STATEMENT_LENGTH)
        .slice(0, MAX_STATEMENTS);
}
813
/**
 * Turn a statement into a yes/no verification question via the LLM.
 *
 * The statement is passed through sanitizeForPrompt before being placed in
 * the prompt; the LLM call runs under withTimeout.
 *
 * @param llm - LLM provider for question generation
 * @param statement - Statement to verify
 * @param timeoutMs - Timeout for LLM call in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns The model's reply, trimmed
 * @throws {Error} If LLM call times out
 */
export async function generateVerificationQuestion(llm, statement, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const safeStatement = sanitizeForPrompt(statement);
    // Leading/trailing empty entries reproduce the newline that opens and
    // closes the prompt
    const prompt = [
        '',
        'Convert this statement into a yes/no question that can verify its accuracy:',
        '',
        `Statement: ${safeStatement}`,
        '',
        'Return ONLY the question, nothing else.',
        '',
    ].join('\n');
    const callOptions = { temperature: LLM_TEMPERATURE_DETERMINISTIC };
    const { text } = await withTimeout((_signal) => llm.generate(prompt, callOptions), timeoutMs);
    return text.trim();
}
835
/**
 * Answer verification question using provided context.
 *
 * The reply is classified by the earliest whole-word marker it contains:
 * an affirmative word (yes/yeah/correct/true/affirmative), a negative word
 * (no/nope/incorrect/false/negative), or the word "unknown". Looking for
 * "unknown" explicitly matters: a reply such as "Unknown - there is no
 * mention of it" contains the word "no" and would otherwise be read as a
 * negative answer. Replies with no marker at all are 'unknown'.
 *
 * @param llm - LLM provider for answering
 * @param question - Yes/no question to answer
 * @param context - Context documents to use for answering
 * @param timeoutMs - Timeout for LLM call in milliseconds (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns 'yes', 'no', or 'unknown'
 * @throws {Error} If LLM call times out
 * @security Question and context pass through sanitizeForPrompt / sanitizeContextArray
 */
export async function answerQuestion(llm, question, context, timeoutMs = DEFAULT_LLM_TIMEOUT_MS) {
    const sanitizedQuestion = sanitizeForPrompt(question);
    const sanitizedContext = sanitizeContextArray(context);
    const prompt = `
Based ONLY on the following context, answer the question with "yes", "no", or "unknown".

Context:
${sanitizedContext.join('\n\n')}

Question: ${sanitizedQuestion}

Answer (yes/no/unknown):
`;
    const response = await withTimeout((_signal) => llm.generate(prompt, { temperature: LLM_TEMPERATURE_DETERMINISTIC }), timeoutMs);
    const normalized = response.text.trim().toLowerCase();
    // Word boundaries avoid hits inside "yesterday" or "notwithstanding"
    const markers = [
        { verdict: 'yes', position: normalized.search(/\b(?:yes|yeah|correct|true|affirmative)\b/) },
        { verdict: 'no', position: normalized.search(/\b(?:no|nope|incorrect|false|negative)\b/) },
        { verdict: 'unknown', position: normalized.search(/\bunknown\b/) },
    ].filter(({ position }) => position !== -1);
    if (markers.length === 0) {
        return 'unknown';
    }
    // The marker that appears first reflects the model's leading answer
    const earliest = markers.reduce((best, current) => (current.position < best.position ? current : best));
    return earliest.verdict;
}
876
/**
 * QAG (Question-Answer Generation) faithfulness evaluation.
 *
 * Steps:
 * 1. Extract statements from `output`; with none, the result is 1.
 * 2. Generate one verification question per statement (Promise.allSettled,
 *    so individual failures are tolerated).
 * 3. Answer every successfully generated question against `context`
 *    (again with Promise.allSettled).
 * 4. Return the fraction of successful answers that are 'yes'. 'no' and
 *    'unknown' both count against the score; failed calls are excluded.
 *
 * @param llm - LLM provider for all operations
 * @param input - Original user input (not used by this function)
 * @param output - LLM output to evaluate for faithfulness
 * @param context - Context documents for verification
 * @param options - Optional configuration object
 * @param options.timeoutMs - Timeout for each LLM call (default: DEFAULT_LLM_TIMEOUT_MS)
 * @returns Faithfulness score (0-1)
 * @throws {Error} If every question-generation call or every answer call fails
 */
export async function qagEvaluate(llm, input, output, context, options) {
    const timeoutMs = options?.timeoutMs ?? DEFAULT_LLM_TIMEOUT_MS;
    const isFulfilled = (settled) => settled.status === 'fulfilled';
    // Step 1: statements to verify
    const statements = await extractStatements(llm, output, timeoutMs);
    if (statements.length === 0) {
        return 1; // No claims to verify = fully faithful
    }
    // Step 2: one question per statement, keeping only the successes
    const questionOutcomes = await Promise.allSettled(statements.map((statement) => generateVerificationQuestion(llm, statement, timeoutMs)));
    const questions = questionOutcomes.filter(isFulfilled).map((settled) => settled.value);
    // A total failure is an error, not a score of 0 ("unfaithful")
    if (questions.length === 0) {
        throw new Error('QAG evaluation failed: no verification questions generated');
    }
    // Step 3: answer each question from the context
    const answerOutcomes = await Promise.allSettled(questions.map((question) => answerQuestion(llm, question, context, timeoutMs)));
    const answers = answerOutcomes.filter(isFulfilled).map((settled) => settled.value);
    if (answers.length === 0) {
        throw new Error('QAG evaluation failed: no verification answers obtained');
    }
    // Step 4: share of 'yes' among the answers that were obtained
    const yesCount = answers.filter((answer) => answer === 'yes').length;
    return yesCount / answers.length;
}
939
+ // ============================================================================
940
+ // Bias Mitigation
941
+ // ============================================================================
942
/** Winner labels a pairwise evaluate function is allowed to return. */
const VALID_PAIRWISE_WINNERS = ['A', 'B', 'tie'];
/**
 * Render an arbitrary value for inclusion in a validation error message
 * without ever throwing.
 *
 * JSON.stringify throws a TypeError for circular structures and BigInt
 * values; without this guard that TypeError would replace the validation
 * error the caller is supposed to receive.
 *
 * @param result - Any value returned by an external evaluate function
 * @returns JSON text when serializable (may be undefined for values JSON
 *   cannot represent, e.g. undefined or functions), otherwise a string form
 */
function describePairwiseResult(result) {
  try {
    return JSON.stringify(result);
  }
  catch {
    try {
      return String(result);
    }
    catch {
      // String() itself throws for objects that cannot be converted to a primitive.
      return Object.prototype.toString.call(result);
    }
  }
}
/**
 * Validate that an evaluate function returned a valid pairwise result.
 * Runtime guard: the evaluate function is supplied by the caller, so its
 * return value is checked rather than trusted.
 *
 * @param result - Result from evaluate function to validate
 * @param ordering - Ordering label for error message ('AB' or 'BA')
 * @throws {InputValidationError} If result is not an object whose `winner`
 *   is one of 'A', 'B' or 'tie' (also for results that cannot be serialized)
 */
function validatePairwiseResult(result, ordering) {
  const isValid = Boolean(result) &&
    typeof result === 'object' &&
    typeof result.winner === 'string' &&
    VALID_PAIRWISE_WINNERS.includes(result.winner);
  if (!isValid) {
    throw new InputValidationError(`Invalid evaluate result for ${ordering} ordering: expected { winner: 'A' | 'B' | 'tie' }, got ${describePairwiseResult(result)}`, 'evaluate', 'type');
  }
}
960
/**
 * Pairwise comparison with position-bias mitigation.
 *
 * The judge is asked twice, once with (outputA, outputB) and once with the
 * outputs swapped. A side only wins when both orderings agree on it; any
 * disagreement, or a tie in either ordering, yields 'tie'.
 *
 * @param evaluate - Evaluation function that compares two outputs
 * @param input - User input
 * @param outputA - First output to compare
 * @param outputB - Second output to compare
 * @returns Winner ('A', 'B', or 'tie')
 * @throws {Error} If evaluate is not a function
 * @throws {InputValidationError} If input, outputA, or outputB is empty
 * @throws {InputValidationError} If any string exceeds MAX_TEXT_LENGTH
 */
export async function mitigatedPairwiseEval(evaluate, input, outputA, outputB) {
  if (typeof evaluate !== 'function') {
    throw new Error('mitigatedPairwiseEval requires an evaluate function');
  }

  const textArguments = [
    { label: 'Input', field: 'input', value: input },
    { label: 'Output A', field: 'outputA', value: outputA },
    { label: 'Output B', field: 'outputB', value: outputB },
  ];

  // Pass 1: every argument must contain non-whitespace text. This pass
  // finishes before any length check so emptiness errors take precedence.
  for (const { label, field, value } of textArguments) {
    if (!value || value.trim().length === 0) {
      throw new InputValidationError(`${label} cannot be empty`, field, 'required');
    }
  }

  // Pass 2: bound the sizes to prevent resource exhaustion.
  for (const { label, field, value } of textArguments) {
    if (value.length > MAX_TEXT_LENGTH) {
      throw new InputValidationError(`${label} exceeds ${MAX_TEXT_LENGTH} character limit`, field, 'maxLength');
    }
  }

  // Run both orderings concurrently.
  const forwardCall = evaluate(input, outputA, outputB);
  const swappedCall = evaluate(input, outputB, outputA);
  const [forward, swapped] = await Promise.all([forwardCall, swappedCall]);

  // The evaluate function is external, so check what it returned at runtime.
  validatePairwiseResult(forward, 'AB');
  validatePairwiseResult(swapped, 'BA');

  // In the swapped call the positions are reversed: a 'B' verdict there
  // refers to the original outputA, and an 'A' verdict to the original outputB.
  if (forward.winner === 'A' && swapped.winner === 'B') {
    return 'A';
  }
  if (forward.winner === 'B' && swapped.winner === 'A') {
    return 'B';
  }
  return 'tie';
}
1024
/**
 * Score a test case with a panel of judges and report the median.
 *
 * All evaluators run concurrently via Promise.all, so a single rejecting
 * evaluator rejects the whole panel.
 *
 * @param evaluators - Array of evaluation functions for different models
 * @param testCase - Test case to evaluate
 * @returns Median score from all judges (mean of the two middle scores when
 *   the panel size is even)
 * @throws {Error} If evaluators array is empty
 */
export async function panelEvaluation(evaluators, testCase) {
  if (evaluators.length === 0) {
    throw new Error('panelEvaluation requires at least one evaluator');
  }

  const pending = evaluators.map((judge) => judge(testCase));
  const scores = await Promise.all(pending);

  // Sort a copy numerically; the default sort would compare as strings.
  const ascending = scores.slice();
  ascending.sort((left, right) => left - right);

  const count = ascending.length;
  const upperMiddle = Math.floor(count / 2);
  const hasSingleMiddle = count % 2 !== 0;
  return hasSingleMiddle
    ? ascending[upperMiddle]
    : (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
1046
+ // ============================================================================
1047
+ // Production Utilities
1048
+ // ============================================================================
1049
/**
 * Check that a value is a usable normalized score.
 *
 * @param score - Score value to validate
 * @returns True only for a non-NaN number within
 *   [NORMALIZED_SCORE_MIN, NORMALIZED_SCORE_MAX] (both bounds inclusive)
 */
export function isValidScore(score) {
  // Reject non-numbers first so the range comparison below never coerces.
  if (typeof score !== 'number' || Number.isNaN(score)) {
    return false;
  }
  return NORMALIZED_SCORE_MIN <= score && score <= NORMALIZED_SCORE_MAX;
}
1058
/** Upper bound for a single backoff delay, in milliseconds (60 seconds). */
const MAX_BACKOFF_MS = 60_000;
/**
 * Largest exponent used when doubling the backoff delay.
 *
 * Derivation: Math.floor(Math.log2(MAX_BACKOFF_MS / BACKOFF_BASE_MS))
 *   = Math.floor(Math.log2(60000 / 1000)) = Math.floor(5.9) = 5
 * (this assumes BACKOFF_BASE_MS is 1000, as the derivation states).
 *
 * With that base the largest computed delay is 2^5 * 1000ms = 32 seconds,
 * so the exponent cap is reached before the MAX_BACKOFF_MS ceiling.
 */
const MAX_BACKOFF_EXPONENT = 5;
1067
/**
 * Promise-based sleep used between retry attempts.
 *
 * @param ms - Time to wait in milliseconds (passed straight to setTimeout)
 * @returns Promise that fulfills (with no value) once the timer fires
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
1073
/**
 * Build a human-readable message for a thrown value that is not an Error.
 * Never throws: this runs inside the retry loop's catch handler, where a
 * second exception would abort the remaining attempts and hide the value
 * that was originally thrown.
 *
 * @param value - The non-Error value that was thrown
 * @returns JSON text for serializable objects, otherwise a string form
 */
function describeThrownValue(value) {
  if (typeof value === 'object' && value !== null) {
    try {
      // JSON gives a meaningful message instead of "[object Object]".
      const json = JSON.stringify(value);
      if (typeof json === 'string') {
        return json;
      }
    }
    catch {
      // Circular references and BigInt fields make JSON.stringify throw;
      // fall through to the generic conversions below.
    }
  }
  try {
    return String(value);
  }
  catch {
    // String() throws for objects that cannot be converted to a primitive.
    return Object.prototype.toString.call(value);
  }
}
/**
 * Evaluate with retry logic and exponential backoff.
 *
 * Retries a single evaluation function on failure. A result whose `score`
 * does not pass isValidScore is treated as a failure and retried as well.
 *
 * Compare to {@link JudgeCircuitBreaker.evaluate}:
 * - `evaluateWithRetry`: Retries same operation with backoff
 *   (BACKOFF_BASE_MS * 1, * 2, * 4, ... up to 2^MAX_BACKOFF_EXPONENT)
 * - `JudgeCircuitBreaker.evaluate`: Fails fast when service is degraded, optional fallback
 *
 * These can be combined: wrap evaluateWithRetry inside circuit breaker for
 * retries on transient errors while circuit-breaking on sustained failures.
 *
 * @param evaluate - Evaluation function to retry on failure
 * @param testCase - Test case to evaluate
 * @param maxRetries - Maximum number of attempts in total (default: DEFAULT_MAX_RETRIES).
 *   With a value below 1 no attempt is made and an Error('No attempts made') is thrown.
 * @returns Evaluation result with retryCount indicating number of failed attempts
 * @throws {Error} If all attempts fail. The thrown error is the last error
 * encountered. For non-Error thrown values, the error wraps the original value
 * in `error.cause` for debugging context (ECMAScript 2022 Error.cause).
 *
 * @example
 * const result = await evaluateWithRetry(
 *   (tc) => gEval(tc, criteria, llmFn),
 *   testCase,
 *   3
 * );
 *
 * @example
 * // Accessing error cause for debugging
 * try {
 *   await evaluateWithRetry(evaluate, testCase, 3);
 * } catch (error) {
 *   console.error('Final error:', error.message);
 *   if (error.cause) {
 *     console.error('Original cause:', error.cause);
 *   }
 * }
 */
export async function evaluateWithRetry(evaluate, testCase, maxRetries = DEFAULT_MAX_RETRIES) {
  let lastError = new Error('No attempts made');
  let retryCount = 0;
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const result = await evaluate(testCase);
      if (isValidScore(result.score)) {
        return { ...result, retryCount };
      }
      // Invalid score - treat as error and retry
      throw new Error(`Invalid score: ${result.score}`);
    }
    catch (error) {
      // Real Errors are kept as-is; anything else is wrapped so callers always
      // receive an Error, with the original value preserved as `cause`.
      lastError = error instanceof Error
        ? error
        : new Error(describeThrownValue(error), { cause: error });
      retryCount++;
      // Don't wait after the last attempt
      if (attempt < maxRetries) {
        // Exponential backoff: the delay doubles after every failed attempt
        // (exponent 0 after the first), bounded by both caps.
        const cappedExponent = Math.min(attempt - 1, MAX_BACKOFF_EXPONENT);
        const backoffMs = Math.min(BACKOFF_BASE_MS * 2 ** cappedExponent, MAX_BACKOFF_MS);
        await delay(backoffMs);
      }
    }
  }
  throw lastError;
}
1145
/**
 * Circuit breaker for judge model failures.
 * Prevents cascading failures when judge model is unavailable.
 * Rate limit errors (429) are not counted toward the threshold.
 *
 * Compare to {@link evaluateWithRetry}:
 * - `evaluateWithRetry`: Retries same operation with backoff
 * - `JudgeCircuitBreaker.evaluate`: Fails fast when service is degraded, optional fallback
 */
export class JudgeCircuitBreaker {
  /** Number of counted failures at which the circuit opens. */
  threshold;
  /** Milliseconds after the last counted failure before an open circuit is reset. */
  resetTimeout;
  /** Counted failures since the last success or reset. */
  failures = 0;
  /** Date of the most recent counted failure, or null if none since reset. */
  lastFailure = null;
  /** True while the circuit is open and calls are being short-circuited. */
  isOpen = false;
  /**
   * Marker set while evaluate() performs an automatic reset; reset() clears it.
   * @note This is plain in-memory state of one instance. For worker threads or
   * multi-process deployments, use external synchronization (e.g., Redis-based
   * distributed locks) instead of this flag.
   */
  resetting = false;
  /**
   * Count of closed-to-open transitions (for flapping detection).
   * @note Unbounded counter. For long-running services requiring bounded
   * counters, use external observability tools (e.g., Prometheus counters)
   * or implement a sliding window approach.
   */
  openCount = 0;
  /**
   * Count of times circuit has been reset (for flapping detection).
   * @note Unbounded counter - see openCount for rationale.
   */
  resetCount = 0;
  /**
   * Create a new circuit breaker instance.
   *
   * @param threshold - Number of failures before circuit opens (default: DEFAULT_CIRCUIT_BREAKER_THRESHOLD)
   * @param resetTimeout - Time in ms before circuit resets (default: DEFAULT_CIRCUIT_BREAKER_RESET_MS)
   */
  constructor(threshold = DEFAULT_CIRCUIT_BREAKER_THRESHOLD, resetTimeout = DEFAULT_CIRCUIT_BREAKER_RESET_MS) {
    this.threshold = threshold;
    this.resetTimeout = resetTimeout;
  }
  /**
   * Check if circuit is open (failing).
   *
   * @returns True if circuit is open and blocking requests
   */
  get open() {
    return this.isOpen;
  }
  /**
   * Get current failure count.
   *
   * @returns Number of counted failures since the last success or reset
   *   (rate limit errors are excluded)
   */
  get failureCount() {
    return this.failures;
  }
  /**
   * Get circuit breaker statistics for observability and flapping detection.
   *
   * @returns Object with openCount and resetCount for monitoring circuit health
   */
  get stats() {
    return { openCount: this.openCount, resetCount: this.resetCount };
  }
  /**
   * Reset the circuit breaker to closed state.
   * Clears the failure count, the last-failure timestamp and the resetting
   * flag, and increments resetCount on every call.
   *
   * @returns void
   */
  reset() {
    this.isOpen = false;
    this.failures = 0;
    this.lastFailure = null;
    this.resetting = false;
    this.resetCount++;
  }
  /**
   * Check if an error should count toward circuit breaker threshold.
   *
   * @param error - Error to check
   * @returns False for rate limit errors (transient), true for other errors.
   *   Thrown values that are not Error instances always count.
   */
  shouldCountAsFailure(error) {
    if (!(error instanceof Error)) {
      return true;
    }
    // Type-based checks (robust for typed providers)
    const rateLimitErrorNames = [
      'RateLimitError', // OpenAI SDK
      'ThrottlingException', // AWS Bedrock
      'TooManyRequestsError', // Generic
      'RateLimitExceeded', // Anthropic
    ];
    if (rateLimitErrorNames.includes(error.name)) {
      return false;
    }
    // HTTP status code check (if available on error object)
    const errorWithStatus = error;
    if (errorWithStatus.statusCode === HttpStatus.TOO_MANY_REQUESTS || errorWithStatus.status === HttpStatus.TOO_MANY_REQUESTS) {
      return false;
    }
    // Error code check (AWS-style errors)
    if (errorWithStatus.code === 'ThrottlingException' ||
      errorWithStatus.code === 'TooManyRequestsException' ||
      errorWithStatus.code === 'ProvisionedThroughputExceededException') {
      return false;
    }
    // Fallback to message pattern matching (last resort).
    // Word boundaries keep e.g. "4290" from being read as a 429.
    if (error.message && typeof error.message === 'string') {
      const message = error.message.toLowerCase();
      // Note: \s+ intentionally matches any amount of whitespace to handle
      // variations like "too  many requests" from different providers
      if (/\brate[_\s-]?limit/i.test(message) ||
        /\b429\b/.test(message) ||
        /\bthrottl/i.test(message) ||
        /\btoo\s+many\s+requests\b/i.test(message)) {
        return false;
      }
    }
    return true;
  }
  /**
   * Execute evaluation with circuit breaker protection.
   *
   * @param evaluate - Primary evaluation function to execute
   * @param fallbackEvaluate - Optional fallback function when circuit is open
   * @returns Result from evaluate or fallbackEvaluate
   * @throws {Error} If circuit is open and no fallback provided
   * @throws {Error} If evaluation fails (error is re-thrown after recording)
   */
  async evaluate(evaluate, fallbackEvaluate) {
    // Auto-reset once the timeout has elapsed since the last counted failure.
    // There is no await between this check and reset(), so concurrent callers
    // on the same event loop cannot interleave here and reset twice.
    if (this.isOpen && this.lastFailure && !this.resetting) {
      const elapsed = Date.now() - this.lastFailure.getTime();
      if (elapsed > this.resetTimeout) {
        this.resetting = true;
        this.reset();
      }
    }
    // If circuit is open, use fallback or throw
    if (this.isOpen) {
      if (fallbackEvaluate) {
        return fallbackEvaluate();
      }
      throw new Error('Circuit breaker open - evaluation temporarily unavailable');
    }
    try {
      const result = await evaluate();
      this.failures = 0;
      return result;
    }
    catch (error) {
      // Only count non-transient errors
      if (this.shouldCountAsFailure(error)) {
        this.failures++;
        this.lastFailure = new Date();
        // Count only the closed -> open transition. Calls that were already in
        // flight when the circuit opened can still fail afterwards; they must
        // not inflate openCount, which is used for flapping detection.
        if (!this.isOpen && this.failures >= this.threshold) {
          this.isOpen = true;
          this.openCount++;
        }
      }
      throw error;
    }
  }
}
1319
/**
 * Built-in canary cases for judge pipeline health monitoring.
 *
 * Each entry pairs a fixed input/output with the metric to score and the
 * score range that outcome is expected to fall in. runCanaryEvaluations
 * uses this list when no canaries are passed.
 */
export const DEFAULT_CANARY_CASES = [
  // A correct, on-topic answer: relevance is expected to be at least 0.9.
  {
    name: 'perfect_answer',
    input: 'What is 2+2?',
    output: '2+2 equals 4.',
    metric: 'relevance',
    expectedScore: { min: 0.9 },
    description: 'Simple factual answer should score high',
  },
  // A factually wrong answer: faithfulness is expected to be at most 0.3.
  {
    name: 'hallucination_detection',
    input: 'What is the capital of France?',
    output: 'The capital of France is Tokyo, a beautiful city in Asia.',
    metric: 'faithfulness',
    expectedScore: { max: 0.3 },
    description: 'Obvious hallucination should score low',
  },
  // An answer unrelated to the question: relevance is expected to be at most 0.2.
  {
    name: 'off_topic_detection',
    input: 'Explain quantum computing',
    output: 'I love pizza! It is delicious with pepperoni.',
    metric: 'relevance',
    expectedScore: { max: 0.2 },
    description: 'Completely off-topic should score very low',
  },
];
1348
/**
 * Run canary evaluations to monitor judge pipeline health.
 *
 * All canaries are validated before the first evaluation call, so a
 * misconfigured canary is reported without spending any evaluation calls.
 * Canaries are then evaluated one at a time, in order.
 *
 * @param evaluate - Evaluation function to test; called with
 *   `{ input, output }` and the canary's metric
 * @param canaries - Canary test cases to run (defaults to DEFAULT_CANARY_CASES)
 * @returns Canary report with overall pass/fail and individual results. A
 *   score that fails isValidScore is recorded as NaN with passed=false.
 * @throws {Error} If evaluate is not a function
 * @throws {Error} If any canary lacks both expectedScore.min and
 *   expectedScore.max (including a canary with no expectedScore at all)
 */
export async function runCanaryEvaluations(evaluate, canaries = DEFAULT_CANARY_CASES) {
  if (typeof evaluate !== 'function') {
    throw new Error('runCanaryEvaluations requires an evaluate function');
  }
  // Snapshot the input: it is traversed twice and evaluation is asynchronous.
  const canaryList = [...canaries];
  // Validate everything up front, before any evaluation call is made.
  for (const canary of canaryList) {
    if (canary.expectedScore?.min === undefined && canary.expectedScore?.max === undefined) {
      throw new Error(`Canary '${canary.name}' must define expectedScore.min or expectedScore.max`);
    }
  }
  const results = [];
  for (const canary of canaryList) {
    const { min, max } = canary.expectedScore;
    const score = await evaluate({ input: canary.input, output: canary.output }, canary.metric);
    const scoreIsValid = isValidScore(score);
    // An undefined bound is not checked; when both are defined, both must hold.
    const passed = scoreIsValid &&
      (min === undefined || score >= min) &&
      (max === undefined || score <= max);
    results.push({
      name: canary.name,
      score: scoreIsValid ? score : NaN,
      expected: canary.expectedScore,
      passed,
      timestamp: new Date().toISOString(),
    });
  }
  return {
    timestamp: new Date().toISOString(),
    passed: results.every((result) => result.passed),
    results,
  };
}
1397
+ //# sourceMappingURL=llm-as-judge.js.map