observability-toolkit 1.8.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (364)
  1. package/README.md +126 -5
  2. package/dist/backends/index.d.ts +163 -0
  3. package/dist/backends/index.d.ts.map +1 -1
  4. package/dist/backends/index.js +57 -0
  5. package/dist/backends/index.js.map +1 -1
  6. package/dist/backends/index.test.js +55 -1
  7. package/dist/backends/index.test.js.map +1 -1
  8. package/dist/backends/local-jsonl-boolean-search.test.js +8 -8
  9. package/dist/backends/local-jsonl-boolean-search.test.js.map +1 -1
  10. package/dist/backends/local-jsonl-cache.test.d.ts +2 -0
  11. package/dist/backends/local-jsonl-cache.test.d.ts.map +1 -0
  12. package/dist/backends/local-jsonl-cache.test.js +295 -0
  13. package/dist/backends/local-jsonl-cache.test.js.map +1 -0
  14. package/dist/backends/local-jsonl-circuit-breaker.test.d.ts +2 -0
  15. package/dist/backends/local-jsonl-circuit-breaker.test.d.ts.map +1 -0
  16. package/dist/backends/local-jsonl-circuit-breaker.test.js +180 -0
  17. package/dist/backends/local-jsonl-circuit-breaker.test.js.map +1 -0
  18. package/dist/backends/local-jsonl-export.test.d.ts +2 -0
  19. package/dist/backends/local-jsonl-export.test.d.ts.map +1 -0
  20. package/dist/backends/local-jsonl-export.test.js +704 -0
  21. package/dist/backends/local-jsonl-export.test.js.map +1 -0
  22. package/dist/backends/local-jsonl-index.test.d.ts +2 -0
  23. package/dist/backends/local-jsonl-index.test.d.ts.map +1 -0
  24. package/dist/backends/local-jsonl-index.test.js +554 -0
  25. package/dist/backends/local-jsonl-index.test.js.map +1 -0
  26. package/dist/backends/local-jsonl-logs.test.js +52 -43
  27. package/dist/backends/local-jsonl-logs.test.js.map +1 -1
  28. package/dist/backends/local-jsonl-metrics.test.d.ts +2 -0
  29. package/dist/backends/local-jsonl-metrics.test.d.ts.map +1 -0
  30. package/dist/backends/local-jsonl-metrics.test.js +876 -0
  31. package/dist/backends/local-jsonl-metrics.test.js.map +1 -0
  32. package/dist/backends/local-jsonl-traces.test.js +89 -83
  33. package/dist/backends/local-jsonl-traces.test.js.map +1 -1
  34. package/dist/backends/local-jsonl.d.ts +39 -0
  35. package/dist/backends/local-jsonl.d.ts.map +1 -1
  36. package/dist/backends/local-jsonl.js +975 -492
  37. package/dist/backends/local-jsonl.js.map +1 -1
  38. package/dist/backends/signoz-api-circuit-breaker.test.d.ts +6 -0
  39. package/dist/backends/signoz-api-circuit-breaker.test.d.ts.map +1 -0
  40. package/dist/backends/signoz-api-circuit-breaker.test.js +548 -0
  41. package/dist/backends/signoz-api-circuit-breaker.test.js.map +1 -0
  42. package/dist/backends/signoz-api-rate-limiter.test.d.ts +6 -0
  43. package/dist/backends/signoz-api-rate-limiter.test.d.ts.map +1 -0
  44. package/dist/backends/signoz-api-rate-limiter.test.js +390 -0
  45. package/dist/backends/signoz-api-rate-limiter.test.js.map +1 -0
  46. package/dist/backends/signoz-api-ssrf.test.d.ts +6 -0
  47. package/dist/backends/signoz-api-ssrf.test.d.ts.map +1 -0
  48. package/dist/backends/signoz-api-ssrf.test.js +216 -0
  49. package/dist/backends/signoz-api-ssrf.test.js.map +1 -0
  50. package/dist/backends/signoz-api-test-helpers.d.ts +80 -0
  51. package/dist/backends/signoz-api-test-helpers.d.ts.map +1 -0
  52. package/dist/backends/signoz-api-test-helpers.js +79 -0
  53. package/dist/backends/signoz-api-test-helpers.js.map +1 -0
  54. package/dist/backends/signoz-api.d.ts +31 -1
  55. package/dist/backends/signoz-api.d.ts.map +1 -1
  56. package/dist/backends/signoz-api.js +717 -539
  57. package/dist/backends/signoz-api.js.map +1 -1
  58. package/dist/backends/signoz-api.test.d.ts +9 -0
  59. package/dist/backends/signoz-api.test.d.ts.map +1 -1
  60. package/dist/backends/signoz-api.test.js +20 -1032
  61. package/dist/backends/signoz-api.test.js.map +1 -1
  62. package/dist/lib/agent-as-judge.d.ts +388 -0
  63. package/dist/lib/agent-as-judge.d.ts.map +1 -0
  64. package/dist/lib/agent-as-judge.js +740 -0
  65. package/dist/lib/agent-as-judge.js.map +1 -0
  66. package/dist/lib/agent-as-judge.test.d.ts +5 -0
  67. package/dist/lib/agent-as-judge.test.d.ts.map +1 -0
  68. package/dist/lib/agent-as-judge.test.js +816 -0
  69. package/dist/lib/agent-as-judge.test.js.map +1 -0
  70. package/dist/lib/cache.d.ts +61 -2
  71. package/dist/lib/cache.d.ts.map +1 -1
  72. package/dist/lib/cache.js +54 -3
  73. package/dist/lib/cache.js.map +1 -1
  74. package/dist/lib/circuit-breaker.d.ts +101 -0
  75. package/dist/lib/circuit-breaker.d.ts.map +1 -0
  76. package/dist/lib/circuit-breaker.js +158 -0
  77. package/dist/lib/circuit-breaker.js.map +1 -0
  78. package/dist/lib/circuit-breaker.test.d.ts +2 -0
  79. package/dist/lib/circuit-breaker.test.d.ts.map +1 -0
  80. package/dist/lib/circuit-breaker.test.js +263 -0
  81. package/dist/lib/circuit-breaker.test.js.map +1 -0
  82. package/dist/lib/confident-export.d.ts +101 -0
  83. package/dist/lib/confident-export.d.ts.map +1 -0
  84. package/dist/lib/confident-export.js +393 -0
  85. package/dist/lib/confident-export.js.map +1 -0
  86. package/dist/lib/confident-export.test.d.ts +7 -0
  87. package/dist/lib/confident-export.test.d.ts.map +1 -0
  88. package/dist/lib/confident-export.test.js +835 -0
  89. package/dist/lib/confident-export.test.js.map +1 -0
  90. package/dist/lib/constants-symlink.test.d.ts +12 -0
  91. package/dist/lib/constants-symlink.test.d.ts.map +1 -0
  92. package/dist/lib/constants-symlink.test.js +357 -0
  93. package/dist/lib/constants-symlink.test.js.map +1 -0
  94. package/dist/lib/constants.d.ts +75 -0
  95. package/dist/lib/constants.d.ts.map +1 -1
  96. package/dist/lib/constants.js +104 -1
  97. package/dist/lib/constants.js.map +1 -1
  98. package/dist/lib/datadog-export.d.ts +156 -0
  99. package/dist/lib/datadog-export.d.ts.map +1 -0
  100. package/dist/lib/datadog-export.js +464 -0
  101. package/dist/lib/datadog-export.js.map +1 -0
  102. package/dist/lib/datadog-export.test.d.ts +14 -0
  103. package/dist/lib/datadog-export.test.d.ts.map +1 -0
  104. package/dist/lib/datadog-export.test.js +890 -0
  105. package/dist/lib/datadog-export.test.js.map +1 -0
  106. package/dist/lib/edge-cases.test.js +17 -17
  107. package/dist/lib/edge-cases.test.js.map +1 -1
  108. package/dist/lib/error-sanitizer.d.ts.map +1 -1
  109. package/dist/lib/error-sanitizer.js +29 -3
  110. package/dist/lib/error-sanitizer.js.map +1 -1
  111. package/dist/lib/error-sanitizer.test.js +159 -0
  112. package/dist/lib/error-sanitizer.test.js.map +1 -1
  113. package/dist/lib/error-types.d.ts +54 -0
  114. package/dist/lib/error-types.d.ts.map +1 -0
  115. package/dist/lib/error-types.js +154 -0
  116. package/dist/lib/error-types.js.map +1 -0
  117. package/dist/lib/error-types.test.d.ts +2 -0
  118. package/dist/lib/error-types.test.d.ts.map +1 -0
  119. package/dist/lib/error-types.test.js +196 -0
  120. package/dist/lib/error-types.test.js.map +1 -0
  121. package/dist/lib/evaluation-hooks.d.ts +49 -0
  122. package/dist/lib/evaluation-hooks.d.ts.map +1 -0
  123. package/dist/lib/evaluation-hooks.js +488 -0
  124. package/dist/lib/evaluation-hooks.js.map +1 -0
  125. package/dist/lib/evaluation-hooks.test.d.ts +8 -0
  126. package/dist/lib/evaluation-hooks.test.d.ts.map +1 -0
  127. package/dist/lib/evaluation-hooks.test.js +624 -0
  128. package/dist/lib/evaluation-hooks.test.js.map +1 -0
  129. package/dist/lib/export-utils.d.ts +99 -0
  130. package/dist/lib/export-utils.d.ts.map +1 -0
  131. package/dist/lib/export-utils.js +238 -0
  132. package/dist/lib/export-utils.js.map +1 -0
  133. package/dist/lib/export-utils.test.d.ts +5 -0
  134. package/dist/lib/export-utils.test.d.ts.map +1 -0
  135. package/dist/lib/export-utils.test.js +193 -0
  136. package/dist/lib/export-utils.test.js.map +1 -0
  137. package/dist/lib/file-utils.d.ts +17 -2
  138. package/dist/lib/file-utils.d.ts.map +1 -1
  139. package/dist/lib/file-utils.js +24 -5
  140. package/dist/lib/file-utils.js.map +1 -1
  141. package/dist/lib/file-utils.test.js +30 -0
  142. package/dist/lib/file-utils.test.js.map +1 -1
  143. package/dist/lib/histogram.d.ts +119 -0
  144. package/dist/lib/histogram.d.ts.map +1 -0
  145. package/dist/lib/histogram.js +202 -0
  146. package/dist/lib/histogram.js.map +1 -0
  147. package/dist/lib/histogram.test.d.ts +5 -0
  148. package/dist/lib/histogram.test.d.ts.map +1 -0
  149. package/dist/lib/histogram.test.js +381 -0
  150. package/dist/lib/histogram.test.js.map +1 -0
  151. package/dist/lib/indexer.test.js +27 -27
  152. package/dist/lib/indexer.test.js.map +1 -1
  153. package/dist/lib/input-validator.d.ts +12 -0
  154. package/dist/lib/input-validator.d.ts.map +1 -1
  155. package/dist/lib/input-validator.fuzz.test.d.ts +12 -0
  156. package/dist/lib/input-validator.fuzz.test.d.ts.map +1 -0
  157. package/dist/lib/input-validator.fuzz.test.js +290 -0
  158. package/dist/lib/input-validator.fuzz.test.js.map +1 -0
  159. package/dist/lib/input-validator.js +57 -3
  160. package/dist/lib/input-validator.js.map +1 -1
  161. package/dist/lib/input-validator.test.js +129 -1
  162. package/dist/lib/input-validator.test.js.map +1 -1
  163. package/dist/lib/instrumentation.d.ts +153 -0
  164. package/dist/lib/instrumentation.d.ts.map +1 -0
  165. package/dist/lib/instrumentation.integration.test.d.ts +2 -0
  166. package/dist/lib/instrumentation.integration.test.d.ts.map +1 -0
  167. package/dist/lib/instrumentation.integration.test.js +589 -0
  168. package/dist/lib/instrumentation.integration.test.js.map +1 -0
  169. package/dist/lib/instrumentation.js +520 -0
  170. package/dist/lib/instrumentation.js.map +1 -0
  171. package/dist/lib/instrumentation.test.d.ts +2 -0
  172. package/dist/lib/instrumentation.test.d.ts.map +1 -0
  173. package/dist/lib/instrumentation.test.js +821 -0
  174. package/dist/lib/instrumentation.test.js.map +1 -0
  175. package/dist/lib/langfuse-export.d.ts +125 -0
  176. package/dist/lib/langfuse-export.d.ts.map +1 -0
  177. package/dist/lib/langfuse-export.js +367 -0
  178. package/dist/lib/langfuse-export.js.map +1 -0
  179. package/dist/lib/langfuse-export.test.d.ts +7 -0
  180. package/dist/lib/langfuse-export.test.d.ts.map +1 -0
  181. package/dist/lib/langfuse-export.test.js +1007 -0
  182. package/dist/lib/langfuse-export.test.js.map +1 -0
  183. package/dist/lib/llm-as-judge.d.ts +657 -0
  184. package/dist/lib/llm-as-judge.d.ts.map +1 -0
  185. package/dist/lib/llm-as-judge.js +1397 -0
  186. package/dist/lib/llm-as-judge.js.map +1 -0
  187. package/dist/lib/llm-as-judge.test.d.ts +2 -0
  188. package/dist/lib/llm-as-judge.test.d.ts.map +1 -0
  189. package/dist/lib/llm-as-judge.test.js +2409 -0
  190. package/dist/lib/llm-as-judge.test.js.map +1 -0
  191. package/dist/lib/logger.d.ts +46 -0
  192. package/dist/lib/logger.d.ts.map +1 -0
  193. package/dist/lib/logger.js +81 -0
  194. package/dist/lib/logger.js.map +1 -0
  195. package/dist/lib/logger.test.d.ts +2 -0
  196. package/dist/lib/logger.test.d.ts.map +1 -0
  197. package/dist/lib/logger.test.js +122 -0
  198. package/dist/lib/logger.test.js.map +1 -0
  199. package/dist/lib/metrics.d.ts +62 -0
  200. package/dist/lib/metrics.d.ts.map +1 -0
  201. package/dist/lib/metrics.js +166 -0
  202. package/dist/lib/metrics.js.map +1 -0
  203. package/dist/lib/metrics.test.d.ts +5 -0
  204. package/dist/lib/metrics.test.d.ts.map +1 -0
  205. package/dist/lib/metrics.test.js +189 -0
  206. package/dist/lib/metrics.test.js.map +1 -0
  207. package/dist/lib/parse-stats.d.ts +119 -0
  208. package/dist/lib/parse-stats.d.ts.map +1 -0
  209. package/dist/lib/parse-stats.js +206 -0
  210. package/dist/lib/parse-stats.js.map +1 -0
  211. package/dist/lib/parse-stats.test.d.ts +5 -0
  212. package/dist/lib/parse-stats.test.d.ts.map +1 -0
  213. package/dist/lib/parse-stats.test.js +283 -0
  214. package/dist/lib/parse-stats.test.js.map +1 -0
  215. package/dist/lib/phoenix-export.d.ts +109 -0
  216. package/dist/lib/phoenix-export.d.ts.map +1 -0
  217. package/dist/lib/phoenix-export.js +429 -0
  218. package/dist/lib/phoenix-export.js.map +1 -0
  219. package/dist/lib/phoenix-export.test.d.ts +11 -0
  220. package/dist/lib/phoenix-export.test.d.ts.map +1 -0
  221. package/dist/lib/phoenix-export.test.js +725 -0
  222. package/dist/lib/phoenix-export.test.js.map +1 -0
  223. package/dist/lib/server-utils.d.ts +14 -1
  224. package/dist/lib/server-utils.d.ts.map +1 -1
  225. package/dist/lib/server-utils.js +43 -3
  226. package/dist/lib/server-utils.js.map +1 -1
  227. package/dist/lib/shared-schemas.d.ts +28 -0
  228. package/dist/lib/shared-schemas.d.ts.map +1 -1
  229. package/dist/lib/shared-schemas.js +33 -4
  230. package/dist/lib/shared-schemas.js.map +1 -1
  231. package/dist/lib/toon-encoder.d.ts +7 -2
  232. package/dist/lib/toon-encoder.d.ts.map +1 -1
  233. package/dist/lib/toon-encoder.js +21 -6
  234. package/dist/lib/toon-encoder.js.map +1 -1
  235. package/dist/lib/toon-encoder.test.d.ts +5 -0
  236. package/dist/lib/toon-encoder.test.d.ts.map +1 -0
  237. package/dist/lib/toon-encoder.test.js +85 -0
  238. package/dist/lib/toon-encoder.test.js.map +1 -0
  239. package/dist/lib/verification-events.d.ts +100 -0
  240. package/dist/lib/verification-events.d.ts.map +1 -0
  241. package/dist/lib/verification-events.js +162 -0
  242. package/dist/lib/verification-events.js.map +1 -0
  243. package/dist/lib/verification-events.test.d.ts +5 -0
  244. package/dist/lib/verification-events.test.d.ts.map +1 -0
  245. package/dist/lib/verification-events.test.js +193 -0
  246. package/dist/lib/verification-events.test.js.map +1 -0
  247. package/dist/server.d.ts +5 -0
  248. package/dist/server.d.ts.map +1 -1
  249. package/dist/server.js +79 -21
  250. package/dist/server.js.map +1 -1
  251. package/dist/server.test.js +30 -0
  252. package/dist/server.test.js.map +1 -1
  253. package/dist/test-helpers/env-utils.d.ts +22 -0
  254. package/dist/test-helpers/env-utils.d.ts.map +1 -1
  255. package/dist/test-helpers/env-utils.js +38 -0
  256. package/dist/test-helpers/env-utils.js.map +1 -1
  257. package/dist/test-helpers/fuzz-generators.d.ts +58 -0
  258. package/dist/test-helpers/fuzz-generators.d.ts.map +1 -0
  259. package/dist/test-helpers/fuzz-generators.js +216 -0
  260. package/dist/test-helpers/fuzz-generators.js.map +1 -0
  261. package/dist/test-helpers/index.d.ts +1 -0
  262. package/dist/test-helpers/index.d.ts.map +1 -1
  263. package/dist/test-helpers/index.js +2 -0
  264. package/dist/test-helpers/index.js.map +1 -1
  265. package/dist/test-helpers/memfs-utils.d.ts +181 -0
  266. package/dist/test-helpers/memfs-utils.d.ts.map +1 -0
  267. package/dist/test-helpers/memfs-utils.js +292 -0
  268. package/dist/test-helpers/memfs-utils.js.map +1 -0
  269. package/dist/test-helpers/memfs-utils.test.d.ts +5 -0
  270. package/dist/test-helpers/memfs-utils.test.d.ts.map +1 -0
  271. package/dist/test-helpers/memfs-utils.test.js +338 -0
  272. package/dist/test-helpers/memfs-utils.test.js.map +1 -0
  273. package/dist/test-helpers/race-condition-helpers.d.ts +85 -0
  274. package/dist/test-helpers/race-condition-helpers.d.ts.map +1 -0
  275. package/dist/test-helpers/race-condition-helpers.js +279 -0
  276. package/dist/test-helpers/race-condition-helpers.js.map +1 -0
  277. package/dist/test-helpers/test-data-builders.d.ts +40 -3
  278. package/dist/test-helpers/test-data-builders.d.ts.map +1 -1
  279. package/dist/test-helpers/test-data-builders.js +54 -5
  280. package/dist/test-helpers/test-data-builders.js.map +1 -1
  281. package/dist/test-helpers/tool-validators.d.ts.map +1 -1
  282. package/dist/test-helpers/tool-validators.js +16 -1
  283. package/dist/test-helpers/tool-validators.js.map +1 -1
  284. package/dist/tools/context-stats.d.ts.map +1 -1
  285. package/dist/tools/context-stats.js +6 -8
  286. package/dist/tools/context-stats.js.map +1 -1
  287. package/dist/tools/export-confident.d.ts +145 -0
  288. package/dist/tools/export-confident.d.ts.map +1 -0
  289. package/dist/tools/export-confident.js +134 -0
  290. package/dist/tools/export-confident.js.map +1 -0
  291. package/dist/tools/export-confident.test.d.ts +7 -0
  292. package/dist/tools/export-confident.test.d.ts.map +1 -0
  293. package/dist/tools/export-confident.test.js +332 -0
  294. package/dist/tools/export-confident.test.js.map +1 -0
  295. package/dist/tools/export-datadog.d.ts +160 -0
  296. package/dist/tools/export-datadog.d.ts.map +1 -0
  297. package/dist/tools/export-datadog.js +160 -0
  298. package/dist/tools/export-datadog.js.map +1 -0
  299. package/dist/tools/export-datadog.test.d.ts +8 -0
  300. package/dist/tools/export-datadog.test.d.ts.map +1 -0
  301. package/dist/tools/export-datadog.test.js +419 -0
  302. package/dist/tools/export-datadog.test.js.map +1 -0
  303. package/dist/tools/export-langfuse.d.ts +137 -0
  304. package/dist/tools/export-langfuse.d.ts.map +1 -0
  305. package/dist/tools/export-langfuse.js +131 -0
  306. package/dist/tools/export-langfuse.js.map +1 -0
  307. package/dist/tools/export-langfuse.test.d.ts +7 -0
  308. package/dist/tools/export-langfuse.test.d.ts.map +1 -0
  309. package/dist/tools/export-langfuse.test.js +303 -0
  310. package/dist/tools/export-langfuse.test.js.map +1 -0
  311. package/dist/tools/export-phoenix.d.ts +145 -0
  312. package/dist/tools/export-phoenix.d.ts.map +1 -0
  313. package/dist/tools/export-phoenix.js +135 -0
  314. package/dist/tools/export-phoenix.js.map +1 -0
  315. package/dist/tools/export-phoenix.test.d.ts +7 -0
  316. package/dist/tools/export-phoenix.test.d.ts.map +1 -0
  317. package/dist/tools/export-phoenix.test.js +316 -0
  318. package/dist/tools/export-phoenix.test.js.map +1 -0
  319. package/dist/tools/health-check.d.ts +26 -0
  320. package/dist/tools/health-check.d.ts.map +1 -1
  321. package/dist/tools/health-check.js +36 -7
  322. package/dist/tools/health-check.js.map +1 -1
  323. package/dist/tools/index.d.ts +6 -0
  324. package/dist/tools/index.d.ts.map +1 -1
  325. package/dist/tools/index.js +6 -0
  326. package/dist/tools/index.js.map +1 -1
  327. package/dist/tools/inject-evaluations.d.ts +1315 -0
  328. package/dist/tools/inject-evaluations.d.ts.map +1 -0
  329. package/dist/tools/inject-evaluations.js +121 -0
  330. package/dist/tools/inject-evaluations.js.map +1 -0
  331. package/dist/tools/inject-evaluations.test.d.ts +5 -0
  332. package/dist/tools/inject-evaluations.test.d.ts.map +1 -0
  333. package/dist/tools/inject-evaluations.test.js +359 -0
  334. package/dist/tools/inject-evaluations.test.js.map +1 -0
  335. package/dist/tools/query-evaluations.d.ts +25 -4
  336. package/dist/tools/query-evaluations.d.ts.map +1 -1
  337. package/dist/tools/query-evaluations.js +26 -2
  338. package/dist/tools/query-evaluations.js.map +1 -1
  339. package/dist/tools/query-evaluations.test.js +53 -46
  340. package/dist/tools/query-evaluations.test.js.map +1 -1
  341. package/dist/tools/query-llm-events.js +2 -2
  342. package/dist/tools/query-llm-events.js.map +1 -1
  343. package/dist/tools/query-llm-events.test.js +6 -3
  344. package/dist/tools/query-llm-events.test.js.map +1 -1
  345. package/dist/tools/query-logs.d.ts +8 -8
  346. package/dist/tools/query-logs.js +3 -3
  347. package/dist/tools/query-logs.js.map +1 -1
  348. package/dist/tools/query-metrics.d.ts +4 -4
  349. package/dist/tools/query-metrics.js +2 -2
  350. package/dist/tools/query-metrics.js.map +1 -1
  351. package/dist/tools/query-traces.d.ts +8 -8
  352. package/dist/tools/query-verifications.d.ts +111 -0
  353. package/dist/tools/query-verifications.d.ts.map +1 -0
  354. package/dist/tools/query-verifications.js +101 -0
  355. package/dist/tools/query-verifications.js.map +1 -0
  356. package/dist/tools/query-verifications.test.d.ts +5 -0
  357. package/dist/tools/query-verifications.test.d.ts.map +1 -0
  358. package/dist/tools/query-verifications.test.js +156 -0
  359. package/dist/tools/query-verifications.test.js.map +1 -0
  360. package/dist/types/evaluation-hooks.d.ts +176 -0
  361. package/dist/types/evaluation-hooks.d.ts.map +1 -0
  362. package/dist/types/evaluation-hooks.js +49 -0
  363. package/dist/types/evaluation-hooks.js.map +1 -0
  364. package/package.json +11 -2
@@ -0,0 +1,816 @@
1
+ /**
2
+ * Tests for Agent-as-Judge Implementation
3
+ */
4
+ import { describe, it, beforeEach } from 'node:test';
5
+ import assert from 'node:assert/strict';
6
+ import {
7
+ // Constants
8
+ MAX_TRAJECTORY_LENGTH, MAX_CONCURRENT_EVALUATORS, MAX_CONSENSUS_ROUNDS, DEFAULT_CONVERGENCE_THRESHOLD,
9
+ // Error classes
10
+ AgentEvalTimeoutError,
11
+ // Utilities
12
+ withAgentTimeout, validateEvaluand, validateStepScore, validateToolVerification, verifyToolCall, verifyToolCalls, scoreStep, aggregateStepScores, analyzeTrajectory, calculateVariance, calculateMedian, collectiveConsensus,
13
+ // Classes
14
+ ProceduralJudge, ReactiveJudge, } from './agent-as-judge.js';
15
+ import { InputValidationError } from './input-validator.js';
16
+ describe('agent-as-judge', () => {
17
+ // ============================================================================
18
+ // Timeout Protection Tests
19
+ // ============================================================================
20
/**
 * withAgentTimeout(fn, timeoutMs) test suite.
 *
 * Covers the four outcomes exercised here: the wrapped function resolves in
 * time, the wrapped function outlives the timeout (rejects with
 * AgentEvalTimeoutError), the timeout error carries the configured duration,
 * and an error raised by the wrapped function is passed through unchanged.
 */
describe('withAgentTimeout', () => {
    // Factory for a task that only settles after `delayMs`, i.e. longer than
    // the timeout the slow-path tests pass to withAgentTimeout.
    const resolveAfter = (delayMs) => () => new Promise((resolve) => setTimeout(resolve, delayMs));

    it('should return result when function completes in time', async () => {
        const result = await withAgentTimeout(() => Promise.resolve(42), 1000);
        assert.equal(result, 42);
    });

    it('should throw AgentEvalTimeoutError when function times out', async () => {
        await assert.rejects(withAgentTimeout(resolveAfter(200), 50), AgentEvalTimeoutError);
    });

    it('should include timeout duration in error', async () => {
        // assert.rejects fails on its own when the promise does not reject, so a
        // missing timeout is reported as such. A try/assert.fail/catch shape
        // would instead route the AssertionError through the catch block and
        // report a misleading `instanceof` failure.
        await assert.rejects(withAgentTimeout(resolveAfter(200), 50), (error) => {
            assert.ok(error instanceof AgentEvalTimeoutError);
            assert.equal(error.timeoutMs, 50);
            assert.ok(error.message.includes('50'));
            // The validation function must return true to accept the error.
            return true;
        });
    });

    it('should propagate function errors', async () => {
        await assert.rejects(withAgentTimeout(() => Promise.reject(new Error('Test error')), 1000), { message: 'Test error' });
    });
});
43
+ // ============================================================================
44
+ // Validation Tests
45
+ // ============================================================================
46
/**
 * validateEvaluand test suite.
 *
 * Checks that a populated input/output pair is accepted, that empty or
 * whitespace-only input and empty output raise InputValidationError, and that
 * the `actions` array is accepted at MAX_TRAJECTORY_LENGTH entries but
 * rejected at one more.
 */
describe('validateEvaluand', () => {
    // Produces `count` tool_call actions; Array.prototype.fill reuses the one
    // object for every slot, which is all these length checks need.
    const toolCallActions = (count) => Array(count).fill({
        type: 'tool_call',
        tool: 'test',
    });
    // Shorthand for the "must be rejected" assertion shared by most cases.
    const expectRejected = (evaluand) => {
        assert.throws(() => validateEvaluand(evaluand), InputValidationError);
    };

    it('should pass for valid evaluand', () => {
        const evaluand = { input: 'test input', output: 'test output' };
        assert.doesNotThrow(() => validateEvaluand(evaluand));
    });

    it('should throw for empty input', () => {
        expectRejected({ input: '', output: 'test' });
    });

    it('should throw for whitespace-only input', () => {
        expectRejected({ input: ' ', output: 'test' });
    });

    it('should throw for empty output', () => {
        expectRejected({ input: 'test', output: '' });
    });

    it('should throw for actions exceeding MAX_TRAJECTORY_LENGTH', () => {
        const actions = toolCallActions(MAX_TRAJECTORY_LENGTH + 1);
        expectRejected({ input: 'test', output: 'test', actions });
    });

    it('should pass for actions at MAX_TRAJECTORY_LENGTH', () => {
        const actions = toolCallActions(MAX_TRAJECTORY_LENGTH);
        assert.doesNotThrow(() => validateEvaluand({ input: 'test', output: 'test', actions }));
    });
});
74
/**
 * validateStepScore test suite, expressed as two case tables.
 *
 * Each row is [test title, step-score argument]. Rows in `acceptedCases` must
 * not throw; rows in `rejectedCases` must throw InputValidationError. Tests
 * are registered in table order (accepted first, then rejected).
 */
describe('validateStepScore', () => {
    const acceptedCases = [
        ['should pass for valid step score with number step', { step: 0, score: 0.5 }],
        ['should pass for valid step score with string step', { step: 'step_1', score: 0.5 }],
    ];
    const rejectedCases = [
        ['should throw for step string exceeding 256 characters', { step: 'a'.repeat(257), score: 0.5 }],
        ['should throw for negative step index', { step: -1, score: 0.5 }],
        ['should throw for non-integer step index', { step: 1.5, score: 0.5 }],
        ['should throw for score below 0', { step: 0, score: -0.1 }],
        ['should throw for score above 1', { step: 0, score: 1.1 }],
        ['should throw for NaN score', { step: 0, score: NaN }],
        ['should throw for Infinity score', { step: 0, score: Infinity }],
    ];

    for (const [title, stepScore] of acceptedCases) {
        it(title, () => {
            assert.doesNotThrow(() => validateStepScore(stepScore));
        });
    }

    for (const [title, stepScore] of rejectedCases) {
        it(title, () => {
            assert.throws(() => validateStepScore(stepScore), InputValidationError);
        });
    }
});
103
/**
 * validateToolVerification test suite.
 *
 * Starts from one well-formed verification record and overrides a single
 * field per test to check that an empty tool name, a non-boolean
 * `toolCorrect`, and a score of 1.5 each raise InputValidationError.
 */
describe('validateToolVerification', () => {
    // Baseline record; overridden keys keep their original position in the
    // resulting object, so every case passes the same four fields in the same order.
    const buildVerification = (overrides) => ({
        toolName: 'test',
        toolCorrect: true,
        argsCorrect: true,
        score: 0.8,
        ...overrides,
    });

    it('should pass for valid verification', () => {
        const verification = buildVerification({ toolName: 'search' });
        assert.doesNotThrow(() => validateToolVerification(verification));
    });

    it('should throw for empty tool name', () => {
        const verification = buildVerification({ toolName: '' });
        assert.throws(() => validateToolVerification(verification), InputValidationError);
    });

    it('should throw for non-boolean toolCorrect', () => {
        const verification = buildVerification({ toolCorrect: 'yes' });
        assert.throws(() => validateToolVerification(verification), InputValidationError);
    });

    it('should throw for invalid score', () => {
        const verification = buildVerification({ score: 1.5 });
        assert.throws(() => validateToolVerification(verification), InputValidationError);
    });
});
137
+ // ============================================================================
138
+ // Tool Verification Tests
139
+ // ============================================================================
140
/**
 * verifyToolCall test suite.
 *
 * Calls verifyToolCall(action, expectedTool, expectedArgs, <unused here>,
 * expectedResult) and inspects the returned record's `toolCorrect`,
 * `argsCorrect`, `resultCorrect`, `score`, and `evidence` fields. Null and
 * non-object actions must raise InputValidationError.
 */
describe('verifyToolCall', () => {
    // Builds a tool_call action; extra fields (arguments, result) are appended
    // after `type` and `tool`.
    const toolCall = (tool, extra = {}) => ({ type: 'tool_call', tool, ...extra });

    it('should throw for null action (H1)', () => {
        assert.throws(() => verifyToolCall(null), InputValidationError);
    });

    it('should throw for non-object action (H1)', () => {
        assert.throws(() => verifyToolCall('string'), InputValidationError);
    });

    it('should verify correct tool selection', () => {
        const verification = verifyToolCall(toolCall('search'), 'search');
        assert.equal(verification.toolCorrect, true);
        assert.equal(verification.score, 1.0);
    });

    it('should detect incorrect tool selection', () => {
        const verification = verifyToolCall(toolCall('read'), 'search');
        assert.equal(verification.toolCorrect, false);
        assert.equal(verification.score, 0);
    });

    it('should verify correct arguments', () => {
        const searchCall = toolCall('search', { arguments: { query: 'test' } });
        const verification = verifyToolCall(searchCall, 'search', { query: 'test' });
        assert.equal(verification.argsCorrect, true);
        // Tool and arguments both match, so the score is expected near the top of the range.
        assert.ok(verification.score > 0.9);
    });

    it('should detect incorrect arguments', () => {
        const searchCall = toolCall('search', { arguments: { query: 'wrong' } });
        const verification = verifyToolCall(searchCall, 'search', { query: 'test' });
        assert.equal(verification.argsCorrect, false);
    });

    it('should verify result correctness when provided', () => {
        const calcCall = toolCall('calc', { result: 42 });
        // Third and fourth arguments are deliberately left undefined; only the
        // expected result (fifth argument) is supplied.
        const verification = verifyToolCall(calcCall, 'calc', undefined, undefined, 42);
        assert.equal(verification.resultCorrect, true);
    });

    it('should include evidence in result', () => {
        const mismatchedCall = toolCall('test', { arguments: { foo: 'bar' } });
        const verification = verifyToolCall(mismatchedCall, 'expected', { baz: 'qux' });
        assert.ok(verification.evidence);
        assert.deepEqual(verification.evidence.actualTool, 'test');
        assert.deepEqual(verification.evidence.expectedTool, 'expected');
    });
});
199
/**
 * verifyToolCalls test suite.
 *
 * Passes a mixed trajectory (optionally with a Map of expectations keyed by
 * toolCallId) and checks that only tool_call actions yield verification
 * entries, and that the number of entries stays within the cap of 500.
 */
describe('verifyToolCalls', () => {
    it('should verify multiple tool calls', () => {
        const trajectory = [
            { type: 'tool_call', tool: 'search', toolCallId: 'call_1' },
            { type: 'reasoning', reasoning: 'thinking...' },
            { type: 'tool_call', tool: 'read', toolCallId: 'call_2' },
        ];
        const expectedByCallId = new Map();
        expectedByCallId.set('call_1', { tool: 'search' });
        expectedByCallId.set('call_2', { tool: 'read' });

        const verifications = verifyToolCalls(trajectory, expectedByCallId);

        // The reasoning step is not a tool call, so only two entries come back.
        assert.equal(verifications.length, 2);
        [0, 1].forEach((index) => {
            assert.equal(verifications[index].toolCorrect, true);
        });
    });

    it('should skip non-tool actions', () => {
        const trajectory = [
            { type: 'reasoning', reasoning: 'thinking' },
            { type: 'response', reasoning: 'responding' },
        ];
        const verifications = verifyToolCalls(trajectory);
        assert.equal(verifications.length, 0);
    });

    it('should respect MAX_TOOL_VERIFICATIONS limit', () => {
        const trajectory = Array(1000).fill({
            type: 'tool_call',
            tool: 'test',
        });
        const verifications = verifyToolCalls(trajectory);
        // 500 is the literal this test uses for MAX_TOOL_VERIFICATIONS.
        assert.ok(verifications.length <= 500);
    });
});
232
+ // ============================================================================
233
+ // Step Scoring Tests
234
+ // ============================================================================
235
/**
 * scoreStep test suite.
 *
 * Calls scoreStep(action, stepIndex, { score, explanation }) and checks the
 * returned step score: fields are carried through, out-of-range numeric scores
 * are clamped to [0, 1], NaN and string scores raise InputValidationError, and
 * the action's type, tool, and reasoning appear under `evidence`.
 */
describe('scoreStep', () => {
    // Minimal tool_call action for tests that do not care about other fields.
    const toolCall = (tool) => ({ type: 'tool_call', tool });

    it('should create valid step score', () => {
        const stepScore = scoreStep(toolCall('search'), 0, { score: 0.8, explanation: 'Good' });
        assert.equal(stepScore.step, 0);
        assert.equal(stepScore.score, 0.8);
        assert.equal(stepScore.explanation, 'Good');
    });

    it('should clamp score to [0, 1]', () => {
        const action = toolCall('test');
        const aboveRange = scoreStep(action, 0, { score: 1.5 });
        assert.equal(aboveRange.score, 1);
        const belowRange = scoreStep(action, 0, { score: -0.5 });
        assert.equal(belowRange.score, 0);
    });

    it('should throw for NaN score (H4)', () => {
        const action = toolCall('test');
        assert.throws(() => scoreStep(action, 0, { score: NaN }), InputValidationError);
    });

    it('should throw for string score (H4)', () => {
        const action = toolCall('test');
        assert.throws(() => scoreStep(action, 0, { score: '0.5' }), InputValidationError);
    });

    it('should include action metadata in evidence', () => {
        const action = { ...toolCall('search'), reasoning: 'searching for data' };
        const { evidence } = scoreStep(action, 0, { score: 0.9 });
        assert.equal(evidence.actionType, 'tool_call');
        assert.equal(evidence.tool, 'search');
        assert.equal(evidence.reasoning, 'searching for data');
    });
});
271
// Tests for aggregateStepScores across its 'average', 'weighted' and 'min'
// modes, including validation of the weights array.
describe('aggregateStepScores', () => {
    // Turns a list of raw scores into step-score records numbered from 0.
    const stepScores = (...values) => values.map((score, step) => ({ step, score }));

    it('should return 1 for empty array', () => {
        assert.equal(aggregateStepScores([]), 1);
    });

    it('should calculate average correctly', () => {
        const aggregated = aggregateStepScores(stepScores(0.8, 0.6, 1.0), 'average');
        // Tolerance-based check: the mean of these floats is not exactly 0.8.
        const drift = Math.abs(aggregated - 0.8);
        assert.ok(drift < 0.0001);
    });

    it('should calculate weighted average correctly', () => {
        // (1.0 * 3 + 0.0 * 1) / (3 + 1) = 0.75
        const aggregated = aggregateStepScores(stepScores(1.0, 0.0), 'weighted', [3, 1]);
        assert.equal(aggregated, 0.75);
    });

    it('should throw for weighted without weights', () => {
        const single = stepScores(0.5);
        assert.throws(() => aggregateStepScores(single, 'weighted'), Error);
    });

    it('should throw for mismatched weights length', () => {
        const pair = stepScores(0.5, 0.5);
        assert.throws(() => aggregateStepScores(pair, 'weighted', [1]), Error);
    });

    it('should return 0 for all-zero weights', () => {
        const aggregated = aggregateStepScores(stepScores(0.8), 'weighted', [0]);
        assert.equal(aggregated, 0);
    });

    // Each entry supplies one bad weight and the exact error text expected,
    // which names the offending index and value.
    const invalidWeightCases = [
        {
            title: 'should throw on negative weights (L8)',
            weights: [1, -1],
            message: /Invalid weight at index 1: -1\. Weights must be finite non-negative numbers/,
        },
        {
            title: 'should throw on negative weight at index 0',
            weights: [-1, 1],
            message: /Invalid weight at index 0: -1\. Weights must be finite non-negative numbers/,
        },
        {
            title: 'should throw on NaN weight (M1)',
            weights: [1, NaN],
            message: /Invalid weight at index 1: NaN\. Weights must be finite non-negative numbers/,
        },
        {
            title: 'should throw on Infinity weight',
            weights: [1, Infinity],
            message: /Invalid weight at index 1: Infinity\. Weights must be finite non-negative numbers/,
        },
        {
            title: 'should throw on -Infinity weight',
            weights: [-Infinity, 1],
            message: /Invalid weight at index 0: -Infinity\. Weights must be finite non-negative numbers/,
        },
    ];
    for (const { title, weights, message } of invalidWeightCases) {
        it(title, () => {
            const pair = stepScores(0.5, 0.8);
            assert.throws(() => aggregateStepScores(pair, 'weighted', weights), message);
        });
    }

    it('should calculate min correctly', () => {
        const aggregated = aggregateStepScores(stepScores(0.9, 0.3, 0.7), 'min');
        assert.equal(aggregated, 0.3);
    });
});
355
+ // ============================================================================
356
+ // Trajectory Analysis Tests
357
+ // ============================================================================
358
// Tests for analyzeTrajectory: action counts, unique tools, the efficiency
// ratio against an expected step count, and redundant-action detection.
describe('analyzeTrajectory', () => {
    // Wraps an action list in the input/output envelope shared by these tests.
    const evaluandWith = (actions) => ({ input: 'test', output: 'result', actions });
    // Builds a tool-call action; `arguments` is only present when supplied.
    const toolCall = (tool, args) => (args === undefined
        ? { type: 'tool_call', tool }
        : { type: 'tool_call', tool, arguments: args });

    it('should calculate basic metrics', () => {
        const metrics = analyzeTrajectory(evaluandWith([
            toolCall('search'),
            { type: 'reasoning', reasoning: 'thinking' },
            toolCall('read'),
        ]));
        assert.equal(metrics.length, 3);
        assert.equal(metrics.toolCallCount, 2);
        assert.equal(metrics.uniqueTools, 2);
    });

    it('should handle empty actions', () => {
        // No `actions` key at all, not an empty array.
        const metrics = analyzeTrajectory({ input: 'test', output: 'result' });
        assert.equal(metrics.length, 0);
        assert.equal(metrics.toolCallCount, 0);
    });

    it('should calculate efficiency ratio', () => {
        const tenSteps = Array(10).fill(toolCall('test'));
        const metrics = analyzeTrajectory(evaluandWith(tenSteps), 5);
        assert.equal(metrics.efficiencyRatio, 0.5); // 5/10
    });

    it('should cap efficiency ratio at 1', () => {
        // One action against ten expected steps: the ratio must not exceed 1.
        const metrics = analyzeTrajectory(evaluandWith([toolCall('test')]), 10);
        assert.equal(metrics.efficiencyRatio, 1);
    });

    it('should detect redundant actions', () => {
        const metrics = analyzeTrajectory(evaluandWith([
            toolCall('search', { q: 'a' }),
            toolCall('search', { q: 'a' }), // Duplicate
            toolCall('search', { q: 'b' }), // Different args
        ]));
        assert.equal(metrics.redundantActions, 1);
    });
});
412
+ // ============================================================================
413
+ // Statistical Functions Tests
414
+ // ============================================================================
415
// Tests for calculateVariance: degenerate inputs yield 0, and non-trivial
// inputs are checked against the sample variance (divisor n - 1).
describe('calculateVariance', () => {
    // Inputs with no spread: empty, one value, or all values equal.
    const zeroVarianceCases = [
        ['should return 0 for empty array', []],
        ['should return 0 for single value', [5]],
        ['should return 0 for identical values', [3, 3, 3, 3]],
    ];
    for (const [title, values] of zeroVarianceCases) {
        it(title, () => {
            assert.equal(calculateVariance([...values]), 0);
        });
    }

    it('should calculate sample variance with Bessel correction (M7)', () => {
        // n = 8, mean = 5.
        // Squared deviations: 9 + (1 * 3) + (0 * 2) + 4 + 16 = 32.
        // Sample variance = 32 / (8 - 1) ≈ 4.571.
        const sample = [2, 4, 4, 4, 5, 5, 7, 9];
        const result = calculateVariance(sample);
        const expected = 32 / 7;
        assert.ok(Math.abs(result - expected) < 0.001, `Expected ~4.571, got ${result}`);
    });

    it('should calculate variance for two values', () => {
        // mean = 4; squared deviations 4 + 4 = 8; divided by (2 - 1) = 8.
        assert.equal(calculateVariance([2, 6]), 8);
    });
});
441
// Tests for calculateMedian, expressed as (title, input, expected) rows.
describe('calculateMedian', () => {
    const medianCases = [
        ['should return 0 for empty array', [], 0],
        ['should return value for single element', [5], 5],
        ['should calculate median for odd count', [1, 3, 5], 3],
        ['should calculate median for even count', [1, 2, 3, 4], 2.5],
        ['should handle unsorted input', [5, 1, 3], 3],
    ];
    for (const [title, values, expected] of medianCases) {
        it(title, () => {
            // Pass a fresh array so each run sees an untouched input.
            assert.equal(calculateMedian([...values]), expected);
        });
    }
});
458
+ // ============================================================================
459
+ // Consensus Tests
460
+ // ============================================================================
461
// Tests for collectiveConsensus: input validation, convergence behaviour,
// tolerance of failing judges, and the cap on the number of rounds.
describe('collectiveConsensus', () => {
    const baseEvaluand = { input: 'test', output: 'result' };
    const defaultConfig = {
        rounds: 3,
        convergenceThreshold: DEFAULT_CONVERGENCE_THRESHOLD,
    };
    // A judge that always resolves to the same verdict.
    const fixedJudge = (id, verdict) => ({ id, evaluate: async () => verdict });
    // A judge whose evaluation always rejects with the given message.
    const brokenJudge = (id, message) => ({
        id,
        evaluate: async () => {
            throw new Error(message);
        },
    });

    it('should throw for invalid evaluand', async () => {
        const emptyInput = { input: '', output: 'test' };
        await assert.rejects(collectiveConsensus(emptyInput, [], defaultConfig), InputValidationError);
    });

    it('should throw for too many judges (H2)', async () => {
        // One more judge than the allowed maximum (same object repeated).
        const tooMany = Array(MAX_CONCURRENT_EVALUATORS + 1).fill(fixedJudge('judge', 0.5));
        await assert.rejects(collectiveConsensus(baseEvaluand, tooMany, defaultConfig), InputValidationError);
    });

    it('should throw for empty judges array', async () => {
        await assert.rejects(collectiveConsensus(baseEvaluand, [], defaultConfig), InputValidationError);
    });

    it('should throw for rounds < 1 (M9)', async () => {
        const panel = [fixedJudge('j1', 0.5)];
        const zeroRounds = { ...defaultConfig, rounds: 0 };
        await assert.rejects(collectiveConsensus(baseEvaluand, panel, zeroRounds), InputValidationError);
    });

    it('should reach consensus with agreeing judges', async () => {
        const panel = [
            fixedJudge('j1', 0.8),
            fixedJudge('j2', 0.82),
            fixedJudge('j3', 0.79),
        ];
        const outcome = await collectiveConsensus(baseEvaluand, panel, defaultConfig);
        assert.ok(outcome.converged);
        assert.ok(outcome.finalScore > 0.7);
    });

    it('should handle disagreeing judges', async () => {
        const panel = [fixedJudge('j1', 0.2), fixedJudge('j2', 0.8)];
        const strictConfig = {
            rounds: 2,
            convergenceThreshold: 0.01, // Very tight
        };
        const outcome = await collectiveConsensus(baseEvaluand, panel, strictConfig);
        assert.equal(outcome.converged, false);
    });

    it('should handle judge failures gracefully (H3)', async () => {
        const panel = [
            fixedJudge('j1', 0.8),
            brokenJudge('j2', 'Judge failed'),
            fixedJudge('j3', 0.75),
        ];
        const outcome = await collectiveConsensus(baseEvaluand, panel, defaultConfig);
        // The two working judges should still produce a score.
        assert.ok(outcome.finalScore > 0);
    });

    it('should throw when all judges fail (H3)', async () => {
        const panel = [brokenJudge('j1', 'Failed 1'), brokenJudge('j2', 'Failed 2')];
        await assert.rejects(collectiveConsensus(baseEvaluand, panel, defaultConfig), /All judge evaluations failed/);
    });

    it('should respect MAX_CONSENSUS_ROUNDS', async () => {
        let evaluateCalls = 0;
        const countingJudge = {
            id: 'j1',
            evaluate: async () => {
                evaluateCalls++;
                return 0.5;
            },
        };
        await collectiveConsensus(baseEvaluand, [countingJudge], {
            rounds: 100, // Exceeds max
            convergenceThreshold: 0.0001,
        });
        // With a single judge, one evaluate call happens per round at most.
        assert.ok(evaluateCalls <= MAX_CONSENSUS_ROUNDS);
    });
});
555
+ // ============================================================================
556
+ // ProceduralJudge Tests
557
+ // ============================================================================
558
// Tests for ProceduralJudge: stage ordering, early termination, the context
// object handed to stages, evaluand validation, and constructor validation.
describe('ProceduralJudge', () => {
    const sampleEvaluand = () => ({ input: 'test', output: 'result' });
    // A stage that appends its own name to `log` before returning a verdict.
    const tracedStage = (name, log, score, explanation) => ({
        name,
        evaluate: async () => {
            log.push(name);
            return { score, explanation };
        },
    });
    // A stage that always returns a perfect score with no explanation.
    const perfectStage = (name) => ({
        name,
        evaluate: async () => ({ score: 1, explanation: '' }),
    });

    it('should execute all stages in order', async () => {
        const visited = [];
        const pipeline = [
            tracedStage('stage1', visited, 0.8, 'Good'),
            tracedStage('stage2', visited, 0.9, 'Great'),
        ];
        const judge = new ProceduralJudge(pipeline);
        const verdict = await judge.evaluate(sampleEvaluand());
        assert.deepEqual(visited, ['stage1', 'stage2']);
        assert.equal(verdict.stepScores.length, 2);
    });

    it('should support early termination', async () => {
        const visited = [];
        const pipeline = [
            tracedStage('safety', visited, 0.2, 'Failed safety'),
            tracedStage('quality', visited, 0.9, 'Good'),
        ];
        // 'safety' is the early-termination stage; its low score should stop
        // the run before 'quality' executes.
        const judge = new ProceduralJudge(pipeline, 'safety');
        const verdict = await judge.evaluate(sampleEvaluand());
        assert.deepEqual(visited, ['safety']);
        assert.equal(verdict.overallScore, 0);
        assert.ok(verdict.explanation.includes('Early termination'));
    });

    it('should pass context between stages', async () => {
        let seenBySecondStage = {};
        const writer = {
            name: 'stage1',
            evaluate: async (_, ctx) => {
                ctx['data'] = 'from_stage1';
                return { score: 0.8, explanation: 'OK' };
            },
        };
        const reader = {
            name: 'stage2',
            evaluate: async (_, ctx) => {
                // Snapshot the context as the second stage observes it.
                seenBySecondStage = { ...ctx };
                return { score: 0.9, explanation: 'OK' };
            },
        };
        const judge = new ProceduralJudge([writer, reader]);
        await judge.evaluate(sampleEvaluand());
        // NOTE(review): only the 'stage1' key is asserted; the 'data' value
        // written by the first stage is never checked. Presumably the judge
        // records each stage's result under its name — confirm.
        assert.ok(seenBySecondStage['stage1']);
    });

    it('should validate evaluand', async () => {
        const judge = new ProceduralJudge([perfectStage('test')]);
        const emptyInput = { input: '', output: 'test' };
        await assert.rejects(judge.evaluate(emptyInput), InputValidationError);
    });

    describe('constructor validation (M8)', () => {
        it('should throw on empty stages array', () => {
            assert.throws(() => new ProceduralJudge([]), InputValidationError);
        });

        it('should throw on stage with empty name', () => {
            const build = () => new ProceduralJudge([perfectStage('')]);
            assert.throws(build, InputValidationError);
        });

        it('should throw on stage with whitespace-only name', () => {
            const build = () => new ProceduralJudge([perfectStage(' ')]);
            assert.throws(build, InputValidationError);
        });

        it('should throw on stage without evaluate function', () => {
            // Deliberately malformed stage: exercises the runtime check.
            const build = () => new ProceduralJudge([{ name: 'test' }]);
            assert.throws(build, InputValidationError);
        });

        it('should throw on invalid earlyTerminationOn stage name', () => {
            const build = () => new ProceduralJudge([perfectStage('stage1')], 'nonexistent');
            assert.throws(build, InputValidationError);
        });

        it('should accept valid stages with earlyTerminationOn', () => {
            const judge = new ProceduralJudge([perfectStage('safety')], 'safety');
            assert.ok(judge);
        });
    });
});
668
+ // ============================================================================
669
+ // ReactiveJudge Tests
670
+ // ============================================================================
671
// Tests for ReactiveJudge: routing to the specialists chosen by the router,
// deep-dive follow-ups, skipping unknown specialists, and input validation.
describe('ReactiveJudge', () => {
    const sampleEvaluand = () => ({ input: 'test', output: 'result' });

    it('should route to selected specialists', async () => {
        const ran = [];
        // Map entry whose specialist records its id before answering.
        const tracked = (id, score, explanation) => [
            id,
            async () => {
                ran.push(id);
                return { score, explanation };
            },
        ];
        const specialists = new Map([
            tracked('quality', 0.9, 'Good quality'),
            tracked('safety', 1.0, 'Safe'),
            tracked('style', 0.8, 'OK style'),
        ]);
        // The router picks two of the three registered specialists.
        const router = async () => ['quality', 'safety'];
        const judge = new ReactiveJudge(router, specialists);
        const verdict = await judge.evaluate(sampleEvaluand());
        assert.deepEqual(ran, ['quality', 'safety']);
        assert.equal(verdict.stepScores.length, 2);
    });

    it('should trigger deep dive when needed', async () => {
        let deepDiveRan = false;
        const router = async () => ['quality'];
        const firstPass = async () => ({
            score: 0.5,
            explanation: 'Needs investigation',
            needsDeepDive: true,
        });
        const secondPass = async () => {
            deepDiveRan = true;
            return { score: 0.4, explanation: 'Deep analysis' };
        };
        const specialists = new Map([['quality', firstPass]]);
        const deepDiveSpecialists = new Map([['quality', secondPass]]);
        const judge = new ReactiveJudge(router, specialists, deepDiveSpecialists);
        const verdict = await judge.evaluate(sampleEvaluand());
        assert.equal(deepDiveRan, true);
        assert.equal(verdict.stepScores.length, 2); // Regular + deep dive
    });

    it('should skip missing specialists', async () => {
        // The router names one specialist that is not registered.
        const router = async () => ['missing', 'existing'];
        const onlyExisting = async () => ({ score: 0.8, explanation: 'OK' });
        const judge = new ReactiveJudge(router, new Map([['existing', onlyExisting]]));
        const verdict = await judge.evaluate(sampleEvaluand());
        assert.equal(verdict.stepScores.length, 1);
    });

    it('should validate evaluand', async () => {
        const judge = new ReactiveJudge(async () => [], new Map());
        const emptyInput = { input: '', output: 'test' };
        await assert.rejects(judge.evaluate(emptyInput), InputValidationError);
    });
});
753
+ // ============================================================================
754
+ // AgentJudge Memory Tests (H5)
755
+ // ============================================================================
756
// Tests for the bounded key/value memory that judges inherit. These tests
// assume a capacity of 1000 entries, with the oldest entry evicted on
// overflow and a read refreshing an entry's recency.
describe('AgentJudge memory management', () => {
    // Thin subclass that exposes the memory helpers to the tests.
    class TestJudge extends ProceduralJudge {
        /** Stores `value` under `key` through the judge's memory helper. */
        testStore(key, value) {
            this.storeInMemory(key, value);
        }
        /** Reads `key` through the judge's memory helper. */
        testGet(key) {
            return this.getFromMemory(key);
        }
        /** Number of entries currently held in memory. */
        getMemorySize() {
            return this.memory.size;
        }
    }
    let judge;
    beforeEach(() => {
        // A fresh judge, and therefore a fresh memory, for every test.
        judge = new TestJudge([
            { name: 'test', evaluate: async () => ({ score: 1, explanation: '' }) },
        ]);
    });

    it('should store and retrieve values', () => {
        judge.testStore('key1', 'value1');
        assert.equal(judge.testGet('key1'), 'value1');
    });

    it('should implement LRU - reading moves to end (H5)', () => {
        // key1 goes in FIRST, so under plain insertion-order (FIFO) eviction
        // it would be the next entry dropped. It can only survive the
        // overflow below if reading it refreshes its recency.
        judge.testStore('key1', 'value1');
        for (let i = 0; i < 997; i++) {
            judge.testStore(`fill_${i}`, i);
        }
        judge.testStore('key2', 'value2');
        judge.testStore('key3', 'value3');
        // Memory is now at 1000 - order is key1, fill_0...fill_996, key2, key3
        assert.equal(judge.getMemorySize(), 1000);
        // Read key1 - should move it to the end (after key3)
        assert.equal(judge.testGet('key1'), 'value1');
        // Order is now: fill_0...fill_996, key2, key3, key1
        // Store 1 more to trigger exactly one eviction: fill_0 is now oldest
        judge.testStore('new_item', 'new');
        assert.equal(judge.getMemorySize(), 1000);
        // key1 was the oldest insertion but the most recently read entry
        assert.equal(judge.testGet('key1'), 'value1');
        assert.equal(judge.testGet('key2'), 'value2');
        assert.equal(judge.testGet('key3'), 'value3');
        assert.equal(judge.testGet('new_item'), 'new');
        // fill_0 should be the entry that was evicted
        assert.equal(judge.testGet('fill_0'), undefined);
    });

    it('should evict oldest on overflow', () => {
        // Fill to capacity
        for (let i = 0; i < 1000; i++) {
            judge.testStore(`key_${i}`, i);
        }
        assert.equal(judge.getMemorySize(), 1000);
        // Add one more: size must stay at capacity
        judge.testStore('new_key', 'new_value');
        assert.equal(judge.getMemorySize(), 1000);
        assert.equal(judge.testGet('key_0'), undefined); // First one evicted
        assert.equal(judge.testGet('new_key'), 'new_value');
    });
});
815
+ });
816
+ //# sourceMappingURL=agent-as-judge.test.js.map