promptfoo 0.121.4 → 0.121.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (346) hide show
  1. package/dist/src/{ListApp-DQkFNqE9.js → ListApp-BRUsT43Y.js} +1 -1
  2. package/dist/src/{accounts-Dy17bs4D.cjs → accounts-BIFntVWB.cjs} +4 -4
  3. package/dist/src/{accounts-F9d_5sMC.js → accounts-CLJHCDDb.js} +6 -6
  4. package/dist/src/{accounts-DhMYUUbu.js → accounts-CaLNYnf7.js} +4 -4
  5. package/dist/src/{accounts-DdJ2pHMI.js → accounts-bnyHT7Ju.js} +5 -5
  6. package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-B5krlibj.js} +3 -3
  7. package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-Ba67xmgs.js} +2 -2
  8. package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-BclbiXiq.js} +3 -3
  9. package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-D2x0wGhB.cjs} +2 -2
  10. package/dist/src/{agents-CgaMXvLM.js → agents-BGqaTDnr.js} +5 -5
  11. package/dist/src/{agents-8FDnTriG.js → agents-BV9yFpXX.js} +5 -5
  12. package/dist/src/{agents-aYPQLf8W.js → agents-BYdMl1UE.js} +4 -4
  13. package/dist/src/{agents-pQeBEXMm.js → agents-DhxWMCtH.js} +5 -5
  14. package/dist/src/{agents-D7-HGxUj.cjs → agents-DiWmQYH9.cjs} +4 -4
  15. package/dist/src/{agents-BahDpe5G.cjs → agents-WULPVjbH.cjs} +4 -4
  16. package/dist/src/{agents-DJ35I3Nt.js → agents-emVcx3yh.js} +5 -5
  17. package/dist/src/{agents-C-R_jfzI.js → agents-n6vPqV3i.js} +4 -4
  18. package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-BxqK9HF_.js} +7 -7
  19. package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-BzLjZI_m.cjs} +6 -6
  20. package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-DR4pgeiC.js} +6 -6
  21. package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-uPGp0Zdo.js} +7 -7
  22. package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
  23. package/dist/src/app/assets/Report-vjzrbgce.js +1 -0
  24. package/dist/src/app/assets/index-B3NQ8HTd.js +385 -0
  25. package/dist/src/app/assets/{index-BXGkeMwh.css → index-Cli2yAXv.css} +1 -1
  26. package/dist/src/app/index.html +27 -2
  27. package/dist/src/{audio-DcVKoInv.js → audio-BvpTOArF.js} +4 -4
  28. package/dist/src/{audio-BQtNuYBj.cjs → audio-C0vDeS0j.cjs} +3 -3
  29. package/dist/src/{audio-B7izf48x.js → audio-CScmnmEB.js} +4 -4
  30. package/dist/src/{audio-COrn8rM6.js → audio-Da8U9IS5.js} +3 -3
  31. package/dist/src/{base-fZ9wgg50.js → base-BOMaNEes.js} +3 -3
  32. package/dist/src/{base-PYJvBE1i.js → base-BTux96b1.js} +2 -2
  33. package/dist/src/{base-D-670DX8.cjs → base-Tw6uhH8K.cjs} +2 -2
  34. package/dist/src/{base-yrI1Yal4.js → base-dYsl2hmL.js} +3 -3
  35. package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B95F_7vE.cjs} +2 -2
  36. package/dist/src/{blobs-C-F78Kfn.js → blobs-BW4U31ue.js} +2 -2
  37. package/dist/src/{blobs-BCZavS8s.js → blobs-D_gg8nbm.js} +3 -3
  38. package/dist/src/{blobs-BQWqnnvL.js → blobs-DjLby-uP.js} +3 -3
  39. package/dist/src/{cache-mb7c8hbp.js → cache-BI5BY7ey.js} +4 -4
  40. package/dist/src/{cache-DbLsVWB2.cjs → cache-BRkhlH3k.cjs} +1 -1
  41. package/dist/src/cache-BlC6aeJ0.js +3 -0
  42. package/dist/src/{cache-D5NZmMiT.js → cache-Bzttsk0X.js} +2 -2
  43. package/dist/src/{cache-C4Xb-hNb.js → cache-Cr-qWIbP.js} +3 -3
  44. package/dist/src/{cache-BIyPcp5v.cjs → cache-DGg-yTZG.cjs} +2 -2
  45. package/dist/src/{chat-Dr3DUQ0D.js → chat-BLOdH60v.js} +12 -12
  46. package/dist/src/{chat-BfPaS15_.js → chat-Cx_LkwvZ.js} +12 -12
  47. package/dist/src/{chat-mW0ORo8G.js → chat-D9nudO9b.js} +4 -4
  48. package/dist/src/{chat-I9izLm49.js → chat-DChSH_Es.js} +12 -12
  49. package/dist/src/{chat-MKxMnZJZ.js → chat-DG2LkwLq.js} +2 -2
  50. package/dist/src/{chat-BPXSW8Bv.cjs → chat-DH97tVV9.cjs} +2 -2
  51. package/dist/src/{chat-0bwXjVP0.js → chat-aMQZw6R7.js} +4 -4
  52. package/dist/src/{chat-CclRbxGf.cjs → chat-vYqqv1gP.cjs} +11 -11
  53. package/dist/src/{chatkit-zUIVoDos.js → chatkit-B8X34dQc.js} +4 -4
  54. package/dist/src/{chatkit-Cv6AhukM.js → chatkit-BXu42Qwt.js} +3 -3
  55. package/dist/src/{chatkit-CJnHRRMM.js → chatkit-CbMRoeYw.js} +4 -4
  56. package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-D44VyUyB.cjs} +3 -3
  57. package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-BRq0bbIK.cjs} +8 -8
  58. package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-BjriSVRZ.js} +7 -7
  59. package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BzNZeZ0N.js} +7 -7
  60. package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-DYv_AJ8u.js} +7 -7
  61. package/dist/src/cloud-CoD5OacT.js +3 -0
  62. package/dist/src/{cloud-DQZ5sVjW.js → cloud-Da0bofJd.js} +3 -3
  63. package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-CXC4b1EU.js} +4 -4
  64. package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-CyBoIs1Q.js} +6 -6
  65. package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-DGOwgexC.js} +6 -6
  66. package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-DJv5qnyb.cjs} +4 -4
  67. package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-1sAoOyft.js} +5 -5
  68. package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-D-dnkzCF.js} +5 -5
  69. package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-DKVjkDav.cjs} +3 -3
  70. package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-TJkVrZlB.js} +3 -3
  71. package/dist/src/codex-app-server-CCLjqCh9.js +1915 -0
  72. package/dist/src/codex-app-server-CCe0TiDc.js +1915 -0
  73. package/dist/src/codex-app-server-CPW1LFwh.js +1916 -0
  74. package/dist/src/codex-app-server-VMRnjZ68.cjs +1920 -0
  75. package/dist/src/codex-sdk-1jm_qPHf.js +3 -0
  76. package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-Bd8UbO9q.cjs} +5 -5
  77. package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-BgEFQ70r.js} +6 -6
  78. package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-Bzb_TqX9.js} +6 -6
  79. package/dist/src/{codex-sdk-CWEnH70W.cjs → codex-sdk-Danroptg.cjs} +1 -1
  80. package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-DfvDTN33.js} +5 -5
  81. package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-B5ImDlSm.js} +8 -8
  82. package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-BgAkuYCw.cjs} +7 -7
  83. package/dist/src/{cometapi-BUlt_ELa.js → cometapi-CC7hWxmX.js} +8 -8
  84. package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CCbpHkuF.js} +7 -7
  85. package/dist/src/{completion-x0a_c2y1.js → completion-2iuYVxwi.js} +6 -6
  86. package/dist/src/{completion-Dnxn7E-j.js → completion-CrD6MQ93.js} +5 -5
  87. package/dist/src/{completion-BozdoXba.cjs → completion-DtQ72Bm3.cjs} +5 -5
  88. package/dist/src/{completion-HUe8wDhZ.js → completion-Vq_ad618.js} +6 -6
  89. package/dist/src/{createHash-ChI45QR1.js → createHash-DPpsZgFF.js} +1 -1
  90. package/dist/src/{createHash-CwDVU5xr.js → createHash-Un4Q_huE.js} +1 -1
  91. package/dist/src/{createHash-B7KvgoOD.cjs → createHash-VvBIc-AW.cjs} +1 -1
  92. package/dist/src/{docker-DCgsveLD.js → docker--3qzPa-6.js} +6 -6
  93. package/dist/src/{docker-DS4_Osau.cjs → docker-D3AY-5F5.cjs} +5 -5
  94. package/dist/src/{docker-CQmlA2NU.js → docker-DCsCDvwM.js} +6 -6
  95. package/dist/src/{docker-ClnmCf1Z.js → docker-Dorv4_Dg.js} +5 -5
  96. package/dist/src/{embedding-I45KG3o7.cjs → embedding-BXhN5lCH.cjs} +5 -5
  97. package/dist/src/{embedding-nFbumxcv.js → embedding-ChS1ivFS.js} +5 -5
  98. package/dist/src/{embedding-D3xTseo7.js → embedding-DNRvZwRN.js} +6 -6
  99. package/dist/src/{embedding-DD9wa3ae.js → embedding-D_bI4NDq.js} +6 -6
  100. package/dist/src/{errors-Cw810C93.js → errors-DFHe4L-n.js} +1 -1
  101. package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
  102. package/dist/src/{esm-C7PnfdF8.js → esm-BRkfNsYs.js} +1 -1
  103. package/dist/src/{esm-tVgYPY-f.js → esm-BX8fwlAO.js} +2 -2
  104. package/dist/src/{esm-CtEPLdAj.cjs → esm-B_rGuPTo.cjs} +1 -1
  105. package/dist/src/{eval-CzJFfFO9.js → eval-BQPLBJbw.js} +1 -1
  106. package/dist/src/{eval-u4UVafl6.js → eval-DJ_4A-tr.js} +14 -14
  107. package/dist/src/evalResult-BBJAHAtw.cjs +2 -0
  108. package/dist/src/evalResult-BBK58h2B.js +3 -0
  109. package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-Cx-8OWkb.cjs} +28 -10
  110. package/dist/src/{evalResult-D3hVYFis.js → evalResult-D6P5I5il.js} +29 -11
  111. package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-pSvGWFMo.js} +29 -11
  112. package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-D-UIbbYq.js} +845 -98
  113. package/dist/src/evaluator-DgLKaZk8.js +3 -0
  114. package/dist/src/{extractor-Dk6bRWkv.js → extractor-BM3jRERL.js} +5 -5
  115. package/dist/src/{extractor-WVPOrH43.cjs → extractor-Dxr2J_wK.cjs} +5 -5
  116. package/dist/src/{extractor-DNSeBVOJ.js → extractor-DxyiFhPk.js} +6 -6
  117. package/dist/src/{extractor-CAfTSraf.js → extractor-YlZbUMsL.js} +6 -6
  118. package/dist/src/fetch-8viavNv8.js +3 -0
  119. package/dist/src/{fetch-BEWnXrrG.js → fetch-B6ch2nU2.js} +9 -20
  120. package/dist/src/{fetch-Di00EQrc.js → fetch-D9xxyC1p.js} +221 -232
  121. package/dist/src/{fetch-CJU5ELPa.cjs → fetch-NuqXW1Xb.cjs} +221 -244
  122. package/dist/src/{fetch-B0Z3Oe4k.js → fetch-Y5qX_kST.js} +8 -19
  123. package/dist/src/{fileExtensions-BArZuxsI.js → fileExtensions-8CjoL7vB.js} +1 -1
  124. package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-BGh-W-HT.js} +1 -1
  125. package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-D9h-8Wxg.cjs} +1 -1
  126. package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-DysCsxNG.js} +1 -1
  127. package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-Ch4A7G3o.js} +1 -1
  128. package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-BokHC-MW.cjs} +1 -1
  129. package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-C3ZPQU60.js} +1 -1
  130. package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-CFny3gOy.js} +1 -1
  131. package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-DxODqT9e.js} +1 -1
  132. package/dist/src/{graders-Zy3x0zqX.js → graders-BoUqsCEm.js} +1303 -2044
  133. package/dist/src/{graders--zknU_uk.cjs → graders-Bw1wk_21.cjs} +1553 -2240
  134. package/dist/src/graders-C84JI-m5.js +2 -0
  135. package/dist/src/graders-CBbd0K0Q.cjs +2 -0
  136. package/dist/src/graders-CbQqpHSN.js +3 -0
  137. package/dist/src/{graders-eIHhRqoC.js → graders-CgPn32yp.js} +1300 -2041
  138. package/dist/src/{graders-pvbReLLn.js → graders-CwrbifOo.js} +747 -1488
  139. package/dist/src/graders-DS42d3ZG.js +2 -0
  140. package/dist/src/{image-9302QVqR.js → image-BeWaInPF.js} +3 -3
  141. package/dist/src/{image-DVz2RiMF.js → image-BmilRNqO.js} +7 -7
  142. package/dist/src/{image-x6KqLQl4.cjs → image-CxJoa3aW.cjs} +6 -6
  143. package/dist/src/{image-De2FBmYV.cjs → image-D10dNAav.cjs} +3 -3
  144. package/dist/src/{image-dnoUgPrC.js → image-Dr_3I3nK.js} +4 -4
  145. package/dist/src/{image-B5Mv-Z3h.js → image-DsGRlkh7.js} +7 -7
  146. package/dist/src/{image-qUpPvmNZ.js → image-a_SGUobh.js} +6 -6
  147. package/dist/src/{image-u7-rKnYU.js → image-qjO6FWPs.js} +4 -4
  148. package/dist/src/index.cjs +1052 -296
  149. package/dist/src/index.d.cts +124 -13
  150. package/dist/src/index.d.ts +125 -14
  151. package/dist/src/index.js +1018 -262
  152. package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-CCICw2cy.js} +2 -2
  153. package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
  154. package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
  155. package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-BBETc5-S.js} +6 -6
  156. package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-C8qOo26M.cjs} +5 -5
  157. package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CzAi2rUI.js} +6 -6
  158. package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-Dr3Kib7F.js} +5 -5
  159. package/dist/src/{litellm-C2kqjxqp.js → litellm-BLSiANhk.js} +5 -5
  160. package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CaUmV7Mk.cjs} +4 -4
  161. package/dist/src/{litellm-p37R1dzQ.js → litellm-DQGo_juI.js} +4 -4
  162. package/dist/src/{litellm-DRjpcSa7.js → litellm-DRc4qWfc.js} +5 -5
  163. package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
  164. package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
  165. package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-B-tNZzqW.js} +6 -6
  166. package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CtS3OlGq.js} +5 -5
  167. package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-PJJgUjOc.js} +6 -6
  168. package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-if-Ml4R9.cjs} +5 -5
  169. package/dist/src/main.js +242 -198
  170. package/dist/src/{messages-zWbkLLHz.js → messages-B9dSjrNf.js} +264 -16
  171. package/dist/src/{messages-811uVVW5.cjs → messages-BnsVHUnm.cjs} +266 -15
  172. package/dist/src/{messages-MYTQ2TWp.js → messages-CI69Lasb.js} +264 -16
  173. package/dist/src/{messages-BTQz42fn.js → messages-CewuNcNS.js} +264 -16
  174. package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-BBGcGeCa.cjs} +1 -1
  175. package/dist/src/{meteor-DuAFv6gF.js → meteor-BKTM-7KS.js} +1 -1
  176. package/dist/src/{meteor-DHdzY1Ss.js → meteor-CeGo0Lu2.js} +2 -2
  177. package/dist/src/{meteor-CU5UAE-H.js → meteor-Wc_aUVvu.js} +2 -2
  178. package/dist/src/{modelslab-wu9yi5GE.js → modelslab-BCLOtfek.js} +7 -7
  179. package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-BkapYJhh.cjs} +6 -6
  180. package/dist/src/{modelslab-DIq-6y7x.js → modelslab-D73OnKSx.js} +6 -6
  181. package/dist/src/{modelslab-D0erNWKe.js → modelslab-zpz9JcK0.js} +7 -7
  182. package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-B8F_TK5w.js} +6 -6
  183. package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-Bx0NFV2f.js} +5 -5
  184. package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-CNGJTLtG.js} +6 -6
  185. package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-DkT7tnoB.cjs} +5 -5
  186. package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BaXRN1cr.js} +4 -4
  187. package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-BeTRaFOh.js} +4 -4
  188. package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CL7Zqv0G.js} +3 -3
  189. package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-YT426juD.cjs} +3 -3
  190. package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
  191. package/dist/src/{openai-Qsvz25mV.js → openai-BT-JvDse.js} +2 -2
  192. package/dist/src/{openai-URNyItar.cjs → openai-Cy1XLs0c.cjs} +1 -1
  193. package/dist/src/{openai-iYtrXzOX.js → openai-D4fxGvRx.js} +1 -1
  194. package/dist/src/{openclaw-CwzlQSQX.js → openclaw-Bq7RVR3k.js} +7 -6
  195. package/dist/src/{openclaw-CLWrW03k.js → openclaw-DA8U4DsD.js} +8 -7
  196. package/dist/src/{openclaw-CnQ363Wi.js → openclaw-DObVgpjC.js} +8 -7
  197. package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-DUBZP3GL.cjs} +8 -7
  198. package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-BB40Wir1.js} +4 -4
  199. package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-BM1UAIv1.js} +3 -3
  200. package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-CeqiOcOU.cjs} +4 -4
  201. package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-ChdK7F7z.js} +4 -4
  202. package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-C6thJRXi.js} +4 -4
  203. package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-CcdIikOu.js} +3 -3
  204. package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-DNSQj6bf.cjs} +3 -3
  205. package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver-UYMQx3sy.js} +4 -4
  206. package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
  207. package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-BESeALrr.cjs} +1 -1
  208. package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DoACwqhD.js} +1 -1
  209. package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-PMsleEzs.js} +2 -2
  210. package/dist/src/{providers-Bp4S-FvO.js → providers-BuyzKt7C.js} +1 -1
  211. package/dist/src/{providers-DV3ax9e_.cjs → providers-C7lNVBjX.cjs} +1 -1
  212. package/dist/src/{providers-u9Enmfok.js → providers-CCE2COJi2.js} +1 -1
  213. package/dist/src/{providers-DruaQfwu.js → providers-CJh7iriU.js} +18103 -17952
  214. package/dist/src/{providers-iUt5fbAN.js → providers-Ctcc592x.js} +1 -1
  215. package/dist/src/{providers-Domz_llv.js → providers-DRrerKra.js} +432 -281
  216. package/dist/src/{providers-BV_KMZje.js → providers-DT-GtF2t.js} +19094 -18943
  217. package/dist/src/{providers-1eKkXBKp.cjs → providers-eDShy16E.cjs} +17946 -17795
  218. package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-C4tltmIn.js} +3 -3
  219. package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-CoLaCwNY.cjs} +3 -3
  220. package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-DMO68Jg7.js} +3 -3
  221. package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-DNqbnRdx.js} +3 -3
  222. package/dist/src/{quiverai-DR0SnIQV.js → quiverai-BSS9a7wV.js} +3 -3
  223. package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Bk1KrvL6.js} +4 -4
  224. package/dist/src/{quiverai-DFotyafY.cjs → quiverai-Bpx6MZ7T.cjs} +3 -3
  225. package/dist/src/{quiverai-aPPvXOgn.js → quiverai-CPKhWgaT.js} +4 -4
  226. package/dist/src/{render-DHIZ6_k8.js → render-7uNJ2V14.js} +2 -2
  227. package/dist/src/{render-CH-62LbA.js → render-DlscvAUJ.js} +1 -1
  228. package/dist/src/{render-CMEpfLaO.js → render-eui5p5mL.js} +2 -2
  229. package/dist/src/{render-CgVDrJmM.js → render-nj-UaPdn.js} +2 -2
  230. package/dist/src/{render-DfQSFxGE.cjs → render-tG6ir9_g.cjs} +1 -1
  231. package/dist/src/{responses--OsX2aYW.js → responses-1ztiVYsx.js} +49 -15
  232. package/dist/src/{responses-DL9m8CyY.js → responses-B8haB-mD.js} +49 -15
  233. package/dist/src/{responses-C-flexAY.js → responses-BiaBguAu.js} +49 -15
  234. package/dist/src/{responses-Bi9vBuW_.cjs → responses-CF-ayauu.cjs} +48 -14
  235. package/dist/src/rubyUtils-4hjGxvju.js +3 -0
  236. package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-BI0p46eZ.js} +3 -3
  237. package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CIQFnVz4.js} +3 -3
  238. package/dist/src/rubyUtils-CO-tuszQ.cjs +2 -0
  239. package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DGnoCYL2.js} +3 -3
  240. package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-DoifqkiA.cjs} +4 -3
  241. package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-BDLeW29y.js} +12 -12
  242. package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-C5T60MKf.js} +13 -13
  243. package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-ClS_NB07.js} +13 -13
  244. package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-ljtY12VM.cjs} +12 -12
  245. package/dist/src/{scanner-1DqWi1Ej.js → scanner-nOCWNIXa.js} +7 -7
  246. package/dist/src/server/index.js +1067 -265
  247. package/dist/src/{server-Dx2TyCH2.cjs → server-BEECpeGG.cjs} +5 -5
  248. package/dist/src/{server-BNYztJkh.js → server-ByiF3qlg.js} +9 -8
  249. package/dist/src/{server-BSB45Nt9.js → server-ByxbqAcQ.js} +8 -7
  250. package/dist/src/{server-DaA2eR26.cjs → server-C0XKRNB_.cjs} +1 -1
  251. package/dist/src/server-C_15p79-.js +3 -0
  252. package/dist/src/{server-D6Il2Sob.js → server-gyd6d4Hc.js} +5 -5
  253. package/dist/src/{signal-CE5G3a7x.js → signal-DTtUuU3l.js} +3 -3
  254. package/dist/src/{slack-acRb0IqQ.js → slack-4zZX1OKP.js} +1 -1
  255. package/dist/src/{slack-1Rhq0EoV.cjs → slack-BLlsDpfG.cjs} +1 -1
  256. package/dist/src/{slack-D5Wpy8LM.js → slack-BPYLQLgb.js} +2 -2
  257. package/dist/src/{slack-DDUe-5MC.js → slack-Bamy_7te.js} +2 -2
  258. package/dist/src/{store-DAAyxcy6.cjs → store-2K0kDi80.cjs} +2 -2
  259. package/dist/src/{store-Dn9HUkdW.js → store-2OXm_eBY.js} +3 -3
  260. package/dist/src/store-BELqNwvz.js +3 -0
  261. package/dist/src/{store-M0b1WfYb.js → store-BPkzEyFM.js} +2 -2
  262. package/dist/src/{store-CYEy5J2D.js → store-CPh25336.js} +3 -3
  263. package/dist/src/store-uQZ4AjPe.cjs +2 -0
  264. package/dist/src/{tables-CsWou1Bx.js → tables-BMSOS2Gg.js} +3 -3
  265. package/dist/src/{tables-DUfh1F7Z.cjs → tables-CXbaZ9y1.cjs} +2 -2
  266. package/dist/src/{tables-C4CH3zRr.js → tables-NlvH23ky.js} +3 -3
  267. package/dist/src/{tables-DQ4WU5tX.js → tables-WgdUZ8Ck.js} +2 -2
  268. package/dist/src/{telemetry-dbaJ0E98.js → telemetry--iqaGyaS.js} +5 -4
  269. package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CEQxGnMZ.cjs} +7 -6
  270. package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-CgdVGV8N.js} +4 -3
  271. package/dist/src/{telemetry-CQPez_Jp.js → telemetry-DWdGHvEf.js} +5 -4
  272. package/dist/src/telemetry-DjNoC_n3.cjs +2 -0
  273. package/dist/src/telemetry-ZdPZc0fm.js +3 -0
  274. package/dist/src/{text-BVi-cLPJ.cjs → text-BiNME7QG.cjs} +1 -1
  275. package/dist/src/{text-KvuD2Iko.js → text-D4lz-Jg_.js} +1 -1
  276. package/dist/src/{text-DHxdyQqT.js → text-DDQP0tuQ.js} +1 -1
  277. package/dist/src/{text-CZr46tp_.js → text-NWvfMfkF.js} +1 -1
  278. package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-2wIvAhB3.js} +1 -1
  279. package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-4c780gFd.js} +1 -1
  280. package/dist/src/tokenUsageUtils-BjVkdk18.js +142 -0
  281. package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-C9odhsbW.cjs} +1 -1
  282. package/dist/src/{transcription-DuWDupG7.js → transcription-84t4ALo2.js} +5 -5
  283. package/dist/src/{transcription-CJspiD2c.js → transcription-Bm2emLmJ.js} +6 -6
  284. package/dist/src/{transcription-BvjmiYB1.cjs → transcription-CZ4LG5hQ.cjs} +5 -5
  285. package/dist/src/{transcription-V2HaAmy2.js → transcription-D7Q0vJsh.js} +6 -6
  286. package/dist/src/{transform-zDhMmzwX.js → transform-B-b6Cq-q.js} +5 -5
  287. package/dist/src/transform-BQt0BeAW.js +3 -0
  288. package/dist/src/{transform-DgKlRr73.cjs → transform-Bq5oqC0s.cjs} +1 -1
  289. package/dist/src/{transform-CUnzlsbn.cjs → transform-C9izGX54.cjs} +4 -4
  290. package/dist/src/{transform-DYX1_Xnh.js → transform-CwbAZ84V.js} +5 -5
  291. package/dist/src/{transform-CTeuTR3S.cjs → transform-Dg4LcO1Y.cjs} +6 -6
  292. package/dist/src/{transform-CG0ehZNG.js → transform-DtooZqYY.js} +6 -6
  293. package/dist/src/{transform-UN5UGu8U.js → transform-DzCF-wqV.js} +5 -5
  294. package/dist/src/{transform-lQrDE1BQ.js → transform-_DpNB4qp.js} +5 -5
  295. package/dist/src/{transform-Bbg6A8Jk.js → transform-eGiUAv86.js} +4 -4
  296. package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-B22swDxr.cjs} +1 -1
  297. package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-lvCCvuPT.js} +1 -1
  298. package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-rJGPccjr.js} +1 -1
  299. package/dist/src/{types-Bgh5SOn6.js → types-BDjGOq4E.js} +4 -2
  300. package/dist/src/{types-Dm9JM6Vb.js → types-BVH9hjgW.js} +4 -2
  301. package/dist/src/{types-CeaeaZdP.cjs → types-CgG2rKiW.cjs} +151 -149
  302. package/dist/src/{types-BGQDAP8i.js → types-DNRZVOue.js} +152 -150
  303. package/dist/src/{util-C8e5uydV.js → util-3pBZZb_H.js} +142 -17
  304. package/dist/src/{util-CN3SrLT4.cjs → util-A5_ZsQUn.cjs} +65 -43
  305. package/dist/src/{util-D3q0WQ-0.js → util-B9CNhyac.js} +66 -44
  306. package/dist/src/{util-DxWpWjhc.js → util-BQOCAHQC.js} +700 -575
  307. package/dist/src/{util-BYvQUPp7.js → util-BVXcTwXu.js} +3 -3
  308. package/dist/src/{util-D9TisOyk.js → util-BlFVL0UF.js} +65 -43
  309. package/dist/src/{util-C9J8ahRn.js → util-C-kmRosx.js} +66 -44
  310. package/dist/src/{util-DvU2Pw8c.js → util-DFPeFkiV.js} +3 -3
  311. package/dist/src/{util-DDs-7g6-.js → util-DN0-b81k.js} +3 -3
  312. package/dist/src/{util-olYL5C6N.cjs → util-Dpmm_dAI.cjs} +3 -3
  313. package/dist/src/{util-oGMLA7vc.js → util-Dub0f_ej.js} +700 -575
  314. package/dist/src/{util-Bxn8emtE.cjs → util-DvpHnLt0.cjs} +718 -570
  315. package/dist/src/{utils-DJfvjyMj.js → utils-BUMN8orw.js} +3 -3
  316. package/dist/src/{utils-B05gLxER.cjs → utils-DkVeShIB.cjs} +2 -2
  317. package/dist/src/{utils-BLJKfv0y.js → utils-kt7lv30R.js} +3 -3
  318. package/dist/src/{utils-hXtCYanr.js → utils-o8S5huU2.js} +2 -2
  319. package/dist/src/version-0frU0UTr.js +16 -0
  320. package/dist/src/version-CbpiUINz.js +17 -0
  321. package/dist/src/version-CbuBKu2U.js +16 -0
  322. package/dist/src/version-D9zu9FWB.cjs +27 -0
  323. package/dist/tsconfig.tsbuildinfo +1 -1
  324. package/package.json +22 -20
  325. package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
  326. package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
  327. package/dist/src/cache-Cr9oLMUa.js +0 -3
  328. package/dist/src/cloud-Hphvo8kr.js +0 -3
  329. package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
  330. package/dist/src/evalResult-D8MT9p0s.js +0 -3
  331. package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
  332. package/dist/src/evaluator-CVessDWe.js +0 -3
  333. package/dist/src/fetch-C7bGKDlQ.js +0 -3
  334. package/dist/src/graders-BOAzQEUe.cjs +0 -2
  335. package/dist/src/graders-D4BTsZdG2.js +0 -3
  336. package/dist/src/graders-DOJK1XpV.js +0 -2
  337. package/dist/src/graders-NAv9LcBn.js +0 -2
  338. package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
  339. package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
  340. package/dist/src/server-DCtHUqlp.js +0 -3
  341. package/dist/src/store-CWOSz6D_.cjs +0 -2
  342. package/dist/src/store-DCDBhv7B.js +0 -3
  343. package/dist/src/telemetry-C1IqxcdW.js +0 -3
  344. package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
  345. package/dist/src/transform-M6ITAESf.js +0 -3
  346. /package/dist/src/{evalResult-DElBuddX.js → evalResult-spPqh1G_.js} +0 -0
@@ -4,34 +4,35 @@ Object.defineProperties(exports, {
4
4
  });
5
5
  const require_logger = require("./logger-COuQb2xB.cjs");
6
6
  const require_invariant = require("./invariant-kfQ8Bu82.cjs");
7
- const require_esm = require("./esm-CtEPLdAj.cjs");
8
- const require_pythonUtils = require("./pythonUtils-tAJvvpS-.cjs");
9
- const require_fileExtensions = require("./fileExtensions-bYh77CN8.cjs");
10
- const require_transform = require("./transform-CUnzlsbn.cjs");
11
- const require_graders = require("./graders--zknU_uk.cjs");
12
- const require_types = require("./types-CeaeaZdP.cjs");
13
- const require_util = require("./util-Bxn8emtE.cjs");
14
- const require_render = require("./render-DfQSFxGE.cjs");
15
- const require_fetch = require("./fetch-CJU5ELPa.cjs");
16
- const require_cache = require("./cache-BIyPcp5v.cjs");
17
- const require_providers = require("./providers-1eKkXBKp.cjs");
18
- const require_utils = require("./utils-B05gLxER.cjs");
19
- const require_createHash = require("./createHash-B7KvgoOD.cjs");
20
- const require_chat = require("./chat-CclRbxGf.cjs");
21
- const require_tokenUsageUtils = require("./tokenUsageUtils-Bb7DkZPz.cjs");
22
- const require_transform$1 = require("./transform-CTeuTR3S.cjs");
23
- const require_util$1 = require("./util-CN3SrLT4.cjs");
24
- const require_providerRegistry = require("./providerRegistry-CQMdTmHP.cjs");
25
- const require_server = require("./server-Dx2TyCH2.cjs");
26
- const require_accounts = require("./accounts-Dy17bs4D.cjs");
27
- const require_blobs = require("./blobs-D2FAd1Q5.cjs");
28
- const require_tables = require("./tables-DUfh1F7Z.cjs");
29
- const require_extractor = require("./extractor-WVPOrH43.cjs");
30
- const require_telemetry = require("./telemetry-Dsw_faFj.cjs");
31
- const require_text = require("./text-BVi-cLPJ.cjs");
32
- const require_store = require("./store-DAAyxcy6.cjs");
33
- const require_rubyUtils = require("./rubyUtils-B6eljPuh.cjs");
34
- const require_evalResult = require("./evalResult-KZqXl4XP.cjs");
7
+ const require_fetch = require("./fetch-NuqXW1Xb.cjs");
8
+ const require_version = require("./version-D9zu9FWB.cjs");
9
+ const require_types = require("./types-CgG2rKiW.cjs");
10
+ const require_accounts = require("./accounts-BIFntVWB.cjs");
11
+ const require_esm = require("./esm-B_rGuPTo.cjs");
12
+ const require_render = require("./render-tG6ir9_g.cjs");
13
+ const require_providerRegistry = require("./providerRegistry-BESeALrr.cjs");
14
+ const require_server = require("./server-BEECpeGG.cjs");
15
+ const require_providers = require("./providers-eDShy16E.cjs");
16
+ const require_pythonUtils = require("./pythonUtils-CoLaCwNY.cjs");
17
+ const require_fileExtensions = require("./fileExtensions-D9h-8Wxg.cjs");
18
+ const require_util = require("./util-DvpHnLt0.cjs");
19
+ const require_tokenUsageUtils = require("./tokenUsageUtils-C9odhsbW.cjs");
20
+ const require_blobs = require("./blobs-B95F_7vE.cjs");
21
+ const require_tables = require("./tables-CXbaZ9y1.cjs");
22
+ const require_extractor = require("./extractor-Dxr2J_wK.cjs");
23
+ const require_cache = require("./cache-DGg-yTZG.cjs");
24
+ const require_chat = require("./chat-vYqqv1gP.cjs");
25
+ const require_transform = require("./transform-Dg4LcO1Y.cjs");
26
+ const require_util$1 = require("./util-A5_ZsQUn.cjs");
27
+ const require_transform$1 = require("./transform-C9izGX54.cjs");
28
+ const require_telemetry = require("./telemetry-CEQxGnMZ.cjs");
29
+ const require_text = require("./text-BiNME7QG.cjs");
30
+ const require_store = require("./store-2K0kDi80.cjs");
31
+ const require_createHash = require("./createHash-VvBIc-AW.cjs");
32
+ const require_rubyUtils = require("./rubyUtils-DoifqkiA.cjs");
33
+ const require_graders = require("./graders-Bw1wk_21.cjs");
34
+ const require_utils = require("./utils-DkVeShIB.cjs");
35
+ const require_evalResult = require("./evalResult-Cx-8OWkb.cjs");
35
36
  let fs = require("fs");
36
37
  fs = require_logger.__toESM(fs);
37
38
  let path = require("path");
@@ -41,34 +42,34 @@ async = require_logger.__toESM(async);
41
42
  let js_yaml = require("js-yaml");
42
43
  js_yaml = require_logger.__toESM(js_yaml);
43
44
  let node_async_hooks = require("node:async_hooks");
44
- require("node:path");
45
- require("node:url");
46
45
  let chalk = require("chalk");
47
46
  chalk = require_logger.__toESM(chalk);
48
47
  let os = require("os");
49
48
  os = require_logger.__toESM(os);
50
- let util = require("util");
51
- util = require_logger.__toESM(util);
52
49
  let dedent = require("dedent");
53
50
  dedent = require_logger.__toESM(dedent);
51
+ let zod = require("zod");
54
52
  let fs_promises = require("fs/promises");
55
53
  fs_promises = require_logger.__toESM(fs_promises);
56
- let glob = require("glob");
57
- let zod = require("zod");
58
- let csv_parse_sync = require("csv-parse/sync");
59
- let fast_xml_parser = require("fast-xml-parser");
54
+ let util = require("util");
55
+ util = require_logger.__toESM(util);
56
+ let _inquirer_input = require("@inquirer/input");
57
+ _inquirer_input = require_logger.__toESM(_inquirer_input);
58
+ require("node:path");
59
+ require("node:url");
60
60
  let crypto$1 = require("crypto");
61
61
  crypto$1 = require_logger.__toESM(crypto$1);
62
62
  let _opentelemetry_api = require("@opentelemetry/api");
63
- let _inquirer_input = require("@inquirer/input");
64
- _inquirer_input = require_logger.__toESM(_inquirer_input);
65
63
  let readline = require("readline");
66
64
  readline = require_logger.__toESM(readline);
65
+ let csv_parse_sync = require("csv-parse/sync");
66
+ let glob = require("glob");
67
67
  let drizzle_orm = require("drizzle-orm");
68
+ let fast_xml_parser = require("fast-xml-parser");
68
69
  let cli_progress = require("cli-progress");
69
70
  cli_progress = require_logger.__toESM(cli_progress);
70
71
  let url = require("url");
71
- let jsdom = require("jsdom");
72
+ let parse5 = require("parse5");
72
73
  let fastest_levenshtein = require("fastest-levenshtein");
73
74
  let js_rouge = require("js-rouge");
74
75
  js_rouge = require_logger.__toESM(js_rouge);
@@ -262,6 +263,502 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
262
263
  };
263
264
  };
264
265
  //#endregion
266
+ //#region src/matchers/classification.ts
267
/**
 * Grades a text by asking a classification provider for per-class scores.
 *
 * @param expected Class name to look up. When undefined, the highest score
 *   across all returned classes is used instead.
 * @param output Text handed to the provider's `callClassificationApi`.
 * @param threshold Minimum score required to pass (compared with a
 *   `Number.EPSILON` tolerance).
 * @param grading Optional grading config; only `grading.provider` is read here.
 * @returns A result object with `pass`, `score` and `reason`, or the value of
 *   `require_graders.fail(...)` when the provider returned no classification.
 */
async function matchesClassification(expected, output, threshold, grading) {
  const classifier = await require_graders.getAndCheckProvider("classification", grading?.provider, null, "classification check");
  const response = await classifier.callClassificationApi(output);
  if (!response.classification) {
    return require_graders.fail(response.error || "Unknown error fetching classification");
  }

  const matchAnyClass = expected === undefined;
  let score;
  if (matchAnyClass) {
    const allScores = Object.values(response.classification);
    if (allScores.length === 0) {
      return {
        pass: false,
        score: 0,
        reason: "No classification scores returned"
      };
    }
    score = Math.max(...allScores);
  } else {
    // A missing (or zero) entry for the expected class counts as score 0.
    score = response.classification[expected] || 0;
  }

  const pass = score >= threshold - Number.EPSILON;
  const subject = matchAnyClass ? "Maximum classification score" : `Classification ${expected} has score`;
  const comparison = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `${subject} ${score.toFixed(2)} ${comparison} ${threshold}`
  };
}
302
+ //#endregion
303
+ //#region src/matchers/comparison.ts
304
/**
 * Asks a text grading provider to pick the best of several outputs.
 *
 * The first integer found in the provider's reply is taken as the index of the
 * winning output. Every output receives its own result; only the winner passes.
 *
 * @param criteria Selection criteria rendered into the rubric prompt.
 * @param outputs Candidate outputs (at least two are required).
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param vars Optional extra variables merged into the prompt variables.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns One result per output, in the same order as `outputs`.
 */
async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
  require_invariant.invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");

  const defaultProviders = await require_graders.getDefaultProviders();
  const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "select-best check");
  const rubricPrompt = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.SELECT_BEST_PROMPT);

  // A fresh variable map is built for each consumer, as in the original code.
  const buildPromptVars = () => ({
    criteria,
    outputs: outputs.map((candidate) => require_graders.tryParse(candidate)),
    ...(vars || {})
  });

  const promptText = await require_graders.renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
  const resp = await require_graders.callProviderWithContext(textProvider, promptText, "select-best", buildPromptVars(), providerCallContext);

  // Produces one independent failure result per output.
  const failEvery = (message) => Array.from({ length: outputs.length }, () => require_graders.fail(message, resp.tokenUsage));

  if (resp.error || !resp.output) {
    return failEvery(resp.error || "No output");
  }
  require_invariant.invariant(typeof resp.output === "string", "select-best produced malformed response");

  const digits = resp.output.trim().match(/\d+/);
  const verdict = digits ? Number.parseInt(digits[0], 10) : NaN;
  if (Number.isNaN(verdict) || verdict < 0 || verdict >= outputs.length) {
    return failEvery(`Invalid select-best verdict: ${verdict}`);
  }

  const tokensUsed = require_graders.normalizeMatcherTokenUsage(resp.tokenUsage);
  return outputs.map((_candidate, position) => {
    const selected = position === verdict;
    return {
      pass: selected,
      score: selected ? 1 : 0,
      reason: selected ? `Output selected as the best: ${criteria}` : `Output not selected: ${criteria}`,
      tokensUsed
    };
  });
}
336
/**
 * Picks the output whose other assertions produced the highest aggregate score.
 *
 * For every output, the component results of its grading result are collected
 * (ignoring `max-score` and `select-best` components), weighted per assertion
 * type and combined either as a weighted average (default) or a weighted sum
 * (`method: "sum"`). The first output with the strictly highest aggregate wins,
 * provided it meets the optional threshold.
 *
 * @param outputs Candidate outputs (at least two are required).
 * @param resultsWithGradingResults Per-output results; `gradingResult.componentResults`
 *   is read from each entry.
 * @param assertion The max-score assertion; `assertion.value` may carry
 *   `method`, `weights` (assertion type -> weight) and `threshold`.
 * @returns One result per entry with `pass`, `score` (1 for the winner, else 0),
 *   `reason` and `namedScores`.
 * @throws Error when an entry has no component results to aggregate.
 */
async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
  require_invariant.invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");

  const value = assertion.value || {};
  const config = typeof value === "object" ? value : {};
  const options = {
    method: config.method ?? "average",
    // A `weights` key that is present but null/undefined (e.g. an empty YAML
    // entry) must not crash the per-type lookup below; treat it as "no weights".
    weights: config.weights ?? {},
    threshold: config.threshold
  };

  const scores = resultsWithGradingResults.map((result, index) => {
    const componentResults = result.gradingResult?.componentResults || [];
    const relevantResults = componentResults.filter((component) => component.assertion && component.assertion.type !== "max-score" && component.assertion.type !== "select-best");
    if (relevantResults.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");

    let totalWeightedScore = 0;
    let totalWeight = 0;
    for (const componentResult of relevantResults) {
      const assertionType = componentResult.assertion?.type || "unknown";
      const configuredWeight = options.weights[assertionType];
      const weight = configuredWeight === undefined ? 1 : configuredWeight;
      totalWeightedScore += (componentResult.score || 0) * weight;
      totalWeight += weight;
    }

    let aggregateScore;
    if (options.method === "sum") aggregateScore = totalWeightedScore;
    else aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;

    return {
      index,
      score: aggregateScore,
      componentCount: relevantResults.length,
      totalWeight
    };
  });

  // Strict comparison keeps the earliest output as the winner on ties.
  let maxScore = -Infinity;
  let winnerIndex = 0;
  for (let i = 0; i < scores.length; i++) {
    if (scores[i].score > maxScore) {
      maxScore = scores[i].score;
      winnerIndex = i;
    }
  }

  const meetsThreshold = options.threshold === undefined || maxScore >= options.threshold;
  return scores.map(({ index, score, componentCount, totalWeight }) => {
    const isWinner = index === winnerIndex && meetsThreshold;
    let reason;
    if (isWinner) reason = `Selected as highest scoring output (score: ${score.toFixed(3)})`;
    else if (score === maxScore && !meetsThreshold) reason = `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}`;
    else reason = `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`;
    return {
      pass: isWinner,
      score: isWinner ? 1 : 0,
      reason,
      namedScores: {
        maxScore: score,
        assertionCount: componentCount,
        totalWeight
      }
    };
  });
}
387
+ //#endregion
388
+ //#region src/matchers/moderation.ts
389
/**
 * Runs a moderation provider over an assistant response.
 *
 * Passes when there is nothing to moderate, when the provider reports no flags,
 * or when none of the reported flags match the requested categories. Fails on a
 * provider error or when a relevant flag is present.
 *
 * @param params Object with `userPrompt`, `assistantResponse` and an optional
 *   `categories` list of flag codes to restrict the check to (empty = all flags).
 * @param grading Optional grading config; only `grading.provider` is read here.
 * @returns A result object with `pass`, `score` and `reason`.
 */
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
  const verdict = (pass, reason) => ({
    pass,
    score: pass ? 1 : 0,
    reason
  });

  if (!assistantResponse) {
    return verdict(true, "No output to moderate");
  }

  const defaultProviders = await require_graders.getDefaultProviders();

  // Without an OpenAI key but with a Replicate credential, the Replicate
  // LlamaGuard provider is loaded as the default instead.
  let defaultModerationProvider;
  if (!require_logger.getEnvString("OPENAI_API_KEY") && (require_logger.getEnvString("REPLICATE_API_KEY") || require_logger.getEnvString("REPLICATE_API_TOKEN"))) {
    defaultModerationProvider = await require_providers.loadApiProvider(require_types.LLAMA_GUARD_REPLICATE_PROVIDER);
  } else {
    defaultModerationProvider = defaultProviders.moderationProvider;
  }

  const moderationProvider = await require_graders.getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
  require_invariant.invariant(moderationProvider, "Moderation provider must be defined");

  const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
  if (resp.error) {
    return verdict(false, `Moderation API error: ${resp.error}`);
  }

  const { flags } = resp;
  if (!flags || flags.length === 0) {
    return verdict(true, "No moderation flags detected");
  }

  const relevantFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
  if (relevantFlags.length > 0) {
    const descriptions = relevantFlags.map((flag) => flag.description).join(", ");
    return verdict(false, `Moderation flags detected: ${descriptions}`);
  }
  return verdict(true, "No relevant moderation flags detected");
}
423
+ //#endregion
424
+ //#region src/assertions/contextUtils.ts
425
/**
 * Resolves the context value for context-based assertions.
 *
 * Resolution order:
 *  1. `test.vars.context` when it is a string or an array of strings;
 *  2. otherwise `fallbackContext` (only when `test.vars.context` is falsy);
 *  3. if `assertion.contextTransform` is set, its result replaces whatever was
 *     resolved above.
 *
 * @param assertion - The assertion configuration (`contextTransform` is read)
 * @param test - The test case (`vars` is read)
 * @param output - The provider output, used as transform input unless
 *   `providerResponse.providerTransformedOutput` is available
 * @param prompt - The prompt text, passed to the transform as `prompt.label`
 * @param fallbackContext - Optional fallback context
 * @param providerResponse - Optional full provider response for contextTransform
 * @returns The resolved non-empty context string or non-empty array of
 *   non-empty strings
 * @throws Error if the context is missing/invalid or the transform fails; a
 *   transform failure carries the underlying error as `cause`
 */
async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
  let contextValue;
  const varsContext = test.vars?.context;
  if (varsContext) {
    if (typeof varsContext === "string") contextValue = varsContext;
    else if (Array.isArray(varsContext)) {
      const invalidIndex = varsContext.findIndex((entry) => typeof entry !== "string");
      if (invalidIndex !== -1) {
        require_invariant.invariant(false, `Invalid context: expected an array of strings, but found ${typeof varsContext[invalidIndex]} at index ${invalidIndex}`);
      }
      contextValue = varsContext;
    }
  } else if (fallbackContext) contextValue = fallbackContext;

  if (assertion.contextTransform) {
    try {
      const outputForTransform = providerResponse?.providerTransformedOutput ?? output;
      const transformed = await require_transform$1.transform(assertion.contextTransform, outputForTransform, {
        vars: test.vars,
        prompt: { label: prompt },
        ...(providerResponse?.metadata && { metadata: providerResponse.metadata })
      });
      require_invariant.invariant(typeof transformed === "string" || Array.isArray(transformed) && transformed.every((item) => typeof item === "string"), `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${assertion.contextTransform}`);
      contextValue = transformed;
    } catch (error) {
      // Keep the original error (and its stack) reachable for debugging.
      throw new Error(`Failed to transform context using expression '${assertion.contextTransform}': ${error instanceof Error ? error.message : String(error)}`, { cause: error });
    }
  }

  require_invariant.invariant(typeof contextValue === "string" && contextValue.length > 0 || Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every((item) => typeof item === "string" && item.length > 0), "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
  return contextValue;
}
467
/**
 * Flattens a context into one string suitable for prompts.
 * Array contexts are joined with a blank line between chunks; any other value
 * is returned as-is.
 */
function serializeContext(context) {
  if (!Array.isArray(context)) {
    return context;
  }
  return context.join("\n\n");
}
474
+ //#endregion
475
+ //#region src/matchers/rag.ts
476
/**
 * Scores how well an answer relates to the original input.
 *
 * The text provider is called three times with the answer-relevancy rubric and
 * each reply is kept as a candidate question. The input and every candidate
 * question are embedded, and the score is the mean cosine similarity between
 * the input embedding and the question embeddings.
 *
 * @param input Original query/prompt to compare against.
 * @param output Answer being graded.
 * @param threshold Minimum mean similarity required to pass.
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A result with `pass`, `score`, `reason`, `tokensUsed` and `metadata`,
 *   or a `require_graders.fail(...)` result when a provider call fails.
 */
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
  const QUESTION_SAMPLES = 3;
  const defaults = await require_graders.getDefaultProviders();
  const embeddingProvider = await require_graders.getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "answer relevancy check");
  const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "answer relevancy check");
  const tokensUsed = require_graders.normalizeMatcherTokenUsage(undefined);
  const rubricPrompt = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.ANSWER_RELEVANCY_GENERATE);
  const parsedOutput = require_graders.tryParse(output);
  const promptText = await require_graders.renderLlmRubricPrompt(rubricPrompt, { answer: parsedOutput });

  // Generate the candidate questions one after another; abort on the first failure.
  const candidateQuestions = [];
  while (candidateQuestions.length < QUESTION_SAMPLES) {
    const generated = await require_graders.callProviderWithContext(textProvider, promptText, "answer-relevance", { answer: parsedOutput }, providerCallContext);
    require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, generated.tokenUsage);
    if (generated.error || !generated.output) {
      return require_graders.fail(generated.error || "No output", tokensUsed);
    }
    require_invariant.invariant(typeof generated.output === "string", "answer relevancy check produced malformed response");
    candidateQuestions.push(generated.output);
  }

  require_invariant.invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id()} must implement callEmbeddingApi for similarity check`);
  const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
  require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
  if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) {
    return require_graders.fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
  }
  const inputEmbedding = inputEmbeddingResp.embedding;

  const questionsWithScores = [];
  let similaritySum = 0;
  for (const question of candidateQuestions) {
    const embedded = await embeddingProvider.callEmbeddingApi(question);
    require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, embedded.tokenUsage);
    if (embedded.error || !embedded.embedding) {
      return require_graders.fail(embedded.error || "No embedding", tokensUsed);
    }
    const questionSimilarity = require_graders.cosineSimilarity(inputEmbedding, embedded.embedding);
    similaritySum += questionSimilarity;
    questionsWithScores.push({
      question,
      similarity: questionSimilarity
    });
  }

  const similarity = similaritySum / questionsWithScores.length;
  const pass = similarity >= threshold - Number.EPSILON;
  const relation = pass ? "greater" : "less";
  return {
    pass,
    score: similarity,
    reason: `Relevance ${similarity.toFixed(2)} is ${relation} than threshold ${threshold}`,
    tokensUsed,
    metadata: {
      generatedQuestions: questionsWithScores,
      averageSimilarity: similarity,
      threshold
    }
  };
}
534
/**
 * Measures how much of the ground truth the grader attributes to the context.
 *
 * The grader's reply is split into sentences; only sentences containing one of
 * the attribution tokens are counted. The score is the fraction of those
 * sentences that contain the "attributed" token and not the "not attributed"
 * token (matching is case-insensitive).
 *
 * @param context Context string or array of context chunks.
 * @param groundTruth Expected answer the context should support.
 * @param threshold Minimum recall required to pass.
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param vars Optional extra variables merged into the prompt variables.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A result with `pass`, `score`, `reason`, `tokensUsed` and `metadata`.
 */
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
  const defaultProviders = await require_graders.getDefaultProviders();
  const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context recall check");
  const contextString = serializeContext(context);
  const rubricPrompt = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.CONTEXT_RECALL);

  const buildPromptVars = () => ({
    context: contextString,
    groundTruth,
    ...(vars || {})
  });
  const promptText = await require_graders.renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
  const resp = await require_graders.callProviderWithContext(textProvider, promptText, "context-recall", buildPromptVars(), providerCallContext);
  if (resp.error || !resp.output) {
    return require_graders.fail(resp.error || "No output", resp.tokenUsage);
  }
  require_invariant.invariant(typeof resp.output === "string", "context-recall produced malformed response");

  const attributedMarker = require_graders.CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
  const notAttributedMarker = require_graders.CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();

  const sentenceAttributions = require_graders.splitIntoSentences(resp.output).filter((line) => {
    const lowered = line.toLowerCase();
    return lowered.includes(attributedMarker) || lowered.includes(notAttributedMarker);
  }).map((sentence) => {
    const lowered = sentence.toLowerCase();
    const attributed = lowered.includes(attributedMarker) && !lowered.includes(notAttributedMarker);
    // Prefer the text after a leading "N." list marker; otherwise keep the
    // part before the first period.
    const numbered = sentence.match(/^\d+\.\s*([^\.]+\.)/);
    const cleaned = numbered ? numbered[1].trim() : sentence.split(".")[0].trim();
    return {
      sentence: cleaned,
      attributed
    };
  });

  const totalSentences = sentenceAttributions.length;
  const attributedSentences = sentenceAttributions.filter((entry) => entry.attributed).length;
  const score = totalSentences > 0 ? attributedSentences / totalSentences : 0;
  const pass = score >= threshold - Number.EPSILON;
  const comparison = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `Recall ${score.toFixed(2)} is ${comparison} ${threshold}`,
    tokensUsed: require_graders.normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata: {
      sentenceAttributions,
      totalSentences,
      attributedSentences,
      score
    }
  };
}
583
/**
 * Measures what fraction of the context the grader considers relevant to a question.
 *
 * Context units are the non-blank chunks of an array context, or the sentences
 * of a string context. The grader's reply is split into sentences; the number
 * of unique sentences (capped at the number of context units) divided by the
 * number of context units is the score. If the reply contains the
 * `CONTEXT_RELEVANCE_BAD` marker the score is 0.
 *
 * @param question Query the context should be relevant to.
 * @param context Context string or array of context chunks.
 * @param threshold Minimum relevance required to pass.
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A result with `pass`, `score`, `reason`, `tokensUsed` and `metadata`.
 */
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
  const defaultProviders = await require_graders.getDefaultProviders();
  const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context relevance check");
  const contextString = serializeContext(context);
  const rubricPrompt = await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.CONTEXT_RELEVANCE);

  const buildPromptVars = () => ({
    context: contextString,
    query: question
  });
  const promptText = await require_graders.renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
  const resp = await require_graders.callProviderWithContext(textProvider, promptText, "context-relevance", buildPromptVars(), providerCallContext);
  if (resp.error || !resp.output) {
    return require_graders.fail(resp.error || "No output", resp.tokenUsage);
  }
  require_invariant.invariant(typeof resp.output === "string", "context-relevance produced malformed response");

  let contextUnits;
  if (Array.isArray(context)) {
    contextUnits = context.filter((chunk) => chunk.trim().length > 0);
  } else {
    contextUnits = require_graders.splitIntoSentences(context);
  }
  const totalContextUnits = contextUnits.length;

  const extractedSentences = require_graders.splitIntoSentences(resp.output);
  const insufficientInformation = resp.output.includes(require_graders.CONTEXT_RELEVANCE_BAD);
  const relevantSentences = insufficientInformation ? [] : [...new Set(extractedSentences)];
  const relevantSentenceCount = insufficientInformation ? 0 : Math.min(relevantSentences.length, totalContextUnits);

  const score = totalContextUnits > 0 ? relevantSentenceCount / totalContextUnits : 0;
  const pass = score >= threshold - Number.EPSILON;
  const comparison = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `Context relevance ${score.toFixed(2)} is ${comparison} ${threshold}`,
    tokensUsed: require_graders.normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata: {
      extractedSentences: relevantSentences,
      totalContextUnits,
      totalContextSentences: totalContextUnits,
      contextUnits,
      relevantSentenceCount,
      insufficientInformation,
      score
    }
  };
}
626
/**
 * Scores how faithful an answer is to the supplied context, in two grader calls.
 *
 * Step 1 ("longform") renders the question and answer into the first rubric
 * prompt; the reply is split into statements. Step 2 ("NLI") renders those
 * statements plus the context into the second rubric prompt; the reply is
 * parsed for per-statement verdicts. The score is one minus the share of
 * statements without a "yes" verdict, clamped to [0, 1].
 *
 * @param query Question that was asked.
 * @param output Answer being graded.
 * @param context Context string or array of context chunks.
 * @param threshold Minimum faithfulness required to pass.
 * @param grading Optional grading config; `rubricPrompt`, when set, must be an
 *   array whose first two entries (strings or objects with `content`) override
 *   the longform and NLI prompts.
 * @param vars Optional extra variables merged into the prompt variables.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A result with `pass`, `score`, `reason` and `tokensUsed`.
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
  const defaultProviders = await require_graders.getDefaultProviders();
  const textProvider = await require_graders.getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "faithfulness check");
  const tokensUsed = require_graders.normalizeMatcherTokenUsage(undefined);
  if (grading?.rubricPrompt) {
    require_invariant.invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");
  }

  // A rubric entry is either the prompt text itself or an object carrying it in `content`.
  const rawPromptAt = (position) => {
    const entry = grading?.rubricPrompt?.[position];
    return typeof entry === "string" ? entry : entry?.content;
  };
  const rawLongformPrompt = rawPromptAt(0);
  const rawNliPrompt = rawPromptAt(1);
  const longformPrompt = await require_graders.loadRubricPrompt(rawLongformPrompt, require_graders.CONTEXT_FAITHFULNESS_LONGFORM);
  const nliPrompt = await require_graders.loadRubricPrompt(rawNliPrompt, require_graders.CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

  // Step 1: extract statements from the answer.
  const buildLongformVars = () => ({
    question: query,
    answer: require_graders.tryParse(output),
    ...(vars || {})
  });
  const longformText = await require_graders.renderLlmRubricPrompt(longformPrompt, buildLongformVars());
  const longformResp = await require_graders.callProviderWithContext(textProvider, longformText, "context-faithfulness-longform", buildLongformVars(), providerCallContext);
  require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
  if (longformResp.error || !longformResp.output) {
    return require_graders.fail(longformResp.error || "No output", tokensUsed);
  }
  require_invariant.invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

  // Step 2: ask for a verdict on each statement against the context.
  const contextString = serializeContext(context);
  const statements = require_graders.splitIntoSentences(longformResp.output);
  const buildNliVars = () => ({
    context: contextString,
    statements,
    ...(vars || {})
  });
  const nliText = await require_graders.renderLlmRubricPrompt(nliPrompt, buildNliVars());
  const nliResp = await require_graders.callProviderWithContext(textProvider, nliText, "context-faithfulness-nli", buildNliVars(), providerCallContext);
  require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
  if (nliResp.error || !nliResp.output) {
    return require_graders.fail(nliResp.error || "No output", tokensUsed);
  }
  require_invariant.invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

  const verdictHeader = "Final verdict for each statement in order:".toLowerCase();
  const verdictText = nliResp.output.toLowerCase().trim();
  let score = 0;
  if (statements.length > 0) {
    const headerAt = verdictText.indexOf(verdictHeader);
    if (headerAt !== -1) {
      // Summary format: period-separated verdicts after the header line.
      const answers = verdictText.slice(headerAt + verdictHeader.length).split(".").filter((answer) => answer.trim() !== "");
      if (answers.length > 0) {
        const unsupported = answers.filter((answer) => !answer.includes("yes")).length;
        score = 1 - unsupported / statements.length;
      }
    } else {
      // Inline format: count explicit "verdict: no" / "verdict: yes" markers.
      const occurrencesOf = (marker) => verdictText.split(marker).length - 1;
      const noVerdictCount = occurrencesOf("verdict: no");
      const yesVerdictCount = occurrencesOf("verdict: yes");
      if (noVerdictCount + yesVerdictCount > 0) {
        score = 1 - noVerdictCount / statements.length;
      }
    }
  }
  score = Math.min(1, Math.max(0, score));

  const pass = score >= threshold - Number.EPSILON;
  const comparison = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `Faithfulness ${score.toFixed(2)} is ${comparison} ${threshold}`,
    tokensUsed
  };
}
683
+ //#endregion
684
+ //#region src/matchers/similarity.ts
685
/**
 * Computes the requested metric between two embeddings.
 *
 * @param expectedEmbedding Embedding of the expected text.
 * @param outputEmbedding Embedding of the actual output.
 * @param metric One of "cosine", "dot_product" or "euclidean".
 * @param tokensUsed Token usage attached to the failure result for unknown metrics.
 * @returns The metric value, or a `require_graders.fail(...)` result when the
 *   metric is not supported.
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
  if (metric === "cosine") {
    return require_graders.cosineSimilarity(expectedEmbedding, outputEmbedding);
  }
  if (metric === "dot_product") {
    return require_graders.dotProduct(expectedEmbedding, outputEmbedding);
  }
  if (metric === "euclidean") {
    return require_graders.euclideanDistance(expectedEmbedding, outputEmbedding);
  }
  return require_graders.fail(`Unsupported metric: ${metric}`, tokensUsed);
}
693
/**
 * Turns a raw similarity (or distance) value into a grading result.
 *
 * For "euclidean" the value is a distance: it passes when it is at most the
 * threshold, and the score is `1 / (1 + distance)`. For any other metric the
 * value passes when it is at least the threshold, and the score is the value
 * itself. With `inverse` set, the pass condition flips and the score becomes
 * one minus the regular score. Comparisons use a `Number.EPSILON` tolerance.
 *
 * @param similarity Similarity value, or distance for the euclidean metric.
 * @param threshold Threshold to compare against.
 * @param inverse Whether the assertion is negated.
 * @param metric Metric name; only "euclidean" is treated specially.
 * @param tokensUsed Token usage to attach to the result.
 * @returns A result with `pass`, `score`, `reason` and `tokensUsed`.
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
  const isEuclidean = metric === "euclidean";

  let pass;
  if (isEuclidean) {
    pass = inverse ? similarity >= threshold - Number.EPSILON : similarity <= threshold + Number.EPSILON;
  } else {
    pass = inverse ? similarity <= threshold + Number.EPSILON : similarity >= threshold - Number.EPSILON;
  }

  const baseScore = isEuclidean ? 1 / (1 + similarity) : similarity;
  const score = inverse ? 1 - baseScore : baseScore;

  // The reason always describes where the value sits relative to the threshold:
  // a non-inverted pass and an inverted failure both mean "on the passing side".
  const onPassingSide = pass !== Boolean(inverse);
  let reason;
  if (isEuclidean) {
    reason = onPassingSide ? `Distance ${similarity.toFixed(2)} is less than or equal to threshold ${threshold}` : `Distance ${similarity.toFixed(2)} is greater than threshold ${threshold}`;
  } else {
    reason = onPassingSide ? `Similarity ${similarity.toFixed(2)} is greater than or equal to threshold ${threshold}` : `Similarity ${similarity.toFixed(2)} is less than threshold ${threshold}`;
  }

  return {
    pass,
    score,
    reason,
    tokensUsed
  };
}
719
/**
 * Obtains a similarity value from a provider.
 *
 * For the cosine metric, a provider exposing `callSimilarityApi` is asked
 * directly. Otherwise both texts are embedded in parallel through
 * `callEmbeddingApi` and the metric is computed locally.
 *
 * @param finalProvider Provider implementing `callSimilarityApi` and/or `callEmbeddingApi`.
 * @param expected Expected text.
 * @param output Actual output text.
 * @param metric Similarity metric name.
 * @param tokensUsed Accumulator that receives the token usage of every call.
 * @returns The numeric similarity, or a `require_graders.fail(...)` result.
 * @throws Error when the provider implements neither API.
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
  if (metric === "cosine" && "callSimilarityApi" in finalProvider) {
    const similarityResp = await finalProvider.callSimilarityApi(expected, output);
    require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, similarityResp.tokenUsage);
    const { error, similarity } = similarityResp;
    if (error) {
      return require_graders.fail(error, tokensUsed);
    }
    if (similarity == null) {
      return require_graders.fail("Unknown error fetching similarity", tokensUsed);
    }
    if (!Number.isFinite(similarity)) {
      return require_graders.fail(`Invalid similarity score: ${similarity}`, tokensUsed);
    }
    return similarity;
  }

  const embed = "callEmbeddingApi" in finalProvider ? finalProvider.callEmbeddingApi : undefined;
  if (typeof embed !== "function") {
    // A similarity-only provider cannot serve non-cosine metrics.
    if ("callSimilarityApi" in finalProvider) {
      return require_graders.fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
    }
    throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
  }

  const [expectedEmbedding, outputEmbedding] = await Promise.all([
    embed.call(finalProvider, expected),
    embed.call(finalProvider, output)
  ]);

  // Usage of both embedding calls is merged first, then added to the caller's accumulator.
  const mergedUsage = require_graders.normalizeMatcherTokenUsage(undefined);
  require_tokenUsageUtils.accumulateTokenUsage(mergedUsage, expectedEmbedding.tokenUsage);
  require_tokenUsageUtils.accumulateTokenUsage(mergedUsage, outputEmbedding.tokenUsage);
  require_tokenUsageUtils.accumulateTokenUsage(tokensUsed, mergedUsage);

  if (expectedEmbedding.error || outputEmbedding.error) {
    return require_graders.fail(expectedEmbedding.error || outputEmbedding.error || "Unknown error fetching embeddings", tokensUsed);
  }
  if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
    return require_graders.fail("Embedding not found", tokensUsed);
  }
  return calculateSimilarityScore(expectedEmbedding.embedding, outputEmbedding.embedding, metric, tokensUsed);
}
742
/**
 * Checks whether an output is similar enough to an expected text.
 *
 * For the cosine metric, when a redteam config is present and
 * `shouldGenerateRemote({ requireEmbeddingProvider: true })` is truthy, grading
 * is delegated to `doRemoteGrading`. Otherwise the embedding provider is
 * resolved locally and the similarity is computed from it.
 *
 * @param expected Expected text.
 * @param output Actual output text.
 * @param threshold Threshold the similarity (or distance) is compared against.
 * @param inverse Whether the assertion is negated. Defaults to false.
 * @param grading Optional grading config; only `grading.provider` is read here.
 * @param metric Similarity metric name. Defaults to "cosine".
 * @returns A grading result; provider failures are returned as
 *   `require_graders.fail(...)` results rather than thrown.
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
  const gradeRemotely = metric === "cosine" && require_logger.state.config?.redteam && require_server.shouldGenerateRemote({ requireEmbeddingProvider: true });
  if (gradeRemotely) {
    try {
      return await require_graders.doRemoteGrading({
        task: "similar",
        expected,
        output,
        threshold,
        inverse
      });
    } catch (error) {
      return require_graders.fail(`Could not perform remote grading: ${error}`);
    }
  }

  const defaults = await require_graders.getDefaultProviders();
  const finalProvider = await require_graders.getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "similarity check");
  const tokensUsed = require_graders.normalizeMatcherTokenUsage(undefined);
  const similarity = await calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed);

  // Anything that is not a number is a failure result and is passed through unchanged.
  if (typeof similarity !== "number") {
    return similarity;
  }
  return buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed);
}
761
+ //#endregion
265
762
  //#region src/tracing/evaluatorTracing.ts
266
763
  let otlpReceiverStarted = false;
267
764
  const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
@@ -305,7 +802,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
305
802
  require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
306
803
  try {
307
804
  require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
308
- const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-BfcVq2Nq.cjs"));
805
+ const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DNSQj6bf.cjs"));
309
806
  const port = testSuite.tracing.otlp.http.port || 4318;
310
807
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
311
808
  const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
@@ -329,7 +826,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
329
826
  async function stopOtlpReceiverIfNeeded() {
330
827
  if (otlpReceiverStarted) try {
331
828
  require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
332
- const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-BfcVq2Nq.cjs"));
829
+ const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DNSQj6bf.cjs"));
333
830
  await stopOTLPReceiver();
334
831
  otlpReceiverStarted = false;
335
832
  require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -364,7 +861,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
364
861
  }
365
862
  if (!tracingEnabled) return null;
366
863
  require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
367
- const { getTraceStore } = await Promise.resolve().then(() => require("./store-CWOSz6D_.cjs"));
864
+ const { getTraceStore } = await Promise.resolve().then(() => require("./store-uQZ4AjPe.cjs"));
368
865
  const traceStore = getTraceStore();
369
866
  const traceId = generateTraceId();
370
867
  const spanId = generateSpanId();
@@ -406,7 +903,7 @@ const handleAnswerRelevance = async ({ assertion, output, prompt, test, provider
406
903
  require_invariant.invariant(prompt, "answer-relevance assertion type must have a prompt");
407
904
  return {
408
905
  assertion,
409
- ...await require_graders.matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
906
+ ...await matchesAnswerRelevance(typeof test?.vars?.query === "string" ? test.vars.query : prompt, output, assertion.threshold ?? 0, test.options, providerCallContext)
410
907
  };
411
908
  };
412
909
  //#endregion
@@ -662,7 +1159,7 @@ function handleBleuScore({ assertion, inverse, outputString, renderedValue }) {
662
1159
  //#region src/assertions/classifier.ts
663
1160
  async function handleClassifier({ assertion, renderedValue, outputString, test, inverse }) {
664
1161
  require_invariant.invariant(typeof renderedValue === "string" || typeof renderedValue === "undefined", "\"classifier\" assertion type must have a string value or be undefined");
665
- const classificationResult = await require_graders.matchesClassification(renderedValue, outputString, assertion.threshold ?? 1, test.options);
1162
+ const classificationResult = await matchesClassification(renderedValue, outputString, assertion.threshold ?? 1, test.options);
666
1163
  if (inverse) {
667
1164
  classificationResult.pass = !classificationResult.pass;
668
1165
  classificationResult.score = 1 - classificationResult.score;
@@ -674,38 +1171,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
674
1171
  }
675
1172
  //#endregion
676
1173
  //#region src/assertions/contains.ts
1174
+ /**
1175
+ * Advance over separators between parsed fields.
1176
+ *
1177
+ * Contains-any values allow whitespace around comma delimiters, and historical
1178
+ * parsing ignored repeated commas rather than producing empty fields.
1179
+ */
1180
+ function skipWhitespaceAndCommas(value, startIndex) {
1181
+ let i = startIndex;
1182
+ while (i < value.length) {
1183
+ i = skipWhitespace(value, i);
1184
+ if (value[i] !== ",") break;
1185
+ i++;
1186
+ }
1187
+ return i;
1188
+ }
1189
+ /**
1190
+ * Advance over whitespace while preserving comma delimiter handling for callers.
1191
+ */
1192
+ function skipWhitespace(value, startIndex) {
1193
+ let i = startIndex;
1194
+ while (i < value.length && /\s/.test(value[i])) i++;
1195
+ return i;
1196
+ }
1197
+ /**
1198
+ * Parse a quoted field using the assertion parser's CSV-like escape rules.
1199
+ *
1200
+ * Supports backslash-escaped quotes/backslashes and doubled quotes, and rejects
1201
+ * unterminated fields so malformed assertion values do not silently pass.
1202
+ */
1203
+ function parseQuotedField(value, startIndex) {
1204
+ let i = startIndex + 1;
1205
+ let field = "";
1206
+ let terminated = false;
1207
+ while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && ["\"", "\\"].includes(value[i + 1])) {
1208
+ field += value[i + 1];
1209
+ i += 2;
1210
+ } else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
1211
+ field += "\"";
1212
+ i += 2;
1213
+ } else if (value[i] === "\"") {
1214
+ i++;
1215
+ terminated = true;
1216
+ break;
1217
+ } else {
1218
+ field += value[i];
1219
+ i++;
1220
+ }
1221
+ require_invariant.invariant(terminated, "Unterminated quoted field in contains assertion value");
1222
+ return {
1223
+ field,
1224
+ nextIndex: i
1225
+ };
1226
+ }
1227
+ /**
1228
+ * Parse an unquoted field up to the next comma, trimming surrounding whitespace.
1229
+ */
1230
+ function parseUnquotedField(value, startIndex) {
1231
+ let i = startIndex;
1232
+ while (i < value.length && value[i] !== ",") i++;
1233
+ return {
1234
+ field: value.substring(startIndex, i).trim(),
1235
+ nextIndex: i
1236
+ };
1237
+ }
1238
+ /**
1239
+ * Split a contains-any string into fields while preserving quoted commas.
1240
+ */
677
1241
  function parseCommaSeparatedValues(value) {
678
1242
  const results = [];
679
1243
  let i = 0;
680
1244
  while (i < value.length) {
681
- while (i < value.length && /\s/.test(value[i])) i++;
1245
+ i = skipWhitespaceAndCommas(value, i);
682
1246
  if (i >= value.length) break;
683
- if (value[i] === ",") {
684
- i++;
685
- continue;
686
- }
687
- if (value[i] === "\"") {
688
- i++;
689
- let field = "";
690
- while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
691
- field += value[i + 1];
692
- i += 2;
693
- } else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
694
- field += "\"";
695
- i += 2;
696
- } else if (value[i] === "\"") {
697
- i++;
698
- break;
699
- } else {
700
- field += value[i];
701
- i++;
702
- }
703
- results.push(field);
704
- } else {
705
- const start = i;
706
- while (i < value.length && value[i] !== ",") i++;
707
- results.push(value.substring(start, i).trim());
708
- }
1247
+ const isQuotedField = value[i] === "\"";
1248
+ const parsed = isQuotedField ? parseQuotedField(value, i) : parseUnquotedField(value, i);
1249
+ results.push(parsed.field);
1250
+ i = isQuotedField ? skipWhitespace(value, parsed.nextIndex) : parsed.nextIndex;
1251
+ require_invariant.invariant(!isQuotedField || i >= value.length || value[i] === ",", "Expected comma after quoted field in contains assertion value");
709
1252
  }
710
1253
  return results;
711
1254
  }
@@ -803,10 +1346,10 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
803
1346
  require_invariant.invariant(test.vars, "context-faithfulness assertion requires a test with variables");
804
1347
  require_invariant.invariant(typeof test.vars.query === "string", "context-faithfulness assertion requires a \"query\" variable with the user question");
805
1348
  require_invariant.invariant(typeof output === "string", "context-faithfulness assertion requires string output from the provider");
806
- const context = await require_graders.resolveContext(assertion, test, output, prompt, void 0, providerResponse);
1349
+ const context = await resolveContext(assertion, test, output, prompt, void 0, providerResponse);
807
1350
  return {
808
1351
  assertion,
809
- ...await require_graders.matchesContextFaithfulness(test.vars.query, output, context, assertion.threshold ?? 0, test.options, test.vars, providerCallContext),
1352
+ ...await matchesContextFaithfulness(test.vars.query, output, context, assertion.threshold ?? 0, test.options, test.vars, providerCallContext),
810
1353
  metadata: { context }
811
1354
  };
812
1355
  }
@@ -825,8 +1368,8 @@ async function handleContextFaithfulness({ assertion, test, output, prompt, prov
825
1368
  const handleContextRecall = async ({ assertion, renderedValue, prompt, test, output, providerResponse, providerCallContext }) => {
826
1369
  require_invariant.invariant(typeof renderedValue === "string", "context-recall assertion requires a string value (expected answer or fact to verify)");
827
1370
  require_invariant.invariant(prompt, "context-recall assertion requires a prompt");
828
- const context = await require_graders.resolveContext(assertion, test, output, prompt, prompt, providerResponse);
829
- const result = await require_graders.matchesContextRecall(context, renderedValue, assertion.threshold ?? 0, test.options, test.vars, providerCallContext);
1371
+ const context = await resolveContext(assertion, test, output, prompt, prompt, providerResponse);
1372
+ const result = await matchesContextRecall(context, renderedValue, assertion.threshold ?? 0, test.options, test.vars, providerCallContext);
830
1373
  return {
831
1374
  assertion,
832
1375
  ...result,
@@ -851,8 +1394,8 @@ const handleContextRecall = async ({ assertion, renderedValue, prompt, test, out
851
1394
  const handleContextRelevance = async ({ assertion, test, output, prompt, providerResponse, providerCallContext }) => {
852
1395
  require_invariant.invariant(test.vars, "context-relevance assertion requires a test with variables");
853
1396
  require_invariant.invariant(typeof test.vars.query === "string", "context-relevance assertion requires a \"query\" variable with the user question");
854
- const context = await require_graders.resolveContext(assertion, test, output, prompt, void 0, providerResponse);
855
- const result = await require_graders.matchesContextRelevance(test.vars.query, context, assertion.threshold ?? 0, test.options, providerCallContext);
1397
+ const context = await resolveContext(assertion, test, output, prompt, void 0, providerResponse);
1398
+ const result = await matchesContextRelevance(test.vars.query, context, assertion.threshold ?? 0, test.options, providerCallContext);
856
1399
  return {
857
1400
  assertion,
858
1401
  ...result,
@@ -930,7 +1473,7 @@ function handleFinishReason({ assertion, inverse = false, renderedValue, provide
930
1473
  //#region src/assertions/functionToolCall.ts
931
1474
  const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
932
1475
  try {
933
- if (provider instanceof require_providers.AIStudioChatProvider || provider instanceof require_providers.GoogleLiveProvider || provider instanceof require_providers.VertexChatProvider) require_transform$1.validateFunctionCall(output, provider.config?.tools, test.vars);
1476
+ if (provider instanceof require_providers.AIStudioChatProvider || provider instanceof require_providers.GoogleLiveProvider || provider instanceof require_providers.VertexChatProvider) require_transform.validateFunctionCall(output, provider.config?.tools, test.vars);
934
1477
  else if (provider instanceof require_chat.OpenAiChatCompletionProvider) require_util$1.validateFunctionCall(output, provider.config.functions, test.vars);
935
1478
  else throw new Error(`Provider does not have functionality for checking function call.`);
936
1479
  return {
@@ -1110,6 +1653,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
1110
1653
  };
1111
1654
  //#endregion
1112
1655
  //#region src/assertions/html.ts
1656
+ const LITERAL_WRAPPER_PATTERNS = {
1657
+ html: /<html(?=[\s>/])/,
1658
+ head: /<head(?=[\s>/])/,
1659
+ body: /<body(?=[\s>/])/
1660
+ };
1661
+ function isWrapperTagName(tagName) {
1662
+ return tagName === "html" || tagName === "head" || tagName === "body";
1663
+ }
1664
+ function isTextNode(node) {
1665
+ return node.nodeName === "#text";
1666
+ }
1667
+ function isElementNode(node) {
1668
+ return "tagName" in node;
1669
+ }
1670
+ function hasSourceCodeLocation(element) {
1671
+ return "sourceCodeLocation" in element && element.sourceCodeLocation !== null && element.sourceCodeLocation !== void 0;
1672
+ }
1673
+ function getChildNodes(node) {
1674
+ return "childNodes" in node ? node.childNodes : [];
1675
+ }
1676
+ function findFirstElement(root, predicate) {
1677
+ const stack = [root];
1678
+ while (stack.length > 0) {
1679
+ const current = stack.pop();
1680
+ if (isElementNode(current) && predicate(current)) return current;
1681
+ const children = getChildNodes(current);
1682
+ for (let i = children.length - 1; i >= 0; i--) stack.push(children[i]);
1683
+ }
1684
+ }
1685
+ function hasTopLevelText(parentNode) {
1686
+ return parentNode.childNodes.some((node) => isTextNode(node) && Boolean(node.value.trim()));
1687
+ }
1688
+ function isUserProvidedElement(element, inputLowercase) {
1689
+ const tagName = element.tagName.toLowerCase();
1690
+ if (isWrapperTagName(tagName)) return LITERAL_WRAPPER_PATTERNS[tagName].test(inputLowercase) && hasSourceCodeLocation(element);
1691
+ return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
1692
+ }
1113
1693
  const HTML_PATTERNS = {
1114
1694
  openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
1115
1695
  closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
@@ -1265,37 +1845,21 @@ function validateHtml(htmlString) {
1265
1845
  isValid: false,
1266
1846
  reason: "Output appears to be XML, not HTML"
1267
1847
  };
1268
- try {
1269
- const { document } = new jsdom.JSDOM(trimmed, { contentType: "text/html" }).window;
1270
- if (document.body && !trimmed.toLowerCase().includes("<body")) {
1271
- if (Array.from(document.body.childNodes).some((node) => node.nodeType === 3 && node.textContent?.trim())) return {
1272
- isValid: false,
1273
- reason: "Output must be wrapped in HTML tags"
1274
- };
1275
- }
1276
- const allElements = document.querySelectorAll("*");
1277
- if (!Array.from(allElements).find((element) => {
1278
- const tagName = element.tagName.toLowerCase();
1279
- if ([
1280
- "html",
1281
- "head",
1282
- "body"
1283
- ].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
1284
- return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
1285
- })) return {
1286
- isValid: false,
1287
- reason: "Output does not contain recognized HTML elements"
1288
- };
1289
- return {
1290
- isValid: true,
1291
- reason: "Output is valid HTML"
1292
- };
1293
- } catch (error) {
1294
- return {
1295
- isValid: false,
1296
- reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
1297
- };
1298
- }
1848
+ const document = (0, parse5.parse)(trimmed, { sourceCodeLocationInfo: true });
1849
+ const inputLowercase = trimmed.toLowerCase();
1850
+ const body = findFirstElement(document, (element) => element.tagName === "body");
1851
+ if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
1852
+ isValid: false,
1853
+ reason: "Output must be wrapped in HTML tags"
1854
+ };
1855
+ if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
1856
+ isValid: false,
1857
+ reason: "Output does not contain recognized HTML elements"
1858
+ };
1859
+ return {
1860
+ isValid: true,
1861
+ reason: "Output is valid HTML"
1862
+ };
1299
1863
  }
1300
1864
  const handleContainsHtml = ({ assertion, outputString, inverse }) => {
1301
1865
  const pass = containsHtml(outputString) !== inverse;
@@ -1460,7 +2024,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
1460
2024
  let result;
1461
2025
  if (typeof valueFromScript === "undefined") {
1462
2026
  const functionBody = renderedValue.includes("\n") ? renderedValue : buildFunctionBody(renderedValue);
1463
- result = await validateResult(new Function("output", "context", "process", functionBody)(output, assertionValueContext, require_transform.getProcessShim()));
2027
+ result = await validateResult(new Function("output", "context", "process", functionBody)(output, assertionValueContext, require_transform$1.getProcessShim()));
1464
2028
  } else {
1465
2029
  require_invariant.invariant(typeof valueFromScript === "boolean" || typeof valueFromScript === "number" || typeof valueFromScript === "object", `Javascript assertion script must return a boolean, number, or object (${assertion.value})`);
1466
2030
  result = await validateResult(valueFromScript);
@@ -1667,7 +2231,7 @@ const handleModeration = async ({ assertion, test, outputString, providerRespons
1667
2231
  const parsedPrompt = require_fetch.parseChatPrompt(promptToModerate, null);
1668
2232
  if (parsedPrompt && parsedPrompt.length > 0) promptToModerate = getLastModerationPrompt(parsedPrompt) ?? promptToModerate;
1669
2233
  } catch {}
1670
- const moderationResult = await require_graders.matchesModeration({
2234
+ const moderationResult = await matchesModeration({
1671
2235
  userPrompt: promptToModerate,
1672
2236
  assistantResponse: outputString,
1673
2237
  categories: Array.isArray(assertion.value) ? assertion.value : []
@@ -2400,11 +2964,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
2400
2964
  const rougeMethod = js_rouge[baseType[baseType.length - 1]];
2401
2965
  const score = rougeMethod(outputString, renderedValue, {});
2402
2966
  const threshold = assertion.threshold ?? .75;
2403
- const pass = score >= threshold != inverse;
2404
2967
  return {
2405
- pass,
2968
+ pass: score >= threshold !== inverse,
2406
2969
  score: inverse ? 1 - score : score,
2407
- reason: pass ? `${baseType.toUpperCase()} score ${score.toFixed(2)} is greater than or equal to threshold ${threshold}` : `${baseType.toUpperCase()} score ${score.toFixed(2)} is less than threshold ${threshold}`,
2970
+ reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
2408
2971
  assertion
2409
2972
  };
2410
2973
  }
@@ -2466,10 +3029,196 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
2466
3029
  }
2467
3030
  };
2468
3031
  //#endregion
3032
+ //#region src/providers/webSearchUtils.ts
3033
+ function hasTool(provider, predicate) {
3034
+ return Array.isArray(provider.config?.tools) && provider.config.tools.some(predicate);
3035
+ }
3036
+ function getProviderId(provider) {
3037
+ if (typeof provider.id !== "function") return null;
3038
+ try {
3039
+ return provider.id();
3040
+ } catch (err) {
3041
+ require_logger.logger.debug(`Failed to read provider id: ${err}`);
3042
+ return null;
3043
+ }
3044
+ }
3045
+ function isOpenAiResponsesProvider(provider, id) {
3046
+ return id.includes("openai:responses") || provider.constructor?.name === "OpenAiResponsesProvider";
3047
+ }
3048
+ /**
3049
+ * Check if a provider has web search capabilities
3050
+ * @param provider The provider to check
3051
+ * @returns true if the provider supports web search
3052
+ */
3053
+ function hasWebSearchCapability(provider) {
3054
+ if (!provider) return false;
3055
+ const id = getProviderId(provider);
3056
+ if (!id) return false;
3057
+ if (id.includes("perplexity")) return true;
3058
+ if ((id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0)) return true;
3059
+ if (id.includes("xai") && provider.config?.search_parameters?.mode === "on") return true;
3060
+ if (isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview")) return true;
3061
+ if (id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true)) return true;
3062
+ if (id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")) return true;
3063
+ return false;
3064
+ }
3065
+ /**
3066
+ * Load a provider with web search capabilities.
3067
+ * Tries multiple providers in order of preference until one succeeds.
3068
+ * Uses the latest and most capable models from each provider with specific checkpoint IDs.
3069
+ *
3070
+ * @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
3071
+ * @returns A provider with web search capabilities or null
3072
+ */
3073
+ async function loadWebSearchProvider(preferAnthropic = false) {
3074
+ const loadAnthropicWebSearch = async () => {
3075
+ try {
3076
+ return await require_providers.loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
3077
+ type: "web_search_20250305",
3078
+ name: "web_search",
3079
+ max_uses: 5
3080
+ }] } } });
3081
+ } catch (err) {
3082
+ require_logger.logger.debug(`Failed to load Anthropic web search provider: ${err}`);
3083
+ return null;
3084
+ }
3085
+ };
3086
+ const loadOpenAIWebSearch = async () => {
3087
+ try {
3088
+ return await require_providers.loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } });
3089
+ } catch (err) {
3090
+ require_logger.logger.debug(`Failed to load OpenAI web search provider: ${err}`);
3091
+ return null;
3092
+ }
3093
+ };
3094
+ const loadPerplexity = async () => {
3095
+ try {
3096
+ return await require_providers.loadApiProvider("perplexity:sonar-pro");
3097
+ } catch (err) {
3098
+ require_logger.logger.debug(`Failed to load Perplexity provider: ${err}`);
3099
+ return null;
3100
+ }
3101
+ };
3102
+ const loadGoogleWebSearch = async () => {
3103
+ try {
3104
+ return await require_providers.loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
3105
+ } catch (err) {
3106
+ require_logger.logger.debug(`Failed to load Google web search provider: ${err}`);
3107
+ return null;
3108
+ }
3109
+ };
3110
+ const loadVertexWebSearch = async () => {
3111
+ try {
3112
+ return await require_providers.loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
3113
+ } catch (err) {
3114
+ require_logger.logger.debug(`Failed to load Vertex web search provider: ${err}`);
3115
+ return null;
3116
+ }
3117
+ };
3118
+ const loadXaiWebSearch = async () => {
3119
+ try {
3120
+ return await require_providers.loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } });
3121
+ } catch (err) {
3122
+ require_logger.logger.debug(`Failed to load xAI web search provider: ${err}`);
3123
+ return null;
3124
+ }
3125
+ };
3126
+ const providers = preferAnthropic ? [
3127
+ loadAnthropicWebSearch,
3128
+ loadOpenAIWebSearch,
3129
+ loadPerplexity,
3130
+ loadGoogleWebSearch,
3131
+ loadVertexWebSearch,
3132
+ loadXaiWebSearch
3133
+ ] : [
3134
+ loadOpenAIWebSearch,
3135
+ loadAnthropicWebSearch,
3136
+ loadPerplexity,
3137
+ loadGoogleWebSearch,
3138
+ loadVertexWebSearch,
3139
+ loadXaiWebSearch
3140
+ ];
3141
+ for (const getProvider of providers) {
3142
+ const provider = await getProvider();
3143
+ if (provider && hasWebSearchCapability(provider)) {
3144
+ require_logger.logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
3145
+ return provider;
3146
+ }
3147
+ if (provider) require_logger.logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
3148
+ }
3149
+ return null;
3150
+ }
3151
+ //#endregion
3152
+ //#region src/matchers/search.ts
3153
+ async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
3154
+ if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
3155
+ const defaultProviders = await require_graders.getDefaultProviders();
3156
+ const defaultSearchProviders = [
3157
+ defaultProviders.webSearchProvider,
3158
+ defaultProviders.llmRubricProvider,
3159
+ defaultProviders.gradingProvider
3160
+ ];
3161
+ let searchProvider = (grading.provider ? await require_graders.getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
3162
+ if (!hasWebSearchCapability(searchProvider)) {
3163
+ const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
3164
+ if (webSearchDefault) searchProvider = webSearchDefault;
3165
+ }
3166
+ if (!hasWebSearchCapability(searchProvider)) {
3167
+ const webSearchProvider = await loadWebSearchProvider(true);
3168
+ if (webSearchProvider) searchProvider = webSearchProvider;
3169
+ }
3170
+ if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${require_graders.DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);
3171
+ const prompt = await require_graders.renderLlmRubricPrompt(await require_graders.loadRubricPrompt(grading?.rubricPrompt, require_graders.DEFAULT_WEB_SEARCH_PROMPT), {
3172
+ output: require_graders.tryParse(llmOutput),
3173
+ rubric,
3174
+ ...vars || {}
3175
+ });
3176
+ const resp = await require_graders.callProviderWithContext(searchProvider, prompt, "search-rubric", {
3177
+ output: require_graders.tryParse(llmOutput),
3178
+ rubric,
3179
+ ...vars || {}
3180
+ }, providerCallContext);
3181
+ if (resp.error || !resp.output) return {
3182
+ pass: false,
3183
+ score: 0,
3184
+ reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
3185
+ tokensUsed: resp.tokenUsage,
3186
+ assertion
3187
+ };
3188
+ try {
3189
+ const result = require_logger.extractFirstJsonObject(String(resp.output));
3190
+ let pass = result.pass ?? false;
3191
+ const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
3192
+ if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
3193
+ return {
3194
+ pass,
3195
+ score,
3196
+ reason: result.reason || "No reason provided",
3197
+ tokensUsed: resp.tokenUsage,
3198
+ assertion,
3199
+ metadata: {
3200
+ searchResults: result.searchResults || [],
3201
+ searchProvider: searchProvider.id()
3202
+ }
3203
+ };
3204
+ } catch (err) {
3205
+ require_logger.logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${err.message}`);
3206
+ const outputLower = String(resp.output).toLowerCase();
3207
+ const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
3208
+ return {
3209
+ pass,
3210
+ score: pass ? 1 : 0,
3211
+ reason: resp.output,
3212
+ tokensUsed: resp.tokenUsage,
3213
+ assertion
3214
+ };
3215
+ }
3216
+ }
3217
+ //#endregion
2469
3218
  //#region src/assertions/searchRubric.ts
2470
3219
  async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
2471
3220
  if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
2472
- const result = await require_graders.matchesSearchRubric(String(renderedValue), providerResponse.output, test.options, test.vars, assertion, provider, providerCallContext);
3221
+ const result = await matchesSearchRubric(String(renderedValue), providerResponse.output, test.options, test.vars, assertion, provider, providerCallContext);
2473
3222
  if (inverse) {
2474
3223
  result.pass = !result.pass;
2475
3224
  result.reason = result.pass ? `Output does not require web search verification: ${result.reason}` : `Output requires web search verification: ${result.reason}`;
@@ -2500,7 +3249,7 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
2500
3249
  if (Array.isArray(renderedValue)) {
2501
3250
  let minScore = Number.POSITIVE_INFINITY;
2502
3251
  for (const value of renderedValue) {
2503
- const result = await require_graders.matchesSimilarity(value, outputString, threshold, inverse, test.options, metric);
3252
+ const result = await matchesSimilarity(value, outputString, threshold, inverse, test.options, metric);
2504
3253
  if (result.pass) return {
2505
3254
  assertion,
2506
3255
  ...result
@@ -2515,7 +3264,7 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
2515
3264
  };
2516
3265
  } else return {
2517
3266
  assertion,
2518
- ...await require_graders.matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
3267
+ ...await matchesSimilarity(renderedValue, outputString, threshold, inverse, test.options, metric)
2519
3268
  };
2520
3269
  };
2521
3270
  //#endregion
@@ -3551,7 +4300,7 @@ const ASSERTION_HANDLERS = {
3551
4300
  "llm-rubric": handleLlmRubric,
3552
4301
  meteor: async (params) => {
3553
4302
  try {
3554
- const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-Co1VQ1u5.cjs"));
4303
+ const { handleMeteorAssertion } = await Promise.resolve().then(() => require("./meteor-BBGcGeCa.cjs"));
3555
4304
  return handleMeteorAssertion(params);
3556
4305
  } catch (error) {
3557
4306
  if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
@@ -3633,7 +4382,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
3633
4382
  const { cost, logProbs, output: originalOutput } = providerResponse;
3634
4383
  let output = originalOutput;
3635
4384
  require_invariant.invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
3636
- if (assertion.transform) output = await require_transform.transform(assertion.transform, output, {
4385
+ if (assertion.transform) output = await require_transform$1.transform(assertion.transform, output, {
3637
4386
  vars: resolvedVars,
3638
4387
  prompt: { label: prompt },
3639
4388
  ...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
@@ -3687,7 +4436,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
3687
4436
  };
3688
4437
  }
3689
4438
  else if (filePath.endsWith(".rb")) try {
3690
- const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-DUbq4tff.cjs"));
4439
+ const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-CO-tuszQ.cjs"));
3691
4440
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
3692
4441
  require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
3693
4442
  } catch (error) {
@@ -3840,7 +4589,7 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
3840
4589
  async function runCompareAssertion(test, assertion, outputs, context) {
3841
4590
  require_invariant.invariant(typeof assertion.value === "string", "select-best must have a string value");
3842
4591
  test = require_graders.getFinalTest(test, assertion);
3843
- return (await require_graders.matchesSelectBest(assertion.value, outputs, test.options, test.vars, context)).map((result) => ({
4592
+ return (await matchesSelectBest(assertion.value, outputs, test.options, test.vars, context)).map((result) => ({
3844
4593
  ...result,
3845
4594
  assertion
3846
4595
  }));
@@ -3857,17 +4606,17 @@ async function readAssertions(filePath) {
3857
4606
  var assertions_default = {
3858
4607
  runAssertion,
3859
4608
  runAssertions,
3860
- matchesSimilarity: require_graders.matchesSimilarity,
3861
- matchesClassification: require_graders.matchesClassification,
4609
+ matchesSimilarity,
4610
+ matchesClassification,
3862
4611
  matchesLlmRubric: require_graders.matchesLlmRubric,
3863
4612
  matchesFactuality: require_graders.matchesFactuality,
3864
4613
  matchesClosedQa: require_graders.matchesClosedQa,
3865
- matchesAnswerRelevance: require_graders.matchesAnswerRelevance,
3866
- matchesContextRecall: require_graders.matchesContextRecall,
3867
- matchesContextRelevance: require_graders.matchesContextRelevance,
3868
- matchesContextFaithfulness: require_graders.matchesContextFaithfulness,
3869
- matchesComparisonBoolean: require_graders.matchesSelectBest,
3870
- matchesModeration: require_graders.matchesModeration,
4614
+ matchesAnswerRelevance,
4615
+ matchesContextRecall,
4616
+ matchesContextRelevance,
4617
+ matchesContextFaithfulness,
4618
+ matchesComparisonBoolean: matchesSelectBest,
4619
+ matchesModeration,
3871
4620
  matchesConversationRelevance
3872
4621
  };
3873
4622
  //#endregion
@@ -4242,7 +4991,7 @@ function initializeOtel(config) {
4242
4991
  require_logger.logger.debug("[OtelSdk] Registered W3C Trace Context propagator");
4243
4992
  const resource = (0, _opentelemetry_resources.resourceFromAttributes)({
4244
4993
  [_opentelemetry_semantic_conventions.ATTR_SERVICE_NAME]: config.serviceName,
4245
- [_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]: require_fetch.VERSION
4994
+ [_opentelemetry_semantic_conventions.ATTR_SERVICE_VERSION]: require_version.VERSION
4246
4995
  });
4247
4996
  const spanProcessors = [];
4248
4997
  if (config.localExport) {
@@ -4960,13 +5709,13 @@ async function gradeRunEvalResponse({ abortSignal, deferGrading, evalId, latency
4960
5709
  }
4961
5710
  async function transformRunEvalResponse({ evalId, prompt, promptIdx, provider, response, test, testIdx, vars }) {
4962
5711
  const processedResponse = { ...response };
4963
- if (provider.transform) processedResponse.output = await require_transform.transform(provider.transform, processedResponse.output, {
5712
+ if (provider.transform) processedResponse.output = await require_transform$1.transform(provider.transform, processedResponse.output, {
4964
5713
  vars,
4965
5714
  prompt
4966
5715
  });
4967
5716
  const providerTransformedOutput = processedResponse.output;
4968
5717
  const testTransform = test.options?.transform || test.options?.postprocess;
4969
- if (testTransform) processedResponse.output = await require_transform.transform(testTransform, processedResponse.output, {
5718
+ if (testTransform) processedResponse.output = await require_transform$1.transform(testTransform, processedResponse.output, {
4970
5719
  vars,
4971
5720
  prompt,
4972
5721
  ...response && response.metadata && { metadata: response.metadata }
@@ -5418,10 +6167,10 @@ async function prepareTestVariables(tests, testSuite) {
5418
6167
  async function applyInputTransform(testCase, inputTransformDefault) {
5419
6168
  const inputTransform = testCase.options?.transformVars || inputTransformDefault;
5420
6169
  if (!inputTransform) return;
5421
- const transformedVars = await require_transform.transform(inputTransform, testCase.vars, {
6170
+ const transformedVars = await require_transform$1.transform(inputTransform, testCase.vars, {
5422
6171
  prompt: {},
5423
6172
  uuid: crypto.randomUUID()
5424
- }, true, require_transform.TransformInputType.VARS);
6173
+ }, true, require_transform$1.TransformInputType.VARS);
5425
6174
  require_invariant.invariant(typeof transformedVars === "object", "Transform function did not return a valid object");
5426
6175
  testCase.vars = {
5427
6176
  ...testCase.vars,
@@ -5485,7 +6234,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
5485
6234
  const defaultProvider = defaultTest.provider;
5486
6235
  if (require_types.isApiProvider(defaultProvider)) return defaultProvider;
5487
6236
  if (typeof defaultProvider === "object" && defaultProvider.id) {
5488
- const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-DV3ax9e_.cjs"));
6237
+ const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-C7lNVBjX.cjs"));
5489
6238
  return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
5490
6239
  }
5491
6240
  return defaultProvider;
@@ -5645,7 +6394,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
5645
6394
  async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
5646
6395
  if (!require_logger.state.resume || !evalRecord.persisted) return;
5647
6396
  try {
5648
- const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-Dvc-iucu.cjs"));
6397
+ const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-BBJAHAtw.cjs"));
5649
6398
  const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: require_logger.state.retryMode });
5650
6399
  const originalCount = runEvalOptions.length;
5651
6400
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -6104,9 +6853,8 @@ var Evaluator = class {
6104
6853
  context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
6105
6854
  }
6106
6855
  async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
6107
- let flushGroupedRows;
6108
6856
  try {
6109
- if (shouldGroupGradingByProvider) flushGroupedRows = await this.runGroupedEvalSteps({
6857
+ if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
6110
6858
  checkAbort,
6111
6859
  evalStepIndexMap,
6112
6860
  groupedRunEvalOptions,
@@ -6138,7 +6886,6 @@ var Evaluator = class {
6138
6886
  cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
6139
6887
  throw err;
6140
6888
  }
6141
- await flushGroupedRows?.();
6142
6889
  if (isEvalTimedOut()) require_logger.logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
6143
6890
  else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
6144
6891
  ciProgressReporter,
@@ -6363,7 +7110,7 @@ var Evaluator = class {
6363
7110
  }
6364
7111
  const maxScoreAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === "max-score");
6365
7112
  if (!maxScoreAssertion) return;
6366
- const maxScoreGradingResults = await require_graders.selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
7113
+ const maxScoreGradingResults = await selectMaxScore(resultsToCompare.map((r) => r.response?.output || ""), resultsToCompare, maxScoreAssertion);
6367
7114
  updateComparisonReporterProgress({
6368
7115
  ciProgressReporter,
6369
7116
  compareCount,
@@ -8434,47 +9181,11 @@ function filterPrompts(prompts, filterPromptsOption) {
8434
9181
  //#endregion
8435
9182
  //#region src/commands/eval/filterProviders.ts
8436
9183
  /**
8437
- * Checks if a value is a valid provider ID (non-empty string).
8438
- */
8439
- function isValidProviderId(id) {
8440
- return id !== null && id !== void 0 && typeof id === "string" && id !== "";
8441
- }
8442
- /**
8443
9184
  * Extracts the id and label from a raw provider config without instantiating it.
8444
9185
  * Handles all provider config formats: string, function, ProviderOptions, ProviderOptionsMap.
8445
9186
  */
8446
9187
  function getProviderIdAndLabel(provider, index) {
8447
- if (typeof provider === "string") return { id: provider };
8448
- if (typeof provider === "function") {
8449
- const label = provider.label;
8450
- return {
8451
- id: label ?? `custom-function-${index}`,
8452
- label
8453
- };
8454
- }
8455
- const providerId = provider.id;
8456
- if ("id" in provider && isValidProviderId(providerId)) return {
8457
- id: providerId,
8458
- label: provider.label
8459
- };
8460
- const keys = Object.keys(provider);
8461
- if (keys.length > 0) {
8462
- const id = keys[0];
8463
- const value = provider[id];
8464
- if (typeof value === "object" && value !== null) return {
8465
- id: value.id || id,
8466
- label: value.label
8467
- };
8468
- }
8469
- const label = provider.label;
8470
- if (isValidProviderId(label)) return {
8471
- id: label,
8472
- label
8473
- };
8474
- return {
8475
- id: `unknown-${index}`,
8476
- label
8477
- };
9188
+ return require_util.normalizeProviderRef(provider, { index });
8478
9189
  }
8479
9190
  /**
8480
9191
  * Filters raw provider configs BEFORE instantiation.
@@ -10227,7 +10938,7 @@ async function fetchRemoteGeneration(task, prompts) {
10227
10938
  const body = {
10228
10939
  task,
10229
10940
  prompts,
10230
- version: require_fetch.VERSION,
10941
+ version: require_version.VERSION,
10231
10942
  email: require_accounts.getUserEmail()
10232
10943
  };
10233
10944
  const response = await require_cache.fetchWithCache(require_server.getRemoteGenerationUrl(), {
@@ -11179,9 +11890,10 @@ function dedupeTestCases(testCases) {
11179
11890
  return deduped;
11180
11891
  }
11181
11892
  function buildMaxCharsRetryInstructions(rejectedPromptLengths, limit) {
11893
+ const longestRejectedPromptText = rejectedPromptLengths.length > 0 ? `${Math.max(...rejectedPromptLengths)} characters` : "unknown length";
11182
11894
  return dedent.default`
11183
11895
  Your previous response included ${rejectedPromptLengths.length} generated prompt${rejectedPromptLengths.length === 1 ? "" : "s"} that exceeded the ${limit ?? "configured"}-character limit.
11184
- The longest rejected prompt was ${Math.max(...rejectedPromptLengths)} characters.
11896
+ The longest rejected prompt was ${longestRejectedPromptText}.
11185
11897
  Generate replacement prompts only, and keep every user message within the character limit.
11186
11898
  `.trim();
11187
11899
  }
@@ -11238,7 +11950,7 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
11238
11950
  n,
11239
11951
  purpose,
11240
11952
  task: key,
11241
- version: require_fetch.VERSION,
11953
+ version: require_version.VERSION,
11242
11954
  email: require_accounts.getUserEmail()
11243
11955
  });
11244
11956
  try {
@@ -12317,7 +13029,7 @@ function handleFailedPlugins(failedPlugins, strict) {
12317
13029
  }
12318
13030
  function getConfigHash(configPath) {
12319
13031
  const content = fs.readFileSync(configPath, "utf8");
12320
- return (0, crypto$1.createHash)("md5").update(`${require_fetch.VERSION}:${content}`).digest("hex");
13032
+ return (0, crypto$1.createHash)("md5").update(`${require_version.VERSION}:${content}`).digest("hex");
12321
13033
  }
12322
13034
  function createHeaderComments({ title, timestampLabel, author, cloudHost, testCasesCount, plugins, strategies, isUpdate = false }) {
12323
13035
  const sectionLabel = isUpdate ? "Changes:" : "Test Configuration:";
@@ -13230,7 +13942,7 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
13230
13942
  for (const row of evaluateTable.body.slice(0, maxRows)) table.push([...row.vars.map((v) => require_text.ellipsize(v, tableCellMaxLength)), ...row.outputs.map(({ pass, text, failureReason: failureType }) => {
13231
13943
  text = require_text.ellipsize(text, tableCellMaxLength);
13232
13944
  if (pass) return chalk.default.green("[PASS] ") + text;
13233
- else return chalk.default.red(failureType === require_types.ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.default.red.bold(c) : c).join("---");
13945
+ return chalk.default.red(failureType === require_types.ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.default.red.bold(c) : c).join("---");
13234
13946
  })]);
13235
13947
  return table.toString();
13236
13948
  }
@@ -13321,6 +14033,115 @@ function formatDuration(seconds) {
13321
14033
  }
13322
14034
  //#endregion
13323
14035
  //#region src/commands/eval/summary.ts
14036
+ function getCompletionMessage({ completionType, evalId, shareableUrl, wasAborted, writeToDatabase, activelySharing }) {
14037
+ if (wasAborted) {
14038
+ const idSuffix = writeToDatabase ? ` (ID: ${chalk.default.cyan(evalId)})` : "";
14039
+ return `${chalk.default.red("✗")} ${completionType} aborted${idSuffix}`;
14040
+ }
14041
+ if (writeToDatabase && shareableUrl) return `${chalk.default.green("✓")} ${completionType} complete: ${shareableUrl}`;
14042
+ if (writeToDatabase && activelySharing) return `${chalk.default.green("✓")} ${completionType} complete`;
14043
+ if (writeToDatabase) return `${chalk.default.green("✓")} ${completionType} complete (ID: ${chalk.default.cyan(evalId)})`;
14044
+ return `${chalk.default.green("✓")} ${completionType} complete`;
14045
+ }
14046
+ function getAbortSummaryLines(targetErrorStatus) {
14047
+ if (targetErrorStatus == null) return [];
14048
+ return [
14049
+ "",
14050
+ chalk.default.red.bold("Scan stopped: Target is unavailable and will not recover on retry."),
14051
+ chalk.default.red(` Target returned HTTP ${targetErrorStatus}`),
14052
+ "",
14053
+ chalk.default.yellow("Possible causes:"),
14054
+ chalk.default.yellow(" • Invalid API key or authentication (401/403)"),
14055
+ chalk.default.yellow(" • Target endpoint does not exist (404)"),
14056
+ chalk.default.yellow(" • Server does not support the request (501)"),
14057
+ "",
14058
+ chalk.default.cyan("To fix: Check your target configuration and credentials.")
14059
+ ];
14060
+ }
14061
+ function getGuidanceLines({ writeToDatabase, shareableUrl, wantsToShare, activelySharing, hasExplicitDisable, cloudEnabled }) {
14062
+ if (!writeToDatabase || shareableUrl || wantsToShare || activelySharing) return [];
14063
+ const lines = ["", `» View results: ${chalk.default.green.bold("promptfoo view")}`];
14064
+ if (!hasExplicitDisable) lines.push(cloudEnabled ? `» Create shareable URL: ${chalk.default.green.bold("promptfoo share")}` : `» Share with your team: ${chalk.default.green.bold("https://promptfoo.app")}`);
14065
+ lines.push(`» Feedback: ${chalk.default.green.bold("https://promptfoo.dev/feedback")}`);
14066
+ return lines;
14067
+ }
14068
+ function buildUsageDetails(usage, total) {
14069
+ const parts = [];
14070
+ if (usage.prompt && usage.prompt > 0) parts.push(`${usage.prompt.toLocaleString()} prompt`);
14071
+ if (usage.completion && usage.completion > 0) parts.push(`${usage.completion.toLocaleString()} completion`);
14072
+ if (usage.cached && usage.cached > 0) parts.push(usage.cached === total && parts.length === 0 ? "cached" : `${usage.cached.toLocaleString()} cached`);
14073
+ if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) parts.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
14074
+ return parts;
14075
+ }
14076
+ function getTokenUsageLines(tokenUsage, isRedteam, tracker) {
14077
+ const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
14078
+ const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
14079
+ if (!hasEvalTokens && !hasGradingTokens) return [];
14080
+ const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
14081
+ const evalTokens = {
14082
+ prompt: tokenUsage.prompt || 0,
14083
+ completion: tokenUsage.completion || 0,
14084
+ total: tokenUsage.total || combinedTotal,
14085
+ cached: tokenUsage.cached || 0,
14086
+ numRequests: tokenUsage.numRequests || 0,
14087
+ completionDetails: tokenUsage.completionDetails || {
14088
+ reasoning: 0,
14089
+ acceptedPrediction: 0,
14090
+ rejectedPrediction: 0
14091
+ }
14092
+ };
14093
+ const lines = [`${chalk.default.bold("Total Tokens:")} ${chalk.default.white.bold((evalTokens.total + (tokenUsage.assertions?.total || 0)).toLocaleString())}`];
14094
+ if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.default.gray("Probes:")} ${chalk.default.white(tokenUsage.numRequests.toLocaleString())}`);
14095
+ if (evalTokens.total > 0) {
14096
+ const evalParts = buildUsageDetails(evalTokens, evalTokens.total);
14097
+ lines.push(` ${chalk.default.gray("Eval:")} ${chalk.default.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
14098
+ }
14099
+ if (tokenUsage.assertions?.total && tokenUsage.assertions.total > 0) {
14100
+ const gradingParts = buildUsageDetails(tokenUsage.assertions, tokenUsage.assertions.total);
14101
+ lines.push(` ${chalk.default.gray("Grading:")} ${chalk.default.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
14102
+ }
14103
+ lines.push(...getProviderUsageLines(tracker));
14104
+ return lines;
14105
+ }
14106
+ function getProviderUsageLines(tracker) {
14107
+ const providerIds = tracker.getProviderIds();
14108
+ if (providerIds.length <= 1) return [];
14109
+ const sortedProviders = providerIds.map((id) => ({
14110
+ id,
14111
+ usage: tracker.getProviderUsage(id)
14112
+ })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
14113
+ const lines = ["", chalk.default.bold("Providers:")];
14114
+ for (const { id, usage } of sortedProviders) {
14115
+ if ((usage.total || 0) === 0 && (usage.prompt || 0) + (usage.completion || 0) === 0) continue;
14116
+ const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
14117
+ const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
14118
+ const details = buildUsageDetails(usage, displayTotal);
14119
+ const requestInfo = `${usage.numRequests || 0} requests`;
14120
+ const separator = details.length > 0 ? "; " : "";
14121
+ lines.push(` ${chalk.default.gray(`${displayId}:`)} ${chalk.default.white(displayTotal.toLocaleString())} (${requestInfo}${separator}${details.join(", ")})`);
14122
+ }
14123
+ return lines;
14124
+ }
14125
+ function formatResultPercentage(count, totalTests) {
14126
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
14127
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
14128
+ }
14129
+ function formatResultLine(count, label, icon, iconColor, totalTests) {
14130
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count, totalTests)})`)}`;
14131
+ }
14132
+ function getResultsLines({ successes, failures, errors, duration, maxConcurrency }) {
14133
+ const totalTests = successes + failures + errors;
14134
+ const errorLabel = errors === 1 ? "error" : "errors";
14135
+ return [
14136
+ "",
14137
+ chalk.default.bold("Results:"),
14138
+ formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green, totalTests),
14139
+ formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red, totalTests),
14140
+ formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red, totalTests),
14141
+ chalk.default.gray(`Duration: ${formatDuration(duration)} (concurrency: ${maxConcurrency})`),
14142
+ ""
14143
+ ];
14144
+ }
13324
14145
  /**
13325
14146
  * Generate formatted evaluation summary output for CLI display.
13326
14147
  *
@@ -13359,115 +14180,28 @@ function formatDuration(seconds) {
13359
14180
  * ```
13360
14181
  */
13361
14182
  function generateEvalSummary(params) {
13362
- const { evalId, isRedteam, writeToDatabase, shareableUrl, wantsToShare, hasExplicitDisable, cloudEnabled, activelySharing = false, tokenUsage, successes, failures, errors, duration, maxConcurrency, tracker, targetErrorStatus } = params;
13363
- const lines = [];
13364
- const completionType = isRedteam ? "Red team" : "Eval";
13365
- const wasAborted = targetErrorStatus != null;
13366
- let completionMessage;
13367
- if (wasAborted) {
13368
- completionMessage = `${chalk.default.red("✗")} ${completionType} aborted`;
13369
- if (writeToDatabase) completionMessage += ` (ID: ${chalk.default.cyan(evalId)})`;
13370
- } else if (writeToDatabase && shareableUrl) completionMessage = `${chalk.default.green("✓")} ${completionType} complete: ${shareableUrl}`;
13371
- else if (writeToDatabase && activelySharing) completionMessage = `${chalk.default.green("✓")} ${completionType} complete`;
13372
- else if (writeToDatabase) completionMessage = `${chalk.default.green("✓")} ${completionType} complete (ID: ${chalk.default.cyan(evalId)})`;
13373
- else completionMessage = `${chalk.default.green("✓")} ${completionType} complete`;
13374
- lines.push(completionMessage);
13375
- if (wasAborted && targetErrorStatus != null) {
13376
- lines.push("");
13377
- lines.push(chalk.default.red.bold("Scan stopped: Target is unavailable and will not recover on retry."));
13378
- lines.push(chalk.default.red(` Target returned HTTP ${targetErrorStatus}`));
13379
- lines.push("");
13380
- lines.push(chalk.default.yellow("Possible causes:"));
13381
- lines.push(chalk.default.yellow(" • Invalid API key or authentication (401/403)"));
13382
- lines.push(chalk.default.yellow(" • Target endpoint does not exist (404)"));
13383
- lines.push(chalk.default.yellow(" • Server does not support the request (501)"));
13384
- lines.push("");
13385
- lines.push(chalk.default.cyan("To fix: Check your target configuration and credentials."));
13386
- }
13387
- if (writeToDatabase && !shareableUrl && !wantsToShare && !activelySharing) {
13388
- lines.push("");
13389
- lines.push(`» View results: ${chalk.default.green.bold("promptfoo view")}`);
13390
- if (!hasExplicitDisable) if (cloudEnabled) lines.push(`» Create shareable URL: ${chalk.default.green.bold("promptfoo share")}`);
13391
- else lines.push(`» Share with your team: ${chalk.default.green.bold("https://promptfoo.app")}`);
13392
- lines.push(`» Feedback: ${chalk.default.green.bold("https://promptfoo.dev/feedback")}`);
13393
- }
13394
- lines.push("");
13395
- const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
13396
- const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
13397
- if (hasEvalTokens || hasGradingTokens) {
13398
- const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
13399
- const evalTokens = {
13400
- prompt: tokenUsage.prompt || 0,
13401
- completion: tokenUsage.completion || 0,
13402
- total: tokenUsage.total || combinedTotal,
13403
- cached: tokenUsage.cached || 0,
13404
- completionDetails: tokenUsage.completionDetails || {
13405
- reasoning: 0,
13406
- acceptedPrediction: 0,
13407
- rejectedPrediction: 0
13408
- }
13409
- };
13410
- const grandTotal = evalTokens.total + (tokenUsage.assertions?.total || 0);
13411
- lines.push(`${chalk.default.bold("Total Tokens:")} ${chalk.default.white.bold(grandTotal.toLocaleString())}`);
13412
- if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.default.gray("Probes:")} ${chalk.default.white(tokenUsage.numRequests.toLocaleString())}`);
13413
- if (evalTokens.total > 0) {
13414
- const evalParts = [];
13415
- if (evalTokens.prompt > 0) evalParts.push(`${evalTokens.prompt.toLocaleString()} prompt`);
13416
- if (evalTokens.completion > 0) evalParts.push(`${evalTokens.completion.toLocaleString()} completion`);
13417
- if (evalTokens.cached > 0) if (evalTokens.cached === evalTokens.total && evalParts.length === 0) evalParts.push("cached");
13418
- else evalParts.push(`${evalTokens.cached.toLocaleString()} cached`);
13419
- if (evalTokens.completionDetails?.reasoning && evalTokens.completionDetails.reasoning > 0) evalParts.push(`${evalTokens.completionDetails.reasoning.toLocaleString()} reasoning`);
13420
- lines.push(` ${chalk.default.gray("Eval:")} ${chalk.default.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
13421
- }
13422
- if (tokenUsage.assertions && tokenUsage.assertions.total && tokenUsage.assertions.total > 0) {
13423
- const gradingParts = [];
13424
- if (tokenUsage.assertions.prompt && tokenUsage.assertions.prompt > 0) gradingParts.push(`${tokenUsage.assertions.prompt.toLocaleString()} prompt`);
13425
- if (tokenUsage.assertions.completion && tokenUsage.assertions.completion > 0) gradingParts.push(`${tokenUsage.assertions.completion.toLocaleString()} completion`);
13426
- if (tokenUsage.assertions.cached && tokenUsage.assertions.cached > 0) if (tokenUsage.assertions.cached === tokenUsage.assertions.total && gradingParts.length === 0) gradingParts.push("cached");
13427
- else gradingParts.push(`${tokenUsage.assertions.cached.toLocaleString()} cached`);
13428
- if (tokenUsage.assertions.completionDetails?.reasoning && tokenUsage.assertions.completionDetails.reasoning > 0) gradingParts.push(`${tokenUsage.assertions.completionDetails.reasoning.toLocaleString()} reasoning`);
13429
- lines.push(` ${chalk.default.gray("Grading:")} ${chalk.default.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
13430
- }
13431
- const providerIds = tracker.getProviderIds();
13432
- if (providerIds.length > 1) {
13433
- lines.push("");
13434
- lines.push(chalk.default.bold("Providers:"));
13435
- const sortedProviders = providerIds.map((id) => ({
13436
- id,
13437
- usage: tracker.getProviderUsage(id)
13438
- })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
13439
- for (const { id, usage } of sortedProviders) if ((usage.total || 0) > 0 || (usage.prompt || 0) + (usage.completion || 0) > 0) {
13440
- const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
13441
- const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
13442
- const details = [];
13443
- if (usage.prompt && usage.prompt > 0) details.push(`${usage.prompt.toLocaleString()} prompt`);
13444
- if (usage.completion && usage.completion > 0) details.push(`${usage.completion.toLocaleString()} completion`);
13445
- if (usage.cached && usage.cached > 0) if (usage.cached === displayTotal && details.length === 0) details.push("cached");
13446
- else details.push(`${usage.cached.toLocaleString()} cached`);
13447
- if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) details.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
13448
- const breakdown = ` (${`${usage.numRequests || 0} requests`}${details.length > 0 ? "; " : ""}${details.join(", ")})`;
13449
- lines.push(` ${chalk.default.gray(displayId + ":")} ${chalk.default.white(displayTotal.toLocaleString())}${breakdown}`);
13450
- }
13451
- }
13452
- }
13453
- lines.push("");
13454
- const totalTests = successes + failures + errors;
13455
- const formatResultPercentage = (count) => {
13456
- const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
13457
- return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
13458
- };
13459
- const formatResultLine = (count, label, icon, iconColor) => {
13460
- return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count)})`)}`;
13461
- };
13462
- const errorLabel = errors === 1 ? "error" : "errors";
13463
- lines.push(chalk.default.bold("Results:"));
13464
- lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green));
13465
- lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red));
13466
- lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red));
13467
- const durationDisplay = formatDuration(duration);
13468
- lines.push(chalk.default.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
13469
- lines.push("");
13470
- return lines;
14183
+ return [
14184
+ getCompletionMessage({
14185
+ completionType: params.isRedteam ? "Red team" : "Eval",
14186
+ evalId: params.evalId,
14187
+ shareableUrl: params.shareableUrl,
14188
+ wasAborted: params.targetErrorStatus != null,
14189
+ writeToDatabase: params.writeToDatabase,
14190
+ activelySharing: params.activelySharing ?? false
14191
+ }),
14192
+ ...getAbortSummaryLines(params.targetErrorStatus),
14193
+ ...getGuidanceLines({
14194
+ writeToDatabase: params.writeToDatabase,
14195
+ shareableUrl: params.shareableUrl,
14196
+ wantsToShare: params.wantsToShare,
14197
+ activelySharing: params.activelySharing ?? false,
14198
+ hasExplicitDisable: params.hasExplicitDisable,
14199
+ cloudEnabled: params.cloudEnabled
14200
+ }),
14201
+ "",
14202
+ ...getTokenUsageLines(params.tokenUsage, params.isRedteam, params.tracker),
14203
+ ...getResultsLines(params)
14204
+ ];
13471
14205
  }
13472
14206
  //#endregion
13473
14207
  //#region src/commands/retry.ts
@@ -14237,6 +14971,26 @@ async function doRedteamRun(options) {
14237
14971
  }
14238
14972
  //#endregion
14239
14973
  //#region src/index.ts
14974
+ /**
14975
+ * Shallow-clone a test case so the caller can swap in resolved ApiProvider
14976
+ * instances on `options.provider` / `assert[].provider` without leaking those
14977
+ * mutations back to the input. The input may alias the unified config written
14978
+ * to the Eval record, and a live SDK client (e.g. Bedrock's BedrockRuntime,
14979
+ * Anthropic's client) holds circular references that break drizzle's JSON
14980
+ * serialization on `evalRecord.save()`. Fixes #8687.
14981
+ *
14982
+ * Detaches only `options` and `assert[]`. Other reference fields (`provider`,
14983
+ * `vars`, `metadata`, `providerOutput`) remain aliased — callers must reassign
14984
+ * those by reference rather than mutating in place. `assert-set` children are
14985
+ * not deep-cloned because the resolve loop skips `assert-set`; if that ever
14986
+ * changes, extend this helper.
14987
+ */
14988
+ function cloneTestForResolve(test) {
14989
+ const cloned = { ...test };
14990
+ if (test.options) cloned.options = { ...test.options };
14991
+ if (test.assert) cloned.assert = test.assert.map((assertion) => ({ ...assertion }));
14992
+ return cloned;
14993
+ }
14240
14994
  async function evaluate(testSuite, options = {}) {
14241
14995
  if (testSuite.writeLatestResults) await runDbMigrations();
14242
14996
  const loadedProviders = await require_providers.loadApiProviders(testSuite.providers, { env: testSuite.env });
@@ -14256,22 +15010,24 @@ async function evaluate(testSuite, options = {}) {
14256
15010
  nunjucksFilters: await require_util.readFilters(testSuite.nunjucksFilters || {}),
14257
15011
  prompts: await require_graders.processPrompts(testSuite.prompts)
14258
15012
  };
14259
- if (typeof constructedTestSuite.defaultTest === "object") {
14260
- if (constructedTestSuite.defaultTest?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
15013
+ if (typeof constructedTestSuite.defaultTest === "object" && constructedTestSuite.defaultTest) {
15014
+ constructedTestSuite.defaultTest = cloneTestForResolve(constructedTestSuite.defaultTest);
15015
+ if (constructedTestSuite.defaultTest.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
14261
15016
  env: testSuite.env,
14262
15017
  basePath: require_logger.state.basePath
14263
15018
  });
14264
- if (constructedTestSuite.defaultTest?.options?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
15019
+ if (constructedTestSuite.defaultTest.options?.provider && !require_types.isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await require_providers.resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
14265
15020
  env: testSuite.env,
14266
15021
  basePath: require_logger.state.basePath
14267
15022
  });
14268
15023
  }
14269
- for (const test of constructedTestSuite.tests || []) {
15024
+ constructedTestSuite.tests = (constructedTestSuite.tests || []).map(cloneTestForResolve);
15025
+ for (const test of constructedTestSuite.tests) {
14270
15026
  if (test.options?.provider && !require_types.isApiProvider(test.options.provider)) test.options.provider = await require_providers.resolveProvider(test.options.provider, providerMap, {
14271
15027
  env: testSuite.env,
14272
15028
  basePath: require_logger.state.basePath
14273
15029
  });
14274
- if (test.assert) for (const assertion of test.assert) {
15030
+ for (const assertion of test.assert || []) {
14275
15031
  if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
14276
15032
  if (assertion.provider && !require_types.isApiProvider(assertion.provider)) assertion.provider = await require_providers.resolveProvider(assertion.provider, providerMap, {
14277
15033
  env: testSuite.env,