promptfoo 0.121.4 → 0.121.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (346) hide show
  1. package/dist/src/{ListApp-DQkFNqE9.js → ListApp-BRUsT43Y.js} +1 -1
  2. package/dist/src/{accounts-Dy17bs4D.cjs → accounts-BIFntVWB.cjs} +4 -4
  3. package/dist/src/{accounts-F9d_5sMC.js → accounts-CLJHCDDb.js} +6 -6
  4. package/dist/src/{accounts-DhMYUUbu.js → accounts-CaLNYnf7.js} +4 -4
  5. package/dist/src/{accounts-DdJ2pHMI.js → accounts-bnyHT7Ju.js} +5 -5
  6. package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-B5krlibj.js} +3 -3
  7. package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-Ba67xmgs.js} +2 -2
  8. package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-BclbiXiq.js} +3 -3
  9. package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-D2x0wGhB.cjs} +2 -2
  10. package/dist/src/{agents-CgaMXvLM.js → agents-BGqaTDnr.js} +5 -5
  11. package/dist/src/{agents-8FDnTriG.js → agents-BV9yFpXX.js} +5 -5
  12. package/dist/src/{agents-aYPQLf8W.js → agents-BYdMl1UE.js} +4 -4
  13. package/dist/src/{agents-pQeBEXMm.js → agents-DhxWMCtH.js} +5 -5
  14. package/dist/src/{agents-D7-HGxUj.cjs → agents-DiWmQYH9.cjs} +4 -4
  15. package/dist/src/{agents-BahDpe5G.cjs → agents-WULPVjbH.cjs} +4 -4
  16. package/dist/src/{agents-DJ35I3Nt.js → agents-emVcx3yh.js} +5 -5
  17. package/dist/src/{agents-C-R_jfzI.js → agents-n6vPqV3i.js} +4 -4
  18. package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-BxqK9HF_.js} +7 -7
  19. package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-BzLjZI_m.cjs} +6 -6
  20. package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-DR4pgeiC.js} +6 -6
  21. package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-uPGp0Zdo.js} +7 -7
  22. package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
  23. package/dist/src/app/assets/Report-vjzrbgce.js +1 -0
  24. package/dist/src/app/assets/index-B3NQ8HTd.js +385 -0
  25. package/dist/src/app/assets/{index-BXGkeMwh.css → index-Cli2yAXv.css} +1 -1
  26. package/dist/src/app/index.html +27 -2
  27. package/dist/src/{audio-DcVKoInv.js → audio-BvpTOArF.js} +4 -4
  28. package/dist/src/{audio-BQtNuYBj.cjs → audio-C0vDeS0j.cjs} +3 -3
  29. package/dist/src/{audio-B7izf48x.js → audio-CScmnmEB.js} +4 -4
  30. package/dist/src/{audio-COrn8rM6.js → audio-Da8U9IS5.js} +3 -3
  31. package/dist/src/{base-fZ9wgg50.js → base-BOMaNEes.js} +3 -3
  32. package/dist/src/{base-PYJvBE1i.js → base-BTux96b1.js} +2 -2
  33. package/dist/src/{base-D-670DX8.cjs → base-Tw6uhH8K.cjs} +2 -2
  34. package/dist/src/{base-yrI1Yal4.js → base-dYsl2hmL.js} +3 -3
  35. package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B95F_7vE.cjs} +2 -2
  36. package/dist/src/{blobs-C-F78Kfn.js → blobs-BW4U31ue.js} +2 -2
  37. package/dist/src/{blobs-BCZavS8s.js → blobs-D_gg8nbm.js} +3 -3
  38. package/dist/src/{blobs-BQWqnnvL.js → blobs-DjLby-uP.js} +3 -3
  39. package/dist/src/{cache-mb7c8hbp.js → cache-BI5BY7ey.js} +4 -4
  40. package/dist/src/{cache-DbLsVWB2.cjs → cache-BRkhlH3k.cjs} +1 -1
  41. package/dist/src/cache-BlC6aeJ0.js +3 -0
  42. package/dist/src/{cache-D5NZmMiT.js → cache-Bzttsk0X.js} +2 -2
  43. package/dist/src/{cache-C4Xb-hNb.js → cache-Cr-qWIbP.js} +3 -3
  44. package/dist/src/{cache-BIyPcp5v.cjs → cache-DGg-yTZG.cjs} +2 -2
  45. package/dist/src/{chat-Dr3DUQ0D.js → chat-BLOdH60v.js} +12 -12
  46. package/dist/src/{chat-BfPaS15_.js → chat-Cx_LkwvZ.js} +12 -12
  47. package/dist/src/{chat-mW0ORo8G.js → chat-D9nudO9b.js} +4 -4
  48. package/dist/src/{chat-I9izLm49.js → chat-DChSH_Es.js} +12 -12
  49. package/dist/src/{chat-MKxMnZJZ.js → chat-DG2LkwLq.js} +2 -2
  50. package/dist/src/{chat-BPXSW8Bv.cjs → chat-DH97tVV9.cjs} +2 -2
  51. package/dist/src/{chat-0bwXjVP0.js → chat-aMQZw6R7.js} +4 -4
  52. package/dist/src/{chat-CclRbxGf.cjs → chat-vYqqv1gP.cjs} +11 -11
  53. package/dist/src/{chatkit-zUIVoDos.js → chatkit-B8X34dQc.js} +4 -4
  54. package/dist/src/{chatkit-Cv6AhukM.js → chatkit-BXu42Qwt.js} +3 -3
  55. package/dist/src/{chatkit-CJnHRRMM.js → chatkit-CbMRoeYw.js} +4 -4
  56. package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-D44VyUyB.cjs} +3 -3
  57. package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-BRq0bbIK.cjs} +8 -8
  58. package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-BjriSVRZ.js} +7 -7
  59. package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BzNZeZ0N.js} +7 -7
  60. package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-DYv_AJ8u.js} +7 -7
  61. package/dist/src/cloud-CoD5OacT.js +3 -0
  62. package/dist/src/{cloud-DQZ5sVjW.js → cloud-Da0bofJd.js} +3 -3
  63. package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-CXC4b1EU.js} +4 -4
  64. package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-CyBoIs1Q.js} +6 -6
  65. package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-DGOwgexC.js} +6 -6
  66. package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-DJv5qnyb.cjs} +4 -4
  67. package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-1sAoOyft.js} +5 -5
  68. package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-D-dnkzCF.js} +5 -5
  69. package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-DKVjkDav.cjs} +3 -3
  70. package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-TJkVrZlB.js} +3 -3
  71. package/dist/src/codex-app-server-CCLjqCh9.js +1915 -0
  72. package/dist/src/codex-app-server-CCe0TiDc.js +1915 -0
  73. package/dist/src/codex-app-server-CPW1LFwh.js +1916 -0
  74. package/dist/src/codex-app-server-VMRnjZ68.cjs +1920 -0
  75. package/dist/src/codex-sdk-1jm_qPHf.js +3 -0
  76. package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-Bd8UbO9q.cjs} +5 -5
  77. package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-BgEFQ70r.js} +6 -6
  78. package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-Bzb_TqX9.js} +6 -6
  79. package/dist/src/{codex-sdk-CWEnH70W.cjs → codex-sdk-Danroptg.cjs} +1 -1
  80. package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-DfvDTN33.js} +5 -5
  81. package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-B5ImDlSm.js} +8 -8
  82. package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-BgAkuYCw.cjs} +7 -7
  83. package/dist/src/{cometapi-BUlt_ELa.js → cometapi-CC7hWxmX.js} +8 -8
  84. package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CCbpHkuF.js} +7 -7
  85. package/dist/src/{completion-x0a_c2y1.js → completion-2iuYVxwi.js} +6 -6
  86. package/dist/src/{completion-Dnxn7E-j.js → completion-CrD6MQ93.js} +5 -5
  87. package/dist/src/{completion-BozdoXba.cjs → completion-DtQ72Bm3.cjs} +5 -5
  88. package/dist/src/{completion-HUe8wDhZ.js → completion-Vq_ad618.js} +6 -6
  89. package/dist/src/{createHash-ChI45QR1.js → createHash-DPpsZgFF.js} +1 -1
  90. package/dist/src/{createHash-CwDVU5xr.js → createHash-Un4Q_huE.js} +1 -1
  91. package/dist/src/{createHash-B7KvgoOD.cjs → createHash-VvBIc-AW.cjs} +1 -1
  92. package/dist/src/{docker-DCgsveLD.js → docker--3qzPa-6.js} +6 -6
  93. package/dist/src/{docker-DS4_Osau.cjs → docker-D3AY-5F5.cjs} +5 -5
  94. package/dist/src/{docker-CQmlA2NU.js → docker-DCsCDvwM.js} +6 -6
  95. package/dist/src/{docker-ClnmCf1Z.js → docker-Dorv4_Dg.js} +5 -5
  96. package/dist/src/{embedding-I45KG3o7.cjs → embedding-BXhN5lCH.cjs} +5 -5
  97. package/dist/src/{embedding-nFbumxcv.js → embedding-ChS1ivFS.js} +5 -5
  98. package/dist/src/{embedding-D3xTseo7.js → embedding-DNRvZwRN.js} +6 -6
  99. package/dist/src/{embedding-DD9wa3ae.js → embedding-D_bI4NDq.js} +6 -6
  100. package/dist/src/{errors-Cw810C93.js → errors-DFHe4L-n.js} +1 -1
  101. package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
  102. package/dist/src/{esm-C7PnfdF8.js → esm-BRkfNsYs.js} +1 -1
  103. package/dist/src/{esm-tVgYPY-f.js → esm-BX8fwlAO.js} +2 -2
  104. package/dist/src/{esm-CtEPLdAj.cjs → esm-B_rGuPTo.cjs} +1 -1
  105. package/dist/src/{eval-CzJFfFO9.js → eval-BQPLBJbw.js} +1 -1
  106. package/dist/src/{eval-u4UVafl6.js → eval-DJ_4A-tr.js} +14 -14
  107. package/dist/src/evalResult-BBJAHAtw.cjs +2 -0
  108. package/dist/src/evalResult-BBK58h2B.js +3 -0
  109. package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-Cx-8OWkb.cjs} +28 -10
  110. package/dist/src/{evalResult-D3hVYFis.js → evalResult-D6P5I5il.js} +29 -11
  111. package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-pSvGWFMo.js} +29 -11
  112. package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-D-UIbbYq.js} +845 -98
  113. package/dist/src/evaluator-DgLKaZk8.js +3 -0
  114. package/dist/src/{extractor-Dk6bRWkv.js → extractor-BM3jRERL.js} +5 -5
  115. package/dist/src/{extractor-WVPOrH43.cjs → extractor-Dxr2J_wK.cjs} +5 -5
  116. package/dist/src/{extractor-DNSeBVOJ.js → extractor-DxyiFhPk.js} +6 -6
  117. package/dist/src/{extractor-CAfTSraf.js → extractor-YlZbUMsL.js} +6 -6
  118. package/dist/src/fetch-8viavNv8.js +3 -0
  119. package/dist/src/{fetch-BEWnXrrG.js → fetch-B6ch2nU2.js} +9 -20
  120. package/dist/src/{fetch-Di00EQrc.js → fetch-D9xxyC1p.js} +221 -232
  121. package/dist/src/{fetch-CJU5ELPa.cjs → fetch-NuqXW1Xb.cjs} +221 -244
  122. package/dist/src/{fetch-B0Z3Oe4k.js → fetch-Y5qX_kST.js} +8 -19
  123. package/dist/src/{fileExtensions-BArZuxsI.js → fileExtensions-8CjoL7vB.js} +1 -1
  124. package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-BGh-W-HT.js} +1 -1
  125. package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-D9h-8Wxg.cjs} +1 -1
  126. package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-DysCsxNG.js} +1 -1
  127. package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-Ch4A7G3o.js} +1 -1
  128. package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-BokHC-MW.cjs} +1 -1
  129. package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-C3ZPQU60.js} +1 -1
  130. package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-CFny3gOy.js} +1 -1
  131. package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-DxODqT9e.js} +1 -1
  132. package/dist/src/{graders-Zy3x0zqX.js → graders-BoUqsCEm.js} +1303 -2044
  133. package/dist/src/{graders--zknU_uk.cjs → graders-Bw1wk_21.cjs} +1553 -2240
  134. package/dist/src/graders-C84JI-m5.js +2 -0
  135. package/dist/src/graders-CBbd0K0Q.cjs +2 -0
  136. package/dist/src/graders-CbQqpHSN.js +3 -0
  137. package/dist/src/{graders-eIHhRqoC.js → graders-CgPn32yp.js} +1300 -2041
  138. package/dist/src/{graders-pvbReLLn.js → graders-CwrbifOo.js} +747 -1488
  139. package/dist/src/graders-DS42d3ZG.js +2 -0
  140. package/dist/src/{image-9302QVqR.js → image-BeWaInPF.js} +3 -3
  141. package/dist/src/{image-DVz2RiMF.js → image-BmilRNqO.js} +7 -7
  142. package/dist/src/{image-x6KqLQl4.cjs → image-CxJoa3aW.cjs} +6 -6
  143. package/dist/src/{image-De2FBmYV.cjs → image-D10dNAav.cjs} +3 -3
  144. package/dist/src/{image-dnoUgPrC.js → image-Dr_3I3nK.js} +4 -4
  145. package/dist/src/{image-B5Mv-Z3h.js → image-DsGRlkh7.js} +7 -7
  146. package/dist/src/{image-qUpPvmNZ.js → image-a_SGUobh.js} +6 -6
  147. package/dist/src/{image-u7-rKnYU.js → image-qjO6FWPs.js} +4 -4
  148. package/dist/src/index.cjs +1052 -296
  149. package/dist/src/index.d.cts +124 -13
  150. package/dist/src/index.d.ts +125 -14
  151. package/dist/src/index.js +1018 -262
  152. package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-CCICw2cy.js} +2 -2
  153. package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
  154. package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
  155. package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-BBETc5-S.js} +6 -6
  156. package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-C8qOo26M.cjs} +5 -5
  157. package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CzAi2rUI.js} +6 -6
  158. package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-Dr3Kib7F.js} +5 -5
  159. package/dist/src/{litellm-C2kqjxqp.js → litellm-BLSiANhk.js} +5 -5
  160. package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CaUmV7Mk.cjs} +4 -4
  161. package/dist/src/{litellm-p37R1dzQ.js → litellm-DQGo_juI.js} +4 -4
  162. package/dist/src/{litellm-DRjpcSa7.js → litellm-DRc4qWfc.js} +5 -5
  163. package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
  164. package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
  165. package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-B-tNZzqW.js} +6 -6
  166. package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CtS3OlGq.js} +5 -5
  167. package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-PJJgUjOc.js} +6 -6
  168. package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-if-Ml4R9.cjs} +5 -5
  169. package/dist/src/main.js +242 -198
  170. package/dist/src/{messages-zWbkLLHz.js → messages-B9dSjrNf.js} +264 -16
  171. package/dist/src/{messages-811uVVW5.cjs → messages-BnsVHUnm.cjs} +266 -15
  172. package/dist/src/{messages-MYTQ2TWp.js → messages-CI69Lasb.js} +264 -16
  173. package/dist/src/{messages-BTQz42fn.js → messages-CewuNcNS.js} +264 -16
  174. package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-BBGcGeCa.cjs} +1 -1
  175. package/dist/src/{meteor-DuAFv6gF.js → meteor-BKTM-7KS.js} +1 -1
  176. package/dist/src/{meteor-DHdzY1Ss.js → meteor-CeGo0Lu2.js} +2 -2
  177. package/dist/src/{meteor-CU5UAE-H.js → meteor-Wc_aUVvu.js} +2 -2
  178. package/dist/src/{modelslab-wu9yi5GE.js → modelslab-BCLOtfek.js} +7 -7
  179. package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-BkapYJhh.cjs} +6 -6
  180. package/dist/src/{modelslab-DIq-6y7x.js → modelslab-D73OnKSx.js} +6 -6
  181. package/dist/src/{modelslab-D0erNWKe.js → modelslab-zpz9JcK0.js} +7 -7
  182. package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-B8F_TK5w.js} +6 -6
  183. package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-Bx0NFV2f.js} +5 -5
  184. package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-CNGJTLtG.js} +6 -6
  185. package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-DkT7tnoB.cjs} +5 -5
  186. package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BaXRN1cr.js} +4 -4
  187. package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-BeTRaFOh.js} +4 -4
  188. package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CL7Zqv0G.js} +3 -3
  189. package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-YT426juD.cjs} +3 -3
  190. package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
  191. package/dist/src/{openai-Qsvz25mV.js → openai-BT-JvDse.js} +2 -2
  192. package/dist/src/{openai-URNyItar.cjs → openai-Cy1XLs0c.cjs} +1 -1
  193. package/dist/src/{openai-iYtrXzOX.js → openai-D4fxGvRx.js} +1 -1
  194. package/dist/src/{openclaw-CwzlQSQX.js → openclaw-Bq7RVR3k.js} +7 -6
  195. package/dist/src/{openclaw-CLWrW03k.js → openclaw-DA8U4DsD.js} +8 -7
  196. package/dist/src/{openclaw-CnQ363Wi.js → openclaw-DObVgpjC.js} +8 -7
  197. package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-DUBZP3GL.cjs} +8 -7
  198. package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-BB40Wir1.js} +4 -4
  199. package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-BM1UAIv1.js} +3 -3
  200. package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-CeqiOcOU.cjs} +4 -4
  201. package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-ChdK7F7z.js} +4 -4
  202. package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-C6thJRXi.js} +4 -4
  203. package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-CcdIikOu.js} +3 -3
  204. package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-DNSQj6bf.cjs} +3 -3
  205. package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver-UYMQx3sy.js} +4 -4
  206. package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
  207. package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-BESeALrr.cjs} +1 -1
  208. package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DoACwqhD.js} +1 -1
  209. package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-PMsleEzs.js} +2 -2
  210. package/dist/src/{providers-Bp4S-FvO.js → providers-BuyzKt7C.js} +1 -1
  211. package/dist/src/{providers-DV3ax9e_.cjs → providers-C7lNVBjX.cjs} +1 -1
  212. package/dist/src/{providers-u9Enmfok.js → providers-CCE2COJi2.js} +1 -1
  213. package/dist/src/{providers-DruaQfwu.js → providers-CJh7iriU.js} +18103 -17952
  214. package/dist/src/{providers-iUt5fbAN.js → providers-Ctcc592x.js} +1 -1
  215. package/dist/src/{providers-Domz_llv.js → providers-DRrerKra.js} +432 -281
  216. package/dist/src/{providers-BV_KMZje.js → providers-DT-GtF2t.js} +19094 -18943
  217. package/dist/src/{providers-1eKkXBKp.cjs → providers-eDShy16E.cjs} +17946 -17795
  218. package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-C4tltmIn.js} +3 -3
  219. package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-CoLaCwNY.cjs} +3 -3
  220. package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-DMO68Jg7.js} +3 -3
  221. package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-DNqbnRdx.js} +3 -3
  222. package/dist/src/{quiverai-DR0SnIQV.js → quiverai-BSS9a7wV.js} +3 -3
  223. package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Bk1KrvL6.js} +4 -4
  224. package/dist/src/{quiverai-DFotyafY.cjs → quiverai-Bpx6MZ7T.cjs} +3 -3
  225. package/dist/src/{quiverai-aPPvXOgn.js → quiverai-CPKhWgaT.js} +4 -4
  226. package/dist/src/{render-DHIZ6_k8.js → render-7uNJ2V14.js} +2 -2
  227. package/dist/src/{render-CH-62LbA.js → render-DlscvAUJ.js} +1 -1
  228. package/dist/src/{render-CMEpfLaO.js → render-eui5p5mL.js} +2 -2
  229. package/dist/src/{render-CgVDrJmM.js → render-nj-UaPdn.js} +2 -2
  230. package/dist/src/{render-DfQSFxGE.cjs → render-tG6ir9_g.cjs} +1 -1
  231. package/dist/src/{responses--OsX2aYW.js → responses-1ztiVYsx.js} +49 -15
  232. package/dist/src/{responses-DL9m8CyY.js → responses-B8haB-mD.js} +49 -15
  233. package/dist/src/{responses-C-flexAY.js → responses-BiaBguAu.js} +49 -15
  234. package/dist/src/{responses-Bi9vBuW_.cjs → responses-CF-ayauu.cjs} +48 -14
  235. package/dist/src/rubyUtils-4hjGxvju.js +3 -0
  236. package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-BI0p46eZ.js} +3 -3
  237. package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CIQFnVz4.js} +3 -3
  238. package/dist/src/rubyUtils-CO-tuszQ.cjs +2 -0
  239. package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DGnoCYL2.js} +3 -3
  240. package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-DoifqkiA.cjs} +4 -3
  241. package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-BDLeW29y.js} +12 -12
  242. package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-C5T60MKf.js} +13 -13
  243. package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-ClS_NB07.js} +13 -13
  244. package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-ljtY12VM.cjs} +12 -12
  245. package/dist/src/{scanner-1DqWi1Ej.js → scanner-nOCWNIXa.js} +7 -7
  246. package/dist/src/server/index.js +1067 -265
  247. package/dist/src/{server-Dx2TyCH2.cjs → server-BEECpeGG.cjs} +5 -5
  248. package/dist/src/{server-BNYztJkh.js → server-ByiF3qlg.js} +9 -8
  249. package/dist/src/{server-BSB45Nt9.js → server-ByxbqAcQ.js} +8 -7
  250. package/dist/src/{server-DaA2eR26.cjs → server-C0XKRNB_.cjs} +1 -1
  251. package/dist/src/server-C_15p79-.js +3 -0
  252. package/dist/src/{server-D6Il2Sob.js → server-gyd6d4Hc.js} +5 -5
  253. package/dist/src/{signal-CE5G3a7x.js → signal-DTtUuU3l.js} +3 -3
  254. package/dist/src/{slack-acRb0IqQ.js → slack-4zZX1OKP.js} +1 -1
  255. package/dist/src/{slack-1Rhq0EoV.cjs → slack-BLlsDpfG.cjs} +1 -1
  256. package/dist/src/{slack-D5Wpy8LM.js → slack-BPYLQLgb.js} +2 -2
  257. package/dist/src/{slack-DDUe-5MC.js → slack-Bamy_7te.js} +2 -2
  258. package/dist/src/{store-DAAyxcy6.cjs → store-2K0kDi80.cjs} +2 -2
  259. package/dist/src/{store-Dn9HUkdW.js → store-2OXm_eBY.js} +3 -3
  260. package/dist/src/store-BELqNwvz.js +3 -0
  261. package/dist/src/{store-M0b1WfYb.js → store-BPkzEyFM.js} +2 -2
  262. package/dist/src/{store-CYEy5J2D.js → store-CPh25336.js} +3 -3
  263. package/dist/src/store-uQZ4AjPe.cjs +2 -0
  264. package/dist/src/{tables-CsWou1Bx.js → tables-BMSOS2Gg.js} +3 -3
  265. package/dist/src/{tables-DUfh1F7Z.cjs → tables-CXbaZ9y1.cjs} +2 -2
  266. package/dist/src/{tables-C4CH3zRr.js → tables-NlvH23ky.js} +3 -3
  267. package/dist/src/{tables-DQ4WU5tX.js → tables-WgdUZ8Ck.js} +2 -2
  268. package/dist/src/{telemetry-dbaJ0E98.js → telemetry--iqaGyaS.js} +5 -4
  269. package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CEQxGnMZ.cjs} +7 -6
  270. package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-CgdVGV8N.js} +4 -3
  271. package/dist/src/{telemetry-CQPez_Jp.js → telemetry-DWdGHvEf.js} +5 -4
  272. package/dist/src/telemetry-DjNoC_n3.cjs +2 -0
  273. package/dist/src/telemetry-ZdPZc0fm.js +3 -0
  274. package/dist/src/{text-BVi-cLPJ.cjs → text-BiNME7QG.cjs} +1 -1
  275. package/dist/src/{text-KvuD2Iko.js → text-D4lz-Jg_.js} +1 -1
  276. package/dist/src/{text-DHxdyQqT.js → text-DDQP0tuQ.js} +1 -1
  277. package/dist/src/{text-CZr46tp_.js → text-NWvfMfkF.js} +1 -1
  278. package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-2wIvAhB3.js} +1 -1
  279. package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-4c780gFd.js} +1 -1
  280. package/dist/src/tokenUsageUtils-BjVkdk18.js +142 -0
  281. package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-C9odhsbW.cjs} +1 -1
  282. package/dist/src/{transcription-DuWDupG7.js → transcription-84t4ALo2.js} +5 -5
  283. package/dist/src/{transcription-CJspiD2c.js → transcription-Bm2emLmJ.js} +6 -6
  284. package/dist/src/{transcription-BvjmiYB1.cjs → transcription-CZ4LG5hQ.cjs} +5 -5
  285. package/dist/src/{transcription-V2HaAmy2.js → transcription-D7Q0vJsh.js} +6 -6
  286. package/dist/src/{transform-zDhMmzwX.js → transform-B-b6Cq-q.js} +5 -5
  287. package/dist/src/transform-BQt0BeAW.js +3 -0
  288. package/dist/src/{transform-DgKlRr73.cjs → transform-Bq5oqC0s.cjs} +1 -1
  289. package/dist/src/{transform-CUnzlsbn.cjs → transform-C9izGX54.cjs} +4 -4
  290. package/dist/src/{transform-DYX1_Xnh.js → transform-CwbAZ84V.js} +5 -5
  291. package/dist/src/{transform-CTeuTR3S.cjs → transform-Dg4LcO1Y.cjs} +6 -6
  292. package/dist/src/{transform-CG0ehZNG.js → transform-DtooZqYY.js} +6 -6
  293. package/dist/src/{transform-UN5UGu8U.js → transform-DzCF-wqV.js} +5 -5
  294. package/dist/src/{transform-lQrDE1BQ.js → transform-_DpNB4qp.js} +5 -5
  295. package/dist/src/{transform-Bbg6A8Jk.js → transform-eGiUAv86.js} +4 -4
  296. package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-B22swDxr.cjs} +1 -1
  297. package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-lvCCvuPT.js} +1 -1
  298. package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-rJGPccjr.js} +1 -1
  299. package/dist/src/{types-Bgh5SOn6.js → types-BDjGOq4E.js} +4 -2
  300. package/dist/src/{types-Dm9JM6Vb.js → types-BVH9hjgW.js} +4 -2
  301. package/dist/src/{types-CeaeaZdP.cjs → types-CgG2rKiW.cjs} +151 -149
  302. package/dist/src/{types-BGQDAP8i.js → types-DNRZVOue.js} +152 -150
  303. package/dist/src/{util-C8e5uydV.js → util-3pBZZb_H.js} +142 -17
  304. package/dist/src/{util-CN3SrLT4.cjs → util-A5_ZsQUn.cjs} +65 -43
  305. package/dist/src/{util-D3q0WQ-0.js → util-B9CNhyac.js} +66 -44
  306. package/dist/src/{util-DxWpWjhc.js → util-BQOCAHQC.js} +700 -575
  307. package/dist/src/{util-BYvQUPp7.js → util-BVXcTwXu.js} +3 -3
  308. package/dist/src/{util-D9TisOyk.js → util-BlFVL0UF.js} +65 -43
  309. package/dist/src/{util-C9J8ahRn.js → util-C-kmRosx.js} +66 -44
  310. package/dist/src/{util-DvU2Pw8c.js → util-DFPeFkiV.js} +3 -3
  311. package/dist/src/{util-DDs-7g6-.js → util-DN0-b81k.js} +3 -3
  312. package/dist/src/{util-olYL5C6N.cjs → util-Dpmm_dAI.cjs} +3 -3
  313. package/dist/src/{util-oGMLA7vc.js → util-Dub0f_ej.js} +700 -575
  314. package/dist/src/{util-Bxn8emtE.cjs → util-DvpHnLt0.cjs} +718 -570
  315. package/dist/src/{utils-DJfvjyMj.js → utils-BUMN8orw.js} +3 -3
  316. package/dist/src/{utils-B05gLxER.cjs → utils-DkVeShIB.cjs} +2 -2
  317. package/dist/src/{utils-BLJKfv0y.js → utils-kt7lv30R.js} +3 -3
  318. package/dist/src/{utils-hXtCYanr.js → utils-o8S5huU2.js} +2 -2
  319. package/dist/src/version-0frU0UTr.js +16 -0
  320. package/dist/src/version-CbpiUINz.js +17 -0
  321. package/dist/src/version-CbuBKu2U.js +16 -0
  322. package/dist/src/version-D9zu9FWB.cjs +27 -0
  323. package/dist/tsconfig.tsbuildinfo +1 -1
  324. package/package.json +22 -20
  325. package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
  326. package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
  327. package/dist/src/cache-Cr9oLMUa.js +0 -3
  328. package/dist/src/cloud-Hphvo8kr.js +0 -3
  329. package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
  330. package/dist/src/evalResult-D8MT9p0s.js +0 -3
  331. package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
  332. package/dist/src/evaluator-CVessDWe.js +0 -3
  333. package/dist/src/fetch-C7bGKDlQ.js +0 -3
  334. package/dist/src/graders-BOAzQEUe.cjs +0 -2
  335. package/dist/src/graders-D4BTsZdG2.js +0 -3
  336. package/dist/src/graders-DOJK1XpV.js +0 -2
  337. package/dist/src/graders-NAv9LcBn.js +0 -2
  338. package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
  339. package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
  340. package/dist/src/server-DCtHUqlp.js +0 -3
  341. package/dist/src/store-CWOSz6D_.cjs +0 -2
  342. package/dist/src/store-DCDBhv7B.js +0 -3
  343. package/dist/src/telemetry-C1IqxcdW.js +0 -3
  344. package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
  345. package/dist/src/transform-M6ITAESf.js +0 -3
  346. package/dist/src/{evalResult-DElBuddX.js → evalResult-spPqh1G_.js} +0 -0
package/dist/src/index.js CHANGED
@@ -1,33 +1,34 @@
1
- import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, g as getAjv, h as extractJsonObjects, k as state, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "./logger-Ct2S6Yx-.js";
1
+ import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, g as getAjv, h as extractJsonObjects, k as state, m as extractFirstJsonObject, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "./logger-Ct2S6Yx-.js";
2
2
  import { t as invariant } from "./invariant-Ddh24eXh.js";
3
- import { r as importModule, t as getDirectory } from "./esm-C7PnfdF8.js";
4
- import { r as runPython } from "./pythonUtils-C2UQ30Rz.js";
5
- import { i as isJavascriptFile } from "./fileExtensions-DnqA1y9x.js";
6
- import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-Bbg6A8Jk.js";
7
- import { $ as matchesGEval, A as DivergentRepetitionPlugin, B as sampleArray, C as getPiiLeakTestsForCategory, D as HarmbenchPlugin, E as ImitationPlugin, F as AegisPlugin, G as loadRubricPrompt, H as callProviderWithContext, I as RedteamGraderBase, J as matchesClosedQa, K as matchesAnswerRelevance, L as RedteamPluginBase, M as CrossSessionLeakPlugin, N as ContractPlugin, O as HallucinationPlugin, P as BeavertailsPlugin, Q as matchesFactuality, R as getCustomPolicies, S as PlinyPlugin, T as IntentPlugin, U as fail, V as fetchHuggingFaceDataset, W as getAndCheckProvider, X as matchesContextRecall, Y as matchesContextFaithfulness, Z as matchesContextRelevance, _ as PoliticsPlugin, _t as processFileReference, a as UnverifiableClaimsPlugin, at as matchesSimilarity, b as isValidPolicyObject, c as ToolDiscoveryPlugin, ct as withProviderCallExecutionContext, d as TeenSafetyDangerousContentPlugin, dt as readPrompts, et as matchesLlmRubric, f as TeenSafetyAgeRestrictedGoodsAndServicesPlugin, ft as readProviderPromptMap, g as PromptExtractionPlugin, gt as loadFromJavaScriptFile, h as RbacPlugin, ht as getFinalTest, i as VLGuardPlugin, it as matchesSelectBest, j as DebugAccessPlugin, k as ExcessiveAgencyPlugin, l as TeenSafetyHarmfulBodyIdealsPlugin, lt as getDefaultProviders, m as ShellInjectionPlugin, mt as coerceString, n as getGraderById, nt as matchesPiScore, o as UnsafeBenchPlugin, ot as matchesTrajectoryGoalSuccess, p as SqlInjectionPlugin, pt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, q as matchesClassification, r as VLSUPlugin, rt as matchesSearchRubric, s as ToxicChatPlugin, st as selectMaxScore, t as GRADERS, tt as matchesModeration, u as TeenSafetyDangerousRoleplayPlugin, ut as processPrompts, v as PolicyPlugin, vt as resolveContext, w as OverreliancePlugin, x as makeInlinePolicyIdSync, y as determinePolicyTypeFromId, z as retryWithDeduplication } from "./graders-Zy3x0zqX.js";
8
- import { A as isApiProvider, At as CompletionTokenDetailsSchema, C as TestGeneratorConfigSchema, Ct as UNALIGNED_PROVIDER_HARM_PLUGINS, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, Ot as PromptSchema, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as TELECOM_PLUGINS, T as TestSuiteSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as PLUGIN_CATEGORIES, a as AtomicTestCaseSchema, at as DEFAULT_PLUGINS, b as TestCaseWithVarsFileSchema, c as CompletedPromptSchema, ct as HARM_PLUGINS, d as EvaluateOptionsSchema, dt as LLAMA_GUARD_REPLICATE_PROVIDER, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MEDICAL_PLUGINS, g as ResultFailureReason, gt as PII_PLUGINS, h as OutputFileExtension, ht as PHARMACY_PLUGINS, i as AssertionTypeSchema, it as DATASET_EXEMPT_PLUGINS, j as isProviderOptions, jt as InputsSchema, k as isResultFailureReason, kt as BaseTokenUsageSchema, l as DerivedMetricSchema, lt as INSURANCE_PLUGINS, m as OutputConfigSchema, mt as MULTI_INPUT_VAR, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FINANCIAL_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_EXCLUDED_PLUGINS, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as CANARY_BREAKING_STRATEGY_IDS, s as CommandLineOptionsSchema, st as FOUNDATION_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_ENABLED_CATEGORIES, v as SpecialAssertionTypesSchema, vt as REDTEAM_PROVIDER_HARM_PLUGINS, w as TestSuiteConfigSchema, wt as CODING_AGENT_CORE_PLUGINS, x as TestCasesWithMetadataPromptSchema, xt as TEEN_SAFETY_PLUGINS, y as TestCaseSchema, yt as REMOTE_ONLY_PLUGIN_IDS, z as 
StrategyConfigSchema } from "./types-BGQDAP8i.js";
9
- import { C as checkProviderApiKeys, D as isGoogleProvider, E as isAnthropicProvider, O as isOpenAiProvider, S as resultIsForTestCase, T as getProviderDescription, _ as setupEnv, b as filterRuntimeVars, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, g as parseFileUrl, h as loadFunction, i as fetchCsvFromGoogleSheet, k as isProviderAllowed, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as deduplicateTestCases, w as doesProviderRefMatch, x as getTestCaseDeduplicationKey, y as extractRuntimeVars } from "./util-oGMLA7vc.js";
10
- import { a as getNunjucksEngine, i as extractVariablesFromTemplates, r as extractVariablesFromTemplate, t as renderEnvOnlyInObject } from "./render-CH-62LbA.js";
11
- import { A as TERMINAL_MAX_WIDTH, F as VERSION, I as FILE_METADATA_KEY, L as HUMAN_ASSERTION_TYPE, M as getShareApiBaseUrl, N as getShareViewBaseUrl, S as parseChatPrompt, a as CloudConfig, d as sleep, h as REQUEST_TIMEOUT_MS, j as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp, y as isPromptfooSampleTarget } from "./fetch-Di00EQrc.js";
12
- import { c as isNonTransientHttpStatus, i as getCache, n as disableCache, o as withCacheNamespace, r as fetchWithCache, s as NON_TRANSIENT_HTTP_STATUSES, t as cache_exports } from "./cache-D5NZmMiT.js";
13
- import { $ as AIStudioChatProvider, A as createRateLimitRegistry, B as getCloudDatabaseId, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getGeneratedPromptOverLimit, G as isCloudProvider, H as getOrgContext, I as getMaxCharsPerMessageModifierValue, K as resolveTeamId, L as throwIfTargetPromptExceedsMaxChars, N as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY, Q as VertexChatProvider, T as runExtensionHook, U as getPluginSeverityOverridesFromCloud, V as getEvalConfigFromCloud, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId, z as checkCloudPermissions } from "./providers-DruaQfwu.js";
14
- import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-hXtCYanr.js";
15
- import { n as sha256, t as randomSequence } from "./createHash-4gFQpDDv.js";
16
- import { t as OpenAiChatCompletionProvider } from "./chat-I9izLm49.js";
17
- import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-C-bmyHoE.js";
18
- import { h as validateFunctionCall } from "./transform-CG0ehZNG.js";
19
- import { l as validateFunctionCall$1 } from "./util-D9TisOyk.js";
20
- import { t as providerRegistry } from "./providerRegistry-Bvh8mv85.js";
21
- import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-D6Il2Sob.js";
22
- import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-DhMYUUbu.js";
23
- import { t as getBlobByHash } from "./blobs-C-F78Kfn.js";
24
- import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-DQ4WU5tX.js";
25
- import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-Dk6bRWkv.js";
26
- import { t as telemetry } from "./telemetry-Dvqxv3YC.js";
27
- import { t as ellipsize } from "./text-DHxdyQqT.js";
28
- import { t as getTraceStore } from "./store-M0b1WfYb.js";
29
- import { n as runRuby } from "./rubyUtils-DVLeA2jg.js";
30
- import { t as EvalResult } from "./evalResult-Bgm9ZH31.js";
3
+ import { A as TERMINAL_MAX_WIDTH, F as HUMAN_ASSERTION_TYPE, M as getShareApiBaseUrl, N as getShareViewBaseUrl, P as FILE_METADATA_KEY, S as cloudConfig, _ as parseChatPrompt, a as getCurrentTimestamp, j as getDefaultShareViewBaseUrl, m as isPromptfooSampleTarget, n as fetchWithRetries, o as sleep, r as fetchWithTimeout, t as fetchWithProxy, u as REQUEST_TIMEOUT_MS, x as CloudConfig } from "./fetch-D9xxyC1p.js";
4
+ import { n as VERSION } from "./version-CbuBKu2U.js";
5
+ import { $ as categoryAliases, A as isApiProvider, At as CompletionTokenDetailsSchema, B as StrategyConfigSchema, C as TestGeneratorConfigSchema, Ct as TELECOM_PLUGINS, D as VarsSchema, E as UnifiedConfigSchema, F as ProvidersSchema, G as DEFAULT_STRATEGIES, H as isUuid, I as ConversationMessageSchema, J as STRATEGY_COLLECTION_MAPPINGS, L as PartialGenerationError, M as RedteamConfigSchema, N as PromptSchema, O as isGradingResult, Q as Severity, R as PluginConfigSchema, S as TestCasesWithMetadataSchema, St as TEEN_SAFETY_PLUGINS, T as TestSuiteSchema, Tt as CODING_AGENT_CORE_PLUGINS, Y as getDefaultNFanout, Z as isFanoutStrategy, _ as ScenarioSchema, _t as PII_PLUGINS, a as AtomicTestCaseSchema, at as DATASET_EXEMPT_PLUGINS, b as TestCaseWithVarsFileSchema, bt as REMOTE_ONLY_PLUGIN_IDS, c as CompletedPromptSchema, ct as FOUNDATION_PLUGINS, d as EvaluateOptionsSchema, dt as LLAMA_GUARD_ENABLED_CATEGORIES, f as GradingConfigSchema, ft as LLAMA_GUARD_REPLICATE_PROVIDER, g as ResultFailureReason, gt as PHARMACY_PLUGINS, h as OutputFileExtension, ht as MULTI_INPUT_VAR, i as AssertionTypeSchema, it as CANARY_BREAKING_STRATEGY_IDS, j as isProviderOptions, jt as InputsSchema, k as isResultFailureReason, kt as BaseTokenUsageSchema, l as DerivedMetricSchema, lt as HARM_PLUGINS, m as OutputConfigSchema, mt as MULTI_INPUT_EXCLUDED_PLUGINS, n as AssertionSchema, nt as ALIASED_PLUGIN_MAPPINGS, o as BaseAssertionTypesSchema, ot as DEFAULT_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MEDICAL_PLUGINS, q as STRATEGY_COLLECTIONS, r as AssertionSetSchema, rt as BIAS_PLUGINS, s as CommandLineOptionsSchema, st as FINANCIAL_PLUGINS, t as AssertionOrSetSchema, tt as riskCategorySeverityMap, u as EvalResultsFilterMode, ut as INSURANCE_PLUGINS, v as SpecialAssertionTypesSchema, vt as PLUGIN_CATEGORIES, w as TestSuiteConfigSchema, wt as UNALIGNED_PROVIDER_HARM_PLUGINS, x as TestCasesWithMetadataPromptSchema, y as TestCaseSchema, yt as REDTEAM_PROVIDER_HARM_PLUGINS, z as 
PolicyObjectSchema } from "./types-DNRZVOue.js";
6
+ import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-CaLNYnf7.js";
7
+ import { r as importModule, t as getDirectory } from "./esm-BRkfNsYs.js";
8
+ import { a as getNunjucksEngine, i as extractVariablesFromTemplates, r as extractVariablesFromTemplate, t as renderEnvOnlyInObject } from "./render-DlscvAUJ.js";
9
+ import { t as providerRegistry } from "./providerRegistry-DoACwqhD.js";
10
+ import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-gyd6d4Hc.js";
11
+ import { $ as getMaxCharsPerMessageModifierValue, B as collectFileMetadata, D as validateStrategies, E as loadStrategy, F as getSessionId, G as redteamProviderManager, H as runExtensionHook, I as getShortPluginId, J as createProviderRateLimitOptions, K as TokenUsageTracker, L as isBasicRefusal, N as extractPromptFromTags, O as pluginMatchesStrategyTargets, P as extractVariablesFromJson, Q as getGeneratedPromptOverLimit, T as Strategies, U as isPackagePath, V as renderPrompt, W as loadFromPackage, X as PromptfooHarmfulCompletionProvider, Z as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY, a as resolveProviderConfigs, at as getOrgContext, ct as isCloudProvider, d as MCPProvider, et as throwIfTargetPromptExceedsMaxChars, g as AIStudioChatProvider, h as GoogleLiveProvider, i as resolveProvider, it as getEvalConfigFromCloud, j as extractGoalFromPrompt, k as checkExfilTracking, lt as resolveTeamId, m as VertexChatProvider, n as loadApiProvider, nt as checkCloudPermissions, ot as getPluginSeverityOverridesFromCloud, q as createRateLimitRegistry, r as loadApiProviders, rt as getCloudDatabaseId, t as getProviderIds } from "./providers-CJh7iriU.js";
12
+ import { r as runPython } from "./pythonUtils-DMO68Jg7.js";
13
+ import { i as isJavascriptFile } from "./fileExtensions-8CjoL7vB.js";
14
+ import { A as readFilters, M as loadFunction, N as parseFileUrl, O as maybeLoadToolsFromExternalFile, T as maybeLoadFromExternalFile, _ as isProviderAllowed, a as setupEnv, b as normalizeProviderRef, c as filterRuntimeVars, d as checkProviderApiKeys, f as doesProviderRefMatch, g as isOpenAiProvider, h as isGoogleProvider, i as fetchCsvFromGoogleSheet, j as readOutput, l as getTestCaseDeduplicationKey, m as isAnthropicProvider, n as writeMultipleOutputs, o as deduplicateTestCases, p as getProviderDescription, r as writeOutput, s as extractRuntimeVars, t as printBorder, u as resultIsForTestCase, w as maybeLoadConfigFromExternalFile } from "./util-Dub0f_ej.js";
15
+ import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-4c780gFd.js";
16
+ import { t as getBlobByHash } from "./blobs-BW4U31ue.js";
17
+ import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-WgdUZ8Ck.js";
18
+ import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-BM3jRERL.js";
19
+ import { c as isNonTransientHttpStatus, i as getCache, n as disableCache, o as withCacheNamespace, r as fetchWithCache, s as NON_TRANSIENT_HTTP_STATUSES, t as cache_exports } from "./cache-Bzttsk0X.js";
20
+ import { t as OpenAiChatCompletionProvider } from "./chat-Cx_LkwvZ.js";
21
+ import { h as validateFunctionCall } from "./transform-_DpNB4qp.js";
22
+ import { l as validateFunctionCall$1 } from "./util-BlFVL0UF.js";
23
+ import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-eGiUAv86.js";
24
+ import { t as telemetry } from "./telemetry-CgdVGV8N.js";
25
+ import { t as ellipsize } from "./text-DDQP0tuQ.js";
26
+ import { t as getTraceStore } from "./store-BPkzEyFM.js";
27
+ import { n as sha256, t as randomSequence } from "./createHash-Un4Q_huE.js";
28
+ import { n as runRuby } from "./rubyUtils-BI0p46eZ.js";
29
+ import { $ as SELECT_BEST_PROMPT, A as DivergentRepetitionPlugin, B as sampleArray, C as getPiiLeakTestsForCategory, Ct as DEFAULT_ANTHROPIC_MODEL, D as HarmbenchPlugin, Dt as withProviderCallExecutionContext, E as ImitationPlugin, Et as getGradingProvider, F as AegisPlugin, G as matchesLlmRubric, H as matchesClosedQa, I as RedteamGraderBase, J as doRemoteGrading, K as matchesPiScore, L as RedteamPluginBase, M as CrossSessionLeakPlugin, N as ContractPlugin, O as HallucinationPlugin, P as BeavertailsPlugin, Q as DEFAULT_WEB_SEARCH_PROMPT, R as getCustomPolicies, S as PlinyPlugin, St as getDefaultProviders, T as IntentPlugin, Tt as getAndCheckProvider, U as matchesFactuality, V as fetchHuggingFaceDataset, W as matchesGEval, X as readPrompts, Y as processPrompts, Z as readProviderPromptMap, _ as PoliticsPlugin, _t as tryParse, a as UnverifiableClaimsPlugin, at as CONTEXT_RECALL_ATTRIBUTED_TOKEN, b as isValidPolicyObject, bt as loadFromJavaScriptFile, c as ToolDiscoveryPlugin, ct as CONTEXT_RELEVANCE_BAD, d as TeenSafetyDangerousContentPlugin, dt as cosineSimilarity, et as SUGGEST_PROMPTS_SYSTEM_MESSAGE, f as TeenSafetyAgeRestrictedGoodsAndServicesPlugin, ft as dotProduct, g as PromptExtractionPlugin, gt as splitIntoSentences, h as RbacPlugin, ht as normalizeMatcherTokenUsage, i as VLGuardPlugin, it as CONTEXT_RECALL, j as DebugAccessPlugin, k as ExcessiveAgencyPlugin, l as TeenSafetyHarmfulBodyIdealsPlugin, lt as loadRubricPrompt, m as ShellInjectionPlugin, mt as fail, n as getGraderById, nt as CONTEXT_FAITHFULNESS_LONGFORM, o as UnsafeBenchPlugin, ot as CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN, p as SqlInjectionPlugin, pt as euclideanDistance, q as matchesTrajectoryGoalSuccess, r as VLSUPlugin, rt as CONTEXT_FAITHFULNESS_NLI_STATEMENTS, s as ToxicChatPlugin, st as CONTEXT_RELEVANCE, t as GRADERS, tt as ANSWER_RELEVANCY_GENERATE, u as TeenSafetyDangerousRoleplayPlugin, ut as renderLlmRubricPrompt, v as PolicyPlugin, vt as coerceString, w as OverreliancePlugin, wt as 
callProviderWithContext, x as makeInlinePolicyIdSync, xt as processFileReference, y as determinePolicyTypeFromId, yt as getFinalTest, z as retryWithDeduplication } from "./graders-BoUqsCEm.js";
30
+ import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-o8S5huU2.js";
31
+ import { t as EvalResult } from "./evalResult-pSvGWFMo.js";
31
32
  import * as fs$2 from "fs";
32
33
  import fs, { createWriteStream } from "fs";
33
34
  import * as path$2 from "path";
@@ -35,26 +36,26 @@ import path, { parse } from "path";
35
36
  import async from "async";
36
37
  import yaml from "js-yaml";
37
38
  import { AsyncResource } from "node:async_hooks";
38
- import { resolve } from "node:path";
39
- import { fileURLToPath } from "node:url";
40
39
  import chalk from "chalk";
41
40
  import * as os$1 from "os";
42
41
  import os from "os";
43
- import util from "util";
44
42
  import dedent from "dedent";
45
- import * as fsPromises from "fs/promises";
46
- import { globSync } from "glob";
47
43
  import { z } from "zod";
48
- import { parse as parse$1 } from "csv-parse/sync";
49
- import { XMLParser } from "fast-xml-parser";
44
+ import * as fsPromises from "fs/promises";
45
+ import util from "util";
46
+ import input from "@inquirer/input";
47
+ import { resolve } from "node:path";
48
+ import { fileURLToPath } from "node:url";
50
49
  import crypto$1, { createHash, randomBytes } from "crypto";
51
50
  import { DiagConsoleLogger, DiagLogLevel, diag, propagation } from "@opentelemetry/api";
52
- import input from "@inquirer/input";
53
51
  import readline from "readline";
52
+ import { parse as parse$1 } from "csv-parse/sync";
53
+ import { globSync } from "glob";
54
54
  import { and, desc, eq, inArray, sql } from "drizzle-orm";
55
+ import { XMLParser } from "fast-xml-parser";
55
56
  import cliProgress from "cli-progress";
56
57
  import { URL } from "url";
57
- import { JSDOM } from "jsdom";
58
+ import { parse as parse$2 } from "parse5";
58
59
  import { distance } from "fastest-levenshtein";
59
60
  import * as rouge from "js-rouge";
60
61
  import { isDeepStrictEqual } from "node:util";
@@ -242,6 +243,502 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
242
243
  };
243
244
  };
244
245
  //#endregion
246
+ //#region src/matchers/classification.ts
247
+ /**
248
+ *
249
+ * @param expected Expected classification. If undefined, matches any classification.
250
+ * @param output Text to classify.
251
+ * @param threshold Value between 0 and 1. If the expected classification is undefined, the threshold is the minimum score for any classification. If the expected classification is defined, the threshold is the minimum score for that classification.
252
+ * @param grading
253
+ * @returns Pass if the output matches the classification with a score greater than or equal to the threshold.
254
+ */
255
+ async function matchesClassification(expected, output, threshold, grading) {
256
+ const resp = await (await getAndCheckProvider("classification", grading?.provider, null, "classification check")).callClassificationApi(output);
257
+ if (!resp.classification) return fail(resp.error || "Unknown error fetching classification");
258
+ let score;
259
+ if (expected === void 0) {
260
+ const scores = Object.values(resp.classification);
261
+ if (scores.length === 0) return {
262
+ pass: false,
263
+ score: 0,
264
+ reason: "No classification scores returned"
265
+ };
266
+ score = Math.max(...scores);
267
+ } else score = resp.classification[expected] || 0;
268
+ if (score >= threshold - Number.EPSILON) {
269
+ const reason = expected === void 0 ? `Maximum classification score ${score.toFixed(2)} >= ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} >= ${threshold}`;
270
+ return {
271
+ pass: true,
272
+ score,
273
+ reason
274
+ };
275
+ }
276
+ return {
277
+ pass: false,
278
+ score,
279
+ reason: expected === void 0 ? `Maximum classification score ${score.toFixed(2)} < ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} < ${threshold}`
280
+ };
281
+ }
282
+ //#endregion
283
+ //#region src/matchers/comparison.ts
284
+ async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
285
+ invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");
286
+ const resp = await callProviderWithContext(await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "select-best check"), await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, SELECT_BEST_PROMPT), {
287
+ criteria,
288
+ outputs: outputs.map((o) => tryParse(o)),
289
+ ...vars || {}
290
+ }), "select-best", {
291
+ criteria,
292
+ outputs: outputs.map((o) => tryParse(o)),
293
+ ...vars || {}
294
+ }, providerCallContext);
295
+ if (resp.error || !resp.output) return Array.from({ length: outputs.length }, () => fail(resp.error || "No output", resp.tokenUsage));
296
+ invariant(typeof resp.output === "string", "select-best produced malformed response");
297
+ const firstIntegerMatch = resp.output.trim().match(/\d+/);
298
+ const verdict = firstIntegerMatch ? Number.parseInt(firstIntegerMatch[0], 10) : NaN;
299
+ if (Number.isNaN(verdict) || verdict < 0 || verdict >= outputs.length) return Array.from({ length: outputs.length }, () => fail(`Invalid select-best verdict: ${verdict}`, resp.tokenUsage));
300
+ const tokensUsed = normalizeMatcherTokenUsage(resp.tokenUsage);
301
+ return outputs.map((_output, index) => {
302
+ if (index === verdict) return {
303
+ pass: true,
304
+ score: 1,
305
+ reason: `Output selected as the best: ${criteria}`,
306
+ tokensUsed
307
+ };
308
+ else return {
309
+ pass: false,
310
+ score: 0,
311
+ reason: `Output not selected: ${criteria}`,
312
+ tokensUsed
313
+ };
314
+ });
315
+ }
316
+ async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
317
+ invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
318
+ const value = assertion.value || {};
319
+ const options = {
320
+ method: typeof value === "object" && "method" in value ? value.method : "average",
321
+ weights: typeof value === "object" && "weights" in value ? value.weights : {},
322
+ threshold: typeof value === "object" && "threshold" in value ? value.threshold : void 0
323
+ };
324
+ const scores = resultsWithGradingResults.map((result, index) => {
325
+ const relevantResults = (result.gradingResult?.componentResults || []).filter((r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best");
326
+ if (relevantResults.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
327
+ let totalWeightedScore = 0;
328
+ let totalWeight = 0;
329
+ relevantResults.forEach((componentResult) => {
330
+ const assertionType = componentResult.assertion?.type || "unknown";
331
+ const weight = options.weights[assertionType] === void 0 ? 1 : options.weights[assertionType];
332
+ const score = componentResult.score || 0;
333
+ totalWeightedScore += score * weight;
334
+ totalWeight += weight;
335
+ });
336
+ let aggregateScore;
337
+ if (options.method === "sum") aggregateScore = totalWeightedScore;
338
+ else aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
339
+ return {
340
+ index,
341
+ score: aggregateScore,
342
+ componentCount: relevantResults.length,
343
+ totalWeight
344
+ };
345
+ });
346
+ let maxScore = -Infinity;
347
+ let winnerIndex = 0;
348
+ for (let i = 0; i < scores.length; i++) if (scores[i].score > maxScore) {
349
+ maxScore = scores[i].score;
350
+ winnerIndex = i;
351
+ }
352
+ const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;
353
+ return scores.map(({ index, score, componentCount, totalWeight }) => {
354
+ const isWinner = index === winnerIndex && meetsThreshold;
355
+ return {
356
+ pass: isWinner,
357
+ score: isWinner ? 1 : 0,
358
+ reason: isWinner ? `Selected as highest scoring output (score: ${score.toFixed(3)})` : score === maxScore && !meetsThreshold ? `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}` : `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`,
359
+ namedScores: {
360
+ maxScore: score,
361
+ assertionCount: componentCount,
362
+ totalWeight
363
+ }
364
+ };
365
+ });
366
+ }
367
+ //#endregion
368
+ //#region src/matchers/moderation.ts
369
+ async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
370
+ if (!assistantResponse) return {
371
+ pass: true,
372
+ score: 1,
373
+ reason: "No output to moderate"
374
+ };
375
+ const defaultProviders = await getDefaultProviders();
376
+ const defaultModerationProvider = !getEnvString("OPENAI_API_KEY") && (getEnvString("REPLICATE_API_KEY") || getEnvString("REPLICATE_API_TOKEN")) ? await loadApiProvider(LLAMA_GUARD_REPLICATE_PROVIDER) : defaultProviders.moderationProvider;
377
+ const moderationProvider = await getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
378
+ invariant(moderationProvider, "Moderation provider must be defined");
379
+ const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
380
+ if (resp.error) return {
381
+ pass: false,
382
+ score: 0,
383
+ reason: `Moderation API error: ${resp.error}`
384
+ };
385
+ const { flags } = resp;
386
+ if (!flags || flags.length === 0) return {
387
+ pass: true,
388
+ score: 1,
389
+ reason: "No moderation flags detected"
390
+ };
391
+ const filteredFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
392
+ if (filteredFlags.length > 0) return {
393
+ pass: false,
394
+ score: 0,
395
+ reason: `Moderation flags detected: ${filteredFlags.map((flag) => flag.description).join(", ")}`
396
+ };
397
+ return {
398
+ pass: true,
399
+ score: 1,
400
+ reason: "No relevant moderation flags detected"
401
+ };
402
+ }
403
+ //#endregion
404
+ //#region src/assertions/contextUtils.ts
405
+ /**
406
+ * Resolves the context value for context-based assertions.
407
+ * Supports extracting context from test variables or transforming from output.
408
+ * Can return either a single context string or an array of context chunks.
409
+ *
410
+ * @param assertion - The assertion configuration
411
+ * @param test - The test case
412
+ * @param output - The provider output (after provider transform, before test transform)
413
+ * @param prompt - The prompt text
414
+ * @param fallbackContext - Optional fallback context (e.g., prompt for context-recall)
415
+ * @param providerResponse - Optional full provider response for contextTransform
416
+ * @returns The resolved context string or array of strings
417
+ * @throws Error if context cannot be resolved or transform fails
418
+ */
419
+ async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
420
+ let contextValue;
421
+ if (test.vars?.context) {
422
+ if (typeof test.vars.context === "string") contextValue = test.vars.context;
423
+ else if (Array.isArray(test.vars.context)) {
424
+ const invalidEntry = [...test.vars.context.entries()].find(([, v]) => typeof v !== "string");
425
+ if (invalidEntry) {
426
+ const [idx, val] = invalidEntry;
427
+ invariant(false, `Invalid context: expected an array of strings, but found ${typeof val} at index ${idx}`);
428
+ }
429
+ contextValue = test.vars.context;
430
+ }
431
+ } else if (fallbackContext) contextValue = fallbackContext;
432
+ if (assertion.contextTransform) try {
433
+ const outputForTransform = providerResponse?.providerTransformedOutput ?? output;
434
+ const transformed = await transform(assertion.contextTransform, outputForTransform, {
435
+ vars: test.vars,
436
+ prompt: { label: prompt },
437
+ ...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
438
+ });
439
+ invariant(typeof transformed === "string" || Array.isArray(transformed) && transformed.every((item) => typeof item === "string"), `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${assertion.contextTransform}`);
440
+ contextValue = transformed;
441
+ } catch (error) {
442
+ throw new Error(`Failed to transform context using expression '${assertion.contextTransform}': ${error instanceof Error ? error.message : String(error)}`);
443
+ }
444
+ invariant(typeof contextValue === "string" && contextValue.length > 0 || Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every((item) => typeof item === "string" && item.length > 0), "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
445
+ return contextValue;
446
+ }
447
+ /**
448
+ * Serializes context (string or string[]) to a single string for prompts.
449
+ * Joins chunks with double newlines to preserve separation.
450
+ */
451
+ function serializeContext(context) {
452
+ return Array.isArray(context) ? context.join("\n\n") : context;
453
+ }
454
+ //#endregion
455
+ //#region src/matchers/rag.ts
456
/**
 * Grades answer relevance: asks the text provider for three candidate
 * questions derived from the output, embeds the original input and each
 * candidate, and averages the cosineSimilarity of every candidate against the
 * input.
 *
 * @param input - Original query; passed to the embedding provider.
 * @param output - Model output; run through tryParse before prompt rendering.
 * @param threshold - Minimum average similarity (compared with an EPSILON tolerance).
 * @param grading - Optional grading config (`provider`, `rubricPrompt`).
 * @param providerCallContext - Forwarded to callProviderWithContext.
 * @returns A grading result with `pass`, `score`, `reason`, `tokensUsed` and
 *   `metadata`, or the value of `fail(...)` when any provider call reports an
 *   error or returns nothing.
 */
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
  const defaults = await getDefaultProviders();
  const embeddingProvider = await getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "answer relevancy check");
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "answer relevancy check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, ANSWER_RELEVANCY_GENERATE);
  const parsedOutput = tryParse(output);
  const promptText = await renderLlmRubricPrompt(rubricPrompt, { answer: parsedOutput });

  // The same prompt is sent three times, one call after another; any failed
  // generation aborts the whole check.
  const candidateQuestionCount = 3;
  const candidateQuestions = [];
  for (let attempt = 0; attempt < candidateQuestionCount; attempt++) {
    const generation = await callProviderWithContext(textProvider, promptText, "answer-relevance", { answer: parsedOutput }, providerCallContext);
    accumulateTokenUsage(tokensUsed, generation.tokenUsage);
    if (generation.error || !generation.output) {
      return fail(generation.error || "No output", tokensUsed);
    }
    invariant(typeof generation.output === "string", "answer relevancy check produced malformed response");
    candidateQuestions.push(generation.output);
  }

  invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id()} must implement callEmbeddingApi for similarity check`);
  const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
  accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
  if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) {
    return fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
  }
  const inputEmbedding = inputEmbeddingResp.embedding;

  const questionsWithScores = [];
  let similarityTotal = 0;
  for (const question of candidateQuestions) {
    const questionEmbeddingResp = await embeddingProvider.callEmbeddingApi(question);
    accumulateTokenUsage(tokensUsed, questionEmbeddingResp.tokenUsage);
    if (questionEmbeddingResp.error || !questionEmbeddingResp.embedding) {
      return fail(questionEmbeddingResp.error || "No embedding", tokensUsed);
    }
    const questionSimilarity = cosineSimilarity(inputEmbedding, questionEmbeddingResp.embedding);
    similarityTotal += questionSimilarity;
    questionsWithScores.push({
      question,
      similarity: questionSimilarity
    });
  }

  const similarity = similarityTotal / questionsWithScores.length;
  const pass = similarity >= threshold - Number.EPSILON;
  const reason = pass
    ? `Relevance ${similarity.toFixed(2)} is greater than threshold ${threshold}`
    : `Relevance ${similarity.toFixed(2)} is less than threshold ${threshold}`;
  return {
    pass,
    score: similarity,
    reason,
    tokensUsed,
    metadata: {
      generatedQuestions: questionsWithScores,
      averageSimilarity: similarity,
      threshold
    }
  };
}
514
/**
 * Grades context recall: asks the text provider to label ground-truth
 * sentences, then scores the share of labelled sentences that carry the
 * "attributed" token and not the "not attributed" token.
 *
 * @param context - Context string or array of chunks (serialized for the prompt).
 * @param groundTruth - Expected answer, forwarded to the prompt as `groundTruth`.
 * @param threshold - Minimum recall (compared with an EPSILON tolerance).
 * @param grading - Optional grading config (`provider`, `rubricPrompt`).
 * @param vars - Extra template variables; spread last, so they can override
 *   `context` and `groundTruth`.
 * @param providerCallContext - Forwarded to callProviderWithContext.
 * @returns A grading result with per-sentence attribution metadata, or the
 *   value of `fail(...)` when the provider errors or returns nothing.
 */
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
  const defaults = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "context recall check");
  const contextString = serializeContext(context);
  // A fresh object per use, so the render call and the provider call never share one.
  const buildPromptVars = () => ({
    context: contextString,
    groundTruth,
    ...vars || {}
  });
  const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RECALL);
  const promptText = await renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
  const resp = await callProviderWithContext(textProvider, promptText, "context-recall", buildPromptVars(), providerCallContext);
  if (resp.error || !resp.output) {
    return fail(resp.error || "No output", resp.tokenUsage);
  }
  invariant(typeof resp.output === "string", "context-recall produced malformed response");

  const attributedTokenLower = CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
  const notAttributedTokenLower = CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
  // Only lines that mention one of the two verdict tokens count as sentences.
  const sentences = splitIntoSentences(resp.output).filter((line) => {
    const lowerLine = line.toLowerCase();
    return lowerLine.includes(attributedTokenLower) || lowerLine.includes(notAttributedTokenLower);
  });

  const sentenceAttributions = sentences.map((sentence) => {
    const lowerSentence = sentence.toLowerCase();
    // The "not attributed" token wins when both tokens appear in one sentence.
    const attributed = !lowerSentence.includes(notAttributedTokenLower) && lowerSentence.includes(attributedTokenLower);
    // Prefer the text after a leading "N." list marker; otherwise keep the
    // text before the first period.
    const numbered = sentence.match(/^\d+\.\s*([^\.]+\.)/);
    const cleanSentence = numbered ? numbered[1].trim() : sentence.split(".")[0].trim();
    return {
      sentence: cleanSentence,
      attributed
    };
  });
  const numerator = sentenceAttributions.filter((entry) => entry.attributed).length;

  const score = sentences.length > 0 ? numerator / sentences.length : 0;
  const pass = score >= threshold - Number.EPSILON;
  return {
    pass,
    score,
    reason: pass ? `Recall ${score.toFixed(2)} is >= ${threshold}` : `Recall ${score.toFixed(2)} is < ${threshold}`,
    tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata: {
      sentenceAttributions,
      totalSentences: sentences.length,
      attributedSentences: numerator,
      score
    }
  };
}
563
/**
 * Grades context relevance: asks the text provider to extract the context
 * sentences relevant to the question, then scores the number of unique
 * extracted sentences (capped at the number of context units) over the number
 * of context units.
 *
 * @param question - Query forwarded to the prompt as `query`.
 * @param context - Context string, or array of chunks. Arrays are counted per
 *   non-blank chunk; strings are counted per sentence.
 * @param threshold - Minimum relevance (compared with an EPSILON tolerance).
 * @param grading - Optional grading config (`provider`, `rubricPrompt`).
 * @param providerCallContext - Forwarded to callProviderWithContext.
 * @returns A grading result with extraction metadata, or the value of
 *   `fail(...)` when the provider errors or returns nothing.
 */
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
  const defaults = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "context relevance check");
  const contextString = serializeContext(context);
  // A fresh object per use, so the render call and the provider call never share one.
  const buildPromptVars = () => ({
    context: contextString,
    query: question
  });
  const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RELEVANCE);
  const promptText = await renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
  const resp = await callProviderWithContext(textProvider, promptText, "context-relevance", buildPromptVars(), providerCallContext);
  if (resp.error || !resp.output) {
    return fail(resp.error || "No output", resp.tokenUsage);
  }
  invariant(typeof resp.output === "string", "context-relevance produced malformed response");

  const contextUnits = Array.isArray(context)
    ? context.filter((chunk) => chunk.trim().length > 0)
    : splitIntoSentences(context);
  const totalContextUnits = contextUnits.length;
  const extractedSentences = splitIntoSentences(resp.output);
  // When the response contains the CONTEXT_RELEVANCE_BAD marker, nothing is
  // counted as relevant.
  const insufficientInformation = resp.output.includes(CONTEXT_RELEVANCE_BAD);
  const relevantSentences = insufficientInformation ? [] : [...new Set(extractedSentences)];
  const numerator = insufficientInformation ? 0 : Math.min(relevantSentences.length, totalContextUnits);

  const score = totalContextUnits > 0 ? numerator / totalContextUnits : 0;
  const pass = score >= threshold - Number.EPSILON;
  return {
    pass,
    score,
    reason: pass ? `Context relevance ${score.toFixed(2)} is >= ${threshold}` : `Context relevance ${score.toFixed(2)} is < ${threshold}`,
    tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata: {
      extractedSentences: relevantSentences,
      totalContextUnits,
      totalContextSentences: totalContextUnits,
      contextUnits,
      relevantSentenceCount: numerator,
      insufficientInformation,
      score
    }
  };
}
606
/**
 * Grades faithfulness of an answer to its context in two provider calls:
 *  1. "context-faithfulness-longform" - the response is split into statements.
 *  2. "context-faithfulness-nli" - the provider gives a verdict per statement
 *     against the context.
 * The score is 1 minus the share of statements without a "yes" verdict,
 * clamped to [0, 1].
 *
 * @param query - Question forwarded to the longform prompt.
 * @param output - Model output; run through tryParse for the longform prompt.
 * @param context - Context string or array of chunks (serialized for the NLI prompt).
 * @param threshold - Minimum faithfulness (compared with an EPSILON tolerance).
 * @param grading - Optional grading config. `rubricPrompt`, when set, must be
 *   an array: entry 0 overrides the longform prompt, entry 1 the NLI prompt
 *   (each a string or an object with `content`).
 * @param vars - Extra template variables; spread last in both prompts.
 * @param providerCallContext - Forwarded to callProviderWithContext.
 * @returns A grading result with `pass`, `score`, `reason`, `tokensUsed`, or
 *   the value of `fail(...)` when either provider call errors or returns nothing.
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
  const defaults = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "faithfulness check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  if (grading?.rubricPrompt) {
    invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");
  }
  // A rubric entry is either a bare string or an object with `content`.
  const promptEntryText = (entry) => typeof entry === "string" ? entry : entry?.content;
  const longformPrompt = await loadRubricPrompt(promptEntryText(grading?.rubricPrompt?.[0]), CONTEXT_FAITHFULNESS_LONGFORM);
  const nliPrompt = await loadRubricPrompt(promptEntryText(grading?.rubricPrompt?.[1]), CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

  // Step 1: get the longform response that is split into statements below.
  const buildLongformVars = () => ({
    question: query,
    answer: tryParse(output),
    ...vars || {}
  });
  const longformText = await renderLlmRubricPrompt(longformPrompt, buildLongformVars());
  const longformResp = await callProviderWithContext(textProvider, longformText, "context-faithfulness-longform", buildLongformVars(), providerCallContext);
  accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
  if (longformResp.error || !longformResp.output) {
    return fail(longformResp.error || "No output", tokensUsed);
  }
  invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

  // Step 2: ask for a verdict on each statement against the context.
  const contextString = serializeContext(context);
  const statements = splitIntoSentences(longformResp.output);
  const buildNliVars = () => ({
    context: contextString,
    statements,
    ...vars || {}
  });
  const nliText = await renderLlmRubricPrompt(nliPrompt, buildNliVars());
  const nliResp = await callProviderWithContext(textProvider, nliText, "context-faithfulness-nli", buildNliVars(), providerCallContext);
  accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
  if (nliResp.error || !nliResp.output) {
    return fail(nliResp.error || "No output", tokensUsed);
  }
  invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

  const finalAnswer = "Final verdict for each statement in order:".toLowerCase();
  const verdicts = nliResp.output.toLowerCase().trim();
  let score = 0;
  if (statements.length > 0) {
    const finalAnswerAt = verdicts.indexOf(finalAnswer);
    if (finalAnswerAt !== -1) {
      // Summary form: period-separated verdicts after the marker sentence.
      const parsedVerdicts = verdicts
        .slice(finalAnswerAt + finalAnswer.length)
        .split(".")
        .filter((answer) => answer.trim() !== "");
      if (parsedVerdicts.length > 0) {
        const unsupportedCount = parsedVerdicts.filter((answer) => !answer.includes("yes")).length;
        score = 1 - unsupportedCount / statements.length;
      }
    } else {
      // Fallback form: count inline "verdict: no" / "verdict: yes" markers.
      const noVerdictCount = verdicts.split("verdict: no").length - 1;
      const yesVerdictCount = verdicts.split("verdict: yes").length - 1;
      if (noVerdictCount + yesVerdictCount > 0) {
        score = 1 - noVerdictCount / statements.length;
      }
    }
  }
  score = Math.min(1, Math.max(0, score));
  const pass = score >= threshold - Number.EPSILON;
  return {
    pass,
    score,
    reason: pass ? `Faithfulness ${score.toFixed(2)} is >= ${threshold}` : `Faithfulness ${score.toFixed(2)} is < ${threshold}`,
    tokensUsed
  };
}
663
+ //#endregion
664
+ //#region src/matchers/similarity.ts
665
/**
 * Dispatches to the comparison helper that matches `metric`.
 *
 * @param expectedEmbedding - Embedding of the expected text.
 * @param outputEmbedding - Embedding of the output text.
 * @param metric - "cosine", "dot_product" or "euclidean".
 * @param tokensUsed - Token usage attached to the failure result.
 * @returns The helper's result for a supported metric; otherwise the value of
 *   `fail(...)` describing the unsupported metric.
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
  if (metric === "cosine") {
    return cosineSimilarity(expectedEmbedding, outputEmbedding);
  }
  if (metric === "dot_product") {
    return dotProduct(expectedEmbedding, outputEmbedding);
  }
  if (metric === "euclidean") {
    return euclideanDistance(expectedEmbedding, outputEmbedding);
  }
  return fail(`Unsupported metric: ${metric}`, tokensUsed);
}
673
/**
 * Turns a raw similarity (or distance) into a grading result.
 *
 * For "euclidean" the value is a distance: smaller is better, and the score is
 * normalized as 1 / (1 + distance). For every other metric the value is used
 * as the score directly. `inverse` flips both the pass condition and the score.
 *
 * @param {number} similarity - Similarity, or distance when metric is "euclidean".
 * @param {number} threshold - Pass boundary (compared with an EPSILON tolerance).
 * @param {boolean} inverse - True for the negated form of the assertion.
 * @param {string} metric - Metric name; only "euclidean" is special-cased.
 * @param tokensUsed - Token usage copied onto the result.
 * @returns {{pass: boolean, score: number, reason: string, tokensUsed: *}}
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
  if (metric === "euclidean") {
    const distance = similarity;
    const withinThreshold = distance <= threshold + Number.EPSILON;
    const beyondThreshold = distance >= threshold - Number.EPSILON;
    const pass = inverse ? beyondThreshold : withinThreshold;
    const normalizedScore = 1 / (1 + distance);
    const belowThresholdReason = `Distance ${distance.toFixed(2)} is less than or equal to threshold ${threshold}`;
    const aboveThresholdReason = `Distance ${distance.toFixed(2)} is greater than threshold ${threshold}`;
    let reason;
    if (pass) {
      reason = inverse ? aboveThresholdReason : belowThresholdReason;
    } else {
      reason = inverse ? belowThresholdReason : aboveThresholdReason;
    }
    return {
      pass,
      score: inverse ? 1 - normalizedScore : normalizedScore,
      reason,
      tokensUsed
    };
  }

  const meetsThreshold = similarity >= threshold - Number.EPSILON;
  const atOrUnderThreshold = similarity <= threshold + Number.EPSILON;
  const pass = inverse ? atOrUnderThreshold : meetsThreshold;
  const greaterThanReason = `Similarity ${similarity.toFixed(2)} is greater than or equal to threshold ${threshold}`;
  const lessThanReason = `Similarity ${similarity.toFixed(2)} is less than threshold ${threshold}`;
  let reason;
  if (pass) {
    reason = inverse ? lessThanReason : greaterThanReason;
  } else {
    reason = inverse ? greaterThanReason : lessThanReason;
  }
  return {
    pass,
    score: inverse ? 1 - similarity : similarity,
    reason,
    tokensUsed
  };
}
699
/**
 * Gets a similarity value from an embedding provider.
 *
 * For the "cosine" metric, a provider exposing `callSimilarityApi` is asked
 * directly. Otherwise both texts are embedded with `callEmbeddingApi` and
 * compared with calculateSimilarityScore.
 *
 * @param finalProvider - Provider exposing `callSimilarityApi` and/or `callEmbeddingApi`.
 * @param expected - Expected text.
 * @param output - Actual output text.
 * @param metric - Similarity metric name.
 * @param tokensUsed - Accumulator that receives the token usage of every call.
 * @returns A number on success; otherwise the value of `fail(...)`.
 * @throws {Error} When the provider implements neither API.
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
  const supportsSimilarityApi = "callSimilarityApi" in finalProvider;
  if (metric === "cosine" && supportsSimilarityApi) {
    const { error, similarity, tokenUsage } = await finalProvider.callSimilarityApi(expected, output);
    accumulateTokenUsage(tokensUsed, tokenUsage);
    if (error) {
      return fail(error, tokensUsed);
    }
    if (similarity == null) {
      return fail("Unknown error fetching similarity", tokensUsed);
    }
    if (!Number.isFinite(similarity)) {
      return fail(`Invalid similarity score: ${similarity}`, tokensUsed);
    }
    return similarity;
  }

  const embed = "callEmbeddingApi" in finalProvider ? finalProvider.callEmbeddingApi : void 0;
  if (typeof embed !== "function") {
    if (supportsSimilarityApi) {
      return fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
    }
    throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
  }

  // Both embeddings are requested concurrently; the method is invoked with the
  // provider as `this`.
  const [expectedEmbedding, outputEmbedding] = await Promise.all(
    [expected, output].map((text) => embed.call(finalProvider, text))
  );
  // Merge the two usages first, then fold the merged total into the caller's accumulator.
  const mergedUsage = normalizeMatcherTokenUsage(void 0);
  accumulateTokenUsage(mergedUsage, expectedEmbedding.tokenUsage);
  accumulateTokenUsage(mergedUsage, outputEmbedding.tokenUsage);
  accumulateTokenUsage(tokensUsed, mergedUsage);

  const embeddingError = expectedEmbedding.error || outputEmbedding.error;
  if (embeddingError) {
    return fail(embeddingError || "Unknown error fetching embeddings", tokensUsed);
  }
  if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
    return fail("Embedding not found", tokensUsed);
  }
  return calculateSimilarityScore(expectedEmbedding.embedding, outputEmbedding.embedding, metric, tokensUsed);
}
722
/**
 * Grades how similar `output` is to `expected`.
 *
 * Cosine checks in a redteam configuration are delegated to remote grading
 * when shouldGenerateRemote allows it. Everything else runs against the
 * configured (or default) embedding provider.
 *
 * @param expected - Expected text.
 * @param output - Actual output text.
 * @param threshold - Pass boundary for the chosen metric.
 * @param inverse - True for the negated form of the assertion.
 * @param grading - Optional grading config (`provider`).
 * @param metric - Similarity metric name; defaults to "cosine".
 * @returns A grading result, or the value of `fail(...)` when remote grading
 *   throws or the provider cannot produce a similarity.
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
  const useRemoteGrading = metric === "cosine" && state.config?.redteam && shouldGenerateRemote({ requireEmbeddingProvider: true });
  if (useRemoteGrading) {
    try {
      return await doRemoteGrading({
        task: "similar",
        expected,
        output,
        threshold,
        inverse
      });
    } catch (error) {
      return fail(`Could not perform remote grading: ${error}`);
    }
  }

  const defaults = await getDefaultProviders();
  const finalProvider = await getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "similarity check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  const similarity = await calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed);
  // A non-number here is a failure result produced upstream; pass it through.
  if (typeof similarity !== "number") {
    return similarity;
  }
  return buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed);
}
741
+ //#endregion
245
742
  //#region src/tracing/evaluatorTracing.ts
246
743
  let otlpReceiverStarted = false;
247
744
  const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
@@ -285,7 +782,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
285
782
  telemetry.record("feature_used", { feature: "tracing" });
286
783
  try {
287
784
  logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
288
- const { startOTLPReceiver } = await import("./otlpReceiver-B2z58l4e.js");
785
+ const { startOTLPReceiver } = await import("./otlpReceiver-CcdIikOu.js");
289
786
  const port = testSuite.tracing.otlp.http.port || 4318;
290
787
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
291
788
  const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
@@ -309,7 +806,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
309
806
  async function stopOtlpReceiverIfNeeded() {
310
807
  if (otlpReceiverStarted) try {
311
808
  logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
312
- const { stopOTLPReceiver } = await import("./otlpReceiver-B2z58l4e.js");
809
+ const { stopOTLPReceiver } = await import("./otlpReceiver-CcdIikOu.js");
313
810
  await stopOTLPReceiver();
314
811
  otlpReceiverStarted = false;
315
812
  logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -344,7 +841,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
344
841
  }
345
842
  if (!tracingEnabled) return null;
346
843
  logger.debug("[EvaluatorTracing] Importing trace store");
347
- const { getTraceStore } = await import("./store-M0b1WfYb.js").then((n) => n.n);
844
+ const { getTraceStore } = await import("./store-BPkzEyFM.js").then((n) => n.n);
348
845
  const traceStore = getTraceStore();
349
846
  const traceId = generateTraceId();
350
847
  const spanId = generateSpanId();
@@ -654,38 +1151,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
654
1151
  }
655
1152
  //#endregion
656
1153
  //#region src/assertions/contains.ts
1154
/**
 * Moves past any run of whitespace and commas that separates two fields.
 *
 * Repeated commas are consumed together, so `a,,b` never yields an empty
 * field between `a` and `b`.
 *
 * @param {string} value - Full assertion value being parsed.
 * @param {number} startIndex - Index to start scanning from.
 * @returns {number} Index of the first character that is neither whitespace
 *   nor a comma (or an index at or past the end of `value`).
 */
function skipWhitespaceAndCommas(value, startIndex) {
  let cursor = startIndex;
  for (; cursor < value.length; cursor++) {
    cursor = skipWhitespace(value, cursor);
    if (value[cursor] !== ",") {
      break;
    }
  }
  return cursor;
}
1169
/**
 * Moves past consecutive whitespace characters only; commas are left in place
 * so the caller can decide how to treat the delimiter.
 *
 * @param {string} value - Full assertion value being parsed.
 * @param {number} startIndex - Index to start scanning from.
 * @returns {number} Index of the first non-whitespace character at or after
 *   `startIndex`, or `startIndex` unchanged when nothing is skipped.
 */
function skipWhitespace(value, startIndex) {
  let cursor = startIndex;
  for (; cursor < value.length; cursor++) {
    if (!/\s/.test(value[cursor])) {
      break;
    }
  }
  return cursor;
}
1177
/**
 * Reads one double-quoted field, starting at its opening quote.
 *
 * Escapes recognized inside the quotes:
 *  - backslash + quote     -> literal quote
 *  - backslash + backslash -> literal backslash
 *  - two quotes in a row   -> literal quote
 * Any other character, including a lone backslash, is copied as-is.
 *
 * @param {string} value - Full assertion value being parsed.
 * @param {number} startIndex - Index of the opening quote.
 * @returns {{field: string, nextIndex: number}} The unescaped field and the
 *   index just past the closing quote.
 * @throws Via `invariant` when the input ends before a closing quote.
 */
function parseQuotedField(value, startIndex) {
  const pieces = [];
  let cursor = startIndex + 1; // step over the opening quote
  let terminated = false;
  while (cursor < value.length && !terminated) {
    const current = value[cursor];
    const next = value[cursor + 1]; // undefined on the last character
    if (current === "\\" && (next === "\"" || next === "\\")) {
      pieces.push(next);
      cursor += 2;
    } else if (current === "\"" && next === "\"") {
      pieces.push("\"");
      cursor += 2;
    } else if (current === "\"") {
      cursor += 1;
      terminated = true;
    } else {
      pieces.push(current);
      cursor += 1;
    }
  }
  invariant(terminated, "Unterminated quoted field in contains assertion value");
  return {
    field: pieces.join(""),
    nextIndex: cursor
  };
}
1207
/**
 * Reads one unquoted field: everything from `startIndex` up to, but not
 * including, the next comma (or the end of the input).
 *
 * @param {string} value - Full assertion value being parsed.
 * @param {number} startIndex - Index of the first character of the field.
 * @returns {{field: string, nextIndex: number}} The field with surrounding
 *   whitespace trimmed, and the index of the terminating comma (or the end
 *   position when there is none).
 */
function parseUnquotedField(value, startIndex) {
  const commaAt = value.indexOf(",", startIndex);
  // With no comma ahead, stop at the end of the input, but never move
  // backwards when startIndex is already past the end.
  const endIndex = commaAt === -1 ? Math.max(startIndex, value.length) : commaAt;
  return {
    field: value.substring(startIndex, endIndex).trim(),
    nextIndex: endIndex
  };
}
1218
+ /**
1219
+ * Split a contains-any string into fields while preserving quoted commas.
1220
+ */
657
1221
  function parseCommaSeparatedValues(value) {
658
1222
  const results = [];
659
1223
  let i = 0;
660
1224
  while (i < value.length) {
661
- while (i < value.length && /\s/.test(value[i])) i++;
1225
+ i = skipWhitespaceAndCommas(value, i);
662
1226
  if (i >= value.length) break;
663
- if (value[i] === ",") {
664
- i++;
665
- continue;
666
- }
667
- if (value[i] === "\"") {
668
- i++;
669
- let field = "";
670
- while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
671
- field += value[i + 1];
672
- i += 2;
673
- } else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
674
- field += "\"";
675
- i += 2;
676
- } else if (value[i] === "\"") {
677
- i++;
678
- break;
679
- } else {
680
- field += value[i];
681
- i++;
682
- }
683
- results.push(field);
684
- } else {
685
- const start = i;
686
- while (i < value.length && value[i] !== ",") i++;
687
- results.push(value.substring(start, i).trim());
688
- }
1227
+ const isQuotedField = value[i] === "\"";
1228
+ const parsed = isQuotedField ? parseQuotedField(value, i) : parseUnquotedField(value, i);
1229
+ results.push(parsed.field);
1230
+ i = isQuotedField ? skipWhitespace(value, parsed.nextIndex) : parsed.nextIndex;
1231
+ invariant(!isQuotedField || i >= value.length || value[i] === ",", "Expected comma after quoted field in contains assertion value");
689
1232
  }
690
1233
  return results;
691
1234
  }
@@ -1090,6 +1633,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
1090
1633
  };
1091
1634
  //#endregion
1092
1635
  //#region src/assertions/html.ts
1636
/**
 * Patterns that detect a literal `<html`, `<head>` or `<body` opening tag in
 * the raw input text.
 *
 * The lookahead requires whitespace, `>` or `/` right after the tag name, so
 * longer names that merely start with it (for example `<header`) do not match.
 * The patterns are case-sensitive; the visible callers test them against
 * lower-cased input.
 */
const LITERAL_WRAPPER_PATTERNS = {
  html: /<html(?=[\s>/])/,
  head: /<head(?=[\s>/])/,
  body: /<body(?=[\s>/])/
};
1641
/**
 * Tells whether a tag name is one of the document wrapper elements.
 *
 * @param {string} tagName - Tag name; the comparison is exact (case-sensitive).
 * @returns {boolean} True for "html", "head" or "body".
 */
function isWrapperTagName(tagName) {
  return ["html", "head", "body"].includes(tagName);
}
1644
/**
 * Tells whether a parsed node is a text node, identified by a `nodeName` of
 * "#text".
 *
 * @param {{nodeName: string}} node - Node to inspect.
 * @returns {boolean}
 */
function isTextNode(node) {
  const { nodeName } = node;
  return nodeName === "#text";
}
1647
/**
 * Tells whether a parsed node is an element, identified by the presence of a
 * `tagName` property (own or inherited).
 *
 * @param {object} node - Node to inspect.
 * @returns {boolean}
 */
function isElementNode(node) {
  const carriesTagName = "tagName" in node;
  return carriesTagName;
}
1650
/**
 * Tells whether an element carries source location info, i.e. a
 * `sourceCodeLocation` property that is neither null nor undefined.
 *
 * @param {object} element - Parsed element to inspect.
 * @returns {boolean}
 */
function hasSourceCodeLocation(element) {
  if (!("sourceCodeLocation" in element)) {
    return false;
  }
  return element.sourceCodeLocation != null;
}
1653
/**
 * Returns a node's children, or an empty array for nodes that have no
 * `childNodes` property.
 *
 * @param {object} node - Parsed node.
 * @returns {Array} The node's own `childNodes` value (not a copy), or a new
 *   empty array.
 */
function getChildNodes(node) {
  if (!("childNodes" in node)) {
    return [];
  }
  return node.childNodes;
}
1656
/**
 * Finds the first element, in document (pre-order) traversal, that satisfies
 * `predicate`. The walk uses an explicit stack rather than recursion.
 *
 * @param {object} root - Node to start from; it is tested as well.
 * @param {(element: object) => boolean} predicate - Called only for element nodes.
 * @returns {object | undefined} The matching element, or undefined when none matches.
 */
function findFirstElement(root, predicate) {
  const pending = [root];
  while (pending.length !== 0) {
    const node = pending.pop();
    if (isElementNode(node) && predicate(node)) {
      return node;
    }
    // Push children last-to-first so the first child is popped next.
    const children = getChildNodes(node);
    let index = children.length;
    while (index-- > 0) {
      pending.push(children[index]);
    }
  }
  return undefined;
}
1665
/**
 * Tells whether any direct child of `parentNode` is a text node with
 * non-whitespace content.
 *
 * @param {{childNodes: Array}} parentNode - Node whose direct children are checked.
 * @returns {boolean}
 */
function hasTopLevelText(parentNode) {
  return parentNode.childNodes.some((child) => {
    if (!isTextNode(child)) {
      return false;
    }
    return child.value.trim().length > 0;
  });
}
1668
/**
 * Decides whether a parsed element counts as recognized HTML.
 *
 * Wrapper elements (html/head/body) count only when the literal opening tag
 * appears in the input AND the element has source location info. All other
 * elements count when the tag is in VALID_HTML_ELEMENTS or contains a hyphen.
 *
 * @param {{tagName: string}} element - Parsed element.
 * @param {string} inputLowercase - The raw input, lower-cased.
 * @returns {boolean}
 */
function isUserProvidedElement(element, inputLowercase) {
  const tagName = element.tagName.toLowerCase();
  if (!isWrapperTagName(tagName)) {
    return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
  }
  const literalTagPresent = LITERAL_WRAPPER_PATTERNS[tagName].test(inputLowercase);
  return literalTagPresent && hasSourceCodeLocation(element);
}
1093
1673
  const HTML_PATTERNS = {
1094
1674
  openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
1095
1675
  closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
@@ -1245,37 +1825,21 @@ function validateHtml(htmlString) {
1245
1825
  isValid: false,
1246
1826
  reason: "Output appears to be XML, not HTML"
1247
1827
  };
1248
- try {
1249
- const { document } = new JSDOM(trimmed, { contentType: "text/html" }).window;
1250
- if (document.body && !trimmed.toLowerCase().includes("<body")) {
1251
- if (Array.from(document.body.childNodes).some((node) => node.nodeType === 3 && node.textContent?.trim())) return {
1252
- isValid: false,
1253
- reason: "Output must be wrapped in HTML tags"
1254
- };
1255
- }
1256
- const allElements = document.querySelectorAll("*");
1257
- if (!Array.from(allElements).find((element) => {
1258
- const tagName = element.tagName.toLowerCase();
1259
- if ([
1260
- "html",
1261
- "head",
1262
- "body"
1263
- ].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
1264
- return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
1265
- })) return {
1266
- isValid: false,
1267
- reason: "Output does not contain recognized HTML elements"
1268
- };
1269
- return {
1270
- isValid: true,
1271
- reason: "Output is valid HTML"
1272
- };
1273
- } catch (error) {
1274
- return {
1275
- isValid: false,
1276
- reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
1277
- };
1278
- }
1828
+ const document = parse$2(trimmed, { sourceCodeLocationInfo: true });
1829
+ const inputLowercase = trimmed.toLowerCase();
1830
+ const body = findFirstElement(document, (element) => element.tagName === "body");
1831
+ if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
1832
+ isValid: false,
1833
+ reason: "Output must be wrapped in HTML tags"
1834
+ };
1835
+ if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
1836
+ isValid: false,
1837
+ reason: "Output does not contain recognized HTML elements"
1838
+ };
1839
+ return {
1840
+ isValid: true,
1841
+ reason: "Output is valid HTML"
1842
+ };
1279
1843
  }
1280
1844
  const handleContainsHtml = ({ assertion, outputString, inverse }) => {
1281
1845
  const pass = containsHtml(outputString) !== inverse;
@@ -2380,11 +2944,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
2380
2944
  const rougeMethod = rouge[baseType[baseType.length - 1]];
2381
2945
  const score = rougeMethod(outputString, renderedValue, {});
2382
2946
  const threshold = assertion.threshold ?? .75;
2383
- const pass = score >= threshold != inverse;
2384
2947
  return {
2385
- pass,
2948
+ pass: score >= threshold !== inverse,
2386
2949
  score: inverse ? 1 - score : score,
2387
- reason: pass ? `${baseType.toUpperCase()} score ${score.toFixed(2)} is greater than or equal to threshold ${threshold}` : `${baseType.toUpperCase()} score ${score.toFixed(2)} is less than threshold ${threshold}`,
2950
+ reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
2388
2951
  assertion
2389
2952
  };
2390
2953
  }
@@ -2446,6 +3009,192 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
2446
3009
  }
2447
3010
  };
2448
3011
  //#endregion
3012
+ //#region src/providers/webSearchUtils.ts
3013
/**
 * Tells whether the provider's configured tools include one matching `predicate`.
 *
 * @param {{config?: {tools?: unknown}}} provider - Provider to inspect.
 * @param {(tool: any) => boolean} predicate - Test applied to each tool entry.
 * @returns {boolean} False when `config.tools` is missing or not an array.
 */
function hasTool(provider, predicate) {
  const tools = provider.config?.tools;
  if (!Array.isArray(tools)) {
    return false;
  }
  return tools.some(predicate);
}
3016
/**
 * Reads a provider's id without letting a broken `id()` implementation throw.
 *
 * @param {{id?: unknown}} provider - Provider to inspect.
 * @returns The value returned by `provider.id()`, or null when `id` is not a
 *   function or calling it throws (the error is logged at debug level).
 */
function getProviderId(provider) {
  if (typeof provider.id === "function") {
    try {
      return provider.id();
    } catch (err) {
      logger.debug(`Failed to read provider id: ${err}`);
    }
  }
  return null;
}
3025
/**
 * Tells whether a provider is an OpenAI Responses provider, either by its id
 * containing "openai:responses" or by its constructor being named
 * "OpenAiResponsesProvider".
 *
 * @param {object} provider - Provider instance.
 * @param {string} id - The provider's id.
 * @returns {boolean}
 */
function isOpenAiResponsesProvider(provider, id) {
  if (id.includes("openai:responses")) {
    return true;
  }
  return provider.constructor?.name === "OpenAiResponsesProvider";
}
3028
/**
 * Tells whether a provider is configured for web search.
 *
 * Rules, checked against the provider id and config:
 *  - id contains "perplexity": always.
 *  - id contains "google", "gemini" or "vertex": a tool with `googleSearch` set.
 *  - id contains "xai": `config.search_parameters.mode === "on"`.
 *  - OpenAI Responses provider: a tool of type "web_search_preview".
 *  - id starts with "openai:codex": `web_search_mode` of "live" or "cached",
 *    or `web_search_enabled === true`.
 *  - id contains "anthropic": a tool of type "web_search_20250305".
 *
 * @param provider The provider to check (may be null/undefined)
 * @returns true if the provider supports web search
 */
function hasWebSearchCapability(provider) {
  if (!provider) {
    return false;
  }
  const id = getProviderId(provider);
  if (!id) {
    return false;
  }

  const config = provider.config;
  const hasToolOfType = (type) => hasTool(provider, (t) => t.type === type);
  const isGoogleFamily = id.includes("google") || id.includes("gemini") || id.includes("vertex");

  return (
    id.includes("perplexity") ||
    (isGoogleFamily && hasTool(provider, (t) => t.googleSearch !== void 0)) ||
    (id.includes("xai") && config?.search_parameters?.mode === "on") ||
    (isOpenAiResponsesProvider(provider, id) && hasToolOfType("web_search_preview")) ||
    (id.startsWith("openai:codex") &&
      (config?.web_search_mode === "live" ||
        config?.web_search_mode === "cached" ||
        config?.web_search_enabled === true)) ||
    (id.includes("anthropic") && hasToolOfType("web_search_20250305"))
  );
}
3045
/**
 * Load a provider with web search capabilities.
 *
 * Candidates are tried one at a time, in order, until one loads AND passes
 * hasWebSearchCapability. A candidate that fails to load is logged at debug
 * level and skipped.
 *
 * Order: Anthropic and OpenAI first (which of the two leads depends on
 * `preferAnthropic`), then Perplexity, Google, Vertex and xAI.
 *
 * @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
 * @returns A provider with web search capabilities or null
 */
async function loadWebSearchProvider(preferAnthropic = false) {
  // Builds a loader that never throws: a load failure is logged with the given
  // message prefix and reported as null. `config` is omitted entirely for
  // providers that are loaded without options.
  const makeLoader = (providerPath, failureMessage, config) => async () => {
    try {
      if (config === void 0) {
        return await loadApiProvider(providerPath);
      }
      return await loadApiProvider(providerPath, { options: { config } });
    } catch (err) {
      logger.debug(`${failureMessage}: ${err}`);
      return null;
    }
  };

  const loadAnthropicWebSearch = makeLoader(
    "anthropic:messages:claude-opus-4-6",
    "Failed to load Anthropic web search provider",
    { tools: [{
      type: "web_search_20250305",
      name: "web_search",
      max_uses: 5
    }] }
  );
  const loadOpenAIWebSearch = makeLoader(
    "openai:responses:gpt-5.4-2026-03-05",
    "Failed to load OpenAI web search provider",
    { tools: [{ type: "web_search_preview" }] }
  );
  const loadPerplexity = makeLoader(
    "perplexity:sonar-pro",
    "Failed to load Perplexity provider"
  );
  const loadGoogleWebSearch = makeLoader(
    "google:gemini-3-pro-preview",
    "Failed to load Google web search provider",
    { tools: [{ googleSearch: {} }] }
  );
  const loadVertexWebSearch = makeLoader(
    "vertex:gemini-3-pro-preview",
    "Failed to load Vertex web search provider",
    { tools: [{ googleSearch: {} }] }
  );
  const loadXaiWebSearch = makeLoader(
    "xai:grok-4-1-fast-reasoning",
    "Failed to load xAI web search provider",
    { search_parameters: { mode: "on" } }
  );

  const leadingPair = preferAnthropic
    ? [loadAnthropicWebSearch, loadOpenAIWebSearch]
    : [loadOpenAIWebSearch, loadAnthropicWebSearch];
  const providers = [
    ...leadingPair,
    loadPerplexity,
    loadGoogleWebSearch,
    loadVertexWebSearch,
    loadXaiWebSearch
  ];

  for (const getProvider of providers) {
    const provider = await getProvider();
    if (provider && hasWebSearchCapability(provider)) {
      logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
      return provider;
    }
    if (provider) {
      logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
    }
  }
  return null;
}
3131
+ //#endregion
3132
+ //#region src/matchers/search.ts
3133
/**
 * Grades an output against a rubric using a provider that can search the web.
 *
 * Provider choice, in order:
 *  1. `grading.provider` if set, else the first defined default among
 *     webSearchProvider, llmRubricProvider, gradingProvider.
 *  2. If that one lacks web search, the first default that has it.
 *  3. If still none, whatever loadWebSearchProvider(true) can load.
 *
 * The response is parsed with extractFirstJsonObject. If that throws, the
 * result falls back to looking for `"pass":true` / `"pass": true` in the
 * lower-cased raw output.
 *
 * @param rubric - Rubric text passed to the prompt.
 * @param llmOutput - Output under test; run through tryParse for the prompt.
 * @param grading - Grading config (required).
 * @param vars - Extra template variables; spread last.
 * @param assertion - Originating assertion; `threshold`, when set, must also
 *   be met by the score for the result to pass.
 * @param _provider - Unused.
 * @param providerCallContext - Forwarded to callProviderWithContext.
 * @returns A grading result with `pass`, `score`, `reason`, `tokensUsed`, `assertion`.
 * @throws {Error} When `grading` is missing or no web-search-capable provider
 *   can be found.
 */
async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
  if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
  const defaultProviders = await getDefaultProviders();
  const defaultSearchProviders = [
    defaultProviders.webSearchProvider,
    defaultProviders.llmRubricProvider,
    defaultProviders.gradingProvider
  ];
  let searchProvider = (grading.provider ? await getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
  if (!hasWebSearchCapability(searchProvider)) {
    const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
    if (webSearchDefault) searchProvider = webSearchDefault;
  }
  if (!hasWebSearchCapability(searchProvider)) {
    const webSearchProvider = await loadWebSearchProvider(true);
    if (webSearchProvider) searchProvider = webSearchProvider;
  }
  if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);
  const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, DEFAULT_WEB_SEARCH_PROMPT), {
    output: tryParse(llmOutput),
    rubric,
    ...vars || {}
  });
  const resp = await callProviderWithContext(searchProvider, prompt, "search-rubric", {
    output: tryParse(llmOutput),
    rubric,
    ...vars || {}
  }, providerCallContext);
  if (resp.error || !resp.output) return {
    pass: false,
    score: 0,
    reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
    tokensUsed: resp.tokenUsage,
    assertion
  };
  try {
    const result = extractFirstJsonObject(String(resp.output));
    let pass = result.pass ?? false;
    // Without a numeric score from the provider, derive one from pass/fail.
    const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
    if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
    return {
      pass,
      score,
      reason: result.reason || "No reason provided",
      tokensUsed: resp.tokenUsage,
      assertion,
      metadata: {
        searchResults: result.searchResults || [],
        searchProvider: searchProvider.id()
      }
    };
  } catch (err) {
    // A thrown value is not guaranteed to be an Error; reading `.message`
    // unconditionally would log "undefined" for other thrown values.
    const parseFailure = err instanceof Error ? err.message : String(err);
    logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${parseFailure}`);
    const outputLower = String(resp.output).toLowerCase();
    const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
    return {
      pass,
      score: pass ? 1 : 0,
      reason: resp.output,
      tokensUsed: resp.tokenUsage,
      assertion
    };
  }
}
3197
+ //#endregion
2449
3198
  //#region src/assertions/searchRubric.ts
2450
3199
  async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
2451
3200
  if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
@@ -3531,7 +4280,7 @@ const ASSERTION_HANDLERS = {
3531
4280
  "llm-rubric": handleLlmRubric,
3532
4281
  meteor: async (params) => {
3533
4282
  try {
3534
- const { handleMeteorAssertion } = await import("./meteor-DuAFv6gF.js");
4283
+ const { handleMeteorAssertion } = await import("./meteor-BKTM-7KS.js");
3535
4284
  return handleMeteorAssertion(params);
3536
4285
  } catch (error) {
3537
4286
  if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
@@ -3667,7 +4416,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
3667
4416
  };
3668
4417
  }
3669
4418
  else if (filePath.endsWith(".rb")) try {
3670
- const { runRuby } = await import("./rubyUtils-DVLeA2jg.js").then((n) => n.t);
4419
+ const { runRuby } = await import("./rubyUtils-BI0p46eZ.js").then((n) => n.t);
3671
4420
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
3672
4421
  logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
3673
4422
  } catch (error) {
@@ -5465,7 +6214,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
5465
6214
  const defaultProvider = defaultTest.provider;
5466
6215
  if (isApiProvider(defaultProvider)) return defaultProvider;
5467
6216
  if (typeof defaultProvider === "object" && defaultProvider.id) {
5468
- const { loadApiProvider } = await import("./providers-u9Enmfok.js");
6217
+ const { loadApiProvider } = await import("./providers-CCE2COJi2.js");
5469
6218
  return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
5470
6219
  }
5471
6220
  return defaultProvider;
@@ -5625,7 +6374,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
5625
6374
  async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
5626
6375
  if (!state.resume || !evalRecord.persisted) return;
5627
6376
  try {
5628
- const { default: EvalResult } = await import("./evalResult-Bgm9ZH31.js").then((n) => n.n);
6377
+ const { default: EvalResult } = await import("./evalResult-pSvGWFMo.js").then((n) => n.n);
5629
6378
  const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: state.retryMode });
5630
6379
  const originalCount = runEvalOptions.length;
5631
6380
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -6084,9 +6833,8 @@ var Evaluator = class {
6084
6833
  context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
6085
6834
  }
6086
6835
  async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
6087
- let flushGroupedRows;
6088
6836
  try {
6089
- if (shouldGroupGradingByProvider) flushGroupedRows = await this.runGroupedEvalSteps({
6837
+ if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
6090
6838
  checkAbort,
6091
6839
  evalStepIndexMap,
6092
6840
  groupedRunEvalOptions,
@@ -6118,7 +6866,6 @@ var Evaluator = class {
6118
6866
  cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
6119
6867
  throw err;
6120
6868
  }
6121
- await flushGroupedRows?.();
6122
6869
  if (isEvalTimedOut()) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
6123
6870
  else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
6124
6871
  ciProgressReporter,
@@ -8418,47 +9165,11 @@ function filterPrompts(prompts, filterPromptsOption) {
8418
9165
  //#endregion
8419
9166
  //#region src/commands/eval/filterProviders.ts
8420
9167
  /**
8421
- * Checks if a value is a valid provider ID (non-empty string).
8422
- */
8423
- function isValidProviderId(id) {
8424
- return id !== null && id !== void 0 && typeof id === "string" && id !== "";
8425
- }
8426
- /**
8427
9168
  * Extracts the id and label from a raw provider config without instantiating it.
8428
9169
  * Handles all provider config formats: string, function, ProviderOptions, ProviderOptionsMap.
8429
9170
  */
8430
9171
  function getProviderIdAndLabel(provider, index) {
8431
- if (typeof provider === "string") return { id: provider };
8432
- if (typeof provider === "function") {
8433
- const label = provider.label;
8434
- return {
8435
- id: label ?? `custom-function-${index}`,
8436
- label
8437
- };
8438
- }
8439
- const providerId = provider.id;
8440
- if ("id" in provider && isValidProviderId(providerId)) return {
8441
- id: providerId,
8442
- label: provider.label
8443
- };
8444
- const keys = Object.keys(provider);
8445
- if (keys.length > 0) {
8446
- const id = keys[0];
8447
- const value = provider[id];
8448
- if (typeof value === "object" && value !== null) return {
8449
- id: value.id || id,
8450
- label: value.label
8451
- };
8452
- }
8453
- const label = provider.label;
8454
- if (isValidProviderId(label)) return {
8455
- id: label,
8456
- label
8457
- };
8458
- return {
8459
- id: `unknown-${index}`,
8460
- label
8461
- };
9172
+ return normalizeProviderRef(provider, { index });
8462
9173
  }
8463
9174
  /**
8464
9175
  * Filters raw provider configs BEFORE instantiation.
@@ -11163,9 +11874,10 @@ function dedupeTestCases(testCases) {
11163
11874
  return deduped;
11164
11875
  }
11165
11876
  function buildMaxCharsRetryInstructions(rejectedPromptLengths, limit) {
11877
+ const longestRejectedPromptText = rejectedPromptLengths.length > 0 ? `${Math.max(...rejectedPromptLengths)} characters` : "unknown length";
11166
11878
  return dedent`
11167
11879
  Your previous response included ${rejectedPromptLengths.length} generated prompt${rejectedPromptLengths.length === 1 ? "" : "s"} that exceeded the ${limit ?? "configured"}-character limit.
11168
- The longest rejected prompt was ${Math.max(...rejectedPromptLengths)} characters.
11880
+ The longest rejected prompt was ${longestRejectedPromptText}.
11169
11881
  Generate replacement prompts only, and keep every user message within the character limit.
11170
11882
  `.trim();
11171
11883
  }
@@ -13214,7 +13926,7 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
13214
13926
  for (const row of evaluateTable.body.slice(0, maxRows)) table.push([...row.vars.map((v) => ellipsize(v, tableCellMaxLength)), ...row.outputs.map(({ pass, text, failureReason: failureType }) => {
13215
13927
  text = ellipsize(text, tableCellMaxLength);
13216
13928
  if (pass) return chalk.green("[PASS] ") + text;
13217
- else return chalk.red(failureType === ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.red.bold(c) : c).join("---");
13929
+ return chalk.red(failureType === ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.red.bold(c) : c).join("---");
13218
13930
  })]);
13219
13931
  return table.toString();
13220
13932
  }
@@ -13305,6 +14017,115 @@ function formatDuration(seconds) {
13305
14017
  }
13306
14018
  //#endregion
13307
14019
  //#region src/commands/eval/summary.ts
14020
+ function getCompletionMessage({ completionType, evalId, shareableUrl, wasAborted, writeToDatabase, activelySharing }) {
14021
+ if (wasAborted) {
14022
+ const idSuffix = writeToDatabase ? ` (ID: ${chalk.cyan(evalId)})` : "";
14023
+ return `${chalk.red("✗")} ${completionType} aborted${idSuffix}`;
14024
+ }
14025
+ if (writeToDatabase && shareableUrl) return `${chalk.green("✓")} ${completionType} complete: ${shareableUrl}`;
14026
+ if (writeToDatabase && activelySharing) return `${chalk.green("✓")} ${completionType} complete`;
14027
+ if (writeToDatabase) return `${chalk.green("✓")} ${completionType} complete (ID: ${chalk.cyan(evalId)})`;
14028
+ return `${chalk.green("✓")} ${completionType} complete`;
14029
+ }
14030
+ function getAbortSummaryLines(targetErrorStatus) {
14031
+ if (targetErrorStatus == null) return [];
14032
+ return [
14033
+ "",
14034
+ chalk.red.bold("Scan stopped: Target is unavailable and will not recover on retry."),
14035
+ chalk.red(` Target returned HTTP ${targetErrorStatus}`),
14036
+ "",
14037
+ chalk.yellow("Possible causes:"),
14038
+ chalk.yellow(" • Invalid API key or authentication (401/403)"),
14039
+ chalk.yellow(" • Target endpoint does not exist (404)"),
14040
+ chalk.yellow(" • Server does not support the request (501)"),
14041
+ "",
14042
+ chalk.cyan("To fix: Check your target configuration and credentials.")
14043
+ ];
14044
+ }
14045
+ function getGuidanceLines({ writeToDatabase, shareableUrl, wantsToShare, activelySharing, hasExplicitDisable, cloudEnabled }) {
14046
+ if (!writeToDatabase || shareableUrl || wantsToShare || activelySharing) return [];
14047
+ const lines = ["", `» View results: ${chalk.green.bold("promptfoo view")}`];
14048
+ if (!hasExplicitDisable) lines.push(cloudEnabled ? `» Create shareable URL: ${chalk.green.bold("promptfoo share")}` : `» Share with your team: ${chalk.green.bold("https://promptfoo.app")}`);
14049
+ lines.push(`» Feedback: ${chalk.green.bold("https://promptfoo.dev/feedback")}`);
14050
+ return lines;
14051
+ }
14052
+ function buildUsageDetails(usage, total) {
14053
+ const parts = [];
14054
+ if (usage.prompt && usage.prompt > 0) parts.push(`${usage.prompt.toLocaleString()} prompt`);
14055
+ if (usage.completion && usage.completion > 0) parts.push(`${usage.completion.toLocaleString()} completion`);
14056
+ if (usage.cached && usage.cached > 0) parts.push(usage.cached === total && parts.length === 0 ? "cached" : `${usage.cached.toLocaleString()} cached`);
14057
+ if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) parts.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
14058
+ return parts;
14059
+ }
14060
+ function getTokenUsageLines(tokenUsage, isRedteam, tracker) {
14061
+ const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
14062
+ const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
14063
+ if (!hasEvalTokens && !hasGradingTokens) return [];
14064
+ const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
14065
+ const evalTokens = {
14066
+ prompt: tokenUsage.prompt || 0,
14067
+ completion: tokenUsage.completion || 0,
14068
+ total: tokenUsage.total || combinedTotal,
14069
+ cached: tokenUsage.cached || 0,
14070
+ numRequests: tokenUsage.numRequests || 0,
14071
+ completionDetails: tokenUsage.completionDetails || {
14072
+ reasoning: 0,
14073
+ acceptedPrediction: 0,
14074
+ rejectedPrediction: 0
14075
+ }
14076
+ };
14077
+ const lines = [`${chalk.bold("Total Tokens:")} ${chalk.white.bold((evalTokens.total + (tokenUsage.assertions?.total || 0)).toLocaleString())}`];
14078
+ if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.gray("Probes:")} ${chalk.white(tokenUsage.numRequests.toLocaleString())}`);
14079
+ if (evalTokens.total > 0) {
14080
+ const evalParts = buildUsageDetails(evalTokens, evalTokens.total);
14081
+ lines.push(` ${chalk.gray("Eval:")} ${chalk.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
14082
+ }
14083
+ if (tokenUsage.assertions?.total && tokenUsage.assertions.total > 0) {
14084
+ const gradingParts = buildUsageDetails(tokenUsage.assertions, tokenUsage.assertions.total);
14085
+ lines.push(` ${chalk.gray("Grading:")} ${chalk.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
14086
+ }
14087
+ lines.push(...getProviderUsageLines(tracker));
14088
+ return lines;
14089
+ }
14090
+ function getProviderUsageLines(tracker) {
14091
+ const providerIds = tracker.getProviderIds();
14092
+ if (providerIds.length <= 1) return [];
14093
+ const sortedProviders = providerIds.map((id) => ({
14094
+ id,
14095
+ usage: tracker.getProviderUsage(id)
14096
+ })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
14097
+ const lines = ["", chalk.bold("Providers:")];
14098
+ for (const { id, usage } of sortedProviders) {
14099
+ if ((usage.total || 0) === 0 && (usage.prompt || 0) + (usage.completion || 0) === 0) continue;
14100
+ const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
14101
+ const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
14102
+ const details = buildUsageDetails(usage, displayTotal);
14103
+ const requestInfo = `${usage.numRequests || 0} requests`;
14104
+ const separator = details.length > 0 ? "; " : "";
14105
+ lines.push(` ${chalk.gray(`${displayId}:`)} ${chalk.white(displayTotal.toLocaleString())} (${requestInfo}${separator}${details.join(", ")})`);
14106
+ }
14107
+ return lines;
14108
+ }
14109
+ function formatResultPercentage(count, totalTests) {
14110
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
14111
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
14112
+ }
14113
+ function formatResultLine(count, label, icon, iconColor, totalTests) {
14114
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count, totalTests)})`)}`;
14115
+ }
14116
+ function getResultsLines({ successes, failures, errors, duration, maxConcurrency }) {
14117
+ const totalTests = successes + failures + errors;
14118
+ const errorLabel = errors === 1 ? "error" : "errors";
14119
+ return [
14120
+ "",
14121
+ chalk.bold("Results:"),
14122
+ formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green, totalTests),
14123
+ formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red, totalTests),
14124
+ formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red, totalTests),
14125
+ chalk.gray(`Duration: ${formatDuration(duration)} (concurrency: ${maxConcurrency})`),
14126
+ ""
14127
+ ];
14128
+ }
13308
14129
  /**
13309
14130
  * Generate formatted evaluation summary output for CLI display.
13310
14131
  *
@@ -13343,115 +14164,28 @@ function formatDuration(seconds) {
13343
14164
  * ```
13344
14165
  */
13345
14166
  function generateEvalSummary(params) {
13346
- const { evalId, isRedteam, writeToDatabase, shareableUrl, wantsToShare, hasExplicitDisable, cloudEnabled, activelySharing = false, tokenUsage, successes, failures, errors, duration, maxConcurrency, tracker, targetErrorStatus } = params;
13347
- const lines = [];
13348
- const completionType = isRedteam ? "Red team" : "Eval";
13349
- const wasAborted = targetErrorStatus != null;
13350
- let completionMessage;
13351
- if (wasAborted) {
13352
- completionMessage = `${chalk.red("✗")} ${completionType} aborted`;
13353
- if (writeToDatabase) completionMessage += ` (ID: ${chalk.cyan(evalId)})`;
13354
- } else if (writeToDatabase && shareableUrl) completionMessage = `${chalk.green("✓")} ${completionType} complete: ${shareableUrl}`;
13355
- else if (writeToDatabase && activelySharing) completionMessage = `${chalk.green("✓")} ${completionType} complete`;
13356
- else if (writeToDatabase) completionMessage = `${chalk.green("✓")} ${completionType} complete (ID: ${chalk.cyan(evalId)})`;
13357
- else completionMessage = `${chalk.green("✓")} ${completionType} complete`;
13358
- lines.push(completionMessage);
13359
- if (wasAborted && targetErrorStatus != null) {
13360
- lines.push("");
13361
- lines.push(chalk.red.bold("Scan stopped: Target is unavailable and will not recover on retry."));
13362
- lines.push(chalk.red(` Target returned HTTP ${targetErrorStatus}`));
13363
- lines.push("");
13364
- lines.push(chalk.yellow("Possible causes:"));
13365
- lines.push(chalk.yellow(" • Invalid API key or authentication (401/403)"));
13366
- lines.push(chalk.yellow(" • Target endpoint does not exist (404)"));
13367
- lines.push(chalk.yellow(" • Server does not support the request (501)"));
13368
- lines.push("");
13369
- lines.push(chalk.cyan("To fix: Check your target configuration and credentials."));
13370
- }
13371
- if (writeToDatabase && !shareableUrl && !wantsToShare && !activelySharing) {
13372
- lines.push("");
13373
- lines.push(`» View results: ${chalk.green.bold("promptfoo view")}`);
13374
- if (!hasExplicitDisable) if (cloudEnabled) lines.push(`» Create shareable URL: ${chalk.green.bold("promptfoo share")}`);
13375
- else lines.push(`» Share with your team: ${chalk.green.bold("https://promptfoo.app")}`);
13376
- lines.push(`» Feedback: ${chalk.green.bold("https://promptfoo.dev/feedback")}`);
13377
- }
13378
- lines.push("");
13379
- const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
13380
- const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
13381
- if (hasEvalTokens || hasGradingTokens) {
13382
- const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
13383
- const evalTokens = {
13384
- prompt: tokenUsage.prompt || 0,
13385
- completion: tokenUsage.completion || 0,
13386
- total: tokenUsage.total || combinedTotal,
13387
- cached: tokenUsage.cached || 0,
13388
- completionDetails: tokenUsage.completionDetails || {
13389
- reasoning: 0,
13390
- acceptedPrediction: 0,
13391
- rejectedPrediction: 0
13392
- }
13393
- };
13394
- const grandTotal = evalTokens.total + (tokenUsage.assertions?.total || 0);
13395
- lines.push(`${chalk.bold("Total Tokens:")} ${chalk.white.bold(grandTotal.toLocaleString())}`);
13396
- if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.gray("Probes:")} ${chalk.white(tokenUsage.numRequests.toLocaleString())}`);
13397
- if (evalTokens.total > 0) {
13398
- const evalParts = [];
13399
- if (evalTokens.prompt > 0) evalParts.push(`${evalTokens.prompt.toLocaleString()} prompt`);
13400
- if (evalTokens.completion > 0) evalParts.push(`${evalTokens.completion.toLocaleString()} completion`);
13401
- if (evalTokens.cached > 0) if (evalTokens.cached === evalTokens.total && evalParts.length === 0) evalParts.push("cached");
13402
- else evalParts.push(`${evalTokens.cached.toLocaleString()} cached`);
13403
- if (evalTokens.completionDetails?.reasoning && evalTokens.completionDetails.reasoning > 0) evalParts.push(`${evalTokens.completionDetails.reasoning.toLocaleString()} reasoning`);
13404
- lines.push(` ${chalk.gray("Eval:")} ${chalk.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
13405
- }
13406
- if (tokenUsage.assertions && tokenUsage.assertions.total && tokenUsage.assertions.total > 0) {
13407
- const gradingParts = [];
13408
- if (tokenUsage.assertions.prompt && tokenUsage.assertions.prompt > 0) gradingParts.push(`${tokenUsage.assertions.prompt.toLocaleString()} prompt`);
13409
- if (tokenUsage.assertions.completion && tokenUsage.assertions.completion > 0) gradingParts.push(`${tokenUsage.assertions.completion.toLocaleString()} completion`);
13410
- if (tokenUsage.assertions.cached && tokenUsage.assertions.cached > 0) if (tokenUsage.assertions.cached === tokenUsage.assertions.total && gradingParts.length === 0) gradingParts.push("cached");
13411
- else gradingParts.push(`${tokenUsage.assertions.cached.toLocaleString()} cached`);
13412
- if (tokenUsage.assertions.completionDetails?.reasoning && tokenUsage.assertions.completionDetails.reasoning > 0) gradingParts.push(`${tokenUsage.assertions.completionDetails.reasoning.toLocaleString()} reasoning`);
13413
- lines.push(` ${chalk.gray("Grading:")} ${chalk.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
13414
- }
13415
- const providerIds = tracker.getProviderIds();
13416
- if (providerIds.length > 1) {
13417
- lines.push("");
13418
- lines.push(chalk.bold("Providers:"));
13419
- const sortedProviders = providerIds.map((id) => ({
13420
- id,
13421
- usage: tracker.getProviderUsage(id)
13422
- })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
13423
- for (const { id, usage } of sortedProviders) if ((usage.total || 0) > 0 || (usage.prompt || 0) + (usage.completion || 0) > 0) {
13424
- const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
13425
- const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
13426
- const details = [];
13427
- if (usage.prompt && usage.prompt > 0) details.push(`${usage.prompt.toLocaleString()} prompt`);
13428
- if (usage.completion && usage.completion > 0) details.push(`${usage.completion.toLocaleString()} completion`);
13429
- if (usage.cached && usage.cached > 0) if (usage.cached === displayTotal && details.length === 0) details.push("cached");
13430
- else details.push(`${usage.cached.toLocaleString()} cached`);
13431
- if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) details.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
13432
- const breakdown = ` (${`${usage.numRequests || 0} requests`}${details.length > 0 ? "; " : ""}${details.join(", ")})`;
13433
- lines.push(` ${chalk.gray(displayId + ":")} ${chalk.white(displayTotal.toLocaleString())}${breakdown}`);
13434
- }
13435
- }
13436
- }
13437
- lines.push("");
13438
- const totalTests = successes + failures + errors;
13439
- const formatResultPercentage = (count) => {
13440
- const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
13441
- return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
13442
- };
13443
- const formatResultLine = (count, label, icon, iconColor) => {
13444
- return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count)})`)}`;
13445
- };
13446
- const errorLabel = errors === 1 ? "error" : "errors";
13447
- lines.push(chalk.bold("Results:"));
13448
- lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green));
13449
- lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red));
13450
- lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red));
13451
- const durationDisplay = formatDuration(duration);
13452
- lines.push(chalk.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
13453
- lines.push("");
13454
- return lines;
14167
+ return [
14168
+ getCompletionMessage({
14169
+ completionType: params.isRedteam ? "Red team" : "Eval",
14170
+ evalId: params.evalId,
14171
+ shareableUrl: params.shareableUrl,
14172
+ wasAborted: params.targetErrorStatus != null,
14173
+ writeToDatabase: params.writeToDatabase,
14174
+ activelySharing: params.activelySharing ?? false
14175
+ }),
14176
+ ...getAbortSummaryLines(params.targetErrorStatus),
14177
+ ...getGuidanceLines({
14178
+ writeToDatabase: params.writeToDatabase,
14179
+ shareableUrl: params.shareableUrl,
14180
+ wantsToShare: params.wantsToShare,
14181
+ activelySharing: params.activelySharing ?? false,
14182
+ hasExplicitDisable: params.hasExplicitDisable,
14183
+ cloudEnabled: params.cloudEnabled
14184
+ }),
14185
+ "",
14186
+ ...getTokenUsageLines(params.tokenUsage, params.isRedteam, params.tracker),
14187
+ ...getResultsLines(params)
14188
+ ];
13455
14189
  }
13456
14190
  //#endregion
13457
14191
  //#region src/commands/retry.ts
@@ -14221,6 +14955,26 @@ async function doRedteamRun(options) {
14221
14955
  }
14222
14956
  //#endregion
14223
14957
  //#region src/index.ts
14958
+ /**
14959
+ * Shallow-clone a test case so the caller can swap in resolved ApiProvider
14960
+ * instances on `options.provider` / `assert[].provider` without leaking those
14961
+ * mutations back to the input. The input may alias the unified config written
14962
+ * to the Eval record, and a live SDK client (e.g. Bedrock's BedrockRuntime,
14963
+ * Anthropic's client) holds circular references that break drizzle's JSON
14964
+ * serialization on `evalRecord.save()`. Fixes #8687.
14965
+ *
14966
+ * Detaches only `options` and `assert[]`. Other reference fields (`provider`,
14967
+ * `vars`, `metadata`, `providerOutput`) remain aliased — callers must reassign
14968
+ * those by reference rather than mutating in place. `assert-set` children are
14969
+ * not deep-cloned because the resolve loop skips `assert-set`; if that ever
14970
+ * changes, extend this helper.
14971
+ */
14972
+ function cloneTestForResolve(test) {
14973
+ const cloned = { ...test };
14974
+ if (test.options) cloned.options = { ...test.options };
14975
+ if (test.assert) cloned.assert = test.assert.map((assertion) => ({ ...assertion }));
14976
+ return cloned;
14977
+ }
14224
14978
  async function evaluate(testSuite, options = {}) {
14225
14979
  if (testSuite.writeLatestResults) await runDbMigrations();
14226
14980
  const loadedProviders = await loadApiProviders(testSuite.providers, { env: testSuite.env });
@@ -14240,22 +14994,24 @@ async function evaluate(testSuite, options = {}) {
14240
14994
  nunjucksFilters: await readFilters(testSuite.nunjucksFilters || {}),
14241
14995
  prompts: await processPrompts(testSuite.prompts)
14242
14996
  };
14243
- if (typeof constructedTestSuite.defaultTest === "object") {
14244
- if (constructedTestSuite.defaultTest?.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
14997
+ if (typeof constructedTestSuite.defaultTest === "object" && constructedTestSuite.defaultTest) {
14998
+ constructedTestSuite.defaultTest = cloneTestForResolve(constructedTestSuite.defaultTest);
14999
+ if (constructedTestSuite.defaultTest.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
14245
15000
  env: testSuite.env,
14246
15001
  basePath: state.basePath
14247
15002
  });
14248
- if (constructedTestSuite.defaultTest?.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
15003
+ if (constructedTestSuite.defaultTest.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
14249
15004
  env: testSuite.env,
14250
15005
  basePath: state.basePath
14251
15006
  });
14252
15007
  }
14253
- for (const test of constructedTestSuite.tests || []) {
15008
+ constructedTestSuite.tests = (constructedTestSuite.tests || []).map(cloneTestForResolve);
15009
+ for (const test of constructedTestSuite.tests) {
14254
15010
  if (test.options?.provider && !isApiProvider(test.options.provider)) test.options.provider = await resolveProvider(test.options.provider, providerMap, {
14255
15011
  env: testSuite.env,
14256
15012
  basePath: state.basePath
14257
15013
  });
14258
- if (test.assert) for (const assertion of test.assert) {
15014
+ for (const assertion of test.assert || []) {
14259
15015
  if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
14260
15016
  if (assertion.provider && !isApiProvider(assertion.provider)) assertion.provider = await resolveProvider(assertion.provider, providerMap, {
14261
15017
  env: testSuite.env,