promptfoo 0.121.4 → 0.121.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (346)
  1. package/dist/src/{ListApp-DQkFNqE9.js → ListApp-BRUsT43Y.js} +1 -1
  2. package/dist/src/{accounts-Dy17bs4D.cjs → accounts-BIFntVWB.cjs} +4 -4
  3. package/dist/src/{accounts-F9d_5sMC.js → accounts-CLJHCDDb.js} +6 -6
  4. package/dist/src/{accounts-DhMYUUbu.js → accounts-CaLNYnf7.js} +4 -4
  5. package/dist/src/{accounts-DdJ2pHMI.js → accounts-bnyHT7Ju.js} +5 -5
  6. package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-B5krlibj.js} +3 -3
  7. package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-Ba67xmgs.js} +2 -2
  8. package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-BclbiXiq.js} +3 -3
  9. package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-D2x0wGhB.cjs} +2 -2
  10. package/dist/src/{agents-CgaMXvLM.js → agents-BGqaTDnr.js} +5 -5
  11. package/dist/src/{agents-8FDnTriG.js → agents-BV9yFpXX.js} +5 -5
  12. package/dist/src/{agents-aYPQLf8W.js → agents-BYdMl1UE.js} +4 -4
  13. package/dist/src/{agents-pQeBEXMm.js → agents-DhxWMCtH.js} +5 -5
  14. package/dist/src/{agents-D7-HGxUj.cjs → agents-DiWmQYH9.cjs} +4 -4
  15. package/dist/src/{agents-BahDpe5G.cjs → agents-WULPVjbH.cjs} +4 -4
  16. package/dist/src/{agents-DJ35I3Nt.js → agents-emVcx3yh.js} +5 -5
  17. package/dist/src/{agents-C-R_jfzI.js → agents-n6vPqV3i.js} +4 -4
  18. package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-BxqK9HF_.js} +7 -7
  19. package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-BzLjZI_m.cjs} +6 -6
  20. package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-DR4pgeiC.js} +6 -6
  21. package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-uPGp0Zdo.js} +7 -7
  22. package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
  23. package/dist/src/app/assets/Report-vjzrbgce.js +1 -0
  24. package/dist/src/app/assets/index-B3NQ8HTd.js +385 -0
  25. package/dist/src/app/assets/{index-BXGkeMwh.css → index-Cli2yAXv.css} +1 -1
  26. package/dist/src/app/index.html +27 -2
  27. package/dist/src/{audio-DcVKoInv.js → audio-BvpTOArF.js} +4 -4
  28. package/dist/src/{audio-BQtNuYBj.cjs → audio-C0vDeS0j.cjs} +3 -3
  29. package/dist/src/{audio-B7izf48x.js → audio-CScmnmEB.js} +4 -4
  30. package/dist/src/{audio-COrn8rM6.js → audio-Da8U9IS5.js} +3 -3
  31. package/dist/src/{base-fZ9wgg50.js → base-BOMaNEes.js} +3 -3
  32. package/dist/src/{base-PYJvBE1i.js → base-BTux96b1.js} +2 -2
  33. package/dist/src/{base-D-670DX8.cjs → base-Tw6uhH8K.cjs} +2 -2
  34. package/dist/src/{base-yrI1Yal4.js → base-dYsl2hmL.js} +3 -3
  35. package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B95F_7vE.cjs} +2 -2
  36. package/dist/src/{blobs-C-F78Kfn.js → blobs-BW4U31ue.js} +2 -2
  37. package/dist/src/{blobs-BCZavS8s.js → blobs-D_gg8nbm.js} +3 -3
  38. package/dist/src/{blobs-BQWqnnvL.js → blobs-DjLby-uP.js} +3 -3
  39. package/dist/src/{cache-mb7c8hbp.js → cache-BI5BY7ey.js} +4 -4
  40. package/dist/src/{cache-DbLsVWB2.cjs → cache-BRkhlH3k.cjs} +1 -1
  41. package/dist/src/cache-BlC6aeJ0.js +3 -0
  42. package/dist/src/{cache-D5NZmMiT.js → cache-Bzttsk0X.js} +2 -2
  43. package/dist/src/{cache-C4Xb-hNb.js → cache-Cr-qWIbP.js} +3 -3
  44. package/dist/src/{cache-BIyPcp5v.cjs → cache-DGg-yTZG.cjs} +2 -2
  45. package/dist/src/{chat-Dr3DUQ0D.js → chat-BLOdH60v.js} +12 -12
  46. package/dist/src/{chat-BfPaS15_.js → chat-Cx_LkwvZ.js} +12 -12
  47. package/dist/src/{chat-mW0ORo8G.js → chat-D9nudO9b.js} +4 -4
  48. package/dist/src/{chat-I9izLm49.js → chat-DChSH_Es.js} +12 -12
  49. package/dist/src/{chat-MKxMnZJZ.js → chat-DG2LkwLq.js} +2 -2
  50. package/dist/src/{chat-BPXSW8Bv.cjs → chat-DH97tVV9.cjs} +2 -2
  51. package/dist/src/{chat-0bwXjVP0.js → chat-aMQZw6R7.js} +4 -4
  52. package/dist/src/{chat-CclRbxGf.cjs → chat-vYqqv1gP.cjs} +11 -11
  53. package/dist/src/{chatkit-zUIVoDos.js → chatkit-B8X34dQc.js} +4 -4
  54. package/dist/src/{chatkit-Cv6AhukM.js → chatkit-BXu42Qwt.js} +3 -3
  55. package/dist/src/{chatkit-CJnHRRMM.js → chatkit-CbMRoeYw.js} +4 -4
  56. package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-D44VyUyB.cjs} +3 -3
  57. package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-BRq0bbIK.cjs} +8 -8
  58. package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-BjriSVRZ.js} +7 -7
  59. package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BzNZeZ0N.js} +7 -7
  60. package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-DYv_AJ8u.js} +7 -7
  61. package/dist/src/cloud-CoD5OacT.js +3 -0
  62. package/dist/src/{cloud-DQZ5sVjW.js → cloud-Da0bofJd.js} +3 -3
  63. package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-CXC4b1EU.js} +4 -4
  64. package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-CyBoIs1Q.js} +6 -6
  65. package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-DGOwgexC.js} +6 -6
  66. package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-DJv5qnyb.cjs} +4 -4
  67. package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-1sAoOyft.js} +5 -5
  68. package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-D-dnkzCF.js} +5 -5
  69. package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-DKVjkDav.cjs} +3 -3
  70. package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-TJkVrZlB.js} +3 -3
  71. package/dist/src/codex-app-server-CCLjqCh9.js +1915 -0
  72. package/dist/src/codex-app-server-CCe0TiDc.js +1915 -0
  73. package/dist/src/codex-app-server-CPW1LFwh.js +1916 -0
  74. package/dist/src/codex-app-server-VMRnjZ68.cjs +1920 -0
  75. package/dist/src/codex-sdk-1jm_qPHf.js +3 -0
  76. package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-Bd8UbO9q.cjs} +5 -5
  77. package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-BgEFQ70r.js} +6 -6
  78. package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-Bzb_TqX9.js} +6 -6
  79. package/dist/src/{codex-sdk-CWEnH70W.cjs → codex-sdk-Danroptg.cjs} +1 -1
  80. package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-DfvDTN33.js} +5 -5
  81. package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-B5ImDlSm.js} +8 -8
  82. package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-BgAkuYCw.cjs} +7 -7
  83. package/dist/src/{cometapi-BUlt_ELa.js → cometapi-CC7hWxmX.js} +8 -8
  84. package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CCbpHkuF.js} +7 -7
  85. package/dist/src/{completion-x0a_c2y1.js → completion-2iuYVxwi.js} +6 -6
  86. package/dist/src/{completion-Dnxn7E-j.js → completion-CrD6MQ93.js} +5 -5
  87. package/dist/src/{completion-BozdoXba.cjs → completion-DtQ72Bm3.cjs} +5 -5
  88. package/dist/src/{completion-HUe8wDhZ.js → completion-Vq_ad618.js} +6 -6
  89. package/dist/src/{createHash-ChI45QR1.js → createHash-DPpsZgFF.js} +1 -1
  90. package/dist/src/{createHash-CwDVU5xr.js → createHash-Un4Q_huE.js} +1 -1
  91. package/dist/src/{createHash-B7KvgoOD.cjs → createHash-VvBIc-AW.cjs} +1 -1
  92. package/dist/src/{docker-DCgsveLD.js → docker--3qzPa-6.js} +6 -6
  93. package/dist/src/{docker-DS4_Osau.cjs → docker-D3AY-5F5.cjs} +5 -5
  94. package/dist/src/{docker-CQmlA2NU.js → docker-DCsCDvwM.js} +6 -6
  95. package/dist/src/{docker-ClnmCf1Z.js → docker-Dorv4_Dg.js} +5 -5
  96. package/dist/src/{embedding-I45KG3o7.cjs → embedding-BXhN5lCH.cjs} +5 -5
  97. package/dist/src/{embedding-nFbumxcv.js → embedding-ChS1ivFS.js} +5 -5
  98. package/dist/src/{embedding-D3xTseo7.js → embedding-DNRvZwRN.js} +6 -6
  99. package/dist/src/{embedding-DD9wa3ae.js → embedding-D_bI4NDq.js} +6 -6
  100. package/dist/src/{errors-Cw810C93.js → errors-DFHe4L-n.js} +1 -1
  101. package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
  102. package/dist/src/{esm-C7PnfdF8.js → esm-BRkfNsYs.js} +1 -1
  103. package/dist/src/{esm-tVgYPY-f.js → esm-BX8fwlAO.js} +2 -2
  104. package/dist/src/{esm-CtEPLdAj.cjs → esm-B_rGuPTo.cjs} +1 -1
  105. package/dist/src/{eval-CzJFfFO9.js → eval-BQPLBJbw.js} +1 -1
  106. package/dist/src/{eval-u4UVafl6.js → eval-DJ_4A-tr.js} +14 -14
  107. package/dist/src/evalResult-BBJAHAtw.cjs +2 -0
  108. package/dist/src/evalResult-BBK58h2B.js +3 -0
  109. package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-Cx-8OWkb.cjs} +28 -10
  110. package/dist/src/{evalResult-D3hVYFis.js → evalResult-D6P5I5il.js} +29 -11
  111. package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-pSvGWFMo.js} +29 -11
  112. package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-D-UIbbYq.js} +845 -98
  113. package/dist/src/evaluator-DgLKaZk8.js +3 -0
  114. package/dist/src/{extractor-Dk6bRWkv.js → extractor-BM3jRERL.js} +5 -5
  115. package/dist/src/{extractor-WVPOrH43.cjs → extractor-Dxr2J_wK.cjs} +5 -5
  116. package/dist/src/{extractor-DNSeBVOJ.js → extractor-DxyiFhPk.js} +6 -6
  117. package/dist/src/{extractor-CAfTSraf.js → extractor-YlZbUMsL.js} +6 -6
  118. package/dist/src/fetch-8viavNv8.js +3 -0
  119. package/dist/src/{fetch-BEWnXrrG.js → fetch-B6ch2nU2.js} +9 -20
  120. package/dist/src/{fetch-Di00EQrc.js → fetch-D9xxyC1p.js} +221 -232
  121. package/dist/src/{fetch-CJU5ELPa.cjs → fetch-NuqXW1Xb.cjs} +221 -244
  122. package/dist/src/{fetch-B0Z3Oe4k.js → fetch-Y5qX_kST.js} +8 -19
  123. package/dist/src/{fileExtensions-BArZuxsI.js → fileExtensions-8CjoL7vB.js} +1 -1
  124. package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-BGh-W-HT.js} +1 -1
  125. package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-D9h-8Wxg.cjs} +1 -1
  126. package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-DysCsxNG.js} +1 -1
  127. package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-Ch4A7G3o.js} +1 -1
  128. package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-BokHC-MW.cjs} +1 -1
  129. package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-C3ZPQU60.js} +1 -1
  130. package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-CFny3gOy.js} +1 -1
  131. package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-DxODqT9e.js} +1 -1
  132. package/dist/src/{graders-Zy3x0zqX.js → graders-BoUqsCEm.js} +1303 -2044
  133. package/dist/src/{graders--zknU_uk.cjs → graders-Bw1wk_21.cjs} +1553 -2240
  134. package/dist/src/graders-C84JI-m5.js +2 -0
  135. package/dist/src/graders-CBbd0K0Q.cjs +2 -0
  136. package/dist/src/graders-CbQqpHSN.js +3 -0
  137. package/dist/src/{graders-eIHhRqoC.js → graders-CgPn32yp.js} +1300 -2041
  138. package/dist/src/{graders-pvbReLLn.js → graders-CwrbifOo.js} +747 -1488
  139. package/dist/src/graders-DS42d3ZG.js +2 -0
  140. package/dist/src/{image-9302QVqR.js → image-BeWaInPF.js} +3 -3
  141. package/dist/src/{image-DVz2RiMF.js → image-BmilRNqO.js} +7 -7
  142. package/dist/src/{image-x6KqLQl4.cjs → image-CxJoa3aW.cjs} +6 -6
  143. package/dist/src/{image-De2FBmYV.cjs → image-D10dNAav.cjs} +3 -3
  144. package/dist/src/{image-dnoUgPrC.js → image-Dr_3I3nK.js} +4 -4
  145. package/dist/src/{image-B5Mv-Z3h.js → image-DsGRlkh7.js} +7 -7
  146. package/dist/src/{image-qUpPvmNZ.js → image-a_SGUobh.js} +6 -6
  147. package/dist/src/{image-u7-rKnYU.js → image-qjO6FWPs.js} +4 -4
  148. package/dist/src/index.cjs +1052 -296
  149. package/dist/src/index.d.cts +124 -13
  150. package/dist/src/index.d.ts +125 -14
  151. package/dist/src/index.js +1018 -262
  152. package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-CCICw2cy.js} +2 -2
  153. package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
  154. package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
  155. package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-BBETc5-S.js} +6 -6
  156. package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-C8qOo26M.cjs} +5 -5
  157. package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CzAi2rUI.js} +6 -6
  158. package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-Dr3Kib7F.js} +5 -5
  159. package/dist/src/{litellm-C2kqjxqp.js → litellm-BLSiANhk.js} +5 -5
  160. package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CaUmV7Mk.cjs} +4 -4
  161. package/dist/src/{litellm-p37R1dzQ.js → litellm-DQGo_juI.js} +4 -4
  162. package/dist/src/{litellm-DRjpcSa7.js → litellm-DRc4qWfc.js} +5 -5
  163. package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
  164. package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
  165. package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-B-tNZzqW.js} +6 -6
  166. package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CtS3OlGq.js} +5 -5
  167. package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-PJJgUjOc.js} +6 -6
  168. package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-if-Ml4R9.cjs} +5 -5
  169. package/dist/src/main.js +242 -198
  170. package/dist/src/{messages-zWbkLLHz.js → messages-B9dSjrNf.js} +264 -16
  171. package/dist/src/{messages-811uVVW5.cjs → messages-BnsVHUnm.cjs} +266 -15
  172. package/dist/src/{messages-MYTQ2TWp.js → messages-CI69Lasb.js} +264 -16
  173. package/dist/src/{messages-BTQz42fn.js → messages-CewuNcNS.js} +264 -16
  174. package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-BBGcGeCa.cjs} +1 -1
  175. package/dist/src/{meteor-DuAFv6gF.js → meteor-BKTM-7KS.js} +1 -1
  176. package/dist/src/{meteor-DHdzY1Ss.js → meteor-CeGo0Lu2.js} +2 -2
  177. package/dist/src/{meteor-CU5UAE-H.js → meteor-Wc_aUVvu.js} +2 -2
  178. package/dist/src/{modelslab-wu9yi5GE.js → modelslab-BCLOtfek.js} +7 -7
  179. package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-BkapYJhh.cjs} +6 -6
  180. package/dist/src/{modelslab-DIq-6y7x.js → modelslab-D73OnKSx.js} +6 -6
  181. package/dist/src/{modelslab-D0erNWKe.js → modelslab-zpz9JcK0.js} +7 -7
  182. package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-B8F_TK5w.js} +6 -6
  183. package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-Bx0NFV2f.js} +5 -5
  184. package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-CNGJTLtG.js} +6 -6
  185. package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-DkT7tnoB.cjs} +5 -5
  186. package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BaXRN1cr.js} +4 -4
  187. package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-BeTRaFOh.js} +4 -4
  188. package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CL7Zqv0G.js} +3 -3
  189. package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-YT426juD.cjs} +3 -3
  190. package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
  191. package/dist/src/{openai-Qsvz25mV.js → openai-BT-JvDse.js} +2 -2
  192. package/dist/src/{openai-URNyItar.cjs → openai-Cy1XLs0c.cjs} +1 -1
  193. package/dist/src/{openai-iYtrXzOX.js → openai-D4fxGvRx.js} +1 -1
  194. package/dist/src/{openclaw-CwzlQSQX.js → openclaw-Bq7RVR3k.js} +7 -6
  195. package/dist/src/{openclaw-CLWrW03k.js → openclaw-DA8U4DsD.js} +8 -7
  196. package/dist/src/{openclaw-CnQ363Wi.js → openclaw-DObVgpjC.js} +8 -7
  197. package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-DUBZP3GL.cjs} +8 -7
  198. package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-BB40Wir1.js} +4 -4
  199. package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-BM1UAIv1.js} +3 -3
  200. package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-CeqiOcOU.cjs} +4 -4
  201. package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-ChdK7F7z.js} +4 -4
  202. package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-C6thJRXi.js} +4 -4
  203. package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-CcdIikOu.js} +3 -3
  204. package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-DNSQj6bf.cjs} +3 -3
  205. package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver-UYMQx3sy.js} +4 -4
  206. package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
  207. package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-BESeALrr.cjs} +1 -1
  208. package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DoACwqhD.js} +1 -1
  209. package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-PMsleEzs.js} +2 -2
  210. package/dist/src/{providers-Bp4S-FvO.js → providers-BuyzKt7C.js} +1 -1
  211. package/dist/src/{providers-DV3ax9e_.cjs → providers-C7lNVBjX.cjs} +1 -1
  212. package/dist/src/{providers-u9Enmfok.js → providers-CCE2COJi2.js} +1 -1
  213. package/dist/src/{providers-DruaQfwu.js → providers-CJh7iriU.js} +18103 -17952
  214. package/dist/src/{providers-iUt5fbAN.js → providers-Ctcc592x.js} +1 -1
  215. package/dist/src/{providers-Domz_llv.js → providers-DRrerKra.js} +432 -281
  216. package/dist/src/{providers-BV_KMZje.js → providers-DT-GtF2t.js} +19094 -18943
  217. package/dist/src/{providers-1eKkXBKp.cjs → providers-eDShy16E.cjs} +17946 -17795
  218. package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-C4tltmIn.js} +3 -3
  219. package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-CoLaCwNY.cjs} +3 -3
  220. package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-DMO68Jg7.js} +3 -3
  221. package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-DNqbnRdx.js} +3 -3
  222. package/dist/src/{quiverai-DR0SnIQV.js → quiverai-BSS9a7wV.js} +3 -3
  223. package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Bk1KrvL6.js} +4 -4
  224. package/dist/src/{quiverai-DFotyafY.cjs → quiverai-Bpx6MZ7T.cjs} +3 -3
  225. package/dist/src/{quiverai-aPPvXOgn.js → quiverai-CPKhWgaT.js} +4 -4
  226. package/dist/src/{render-DHIZ6_k8.js → render-7uNJ2V14.js} +2 -2
  227. package/dist/src/{render-CH-62LbA.js → render-DlscvAUJ.js} +1 -1
  228. package/dist/src/{render-CMEpfLaO.js → render-eui5p5mL.js} +2 -2
  229. package/dist/src/{render-CgVDrJmM.js → render-nj-UaPdn.js} +2 -2
  230. package/dist/src/{render-DfQSFxGE.cjs → render-tG6ir9_g.cjs} +1 -1
  231. package/dist/src/{responses--OsX2aYW.js → responses-1ztiVYsx.js} +49 -15
  232. package/dist/src/{responses-DL9m8CyY.js → responses-B8haB-mD.js} +49 -15
  233. package/dist/src/{responses-C-flexAY.js → responses-BiaBguAu.js} +49 -15
  234. package/dist/src/{responses-Bi9vBuW_.cjs → responses-CF-ayauu.cjs} +48 -14
  235. package/dist/src/rubyUtils-4hjGxvju.js +3 -0
  236. package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-BI0p46eZ.js} +3 -3
  237. package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CIQFnVz4.js} +3 -3
  238. package/dist/src/rubyUtils-CO-tuszQ.cjs +2 -0
  239. package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DGnoCYL2.js} +3 -3
  240. package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-DoifqkiA.cjs} +4 -3
  241. package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-BDLeW29y.js} +12 -12
  242. package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-C5T60MKf.js} +13 -13
  243. package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-ClS_NB07.js} +13 -13
  244. package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-ljtY12VM.cjs} +12 -12
  245. package/dist/src/{scanner-1DqWi1Ej.js → scanner-nOCWNIXa.js} +7 -7
  246. package/dist/src/server/index.js +1067 -265
  247. package/dist/src/{server-Dx2TyCH2.cjs → server-BEECpeGG.cjs} +5 -5
  248. package/dist/src/{server-BNYztJkh.js → server-ByiF3qlg.js} +9 -8
  249. package/dist/src/{server-BSB45Nt9.js → server-ByxbqAcQ.js} +8 -7
  250. package/dist/src/{server-DaA2eR26.cjs → server-C0XKRNB_.cjs} +1 -1
  251. package/dist/src/server-C_15p79-.js +3 -0
  252. package/dist/src/{server-D6Il2Sob.js → server-gyd6d4Hc.js} +5 -5
  253. package/dist/src/{signal-CE5G3a7x.js → signal-DTtUuU3l.js} +3 -3
  254. package/dist/src/{slack-acRb0IqQ.js → slack-4zZX1OKP.js} +1 -1
  255. package/dist/src/{slack-1Rhq0EoV.cjs → slack-BLlsDpfG.cjs} +1 -1
  256. package/dist/src/{slack-D5Wpy8LM.js → slack-BPYLQLgb.js} +2 -2
  257. package/dist/src/{slack-DDUe-5MC.js → slack-Bamy_7te.js} +2 -2
  258. package/dist/src/{store-DAAyxcy6.cjs → store-2K0kDi80.cjs} +2 -2
  259. package/dist/src/{store-Dn9HUkdW.js → store-2OXm_eBY.js} +3 -3
  260. package/dist/src/store-BELqNwvz.js +3 -0
  261. package/dist/src/{store-M0b1WfYb.js → store-BPkzEyFM.js} +2 -2
  262. package/dist/src/{store-CYEy5J2D.js → store-CPh25336.js} +3 -3
  263. package/dist/src/store-uQZ4AjPe.cjs +2 -0
  264. package/dist/src/{tables-CsWou1Bx.js → tables-BMSOS2Gg.js} +3 -3
  265. package/dist/src/{tables-DUfh1F7Z.cjs → tables-CXbaZ9y1.cjs} +2 -2
  266. package/dist/src/{tables-C4CH3zRr.js → tables-NlvH23ky.js} +3 -3
  267. package/dist/src/{tables-DQ4WU5tX.js → tables-WgdUZ8Ck.js} +2 -2
  268. package/dist/src/{telemetry-dbaJ0E98.js → telemetry--iqaGyaS.js} +5 -4
  269. package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CEQxGnMZ.cjs} +7 -6
  270. package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-CgdVGV8N.js} +4 -3
  271. package/dist/src/{telemetry-CQPez_Jp.js → telemetry-DWdGHvEf.js} +5 -4
  272. package/dist/src/telemetry-DjNoC_n3.cjs +2 -0
  273. package/dist/src/telemetry-ZdPZc0fm.js +3 -0
  274. package/dist/src/{text-BVi-cLPJ.cjs → text-BiNME7QG.cjs} +1 -1
  275. package/dist/src/{text-KvuD2Iko.js → text-D4lz-Jg_.js} +1 -1
  276. package/dist/src/{text-DHxdyQqT.js → text-DDQP0tuQ.js} +1 -1
  277. package/dist/src/{text-CZr46tp_.js → text-NWvfMfkF.js} +1 -1
  278. package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-2wIvAhB3.js} +1 -1
  279. package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-4c780gFd.js} +1 -1
  280. package/dist/src/tokenUsageUtils-BjVkdk18.js +142 -0
  281. package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-C9odhsbW.cjs} +1 -1
  282. package/dist/src/{transcription-DuWDupG7.js → transcription-84t4ALo2.js} +5 -5
  283. package/dist/src/{transcription-CJspiD2c.js → transcription-Bm2emLmJ.js} +6 -6
  284. package/dist/src/{transcription-BvjmiYB1.cjs → transcription-CZ4LG5hQ.cjs} +5 -5
  285. package/dist/src/{transcription-V2HaAmy2.js → transcription-D7Q0vJsh.js} +6 -6
  286. package/dist/src/{transform-zDhMmzwX.js → transform-B-b6Cq-q.js} +5 -5
  287. package/dist/src/transform-BQt0BeAW.js +3 -0
  288. package/dist/src/{transform-DgKlRr73.cjs → transform-Bq5oqC0s.cjs} +1 -1
  289. package/dist/src/{transform-CUnzlsbn.cjs → transform-C9izGX54.cjs} +4 -4
  290. package/dist/src/{transform-DYX1_Xnh.js → transform-CwbAZ84V.js} +5 -5
  291. package/dist/src/{transform-CTeuTR3S.cjs → transform-Dg4LcO1Y.cjs} +6 -6
  292. package/dist/src/{transform-CG0ehZNG.js → transform-DtooZqYY.js} +6 -6
  293. package/dist/src/{transform-UN5UGu8U.js → transform-DzCF-wqV.js} +5 -5
  294. package/dist/src/{transform-lQrDE1BQ.js → transform-_DpNB4qp.js} +5 -5
  295. package/dist/src/{transform-Bbg6A8Jk.js → transform-eGiUAv86.js} +4 -4
  296. package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-B22swDxr.cjs} +1 -1
  297. package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-lvCCvuPT.js} +1 -1
  298. package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-rJGPccjr.js} +1 -1
  299. package/dist/src/{types-Bgh5SOn6.js → types-BDjGOq4E.js} +4 -2
  300. package/dist/src/{types-Dm9JM6Vb.js → types-BVH9hjgW.js} +4 -2
  301. package/dist/src/{types-CeaeaZdP.cjs → types-CgG2rKiW.cjs} +151 -149
  302. package/dist/src/{types-BGQDAP8i.js → types-DNRZVOue.js} +152 -150
  303. package/dist/src/{util-C8e5uydV.js → util-3pBZZb_H.js} +142 -17
  304. package/dist/src/{util-CN3SrLT4.cjs → util-A5_ZsQUn.cjs} +65 -43
  305. package/dist/src/{util-D3q0WQ-0.js → util-B9CNhyac.js} +66 -44
  306. package/dist/src/{util-DxWpWjhc.js → util-BQOCAHQC.js} +700 -575
  307. package/dist/src/{util-BYvQUPp7.js → util-BVXcTwXu.js} +3 -3
  308. package/dist/src/{util-D9TisOyk.js → util-BlFVL0UF.js} +65 -43
  309. package/dist/src/{util-C9J8ahRn.js → util-C-kmRosx.js} +66 -44
  310. package/dist/src/{util-DvU2Pw8c.js → util-DFPeFkiV.js} +3 -3
  311. package/dist/src/{util-DDs-7g6-.js → util-DN0-b81k.js} +3 -3
  312. package/dist/src/{util-olYL5C6N.cjs → util-Dpmm_dAI.cjs} +3 -3
  313. package/dist/src/{util-oGMLA7vc.js → util-Dub0f_ej.js} +700 -575
  314. package/dist/src/{util-Bxn8emtE.cjs → util-DvpHnLt0.cjs} +718 -570
  315. package/dist/src/{utils-DJfvjyMj.js → utils-BUMN8orw.js} +3 -3
  316. package/dist/src/{utils-B05gLxER.cjs → utils-DkVeShIB.cjs} +2 -2
  317. package/dist/src/{utils-BLJKfv0y.js → utils-kt7lv30R.js} +3 -3
  318. package/dist/src/{utils-hXtCYanr.js → utils-o8S5huU2.js} +2 -2
  319. package/dist/src/version-0frU0UTr.js +16 -0
  320. package/dist/src/version-CbpiUINz.js +17 -0
  321. package/dist/src/version-CbuBKu2U.js +16 -0
  322. package/dist/src/version-D9zu9FWB.cjs +27 -0
  323. package/dist/tsconfig.tsbuildinfo +1 -1
  324. package/package.json +22 -20
  325. package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
  326. package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
  327. package/dist/src/cache-Cr9oLMUa.js +0 -3
  328. package/dist/src/cloud-Hphvo8kr.js +0 -3
  329. package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
  330. package/dist/src/evalResult-D8MT9p0s.js +0 -3
  331. package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
  332. package/dist/src/evaluator-CVessDWe.js +0 -3
  333. package/dist/src/fetch-C7bGKDlQ.js +0 -3
  334. package/dist/src/graders-BOAzQEUe.cjs +0 -2
  335. package/dist/src/graders-D4BTsZdG2.js +0 -3
  336. package/dist/src/graders-DOJK1XpV.js +0 -2
  337. package/dist/src/graders-NAv9LcBn.js +0 -2
  338. package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
  339. package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
  340. package/dist/src/server-DCtHUqlp.js +0 -3
  341. package/dist/src/store-CWOSz6D_.cjs +0 -2
  342. package/dist/src/store-DCDBhv7B.js +0 -3
  343. package/dist/src/telemetry-C1IqxcdW.js +0 -3
  344. package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
  345. package/dist/src/transform-M6ITAESf.js +0 -3
  346. package/dist/src/{evalResult-DElBuddX.js → evalResult-spPqh1G_.js} +0 -0
@@ -1,32 +1,33 @@
1
- import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, f as sanitizeObject, g as getAjv, h as extractJsonObjects, k as state, m as extractFirstJsonObject, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "../logger-B88EkIn6.js";
2
- import { A as TERMINAL_MAX_WIDTH, I as VERSION, L as FILE_METADATA_KEY, M as getDefaultShareViewBaseUrl, N as getShareApiBaseUrl, P as getShareViewBaseUrl, R as HUMAN_ASSERTION_TYPE, S as parseChatPrompt, a as CloudConfig, d as sleep, h as REQUEST_TIMEOUT_MS, j as getDefaultPort, n as fetchWithRetries, o as cloudConfig, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp, y as isPromptfooSampleTarget } from "../fetch-B0Z3Oe4k.js";
3
- import { t as invariant } from "../invariant-vgHWClmd.js";
4
- import { a as getAuthor, c as isLoggedIntoCloud, l as promptForEmailUnverified, n as checkEmailStatusAndMaybeExit, o as getUserEmail, r as clearUserEmail, s as getUserId, t as checkEmailStatus, u as setUserEmail } from "../accounts-DdJ2pHMI.js";
5
- import { r as importModule, t as getDirectory } from "../esm-Dh4dOLlt.js";
6
- import { a as getNunjucksEngine, i as extractVariablesFromTemplates, r as extractVariablesFromTemplate, t as renderEnvOnlyInObject } from "../render-DHIZ6_k8.js";
7
- import { t as providerRegistry } from "../providerRegistry-CPQ_CmVO.js";
8
- import { a as openBrowser, c as getRemoteGenerationUrl, d as neverGenerateRemote, i as checkServerRunning, n as BrowserBehaviorNames, p as shouldGenerateRemote, s as promptYesNo, t as BrowserBehavior, u as getRemoteHealthUrl } from "../server-BSB45Nt9.js";
9
- import { a as evalResultsTable, c as evalsToPromptsTable, d as promptsTable, g as getDbSignalPath, h as getDb, i as datasetsTable, l as evalsToTagsTable, n as blobReferencesTable, o as evalsTable, p as tagsTable, r as configsTable, s as evalsToDatasetsTable, t as blobAssetsTable, u as modelAuditsTable } from "../tables-CsWou1Bx.js";
10
- import { $ as MULTI_INPUT_EXCLUDED_PLUGINS, A as STRATEGY_COLLECTIONS, B as ALIASED_PLUGIN_MAPPINGS, E as ALL_STRATEGIES, F as isMultiTurnStrategy, G as DEFAULT_PLUGINS, H as BIAS_PLUGINS, I as Severity, J as HARM_PLUGINS, K as FINANCIAL_PLUGINS, L as categoryAliases, M as getDefaultNFanout, O as DEFAULT_STRATEGIES, P as isFanoutStrategy, Q as MEDICAL_PLUGINS, S as StrategyConfigSchema, U as CANARY_BREAKING_STRATEGY_IDS, V as ALL_PLUGINS, W as DATASET_EXEMPT_PLUGINS, X as LLAMA_GUARD_ENABLED_CATEGORIES, Y as INSURANCE_PLUGINS, Z as LLAMA_GUARD_REPLICATE_PROVIDER, _ as ProvidersSchema, a as EvaluateOptionsSchema, at as REDTEAM_PROVIDER_HARM_PLUGINS, b as PluginConfigSchema, c as TestSuiteConfigSchema, ct as TEEN_SAFETY_PLUGINS, d as isGradingResult, dt as CODING_AGENT_CORE_PLUGINS, et as MULTI_INPUT_VAR, f as isResultFailureReason, g as ProviderOptionsSchema, h as RedteamConfigSchema, ht as PromptSchema, i as EvalResultsFilterMode, it as REDTEAM_MODEL, j as STRATEGY_COLLECTION_MAPPINGS, l as TestSuiteSchema, lt as TELECOM_PLUGINS, m as isProviderOptions, n as BaseAssertionTypesSchema, nt as PII_PLUGINS, ot as REMOTE_ONLY_PLUGIN_IDS, p as isApiProvider, q as FOUNDATION_PLUGINS, r as CommandLineOptionsSchema, rt as PLUGIN_CATEGORIES, s as ResultFailureReason, t as AssertionOrSetSchema, tt as PHARMACY_PLUGINS, u as UnifiedConfigSchema, ut as UNALIGNED_PROVIDER_HARM_PLUGINS, v as ConversationMessageSchema, w as isUuid, y as PartialGenerationError, z as riskCategorySeverityMap } from "../types-Bgh5SOn6.js";
11
- import { i as isJavascriptFile } from "../fileExtensions-BArZuxsI.js";
12
- import { n as sha256, t as randomSequence } from "../createHash-CwDVU5xr.js";
13
- import { a as generateIdFromPrompt, t as hashPrompt } from "../utils-DJfvjyMj.js";
14
- import { t as getTraceStore } from "../store-Dn9HUkdW.js";
15
- import { c as isNonTransientHttpStatus, i as getCache, n as disableCache, o as withCacheNamespace, r as fetchWithCache, s as NON_TRANSIENT_HTTP_STATUSES, t as cache_exports } from "../cache-C4Xb-hNb.js";
16
- import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "../tokenUsageUtils-C-bmyHoE.js";
17
- import { n as getBlobUrl, t as getBlobByHash } from "../blobs-BCZavS8s.js";
18
- import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "../extractor-CAfTSraf.js";
19
- import { $ as AIStudioChatProvider, D as getShortPluginId, E as getSessionId, F as loadFromPackage, G as throwIfTargetPromptExceedsMaxChars, H as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY, I as redteamProviderManager, L as TokenUsageTracker, M as renderPrompt, N as runExtensionHook, O as isBasicRefusal, P as isPackagePath, Q as VertexChatProvider, R as createRateLimitRegistry, S as extractGoalFromPrompt, T as extractVariablesFromJson, U as getGeneratedPromptOverLimit, V as PromptfooHarmfulCompletionProvider, W as getMaxCharsPerMessageModifierValue, _ as mediaExists, a as resolveProviderConfigs, b as checkExfilTracking, c as MCPProvider, ct as checkCloudPermissions, d as createTransformResponse, dt as getOrgContext, f as GoogleLiveProvider, ft as getPluginSeverityOverridesFromCloud, g as getMediaStorage, h as validateStrategies, ht as resolveTeamId, i as resolveProvider, j as collectFileMetadata, l as HttpProvider, lt as getCloudDatabaseId, m as loadStrategy, mt as isCloudProvider, n as loadApiProvider, p as Strategies, r as loadApiProviders, t as getProviderIds, u as createTransformRequest, ut as getEvalConfigFromCloud, v as retrieveMedia, w as extractPromptFromTags, y as pluginMatchesStrategyTargets, z as createProviderRateLimitOptions } from "../providers-Domz_llv.js";
20
- import { n as telemetry, t as TelemetryEventSchema } from "../telemetry-dbaJ0E98.js";
21
- import { r as runPython } from "../pythonUtils-Cldx7huE.js";
22
- import { A as readFilters, M as loadFunction, N as parseFileUrl, O as maybeLoadToolsFromExternalFile, T as maybeLoadFromExternalFile, _ as getProviderDescription, a as evalTableToJson, b as isOpenAiProvider, c as fetchCsvFromGoogleSheet, d as extractRuntimeVars, f as filterRuntimeVars, g as doesProviderRefMatch, h as checkProviderApiKeys, i as ComparisonEvalNotFoundError, j as readOutput, l as setupEnv, m as resultIsForTestCase, n as writeMultipleOutputs, o as generateEvalCsv, p as getTestCaseDeduplicationKey, r as writeOutput, s as mergeComparisonTables, t as printBorder, u as deduplicateTestCases, v as isAnthropicProvider, w as maybeLoadConfigFromExternalFile, x as isProviderAllowed, y as isGoogleProvider } from "../util-C8e5uydV.js";
23
- import { t as OpenAiChatCompletionProvider } from "../chat-BfPaS15_.js";
24
- import { h as validateFunctionCall } from "../transform-lQrDE1BQ.js";
25
- import { $ as matchesPiScore, A as DivergentRepetitionPlugin, B as fail, C as getPiiLeakTestsForCategory, D as HarmbenchPlugin, E as ImitationPlugin, F as AegisPlugin, G as matchesClosedQa, H as loadRubricPrompt, I as RedteamGraderBase, J as matchesContextRelevance, K as matchesContextFaithfulness, L as RedteamPluginBase, M as CrossSessionLeakPlugin, N as ContractPlugin, O as HallucinationPlugin, P as BeavertailsPlugin, Q as matchesModeration, R as fetchHuggingFaceDataset, S as PlinyPlugin, T as IntentPlugin, U as matchesAnswerRelevance, V as getAndCheckProvider, W as matchesClassification, X as matchesGEval, Y as matchesFactuality, Z as matchesLlmRubric, _ as PoliticsPlugin, _t as retryWithDeduplication, a as UnverifiableClaimsPlugin, at as withProviderCallExecutionContext, b as isValidPolicyObject, c as ToolDiscoveryPlugin, ct as readPrompts, d as TeenSafetyDangerousContentPlugin, dt as coerceString, et as matchesSearchRubric, f as TeenSafetyAgeRestrictedGoodsAndServicesPlugin, ft as getFinalTest, g as PromptExtractionPlugin, gt as getCustomPolicies, h as RbacPlugin, ht as resolveContext, i as VLGuardPlugin, it as selectMaxScore, j as DebugAccessPlugin, k as ExcessiveAgencyPlugin, l as TeenSafetyHarmfulBodyIdealsPlugin, lt as readProviderPromptMap, m as ShellInjectionPlugin, mt as processFileReference, n as getGraderById, nt as matchesSimilarity, o as UnsafeBenchPlugin, ot as doRemoteGrading, p as SqlInjectionPlugin, pt as loadFromJavaScriptFile, q as matchesContextRecall, r as VLSUPlugin, rt as matchesTrajectoryGoalSuccess, s as ToxicChatPlugin, st as processPrompts, t as GRADERS, tt as matchesSelectBest, u as TeenSafetyDangerousRoleplayPlugin, ut as SUGGEST_PROMPTS_SYSTEM_MESSAGE, v as PolicyPlugin, vt as sampleArray, w as OverreliancePlugin, x as makeInlinePolicyIdSync, y as determinePolicyTypeFromId, yt as getDefaultProviders, z as callProviderWithContext } from "../graders-pvbReLLn.js";
26
- import { l as validateFunctionCall$1 } from "../util-D3q0WQ-0.js";
27
- import { i as getProcessShim, n as transform, t as TransformInputType } from "../transform-DYX1_Xnh.js";
28
- import { t as ellipsize } from "../text-CZr46tp_.js";
29
- import { n as runRuby } from "../rubyUtils-CYSQEG4a.js";
1
+ import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, f as sanitizeObject, g as getAjv, h as extractJsonObjects, k as state, m as extractFirstJsonObject, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "../logger-KD8JjCRJ.js";
2
+ import { A as TERMINAL_MAX_WIDTH, F as FILE_METADATA_KEY, I as HUMAN_ASSERTION_TYPE, M as getDefaultShareViewBaseUrl, N as getShareApiBaseUrl, P as getShareViewBaseUrl, S as parseChatPrompt, a as CloudConfig, d as sleep, h as REQUEST_TIMEOUT_MS, j as getDefaultPort, n as fetchWithRetries, o as cloudConfig, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp, y as isPromptfooSampleTarget } from "../fetch-Y5qX_kST.js";
3
+ import { n as VERSION } from "../version-0frU0UTr.js";
4
+ import { t as invariant } from "../invariant-DIYf9sP1.js";
5
+ import { a as getAuthor, c as isLoggedIntoCloud, l as promptForEmailUnverified, n as checkEmailStatusAndMaybeExit, o as getUserEmail, r as clearUserEmail, s as getUserId, t as checkEmailStatus, u as setUserEmail } from "../accounts-bnyHT7Ju.js";
6
+ import { r as importModule, t as getDirectory } from "../esm-B6whoAcf.js";
7
+ import { a as getNunjucksEngine, i as extractVariablesFromTemplates, r as extractVariablesFromTemplate, t as renderEnvOnlyInObject } from "../render-7uNJ2V14.js";
8
+ import { t as providerRegistry } from "../providerRegistry-1gB5vtzQ.js";
9
+ import { a as openBrowser, c as getRemoteGenerationUrl, d as neverGenerateRemote, i as checkServerRunning, n as BrowserBehaviorNames, p as shouldGenerateRemote, s as promptYesNo, t as BrowserBehavior, u as getRemoteHealthUrl } from "../server-ByxbqAcQ.js";
10
+ import { a as evalResultsTable, c as evalsToPromptsTable, d as promptsTable, g as getDbSignalPath, h as getDb, i as datasetsTable, l as evalsToTagsTable, n as blobReferencesTable, o as evalsTable, p as tagsTable, r as configsTable, s as evalsToDatasetsTable, t as blobAssetsTable, u as modelAuditsTable } from "../tables-BMSOS2Gg.js";
11
+ import { $ as MULTI_INPUT_EXCLUDED_PLUGINS, A as STRATEGY_COLLECTIONS, B as ALIASED_PLUGIN_MAPPINGS, E as ALL_STRATEGIES, F as isMultiTurnStrategy, G as DEFAULT_PLUGINS, H as BIAS_PLUGINS, I as Severity, J as HARM_PLUGINS, K as FINANCIAL_PLUGINS, L as categoryAliases, M as getDefaultNFanout, O as DEFAULT_STRATEGIES, P as isFanoutStrategy, Q as MEDICAL_PLUGINS, S as StrategyConfigSchema, U as CANARY_BREAKING_STRATEGY_IDS, V as ALL_PLUGINS, W as DATASET_EXEMPT_PLUGINS, X as LLAMA_GUARD_ENABLED_CATEGORIES, Y as INSURANCE_PLUGINS, Z as LLAMA_GUARD_REPLICATE_PROVIDER, _ as ProvidersSchema, a as EvaluateOptionsSchema, at as REDTEAM_PROVIDER_HARM_PLUGINS, b as PluginConfigSchema, c as TestSuiteConfigSchema, ct as TEEN_SAFETY_PLUGINS, d as isGradingResult, dt as CODING_AGENT_CORE_PLUGINS, et as MULTI_INPUT_VAR, f as isResultFailureReason, g as ProviderOptionsSchema, h as RedteamConfigSchema, ht as PromptSchema, i as EvalResultsFilterMode, it as REDTEAM_MODEL, j as STRATEGY_COLLECTION_MAPPINGS, l as TestSuiteSchema, lt as TELECOM_PLUGINS, m as isProviderOptions, n as BaseAssertionTypesSchema, nt as PII_PLUGINS, ot as REMOTE_ONLY_PLUGIN_IDS, p as isApiProvider, q as FOUNDATION_PLUGINS, r as CommandLineOptionsSchema, rt as PLUGIN_CATEGORIES, s as ResultFailureReason, t as AssertionOrSetSchema, tt as PHARMACY_PLUGINS, u as UnifiedConfigSchema, ut as UNALIGNED_PROVIDER_HARM_PLUGINS, v as ConversationMessageSchema, w as isUuid, y as PartialGenerationError, z as riskCategorySeverityMap } from "../types-BDjGOq4E.js";
12
+ import { i as isJavascriptFile } from "../fileExtensions-BGh-W-HT.js";
13
+ import { n as sha256, t as randomSequence } from "../createHash-4gFQpDDv.js";
14
+ import { a as generateIdFromPrompt, t as hashPrompt } from "../utils-BUMN8orw.js";
15
+ import { t as getTraceStore } from "../store-2OXm_eBY.js";
16
+ import { c as isNonTransientHttpStatus, i as getCache, n as disableCache, o as withCacheNamespace, r as fetchWithCache, s as NON_TRANSIENT_HTTP_STATUSES, t as cache_exports } from "../cache-Cr-qWIbP.js";
17
+ import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "../tokenUsageUtils-BjVkdk18.js";
18
+ import { n as getBlobUrl, t as getBlobByHash } from "../blobs-D_gg8nbm.js";
19
+ import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "../extractor-YlZbUMsL.js";
20
+ import { $ as AIStudioChatProvider, D as getShortPluginId, E as getSessionId, F as loadFromPackage, G as throwIfTargetPromptExceedsMaxChars, H as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY, I as redteamProviderManager, L as TokenUsageTracker, M as renderPrompt, N as runExtensionHook, O as isBasicRefusal, P as isPackagePath, Q as VertexChatProvider, R as createRateLimitRegistry, S as extractGoalFromPrompt, T as extractVariablesFromJson, U as getGeneratedPromptOverLimit, V as PromptfooHarmfulCompletionProvider, W as getMaxCharsPerMessageModifierValue, _ as mediaExists, a as resolveProviderConfigs, b as checkExfilTracking, c as MCPProvider, ct as checkCloudPermissions, d as createTransformResponse, dt as getOrgContext, f as GoogleLiveProvider, ft as getPluginSeverityOverridesFromCloud, g as getMediaStorage, h as validateStrategies, ht as resolveTeamId, i as resolveProvider, j as collectFileMetadata, l as HttpProvider, lt as getCloudDatabaseId, m as loadStrategy, mt as isCloudProvider, n as loadApiProvider, p as Strategies, r as loadApiProviders, t as getProviderIds, u as createTransformRequest, ut as getEvalConfigFromCloud, v as retrieveMedia, w as extractPromptFromTags, y as pluginMatchesStrategyTargets, z as createProviderRateLimitOptions } from "../providers-DRrerKra.js";
21
+ import { n as telemetry, t as TelemetryEventSchema } from "../telemetry--iqaGyaS.js";
22
+ import { r as runPython } from "../pythonUtils-C4tltmIn.js";
23
+ import { F as readOutput, I as loadFunction, L as parseFileUrl, M as maybeLoadToolsFromExternalFile, O as maybeLoadConfigFromExternalFile, P as readFilters, _ as getProviderDescription, a as evalTableToJson, b as isOpenAiProvider, c as fetchCsvFromGoogleSheet, d as extractRuntimeVars, f as filterRuntimeVars, g as doesProviderRefMatch, h as checkProviderApiKeys, i as ComparisonEvalNotFoundError, k as maybeLoadFromExternalFile, l as setupEnv, m as resultIsForTestCase, n as writeMultipleOutputs, o as generateEvalCsv, p as getTestCaseDeduplicationKey, r as writeOutput, s as mergeComparisonTables, t as printBorder, u as deduplicateTestCases, v as isAnthropicProvider, w as normalizeProviderRef, x as isProviderAllowed, y as isGoogleProvider } from "../util-3pBZZb_H.js";
24
+ import { t as OpenAiChatCompletionProvider } from "../chat-DChSH_Es.js";
25
+ import { h as validateFunctionCall } from "../transform-DtooZqYY.js";
26
+ import { $ as CONTEXT_FAITHFULNESS_LONGFORM, A as DivergentRepetitionPlugin, B as matchesFactuality, C as getPiiLeakTestsForCategory, Ct as getCustomPolicies, D as HarmbenchPlugin, Dt as DEFAULT_ANTHROPIC_MODEL, E as ImitationPlugin, Et as getDefaultProviders, F as AegisPlugin, G as doRemoteGrading, H as matchesLlmRubric, I as RedteamGraderBase, J as readProviderPromptMap, K as processPrompts, L as RedteamPluginBase, M as CrossSessionLeakPlugin, N as ContractPlugin, O as HallucinationPlugin, P as BeavertailsPlugin, Q as ANSWER_RELEVANCY_GENERATE, R as fetchHuggingFaceDataset, S as PlinyPlugin, St as withProviderCallExecutionContext, T as IntentPlugin, Tt as sampleArray, U as matchesPiScore, V as matchesGEval, W as matchesTrajectoryGoalSuccess, X as SELECT_BEST_PROMPT, Y as DEFAULT_WEB_SEARCH_PROMPT, Z as SUGGEST_PROMPTS_SYSTEM_MESSAGE, _ as PoliticsPlugin, _t as loadFromJavaScriptFile, a as UnverifiableClaimsPlugin, at as CONTEXT_RELEVANCE_BAD, b as isValidPolicyObject, bt as getAndCheckProvider, c as ToolDiscoveryPlugin, ct as cosineSimilarity, d as TeenSafetyDangerousContentPlugin, dt as fail, et as CONTEXT_FAITHFULNESS_NLI_STATEMENTS, f as TeenSafetyAgeRestrictedGoodsAndServicesPlugin, ft as normalizeMatcherTokenUsage, g as PromptExtractionPlugin, gt as getFinalTest, h as RbacPlugin, ht as coerceString, i as VLGuardPlugin, it as CONTEXT_RELEVANCE, j as DebugAccessPlugin, k as ExcessiveAgencyPlugin, l as TeenSafetyHarmfulBodyIdealsPlugin, lt as dotProduct, m as ShellInjectionPlugin, mt as tryParse, n as getGraderById, nt as CONTEXT_RECALL_ATTRIBUTED_TOKEN, o as UnsafeBenchPlugin, ot as loadRubricPrompt, p as SqlInjectionPlugin, pt as splitIntoSentences, q as readPrompts, r as VLSUPlugin, rt as CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN, s as ToxicChatPlugin, st as renderLlmRubricPrompt, t as GRADERS, tt as CONTEXT_RECALL, u as TeenSafetyDangerousRoleplayPlugin, ut as euclideanDistance, v as PolicyPlugin, vt as processFileReference, w as OverreliancePlugin, wt as 
retryWithDeduplication, x as makeInlinePolicyIdSync, xt as getGradingProvider, y as determinePolicyTypeFromId, yt as callProviderWithContext, z as matchesClosedQa } from "../graders-CwrbifOo.js";
27
+ import { l as validateFunctionCall$1 } from "../util-B9CNhyac.js";
28
+ import { i as getProcessShim, n as transform, t as TransformInputType } from "../transform-CwbAZ84V.js";
29
+ import { t as ellipsize } from "../text-NWvfMfkF.js";
30
+ import { n as runRuby } from "../rubyUtils-DGnoCYL2.js";
30
31
  import dotenv from "dotenv";
31
32
  import { AsyncResource } from "node:async_hooks";
32
33
  import * as fs$3 from "fs";
@@ -65,7 +66,7 @@ import { exec, spawn } from "child_process";
65
66
  import { XMLParser } from "fast-xml-parser";
66
67
  import async from "async";
67
68
  import { LRUCache } from "lru-cache";
68
- import { JSDOM } from "jsdom";
69
+ import { parse as parse$2 } from "parse5";
69
70
  import { distance } from "fastest-levenshtein";
70
71
  import * as rouge from "js-rouge";
71
72
  import { isDeepStrictEqual } from "node:util";
@@ -910,6 +911,24 @@ function sanitizeForDb(obj) {
910
911
  return Array.isArray(obj) ? [] : null;
911
912
  }
912
913
  }
914
+ /**
915
+ * Sanitize a per-test-case field for persistence: strips circular refs,
916
+ * collapses class instances (e.g. live SDK clients that leaked in via
917
+ * `defaultTest.options.provider`), and redacts credential fields (`apiKey`,
918
+ * `token`, etc.) at any depth. Use this for any slot that can carry a provider
919
+ * config — notably `testCase.options.provider` and `prompt.config.provider`,
920
+ * where the resolved runtime provider (with its Anthropic / Bedrock SDK
921
+ * client) flows in from the evaluator. Without this, credentials configured on
922
+ * the judge provider end up in the Eval results both in the DB and in the
923
+ * polling response served by `/api/eval/job/:id`.
924
+ */
925
+ function sanitizeForDbWithSecrets(obj) {
926
+ if (obj === null || obj === void 0) return obj;
927
+ return sanitizeObject(obj, {
928
+ context: "evalResult field",
929
+ maxDepth: Number.POSITIVE_INFINITY
930
+ });
931
+ }
913
932
  var EvalResult = class EvalResult {
914
933
  static async createFromEvaluateResult(evalId, result, opts) {
915
934
  const persist = opts?.persist == null ? true : opts.persist;
@@ -926,10 +945,10 @@ var EvalResult = class EvalResult {
926
945
  const args = {
927
946
  id: crypto.randomUUID(),
928
947
  evalId,
929
- testCase: sanitizeForDb(preSanitizeTestCase),
948
+ testCase: sanitizeForDbWithSecrets(preSanitizeTestCase),
930
949
  promptIdx: result.promptIdx,
931
950
  testIdx: result.testIdx,
932
- prompt: sanitizeForDb(prompt),
951
+ prompt: sanitizeForDbWithSecrets(prompt),
933
952
  promptId: hashPrompt(prompt),
934
953
  error: error?.toString(),
935
954
  success,
@@ -968,8 +987,8 @@ var EvalResult = class EvalResult {
968
987
  for (const result of processedResults) {
969
988
  const sanitizedResult = {
970
989
  ...result,
971
- testCase: sanitizeForDb(result.testCase),
972
- prompt: sanitizeForDb(result.prompt),
990
+ testCase: sanitizeForDbWithSecrets(result.testCase),
991
+ prompt: sanitizeForDbWithSecrets(result.prompt),
973
992
  response: sanitizeForDb(result.response),
974
993
  gradingResult: sanitizeForDb(result.gradingResult),
975
994
  namedScores: sanitizeForDb(result.namedScores),
@@ -3841,6 +3860,502 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
3841
3860
  };
3842
3861
  };
3843
3862
  //#endregion
3863
+ //#region src/matchers/classification.ts
3864
+ /**
3865
+ *
3866
+ * @param expected Expected classification. If undefined, matches any classification.
3867
+ * @param output Text to classify.
3868
+ * @param threshold Value between 0 and 1. If the expected classification is undefined, the threshold is the minimum score for any classification. If the expected classification is defined, the threshold is the minimum score for that classification.
3869
+ * @param grading
3870
+ * @returns Pass if the output matches the classification with a score greater than or equal to the threshold.
3871
+ */
3872
+ async function matchesClassification(expected, output, threshold, grading) {
3873
+ const resp = await (await getAndCheckProvider("classification", grading?.provider, null, "classification check")).callClassificationApi(output);
3874
+ if (!resp.classification) return fail(resp.error || "Unknown error fetching classification");
3875
+ let score;
3876
+ if (expected === void 0) {
3877
+ const scores = Object.values(resp.classification);
3878
+ if (scores.length === 0) return {
3879
+ pass: false,
3880
+ score: 0,
3881
+ reason: "No classification scores returned"
3882
+ };
3883
+ score = Math.max(...scores);
3884
+ } else score = resp.classification[expected] || 0;
3885
+ if (score >= threshold - Number.EPSILON) {
3886
+ const reason = expected === void 0 ? `Maximum classification score ${score.toFixed(2)} >= ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} >= ${threshold}`;
3887
+ return {
3888
+ pass: true,
3889
+ score,
3890
+ reason
3891
+ };
3892
+ }
3893
+ return {
3894
+ pass: false,
3895
+ score,
3896
+ reason: expected === void 0 ? `Maximum classification score ${score.toFixed(2)} < ${threshold}` : `Classification ${expected} has score ${score.toFixed(2)} < ${threshold}`
3897
+ };
3898
+ }
3899
+ //#endregion
3900
+ //#region src/matchers/comparison.ts
3901
+ async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
3902
+ invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");
3903
+ const resp = await callProviderWithContext(await getAndCheckProvider("text", grading?.provider, (await getDefaultProviders()).gradingProvider, "select-best check"), await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, SELECT_BEST_PROMPT), {
3904
+ criteria,
3905
+ outputs: outputs.map((o) => tryParse(o)),
3906
+ ...vars || {}
3907
+ }), "select-best", {
3908
+ criteria,
3909
+ outputs: outputs.map((o) => tryParse(o)),
3910
+ ...vars || {}
3911
+ }, providerCallContext);
3912
+ if (resp.error || !resp.output) return Array.from({ length: outputs.length }, () => fail(resp.error || "No output", resp.tokenUsage));
3913
+ invariant(typeof resp.output === "string", "select-best produced malformed response");
3914
+ const firstIntegerMatch = resp.output.trim().match(/\d+/);
3915
+ const verdict = firstIntegerMatch ? Number.parseInt(firstIntegerMatch[0], 10) : NaN;
3916
+ if (Number.isNaN(verdict) || verdict < 0 || verdict >= outputs.length) return Array.from({ length: outputs.length }, () => fail(`Invalid select-best verdict: ${verdict}`, resp.tokenUsage));
3917
+ const tokensUsed = normalizeMatcherTokenUsage(resp.tokenUsage);
3918
+ return outputs.map((_output, index) => {
3919
+ if (index === verdict) return {
3920
+ pass: true,
3921
+ score: 1,
3922
+ reason: `Output selected as the best: ${criteria}`,
3923
+ tokensUsed
3924
+ };
3925
+ else return {
3926
+ pass: false,
3927
+ score: 0,
3928
+ reason: `Output not selected: ${criteria}`,
3929
+ tokensUsed
3930
+ };
3931
+ });
3932
+ }
3933
+ async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
3934
+ invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
3935
+ const value = assertion.value || {};
3936
+ const options = {
3937
+ method: typeof value === "object" && "method" in value ? value.method : "average",
3938
+ weights: typeof value === "object" && "weights" in value ? value.weights : {},
3939
+ threshold: typeof value === "object" && "threshold" in value ? value.threshold : void 0
3940
+ };
3941
+ const scores = resultsWithGradingResults.map((result, index) => {
3942
+ const relevantResults = (result.gradingResult?.componentResults || []).filter((r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best");
3943
+ if (relevantResults.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
3944
+ let totalWeightedScore = 0;
3945
+ let totalWeight = 0;
3946
+ relevantResults.forEach((componentResult) => {
3947
+ const assertionType = componentResult.assertion?.type || "unknown";
3948
+ const weight = options.weights[assertionType] === void 0 ? 1 : options.weights[assertionType];
3949
+ const score = componentResult.score || 0;
3950
+ totalWeightedScore += score * weight;
3951
+ totalWeight += weight;
3952
+ });
3953
+ let aggregateScore;
3954
+ if (options.method === "sum") aggregateScore = totalWeightedScore;
3955
+ else aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
3956
+ return {
3957
+ index,
3958
+ score: aggregateScore,
3959
+ componentCount: relevantResults.length,
3960
+ totalWeight
3961
+ };
3962
+ });
3963
+ let maxScore = -Infinity;
3964
+ let winnerIndex = 0;
3965
+ for (let i = 0; i < scores.length; i++) if (scores[i].score > maxScore) {
3966
+ maxScore = scores[i].score;
3967
+ winnerIndex = i;
3968
+ }
3969
+ const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;
3970
+ return scores.map(({ index, score, componentCount, totalWeight }) => {
3971
+ const isWinner = index === winnerIndex && meetsThreshold;
3972
+ return {
3973
+ pass: isWinner,
3974
+ score: isWinner ? 1 : 0,
3975
+ reason: isWinner ? `Selected as highest scoring output (score: ${score.toFixed(3)})` : score === maxScore && !meetsThreshold ? `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}` : `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`,
3976
+ namedScores: {
3977
+ maxScore: score,
3978
+ assertionCount: componentCount,
3979
+ totalWeight
3980
+ }
3981
+ };
3982
+ });
3983
+ }
3984
+ //#endregion
3985
+ //#region src/matchers/moderation.ts
3986
+ async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
3987
+ if (!assistantResponse) return {
3988
+ pass: true,
3989
+ score: 1,
3990
+ reason: "No output to moderate"
3991
+ };
3992
+ const defaultProviders = await getDefaultProviders();
3993
+ const defaultModerationProvider = !getEnvString("OPENAI_API_KEY") && (getEnvString("REPLICATE_API_KEY") || getEnvString("REPLICATE_API_TOKEN")) ? await loadApiProvider(LLAMA_GUARD_REPLICATE_PROVIDER) : defaultProviders.moderationProvider;
3994
+ const moderationProvider = await getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
3995
+ invariant(moderationProvider, "Moderation provider must be defined");
3996
+ const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
3997
+ if (resp.error) return {
3998
+ pass: false,
3999
+ score: 0,
4000
+ reason: `Moderation API error: ${resp.error}`
4001
+ };
4002
+ const { flags } = resp;
4003
+ if (!flags || flags.length === 0) return {
4004
+ pass: true,
4005
+ score: 1,
4006
+ reason: "No moderation flags detected"
4007
+ };
4008
+ const filteredFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
4009
+ if (filteredFlags.length > 0) return {
4010
+ pass: false,
4011
+ score: 0,
4012
+ reason: `Moderation flags detected: ${filteredFlags.map((flag) => flag.description).join(", ")}`
4013
+ };
4014
+ return {
4015
+ pass: true,
4016
+ score: 1,
4017
+ reason: "No relevant moderation flags detected"
4018
+ };
4019
+ }
4020
+ //#endregion
4021
+ //#region src/assertions/contextUtils.ts
4022
+ /**
4023
+ * Resolves the context value for context-based assertions.
4024
+ * Supports extracting context from test variables or transforming from output.
4025
+ * Can return either a single context string or an array of context chunks.
4026
+ *
4027
+ * @param assertion - The assertion configuration
4028
+ * @param test - The test case
4029
+ * @param output - The provider output (after provider transform, before test transform)
4030
+ * @param prompt - The prompt text
4031
+ * @param fallbackContext - Optional fallback context (e.g., prompt for context-recall)
4032
+ * @param providerResponse - Optional full provider response for contextTransform
4033
+ * @returns The resolved context string or array of strings
4034
+ * @throws Error if context cannot be resolved or transform fails
4035
+ */
4036
async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
  let contextValue;
  const varsContext = test.vars?.context;

  // 1. Seed the context from the test variables; only when no truthy
  //    `context` variable exists does the caller-supplied fallback apply.
  if (varsContext) {
    if (typeof varsContext === "string") {
      contextValue = varsContext;
    } else if (Array.isArray(varsContext)) {
      const invalidIndex = varsContext.findIndex((entry) => typeof entry !== "string");
      if (invalidIndex !== -1) {
        invariant(false, `Invalid context: expected an array of strings, but found ${typeof varsContext[invalidIndex]} at index ${invalidIndex}`);
      }
      contextValue = varsContext;
    }
  } else if (fallbackContext) {
    contextValue = fallbackContext;
  }

  // 2. A contextTransform, when configured, overrides whatever step 1 found.
  if (assertion.contextTransform) {
    try {
      // Prefer the provider-transformed output over the raw output argument.
      const outputForTransform = providerResponse?.providerTransformedOutput ?? output;
      const transformContext = {
        vars: test.vars,
        prompt: { label: prompt }
      };
      if (providerResponse && providerResponse.metadata) {
        transformContext.metadata = providerResponse.metadata;
      }
      const transformed = await transform(assertion.contextTransform, outputForTransform, transformContext);
      invariant(typeof transformed === "string" || Array.isArray(transformed) && transformed.every((item) => typeof item === "string"), `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${assertion.contextTransform}`);
      contextValue = transformed;
    } catch (error) {
      // Keep the original error (and its stack) reachable through `cause`
      // instead of flattening it into the message only.
      throw new Error(`Failed to transform context using expression '${assertion.contextTransform}': ${error instanceof Error ? error.message : String(error)}`, { cause: error });
    }
  }

  // 3. The final value must be a non-empty string or a non-empty array of
  //    non-empty strings.
  const isNonEmptyString = (item) => typeof item === "string" && item.length > 0;
  invariant(isNonEmptyString(contextValue) || Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every(isNonEmptyString), "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
  return contextValue;
}
4064
/**
 * Flattens a context value into one string suitable for a prompt.
 *
 * @param context - Either a single string or an array of chunks
 * @returns The string unchanged, or the array's chunks joined by a blank line
 */
function serializeContext(context) {
  if (!Array.isArray(context)) return context;
  const chunkSeparator = "\n\n";
  return context.join(chunkSeparator);
}
4071
+ //#endregion
4072
+ //#region src/matchers/rag.ts
4073
/**
 * Scores how relevant `output` is to `input`.
 *
 * The text provider is called three times with the rendered rubric prompt;
 * each string output is kept as a candidate question. The input and every
 * candidate are embedded, and the score is the mean cosine similarity between
 * the input embedding and the candidate embeddings.
 *
 * @returns A grading result with pass/score/reason, accumulated token usage
 *   and metadata; or the result of `fail(...)` as soon as any provider call
 *   reports an error or returns nothing.
 */
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
  const defaultProviders = await getDefaultProviders();
  const embeddingProvider = await getAndCheckProvider("embedding", grading?.provider, defaultProviders.embeddingProvider, "answer relevancy check");
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "answer relevancy check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, ANSWER_RELEVANCY_GENERATE);
  const parsedOutput = tryParse(output);
  const promptText = await renderLlmRubricPrompt(rubricPrompt, { answer: parsedOutput });

  // Collect three generations sequentially, bailing out on the first failure.
  const candidateQuestions = [];
  for (let attempt = 0; attempt < 3; attempt += 1) {
    const generation = await callProviderWithContext(textProvider, promptText, "answer-relevance", { answer: parsedOutput }, providerCallContext);
    accumulateTokenUsage(tokensUsed, generation.tokenUsage);
    if (generation.error || !generation.output) {
      return fail(generation.error || "No output", tokensUsed);
    }
    invariant(typeof generation.output === "string", "answer relevancy check produced malformed response");
    candidateQuestions.push(generation.output);
  }

  invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id()} must implement callEmbeddingApi for similarity check`);
  const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
  accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
  if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) {
    return fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
  }
  const inputEmbedding = inputEmbeddingResp.embedding;

  // Embed each candidate and compare it with the input embedding.
  const questionsWithScores = [];
  let similaritySum = 0;
  for (const question of candidateQuestions) {
    const questionEmbeddingResp = await embeddingProvider.callEmbeddingApi(question);
    accumulateTokenUsage(tokensUsed, questionEmbeddingResp.tokenUsage);
    if (questionEmbeddingResp.error || !questionEmbeddingResp.embedding) {
      return fail(questionEmbeddingResp.error || "No embedding", tokensUsed);
    }
    const questionSimilarity = cosineSimilarity(inputEmbedding, questionEmbeddingResp.embedding);
    similaritySum += questionSimilarity;
    questionsWithScores.push({
      question,
      similarity: questionSimilarity
    });
  }

  const similarity = similaritySum / questionsWithScores.length;
  const pass = similarity >= threshold - Number.EPSILON;
  const reason = pass
    ? `Relevance ${similarity.toFixed(2)} is greater than threshold ${threshold}`
    : `Relevance ${similarity.toFixed(2)} is less than threshold ${threshold}`;
  return {
    pass,
    score: similarity,
    reason,
    tokensUsed,
    metadata: {
      generatedQuestions: questionsWithScores,
      averageSimilarity: similarity,
      threshold
    }
  };
}
4131
/**
 * Scores context recall: the share of verdict sentences in the grader's
 * response that are marked as attributed.
 *
 * Only sentences containing either the attributed or the not-attributed token
 * (compared case-insensitively) are counted. A sentence counts as attributed
 * when it contains the attributed token and not the not-attributed token.
 *
 * @returns A grading result with per-sentence attributions in `metadata`, or
 *   the result of `fail(...)` when the provider errors or returns no output.
 */
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
  const defaultProviders = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context recall check");
  const contextString = serializeContext(context);
  // A fresh object per use, so the render call and the provider call never share one.
  const buildPromptVars = () => ({
    context: contextString,
    groundTruth,
    ...(vars || {})
  });
  const rubricTemplate = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RECALL);
  const renderedPrompt = await renderLlmRubricPrompt(rubricTemplate, buildPromptVars());
  const resp = await callProviderWithContext(textProvider, renderedPrompt, "context-recall", buildPromptVars(), providerCallContext);
  if (resp.error || !resp.output) {
    return fail(resp.error || "No output", resp.tokenUsage);
  }
  invariant(typeof resp.output === "string", "context-recall produced malformed response");

  const attributedMarker = CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
  const notAttributedMarker = CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
  const hasVerdict = (line) => {
    const lowered = line.toLowerCase();
    return lowered.includes(attributedMarker) || lowered.includes(notAttributedMarker);
  };
  const sentences = splitIntoSentences(resp.output).filter(hasVerdict);

  const sentenceAttributions = sentences.map((sentence) => {
    const lowered = sentence.toLowerCase();
    const attributed = !lowered.includes(notAttributedMarker) && lowered.includes(attributedMarker);
    // Prefer the text after a leading "N." list marker; otherwise take
    // everything up to the first period.
    const numbered = sentence.match(/^\d+\.\s*([^\.]+\.)/);
    const text = numbered ? numbered[1].trim() : sentence.split(".")[0].trim();
    return {
      sentence: text,
      attributed
    };
  });
  const attributedCount = sentenceAttributions.filter((entry) => entry.attributed).length;

  const score = sentences.length > 0 ? attributedCount / sentences.length : 0;
  const pass = score >= threshold - Number.EPSILON;
  const comparison = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `Recall ${score.toFixed(2)} is ${comparison} ${threshold}`,
    tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata: {
      sentenceAttributions,
      totalSentences: sentences.length,
      attributedSentences: attributedCount,
      score
    }
  };
}
4180
/**
 * Scores context relevance: the number of unique sentences in the grader's
 * response, capped at the number of context units, divided by that number.
 *
 * Context units are the non-blank chunks when `context` is an array, or the
 * sentences of the string otherwise. If the response contains
 * `CONTEXT_RELEVANCE_BAD`, the numerator is forced to zero.
 *
 * @returns A grading result with the counted units in `metadata`, or the
 *   result of `fail(...)` when the provider errors or returns no output.
 */
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
  const defaultProviders = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context relevance check");
  const contextString = serializeContext(context);
  const buildPromptVars = () => ({
    context: contextString,
    query: question
  });
  const rubricTemplate = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RELEVANCE);
  const renderedPrompt = await renderLlmRubricPrompt(rubricTemplate, buildPromptVars());
  const resp = await callProviderWithContext(textProvider, renderedPrompt, "context-relevance", buildPromptVars(), providerCallContext);
  if (resp.error || !resp.output) {
    return fail(resp.error || "No output", resp.tokenUsage);
  }
  invariant(typeof resp.output === "string", "context-relevance produced malformed response");

  let contextUnits;
  if (Array.isArray(context)) {
    contextUnits = context.filter((chunk) => chunk.trim().length > 0);
  } else {
    contextUnits = splitIntoSentences(context);
  }
  const totalContextUnits = contextUnits.length;

  const extractedSentences = splitIntoSentences(resp.output);
  const insufficientInformation = resp.output.includes(CONTEXT_RELEVANCE_BAD);
  const relevantSentences = [];
  let relevantCount = 0;
  if (!insufficientInformation) {
    const distinctSentences = [...new Set(extractedSentences)];
    // Never report more relevant sentences than there are context units.
    relevantCount = Math.min(distinctSentences.length, totalContextUnits);
    relevantSentences.push(...distinctSentences);
  }

  const score = totalContextUnits > 0 ? relevantCount / totalContextUnits : 0;
  const pass = score >= threshold - Number.EPSILON;
  const comparison = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `Context relevance ${score.toFixed(2)} is ${comparison} ${threshold}`,
    tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata: {
      extractedSentences: relevantSentences,
      totalContextUnits,
      totalContextSentences: totalContextUnits,
      contextUnits,
      relevantSentenceCount: relevantCount,
      insufficientInformation,
      score
    }
  };
}
4223
/**
 * Scores context faithfulness with two grader calls.
 *
 * The first ("longform") call's output is split into statements. The second
 * ("nli") call receives the context plus those statements, and its output is
 * parsed for verdicts. The score is `1 - (non-supporting verdicts / number of
 * statements)`, clamped to [0, 1]; it stays 0 when there are no statements or
 * no verdicts can be found.
 *
 * @returns A grading result with accumulated token usage, or the result of
 *   `fail(...)` when either provider call errors or returns no output.
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
  const defaultProviders = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "faithfulness check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  if (grading?.rubricPrompt) {
    invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");
  }

  // Entry 0 overrides the longform prompt and entry 1 the NLI prompt; each
  // entry may be a plain string or an object carrying `content`.
  const pickRawPrompt = (index) => {
    const entry = grading?.rubricPrompt?.[index];
    return typeof entry === "string" ? entry : entry?.content;
  };
  const rawLongformPrompt = pickRawPrompt(0);
  const rawNliPrompt = pickRawPrompt(1);
  const longformPrompt = await loadRubricPrompt(rawLongformPrompt, CONTEXT_FAITHFULNESS_LONGFORM);
  const nliPrompt = await loadRubricPrompt(rawNliPrompt, CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

  // Step 1: longform call.
  const buildLongformVars = () => ({
    question: query,
    answer: tryParse(output),
    ...(vars || {})
  });
  const longformText = await renderLlmRubricPrompt(longformPrompt, buildLongformVars());
  const longformResp = await callProviderWithContext(textProvider, longformText, "context-faithfulness-longform", buildLongformVars(), providerCallContext);
  accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
  if (longformResp.error || !longformResp.output) {
    return fail(longformResp.error || "No output", tokensUsed);
  }
  invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

  // Step 2: NLI call over the extracted statements.
  const contextString = serializeContext(context);
  const statements = splitIntoSentences(longformResp.output);
  const buildNliVars = () => ({
    context: contextString,
    statements,
    ...(vars || {})
  });
  const nliText = await renderLlmRubricPrompt(nliPrompt, buildNliVars());
  const nliResp = await callProviderWithContext(textProvider, nliText, "context-faithfulness-nli", buildNliVars(), providerCallContext);
  accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
  if (nliResp.error || !nliResp.output) {
    return fail(nliResp.error || "No output", tokensUsed);
  }
  invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

  // Step 3: parse verdicts from the lower-cased response.
  const verdictHeader = "Final verdict for each statement in order:".toLowerCase();
  const verdictText = nliResp.output.toLowerCase().trim();
  let score = 0;
  if (statements.length > 0) {
    if (verdictText.includes(verdictHeader)) {
      // Summary form: period-separated verdicts after the header line.
      const afterHeader = verdictText.slice(verdictText.indexOf(verdictHeader) + verdictHeader.length);
      const parsedVerdicts = afterHeader.split(".").filter((answer) => answer.trim() !== "");
      if (parsedVerdicts.length > 0) {
        const unsupported = parsedVerdicts.filter((answer) => !answer.includes("yes")).length;
        score = 1 - unsupported / statements.length;
      }
    } else {
      // Inline form: count "verdict: no" / "verdict: yes" occurrences.
      const noVerdictCount = verdictText.split("verdict: no").length - 1;
      const yesVerdictCount = verdictText.split("verdict: yes").length - 1;
      if (noVerdictCount + yesVerdictCount > 0) {
        score = 1 - noVerdictCount / statements.length;
      }
    }
  }
  score = Math.min(1, Math.max(0, score));

  const pass = score >= threshold - Number.EPSILON;
  const comparison = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `Faithfulness ${score.toFixed(2)} is ${comparison} ${threshold}`,
    tokensUsed
  };
}
4280
+ //#endregion
4281
+ //#region src/matchers/similarity.ts
4282
/**
 * Compares two embeddings using the requested metric.
 *
 * @param metric - "cosine", "dot_product" or "euclidean"
 * @returns The value computed by the matching helper, or the result of
 *   `fail(...)` for any other metric name
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
  if (metric === "cosine") return cosineSimilarity(expectedEmbedding, outputEmbedding);
  if (metric === "dot_product") return dotProduct(expectedEmbedding, outputEmbedding);
  if (metric === "euclidean") return euclideanDistance(expectedEmbedding, outputEmbedding);
  return fail(`Unsupported metric: ${metric}`, tokensUsed);
}
4290
/**
 * Turns a raw similarity (or distance) value into a grading result.
 *
 * For "euclidean" the value is a distance: it passes when at or below the
 * threshold (at or above when `inverse`), and the score is `1 / (1 + distance)`
 * (or one minus that when `inverse`). For every other metric the value passes
 * when at or above the threshold (at or below when `inverse`), and the score
 * is the value itself (or one minus it when `inverse`). Comparisons allow a
 * `Number.EPSILON` tolerance.
 *
 * @returns `{ pass, score, reason, tokensUsed }`
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
  const tolerance = Number.EPSILON;
  const formatted = similarity.toFixed(2);

  if (metric === "euclidean") {
    let pass;
    if (inverse) pass = similarity >= threshold - tolerance;
    else pass = similarity <= threshold + tolerance;
    const closeness = 1 / (1 + similarity);
    // A passing inverse check and a failing regular check both mean the
    // distance is on the "greater" side of the threshold.
    const reason = pass === Boolean(inverse)
      ? `Distance ${formatted} is greater than threshold ${threshold}`
      : `Distance ${formatted} is less than or equal to threshold ${threshold}`;
    return {
      pass,
      score: inverse ? 1 - closeness : closeness,
      reason,
      tokensUsed
    };
  }

  let pass;
  if (inverse) pass = similarity <= threshold + tolerance;
  else pass = similarity >= threshold - tolerance;
  // A passing inverse check and a failing regular check both mean the
  // similarity is on the "less" side of the threshold.
  const reason = pass === Boolean(inverse)
    ? `Similarity ${formatted} is less than threshold ${threshold}`
    : `Similarity ${formatted} is greater than or equal to threshold ${threshold}`;
  return {
    pass,
    score: inverse ? 1 - similarity : similarity,
    reason,
    tokensUsed
  };
}
4316
/**
 * Obtains a similarity value from the provider.
 *
 * For the "cosine" metric a provider exposing `callSimilarityApi` is asked
 * directly. Otherwise both texts are embedded concurrently through
 * `callEmbeddingApi` and compared with `calculateSimilarityScore`. Token usage
 * from every call is accumulated into `tokensUsed`.
 *
 * @returns A number on success, otherwise the result of `fail(...)`
 * @throws Error when the provider exposes neither API
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
  const hasSimilarityApi = "callSimilarityApi" in finalProvider;

  if (metric === "cosine" && hasSimilarityApi) {
    const similarityResp = await finalProvider.callSimilarityApi(expected, output);
    accumulateTokenUsage(tokensUsed, similarityResp.tokenUsage);
    const { error, similarity } = similarityResp;
    if (error) return fail(error, tokensUsed);
    if (similarity == null) return fail("Unknown error fetching similarity", tokensUsed);
    if (!Number.isFinite(similarity)) return fail(`Invalid similarity score: ${similarity}`, tokensUsed);
    return similarity;
  }

  let callEmbeddingApi;
  if ("callEmbeddingApi" in finalProvider) callEmbeddingApi = finalProvider.callEmbeddingApi;
  if (typeof callEmbeddingApi !== "function") {
    if (hasSimilarityApi) {
      return fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
    }
    throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
  }

  // Invoke with the provider as `this`, since the method was detached above.
  const embed = (text) => callEmbeddingApi.call(finalProvider, text);
  const [expectedEmbedding, outputEmbedding] = await Promise.all([embed(expected), embed(output)]);

  const mergedUsage = normalizeMatcherTokenUsage(void 0);
  accumulateTokenUsage(mergedUsage, expectedEmbedding.tokenUsage);
  accumulateTokenUsage(mergedUsage, outputEmbedding.tokenUsage);
  accumulateTokenUsage(tokensUsed, mergedUsage);

  const embeddingError = expectedEmbedding.error || outputEmbedding.error;
  if (embeddingError) {
    return fail(embeddingError || "Unknown error fetching embeddings", tokensUsed);
  }
  if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
    return fail("Embedding not found", tokensUsed);
  }
  return calculateSimilarityScore(expectedEmbedding.embedding, outputEmbedding.embedding, metric, tokensUsed);
}
4339
/**
 * Grades the similarity between `expected` and `output`.
 *
 * For the "cosine" metric, when `state.config?.redteam` is set and
 * `shouldGenerateRemote` agrees, grading is delegated to `doRemoteGrading`.
 * Otherwise an embedding provider is resolved and the similarity is computed
 * locally.
 *
 * @param inverse - When true the pass condition is inverted (default false)
 * @param metric - Similarity metric name (default "cosine")
 * @returns A grading result, or the result of `fail(...)` on any failure
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
  const useRemoteGrading = metric === "cosine" && state.config?.redteam && shouldGenerateRemote({ requireEmbeddingProvider: true });
  if (useRemoteGrading) {
    try {
      const remoteRequest = {
        task: "similar",
        expected,
        output,
        threshold,
        inverse
      };
      return await doRemoteGrading(remoteRequest);
    } catch (error) {
      return fail(`Could not perform remote grading: ${error}`);
    }
  }

  const defaultProviders = await getDefaultProviders();
  const finalProvider = await getAndCheckProvider("embedding", grading?.provider, defaultProviders.embeddingProvider, "similarity check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  const similarity = await calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed);
  // Anything other than a number is passed through to the caller unchanged.
  if (typeof similarity === "number") {
    return buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed);
  }
  return similarity;
}
4358
+ //#endregion
3844
4359
  //#region src/tracing/evaluatorTracing.ts
3845
4360
  let otlpReceiverStarted = false;
3846
4361
  const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
@@ -3884,7 +4399,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
3884
4399
  telemetry.record("feature_used", { feature: "tracing" });
3885
4400
  try {
3886
4401
  logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
3887
- const { startOTLPReceiver } = await import("../otlpReceiver-DmVulbhC.js");
4402
+ const { startOTLPReceiver } = await import("../otlpReceiver-C6thJRXi.js");
3888
4403
  const port = testSuite.tracing.otlp.http.port || 4318;
3889
4404
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
3890
4405
  const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
@@ -3908,7 +4423,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
3908
4423
  async function stopOtlpReceiverIfNeeded() {
3909
4424
  if (otlpReceiverStarted) try {
3910
4425
  logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
3911
- const { stopOTLPReceiver } = await import("../otlpReceiver-DmVulbhC.js");
4426
+ const { stopOTLPReceiver } = await import("../otlpReceiver-C6thJRXi.js");
3912
4427
  await stopOTLPReceiver();
3913
4428
  otlpReceiverStarted = false;
3914
4429
  logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -3943,7 +4458,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
3943
4458
  }
3944
4459
  if (!tracingEnabled) return null;
3945
4460
  logger.debug("[EvaluatorTracing] Importing trace store");
3946
- const { getTraceStore } = await import("../store-Dn9HUkdW.js").then((n) => n.n);
4461
+ const { getTraceStore } = await import("../store-2OXm_eBY.js").then((n) => n.n);
3947
4462
  const traceStore = getTraceStore();
3948
4463
  const traceId = generateTraceId();
3949
4464
  const spanId = generateSpanId();
@@ -4253,38 +4768,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
4253
4768
  }
4254
4769
  //#endregion
4255
4770
  //#region src/assertions/contains.ts
4771
/**
 * Moves past any run of whitespace and comma separators.
 *
 * Repeated commas are consumed together, so they never yield empty fields.
 *
 * @param value - The string being parsed
 * @param startIndex - Position to start scanning from
 * @returns Index of the first character that is neither whitespace nor a
 *   comma, or an index at/after the end of `value`
 */
function skipWhitespaceAndCommas(value, startIndex) {
  let cursor = startIndex;
  for (;;) {
    if (cursor >= value.length) return cursor;
    cursor = skipWhitespace(value, cursor);
    if (value[cursor] !== ",") return cursor;
    cursor += 1;
  }
}
4786
/**
 * Moves past consecutive whitespace characters; commas are left for the
 * caller to handle.
 *
 * @param value - The string being parsed
 * @param startIndex - Position to start scanning from
 * @returns Index of the first non-whitespace character at or after
 *   `startIndex`
 */
function skipWhitespace(value, startIndex) {
  const whitespace = /\s/;
  let cursor = startIndex;
  while (cursor < value.length) {
    if (!whitespace.test(value[cursor])) break;
    cursor += 1;
  }
  return cursor;
}
4794
/**
 * Reads a double-quoted field whose opening quote sits at `startIndex`.
 *
 * Inside the quotes, `\"` and `\\` produce the escaped character and `""`
 * produces a single quote. A lone `"` closes the field.
 *
 * @param value - The string being parsed
 * @param startIndex - Index of the opening quote
 * @returns `{ field, nextIndex }` where `nextIndex` is just past the closing
 *   quote
 * @throws Via `invariant` when the closing quote is never found
 */
function parseQuotedField(value, startIndex) {
  const pieces = [];
  let cursor = startIndex + 1;
  let closed = false;
  while (cursor < value.length && !closed) {
    const current = value[cursor];
    const hasNext = cursor + 1 < value.length;
    const next = hasNext ? value[cursor + 1] : void 0;
    if (current === "\\" && hasNext && (next === "\"" || next === "\\")) {
      // Backslash escape: keep only the escaped character.
      pieces.push(next);
      cursor += 2;
    } else if (current === "\"" && hasNext && next === "\"") {
      // Doubled quote: emit one literal quote.
      pieces.push("\"");
      cursor += 2;
    } else if (current === "\"") {
      cursor += 1;
      closed = true;
    } else {
      pieces.push(current);
      cursor += 1;
    }
  }
  invariant(closed, "Unterminated quoted field in contains assertion value");
  return {
    field: pieces.join(""),
    nextIndex: cursor
  };
}
4824
/**
 * Reads an unquoted field that runs until the next comma or the end of input.
 *
 * @param value - The string being parsed
 * @param startIndex - Position where the field begins
 * @returns `{ field, nextIndex }` where `field` is trimmed and `nextIndex`
 *   points at the terminating comma (or the end of the scan)
 */
function parseUnquotedField(value, startIndex) {
  const commaAt = value.indexOf(",", startIndex);
  // Without a comma the scan stops at the end of the string, or stays put if
  // it started beyond the end.
  const endIndex = commaAt === -1 ? Math.max(startIndex, value.length) : commaAt;
  const rawField = value.substring(startIndex, endIndex);
  return {
    field: rawField.trim(),
    nextIndex: endIndex
  };
}
4835
+ /**
4836
+ * Split a contains-any string into fields while preserving quoted commas.
4837
+ */
4256
4838
  function parseCommaSeparatedValues(value) {
4257
4839
  const results = [];
4258
4840
  let i = 0;
4259
4841
  while (i < value.length) {
4260
- while (i < value.length && /\s/.test(value[i])) i++;
4842
+ i = skipWhitespaceAndCommas(value, i);
4261
4843
  if (i >= value.length) break;
4262
- if (value[i] === ",") {
4263
- i++;
4264
- continue;
4265
- }
4266
- if (value[i] === "\"") {
4267
- i++;
4268
- let field = "";
4269
- while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
4270
- field += value[i + 1];
4271
- i += 2;
4272
- } else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
4273
- field += "\"";
4274
- i += 2;
4275
- } else if (value[i] === "\"") {
4276
- i++;
4277
- break;
4278
- } else {
4279
- field += value[i];
4280
- i++;
4281
- }
4282
- results.push(field);
4283
- } else {
4284
- const start = i;
4285
- while (i < value.length && value[i] !== ",") i++;
4286
- results.push(value.substring(start, i).trim());
4287
- }
4844
+ const isQuotedField = value[i] === "\"";
4845
+ const parsed = isQuotedField ? parseQuotedField(value, i) : parseUnquotedField(value, i);
4846
+ results.push(parsed.field);
4847
+ i = isQuotedField ? skipWhitespace(value, parsed.nextIndex) : parsed.nextIndex;
4848
+ invariant(!isQuotedField || i >= value.length || value[i] === ",", "Expected comma after quoted field in contains assertion value");
4288
4849
  }
4289
4850
  return results;
4290
4851
  }
@@ -4689,6 +5250,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
4689
5250
  };
4690
5251
  //#endregion
4691
5252
  //#region src/assertions/html.ts
5253
/**
 * Patterns that detect a literal opening `<html`, `<head` or `<body` tag.
 * The lookahead requires whitespace, `>` or `/` right after the tag name, so
 * longer names such as `<header` do not match. The patterns are lower-case
 * only; callers are expected to lower-case the input first.
 */
const LITERAL_WRAPPER_PATTERNS = {
  "html": /<html(?=[\s>/])/,
  "head": /<head(?=[\s>/])/,
  "body": /<body(?=[\s>/])/
};
5258
/**
 * Tells whether a tag name is one of the document wrapper elements.
 *
 * @param tagName - Tag name to check (compared exactly, case-sensitively)
 * @returns true for "html", "head" or "body"
 */
function isWrapperTagName(tagName) {
  return ["html", "head", "body"].includes(tagName);
}
5261
/**
 * Tells whether a parsed node is a text node.
 *
 * @param node - Node to inspect
 * @returns true when the node's `nodeName` is "#text"
 */
function isTextNode(node) {
  const { nodeName } = node;
  return nodeName === "#text";
}
5264
/**
 * Tells whether a parsed node is an element, i.e. carries a `tagName`
 * property (own or inherited).
 *
 * @param node - Node to inspect
 * @returns true when `tagName` is present on the node
 */
function isElementNode(node) {
  const hasTagName = "tagName" in node;
  return hasTagName;
}
5267
/**
 * Tells whether an element carries source location information.
 *
 * @param element - Element to inspect
 * @returns true when `sourceCodeLocation` is present and neither null nor
 *   undefined
 */
function hasSourceCodeLocation(element) {
  if (!("sourceCodeLocation" in element)) return false;
  const location = element.sourceCodeLocation;
  return location !== null && location !== void 0;
}
5270
/**
 * Returns a node's children.
 *
 * @param node - Node to inspect
 * @returns The node's `childNodes` value when the property exists, otherwise
 *   a new empty array
 */
function getChildNodes(node) {
  if ("childNodes" in node) {
    return node.childNodes;
  }
  return [];
}
5273
/**
 * Finds the first element, in depth-first document order, that satisfies
 * `predicate`. The search starts at `root`, which is tested too.
 *
 * @param root - Node to start from
 * @param predicate - Called with each element node; non-elements are skipped
 * @returns The matching element, or undefined when nothing matches
 */
function findFirstElement(root, predicate) {
  const pending = [root];
  while (pending.length > 0) {
    const node = pending.pop();
    if (isElementNode(node) && predicate(node)) {
      return node;
    }
    // Push children last-to-first so the first child is popped next.
    const children = getChildNodes(node);
    let index = children.length;
    while (index > 0) {
      index -= 1;
      pending.push(children[index]);
    }
  }
  return void 0;
}
5282
/**
 * Tells whether any direct child of `parentNode` is a text node containing
 * something other than whitespace.
 *
 * @param parentNode - Node whose `childNodes` are inspected
 * @returns true when such a text child exists
 */
function hasTopLevelText(parentNode) {
  const isVisibleText = (child) => {
    if (!isTextNode(child)) return false;
    return Boolean(child.value.trim());
  };
  return parentNode.childNodes.some(isVisibleText);
}
5285
/**
 * Decides whether a parsed element counts as recognised HTML from the input.
 *
 * Wrapper elements (html/head/body) count only when the lower-cased input
 * contains a literal opening tag for them and the element has source location
 * info. Any other element counts when its tag is in `VALID_HTML_ELEMENTS` or
 * its name contains a hyphen.
 *
 * @param element - Parsed element to classify
 * @param inputLowercase - The lower-cased input string
 * @returns true when the element qualifies
 */
function isUserProvidedElement(element, inputLowercase) {
  const tagName = element.tagName.toLowerCase();
  if (!isWrapperTagName(tagName)) {
    return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
  }
  const literalTagPresent = LITERAL_WRAPPER_PATTERNS[tagName].test(inputLowercase);
  if (!literalTagPresent) return false;
  return hasSourceCodeLocation(element);
}
4692
5290
  const HTML_PATTERNS = {
4693
5291
  openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
4694
5292
  closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
@@ -4844,37 +5442,21 @@ function validateHtml(htmlString) {
4844
5442
  isValid: false,
4845
5443
  reason: "Output appears to be XML, not HTML"
4846
5444
  };
4847
- try {
4848
- const { document } = new JSDOM(trimmed, { contentType: "text/html" }).window;
4849
- if (document.body && !trimmed.toLowerCase().includes("<body")) {
4850
- if (Array.from(document.body.childNodes).some((node) => node.nodeType === 3 && node.textContent?.trim())) return {
4851
- isValid: false,
4852
- reason: "Output must be wrapped in HTML tags"
4853
- };
4854
- }
4855
- const allElements = document.querySelectorAll("*");
4856
- if (!Array.from(allElements).find((element) => {
4857
- const tagName = element.tagName.toLowerCase();
4858
- if ([
4859
- "html",
4860
- "head",
4861
- "body"
4862
- ].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
4863
- return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
4864
- })) return {
4865
- isValid: false,
4866
- reason: "Output does not contain recognized HTML elements"
4867
- };
4868
- return {
4869
- isValid: true,
4870
- reason: "Output is valid HTML"
4871
- };
4872
- } catch (error) {
4873
- return {
4874
- isValid: false,
4875
- reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
4876
- };
4877
- }
5445
+ const document = parse$2(trimmed, { sourceCodeLocationInfo: true });
5446
+ const inputLowercase = trimmed.toLowerCase();
5447
+ const body = findFirstElement(document, (element) => element.tagName === "body");
5448
+ if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
5449
+ isValid: false,
5450
+ reason: "Output must be wrapped in HTML tags"
5451
+ };
5452
+ if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
5453
+ isValid: false,
5454
+ reason: "Output does not contain recognized HTML elements"
5455
+ };
5456
+ return {
5457
+ isValid: true,
5458
+ reason: "Output is valid HTML"
5459
+ };
4878
5460
  }
4879
5461
  const handleContainsHtml = ({ assertion, outputString, inverse }) => {
4880
5462
  const pass = containsHtml(outputString) !== inverse;
@@ -5937,11 +6519,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
5937
6519
  const rougeMethod = rouge[baseType[baseType.length - 1]];
5938
6520
  const score = rougeMethod(outputString, renderedValue, {});
5939
6521
  const threshold = assertion.threshold ?? .75;
5940
- const pass = score >= threshold != inverse;
5941
6522
  return {
5942
- pass,
6523
+ pass: score >= threshold !== inverse,
5943
6524
  score: inverse ? 1 - score : score,
5944
- reason: pass ? `${baseType.toUpperCase()} score ${score.toFixed(2)} is greater than or equal to threshold ${threshold}` : `${baseType.toUpperCase()} score ${score.toFixed(2)} is less than threshold ${threshold}`,
6525
+ reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
5945
6526
  assertion
5946
6527
  };
5947
6528
  }
@@ -6003,6 +6584,192 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
6003
6584
  }
6004
6585
  };
6005
6586
  //#endregion
6587
+ //#region src/providers/webSearchUtils.ts
6588
+ function hasTool(provider, predicate) {
6589
+ return Array.isArray(provider.config?.tools) && provider.config.tools.some(predicate);
6590
+ }
6591
+ function getProviderId(provider) {
6592
+ if (typeof provider.id !== "function") return null;
6593
+ try {
6594
+ return provider.id();
6595
+ } catch (err) {
6596
+ logger.debug(`Failed to read provider id: ${err}`);
6597
+ return null;
6598
+ }
6599
+ }
6600
+ function isOpenAiResponsesProvider(provider, id) {
6601
+ return id.includes("openai:responses") || provider.constructor?.name === "OpenAiResponsesProvider";
6602
+ }
6603
+ /**
6604
+ * Check if a provider has web search capabilities
6605
+ * @param provider The provider to check
6606
+ * @returns true if the provider supports web search
6607
+ */
6608
+ function hasWebSearchCapability(provider) {
6609
+ if (!provider) return false;
6610
+ const id = getProviderId(provider);
6611
+ if (!id) return false;
6612
+ if (id.includes("perplexity")) return true;
6613
+ if ((id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0)) return true;
6614
+ if (id.includes("xai") && provider.config?.search_parameters?.mode === "on") return true;
6615
+ if (isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview")) return true;
6616
+ if (id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true)) return true;
6617
+ if (id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")) return true;
6618
+ return false;
6619
+ }
6620
+ /**
6621
+ * Load a provider with web search capabilities.
6622
+ * Tries multiple providers in order of preference until one succeeds.
6623
+ * Uses the latest and most capable models from each provider with specific checkpoint IDs.
6624
+ *
6625
+ * @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
6626
+ * @returns A provider with web search capabilities or null
6627
+ */
6628
+ async function loadWebSearchProvider(preferAnthropic = false) {
6629
+ const loadAnthropicWebSearch = async () => {
6630
+ try {
6631
+ return await loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
6632
+ type: "web_search_20250305",
6633
+ name: "web_search",
6634
+ max_uses: 5
6635
+ }] } } });
6636
+ } catch (err) {
6637
+ logger.debug(`Failed to load Anthropic web search provider: ${err}`);
6638
+ return null;
6639
+ }
6640
+ };
6641
+ const loadOpenAIWebSearch = async () => {
6642
+ try {
6643
+ return await loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } });
6644
+ } catch (err) {
6645
+ logger.debug(`Failed to load OpenAI web search provider: ${err}`);
6646
+ return null;
6647
+ }
6648
+ };
6649
+ const loadPerplexity = async () => {
6650
+ try {
6651
+ return await loadApiProvider("perplexity:sonar-pro");
6652
+ } catch (err) {
6653
+ logger.debug(`Failed to load Perplexity provider: ${err}`);
6654
+ return null;
6655
+ }
6656
+ };
6657
+ const loadGoogleWebSearch = async () => {
6658
+ try {
6659
+ return await loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
6660
+ } catch (err) {
6661
+ logger.debug(`Failed to load Google web search provider: ${err}`);
6662
+ return null;
6663
+ }
6664
+ };
6665
+ const loadVertexWebSearch = async () => {
6666
+ try {
6667
+ return await loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
6668
+ } catch (err) {
6669
+ logger.debug(`Failed to load Vertex web search provider: ${err}`);
6670
+ return null;
6671
+ }
6672
+ };
6673
+ const loadXaiWebSearch = async () => {
6674
+ try {
6675
+ return await loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } });
6676
+ } catch (err) {
6677
+ logger.debug(`Failed to load xAI web search provider: ${err}`);
6678
+ return null;
6679
+ }
6680
+ };
6681
+ const providers = preferAnthropic ? [
6682
+ loadAnthropicWebSearch,
6683
+ loadOpenAIWebSearch,
6684
+ loadPerplexity,
6685
+ loadGoogleWebSearch,
6686
+ loadVertexWebSearch,
6687
+ loadXaiWebSearch
6688
+ ] : [
6689
+ loadOpenAIWebSearch,
6690
+ loadAnthropicWebSearch,
6691
+ loadPerplexity,
6692
+ loadGoogleWebSearch,
6693
+ loadVertexWebSearch,
6694
+ loadXaiWebSearch
6695
+ ];
6696
+ for (const getProvider of providers) {
6697
+ const provider = await getProvider();
6698
+ if (provider && hasWebSearchCapability(provider)) {
6699
+ logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
6700
+ return provider;
6701
+ }
6702
+ if (provider) logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
6703
+ }
6704
+ return null;
6705
+ }
6706
+ //#endregion
6707
+ //#region src/matchers/search.ts
6708
+ async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
6709
+ if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
6710
+ const defaultProviders = await getDefaultProviders();
6711
+ const defaultSearchProviders = [
6712
+ defaultProviders.webSearchProvider,
6713
+ defaultProviders.llmRubricProvider,
6714
+ defaultProviders.gradingProvider
6715
+ ];
6716
+ let searchProvider = (grading.provider ? await getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
6717
+ if (!hasWebSearchCapability(searchProvider)) {
6718
+ const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
6719
+ if (webSearchDefault) searchProvider = webSearchDefault;
6720
+ }
6721
+ if (!hasWebSearchCapability(searchProvider)) {
6722
+ const webSearchProvider = await loadWebSearchProvider(true);
6723
+ if (webSearchProvider) searchProvider = webSearchProvider;
6724
+ }
6725
+ if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);
6726
+ const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, DEFAULT_WEB_SEARCH_PROMPT), {
6727
+ output: tryParse(llmOutput),
6728
+ rubric,
6729
+ ...vars || {}
6730
+ });
6731
+ const resp = await callProviderWithContext(searchProvider, prompt, "search-rubric", {
6732
+ output: tryParse(llmOutput),
6733
+ rubric,
6734
+ ...vars || {}
6735
+ }, providerCallContext);
6736
+ if (resp.error || !resp.output) return {
6737
+ pass: false,
6738
+ score: 0,
6739
+ reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
6740
+ tokensUsed: resp.tokenUsage,
6741
+ assertion
6742
+ };
6743
+ try {
6744
+ const result = extractFirstJsonObject(String(resp.output));
6745
+ let pass = result.pass ?? false;
6746
+ const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
6747
+ if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
6748
+ return {
6749
+ pass,
6750
+ score,
6751
+ reason: result.reason || "No reason provided",
6752
+ tokensUsed: resp.tokenUsage,
6753
+ assertion,
6754
+ metadata: {
6755
+ searchResults: result.searchResults || [],
6756
+ searchProvider: searchProvider.id()
6757
+ }
6758
+ };
6759
+ } catch (err) {
6760
+ logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${err.message}`);
6761
+ const outputLower = String(resp.output).toLowerCase();
6762
+ const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
6763
+ return {
6764
+ pass,
6765
+ score: pass ? 1 : 0,
6766
+ reason: resp.output,
6767
+ tokensUsed: resp.tokenUsage,
6768
+ assertion
6769
+ };
6770
+ }
6771
+ }
6772
+ //#endregion
6006
6773
  //#region src/assertions/searchRubric.ts
6007
6774
  async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
6008
6775
  if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
@@ -7088,7 +7855,7 @@ const ASSERTION_HANDLERS = {
7088
7855
  "llm-rubric": handleLlmRubric,
7089
7856
  meteor: async (params) => {
7090
7857
  try {
7091
- const { handleMeteorAssertion } = await import("../meteor-CU5UAE-H.js");
7858
+ const { handleMeteorAssertion } = await import("../meteor-Wc_aUVvu.js");
7092
7859
  return handleMeteorAssertion(params);
7093
7860
  } catch (error) {
7094
7861
  if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
@@ -7224,7 +7991,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
7224
7991
  };
7225
7992
  }
7226
7993
  else if (filePath.endsWith(".rb")) try {
7227
- const { runRuby } = await import("../rubyUtils-CYSQEG4a.js").then((n) => n.t);
7994
+ const { runRuby } = await import("../rubyUtils-DGnoCYL2.js").then((n) => n.t);
7228
7995
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
7229
7996
  logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
7230
7997
  } catch (error) {
@@ -9005,7 +9772,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
9005
9772
  const defaultProvider = defaultTest.provider;
9006
9773
  if (isApiProvider(defaultProvider)) return defaultProvider;
9007
9774
  if (typeof defaultProvider === "object" && defaultProvider.id) {
9008
- const { loadApiProvider } = await import("../providers-Bp4S-FvO.js");
9775
+ const { loadApiProvider } = await import("../providers-BuyzKt7C.js");
9009
9776
  return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
9010
9777
  }
9011
9778
  return defaultProvider;
@@ -9165,7 +9932,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
9165
9932
  async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
9166
9933
  if (!state.resume || !evalRecord.persisted) return;
9167
9934
  try {
9168
- const { default: EvalResult } = await import("../evalResult-DElBuddX.js");
9935
+ const { default: EvalResult } = await import("../evalResult-spPqh1G_.js");
9169
9936
  const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: state.retryMode });
9170
9937
  const originalCount = runEvalOptions.length;
9171
9938
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -9624,9 +10391,8 @@ var Evaluator = class {
9624
10391
  context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
9625
10392
  }
9626
10393
  async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
9627
- let flushGroupedRows;
9628
10394
  try {
9629
- if (shouldGroupGradingByProvider) flushGroupedRows = await this.runGroupedEvalSteps({
10395
+ if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
9630
10396
  checkAbort,
9631
10397
  evalStepIndexMap,
9632
10398
  groupedRunEvalOptions,
@@ -9658,7 +10424,6 @@ var Evaluator = class {
9658
10424
  cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
9659
10425
  throw err;
9660
10426
  }
9661
- await flushGroupedRows?.();
9662
10427
  if (isEvalTimedOut()) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
9663
10428
  else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
9664
10429
  ciProgressReporter,
@@ -10369,47 +11134,11 @@ function filterPrompts(prompts, filterPromptsOption) {
10369
11134
  //#endregion
10370
11135
  //#region src/commands/eval/filterProviders.ts
10371
11136
  /**
10372
- * Checks if a value is a valid provider ID (non-empty string).
10373
- */
10374
- function isValidProviderId(id) {
10375
- return id !== null && id !== void 0 && typeof id === "string" && id !== "";
10376
- }
10377
- /**
10378
11137
  * Extracts the id and label from a raw provider config without instantiating it.
10379
11138
  * Handles all provider config formats: string, function, ProviderOptions, ProviderOptionsMap.
10380
11139
  */
10381
11140
  function getProviderIdAndLabel(provider, index) {
10382
- if (typeof provider === "string") return { id: provider };
10383
- if (typeof provider === "function") {
10384
- const label = provider.label;
10385
- return {
10386
- id: label ?? `custom-function-${index}`,
10387
- label
10388
- };
10389
- }
10390
- const providerId = provider.id;
10391
- if ("id" in provider && isValidProviderId(providerId)) return {
10392
- id: providerId,
10393
- label: provider.label
10394
- };
10395
- const keys = Object.keys(provider);
10396
- if (keys.length > 0) {
10397
- const id = keys[0];
10398
- const value = provider[id];
10399
- if (typeof value === "object" && value !== null) return {
10400
- id: value.id || id,
10401
- label: value.label
10402
- };
10403
- }
10404
- const label = provider.label;
10405
- if (isValidProviderId(label)) return {
10406
- id: label,
10407
- label
10408
- };
10409
- return {
10410
- id: `unknown-${index}`,
10411
- label
10412
- };
11141
+ return normalizeProviderRef(provider, { index });
10413
11142
  }
10414
11143
  /**
10415
11144
  * Filters raw provider configs BEFORE instantiation.
@@ -13048,9 +13777,10 @@ function dedupeTestCases(testCases) {
13048
13777
  return deduped;
13049
13778
  }
13050
13779
  function buildMaxCharsRetryInstructions(rejectedPromptLengths, limit) {
13780
+ const longestRejectedPromptText = rejectedPromptLengths.length > 0 ? `${Math.max(...rejectedPromptLengths)} characters` : "unknown length";
13051
13781
  return dedent`
13052
13782
  Your previous response included ${rejectedPromptLengths.length} generated prompt${rejectedPromptLengths.length === 1 ? "" : "s"} that exceeded the ${limit ?? "configured"}-character limit.
13053
- The longest rejected prompt was ${Math.max(...rejectedPromptLengths)} characters.
13783
+ The longest rejected prompt was ${longestRejectedPromptText}.
13054
13784
  Generate replacement prompts only, and keep every user message within the character limit.
13055
13785
  `.trim();
13056
13786
  }
@@ -14666,7 +15396,7 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
14666
15396
  for (const row of evaluateTable.body.slice(0, maxRows)) table.push([...row.vars.map((v) => ellipsize(v, tableCellMaxLength)), ...row.outputs.map(({ pass, text, failureReason: failureType }) => {
14667
15397
  text = ellipsize(text, tableCellMaxLength);
14668
15398
  if (pass) return chalk.green("[PASS] ") + text;
14669
- else return chalk.red(failureType === ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.red.bold(c) : c).join("---");
15399
+ return chalk.red(failureType === ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.red.bold(c) : c).join("---");
14670
15400
  })]);
14671
15401
  return table.toString();
14672
15402
  }
@@ -14757,6 +15487,115 @@ function formatDuration(seconds) {
14757
15487
  }
14758
15488
  //#endregion
14759
15489
  //#region src/commands/eval/summary.ts
15490
+ function getCompletionMessage({ completionType, evalId, shareableUrl, wasAborted, writeToDatabase, activelySharing }) {
15491
+ if (wasAborted) {
15492
+ const idSuffix = writeToDatabase ? ` (ID: ${chalk.cyan(evalId)})` : "";
15493
+ return `${chalk.red("✗")} ${completionType} aborted${idSuffix}`;
15494
+ }
15495
+ if (writeToDatabase && shareableUrl) return `${chalk.green("✓")} ${completionType} complete: ${shareableUrl}`;
15496
+ if (writeToDatabase && activelySharing) return `${chalk.green("✓")} ${completionType} complete`;
15497
+ if (writeToDatabase) return `${chalk.green("✓")} ${completionType} complete (ID: ${chalk.cyan(evalId)})`;
15498
+ return `${chalk.green("✓")} ${completionType} complete`;
15499
+ }
15500
+ function getAbortSummaryLines(targetErrorStatus) {
15501
+ if (targetErrorStatus == null) return [];
15502
+ return [
15503
+ "",
15504
+ chalk.red.bold("Scan stopped: Target is unavailable and will not recover on retry."),
15505
+ chalk.red(` Target returned HTTP ${targetErrorStatus}`),
15506
+ "",
15507
+ chalk.yellow("Possible causes:"),
15508
+ chalk.yellow(" • Invalid API key or authentication (401/403)"),
15509
+ chalk.yellow(" • Target endpoint does not exist (404)"),
15510
+ chalk.yellow(" • Server does not support the request (501)"),
15511
+ "",
15512
+ chalk.cyan("To fix: Check your target configuration and credentials.")
15513
+ ];
15514
+ }
15515
+ function getGuidanceLines({ writeToDatabase, shareableUrl, wantsToShare, activelySharing, hasExplicitDisable, cloudEnabled }) {
15516
+ if (!writeToDatabase || shareableUrl || wantsToShare || activelySharing) return [];
15517
+ const lines = ["", `» View results: ${chalk.green.bold("promptfoo view")}`];
15518
+ if (!hasExplicitDisable) lines.push(cloudEnabled ? `» Create shareable URL: ${chalk.green.bold("promptfoo share")}` : `» Share with your team: ${chalk.green.bold("https://promptfoo.app")}`);
15519
+ lines.push(`» Feedback: ${chalk.green.bold("https://promptfoo.dev/feedback")}`);
15520
+ return lines;
15521
+ }
15522
+ function buildUsageDetails(usage, total) {
15523
+ const parts = [];
15524
+ if (usage.prompt && usage.prompt > 0) parts.push(`${usage.prompt.toLocaleString()} prompt`);
15525
+ if (usage.completion && usage.completion > 0) parts.push(`${usage.completion.toLocaleString()} completion`);
15526
+ if (usage.cached && usage.cached > 0) parts.push(usage.cached === total && parts.length === 0 ? "cached" : `${usage.cached.toLocaleString()} cached`);
15527
+ if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) parts.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
15528
+ return parts;
15529
+ }
15530
+ function getTokenUsageLines(tokenUsage, isRedteam, tracker) {
15531
+ const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
15532
+ const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
15533
+ if (!hasEvalTokens && !hasGradingTokens) return [];
15534
+ const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
15535
+ const evalTokens = {
15536
+ prompt: tokenUsage.prompt || 0,
15537
+ completion: tokenUsage.completion || 0,
15538
+ total: tokenUsage.total || combinedTotal,
15539
+ cached: tokenUsage.cached || 0,
15540
+ numRequests: tokenUsage.numRequests || 0,
15541
+ completionDetails: tokenUsage.completionDetails || {
15542
+ reasoning: 0,
15543
+ acceptedPrediction: 0,
15544
+ rejectedPrediction: 0
15545
+ }
15546
+ };
15547
+ const lines = [`${chalk.bold("Total Tokens:")} ${chalk.white.bold((evalTokens.total + (tokenUsage.assertions?.total || 0)).toLocaleString())}`];
15548
+ if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.gray("Probes:")} ${chalk.white(tokenUsage.numRequests.toLocaleString())}`);
15549
+ if (evalTokens.total > 0) {
15550
+ const evalParts = buildUsageDetails(evalTokens, evalTokens.total);
15551
+ lines.push(` ${chalk.gray("Eval:")} ${chalk.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
15552
+ }
15553
+ if (tokenUsage.assertions?.total && tokenUsage.assertions.total > 0) {
15554
+ const gradingParts = buildUsageDetails(tokenUsage.assertions, tokenUsage.assertions.total);
15555
+ lines.push(` ${chalk.gray("Grading:")} ${chalk.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
15556
+ }
15557
+ lines.push(...getProviderUsageLines(tracker));
15558
+ return lines;
15559
+ }
15560
+ function getProviderUsageLines(tracker) {
15561
+ const providerIds = tracker.getProviderIds();
15562
+ if (providerIds.length <= 1) return [];
15563
+ const sortedProviders = providerIds.map((id) => ({
15564
+ id,
15565
+ usage: tracker.getProviderUsage(id)
15566
+ })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
15567
+ const lines = ["", chalk.bold("Providers:")];
15568
+ for (const { id, usage } of sortedProviders) {
15569
+ if ((usage.total || 0) === 0 && (usage.prompt || 0) + (usage.completion || 0) === 0) continue;
15570
+ const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
15571
+ const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
15572
+ const details = buildUsageDetails(usage, displayTotal);
15573
+ const requestInfo = `${usage.numRequests || 0} requests`;
15574
+ const separator = details.length > 0 ? "; " : "";
15575
+ lines.push(` ${chalk.gray(`${displayId}:`)} ${chalk.white(displayTotal.toLocaleString())} (${requestInfo}${separator}${details.join(", ")})`);
15576
+ }
15577
+ return lines;
15578
+ }
15579
+ function formatResultPercentage(count, totalTests) {
15580
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
15581
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
15582
+ }
15583
+ function formatResultLine(count, label, icon, iconColor, totalTests) {
15584
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count, totalTests)})`)}`;
15585
+ }
15586
+ function getResultsLines({ successes, failures, errors, duration, maxConcurrency }) {
15587
+ const totalTests = successes + failures + errors;
15588
+ const errorLabel = errors === 1 ? "error" : "errors";
15589
+ return [
15590
+ "",
15591
+ chalk.bold("Results:"),
15592
+ formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green, totalTests),
15593
+ formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red, totalTests),
15594
+ formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red, totalTests),
15595
+ chalk.gray(`Duration: ${formatDuration(duration)} (concurrency: ${maxConcurrency})`),
15596
+ ""
15597
+ ];
15598
+ }
14760
15599
  /**
14761
15600
  * Generate formatted evaluation summary output for CLI display.
14762
15601
  *
@@ -14795,115 +15634,28 @@ function formatDuration(seconds) {
14795
15634
  * ```
14796
15635
  */
14797
15636
  function generateEvalSummary(params) {
14798
- const { evalId, isRedteam, writeToDatabase, shareableUrl, wantsToShare, hasExplicitDisable, cloudEnabled, activelySharing = false, tokenUsage, successes, failures, errors, duration, maxConcurrency, tracker, targetErrorStatus } = params;
14799
- const lines = [];
14800
- const completionType = isRedteam ? "Red team" : "Eval";
14801
- const wasAborted = targetErrorStatus != null;
14802
- let completionMessage;
14803
- if (wasAborted) {
14804
- completionMessage = `${chalk.red("✗")} ${completionType} aborted`;
14805
- if (writeToDatabase) completionMessage += ` (ID: ${chalk.cyan(evalId)})`;
14806
- } else if (writeToDatabase && shareableUrl) completionMessage = `${chalk.green("✓")} ${completionType} complete: ${shareableUrl}`;
14807
- else if (writeToDatabase && activelySharing) completionMessage = `${chalk.green("✓")} ${completionType} complete`;
14808
- else if (writeToDatabase) completionMessage = `${chalk.green("✓")} ${completionType} complete (ID: ${chalk.cyan(evalId)})`;
14809
- else completionMessage = `${chalk.green("✓")} ${completionType} complete`;
14810
- lines.push(completionMessage);
14811
- if (wasAborted && targetErrorStatus != null) {
14812
- lines.push("");
14813
- lines.push(chalk.red.bold("Scan stopped: Target is unavailable and will not recover on retry."));
14814
- lines.push(chalk.red(` Target returned HTTP ${targetErrorStatus}`));
14815
- lines.push("");
14816
- lines.push(chalk.yellow("Possible causes:"));
14817
- lines.push(chalk.yellow(" • Invalid API key or authentication (401/403)"));
14818
- lines.push(chalk.yellow(" • Target endpoint does not exist (404)"));
14819
- lines.push(chalk.yellow(" • Server does not support the request (501)"));
14820
- lines.push("");
14821
- lines.push(chalk.cyan("To fix: Check your target configuration and credentials."));
14822
- }
14823
- if (writeToDatabase && !shareableUrl && !wantsToShare && !activelySharing) {
14824
- lines.push("");
14825
- lines.push(`» View results: ${chalk.green.bold("promptfoo view")}`);
14826
- if (!hasExplicitDisable) if (cloudEnabled) lines.push(`» Create shareable URL: ${chalk.green.bold("promptfoo share")}`);
14827
- else lines.push(`» Share with your team: ${chalk.green.bold("https://promptfoo.app")}`);
14828
- lines.push(`» Feedback: ${chalk.green.bold("https://promptfoo.dev/feedback")}`);
14829
- }
14830
- lines.push("");
14831
- const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
14832
- const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
14833
- if (hasEvalTokens || hasGradingTokens) {
14834
- const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
14835
- const evalTokens = {
14836
- prompt: tokenUsage.prompt || 0,
14837
- completion: tokenUsage.completion || 0,
14838
- total: tokenUsage.total || combinedTotal,
14839
- cached: tokenUsage.cached || 0,
14840
- completionDetails: tokenUsage.completionDetails || {
14841
- reasoning: 0,
14842
- acceptedPrediction: 0,
14843
- rejectedPrediction: 0
14844
- }
14845
- };
14846
- const grandTotal = evalTokens.total + (tokenUsage.assertions?.total || 0);
14847
- lines.push(`${chalk.bold("Total Tokens:")} ${chalk.white.bold(grandTotal.toLocaleString())}`);
14848
- if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.gray("Probes:")} ${chalk.white(tokenUsage.numRequests.toLocaleString())}`);
14849
- if (evalTokens.total > 0) {
14850
- const evalParts = [];
14851
- if (evalTokens.prompt > 0) evalParts.push(`${evalTokens.prompt.toLocaleString()} prompt`);
14852
- if (evalTokens.completion > 0) evalParts.push(`${evalTokens.completion.toLocaleString()} completion`);
14853
- if (evalTokens.cached > 0) if (evalTokens.cached === evalTokens.total && evalParts.length === 0) evalParts.push("cached");
14854
- else evalParts.push(`${evalTokens.cached.toLocaleString()} cached`);
14855
- if (evalTokens.completionDetails?.reasoning && evalTokens.completionDetails.reasoning > 0) evalParts.push(`${evalTokens.completionDetails.reasoning.toLocaleString()} reasoning`);
14856
- lines.push(` ${chalk.gray("Eval:")} ${chalk.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
14857
- }
14858
- if (tokenUsage.assertions && tokenUsage.assertions.total && tokenUsage.assertions.total > 0) {
14859
- const gradingParts = [];
14860
- if (tokenUsage.assertions.prompt && tokenUsage.assertions.prompt > 0) gradingParts.push(`${tokenUsage.assertions.prompt.toLocaleString()} prompt`);
14861
- if (tokenUsage.assertions.completion && tokenUsage.assertions.completion > 0) gradingParts.push(`${tokenUsage.assertions.completion.toLocaleString()} completion`);
14862
- if (tokenUsage.assertions.cached && tokenUsage.assertions.cached > 0) if (tokenUsage.assertions.cached === tokenUsage.assertions.total && gradingParts.length === 0) gradingParts.push("cached");
14863
- else gradingParts.push(`${tokenUsage.assertions.cached.toLocaleString()} cached`);
14864
- if (tokenUsage.assertions.completionDetails?.reasoning && tokenUsage.assertions.completionDetails.reasoning > 0) gradingParts.push(`${tokenUsage.assertions.completionDetails.reasoning.toLocaleString()} reasoning`);
14865
- lines.push(` ${chalk.gray("Grading:")} ${chalk.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
14866
- }
14867
- const providerIds = tracker.getProviderIds();
14868
- if (providerIds.length > 1) {
14869
- lines.push("");
14870
- lines.push(chalk.bold("Providers:"));
14871
- const sortedProviders = providerIds.map((id) => ({
14872
- id,
14873
- usage: tracker.getProviderUsage(id)
14874
- })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
14875
- for (const { id, usage } of sortedProviders) if ((usage.total || 0) > 0 || (usage.prompt || 0) + (usage.completion || 0) > 0) {
14876
- const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
14877
- const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
14878
- const details = [];
14879
- if (usage.prompt && usage.prompt > 0) details.push(`${usage.prompt.toLocaleString()} prompt`);
14880
- if (usage.completion && usage.completion > 0) details.push(`${usage.completion.toLocaleString()} completion`);
14881
- if (usage.cached && usage.cached > 0) if (usage.cached === displayTotal && details.length === 0) details.push("cached");
14882
- else details.push(`${usage.cached.toLocaleString()} cached`);
14883
- if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) details.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
14884
- const breakdown = ` (${`${usage.numRequests || 0} requests`}${details.length > 0 ? "; " : ""}${details.join(", ")})`;
14885
- lines.push(` ${chalk.gray(displayId + ":")} ${chalk.white(displayTotal.toLocaleString())}${breakdown}`);
14886
- }
14887
- }
14888
- }
14889
- lines.push("");
14890
- const totalTests = successes + failures + errors;
14891
- const formatResultPercentage = (count) => {
14892
- const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
14893
- return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
14894
- };
14895
- const formatResultLine = (count, label, icon, iconColor) => {
14896
- return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count)})`)}`;
14897
- };
14898
- const errorLabel = errors === 1 ? "error" : "errors";
14899
- lines.push(chalk.bold("Results:"));
14900
- lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green));
14901
- lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red));
14902
- lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red));
14903
- const durationDisplay = formatDuration(duration);
14904
- lines.push(chalk.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
14905
- lines.push("");
14906
- return lines;
15637
+ return [
15638
+ getCompletionMessage({
15639
+ completionType: params.isRedteam ? "Red team" : "Eval",
15640
+ evalId: params.evalId,
15641
+ shareableUrl: params.shareableUrl,
15642
+ wasAborted: params.targetErrorStatus != null,
15643
+ writeToDatabase: params.writeToDatabase,
15644
+ activelySharing: params.activelySharing ?? false
15645
+ }),
15646
+ ...getAbortSummaryLines(params.targetErrorStatus),
15647
+ ...getGuidanceLines({
15648
+ writeToDatabase: params.writeToDatabase,
15649
+ shareableUrl: params.shareableUrl,
15650
+ wantsToShare: params.wantsToShare,
15651
+ activelySharing: params.activelySharing ?? false,
15652
+ hasExplicitDisable: params.hasExplicitDisable,
15653
+ cloudEnabled: params.cloudEnabled
15654
+ }),
15655
+ "",
15656
+ ...getTokenUsageLines(params.tokenUsage, params.isRedteam, params.tracker),
15657
+ ...getResultsLines(params)
15658
+ ];
14907
15659
  }
14908
15660
  //#endregion
14909
15661
  //#region src/commands/retry.ts
@@ -15916,6 +16668,26 @@ async function doRedteamRun(options) {
15916
16668
  }
15917
16669
  //#endregion
15918
16670
  //#region src/index.ts
16671
+ /**
16672
+ * Shallow-clone a test case so the caller can swap in resolved ApiProvider
16673
+ * instances on `options.provider` / `assert[].provider` without leaking those
16674
+ * mutations back to the input. The input may alias the unified config written
16675
+ * to the Eval record, and a live SDK client (e.g. Bedrock's BedrockRuntime,
16676
+ * Anthropic's client) holds circular references that break drizzle's JSON
16677
+ * serialization on `evalRecord.save()`. Fixes #8687.
16678
+ *
16679
+ * Detaches only `options` and `assert[]`. Other reference fields (`provider`,
16680
+ * `vars`, `metadata`, `providerOutput`) remain aliased — callers must reassign
16681
+ * those by reference rather than mutating in place. `assert-set` children are
16682
+ * not deep-cloned because the resolve loop skips `assert-set`; if that ever
16683
+ * changes, extend this helper.
16684
+ */
16685
+ function cloneTestForResolve(test) {
16686
+ const cloned = { ...test };
16687
+ if (test.options) cloned.options = { ...test.options };
16688
+ if (test.assert) cloned.assert = test.assert.map((assertion) => ({ ...assertion }));
16689
+ return cloned;
16690
+ }
15919
16691
  async function evaluate(testSuite, options = {}) {
15920
16692
  if (testSuite.writeLatestResults) await runDbMigrations();
15921
16693
  const loadedProviders = await loadApiProviders(testSuite.providers, { env: testSuite.env });
@@ -15935,22 +16707,24 @@ async function evaluate(testSuite, options = {}) {
15935
16707
  nunjucksFilters: await readFilters(testSuite.nunjucksFilters || {}),
15936
16708
  prompts: await processPrompts(testSuite.prompts)
15937
16709
  };
15938
- if (typeof constructedTestSuite.defaultTest === "object") {
15939
- if (constructedTestSuite.defaultTest?.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
16710
+ if (typeof constructedTestSuite.defaultTest === "object" && constructedTestSuite.defaultTest) {
16711
+ constructedTestSuite.defaultTest = cloneTestForResolve(constructedTestSuite.defaultTest);
16712
+ if (constructedTestSuite.defaultTest.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
15940
16713
  env: testSuite.env,
15941
16714
  basePath: state.basePath
15942
16715
  });
15943
- if (constructedTestSuite.defaultTest?.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
16716
+ if (constructedTestSuite.defaultTest.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
15944
16717
  env: testSuite.env,
15945
16718
  basePath: state.basePath
15946
16719
  });
15947
16720
  }
15948
- for (const test of constructedTestSuite.tests || []) {
16721
+ constructedTestSuite.tests = (constructedTestSuite.tests || []).map(cloneTestForResolve);
16722
+ for (const test of constructedTestSuite.tests) {
15949
16723
  if (test.options?.provider && !isApiProvider(test.options.provider)) test.options.provider = await resolveProvider(test.options.provider, providerMap, {
15950
16724
  env: testSuite.env,
15951
16725
  basePath: state.basePath
15952
16726
  });
15953
- if (test.assert) for (const assertion of test.assert) {
16727
+ for (const assertion of test.assert || []) {
15954
16728
  if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
15955
16729
  if (assertion.provider && !isApiProvider(assertion.provider)) assertion.provider = await resolveProvider(assertion.provider, providerMap, {
15956
16730
  env: testSuite.env,
@@ -17591,6 +18365,38 @@ function normalizeTargetPurposeDiscoveryResult(result) {
17591
18365
  tools: cleanTools(result.tools)
17592
18366
  };
17593
18367
  }
18368
+ function extractStringField(value) {
18369
+ if (typeof value !== "string") return;
18370
+ return value.trim() || void 0;
18371
+ }
18372
+ async function getRemoteResponseErrorDetail(response) {
18373
+ const rawText = (await response.text()).trim();
18374
+ const fallback = rawText || response.statusText || "Unknown error";
18375
+ if (!rawText) return fallback;
18376
+ try {
18377
+ const parsed = JSON.parse(rawText);
18378
+ return extractStringField(parsed?.message) ?? extractStringField(parsed?.error) ?? fallback;
18379
+ } catch {
18380
+ return fallback;
18381
+ }
18382
+ }
18383
+ const REMOTE_ERROR_HINTS = {
18384
+ 400: "This usually means your promptfoo client is out of date. Try `npm install -g promptfoo@latest` and rerun.",
18385
+ 401: "Check that you are logged in (`promptfoo auth login`) and that your account has access to target discovery.",
18386
+ 403: "Check that you are logged in (`promptfoo auth login`) and that your account has access to target discovery.",
18387
+ 404: "This usually means your promptfoo client is out of date. Try `npm install -g promptfoo@latest` and rerun.",
18388
+ 429: "You are being rate limited. Wait a moment and try again."
18389
+ };
18390
+ function getRemoteErrorHint(status) {
18391
+ if (REMOTE_ERROR_HINTS[status]) return REMOTE_ERROR_HINTS[status];
18392
+ if (status >= 500) return "The remote generation service may be temporarily unavailable. Retry in a few minutes or contact support if the issue persists.";
18393
+ }
18394
+ async function buildRemoteErrorFromResponse(response) {
18395
+ const detail = await getRemoteResponseErrorDetail(response);
18396
+ const hint = getRemoteErrorHint(response.status);
18397
+ const base = `Remote server returned HTTP ${response.status}: ${detail}`;
18398
+ return new Error(hint ? `${base}\n${hint}` : base);
18399
+ }
17594
18400
  /**
17595
18401
  * Queries Cloud for the purpose-discovery logic, sends each logic to the target,
17596
18402
  * and summarizes the results.
@@ -17640,11 +18446,7 @@ async function doTargetPurposeDiscovery(target, prompt, showProgress = true) {
17640
18446
  email: getUserEmail()
17641
18447
  }))
17642
18448
  });
17643
- if (!response.ok) {
17644
- const error = await response.text();
17645
- logger.error(`${LOG_PREFIX} Error getting the next question from remote server: ${error}`);
17646
- continue;
17647
- }
18449
+ if (!response.ok) throw await buildRemoteErrorFromResponse(response);
17648
18450
  const responseData = await response.json();
17649
18451
  const data = TargetPurposeDiscoveryTaskResponseSchema.parse(responseData);
17650
18452
  logger.debug(`${LOG_PREFIX} Received response from remote server: ${JSON.stringify(data, null, 2)}`);
@@ -19554,7 +20356,7 @@ router.get("/", async (_req, res) => {
19554
20356
  };
19555
20357
  } catch (error) {
19556
20358
  logger.debug(`Failed to fetch latest version: ${error}`);
19557
- latestVersion = versionCache.latestVersion ?? "0.121.4";
20359
+ latestVersion = versionCache.latestVersion ?? "0.121.5";
19558
20360
  }
19559
20361
  }
19560
20362
  const selfHosted = getEnvBool("PROMPTFOO_SELF_HOSTED");
@@ -19563,7 +20365,7 @@ router.get("/", async (_req, res) => {
19563
20365
  selfHosted,
19564
20366
  isNpx
19565
20367
  });
19566
- const resolvedLatestVersion = latestVersion ?? "0.121.4";
20368
+ const resolvedLatestVersion = latestVersion ?? "0.121.5";
19567
20369
  const response = {
19568
20370
  currentVersion: VERSION,
19569
20371
  latestVersion: resolvedLatestVersion,