promptfoo 0.121.4 → 0.121.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (497)
  1. package/dist/src/{ListApp-DQkFNqE9.js → ListApp-DLmM02JS.js} +1 -1
  2. package/dist/src/{accounts-DhMYUUbu.js → accounts-Ca7WIoPY.js} +12 -7
  3. package/dist/src/{accounts-F9d_5sMC.js → accounts-CjFnOPmb.js} +14 -9
  4. package/dist/src/{accounts-Dy17bs4D.cjs → accounts-CmWzeD2d.cjs} +16 -10
  5. package/dist/src/{accounts-DdJ2pHMI.js → accounts-DanM1wq_.js} +13 -8
  6. package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-CJ0j3fBi.js} +3 -3
  7. package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-DDEGRV9v.js} +3 -3
  8. package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-DvPWSUpb.cjs} +8 -7
  9. package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-TxUEMPYS.js} +2 -2
  10. package/dist/src/{agents-BahDpe5G.cjs → agents-B4sRuXg3.cjs} +7 -6
  11. package/dist/src/{agents-pQeBEXMm.js → agents-B8q7h_ek.js} +5 -5
  12. package/dist/src/{agents-CgaMXvLM.js → agents-CBgJvRkB.js} +21 -10
  13. package/dist/src/{agents-C-R_jfzI.js → agents-CYn2n3QP.js} +4 -4
  14. package/dist/src/{agents-8FDnTriG.js → agents-D-vDNFx4.js} +21 -10
  15. package/dist/src/{agents-aYPQLf8W.js → agents-LrHuQqr1.js} +20 -9
  16. package/dist/src/{agents-DJ35I3Nt.js → agents-QGg76OF-.js} +5 -5
  17. package/dist/src/{agents-D7-HGxUj.cjs → agents-eHZ9nlgA.cjs} +21 -10
  18. package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-CJEbQ0o6.js} +7 -7
  19. package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-D5HXzZ0s.js} +6 -6
  20. package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-T6HGNxNe.cjs} +7 -7
  21. package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-eYv3a_DK.js} +7 -7
  22. package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
  23. package/dist/src/app/assets/Report-BNHJKN35.js +1 -0
  24. package/dist/src/app/assets/index-BnT6P6sF.js +388 -0
  25. package/dist/src/app/assets/index-yhM8y1PP.css +1 -0
  26. package/dist/src/app/assets/{scroll-timeline-D9IT_e8Z.js → scroll-timeline-RpeTwOvs.js} +1 -1
  27. package/dist/src/app/assets/sync-5gq6fmG4.js +4 -0
  28. package/dist/src/app/assets/vendor-charts-BL9OMNU7.js +36 -0
  29. package/dist/src/app/assets/{vendor-markdown-Ch00wnNI.js → vendor-markdown-BYsQqn7Z.js} +10 -10
  30. package/dist/src/app/assets/{vendor-react-CVvmk1UB.js → vendor-react-CqWgVW6T.js} +2 -2
  31. package/dist/src/app/assets/{vendor-utils-BnEYbx2Q.js → vendor-utils-BHPO71pu.js} +1 -1
  32. package/dist/src/app/index.html +31 -6
  33. package/dist/src/{audio-COrn8rM6.js → audio-BqnRvcWG.js} +3 -3
  34. package/dist/src/{audio-DcVKoInv.js → audio-CPMtV1yR.js} +4 -4
  35. package/dist/src/{audio-B7izf48x.js → audio-DyiebVB3.js} +4 -4
  36. package/dist/src/{audio-BQtNuYBj.cjs → audio-FnxbEnSE.cjs} +4 -4
  37. package/dist/src/authoritativeMarkupInjection-BZIywVjG.js +74 -0
  38. package/dist/src/authoritativeMarkupInjection-DyAXAsSr.js +75 -0
  39. package/dist/src/authoritativeMarkupInjection-F2gBw0lN.cjs +74 -0
  40. package/dist/src/authoritativeMarkupInjection-QEQmFS83.js +74 -0
  41. package/dist/src/{base-PYJvBE1i.js → base-CKLo890h.js} +4 -3
  42. package/dist/src/{base-fZ9wgg50.js → base-Co80MMCi.js} +5 -4
  43. package/dist/src/{base-D-670DX8.cjs → base-DGJW48uz.cjs} +5 -4
  44. package/dist/src/{base-yrI1Yal4.js → base-E9I8zXjz.js} +5 -4
  45. package/dist/src/bestOfN-B3wNzjSB.js +137 -0
  46. package/dist/src/bestOfN-BBsO41z4.js +136 -0
  47. package/dist/src/bestOfN-CAwmg5UL.cjs +140 -0
  48. package/dist/src/bestOfN-_kTi8Bxe.js +136 -0
  49. package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B0977K1O.cjs} +7 -6
  50. package/dist/src/{blobs-BCZavS8s.js → blobs-CeFdPn_T.js} +3 -3
  51. package/dist/src/{blobs-BQWqnnvL.js → blobs-DODuTK-a.js} +3 -3
  52. package/dist/src/{blobs-C-F78Kfn.js → blobs-Dwef1Ao1.js} +2 -2
  53. package/dist/src/{cache-BIyPcp5v.cjs → cache-CPGUA4Yl.cjs} +135 -25
  54. package/dist/src/cache-Cf7b4pWE.js +3 -0
  55. package/dist/src/{cache-D5NZmMiT.js → cache-DIXbtkNO.js} +125 -10
  56. package/dist/src/{cache-mb7c8hbp.js → cache-DpPWrkTE.js} +128 -12
  57. package/dist/src/{cache-C4Xb-hNb.js → cache-roFAE0cI.js} +126 -11
  58. package/dist/src/{chat-I9izLm49.js → chat-CUCorGiL.js} +12 -12
  59. package/dist/src/{chat-BPXSW8Bv.cjs → chat-DG1wG4w0.cjs} +6 -6
  60. package/dist/src/{chat-BfPaS15_.js → chat-Dabu84Br.js} +12 -12
  61. package/dist/src/{chat-Dr3DUQ0D.js → chat-DqUFcWI0.js} +12 -12
  62. package/dist/src/{chat-CclRbxGf.cjs → chat-DxTDQ83C.cjs} +14 -13
  63. package/dist/src/{chat-MKxMnZJZ.js → chat-GmlolEwo.js} +4 -4
  64. package/dist/src/{chat-0bwXjVP0.js → chat-TP8Qifkh.js} +6 -6
  65. package/dist/src/{chat-mW0ORo8G.js → chat-iwaM5UTQ.js} +6 -6
  66. package/dist/src/{chatkit-zUIVoDos.js → chatkit-B6DWi70Q.js} +4 -4
  67. package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-BYveR48_.cjs} +6 -5
  68. package/dist/src/{chatkit-Cv6AhukM.js → chatkit-fARZwEfV.js} +3 -3
  69. package/dist/src/{chatkit-CJnHRRMM.js → chatkit-lb6FK02w.js} +4 -4
  70. package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BQNp_y-F.js} +212 -67
  71. package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-D5Jl0SDh.js} +212 -67
  72. package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-DH416NBD.cjs} +218 -72
  73. package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-x1XJ1-pU.js} +212 -67
  74. package/dist/src/{cloud-DQZ5sVjW.js → cloud-D3DiFqH6.js} +3 -3
  75. package/dist/src/cloud-p96PA4MH.js +3 -0
  76. package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-B6NVI3ax.js} +4 -4
  77. package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-CEAW-xQa.cjs} +6 -6
  78. package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-RFSojyXG.js} +6 -6
  79. package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-r4tbYmWU.js} +6 -6
  80. package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-BCkLouto.js} +5 -5
  81. package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-BaZ4insB.js} +3 -3
  82. package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-CF-Vb-2Z.js} +5 -5
  83. package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-TJMLBj6I.cjs} +5 -5
  84. package/dist/src/codex-app-server-B8KHEiF4.js +1915 -0
  85. package/dist/src/codex-app-server-CnrLBCeA.cjs +1921 -0
  86. package/dist/src/codex-app-server-DIXZ230V.js +1915 -0
  87. package/dist/src/codex-app-server-Dd22dC_N.js +1916 -0
  88. package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-B6Wah8Pa.js} +6 -6
  89. package/dist/src/codex-sdk-BGjVAk23.js +3 -0
  90. package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-CFF6gUyi.cjs} +18 -10
  91. package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-CmQABzV3.js} +6 -6
  92. package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-D2d54RL8.js} +5 -5
  93. package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-Bu9B8NUY.js} +8 -8
  94. package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CtzNCHKu.js} +7 -7
  95. package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-DHCDlQUI.cjs} +8 -8
  96. package/dist/src/{cometapi-BUlt_ELa.js → cometapi-OBILPLlu.js} +8 -8
  97. package/dist/src/{completion-HUe8wDhZ.js → completion-CO2e1_62.js} +6 -6
  98. package/dist/src/{completion-BozdoXba.cjs → completion-CSYfl2cd.cjs} +6 -6
  99. package/dist/src/{completion-x0a_c2y1.js → completion-DZNxcyfG.js} +6 -6
  100. package/dist/src/{completion-Dnxn7E-j.js → completion-sNvCLTAP.js} +5 -5
  101. package/dist/src/constants-BjJV0cRr.js +6 -0
  102. package/dist/src/constants-DH5XYLKZ.js +7 -0
  103. package/dist/src/constants-DZGEFLsu.js +6 -0
  104. package/dist/src/constants-a2kYssQk.cjs +11 -0
  105. package/dist/src/{createHash-4gFQpDDv.js → createHash-BtbSX3mj.js} +1 -1
  106. package/dist/src/{createHash-CwDVU5xr.js → createHash-CGVzWdjj.js} +1 -1
  107. package/dist/src/{createHash-B7KvgoOD.cjs → createHash-CSiqnK5P.cjs} +2 -2
  108. package/dist/src/{createHash-ChI45QR1.js → createHash-CgRvs4Fn.js} +1 -1
  109. package/dist/src/crescendo-BXEJK_bi.cjs +704 -0
  110. package/dist/src/crescendo-CU_Y2i-m.js +702 -0
  111. package/dist/src/crescendo-J1Xx4_zb.js +703 -0
  112. package/dist/src/crescendo-QiaSLW0d.js +701 -0
  113. package/dist/src/custom-BJfP00Bh.js +619 -0
  114. package/dist/src/custom-CZVn-1-r.js +620 -0
  115. package/dist/src/custom-Cqia7M0D.cjs +621 -0
  116. package/dist/src/custom-notggYVl.js +618 -0
  117. package/dist/src/{docker-DCgsveLD.js → docker-4D1eL6Gq.js} +6 -6
  118. package/dist/src/{docker-ClnmCf1Z.js → docker-BBv1WUDu.js} +5 -5
  119. package/dist/src/{docker-DS4_Osau.cjs → docker-D06JUoe2.cjs} +6 -6
  120. package/dist/src/{docker-CQmlA2NU.js → docker-DdJQBxK9.js} +6 -6
  121. package/dist/src/{embedding-D3xTseo7.js → embedding--UZVe4_7.js} +6 -6
  122. package/dist/src/{embedding-I45KG3o7.cjs → embedding-BbrwopfX.cjs} +6 -6
  123. package/dist/src/{embedding-nFbumxcv.js → embedding-Bi3rxrZF.js} +5 -5
  124. package/dist/src/{embedding-DD9wa3ae.js → embedding-C251p1-8.js} +6 -6
  125. package/dist/src/{errors-Cw810C93.js → errors-9PcUL8BC.js} +1 -1
  126. package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
  127. package/dist/src/{esm-CtEPLdAj.cjs → esm-BIKakvNa.cjs} +8 -7
  128. package/dist/src/{esm-C7PnfdF8.js → esm-BTK1W7lG.js} +1 -1
  129. package/dist/src/{esm-tVgYPY-f.js → esm-Bexx2PFc.js} +2 -2
  130. package/dist/src/{eval-u4UVafl6.js → eval-0VRANImH.js} +21 -21
  131. package/dist/src/{eval-CzJFfFO9.js → eval-DscR5iOM.js} +1 -1
  132. package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-2RRJvFyB.js} +41 -16
  133. package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-CvtS8h8u.cjs} +51 -15
  134. package/dist/src/evalResult-DqzsS6_W.js +3 -0
  135. package/dist/src/{evalResult-D3hVYFis.js → evalResult-eUkJv9Ko.js} +40 -15
  136. package/dist/src/evaluator-DNdJF1Gv.js +3 -0
  137. package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-DRoiYB2q.js} +1060 -187
  138. package/dist/src/evaluatorHelpers-BsYP_muT.js +511 -0
  139. package/dist/src/evaluatorHelpers-CRqTvSux.cjs +537 -0
  140. package/dist/src/evaluatorHelpers-DuqFFfq7.js +510 -0
  141. package/dist/src/{extractor-CAfTSraf.js → extractor-BR7XAzAL.js} +6 -6
  142. package/dist/src/{extractor-WVPOrH43.cjs → extractor-BdxEtt3J.cjs} +6 -6
  143. package/dist/src/{extractor-DNSeBVOJ.js → extractor-CIW3iN-b.js} +6 -6
  144. package/dist/src/{extractor-Dk6bRWkv.js → extractor-CxRtnaHl.js} +5 -5
  145. package/dist/src/{fetch-B0Z3Oe4k.js → fetch-BufrQtvR.js} +93 -40
  146. package/dist/src/{fetch-BEWnXrrG.js → fetch-DXUnXkVU.js} +89 -40
  147. package/dist/src/{fetch-CJU5ELPa.cjs → fetch-Dw4XZHjj.cjs} +330 -270
  148. package/dist/src/{fetch-Di00EQrc.js → fetch-It34O8Ur.js} +305 -252
  149. package/dist/src/fetch-_YgGd2qv.js +3 -0
  150. package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-BhdwzYaD.cjs} +24 -1
  151. package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-CXRfY3Ss.js} +12 -2
  152. package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-D4GCJ67J.js} +12 -2
  153. package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-CMVNrYvE.js} +1 -1
  154. package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-14nugQQx.cjs} +14 -2
  155. package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-BPVvltoW.js} +2 -2
  156. package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-D18lYzhB.js} +2 -2
  157. package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-jJKYsnjc.js} +2 -2
  158. package/dist/src/goat-Ckd3q3AY.js +467 -0
  159. package/dist/src/goat-Qgurm-NP.js +466 -0
  160. package/dist/src/goat-ghadEDdy.js +465 -0
  161. package/dist/src/goat-una6pZGP.cjs +469 -0
  162. package/dist/src/graders-BDT7dif6.js +3 -0
  163. package/dist/src/{graders-eIHhRqoC.js → graders-BGP99PdK.js} +2416 -2224
  164. package/dist/src/{graders-Zy3x0zqX.js → graders-BX0f2tvS.js} +2423 -2226
  165. package/dist/src/{graders-pvbReLLn.js → graders-C0nXU_ZP.js} +1806 -1609
  166. package/dist/src/{graders--zknU_uk.cjs → graders-ClrU2fnd.cjs} +2219 -1949
  167. package/dist/src/hydra-BSNZZm2M.js +543 -0
  168. package/dist/src/hydra-BxdG4nkg.js +541 -0
  169. package/dist/src/hydra-DE4xWwyc.js +542 -0
  170. package/dist/src/hydra-DrJttnvw.cjs +542 -0
  171. package/dist/src/image-B4oBtu6J.js +443 -0
  172. package/dist/src/{image-dnoUgPrC.js → image-BN-hjLL9.js} +4 -4
  173. package/dist/src/{image-9302QVqR.js → image-B_fPIwdg.js} +3 -3
  174. package/dist/src/image-BvUAW344.js +442 -0
  175. package/dist/src/image-Cvjwx1uY.js +442 -0
  176. package/dist/src/{image-De2FBmYV.cjs → image-DfVCGPbI.cjs} +4 -4
  177. package/dist/src/{image-u7-rKnYU.js → image-QzmydkiG.js} +4 -4
  178. package/dist/src/image-X0oY4350.cjs +465 -0
  179. package/dist/src/index.cjs +1689 -558
  180. package/dist/src/index.d.cts +3270 -1624
  181. package/dist/src/index.d.ts +3270 -1624
  182. package/dist/src/index.js +1553 -438
  183. package/dist/src/indirectWebPwn-02ZIghCS.js +259 -0
  184. package/dist/src/indirectWebPwn-BJ22AbQa.cjs +397 -0
  185. package/dist/src/indirectWebPwn-CbjUG0rh.js +385 -0
  186. package/dist/src/indirectWebPwn-CfQJt3gk.cjs +260 -0
  187. package/dist/src/indirectWebPwn-DBQhOjoD.js +260 -0
  188. package/dist/src/indirectWebPwn-OsXnKejv.js +259 -0
  189. package/dist/src/indirectWebPwn-tNx9OZ35.js +385 -0
  190. package/dist/src/indirectWebPwn-uyWdHx04.js +386 -0
  191. package/dist/src/inputVariables-B0qUChbV.js +467 -0
  192. package/dist/src/inputVariables-DUGMb9Ka.js +464 -0
  193. package/dist/src/inputVariables-DXFdi7AI.js +468 -0
  194. package/dist/src/inputVariables-Dq9W-Z3a.cjs +475 -0
  195. package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-C4QlIuoR.js} +2 -2
  196. package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
  197. package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
  198. package/dist/src/{invariant-kfQ8Bu82.cjs → invariant-QtnLD03y.cjs} +1 -1
  199. package/dist/src/iterative-CpU6i2As.js +490 -0
  200. package/dist/src/iterative-DJQEQpG3.js +491 -0
  201. package/dist/src/iterative-DQBuWM-j.cjs +493 -0
  202. package/dist/src/iterative-FTS4Bz67.js +492 -0
  203. package/dist/src/iterativeImage-BUABMVOA.js +413 -0
  204. package/dist/src/iterativeImage-ByFWkxax.cjs +415 -0
  205. package/dist/src/iterativeImage-BzUapOUi.js +414 -0
  206. package/dist/src/iterativeImage-Doz8mgxF.js +413 -0
  207. package/dist/src/iterativeMeta-B3YiAOc8.js +386 -0
  208. package/dist/src/iterativeMeta-C7APE_P1.js +385 -0
  209. package/dist/src/iterativeMeta-CSS8M6Ds.cjs +385 -0
  210. package/dist/src/iterativeMeta-DgoQ7bLh.js +384 -0
  211. package/dist/src/iterativeTree-B5zxBBSW.js +769 -0
  212. package/dist/src/iterativeTree-CNyIk0Yn.js +768 -0
  213. package/dist/src/iterativeTree-CPMF10ve.cjs +771 -0
  214. package/dist/src/iterativeTree-DvZ7GBwt.js +770 -0
  215. package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-BadkINlJ.js} +24 -10
  216. package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-Bi_8sV-H.js} +25 -11
  217. package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CkMljjdg.js} +25 -11
  218. package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-DUh34xba.cjs} +25 -11
  219. package/dist/src/{litellm-DRjpcSa7.js → litellm-BKBo0jpC.js} +5 -5
  220. package/dist/src/{litellm-C2kqjxqp.js → litellm-BXyn5kZK.js} +5 -5
  221. package/dist/src/{litellm-p37R1dzQ.js → litellm-CNcfbCfa.js} +4 -4
  222. package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CtAr7bKG.cjs} +5 -5
  223. package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
  224. package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
  225. package/dist/src/{logger-COuQb2xB.cjs → logger-cfNpzI4o.cjs} +13 -55
  226. package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-BMX1iEB6.js} +5 -5
  227. package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CR5TSpp4.js} +5 -5
  228. package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-D3FUc2K3.cjs} +9 -8
  229. package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-OEMmS1RB.js} +6 -6
  230. package/dist/src/main.js +909 -369
  231. package/dist/src/memoryPoisoning-CM83NWYl.js +107 -0
  232. package/dist/src/memoryPoisoning-D8h9gXJF.js +106 -0
  233. package/dist/src/memoryPoisoning-Dp-btinn.cjs +106 -0
  234. package/dist/src/memoryPoisoning-cLuCoTuJ.js +106 -0
  235. package/dist/src/{messages-BTQz42fn.js → messages-BabO-cX8.js} +273 -17
  236. package/dist/src/{messages-811uVVW5.cjs → messages-DBPir0TQ.cjs} +278 -18
  237. package/dist/src/{messages-zWbkLLHz.js → messages-DGUlSNU7.js} +273 -17
  238. package/dist/src/{messages-MYTQ2TWp.js → messages-vsE_-Lv0.js} +273 -17
  239. package/dist/src/{meteor-DHdzY1Ss.js → meteor--TZYICTI.js} +2 -2
  240. package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-CR226f7Z.cjs} +2 -2
  241. package/dist/src/{meteor-CU5UAE-H.js → meteor-Cl_yd7rJ.js} +2 -2
  242. package/dist/src/{meteor-DuAFv6gF.js → meteor-Dce-_zGQ.js} +1 -1
  243. package/dist/src/mischievousUser-0l8GD7Dp.js +46 -0
  244. package/dist/src/mischievousUser-BUOP9W5r.js +46 -0
  245. package/dist/src/mischievousUser-frFYKxu6.js +47 -0
  246. package/dist/src/mischievousUser-olGgHIVR.cjs +46 -0
  247. package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-CNV5bMSk.cjs} +7 -7
  248. package/dist/src/{modelslab-D0erNWKe.js → modelslab-Cogmu4mG.js} +6 -6
  249. package/dist/src/{modelslab-DIq-6y7x.js → modelslab-Dzst7VTU.js} +6 -6
  250. package/dist/src/{modelslab-wu9yi5GE.js → modelslab-EyDczZ5A.js} +7 -7
  251. package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-BGPNBOMS.js} +6 -6
  252. package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-B_5NKFu1.js} +5 -5
  253. package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-C4eUJGse.js} +5 -5
  254. package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-CjJRxI1X.cjs} +9 -8
  255. package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BNGmgfFz.js} +4 -4
  256. package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-ChPlh5na.js} +4 -4
  257. package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CrV0iaY_.js} +3 -3
  258. package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-DuOG9Aun.cjs} +5 -4
  259. package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
  260. package/dist/src/{openai-URNyItar.cjs → openai-C3uXv8wS.cjs} +2 -2
  261. package/dist/src/{openai-Qsvz25mV.js → openai-CJrsh9n4.js} +2 -2
  262. package/dist/src/{openai-iYtrXzOX.js → openai-zgwBb4Ff.js} +1 -1
  263. package/dist/src/{openclaw-CnQ363Wi.js → openclaw-BIHlu_36.js} +10 -8
  264. package/dist/src/{openclaw-CwzlQSQX.js → openclaw-CF7fMido.js} +9 -7
  265. package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-Dphc01BY.cjs} +18 -15
  266. package/dist/src/{openclaw-CLWrW03k.js → openclaw-zIJAsz3P.js} +10 -8
  267. package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-B3vlPLsp.js} +40 -5
  268. package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-D05JSgMQ.js} +40 -5
  269. package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-DoY6GbWw.cjs} +46 -10
  270. package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-sRKYHGoI.js} +39 -4
  271. package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver--gTpSagc.js} +120 -4
  272. package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-B2eaKC8C.js} +120 -4
  273. package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-BXjcRqAM.js} +119 -3
  274. package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-CvJdBGSc.cjs} +125 -7
  275. package/dist/src/packageParser--MWTSrPW.js +36 -0
  276. package/dist/src/packageParser-CgE-ziRo.js +35 -0
  277. package/dist/src/packageParser-QoCS1FMl.cjs +54 -0
  278. package/dist/src/packageParser-hwwSGnAZ.js +35 -0
  279. package/dist/src/processShim-BBxt7LKO.js +95 -0
  280. package/dist/src/processShim-BcGzU8fY.js +94 -0
  281. package/dist/src/processShim-C_z3aRvF.js +94 -0
  282. package/dist/src/processShim-DSY9BV2T.cjs +98 -0
  283. package/dist/src/promptLength-0qIHyhA5.js +71 -0
  284. package/dist/src/promptLength-4X-Wd8PG.js +72 -0
  285. package/dist/src/promptLength-B9nZEfO6.js +71 -0
  286. package/dist/src/promptLength-BbBbDHNj.cjs +94 -0
  287. package/dist/src/promptfoo-BDrfT30-.js +180 -0
  288. package/dist/src/promptfoo-Cm4hiy1Y.js +180 -0
  289. package/dist/src/promptfoo-Rjp-MeBb.js +181 -0
  290. package/dist/src/promptfoo-b-baRMj-.cjs +205 -0
  291. package/dist/src/prompts-BYMtqPCw.js +259 -0
  292. package/dist/src/prompts-C-bqE1Yp.js +260 -0
  293. package/dist/src/prompts-Cp_Qx5Ml.js +270 -0
  294. package/dist/src/prompts-DHhQsANy.js +259 -0
  295. package/dist/src/prompts-D_QpZ2Dm.js +271 -0
  296. package/dist/src/prompts-hNvWBD3z.cjs +284 -0
  297. package/dist/src/prompts-huDVH2CI.js +270 -0
  298. package/dist/src/prompts-p78Hul5i.cjs +289 -0
  299. package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
  300. package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-CZO_w7ue.cjs} +2 -2
  301. package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DHcFiVWX.js} +1 -1
  302. package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-ReCd0sFa.js} +2 -2
  303. package/dist/src/{providers-BV_KMZje.js → providers-B9KzWxAX.js} +10558 -21587
  304. package/dist/src/{providers-DruaQfwu.js → providers-BCCz6_IX.js} +1228 -12196
  305. package/dist/src/{providers-1eKkXBKp.cjs → providers-BDVVIQM6.cjs} +10649 -21843
  306. package/dist/src/{providers-iUt5fbAN.js → providers-BYAn82cf.js} +1 -1
  307. package/dist/src/{providers-Domz_llv.js → providers-DVYRZP4E.js} +10589 -21570
  308. package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-CLCgQ9tt.js} +3 -3
  309. package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-CgYxeSmO.js} +3 -3
  310. package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-Cokhluq3.cjs} +8 -7
  311. package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-D0BYebvX.js} +3 -3
  312. package/dist/src/{quiverai-DFotyafY.cjs → quiverai-BAp6iTZD.cjs} +4 -4
  313. package/dist/src/{quiverai-aPPvXOgn.js → quiverai-BvIhI_0l.js} +4 -4
  314. package/dist/src/{quiverai-DR0SnIQV.js → quiverai-CdTWPe-A.js} +3 -3
  315. package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Cv7rJKDz.js} +4 -4
  316. package/dist/src/registry-BUJrgjwv.js +124 -0
  317. package/dist/src/registry-DXm1t_x0.js +125 -0
  318. package/dist/src/registry-Dp5EqoXc.js +124 -0
  319. package/dist/src/registry-KCVF1CFC.cjs +124 -0
  320. package/dist/src/{server-D6Il2Sob.js → remoteGeneration-B1_XsKXU.js} +16 -108
  321. package/dist/src/{server-BSB45Nt9.js → remoteGeneration-COpWcmWd.js} +15 -146
  322. package/dist/src/{server-Dx2TyCH2.cjs → remoteGeneration-DS9N3pgB.cjs} +30 -119
  323. package/dist/src/remoteGeneration-DsaSwmG2.js +217 -0
  324. package/dist/src/render-BNTrbmBw.cjs +384 -0
  325. package/dist/src/render-CSP99NLm.js +348 -0
  326. package/dist/src/render-DFfDeYUK.js +347 -0
  327. package/dist/src/{render-CgVDrJmM.js → render-DznWrxGO.js} +2 -2
  328. package/dist/src/render-_6ur1fhE.js +347 -0
  329. package/dist/src/resourceAttributes-D1jP3kL5.js +17 -0
  330. package/dist/src/resourceAttributes-DQbBB--2.js +16 -0
  331. package/dist/src/resourceAttributes-ephgOvdR.cjs +27 -0
  332. package/dist/src/resourceAttributes-v6-I67fn.js +16 -0
  333. package/dist/src/{responses-Bi9vBuW_.cjs → responses-1UFFF9N_.cjs} +51 -16
  334. package/dist/src/{responses-DL9m8CyY.js → responses-B3W2JvOQ.js} +49 -15
  335. package/dist/src/{responses--OsX2aYW.js → responses-B6ktc3Ra.js} +49 -15
  336. package/dist/src/{responses-C-flexAY.js → responses-URRzV8qE.js} +49 -15
  337. package/dist/src/rolldown-runtime-D_mwlA32.cjs +43 -0
  338. package/dist/src/rubyUtils-BYVlQ94c.js +3 -0
  339. package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CXlFM2rR.js} +3 -3
  340. package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-CnlW8AYb.js} +3 -3
  341. package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-CqUWBZAt.cjs} +18 -27
  342. package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DdGojpfv.js} +3 -3
  343. package/dist/src/runtimeTransform-BJOpL9Yc.js +142 -0
  344. package/dist/src/runtimeTransform-Dgh_D7DU.js +143 -0
  345. package/dist/src/runtimeTransform-DigbjU1r.js +142 -0
  346. package/dist/src/runtimeTransform-ON3YYILw.cjs +147 -0
  347. package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-CujrzP1a.js} +62 -51
  348. package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-DzffAqo_.cjs} +65 -53
  349. package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-vhtSV7JI.js} +62 -51
  350. package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-yr1QKeBs.js} +61 -50
  351. package/dist/src/{scanner-1DqWi1Ej.js → scanner-DS0109SS.js} +7 -7
  352. package/dist/src/server/index.js +5105 -605
  353. package/dist/src/server-B8rqV126.cjs +126 -0
  354. package/dist/src/server-BaLytskk.js +3 -0
  355. package/dist/src/server-CMJD10J4.js +107 -0
  356. package/dist/src/server-Ddp8GNMp.js +146 -0
  357. package/dist/src/server-DhMHosWj.js +182 -0
  358. package/dist/src/shared-7pmVZLNO.js +1334 -0
  359. package/dist/src/shared-9WHQ1oNE.js +1335 -0
  360. package/dist/src/{fileExtensions-BArZuxsI.js → shared-BoG7qLMv.js} +12 -2
  361. package/dist/src/shared-D6IjElRI.js +1334 -0
  362. package/dist/src/shared-WkgnDkcg.cjs +1436 -0
  363. package/dist/src/{signal-CE5G3a7x.js → signal-CSurUUyV.js} +3 -3
  364. package/dist/src/simulatedUser-C9aQObBI.js +222 -0
  365. package/dist/src/simulatedUser-Cu601Dd4.cjs +227 -0
  366. package/dist/src/simulatedUser-U_qAHnuB.js +222 -0
  367. package/dist/src/simulatedUser-p3tACcmw.js +223 -0
  368. package/dist/src/{slack-DDUe-5MC.js → slack-Bapo-7_8.js} +2 -2
  369. package/dist/src/{slack-1Rhq0EoV.cjs → slack-DMC1QVEg.cjs} +3 -2
  370. package/dist/src/{slack-D5Wpy8LM.js → slack-DTEFhrMn.js} +2 -2
  371. package/dist/src/{slack-acRb0IqQ.js → slack-k-_CP84Q.js} +1 -1
  372. package/dist/src/storage-BU4qcnOb.js +875 -0
  373. package/dist/src/storage-CA-v9V2v.cjs +911 -0
  374. package/dist/src/storage-CD-GWAdx.js +822 -0
  375. package/dist/src/storage-QdU-SmvD.js +834 -0
  376. package/dist/src/{store-DAAyxcy6.cjs → store-B2NDDooM.cjs} +60 -24
  377. package/dist/src/{store-CYEy5J2D.js → store-DKd5592Q.js} +51 -20
  378. package/dist/src/{store-M0b1WfYb.js → store-HpopRVzl.js} +50 -19
  379. package/dist/src/store-IbiRIF3k.js +3 -0
  380. package/dist/src/strategies-7CS3Alao.cjs +2360 -0
  381. package/dist/src/strategies-CiSeroPH.js +2331 -0
  382. package/dist/src/strategies-DRJjGTIY.js +2333 -0
  383. package/dist/src/{tables-DQ4WU5tX.js → tables-CRSXQ2Ke.js} +2 -2
  384. package/dist/src/{tables-CsWou1Bx.js → tables-CxjU7bBd.js} +3 -3
  385. package/dist/src/{tables-DUfh1F7Z.cjs → tables-DBIJU0WE.cjs} +6 -5
  386. package/dist/src/{tables-C4CH3zRr.js → tables-DafUHOeh.js} +3 -3
  387. package/dist/src/{telemetry-CQPez_Jp.js → telemetry-00ezXr_t.js} +5 -4
  388. package/dist/src/telemetry-ByPqDcKC.js +3 -0
  389. package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CJ7FnCsc.cjs} +18 -11
  390. package/dist/src/{telemetry-dbaJ0E98.js → telemetry-DmXYcJNV.js} +5 -4
  391. package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-DwX9XUN5.js} +4 -3
  392. package/dist/src/{text-KvuD2Iko.js → text-Db-Wt2u2.js} +1 -1
  393. package/dist/src/{text-DHxdyQqT.js → text-DwYK5EBn.js} +1 -1
  394. package/dist/src/{text-BVi-cLPJ.cjs → text-nywWsRBM.cjs} +1 -1
  395. package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-BjVkdk18.js} +1 -1
  396. package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-CDet74yk.js} +1 -1
  397. package/dist/src/tokenUsageUtils-CmnQ0G2m.js +142 -0
  398. package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-_B-P8IAi.cjs} +1 -1
  399. package/dist/src/toolAttributes-BAjwcBf0.cjs +103 -0
  400. package/dist/src/toolAttributes-COVgDrBG.js +87 -0
  401. package/dist/src/toolAttributes-DJ9ZEKXD.js +86 -0
  402. package/dist/src/tracingOptions-BnwKCkSB.js +221 -0
  403. package/dist/src/tracingOptions-Chi74lOD.js +219 -0
  404. package/dist/src/tracingOptions-DrbSFaKy.cjs +249 -0
  405. package/dist/src/tracingOptions-ji2OuXbT.js +220 -0
  406. package/dist/src/{transcription-DuWDupG7.js → transcription-B8uIgCYX.js} +5 -5
  407. package/dist/src/{transcription-CJspiD2c.js → transcription-CfU5loSq.js} +6 -6
  408. package/dist/src/{transcription-V2HaAmy2.js → transcription-Dkd22_4K.js} +6 -6
  409. package/dist/src/{transcription-BvjmiYB1.cjs → transcription-mzuf18Mq.cjs} +9 -8
  410. package/dist/src/{transform-lQrDE1BQ.js → transform-BIMynQsA.js} +9 -9
  411. package/dist/src/transform-BnSTnFlp.js +187 -0
  412. package/dist/src/transform-BnSXWmU_2.cjs +221 -0
  413. package/dist/src/transform-CGt7Kt3y2.js +186 -0
  414. package/dist/src/transform-CrPGTsij.js +186 -0
  415. package/dist/src/{transform-CTeuTR3S.cjs → transform-DhNkAUs8.cjs} +13 -12
  416. package/dist/src/{transform-CG0ehZNG.js → transform-DmvYBRll.js} +9 -9
  417. package/dist/src/{transform-zDhMmzwX.js → transform-EtD4jAWi.js} +9 -9
  418. package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-0ThtPved.js} +1 -1
  419. package/dist/src/transformersAvailability-BYydDE5U.js +35 -0
  420. package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-BvyU9vDD.js} +1 -1
  421. package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-BytPvKUW.cjs} +1 -1
  422. package/dist/src/{types-Dm9JM6Vb.js → types-BFevViUY.js} +115 -19
  423. package/dist/src/{types-Bgh5SOn6.js → types-BJQBBPTP.js} +115 -19
  424. package/dist/src/{types-CeaeaZdP.cjs → types-CxJvaY2S.cjs} +357 -172
  425. package/dist/src/{types-BGQDAP8i.js → types-D6glLbdF.js} +271 -170
  426. package/dist/src/{util-BYvQUPp7.js → util--WMgw7wM.js} +28 -8
  427. package/dist/src/{util-C9J8ahRn.js → util-5WnCSb0h.js} +72 -48
  428. package/dist/src/{util-CN3SrLT4.cjs → util-BSIuSLVK.cjs} +74 -49
  429. package/dist/src/{util-C8e5uydV.js → util-Bx677_k2.js} +154 -147
  430. package/dist/src/util-CN8om2rz.cjs +386 -0
  431. package/dist/src/{util-DDs-7g6-.js → util-CoQWM76y.js} +28 -8
  432. package/dist/src/util-DNl96nNs.js +327 -0
  433. package/dist/src/{util-DxWpWjhc.js → util-DURocbYR.js} +667 -507
  434. package/dist/src/util-Df8YMvS1.js +327 -0
  435. package/dist/src/{util-DvU2Pw8c.js → util-DiQ3QvBB.js} +28 -8
  436. package/dist/src/{util-oGMLA7vc.js → util-I-Rf-KaD.js} +862 -577
  437. package/dist/src/{util-olYL5C6N.cjs → util-IYzs5Y04.cjs} +33 -7
  438. package/dist/src/{util-D9TisOyk.js → util-LKTmNsMQ.js} +71 -47
  439. package/dist/src/{util-Bxn8emtE.cjs → util-SPsvFONY.cjs} +738 -582
  440. package/dist/src/{util-D3q0WQ-0.js → util-efByNxcr.js} +72 -48
  441. package/dist/src/util-kDURhgJW.js +328 -0
  442. package/dist/src/{utils-DJfvjyMj.js → utils-B0lzitHZ.js} +3 -3
  443. package/dist/src/{utils-BLJKfv0y.js → utils-BFOh20Gb.js} +3 -3
  444. package/dist/src/{utils-hXtCYanr.js → utils-BGY69tk_.js} +2 -2
  445. package/dist/src/{utils-B05gLxER.cjs → utils-Ve6kuJsa.cjs} +3 -3
  446. package/dist/src/version-BK20a4sw.js +16 -0
  447. package/dist/src/version-BWCSaByA.cjs +27 -0
  448. package/dist/src/version-eRkNuGv8.js +17 -0
  449. package/dist/src/version-lpHV_53E.js +16 -0
  450. package/dist/tsconfig.tsbuildinfo +1 -1
  451. package/package.json +56 -28
  452. package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
  453. package/dist/src/app/assets/index-BXGkeMwh.css +0 -1
  454. package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
  455. package/dist/src/app/assets/sync-IjzpWrOE.js +0 -4
  456. package/dist/src/app/assets/vendor-charts-BNdH8TCw.js +0 -36
  457. package/dist/src/cache-Cr9oLMUa.js +0 -3
  458. package/dist/src/cache-DbLsVWB2.cjs +0 -3
  459. package/dist/src/cloud-Hphvo8kr.js +0 -3
  460. package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
  461. package/dist/src/codex-sdk-CWEnH70W.cjs +0 -2
  462. package/dist/src/evalResult-D8MT9p0s.js +0 -3
  463. package/dist/src/evalResult-DElBuddX.js +0 -2
  464. package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
  465. package/dist/src/evaluator-CVessDWe.js +0 -3
  466. package/dist/src/fetch-C7bGKDlQ.js +0 -3
  467. package/dist/src/graders-BOAzQEUe.cjs +0 -2
  468. package/dist/src/graders-D4BTsZdG2.js +0 -3
  469. package/dist/src/graders-DOJK1XpV.js +0 -2
  470. package/dist/src/graders-NAv9LcBn.js +0 -2
  471. package/dist/src/image-B5Mv-Z3h.js +0 -257
  472. package/dist/src/image-DVz2RiMF.js +0 -258
  473. package/dist/src/image-qUpPvmNZ.js +0 -257
  474. package/dist/src/image-x6KqLQl4.cjs +0 -280
  475. package/dist/src/providers-Bp4S-FvO.js +0 -2
  476. package/dist/src/providers-DV3ax9e_.cjs +0 -3
  477. package/dist/src/providers-u9Enmfok.js +0 -2
  478. package/dist/src/render-CH-62LbA.js +0 -135
  479. package/dist/src/render-CMEpfLaO.js +0 -136
  480. package/dist/src/render-DHIZ6_k8.js +0 -135
  481. package/dist/src/render-DfQSFxGE.cjs +0 -165
  482. package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
  483. package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
  484. package/dist/src/server-BNYztJkh.js +0 -385
  485. package/dist/src/server-DCtHUqlp.js +0 -3
  486. package/dist/src/server-DaA2eR26.cjs +0 -2
  487. package/dist/src/store-CWOSz6D_.cjs +0 -2
  488. package/dist/src/store-DCDBhv7B.js +0 -3
  489. package/dist/src/store-Dn9HUkdW.js +0 -240
  490. package/dist/src/telemetry-C1IqxcdW.js +0 -3
  491. package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
  492. package/dist/src/transform-Bbg6A8Jk.js +0 -216
  493. package/dist/src/transform-CUnzlsbn.cjs +0 -228
  494. package/dist/src/transform-DYX1_Xnh.js +0 -216
  495. package/dist/src/transform-DgKlRr73.cjs +0 -2
  496. package/dist/src/transform-M6ITAESf.js +0 -3
  497. package/dist/src/transform-UN5UGu8U.js +0 -213
package/dist/src/index.js CHANGED
@@ -1,33 +1,47 @@
1
- import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, g as getAjv, h as extractJsonObjects, k as state, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "./logger-Ct2S6Yx-.js";
1
+ import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, g as getAjv, h as extractJsonObjects, k as state, m as extractFirstJsonObject, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "./logger-Ct2S6Yx-.js";
2
2
  import { t as invariant } from "./invariant-Ddh24eXh.js";
3
- import { r as importModule, t as getDirectory } from "./esm-C7PnfdF8.js";
4
- import { r as runPython } from "./pythonUtils-C2UQ30Rz.js";
5
- import { i as isJavascriptFile } from "./fileExtensions-DnqA1y9x.js";
6
- import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-Bbg6A8Jk.js";
7
- import { $ as matchesGEval, A as DivergentRepetitionPlugin, B as sampleArray, C as getPiiLeakTestsForCategory, D as HarmbenchPlugin, E as ImitationPlugin, F as AegisPlugin, G as loadRubricPrompt, H as callProviderWithContext, I as RedteamGraderBase, J as matchesClosedQa, K as matchesAnswerRelevance, L as RedteamPluginBase, M as CrossSessionLeakPlugin, N as ContractPlugin, O as HallucinationPlugin, P as BeavertailsPlugin, Q as matchesFactuality, R as getCustomPolicies, S as PlinyPlugin, T as IntentPlugin, U as fail, V as fetchHuggingFaceDataset, W as getAndCheckProvider, X as matchesContextRecall, Y as matchesContextFaithfulness, Z as matchesContextRelevance, _ as PoliticsPlugin, _t as processFileReference, a as UnverifiableClaimsPlugin, at as matchesSimilarity, b as isValidPolicyObject, c as ToolDiscoveryPlugin, ct as withProviderCallExecutionContext, d as TeenSafetyDangerousContentPlugin, dt as readPrompts, et as matchesLlmRubric, f as TeenSafetyAgeRestrictedGoodsAndServicesPlugin, ft as readProviderPromptMap, g as PromptExtractionPlugin, gt as loadFromJavaScriptFile, h as RbacPlugin, ht as getFinalTest, i as VLGuardPlugin, it as matchesSelectBest, j as DebugAccessPlugin, k as ExcessiveAgencyPlugin, l as TeenSafetyHarmfulBodyIdealsPlugin, lt as getDefaultProviders, m as ShellInjectionPlugin, mt as coerceString, n as getGraderById, nt as matchesPiScore, o as UnsafeBenchPlugin, ot as matchesTrajectoryGoalSuccess, p as SqlInjectionPlugin, pt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, q as matchesClassification, r as VLSUPlugin, rt as matchesSearchRubric, s as ToxicChatPlugin, st as selectMaxScore, t as GRADERS, tt as matchesModeration, u as TeenSafetyDangerousRoleplayPlugin, ut as processPrompts, v as PolicyPlugin, vt as resolveContext, w as OverreliancePlugin, x as makeInlinePolicyIdSync, y as determinePolicyTypeFromId, z as retryWithDeduplication } from "./graders-Zy3x0zqX.js";
8
- import { A as isApiProvider, At as CompletionTokenDetailsSchema, C as TestGeneratorConfigSchema, Ct as UNALIGNED_PROVIDER_HARM_PLUGINS, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, Ot as PromptSchema, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as TELECOM_PLUGINS, T as TestSuiteSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as PLUGIN_CATEGORIES, a as AtomicTestCaseSchema, at as DEFAULT_PLUGINS, b as TestCaseWithVarsFileSchema, c as CompletedPromptSchema, ct as HARM_PLUGINS, d as EvaluateOptionsSchema, dt as LLAMA_GUARD_REPLICATE_PROVIDER, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MEDICAL_PLUGINS, g as ResultFailureReason, gt as PII_PLUGINS, h as OutputFileExtension, ht as PHARMACY_PLUGINS, i as AssertionTypeSchema, it as DATASET_EXEMPT_PLUGINS, j as isProviderOptions, jt as InputsSchema, k as isResultFailureReason, kt as BaseTokenUsageSchema, l as DerivedMetricSchema, lt as INSURANCE_PLUGINS, m as OutputConfigSchema, mt as MULTI_INPUT_VAR, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FINANCIAL_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_EXCLUDED_PLUGINS, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as CANARY_BREAKING_STRATEGY_IDS, s as CommandLineOptionsSchema, st as FOUNDATION_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_ENABLED_CATEGORIES, v as SpecialAssertionTypesSchema, vt as REDTEAM_PROVIDER_HARM_PLUGINS, w as TestSuiteConfigSchema, wt as CODING_AGENT_CORE_PLUGINS, x as TestCasesWithMetadataPromptSchema, xt as TEEN_SAFETY_PLUGINS, y as TestCaseSchema, yt as REMOTE_ONLY_PLUGIN_IDS, z as 
StrategyConfigSchema } from "./types-BGQDAP8i.js";
9
- import { C as checkProviderApiKeys, D as isGoogleProvider, E as isAnthropicProvider, O as isOpenAiProvider, S as resultIsForTestCase, T as getProviderDescription, _ as setupEnv, b as filterRuntimeVars, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, g as parseFileUrl, h as loadFunction, i as fetchCsvFromGoogleSheet, k as isProviderAllowed, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as deduplicateTestCases, w as doesProviderRefMatch, x as getTestCaseDeduplicationKey, y as extractRuntimeVars } from "./util-oGMLA7vc.js";
10
- import { a as getNunjucksEngine, i as extractVariablesFromTemplates, r as extractVariablesFromTemplate, t as renderEnvOnlyInObject } from "./render-CH-62LbA.js";
11
- import { A as TERMINAL_MAX_WIDTH, F as VERSION, I as FILE_METADATA_KEY, L as HUMAN_ASSERTION_TYPE, M as getShareApiBaseUrl, N as getShareViewBaseUrl, S as parseChatPrompt, a as CloudConfig, d as sleep, h as REQUEST_TIMEOUT_MS, j as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp, y as isPromptfooSampleTarget } from "./fetch-Di00EQrc.js";
12
- import { c as isNonTransientHttpStatus, i as getCache, n as disableCache, o as withCacheNamespace, r as fetchWithCache, s as NON_TRANSIENT_HTTP_STATUSES, t as cache_exports } from "./cache-D5NZmMiT.js";
13
- import { $ as AIStudioChatProvider, A as createRateLimitRegistry, B as getCloudDatabaseId, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getGeneratedPromptOverLimit, G as isCloudProvider, H as getOrgContext, I as getMaxCharsPerMessageModifierValue, K as resolveTeamId, L as throwIfTargetPromptExceedsMaxChars, N as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY, Q as VertexChatProvider, T as runExtensionHook, U as getPluginSeverityOverridesFromCloud, V as getEvalConfigFromCloud, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId, z as checkCloudPermissions } from "./providers-DruaQfwu.js";
14
- import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-hXtCYanr.js";
15
- import { n as sha256, t as randomSequence } from "./createHash-4gFQpDDv.js";
16
- import { t as OpenAiChatCompletionProvider } from "./chat-I9izLm49.js";
17
- import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-C-bmyHoE.js";
18
- import { h as validateFunctionCall } from "./transform-CG0ehZNG.js";
19
- import { l as validateFunctionCall$1 } from "./util-D9TisOyk.js";
20
- import { t as providerRegistry } from "./providerRegistry-Bvh8mv85.js";
21
- import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-D6Il2Sob.js";
22
- import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-DhMYUUbu.js";
23
- import { t as getBlobByHash } from "./blobs-C-F78Kfn.js";
24
- import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-DQ4WU5tX.js";
25
- import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-Dk6bRWkv.js";
26
- import { t as telemetry } from "./telemetry-Dvqxv3YC.js";
27
- import { t as ellipsize } from "./text-DHxdyQqT.js";
28
- import { t as getTraceStore } from "./store-M0b1WfYb.js";
29
- import { n as runRuby } from "./rubyUtils-DVLeA2jg.js";
30
- import { t as EvalResult } from "./evalResult-Bgm9ZH31.js";
3
+ import { $ as riskCategorySeverityMap, A as RedteamConfigSchema, At as DocumentMediaInjectionPlacementValues, B as isUuid, Bt as getInputDescription, C as TestGeneratorConfigSchema, Ct as CODING_AGENT_CORE_PLUGINS, D as VarsSchema, Dt as BaseTokenUsageSchema, E as UnifiedConfigSchema, F as PartialGenerationError, Ft as InputDefinitionSchema, G as STRATEGY_COLLECTIONS, Gt as isProviderOptions, Ht as normalizeInputDefinition, I as PluginConfigSchema, It as InputTypeSchema, K as STRATEGY_COLLECTION_MAPPINGS, L as PolicyObjectSchema, Lt as InputTypeValues, Mt as DocxInjectionPlacementValues, N as ProvidersSchema, Nt as InputConfigSchema, O as isGradingResult, Ot as CompletionTokenDetailsSchema, P as ConversationMessageSchema, Pt as InputDefinitionObjectSchema, R as StrategyConfigSchema, Rt as InputsSchema, S as TestCasesWithMetadataSchema, St as UNALIGNED_PROVIDER_HARM_PLUGINS, T as TestSuiteSchema, U as DEFAULT_STRATEGIES, Ut as normalizeInputs, Vt as getInputType, Wt as isApiProvider, X as Severity, Y as isFanoutStrategy, Z as categoryAliases, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TEEN_SAFETY_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as ALIASED_PLUGIN_MAPPINGS, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as PromptSchema, jt as DocxInjectionPlacementSchema, k as isResultFailureReason, kt as DocumentMediaInjectionPlacementSchema, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as CANARY_BREAKING_STRATEGY_IDS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as 
getDefaultNFanout, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as BIAS_PLUGINS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CODING_AGENT_PLUGINS, x as TestCasesWithMetadataPromptSchema, xt as TELECOM_PLUGINS, y as TestCaseSchema, zt as buildInputPromptDescription } from "./types-D6glLbdF.js";
4
+ import { F as getShareApiBaseUrl, I as getShareViewBaseUrl, L as FILE_METADATA_KEY, N as TERMINAL_MAX_WIDTH, P as getDefaultShareViewBaseUrl, R as HUMAN_ASSERTION_TYPE, T as cloudConfig, _ as isPromptfooSampleTarget, b as parseChatPrompt, c as getCurrentTimestamp, l as sleep, n as fetchWithRetries, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, w as CloudConfig } from "./fetch-It34O8Ur.js";
5
+ import { n as VERSION } from "./version-lpHV_53E.js";
6
+ import { i as isJavascriptFile } from "./fileExtensions-CXRfY3Ss.js";
7
+ import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-Ca7WIoPY.js";
8
+ import { r as importModule, t as getDirectory } from "./esm-BTK1W7lG.js";
9
+ import { a as extractVariablesFromTemplates, i as extractVariablesFromTemplate, o as getNunjucksEngine, r as analyzeTemplateReference, t as renderEnvOnlyInObject } from "./render-DFfDeYUK.js";
10
+ import { t as providerRegistry } from "./providerRegistry-DHcFiVWX.js";
11
+ import { a as getRemoteHealthUrl, l as shouldGenerateRemote, n as getRemoteGenerationExplicitlyDisabledError, r as getRemoteGenerationUrl, s as neverGenerateRemote } from "./remoteGeneration-DsaSwmG2.js";
12
+ import { r as promptYesNo } from "./server-CMJD10J4.js";
13
+ import { a as getCloudDatabaseId, c as getPluginSeverityOverridesFromCloud, d as isCloudProvider, i as checkCloudPermissions, o as getEvalConfigFromCloud, p as resolveTeamId, s as getOrgContext } from "./storage-CD-GWAdx.js";
14
+ import { r as runPython } from "./pythonUtils-D0BYebvX.js";
15
+ import { A as readFilters, M as loadFunction, N as parseFileUrl, O as maybeLoadToolsFromExternalFile, T as maybeLoadFromExternalFile, _ as isProviderAllowed, a as setupEnv, b as normalizeProviderRef, c as filterRuntimeVars, d as checkProviderApiKeys, f as doesProviderRefMatch, g as isOpenAiProvider, h as isGoogleProvider, i as fetchCsvFromGoogleSheet, j as readOutput, l as getTestCaseDeduplicationKey, m as isAnthropicProvider, n as writeMultipleOutputs, o as deduplicateTestCases, p as getProviderDescription, r as writeOutput, s as extractRuntimeVars, t as printBorder, u as resultIsForTestCase, w as maybeLoadConfigFromExternalFile } from "./util-Bx677_k2.js";
16
+ import { n as sha256, t as randomSequence } from "./createHash-BtbSX3mj.js";
17
+ import { c as NON_TRANSIENT_HTTP_STATUSES, i as getCache, l as isNonTransientHttpStatus, n as disableCache, r as fetchWithCache, s as withCacheNamespace, t as cache_exports } from "./cache-DIXbtkNO.js";
18
+ import { t as OpenAiChatCompletionProvider } from "./chat-Dabu84Br.js";
19
+ import { h as validateFunctionCall } from "./transform-DmvYBRll.js";
20
+ import { l as validateFunctionCall$1 } from "./util-LKTmNsMQ.js";
21
+ import { _ as AIStudioChatProvider, a as resolveProvider, f as MCPProvider, g as GoogleLiveProvider, h as VertexChatProvider, n as loadApiProvider, o as resolveProviderConfigs, r as loadApiProviders, t as getProviderIds } from "./providers-DVYRZP4E.js";
22
+ import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-CmnQ0G2m.js";
23
+ import { t as ellipsize } from "./text-DwYK5EBn.js";
24
+ import { t as telemetry } from "./telemetry-DwX9XUN5.js";
25
+ import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-CRSXQ2Ke.js";
26
+ import { t as getBlobByHash } from "./blobs-Dwef1Ao1.js";
27
+ import { t as getProcessShim } from "./processShim-BcGzU8fY.js";
28
+ import { n as loadFromPackage, t as isPackagePath } from "./packageParser-CgE-ziRo.js";
29
+ import { n as runRuby } from "./rubyUtils-CnlW8AYb.js";
30
+ import { n as materializeInputVariablesWithMetadata, t as buildPromptInputDescriptions } from "./inputVariables-DUGMb9Ka.js";
31
+ import { a as extractPromptFromTags, c as isBasicRefusal, i as extractMaterializedVariablesFromJsonWithMetadata, n as extractGoalFromPrompt, o as getSessionId, r as extractInputVarsFromPrompt, s as getShortPluginId } from "./util-DNl96nNs.js";
32
+ import { n as PromptfooHarmfulCompletionProvider } from "./promptfoo-Cm4hiy1Y.js";
33
+ import { $ as readProviderPromptMap, A as ExcessiveAgencyPlugin, At as withProviderCallExecutionContext, B as retryWithDeduplication, C as PlinyPlugin, Ct as processFileReference, D as ImitationPlugin, Dt as getAndCheckProvider, E as IntentPlugin, Et as callProviderWithContext, F as BeavertailsPlugin, G as matchesFactuality, H as fetchHuggingFaceDataset, I as AegisPlugin, J as matchesPiScore, K as matchesGEval, L as RedteamGraderBase, M as DebugAccessPlugin, N as CrossSessionLeakPlugin, O as HarmbenchPlugin, Ot as getGradingProvider, P as ContractPlugin, Q as readPrompts, R as RedteamPluginBase, S as makeInlinePolicyIdSync, St as loadFromJavaScriptFile, T as OverreliancePlugin, Tt as DEFAULT_ANTHROPIC_MODEL, U as isGraderFailure, V as sampleArray, W as matchesClosedQa, X as doRemoteGrading, Y as matchesTrajectoryGoalSuccess, Z as processPrompts, _ as PromptExtractionPlugin, _t as normalizeMatcherTokenUsage, a as VLGuardPlugin, at as CONTEXT_FAITHFULNESS_NLI_STATEMENTS, b as determinePolicyTypeFromId, bt as coerceString, c as ToxicChatPlugin, ct as CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN, d as TeenSafetyDangerousRoleplayPlugin, dt as loadRubricPrompt, et as DEFAULT_WEB_SEARCH_PROMPT, f as TeenSafetyDangerousContentPlugin, ft as renderLlmRubricPrompt, g as RbacPlugin, gt as fail, h as ShellInjectionPlugin, ht as euclideanDistance, i as VLSUPlugin, it as CONTEXT_FAITHFULNESS_LONGFORM, j as DivergentRepetitionPlugin, k as HallucinationPlugin, kt as getProviderCallExecutionContext, l as ToolDiscoveryPlugin, lt as CONTEXT_RELEVANCE, m as SqlInjectionPlugin, mt as dotProduct, n as getGraderById, nt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, o as UnverifiableClaimsPlugin, ot as CONTEXT_RECALL, p as TeenSafetyAgeRestrictedGoodsAndServicesPlugin, pt as cosineSimilarity, q as matchesLlmRubric, rt as ANSWER_RELEVANCY_GENERATE, s as UnsafeBenchPlugin, st as CONTEXT_RECALL_ATTRIBUTED_TOKEN, t as GRADERS, tt as SELECT_BEST_PROMPT, u as TeenSafetyHarmfulBodyIdealsPlugin, ut as 
CONTEXT_RELEVANCE_BAD, v as PoliticsPlugin, vt as splitIntoSentences, w as getPiiLeakTestsForCategory, wt as getDefaultProviders, x as isValidPolicyObject, xt as getFinalTest, y as PolicyPlugin, yt as tryParse, z as getCustomPolicies } from "./graders-BX0f2tvS.js";
34
+ import { f as redteamProviderManager, g as createProviderRateLimitOptions, h as createRateLimitRegistry, m as TokenUsageTracker } from "./shared-D6IjElRI.js";
35
+ import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-BGY69tk_.js";
36
+ import { a as getTransformLabel, i as getTransformErrorMessage, n as TRANSFORM_KEYS, o as transform, r as TransformInputType, t as INLINE_FUNCTION_LABEL } from "./transform-CGt7Kt3y2.js";
37
+ import { t as getTraceStore } from "./store-HpopRVzl.js";
38
+ import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-CxRtnaHl.js";
39
+ import { i as throwIfTargetPromptExceedsMaxChars, n as getGeneratedPromptOverLimit, r as getMaxCharsPerMessageModifierValue, t as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY } from "./promptLength-B9nZEfO6.js";
40
+ import { n as checkExfilTracking } from "./indirectWebPwn-CbjUG0rh.js";
41
+ import { n as getFirstStringAttribute, r as getToolNameFromAttributes, t as TOOL_ARGUMENT_ATTRIBUTE_KEYS } from "./toolAttributes-DJ9ZEKXD.js";
42
+ import { i as filterFiniteScores, n as renderPrompt, r as runExtensionHook, t as collectFileMetadata } from "./evaluatorHelpers-DuqFFfq7.js";
43
+ import { r as sanitizeProvider, t as EvalResult } from "./evalResult-2RRJvFyB.js";
44
+ import { i as pluginMatchesStrategyTargets, n as loadStrategy, r as validateStrategies, t as Strategies } from "./strategies-CiSeroPH.js";
31
45
  import * as fs$2 from "fs";
32
46
  import fs, { createWriteStream } from "fs";
33
47
  import * as path$2 from "path";
@@ -35,29 +49,30 @@ import path, { parse } from "path";
35
49
  import async from "async";
36
50
  import yaml from "js-yaml";
37
51
  import { AsyncResource } from "node:async_hooks";
38
- import { resolve } from "node:path";
39
- import { fileURLToPath } from "node:url";
40
52
  import chalk from "chalk";
41
53
  import * as os$1 from "os";
42
54
  import os from "os";
43
- import util from "util";
44
55
  import dedent from "dedent";
45
- import * as fsPromises from "fs/promises";
46
- import { globSync } from "glob";
47
56
  import { z } from "zod";
48
- import { parse as parse$1 } from "csv-parse/sync";
49
- import { XMLParser } from "fast-xml-parser";
57
+ import * as fsPromises from "fs/promises";
58
+ import util from "util";
59
+ import input from "@inquirer/input";
60
+ import { resolve } from "node:path";
61
+ import { fileURLToPath } from "node:url";
50
62
  import crypto$1, { createHash, randomBytes } from "crypto";
51
63
  import { DiagConsoleLogger, DiagLogLevel, diag, propagation } from "@opentelemetry/api";
52
- import input from "@inquirer/input";
53
64
  import readline from "readline";
65
+ import { parse as parse$1 } from "csv-parse/sync";
66
+ import { globSync } from "glob";
67
+ import { XMLParser } from "fast-xml-parser";
54
68
  import { and, desc, eq, inArray, sql } from "drizzle-orm";
55
- import cliProgress from "cli-progress";
56
69
  import { URL } from "url";
57
- import { JSDOM } from "jsdom";
70
+ import { parse as parse$2 } from "parse5";
58
71
  import { distance } from "fastest-levenshtein";
72
+ import cliProgress from "cli-progress";
59
73
  import * as rouge from "js-rouge";
60
74
  import { isDeepStrictEqual } from "node:util";
75
+ import { LRUCache } from "lru-cache";
61
76
  import "debounce";
62
77
  import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
63
78
  import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
@@ -242,6 +257,505 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
242
257
  };
243
258
  };
244
259
  //#endregion
260
+ //#region src/matchers/classification.ts
261
/**
 * Classifies `output` with the configured classification provider and compares
 * the resulting score against `threshold`.
 *
 * @param expected Classification label to look up. When undefined, the highest
 *   score among all returned labels is used instead.
 * @param output Text handed to the provider's `callClassificationApi`.
 * @param threshold Minimum score required to pass. The comparison tolerates a
 *   difference of `Number.EPSILON`.
 * @param grading Optional grading config; `grading.provider` is forwarded to
 *   `getAndCheckProvider`.
 * @returns `{ pass, score, reason }`, or the result of `fail(...)` when the
 *   provider response carries no `classification`.
 */
async function matchesClassification(expected, output, threshold, grading) {
  const classifier = await getAndCheckProvider("classification", grading?.provider, null, "classification check");
  const resp = await classifier.callClassificationApi(output);
  if (!resp.classification) {
    return fail(resp.error || "Unknown error fetching classification");
  }

  const matchAnyLabel = expected === void 0;
  let score;
  if (matchAnyLabel) {
    const allScores = Object.values(resp.classification);
    if (allScores.length === 0) {
      return {
        pass: false,
        score: 0,
        reason: "No classification scores returned"
      };
    }
    score = Math.max(...allScores);
  } else {
    // A label missing from the response counts as a score of 0.
    score = resp.classification[expected] || 0;
  }

  const pass = score >= threshold - Number.EPSILON;
  let reason;
  if (pass) {
    reason = matchAnyLabel
      ? `Maximum classification score ${score.toFixed(2)} >= ${threshold}`
      : `Classification ${expected} has score ${score.toFixed(2)} >= ${threshold}`;
  } else {
    reason = matchAnyLabel
      ? `Maximum classification score ${score.toFixed(2)} < ${threshold}`
      : `Classification ${expected} has score ${score.toFixed(2)} < ${threshold}`;
  }
  return {
    pass,
    score,
    reason
  };
}
296
+ //#endregion
297
+ //#region src/matchers/comparison.ts
298
/**
 * Asks a text grading provider which of several outputs best satisfies
 * `criteria`, and returns one grading result per output.
 *
 * The first integer found in the provider's reply is taken as the index of the
 * winning output. If the call fails, returns no output, or the index is
 * missing or out of range, every output receives a `fail(...)` result.
 *
 * @param criteria Selection criteria, passed to the rubric prompt and echoed in each reason.
 * @param outputs Candidate outputs (at least two); each is passed through `tryParse` for the prompt vars.
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param vars Optional extra variables merged into the prompt vars.
 * @param providerCallContext Forwarded unchanged to `callProviderWithContext`.
 * @returns Array with one result per output, in the same order as `outputs`.
 */
async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
  invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");

  // Built twice on purpose: the render step and the provider call each get
  // their own freshly constructed vars object.
  const buildPromptVars = () => ({
    criteria,
    outputs: outputs.map((o) => tryParse(o)),
    ...vars || {}
  });

  const defaultProviders = await getDefaultProviders();
  const judge = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "select-best check");
  const rubricTemplate = await loadRubricPrompt(grading?.rubricPrompt, SELECT_BEST_PROMPT);
  const renderedPrompt = await renderLlmRubricPrompt(rubricTemplate, buildPromptVars());
  const resp = await callProviderWithContext(judge, renderedPrompt, "select-best", buildPromptVars(), providerCallContext);

  const failEveryOutput = (reason) => Array.from({ length: outputs.length }, () => fail(reason, resp.tokenUsage));

  if (resp.error || !resp.output) {
    return failEveryOutput(resp.error || "No output");
  }
  invariant(typeof resp.output === "string", "select-best produced malformed response");

  const digits = /\d+/.exec(resp.output.trim());
  const verdict = digits ? Number.parseInt(digits[0], 10) : NaN;
  const verdictInRange = !Number.isNaN(verdict) && verdict >= 0 && verdict < outputs.length;
  if (!verdictInRange) {
    return failEveryOutput(`Invalid select-best verdict: ${verdict}`);
  }

  const tokensUsed = normalizeMatcherTokenUsage(resp.tokenUsage);
  return outputs.map((_output, index) => {
    const chosen = index === verdict;
    return {
      pass: chosen,
      score: chosen ? 1 : 0,
      reason: chosen ? `Output selected as the best: ${criteria}` : `Output not selected: ${criteria}`,
      tokensUsed
    };
  });
}
330
/**
 * Picks the output whose other assertions produced the highest aggregate
 * score and returns one grading result per output.
 *
 * For every entry in `resultsWithGradingResults`, the component results of its
 * grading result (excluding `max-score` and `select-best` assertions) are
 * combined into a weighted score. The first output with the strictly highest
 * score wins, provided the optional threshold is met.
 *
 * Options are read from `assertion.value` when it is an object:
 * - `method`: `"sum"` uses the weighted sum; anything else uses the weighted average.
 * - `weights`: map from assertion type to weight; types without an entry weigh 1.
 *   A `null`/`undefined` value is treated as "no overrides" (previously this
 *   threw a TypeError on the first weight lookup).
 * - `threshold`: when defined, the winning score must be >= this value.
 *
 * @param outputs Candidate outputs; at least two are required.
 * @param resultsWithGradingResults Per-output results carrying `gradingResult.componentResults`.
 * @param assertion The max-score assertion whose `value` holds the options.
 * @returns Array of `{ pass, score, reason, namedScores }`, one per result.
 * @throws Error when fewer than two outputs are given (via `invariant`) or when
 *   a result has no component results other than max-score/select-best.
 */
async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
  invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
  const value = assertion.value || {};
  const config = typeof value === "object" ? value : {};
  const options = {
    method: "method" in config ? config.method : "average",
    // Guard against an explicit null/undefined `weights` (e.g. an empty YAML key).
    weights: config.weights ?? {},
    threshold: "threshold" in config ? config.threshold : void 0
  };

  const scores = resultsWithGradingResults.map((result, index) => {
    const relevantResults = (result.gradingResult?.componentResults || []).filter(
      (r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best"
    );
    if (relevantResults.length === 0) {
      throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
    }

    let totalWeightedScore = 0;
    let totalWeight = 0;
    for (const componentResult of relevantResults) {
      const assertionType = componentResult.assertion?.type || "unknown";
      const configuredWeight = options.weights[assertionType];
      const weight = configuredWeight === void 0 ? 1 : configuredWeight;
      const score = componentResult.score || 0;
      totalWeightedScore += score * weight;
      totalWeight += weight;
    }

    let aggregateScore;
    if (options.method === "sum") {
      aggregateScore = totalWeightedScore;
    } else {
      // Avoid dividing by zero when all weights cancel out or are zero.
      aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
    }
    return {
      index,
      score: aggregateScore,
      componentCount: relevantResults.length,
      totalWeight
    };
  });

  // Strict comparison keeps the earliest output as the winner on ties.
  let maxScore = -Infinity;
  let winnerIndex = 0;
  for (let i = 0; i < scores.length; i++) {
    if (scores[i].score > maxScore) {
      maxScore = scores[i].score;
      winnerIndex = i;
    }
  }

  const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;
  return scores.map(({ index, score, componentCount, totalWeight }) => {
    const isWinner = index === winnerIndex && meetsThreshold;
    let reason;
    if (isWinner) {
      reason = `Selected as highest scoring output (score: ${score.toFixed(3)})`;
    } else if (score === maxScore && !meetsThreshold) {
      reason = `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}`;
    } else {
      reason = `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`;
    }
    return {
      pass: isWinner,
      score: isWinner ? 1 : 0,
      reason,
      namedScores: {
        maxScore: score,
        assertionCount: componentCount,
        totalWeight
      }
    };
  });
}
381
+ //#endregion
382
+ //#region src/matchers/moderation.ts
383
/**
 * Runs a moderation provider over an assistant response and passes when no
 * relevant flag is raised.
 *
 * An empty/falsy `assistantResponse` passes immediately without contacting any
 * provider. When `OPENAI_API_KEY` is unset and either `REPLICATE_API_KEY` or
 * `REPLICATE_API_TOKEN` is set, `LLAMA_GUARD_REPLICATE_PROVIDER` is loaded as
 * the default provider; otherwise the default providers' `moderationProvider`
 * is used. `grading.provider` is forwarded to `getAndCheckProvider`.
 *
 * @param params.userPrompt Prompt passed to `callModerationApi`.
 * @param params.assistantResponse Response text to moderate.
 * @param params.categories Flag codes to consider; an empty list means all flags count.
 * @param grading Optional grading config.
 * @returns `{ pass, score, reason }` with score 1 on pass and 0 on failure.
 */
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
  const verdict = (pass, reason) => ({
    pass,
    score: pass ? 1 : 0,
    reason
  });

  if (!assistantResponse) {
    return verdict(true, "No output to moderate");
  }

  const defaultProviders = await getDefaultProviders();
  let defaultModerationProvider;
  if (!getEnvString("OPENAI_API_KEY") && (getEnvString("REPLICATE_API_KEY") || getEnvString("REPLICATE_API_TOKEN"))) {
    defaultModerationProvider = await loadApiProvider(LLAMA_GUARD_REPLICATE_PROVIDER);
  } else {
    defaultModerationProvider = defaultProviders.moderationProvider;
  }

  const moderationProvider = await getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
  invariant(moderationProvider, "Moderation provider must be defined");

  const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
  if (resp.error) {
    return verdict(false, `Moderation API error: ${resp.error}`);
  }

  const { flags } = resp;
  if (!flags || flags.length === 0) {
    return verdict(true, "No moderation flags detected");
  }

  const relevantFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
  if (relevantFlags.length > 0) {
    const descriptions = relevantFlags.map((flag) => flag.description).join(", ");
    return verdict(false, `Moderation flags detected: ${descriptions}`);
  }
  return verdict(true, "No relevant moderation flags detected");
}
417
+ //#endregion
418
+ //#region src/assertions/contextUtils.ts
419
/**
 * Determines the context used by context-based assertions.
 *
 * Resolution order:
 * 1. `test.vars.context` when truthy: a string, or an array whose entries must all be strings.
 * 2. Otherwise `fallbackContext` when truthy.
 * 3. If `assertion.contextTransform` is set, its result replaces whatever was
 *    chosen above. The transform receives `providerResponse.providerTransformedOutput`
 *    when that is not nullish, else `output`.
 *
 * @param assertion - Assertion configuration; `contextTransform` is optional
 * @param test - Test case whose `vars` may hold a `context` entry
 * @param output - Provider output used as transform input when no provider-transformed output exists
 * @param prompt - Prompt text, passed to the transform as `prompt.label`
 * @param fallbackContext - Optional context used when the test has no `context` variable
 * @param providerResponse - Optional provider response; its `metadata` is forwarded to the transform when truthy
 * @returns A non-empty string, or a non-empty array of non-empty strings
 * @throws Error if the context variable contains non-strings, the transform
 *   fails or returns an unsupported type, or no usable context was found
 */
async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
  const isString = (candidate) => typeof candidate === "string";
  const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;

  let contextValue;
  const varsContext = test.vars?.context;
  if (varsContext) {
    if (isString(varsContext)) {
      contextValue = varsContext;
    } else if (Array.isArray(varsContext)) {
      const badIndex = varsContext.findIndex((entry) => !isString(entry));
      if (badIndex !== -1) {
        invariant(false, `Invalid context: expected an array of strings, but found ${typeof varsContext[badIndex]} at index ${badIndex}`);
      }
      contextValue = varsContext;
    }
  } else if (fallbackContext) {
    contextValue = fallbackContext;
  }

  if (assertion.contextTransform) {
    // Resolved lazily: the label is only needed when building an error message.
    const describeTransform = () => getTransformLabel(assertion.contextTransform);
    try {
      const transformInput = providerResponse?.providerTransformedOutput ?? output;
      const transformOptions = {
        vars: test.vars,
        prompt: { label: prompt }
      };
      if (providerResponse && providerResponse.metadata) {
        transformOptions.metadata = providerResponse.metadata;
      }
      const transformed = await transform(assertion.contextTransform, transformInput, transformOptions);
      // Kept inside the try block so a type failure is reported as a transform failure.
      const hasSupportedType = isString(transformed) || Array.isArray(transformed) && transformed.every(isString);
      invariant(hasSupportedType, () => `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${describeTransform()}`);
      contextValue = transformed;
    } catch (error) {
      throw new Error(`Failed to transform context using expression '${describeTransform()}': ${getTransformErrorMessage(error)}`);
    }
  }

  const isUsableContext = isNonEmptyString(contextValue)
    || Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every(isNonEmptyString);
  invariant(isUsableContext, "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
  return contextValue;
}
464
/**
 * Flattens a context value into one string suitable for prompt rendering.
 *
 * @param context - A single context string or an array of context chunks
 * @returns The input unchanged when it is not an array; otherwise the chunks
 *   joined with a blank line ("\n\n") between them
 */
function serializeContext(context) {
	if (!Array.isArray(context)) return context;
	return context.join("\n\n");
}
471
+ //#endregion
472
+ //#region src/matchers/rag.ts
473
/**
 * Grades answer relevance by asking the text provider three times to produce a
 * question from the output, embedding each generated question, and averaging
 * the cosine similarity between those embeddings and the embedding of `input`.
 *
 * @param input - Original input text; embedded once and compared to each question
 * @param output - Provider output; passed through `tryParse` and rendered as `answer`
 * @param threshold - Minimum average similarity required to pass
 * @param grading - Optional grading config (`provider`, `rubricPrompt`)
 * @param providerCallContext - Forwarded to `callProviderWithContext`
 * @returns A result with `pass`, `score`, `reason`, `tokensUsed`, `metadata`,
 *   or the value of `fail(...)` when a provider call errors or returns nothing
 */
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
	const defaults = await getDefaultProviders();
	const embeddingProvider = await getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "answer relevancy check");
	const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "answer relevancy check");
	const tokensUsed = normalizeMatcherTokenUsage(void 0);
	const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, ANSWER_RELEVANCY_GENERATE);
	const parsedOutput = tryParse(output);
	const promptText = await renderLlmRubricPrompt(rubricPrompt, { answer: parsedOutput });

	// Step 1: collect three generated questions, one provider call at a time.
	const candidateQuestions = [];
	while (candidateQuestions.length < 3) {
		const generation = await callProviderWithContext(textProvider, promptText, "answer-relevance", { answer: parsedOutput }, providerCallContext);
		accumulateTokenUsage(tokensUsed, generation.tokenUsage);
		if (generation.error || !generation.output) return fail(generation.error || "No output", tokensUsed);
		invariant(typeof generation.output === "string", "answer relevancy check produced malformed response");
		candidateQuestions.push(generation.output);
	}

	// Step 2: embed the original input.
	invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id()} must implement callEmbeddingApi for similarity check`);
	const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
	accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
	if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) return fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
	const inputEmbedding = inputEmbeddingResp.embedding;

	// Step 3: embed each question and score it against the input embedding.
	const questionsWithScores = [];
	let similaritySum = 0;
	for (const question of candidateQuestions) {
		const embeddingResp = await embeddingProvider.callEmbeddingApi(question);
		accumulateTokenUsage(tokensUsed, embeddingResp.tokenUsage);
		if (embeddingResp.error || !embeddingResp.embedding) return fail(embeddingResp.error || "No embedding", tokensUsed);
		const questionSimilarity = cosineSimilarity(inputEmbedding, embeddingResp.embedding);
		similaritySum += questionSimilarity;
		questionsWithScores.push({
			question,
			similarity: questionSimilarity
		});
	}

	const similarity = similaritySum / questionsWithScores.length;
	const pass = similarity >= threshold - Number.EPSILON;
	const reason = pass
		? `Relevance ${similarity.toFixed(2)} is greater than threshold ${threshold}`
		: `Relevance ${similarity.toFixed(2)} is less than threshold ${threshold}`;
	return {
		pass,
		score: similarity,
		reason,
		tokensUsed,
		metadata: {
			generatedQuestions: questionsWithScores,
			averageSimilarity: similarity,
			threshold
		}
	};
}
531
/**
 * Grades context recall: the text provider is asked to classify statements of
 * the ground truth, and the score is the share of classified sentences in its
 * reply that carry the "attributed" token without the "not attributed" token.
 *
 * @param context - Context string or array of chunks (serialized before use)
 * @param groundTruth - Expected answer rendered into the rubric prompt
 * @param threshold - Minimum recall score required to pass
 * @param grading - Optional grading config (`provider`, `rubricPrompt`)
 * @param vars - Optional extra variables spread into the prompt variables
 * @param providerCallContext - Forwarded to `callProviderWithContext`
 * @returns A result with `pass`, `score`, `reason`, `tokensUsed`, `metadata`,
 *   or the value of `fail(...)` when the provider errors or returns nothing
 */
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
	const defaults = await getDefaultProviders();
	const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "context recall check");
	const contextString = serializeContext(context);
	// A fresh object is built for each consumer, as two separate literals would be.
	const buildPromptVars = () => ({
		context: contextString,
		groundTruth,
		...vars || {}
	});
	const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RECALL);
	const promptText = await renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
	const resp = await callProviderWithContext(textProvider, promptText, "context-recall", buildPromptVars(), providerCallContext);
	if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
	invariant(typeof resp.output === "string", "context-recall produced malformed response");

	const attributedMarker = CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
	const notAttributedMarker = CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
	const sentenceAttributions = [];
	let classifiedCount = 0;
	let attributedCount = 0;
	for (const line of splitIntoSentences(resp.output)) {
		const lowered = line.toLowerCase();
		const mentionsAttributed = lowered.includes(attributedMarker);
		const mentionsNotAttributed = lowered.includes(notAttributedMarker);
		// Lines carrying neither token are not classifications; ignore them.
		if (!mentionsAttributed && !mentionsNotAttributed) continue;
		classifiedCount++;
		// The negative token wins when both tokens appear in one line.
		const attributed = mentionsAttributed && !mentionsNotAttributed;
		if (attributed) attributedCount++;
		// Prefer the text after a leading "N." marker; otherwise keep the text before the first period.
		const numbered = line.match(/^\d+\.\s*([^\.]+\.)/);
		const cleaned = numbered ? numbered[1].trim() : line.split(".")[0].trim();
		sentenceAttributions.push({
			sentence: cleaned,
			attributed
		});
	}

	const score = classifiedCount > 0 ? attributedCount / classifiedCount : 0;
	const pass = score >= threshold - Number.EPSILON;
	const reason = pass ? `Recall ${score.toFixed(2)} is >= ${threshold}` : `Recall ${score.toFixed(2)} is < ${threshold}`;
	return {
		pass,
		score,
		reason,
		tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
		metadata: {
			sentenceAttributions,
			totalSentences: classifiedCount,
			attributedSentences: attributedCount,
			score
		}
	};
}
580
/**
 * Grades context relevance: the text provider extracts the sentences relevant
 * to the question, and the score is the number of unique extracted sentences
 * (capped at the number of context units) divided by the number of context units.
 *
 * Context units are the non-blank chunks when `context` is an array, or the
 * sentences of `context` when it is a string.
 *
 * @param question - Query rendered into the rubric prompt as `query`
 * @param context - Context string or array of chunks
 * @param threshold - Minimum relevance score required to pass
 * @param grading - Optional grading config (`provider`, `rubricPrompt`)
 * @param providerCallContext - Forwarded to `callProviderWithContext`
 * @returns A result with `pass`, `score`, `reason`, `tokensUsed`, `metadata`,
 *   or the value of `fail(...)` when the provider errors or returns nothing
 */
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
	const defaults = await getDefaultProviders();
	const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "context relevance check");
	const contextString = serializeContext(context);
	const buildPromptVars = () => ({
		context: contextString,
		query: question
	});
	const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RELEVANCE);
	const promptText = await renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
	const resp = await callProviderWithContext(textProvider, promptText, "context-relevance", buildPromptVars(), providerCallContext);
	if (resp.error || !resp.output) return fail(resp.error || "No output", resp.tokenUsage);
	invariant(typeof resp.output === "string", "context-relevance produced malformed response");

	let contextUnits;
	if (Array.isArray(context)) contextUnits = context.filter((chunk) => chunk.trim().length > 0);
	else contextUnits = splitIntoSentences(context);
	const totalContextUnits = contextUnits.length;

	const extractedSentences = splitIntoSentences(resp.output);
	// When the reply contains the "bad" marker, nothing counts as relevant.
	const insufficientInformation = resp.output.includes(CONTEXT_RELEVANCE_BAD);
	const relevantSentences = insufficientInformation ? [] : [...new Set(extractedSentences)];
	const numerator = insufficientInformation ? 0 : Math.min(relevantSentences.length, totalContextUnits);

	const score = totalContextUnits > 0 ? numerator / totalContextUnits : 0;
	const pass = score >= threshold - Number.EPSILON;
	const reason = pass ? `Context relevance ${score.toFixed(2)} is >= ${threshold}` : `Context relevance ${score.toFixed(2)} is < ${threshold}`;
	return {
		pass,
		score,
		reason,
		tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
		metadata: {
			extractedSentences: relevantSentences,
			totalContextUnits,
			totalContextSentences: totalContextUnits,
			contextUnits,
			relevantSentenceCount: numerator,
			insufficientInformation,
			score
		}
	};
}
623
/**
 * Grades faithfulness of an answer to its context in two provider calls:
 * the first renders the long-form prompt from the question and answer and its
 * reply is split into statements; the second renders the NLI prompt from the
 * context and those statements and its reply is parsed for yes/no verdicts.
 *
 * @param query - Question rendered into the long-form prompt
 * @param output - Provider output; passed through `tryParse` and rendered as `answer`
 * @param context - Context string or array of chunks (serialized before use)
 * @param threshold - Minimum faithfulness score required to pass
 * @param grading - Optional grading config; `rubricPrompt`, when set, must be an
 *   array whose first two entries override the long-form and NLI prompts
 * @param vars - Optional extra variables spread into both prompts
 * @param providerCallContext - Forwarded to `callProviderWithContext`
 * @returns A result with `pass`, `score` (clamped to [0, 1]), `reason` and
 *   `tokensUsed`, or the value of `fail(...)` when a provider call errors
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
	const defaults = await getDefaultProviders();
	const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "faithfulness check");
	const tokensUsed = normalizeMatcherTokenUsage(void 0);
	if (grading?.rubricPrompt) invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");

	// Each rubric entry may be a plain string or an object exposing `content`.
	const promptTextOf = (entry) => typeof entry === "string" ? entry : entry?.content;
	const rawLongformPrompt = promptTextOf(grading?.rubricPrompt?.[0]);
	const rawNliPrompt = promptTextOf(grading?.rubricPrompt?.[1]);
	const longformPrompt = await loadRubricPrompt(rawLongformPrompt, CONTEXT_FAITHFULNESS_LONGFORM);
	const nliPrompt = await loadRubricPrompt(rawNliPrompt, CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

	// Pass 1: generate the statements.
	const longformText = await renderLlmRubricPrompt(longformPrompt, {
		question: query,
		answer: tryParse(output),
		...vars || {}
	});
	const longformResp = await callProviderWithContext(textProvider, longformText, "context-faithfulness-longform", {
		question: query,
		answer: tryParse(output),
		...vars || {}
	}, providerCallContext);
	accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
	if (longformResp.error || !longformResp.output) return fail(longformResp.error || "No output", tokensUsed);
	invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

	// Pass 2: obtain a verdict for each statement against the context.
	const contextString = serializeContext(context);
	const statements = splitIntoSentences(longformResp.output);
	const nliText = await renderLlmRubricPrompt(nliPrompt, {
		context: contextString,
		statements,
		...vars || {}
	});
	const nliResp = await callProviderWithContext(textProvider, nliText, "context-faithfulness-nli", {
		context: contextString,
		statements,
		...vars || {}
	}, providerCallContext);
	accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
	if (nliResp.error || !nliResp.output) return fail(nliResp.error || "No output", tokensUsed);
	invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

	const summaryMarker = "Final verdict for each statement in order:".toLowerCase();
	const verdicts = nliResp.output.toLowerCase().trim();
	let score = 0;
	if (statements.length > 0) {
		const markerAt = verdicts.indexOf(summaryMarker);
		if (markerAt !== -1) {
			// Summary form: period-separated answers following the marker.
			const answers = verdicts.slice(markerAt + summaryMarker.length).split(".").filter((answer) => answer.trim() !== "");
			if (answers.length > 0) {
				const unsupported = answers.filter((answer) => !answer.includes("yes")).length;
				score = 1 - unsupported / statements.length;
			}
		} else {
			// Inline form: count "verdict: no" / "verdict: yes" occurrences.
			const noVerdictCount = verdicts.split("verdict: no").length - 1;
			const yesVerdictCount = verdicts.split("verdict: yes").length - 1;
			if (noVerdictCount + yesVerdictCount > 0) score = 1 - noVerdictCount / statements.length;
		}
	}
	score = Math.min(1, Math.max(0, score));
	const pass = score >= threshold - Number.EPSILON;
	const reason = pass ? `Faithfulness ${score.toFixed(2)} is >= ${threshold}` : `Faithfulness ${score.toFixed(2)} is < ${threshold}`;
	return {
		pass,
		score,
		reason,
		tokensUsed
	};
}
680
+ //#endregion
681
+ //#region src/matchers/similarity.ts
682
/**
 * Computes the raw metric value between two embeddings.
 *
 * @param expectedEmbedding - Embedding of the expected text
 * @param outputEmbedding - Embedding of the actual output
 * @param metric - One of "cosine", "dot_product" or "euclidean"
 * @param tokensUsed - Token usage attached to the failure result for unknown metrics
 * @returns The value returned by the matching metric helper, or the value of
 *   `fail(...)` when `metric` is not one of the three supported names
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
	if (metric === "cosine") return cosineSimilarity(expectedEmbedding, outputEmbedding);
	if (metric === "dot_product") return dotProduct(expectedEmbedding, outputEmbedding);
	if (metric === "euclidean") return euclideanDistance(expectedEmbedding, outputEmbedding);
	return fail(`Unsupported metric: ${metric}`, tokensUsed);
}
690
/**
 * Turns a raw metric value into a grading result.
 *
 * For "euclidean" the value is a distance: it passes when it is at most the
 * threshold (or at least the threshold when inverted), and the score is
 * 1 / (1 + distance), flipped to 1 - that value when inverted.
 * For every other metric the value is a similarity: it passes when it is at
 * least the threshold (or at most the threshold when inverted), and the score
 * is the similarity, flipped to 1 - similarity when inverted.
 * Comparisons allow a Number.EPSILON tolerance.
 *
 * @param similarity - Raw metric value (a distance for "euclidean")
 * @param threshold - Threshold to compare against
 * @param inverse - Truthy to invert the assertion
 * @param metric - Metric name; only "euclidean" is special-cased
 * @param tokensUsed - Token usage passed through unchanged
 * @returns `{ pass, score, reason, tokensUsed }`
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
	const inverted = Boolean(inverse);

	if (metric !== "euclidean") {
		const pass = inverted ? similarity <= threshold + Number.EPSILON : similarity >= threshold - Number.EPSILON;
		// The "greater" wording applies exactly when the outcome and the inversion flag disagree.
		const reason = pass !== inverted
			? `Similarity ${similarity.toFixed(2)} is greater than or equal to threshold ${threshold}`
			: `Similarity ${similarity.toFixed(2)} is less than threshold ${threshold}`;
		return {
			pass,
			score: inverted ? 1 - similarity : similarity,
			reason,
			tokensUsed
		};
	}

	const distance = similarity;
	const pass = inverted ? distance >= threshold - Number.EPSILON : distance <= threshold + Number.EPSILON;
	const normalizedScore = 1 / (1 + distance);
	// The "less than or equal" wording applies exactly when the outcome and the inversion flag disagree.
	const reason = pass !== inverted
		? `Distance ${distance.toFixed(2)} is less than or equal to threshold ${threshold}`
		: `Distance ${distance.toFixed(2)} is greater than threshold ${threshold}`;
	return {
		pass,
		score: inverted ? 1 - normalizedScore : normalizedScore,
		reason,
		tokensUsed
	};
}
716
/**
 * Obtains a metric value for `expected` vs `output` from a provider.
 *
 * For the "cosine" metric a provider exposing `callSimilarityApi` is asked
 * directly. Otherwise both texts are embedded concurrently through
 * `callEmbeddingApi` and the metric is computed locally.
 *
 * @param finalProvider - Provider exposing `callSimilarityApi` and/or `callEmbeddingApi`
 * @param expected - Expected text
 * @param output - Actual output text
 * @param metric - Metric name forwarded to `calculateSimilarityScore`
 * @param tokensUsed - Accumulator updated in place with the provider's token usage
 * @returns A number on success, or the value of `fail(...)` on a provider error
 * @throws Error when the provider implements neither API
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
	const hasSimilarityApi = "callSimilarityApi" in finalProvider;

	if (metric === "cosine" && hasSimilarityApi) {
		const similarityResp = await finalProvider.callSimilarityApi(expected, output);
		accumulateTokenUsage(tokensUsed, similarityResp.tokenUsage);
		const { error, similarity } = similarityResp;
		if (error) return fail(error, tokensUsed);
		if (similarity == null) return fail("Unknown error fetching similarity", tokensUsed);
		if (!Number.isFinite(similarity)) return fail(`Invalid similarity score: ${similarity}`, tokensUsed);
		return similarity;
	}

	const callEmbeddingApi = "callEmbeddingApi" in finalProvider ? finalProvider.callEmbeddingApi : void 0;
	if (typeof callEmbeddingApi !== "function") {
		// A similarity-only provider cannot serve the non-cosine metrics.
		if (hasSimilarityApi) return fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
		throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
	}

	// The method is invoked with `.call` so the provider remains its `this`.
	const embeddingRequests = [expected, output].map((text) => callEmbeddingApi.call(finalProvider, text));
	const [expectedEmbedding, outputEmbedding] = await Promise.all(embeddingRequests);

	// Usage from both embedding calls is merged first, then added to the caller's total.
	const mergedUsage = normalizeMatcherTokenUsage(void 0);
	for (const response of [expectedEmbedding, outputEmbedding]) accumulateTokenUsage(mergedUsage, response.tokenUsage);
	accumulateTokenUsage(tokensUsed, mergedUsage);

	const embeddingError = expectedEmbedding.error || outputEmbedding.error;
	if (embeddingError) return fail(embeddingError || "Unknown error fetching embeddings", tokensUsed);
	if (!expectedEmbedding.embedding || !outputEmbedding.embedding) return fail("Embedding not found", tokensUsed);
	return calculateSimilarityScore(expectedEmbedding.embedding, outputEmbedding.embedding, metric, tokensUsed);
}
739
/**
 * Grades how similar `output` is to `expected`.
 *
 * Remote grading is attempted only when the metric is "cosine", a redteam
 * config is present on `state.config`, and `shouldGenerateRemote` agrees; a
 * failure of that remote call is reported as a failed result. In every other
 * case the embedding provider is resolved and the metric is computed through it.
 *
 * @param expected - Expected text
 * @param output - Actual output text
 * @param threshold - Threshold to compare the metric value against
 * @param inverse - Invert the assertion (defaults to false)
 * @param grading - Optional grading config (`provider`)
 * @param metric - Metric name (defaults to "cosine")
 * @returns A grading result, or the failure result produced along the way
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
	const useRemoteGrading = metric === "cosine" && state.config?.redteam && shouldGenerateRemote({ requireEmbeddingProvider: true });
	if (useRemoteGrading) {
		try {
			return await doRemoteGrading({
				task: "similar",
				expected,
				output,
				threshold,
				inverse
			});
		} catch (error) {
			return fail(`Could not perform remote grading: ${error}`);
		}
	}

	const { embeddingProvider: defaultEmbeddingProvider } = await getDefaultProviders();
	const finalProvider = await getAndCheckProvider("embedding", grading?.provider, defaultEmbeddingProvider, "similarity check");
	const tokensUsed = normalizeMatcherTokenUsage(void 0);
	const similarity = await calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed);
	// Anything other than a number is an already-built failure result.
	return typeof similarity === "number" ? buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) : similarity;
}
758
+ //#endregion
245
759
  //#region src/tracing/evaluatorTracing.ts
246
760
  let otlpReceiverStarted = false;
247
761
  const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
@@ -285,7 +799,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
285
799
  telemetry.record("feature_used", { feature: "tracing" });
286
800
  try {
287
801
  logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
288
- const { startOTLPReceiver } = await import("./otlpReceiver-B2z58l4e.js");
802
+ const { startOTLPReceiver } = await import("./otlpReceiver-BXjcRqAM.js");
289
803
  const port = testSuite.tracing.otlp.http.port || 4318;
290
804
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
291
805
  const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
@@ -309,7 +823,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
309
823
  async function stopOtlpReceiverIfNeeded() {
310
824
  if (otlpReceiverStarted) try {
311
825
  logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
312
- const { stopOTLPReceiver } = await import("./otlpReceiver-B2z58l4e.js");
826
+ const { stopOTLPReceiver } = await import("./otlpReceiver-BXjcRqAM.js");
313
827
  await stopOTLPReceiver();
314
828
  otlpReceiverStarted = false;
315
829
  logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -344,7 +858,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
344
858
  }
345
859
  if (!tracingEnabled) return null;
346
860
  logger.debug("[EvaluatorTracing] Importing trace store");
347
- const { getTraceStore } = await import("./store-M0b1WfYb.js").then((n) => n.n);
861
+ const { getTraceStore } = await import("./store-HpopRVzl.js").then((n) => n.n);
348
862
  const traceStore = getTraceStore();
349
863
  const traceId = generateTraceId();
350
864
  const spanId = generateSpanId();
@@ -654,38 +1168,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
654
1168
  }
655
1169
  //#endregion
656
1170
  //#region src/assertions/contains.ts
1171
/**
 * Moves past any mix of whitespace and commas that separates two fields.
 *
 * Runs of commas are consumed as a unit, so repeated delimiters never yield
 * empty fields.
 *
 * @param value - The string being parsed
 * @param startIndex - Position to start scanning from
 * @returns Index of the first character that is neither whitespace nor a
 *   comma, or the position where scanning ran off the end of `value`
 */
function skipWhitespaceAndCommas(value, startIndex) {
	let cursor = startIndex;
	for (;;) {
		if (cursor >= value.length) return cursor;
		// Consume a run of whitespace.
		while (cursor < value.length && /\s/.test(value[cursor])) cursor++;
		// Anything but a comma (including end of input) ends the separator.
		if (value[cursor] !== ",") return cursor;
		cursor++;
	}
}
1186
/**
 * Moves past consecutive whitespace characters only; commas are left in place
 * for the caller to handle.
 *
 * @param value - The string being parsed
 * @param startIndex - Position to start scanning from
 * @returns Index of the first non-whitespace character at or after
 *   `startIndex`, or the position where scanning ran off the end of `value`
 */
function skipWhitespace(value, startIndex) {
	let cursor = startIndex;
	for (; cursor < value.length; cursor++) {
		if (!/\s/.test(value[cursor])) break;
	}
	return cursor;
}
1194
/**
 * Reads one double-quoted field whose opening quote sits at `startIndex`.
 *
 * Escapes recognised inside the quotes:
 *   - backslash followed by a quote or a backslash yields that second character
 *   - two consecutive quotes yield one quote
 * A lone quote closes the field. Any other character is copied as-is.
 *
 * @param value - The string being parsed
 * @param startIndex - Index of the opening quote
 * @returns `{ field, nextIndex }` where `nextIndex` is just past the closing quote
 * @throws Via `invariant` when the input ends before a closing quote is found
 */
function parseQuotedField(value, startIndex) {
	const pieces = [];
	let cursor = startIndex + 1;
	let closed = false;
	while (!closed && cursor < value.length) {
		const current = value[cursor];
		const hasFollowing = cursor + 1 < value.length;
		const following = hasFollowing ? value[cursor + 1] : void 0;
		if (current === "\\" && hasFollowing && (following === "\"" || following === "\\")) {
			pieces.push(following);
			cursor += 2;
		} else if (current === "\"" && hasFollowing && following === "\"") {
			pieces.push("\"");
			cursor += 2;
		} else if (current === "\"") {
			cursor++;
			closed = true;
		} else {
			pieces.push(current);
			cursor++;
		}
	}
	invariant(closed, "Unterminated quoted field in contains assertion value");
	return {
		field: pieces.join(""),
		nextIndex: cursor
	};
}
1224
/**
 * Reads one unquoted field: everything from `startIndex` up to, but not
 * including, the next comma (or the end of the string).
 *
 * @param value - The string being parsed
 * @param startIndex - Position of the field's first character
 * @returns `{ field, nextIndex }` where `field` is trimmed and `nextIndex`
 *   points at the terminating comma or at the end of input
 */
function parseUnquotedField(value, startIndex) {
	const commaIndex = value.indexOf(",", startIndex);
	// Without a comma the field runs to the end; never move backwards from startIndex.
	const nextIndex = commaIndex === -1 ? Math.max(startIndex, value.length) : commaIndex;
	const field = value.substring(startIndex, nextIndex).trim();
	return {
		field,
		nextIndex
	};
}
1235
+ /**
1236
+ * Split a contains-any string into fields while preserving quoted commas.
1237
+ */
657
1238
  function parseCommaSeparatedValues(value) {
658
1239
  const results = [];
659
1240
  let i = 0;
660
1241
  while (i < value.length) {
661
- while (i < value.length && /\s/.test(value[i])) i++;
1242
+ i = skipWhitespaceAndCommas(value, i);
662
1243
  if (i >= value.length) break;
663
- if (value[i] === ",") {
664
- i++;
665
- continue;
666
- }
667
- if (value[i] === "\"") {
668
- i++;
669
- let field = "";
670
- while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
671
- field += value[i + 1];
672
- i += 2;
673
- } else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
674
- field += "\"";
675
- i += 2;
676
- } else if (value[i] === "\"") {
677
- i++;
678
- break;
679
- } else {
680
- field += value[i];
681
- i++;
682
- }
683
- results.push(field);
684
- } else {
685
- const start = i;
686
- while (i < value.length && value[i] !== ",") i++;
687
- results.push(value.substring(start, i).trim());
688
- }
1244
+ const isQuotedField = value[i] === "\"";
1245
+ const parsed = isQuotedField ? parseQuotedField(value, i) : parseUnquotedField(value, i);
1246
+ results.push(parsed.field);
1247
+ i = isQuotedField ? skipWhitespace(value, parsed.nextIndex) : parsed.nextIndex;
1248
+ invariant(!isQuotedField || i >= value.length || value[i] === ",", "Expected comma after quoted field in contains assertion value");
689
1249
  }
690
1250
  return results;
691
1251
  }
@@ -930,27 +1490,67 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
930
1490
  };
931
1491
  //#endregion
932
1492
  //#region src/assertions/geval.ts
933
- const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
934
- invariant(typeof renderedValue === "string" || Array.isArray(renderedValue), "G-Eval assertion type must have a string or array of strings value");
1493
+ const handleGEval = async ({ assertion, inverse, renderedValue, prompt, outputString, test, providerCallContext }) => {
1494
+ invariant(typeof renderedValue === "string" || Array.isArray(renderedValue) && renderedValue.every((value) => typeof value === "string"), "G-Eval assertion type must have a string or array of strings value");
935
1495
  const threshold = assertion.threshold ?? .7;
936
1496
  if (Array.isArray(renderedValue)) {
937
- const scores = [];
938
- const reasons = [];
939
- for (const value of renderedValue) {
1497
+ if (renderedValue.length === 0) return {
1498
+ assertion,
1499
+ pass: false,
1500
+ score: 0,
1501
+ reason: "G-Eval assertion requires at least one criterion string in the value array."
1502
+ };
1503
+ const responses = [];
1504
+ let failure;
1505
+ for (const [index, value] of renderedValue.entries()) {
940
1506
  const resp = await matchesGEval(value, prompt || "", outputString, threshold, test.options, providerCallContext);
941
- scores.push(resp.score);
942
- reasons.push(resp.reason);
1507
+ responses.push(resp);
1508
+ if (isGraderFailure(resp)) {
1509
+ failure = {
1510
+ index,
1511
+ resp
1512
+ };
1513
+ break;
1514
+ }
1515
+ }
1516
+ const tokensUsed = createEmptyTokenUsage();
1517
+ for (const r of responses) accumulateTokenUsage(tokensUsed, r.tokensUsed);
1518
+ if (failure) {
1519
+ const criterion = renderedValue[failure.index];
1520
+ return {
1521
+ assertion,
1522
+ pass: false,
1523
+ score: 0,
1524
+ reason: `G-Eval criterion ${failure.index + 1}/${renderedValue.length} (${JSON.stringify(criterion)}) failed: ${failure.resp.reason}`,
1525
+ tokensUsed,
1526
+ metadata: failure.resp.metadata
1527
+ };
943
1528
  }
944
- const scoresSum = scores.reduce((a, b) => a + b, 0);
1529
+ const averageScore = responses.reduce((acc, r) => acc + r.score, 0) / responses.length;
1530
+ const combinedReason = responses.map((r) => r.reason).join("\n\n");
945
1531
  return {
946
1532
  assertion,
947
- pass: scoresSum / scores.length >= threshold,
948
- score: scoresSum / scores.length,
949
- reason: reasons.join("\n\n")
1533
+ pass: averageScore >= threshold !== inverse,
1534
+ score: inverse ? 1 - averageScore : averageScore,
1535
+ reason: combinedReason,
1536
+ tokensUsed
950
1537
  };
951
- } else return {
1538
+ }
1539
+ const resp = await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext);
1540
+ if (isGraderFailure(resp)) return {
1541
+ assertion,
1542
+ pass: false,
1543
+ score: 0,
1544
+ reason: resp.reason,
1545
+ tokensUsed: resp.tokensUsed,
1546
+ metadata: resp.metadata
1547
+ };
1548
+ const passed = resp.score >= threshold !== inverse;
1549
+ return {
952
1550
  assertion,
953
- ...await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext)
1551
+ ...resp,
1552
+ pass: passed,
1553
+ score: inverse ? 1 - resp.score : resp.score
954
1554
  };
955
1555
  };
956
1556
  //#endregion
@@ -1090,6 +1690,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
1090
1690
  };
1091
1691
  //#endregion
1092
1692
  //#region src/assertions/html.ts
1693
/**
 * Patterns, keyed by tag name, that detect a literal opening wrapper tag in
 * lowercased input. The lookahead requires whitespace, ">" or "/" right after
 * the tag name, so longer names such as "<header" do not match "<head".
 */
const LITERAL_WRAPPER_PATTERNS = { html: /<html(?=[\s>/])/, head: /<head(?=[\s>/])/, body: /<body(?=[\s>/])/ };
1698
/**
 * Tells whether a tag name is one of the document wrapper tags.
 *
 * @param tagName - Tag name to test; compared exactly, with no case folding
 * @returns true for "html", "head" or "body"
 */
function isWrapperTagName(tagName) {
	switch (tagName) {
		case "html":
		case "head":
		case "body": return true;
		default: return false;
	}
}
1701
/**
 * Tells whether a parsed node is a text node.
 *
 * @param node - Node to inspect; its `nodeName` is read
 * @returns true when `nodeName` is exactly "#text"
 */
function isTextNode(node) {
	const { nodeName } = node;
	return nodeName === "#text";
}
1704
/**
 * Tells whether a parsed node is an element, judged by the presence of a
 * `tagName` property (own or inherited), whatever its value.
 *
 * @param node - Node object to inspect
 * @returns true when the node has a `tagName` property
 */
function isElementNode(node) {
	const hasTagName = "tagName" in node;
	return hasTagName;
}
1707
/**
 * Tells whether an element carries source location information.
 *
 * @param element - Element object to inspect
 * @returns true when `sourceCodeLocation` exists and is neither null nor undefined
 */
function hasSourceCodeLocation(element) {
	if (!("sourceCodeLocation" in element)) return false;
	const location = element.sourceCodeLocation;
	return !(location === null || location === void 0);
}
1710
/**
 * Returns a node's children, tolerating nodes that have none.
 *
 * @param node - Node object to inspect
 * @returns `node.childNodes` when the property exists, otherwise a new empty array
 */
function getChildNodes(node) {
	if ("childNodes" in node) return node.childNodes;
	return [];
}
1713
/**
 * Searches a parsed tree in document (pre-)order for the first element
 * accepted by `predicate`, using an explicit stack rather than recursion.
 *
 * A node counts as an element when it has a `tagName` property; children are
 * read from `childNodes` when that property exists.
 *
 * @param root - Node to start from (it is itself a candidate)
 * @param predicate - Called with each element; return truthy to select it
 * @returns The first matching element, or undefined when none matches
 */
function findFirstElement(root, predicate) {
	const pending = [root];
	while (pending.length > 0) {
		const node = pending.pop();
		if ("tagName" in node && predicate(node)) return node;
		const children = "childNodes" in node ? node.childNodes : [];
		// Push right-to-left so the leftmost child is popped, and visited, first.
		let index = children.length;
		while (index-- > 0) pending.push(children[index]);
	}
}
1722
/**
 * Tells whether a node has a direct text child containing something other
 * than whitespace.
 *
 * @param parentNode - Node whose `childNodes` array is inspected
 * @returns true when any direct "#text" child has a non-blank `value`
 */
function hasTopLevelText(parentNode) {
	const isNonBlankText = (child) => {
		if (child.nodeName !== "#text") return false;
		return child.value.trim().length > 0;
	};
	return parentNode.childNodes.some(isNonBlankText);
}
1725
/**
 * Decides whether an element should count as HTML written in the input.
 *
 * Wrapper tags (html/head/body) count only when the lowercased input contains
 * the literal opening tag and the element has source location info —
 * presumably to exclude wrappers the parser adds implicitly (TODO confirm
 * against the parser's behavior). Any other tag counts when it is listed in
 * VALID_HTML_ELEMENTS or its name contains a hyphen.
 *
 * @param element - Parsed element; `tagName` is lowercased before checks
 * @param inputLowercase - Lowercased source text the element was parsed from
 * @returns true when the element qualifies under the rules above
 */
function isUserProvidedElement(element, inputLowercase) {
	const tagName = element.tagName.toLowerCase();
	if (!isWrapperTagName(tagName)) return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
	const wrapperPattern = LITERAL_WRAPPER_PATTERNS[tagName];
	return wrapperPattern.test(inputLowercase) && hasSourceCodeLocation(element);
}
1093
1730
  const HTML_PATTERNS = {
1094
1731
  openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
1095
1732
  closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
@@ -1245,37 +1882,21 @@ function validateHtml(htmlString) {
1245
1882
  isValid: false,
1246
1883
  reason: "Output appears to be XML, not HTML"
1247
1884
  };
1248
- try {
1249
- const { document } = new JSDOM(trimmed, { contentType: "text/html" }).window;
1250
- if (document.body && !trimmed.toLowerCase().includes("<body")) {
1251
- if (Array.from(document.body.childNodes).some((node) => node.nodeType === 3 && node.textContent?.trim())) return {
1252
- isValid: false,
1253
- reason: "Output must be wrapped in HTML tags"
1254
- };
1255
- }
1256
- const allElements = document.querySelectorAll("*");
1257
- if (!Array.from(allElements).find((element) => {
1258
- const tagName = element.tagName.toLowerCase();
1259
- if ([
1260
- "html",
1261
- "head",
1262
- "body"
1263
- ].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
1264
- return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
1265
- })) return {
1266
- isValid: false,
1267
- reason: "Output does not contain recognized HTML elements"
1268
- };
1269
- return {
1270
- isValid: true,
1271
- reason: "Output is valid HTML"
1272
- };
1273
- } catch (error) {
1274
- return {
1275
- isValid: false,
1276
- reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
1277
- };
1278
- }
1885
+ const document = parse$2(trimmed, { sourceCodeLocationInfo: true });
1886
+ const inputLowercase = trimmed.toLowerCase();
1887
+ const body = findFirstElement(document, (element) => element.tagName === "body");
1888
+ if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
1889
+ isValid: false,
1890
+ reason: "Output must be wrapped in HTML tags"
1891
+ };
1892
+ if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
1893
+ isValid: false,
1894
+ reason: "Output does not contain recognized HTML elements"
1895
+ };
1896
+ return {
1897
+ isValid: true,
1898
+ reason: "Output is valid HTML"
1899
+ };
1279
1900
  }
1280
1901
  const handleContainsHtml = ({ assertion, outputString, inverse }) => {
1281
1902
  const pass = containsHtml(outputString) !== inverse;
@@ -1932,45 +2553,6 @@ function matchesPattern(spanName, pattern) {
1932
2553
  }
1933
2554
  //#endregion
1934
2555
  //#region src/assertions/trajectoryUtils.ts
1935
- const TOOL_ATTRIBUTE_KEYS = [
1936
- "tool.name",
1937
- "tool_name",
1938
- "tool",
1939
- "function.name",
1940
- "function_name",
1941
- "gen_ai.tool.name",
1942
- "codex.mcp.tool",
1943
- "agent.tool",
1944
- "agent.tool_name",
1945
- "agent.toolName"
1946
- ];
1947
- const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
1948
- "tool.arguments",
1949
- "tool.args",
1950
- "tool.input",
1951
- "tool_arguments",
1952
- "tool_args",
1953
- "tool_input",
1954
- "function.arguments",
1955
- "function.args",
1956
- "function.input",
1957
- "function_arguments",
1958
- "function_args",
1959
- "gen_ai.tool.arguments",
1960
- "gen_ai.tool.args",
1961
- "gen_ai.tool.input",
1962
- "gen_ai.tool.call.arguments",
1963
- "gen_ai.tool.call.args",
1964
- "agent.tool.arguments",
1965
- "agent.tool.args",
1966
- "agent.tool.input",
1967
- "codex.mcp.arguments",
1968
- "codex.mcp.args",
1969
- "codex.mcp.input",
1970
- "arguments",
1971
- "args",
1972
- "input"
1973
- ];
1974
2556
  const COMMAND_ATTRIBUTE_KEYS = [
1975
2557
  "codex.command",
1976
2558
  "command",
@@ -1983,16 +2565,15 @@ const SEARCH_ATTRIBUTE_KEYS = [
1983
2565
  "search_query"
1984
2566
  ];
1985
2567
  const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
2568
+ const COMMAND_TOOL_NAMES = new Set([
2569
+ "exec_command",
2570
+ "local_shell",
2571
+ "shell"
2572
+ ]);
1986
2573
  const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
1987
2574
  const MAX_JUDGE_SUMMARY_STEPS = 24;
1988
2575
  const JUDGE_SUMMARY_HEAD_STEPS = 12;
1989
2576
  const JUDGE_SUMMARY_TAIL_STEPS = 12;
1990
- function getStringAttribute(attributes, keys) {
1991
- for (const key of keys) {
1992
- const value = attributes[key];
1993
- if (typeof value === "string" && value.trim()) return value.trim();
1994
- }
1995
- }
1996
2577
  function normalizeStructuredAttribute(value) {
1997
2578
  if (value === void 0 || value === null) return;
1998
2579
  if (typeof value === "string") {
@@ -2024,9 +2605,12 @@ function getTrajectoryStepStatus(step) {
2024
2605
  function getCommandExecutable(command) {
2025
2606
  return command.trim().split(/\s+/)[0] || void 0;
2026
2607
  }
2608
+ function isCommandToolName(toolName) {
2609
+ return !!toolName && COMMAND_TOOL_NAMES.has(toolName.trim().toLowerCase());
2610
+ }
2027
2611
  function extractToolName(span) {
2028
2612
  const attributes = span.attributes || {};
2029
- const directMatch = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
2613
+ const directMatch = getToolNameFromAttributes(attributes);
2030
2614
  if (directMatch) return directMatch;
2031
2615
  for (const [key, value] of Object.entries(attributes)) {
2032
2616
  if (typeof value !== "string" || !value.trim()) continue;
@@ -2051,21 +2635,31 @@ function extractToolArgs(span) {
2051
2635
  if (value !== void 0) return value;
2052
2636
  }
2053
2637
  }
2054
- function extractCommand(span) {
2638
+ function extractCommand(span, toolName = extractToolName(span), getToolArgs = () => extractToolArgs(span)) {
2055
2639
  const attributes = span.attributes || {};
2056
- const directMatch = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
2640
+ const directMatch = getFirstStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
2057
2641
  if (directMatch) return directMatch;
2058
2642
  for (const [key, value] of Object.entries(attributes)) {
2059
2643
  if (typeof value !== "string" || !value.trim()) continue;
2060
2644
  if (/command/i.test(key) && !/output|result/i.test(key)) return value.trim();
2061
2645
  }
2646
+ const toolArgs = getToolArgs();
2647
+ if (isCommandToolName(toolName) && toolArgs && typeof toolArgs === "object") {
2648
+ const args = toolArgs;
2649
+ const command = args.cmd ?? args.command;
2650
+ if (typeof command === "string" && command.trim()) return command.trim();
2651
+ if (Array.isArray(command)) {
2652
+ const joined = command.map((part) => String(part).trim()).filter(Boolean).join(" ");
2653
+ if (joined) return joined;
2654
+ }
2655
+ }
2062
2656
  if (span.name.startsWith("exec ")) return span.name.slice(5).trim();
2063
2657
  }
2064
2658
  function extractSearchQuery(span) {
2065
2659
  const attributes = span.attributes || {};
2066
- const directMatch = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
2660
+ const directMatch = getFirstStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
2067
2661
  if (directMatch) return directMatch;
2068
- const genericQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
2662
+ const genericQuery = getFirstStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
2069
2663
  if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
2070
2664
  if (span.name.startsWith("search ")) return span.name.slice(7).replace(/^"|"$/g, "").trim();
2071
2665
  }
@@ -2089,17 +2683,34 @@ function extractTrajectorySteps(trace) {
2089
2683
  return left.index - right.index;
2090
2684
  }).map(({ span }) => {
2091
2685
  const toolName = extractToolName(span);
2092
- const command = extractCommand(span);
2686
+ let toolArgs;
2687
+ let hasExtractedToolArgs = false;
2688
+ const getToolArgs = () => {
2689
+ if (!hasExtractedToolArgs) {
2690
+ toolArgs = extractToolArgs(span);
2691
+ hasExtractedToolArgs = true;
2692
+ }
2693
+ return toolArgs;
2694
+ };
2695
+ const command = extractCommand(span, toolName, getToolArgs);
2093
2696
  const searchQuery = extractSearchQuery(span);
2094
2697
  let type = "span";
2095
2698
  let name = span.name;
2096
2699
  const aliases = new Set([span.name]);
2097
2700
  let args;
2098
- if (toolName) {
2701
+ if (command && isCommandToolName(toolName)) {
2702
+ type = "command";
2703
+ name = command;
2704
+ aliases.add(command);
2705
+ args = getToolArgs();
2706
+ if (toolName) aliases.add(toolName);
2707
+ const executable = getCommandExecutable(command);
2708
+ if (executable) aliases.add(executable);
2709
+ } else if (toolName) {
2099
2710
  type = "tool";
2100
2711
  name = toolName;
2101
2712
  aliases.add(toolName);
2102
- args = extractToolArgs(span);
2713
+ args = getToolArgs();
2103
2714
  } else if (command) {
2104
2715
  type = "command";
2105
2716
  name = command;
@@ -2380,11 +2991,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
2380
2991
  const rougeMethod = rouge[baseType[baseType.length - 1]];
2381
2992
  const score = rougeMethod(outputString, renderedValue, {});
2382
2993
  const threshold = assertion.threshold ?? .75;
2383
- const pass = score >= threshold != inverse;
2384
2994
  return {
2385
- pass,
2995
+ pass: score >= threshold !== inverse,
2386
2996
  score: inverse ? 1 - score : score,
2387
- reason: pass ? `${baseType.toUpperCase()} score ${score.toFixed(2)} is greater than or equal to threshold ${threshold}` : `${baseType.toUpperCase()} score ${score.toFixed(2)} is less than threshold ${threshold}`,
2997
+ reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
2388
2998
  assertion
2389
2999
  };
2390
3000
  }
@@ -2446,6 +3056,192 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
2446
3056
  }
2447
3057
  };
2448
3058
  //#endregion
3059
+ //#region src/providers/webSearchUtils.ts
3060
+ function hasTool(provider, predicate) {
3061
+ return Array.isArray(provider.config?.tools) && provider.config.tools.some(predicate);
3062
+ }
3063
+ function getProviderId(provider) {
3064
+ if (typeof provider.id !== "function") return null;
3065
+ try {
3066
+ return provider.id();
3067
+ } catch (err) {
3068
+ logger.debug(`Failed to read provider id: ${err}`);
3069
+ return null;
3070
+ }
3071
+ }
3072
+ function isOpenAiResponsesProvider(provider, id) {
3073
+ return id.includes("openai:responses") || provider.constructor?.name === "OpenAiResponsesProvider";
3074
+ }
3075
+ /**
3076
+ * Check if a provider has web search capabilities
3077
+ * @param provider The provider to check
3078
+ * @returns true if the provider supports web search
3079
+ */
3080
+ function hasWebSearchCapability(provider) {
3081
+ if (!provider) return false;
3082
+ const id = getProviderId(provider);
3083
+ if (!id) return false;
3084
+ if (id.includes("perplexity")) return true;
3085
+ if ((id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0)) return true;
3086
+ if (id.includes("xai") && provider.config?.search_parameters?.mode === "on") return true;
3087
+ if (isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview")) return true;
3088
+ if (id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true)) return true;
3089
+ if (id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")) return true;
3090
+ return false;
3091
+ }
3092
+ /**
3093
+ * Load a provider with web search capabilities.
3094
+ * Tries multiple providers in order of preference until one succeeds.
3095
+ * Uses the latest and most capable models from each provider with specific checkpoint IDs.
3096
+ *
3097
+ * @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
3098
+ * @returns A provider with web search capabilities or null
3099
+ */
3100
+ async function loadWebSearchProvider(preferAnthropic = false) {
3101
+ const loadAnthropicWebSearch = async () => {
3102
+ try {
3103
+ return await loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
3104
+ type: "web_search_20250305",
3105
+ name: "web_search",
3106
+ max_uses: 5
3107
+ }] } } });
3108
+ } catch (err) {
3109
+ logger.debug(`Failed to load Anthropic web search provider: ${err}`);
3110
+ return null;
3111
+ }
3112
+ };
3113
+ const loadOpenAIWebSearch = async () => {
3114
+ try {
3115
+ return await loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } });
3116
+ } catch (err) {
3117
+ logger.debug(`Failed to load OpenAI web search provider: ${err}`);
3118
+ return null;
3119
+ }
3120
+ };
3121
+ const loadPerplexity = async () => {
3122
+ try {
3123
+ return await loadApiProvider("perplexity:sonar-pro");
3124
+ } catch (err) {
3125
+ logger.debug(`Failed to load Perplexity provider: ${err}`);
3126
+ return null;
3127
+ }
3128
+ };
3129
+ const loadGoogleWebSearch = async () => {
3130
+ try {
3131
+ return await loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
3132
+ } catch (err) {
3133
+ logger.debug(`Failed to load Google web search provider: ${err}`);
3134
+ return null;
3135
+ }
3136
+ };
3137
+ const loadVertexWebSearch = async () => {
3138
+ try {
3139
+ return await loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
3140
+ } catch (err) {
3141
+ logger.debug(`Failed to load Vertex web search provider: ${err}`);
3142
+ return null;
3143
+ }
3144
+ };
3145
+ const loadXaiWebSearch = async () => {
3146
+ try {
3147
+ return await loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } });
3148
+ } catch (err) {
3149
+ logger.debug(`Failed to load xAI web search provider: ${err}`);
3150
+ return null;
3151
+ }
3152
+ };
3153
+ const providers = preferAnthropic ? [
3154
+ loadAnthropicWebSearch,
3155
+ loadOpenAIWebSearch,
3156
+ loadPerplexity,
3157
+ loadGoogleWebSearch,
3158
+ loadVertexWebSearch,
3159
+ loadXaiWebSearch
3160
+ ] : [
3161
+ loadOpenAIWebSearch,
3162
+ loadAnthropicWebSearch,
3163
+ loadPerplexity,
3164
+ loadGoogleWebSearch,
3165
+ loadVertexWebSearch,
3166
+ loadXaiWebSearch
3167
+ ];
3168
+ for (const getProvider of providers) {
3169
+ const provider = await getProvider();
3170
+ if (provider && hasWebSearchCapability(provider)) {
3171
+ logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
3172
+ return provider;
3173
+ }
3174
+ if (provider) logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
3175
+ }
3176
+ return null;
3177
+ }
3178
+ //#endregion
3179
+ //#region src/matchers/search.ts
3180
+ async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
3181
+ if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
3182
+ const defaultProviders = await getDefaultProviders();
3183
+ const defaultSearchProviders = [
3184
+ defaultProviders.webSearchProvider,
3185
+ defaultProviders.llmRubricProvider,
3186
+ defaultProviders.gradingProvider
3187
+ ];
3188
+ let searchProvider = (grading.provider ? await getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
3189
+ if (!hasWebSearchCapability(searchProvider)) {
3190
+ const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
3191
+ if (webSearchDefault) searchProvider = webSearchDefault;
3192
+ }
3193
+ if (!hasWebSearchCapability(searchProvider)) {
3194
+ const webSearchProvider = await loadWebSearchProvider(true);
3195
+ if (webSearchProvider) searchProvider = webSearchProvider;
3196
+ }
3197
+ if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);
3198
+ const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, DEFAULT_WEB_SEARCH_PROMPT), {
3199
+ output: tryParse(llmOutput),
3200
+ rubric,
3201
+ ...vars || {}
3202
+ });
3203
+ const resp = await callProviderWithContext(searchProvider, prompt, "search-rubric", {
3204
+ output: tryParse(llmOutput),
3205
+ rubric,
3206
+ ...vars || {}
3207
+ }, providerCallContext);
3208
+ if (resp.error || !resp.output) return {
3209
+ pass: false,
3210
+ score: 0,
3211
+ reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
3212
+ tokensUsed: resp.tokenUsage,
3213
+ assertion
3214
+ };
3215
+ try {
3216
+ const result = extractFirstJsonObject(String(resp.output));
3217
+ let pass = result.pass ?? false;
3218
+ const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
3219
+ if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
3220
+ return {
3221
+ pass,
3222
+ score,
3223
+ reason: result.reason || "No reason provided",
3224
+ tokensUsed: resp.tokenUsage,
3225
+ assertion,
3226
+ metadata: {
3227
+ searchResults: result.searchResults || [],
3228
+ searchProvider: searchProvider.id()
3229
+ }
3230
+ };
3231
+ } catch (err) {
3232
+ logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${err.message}`);
3233
+ const outputLower = String(resp.output).toLowerCase();
3234
+ const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
3235
+ return {
3236
+ pass,
3237
+ score: pass ? 1 : 0,
3238
+ reason: resp.output,
3239
+ tokensUsed: resp.tokenUsage,
3240
+ assertion
3241
+ };
3242
+ }
3243
+ }
3244
+ //#endregion
2449
3245
  //#region src/assertions/searchRubric.ts
2450
3246
  async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
2451
3247
  if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
@@ -3101,13 +3897,13 @@ function resolveSequenceValue(value) {
3101
3897
  }
3102
3898
  throw new Error("trajectory:tool-sequence assertion must have an array or object value");
3103
3899
  }
3104
- function isRecord(value) {
3900
+ function isRecord$1(value) {
3105
3901
  return typeof value === "object" && value !== null && !Array.isArray(value);
3106
3902
  }
3107
3903
  function matchesExpectedArgsPartial(actual, expected) {
3108
3904
  if (Array.isArray(expected)) return Array.isArray(actual) && actual.length === expected.length && expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
3109
- if (isRecord(expected)) {
3110
- if (!isRecord(actual)) return false;
3905
+ if (isRecord$1(expected)) {
3906
+ if (!isRecord$1(actual)) return false;
3111
3907
  return Object.entries(expected).every(([key, expectedValue]) => Object.prototype.hasOwnProperty.call(actual, key) && matchesExpectedArgsPartial(actual[key], expectedValue));
3112
3908
  }
3113
3909
  return isDeepStrictEqual(actual, expected);
@@ -3478,7 +4274,7 @@ async function loadTraceData(traceId) {
3478
4274
  let stableObservations = 0;
3479
4275
  let latestTrace = null;
3480
4276
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
3481
- latestTrace = await traceStore.getTrace(traceId);
4277
+ latestTrace = await traceStore.getTrace(traceId, { sanitizeAttributes: false });
3482
4278
  const spanCount = latestTrace?.spans?.length ?? 0;
3483
4279
  if (spanCount > 0) {
3484
4280
  stableObservations = spanCount === lastSpanCount ? stableObservations + 1 : 1;
@@ -3531,7 +4327,7 @@ const ASSERTION_HANDLERS = {
3531
4327
  "llm-rubric": handleLlmRubric,
3532
4328
  meteor: async (params) => {
3533
4329
  try {
3534
- const { handleMeteorAssertion } = await import("./meteor-DuAFv6gF.js");
4330
+ const { handleMeteorAssertion } = await import("./meteor-Dce-_zGQ.js");
3535
4331
  return handleMeteorAssertion(params);
3536
4332
  } catch (error) {
3537
4333
  if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
@@ -3667,7 +4463,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
3667
4463
  };
3668
4464
  }
3669
4465
  else if (filePath.endsWith(".rb")) try {
3670
- const { runRuby } = await import("./rubyUtils-DVLeA2jg.js").then((n) => n.t);
4466
+ const { runRuby } = await import("./rubyUtils-CnlW8AYb.js").then((n) => n.t);
3671
4467
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
3672
4468
  logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
3673
4469
  } catch (error) {
@@ -3784,7 +4580,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
3784
4580
  logger.debug(`Failed to preload trace data for assertions: ${error}`);
3785
4581
  preloadedTraceData = null;
3786
4582
  }
3787
- await async.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
4583
+ const concurrency = getProviderCallExecutionContext()?.providerCallQueue ? 1 : ASSERTIONS_MAX_CONCURRENCY;
4584
+ await async.forEachOfLimit(asserts, concurrency, async ({ assertion, assertResult, index }) => {
3788
4585
  if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
3789
4586
  const result = await runAssertion({
3790
4587
  prompt,
@@ -3911,7 +4708,8 @@ var CIProgressReporter = class {
3911
4708
  }
3912
4709
  updateTotalTests(newTotal) {
3913
4710
  this.totalTests = Math.max(newTotal, 1);
3914
- this.highestPercentageSeen = Math.floor(this.completedTests / this.totalTests * 100);
4711
+ const percentage = Math.floor(this.completedTests / this.totalTests * 100);
4712
+ this.highestPercentageSeen = percentage;
3915
4713
  }
3916
4714
  finish() {
3917
4715
  if (this.intervalId) {
@@ -4084,6 +4882,10 @@ function getDefaultOtelConfig() {
4084
4882
  }
4085
4883
  //#endregion
4086
4884
  //#region src/tracing/localSpanExporter.ts
4885
+ const MISSING_TRACE_RETRY_DELAY_MS = 50;
4886
+ function delay(ms) {
4887
+ return new Promise((resolve) => setTimeout(resolve, ms));
4888
+ }
4087
4889
  /**
4088
4890
  * A span exporter that writes spans to the local TraceStore (SQLite).
4089
4891
  * This allows OTEL spans to be stored locally for analysis in the promptfoo UI.
@@ -4125,7 +4927,7 @@ var LocalSpanExporter = class {
4125
4927
  }
4126
4928
  let firstError;
4127
4929
  for (const [traceId, spanDataList] of spansByTrace) try {
4128
- const result = await traceStore.addSpans(traceId, spanDataList, { skipTraceCheck: false });
4930
+ const result = await this.addSpansWithTraceRetry(traceStore, traceId, spanDataList);
4129
4931
  if (result.stored) logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
4130
4932
  else logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
4131
4933
  } catch (error) {
@@ -4137,6 +4939,16 @@ var LocalSpanExporter = class {
4137
4939
  }
4138
4940
  return firstError;
4139
4941
  }
4942
+ async addSpansWithTraceRetry(traceStore, traceId, spans) {
4943
+ const options = {
4944
+ skipTraceCheck: false,
4945
+ warnIfMissingTrace: false
4946
+ };
4947
+ const result = await traceStore.addSpans(traceId, spans, options);
4948
+ if (result.stored) return result;
4949
+ await delay(MISSING_TRACE_RETRY_DELAY_MS);
4950
+ return traceStore.addSpans(traceId, spans, options);
4951
+ }
4140
4952
  /**
4141
4953
  * Convert an OTEL ReadableSpan to our SpanData format.
4142
4954
  */
@@ -4419,6 +5231,15 @@ function isPromptAllowed(prompt, allowedPrompts) {
4419
5231
  }
4420
5232
  //#endregion
4421
5233
  //#region src/evaluator.ts
5234
+ const CONVERSATION_VAR_NAME = "_conversation";
5235
+ const promptUsesConversationVariableCache = new LRUCache({ max: 1024 });
5236
+ function promptUsesConversationVariable(prompt) {
5237
+ const cached = promptUsesConversationVariableCache.get(prompt.raw);
5238
+ if (cached !== void 0) return cached;
5239
+ const { referenced, parsed } = analyzeTemplateReference(prompt.raw, CONVERSATION_VAR_NAME);
5240
+ if (parsed) promptUsesConversationVariableCache.set(prompt.raw, referenced);
5241
+ return referenced;
5242
+ }
4422
5243
  /**
4423
5244
  * Manages a single progress bar for the evaluation
4424
5245
  */
@@ -4618,6 +5439,18 @@ function hasProviderGroupedAssertion(assertion) {
4618
5439
  function shouldDeferGradingForTest(test) {
4619
5440
  return Boolean(test.assert?.some(hasProviderGroupedAssertion));
4620
5441
  }
5442
+ function logGroupedGradingStatus({ concurrency, hasEvalStepTimeout, runEvalOptions, shouldGroupGradingByProvider, usesConversationVar }) {
5443
+ if (!runEvalOptions.some(({ test }) => shouldDeferGradingForTest(test))) return;
5444
+ if (shouldGroupGradingByProvider) {
5445
+ logger.info("Grouping model-graded assertions by provider to minimize local-model reload overhead.");
5446
+ return;
5447
+ }
5448
+ if (concurrency !== 1) return;
5449
+ const reasons = [];
5450
+ if (hasEvalStepTimeout) reasons.push("per-eval-step timeout is configured");
5451
+ if (usesConversationVar) reasons.push("conversation variables require per-row ordering");
5452
+ if (reasons.length > 0) logger.info(`Serial grading grouping disabled because ${reasons.join(" and ")}; model-graded judges may reload between rows.`);
5453
+ }
4621
5454
  function applyGradingResult(row, checkResult) {
4622
5455
  if (!checkResult.pass) {
4623
5456
  row.error = checkResult.reason;
@@ -4632,14 +5465,29 @@ function applyGradingResult(row, checkResult) {
4632
5465
  if (checkResult.tokensUsed) accumulateAssertionTokenUsage(row.tokenUsage.assertions, checkResult.tokensUsed);
4633
5466
  row.gradingResult = checkResult;
4634
5467
  }
4635
- function applyGradingError(row, error) {
4636
- const errorMessage = error instanceof Error ? error.stack ?? error.message : String(error);
4637
- logger.error("Assertion grading failed during eval", {
4638
- error: errorMessage,
4639
- promptIdx: row.promptIdx,
4640
- testIdx: row.testIdx
4641
- });
4642
- row.error = errorMessage;
5468
+ const ABORTED_GRADING_PREFIX = "Aborted: ";
5469
+ function isAbortShapedError(error) {
5470
+ return error instanceof Error && (error.name === "AbortError" || error.name === "AbortException");
5471
+ }
5472
+ function applyGradingError(row, error, abortSignal) {
5473
+ const errorAsError = error instanceof Error ? error : void 0;
5474
+ if (Boolean(abortSignal?.aborted) && isAbortShapedError(error)) {
5475
+ const shortMessage = errorAsError?.message ?? String(error);
5476
+ logger.debug("Assertion grading aborted", {
5477
+ error: shortMessage,
5478
+ promptIdx: row.promptIdx,
5479
+ testIdx: row.testIdx
5480
+ });
5481
+ row.error = `${ABORTED_GRADING_PREFIX}${shortMessage}`;
5482
+ } else {
5483
+ const fullMessage = errorAsError ? errorAsError.stack ?? errorAsError.message : String(error);
5484
+ logger.error("Assertion grading failed during eval", {
5485
+ error: fullMessage,
5486
+ promptIdx: row.promptIdx,
5487
+ testIdx: row.testIdx
5488
+ });
5489
+ row.error = fullMessage;
5490
+ }
4643
5491
  row.failureReason = ResultFailureReason.ERROR;
4644
5492
  row.success = false;
4645
5493
  row.score = 0;
@@ -4671,7 +5519,7 @@ function createRunEvalState({ provider, prompt, test }) {
4671
5519
  };
4672
5520
  }
4673
5521
  function attachConversationVar({ conversations, conversationKey, prompt, test, vars }) {
4674
- const usesConversation = prompt.raw.includes("_conversation");
5522
+ const usesConversation = promptUsesConversationVariable(prompt);
4675
5523
  if (!getEnvBool("PROMPTFOO_DISABLE_CONVERSATION_VAR") && !test.options?.disableConversationVar && usesConversation) vars._conversation = conversations?.[conversationKey] || [];
4676
5524
  }
4677
5525
  function createRunEvalSetup({ provider, prompt, promptConfig, vars }) {
@@ -4918,7 +5766,7 @@ async function gradeRunEvalResponse({ abortSignal, deferGrading, evalId, latency
4918
5766
  assertScoringFunction: test.assertScoringFunction,
4919
5767
  traceId
4920
5768
  }).then((checkResult) => applyGradingResult(ret, checkResult))).catch((error) => {
4921
- applyGradingError(ret, error);
5769
+ applyGradingError(ret, error, abortSignal);
4922
5770
  });
4923
5771
  deferredGradingPromises.set(ret, gradingPromise);
4924
5772
  return;
@@ -5465,7 +6313,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
5465
6313
  const defaultProvider = defaultTest.provider;
5466
6314
  if (isApiProvider(defaultProvider)) return defaultProvider;
5467
6315
  if (typeof defaultProvider === "object" && defaultProvider.id) {
5468
- const { loadApiProvider } = await import("./providers-u9Enmfok.js");
6316
+ const { loadApiProvider } = await import("./providers-DVYRZP4E.js").then((n) => n.i);
5469
6317
  return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
5470
6318
  }
5471
6319
  return defaultProvider;
@@ -5625,7 +6473,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
5625
6473
  async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
5626
6474
  if (!state.resume || !evalRecord.persisted) return;
5627
6475
  try {
5628
- const { default: EvalResult } = await import("./evalResult-Bgm9ZH31.js").then((n) => n.n);
6476
+ const { default: EvalResult } = await import("./evalResult-2RRJvFyB.js").then((n) => n.n);
5629
6477
  const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: state.retryMode });
5630
6478
  const originalCount = runEvalOptions.length;
5631
6479
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -5639,14 +6487,14 @@ async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
5639
6487
  }
5640
6488
  }
5641
6489
  function adjustConcurrencyForSerialFeatures({ concurrency, prompts, tests }) {
5642
- const usesConversationVar = prompts.some((p) => p.raw.includes("_conversation"));
6490
+ const usesConversationVar = prompts.some(promptUsesConversationVariable);
5643
6491
  if (concurrency <= 1) return {
5644
6492
  concurrency,
5645
6493
  usesConversationVar
5646
6494
  };
5647
6495
  const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
5648
6496
  if (usesConversationVar) {
5649
- logger.info(`Setting concurrency to 1 because the ${chalk.cyan("_conversation")} variable is used.`);
6497
+ logger.info(`Setting concurrency to 1 because the ${chalk.cyan(CONVERSATION_VAR_NAME)} variable is used.`);
5650
6498
  return {
5651
6499
  concurrency: 1,
5652
6500
  usesConversationVar
@@ -5876,7 +6724,8 @@ var Evaluator = class {
5876
6724
  };
5877
6725
  this.conversations = {};
5878
6726
  this.registers = {};
5879
- this.fileWriters = (Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : []).map((p) => new JsonlFileWriter(p));
6727
+ const jsonlFiles = Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : [];
6728
+ this.fileWriters = jsonlFiles.map((p) => new JsonlFileWriter(p));
5880
6729
  this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
5881
6730
  this.rateLimitRegistry.on("ratelimit:hit", (data) => {
5882
6731
  logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
@@ -5996,6 +6845,25 @@ var Evaluator = class {
5996
6845
  this.trackCompletedRow(evalStep, row, context);
5997
6846
  context.numComplete++;
5998
6847
  const promptEvalCount = reservePromptEvalCount(context, row.promptIdx);
6848
+ if (context.testSuite.extensions?.length) try {
6849
+ const afterEachOut = await runExtensionHook(context.testSuite.extensions, "afterEach", {
6850
+ test: evalStep.test,
6851
+ result: {
6852
+ ...row,
6853
+ namedScores: { ...row.namedScores },
6854
+ metadata: { ...row.metadata },
6855
+ response: row.response ? {
6856
+ ...row.response,
6857
+ metadata: { ...row.response.metadata }
6858
+ } : row.response
6859
+ }
6860
+ });
6861
+ row.namedScores = filterFiniteScores(afterEachOut.result.namedScores);
6862
+ row.metadata = afterEachOut.result.metadata;
6863
+ if (row.response && afterEachOut.result.response) row.response.metadata = afterEachOut.result.response.metadata;
6864
+ } catch (error) {
6865
+ logger.error(`afterEach extension hook failed, persisting row without hook modifications`, { error });
6866
+ }
5999
6867
  await this.persistEvalRow(row);
6000
6868
  if (this.abortIfTargetUnavailable(row, context)) break;
6001
6869
  const metrics = context.prompts[row.promptIdx].metrics;
@@ -6007,10 +6875,6 @@ var Evaluator = class {
6007
6875
  promptEvalCount,
6008
6876
  row
6009
6877
  });
6010
- await runExtensionHook(context.testSuite.extensions, "afterEach", {
6011
- test: evalStep.test,
6012
- result: row
6013
- });
6014
6878
  context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics);
6015
6879
  }
6016
6880
  }
@@ -6084,9 +6948,8 @@ var Evaluator = class {
6084
6948
  context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
6085
6949
  }
6086
6950
  async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
6087
- let flushGroupedRows;
6088
6951
  try {
6089
- if (shouldGroupGradingByProvider) flushGroupedRows = await this.runGroupedEvalSteps({
6952
+ if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
6090
6953
  checkAbort,
6091
6954
  evalStepIndexMap,
6092
6955
  groupedRunEvalOptions,
@@ -6118,7 +6981,6 @@ var Evaluator = class {
6118
6981
  cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
6119
6982
  throw err;
6120
6983
  }
6121
- await flushGroupedRows?.();
6122
6984
  if (isEvalTimedOut()) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
6123
6985
  else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
6124
6986
  ciProgressReporter,
@@ -6167,7 +7029,15 @@ var Evaluator = class {
6167
7029
  })) break;
6168
7030
  }
6169
7031
  } catch (error) {
6170
- await flushGroupedRows();
7032
+ const pendingRowCount = groupedRows.reduce((sum, entry) => sum + entry.rows.length, 0);
7033
+ try {
7034
+ await flushGroupedRows();
7035
+ } catch (flushError) {
7036
+ logger.warn("Failed to flush grouped rows after error; target outputs may be lost", {
7037
+ error: flushError instanceof Error ? flushError.message : String(flushError),
7038
+ pendingRowCount
7039
+ });
7040
+ }
6171
7041
  throw error;
6172
7042
  }
6173
7043
  await flushGroupedRows();
@@ -6603,6 +7473,13 @@ var Evaluator = class {
6603
7473
  if (!this.options.silent) {
6604
7474
  if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
6605
7475
  if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
7476
+ logGroupedGradingStatus({
7477
+ concurrency,
7478
+ hasEvalStepTimeout,
7479
+ runEvalOptions,
7480
+ shouldGroupGradingByProvider,
7481
+ usesConversationVar
7482
+ });
6606
7483
  }
6607
7484
  if (this.options.showProgressBar && progressBarManager) {
6608
7485
  await progressBarManager.initialize(runEvalOptions, concurrency, 0);
@@ -7606,7 +8483,7 @@ var Eval = class Eval {
7606
8483
  const evalInstance = new Eval(eval_.config, {
7607
8484
  id: eval_.id,
7608
8485
  createdAt: new Date(eval_.createdAt),
7609
- author: eval_.author || void 0,
8486
+ author: eval_.author,
7610
8487
  description: eval_.description || void 0,
7611
8488
  prompts: eval_.prompts || [],
7612
8489
  datasetId,
@@ -7629,7 +8506,7 @@ var Eval = class Eval {
7629
8506
  return (await getDb().select().from(evalsTable).limit(limit).orderBy(desc(evalsTable.createdAt)).all()).map((e) => new Eval(e.config, {
7630
8507
  id: e.id,
7631
8508
  createdAt: new Date(e.createdAt),
7632
- author: e.author || void 0,
8509
+ author: e.author,
7633
8510
  description: e.description || void 0,
7634
8511
  prompts: e.prompts || [],
7635
8512
  persisted: true
@@ -7644,7 +8521,7 @@ var Eval = class Eval {
7644
8521
  return (await getDb().select().from(evalsTable).orderBy(desc(evalsTable.createdAt)).limit(limit).offset(offset).all()).map((e) => new Eval(e.config, {
7645
8522
  id: e.id,
7646
8523
  createdAt: new Date(e.createdAt),
7647
- author: e.author || void 0,
8524
+ author: e.author,
7648
8525
  description: e.description || void 0,
7649
8526
  prompts: e.prompts || [],
7650
8527
  persisted: true
@@ -7659,7 +8536,7 @@ var Eval = class Eval {
7659
8536
  static async create(config, renderedPrompts, opts) {
7660
8537
  const createdAt = opts?.createdAt || /* @__PURE__ */ new Date();
7661
8538
  const evalId = opts?.id || createEvalId(createdAt);
7662
- const author = opts?.author || getUserEmail();
8539
+ const author = opts && "author" in opts ? opts.author ?? null : getAuthor();
7663
8540
  const db = getDb();
7664
8541
  const datasetId = sha256(JSON.stringify(config.tests || []));
7665
8542
  db.transaction(() => {
@@ -7721,7 +8598,7 @@ var Eval = class Eval {
7721
8598
  });
7722
8599
  return new Eval(config, {
7723
8600
  id: evalId,
7724
- author: opts?.author,
8601
+ author,
7725
8602
  createdAt,
7726
8603
  persisted: true,
7727
8604
  runtimeOptions: sanitizeRuntimeOptions(opts?.runtimeOptions)
@@ -7731,7 +8608,7 @@ var Eval = class Eval {
7731
8608
  const createdAt = opts?.createdAt || /* @__PURE__ */ new Date();
7732
8609
  this.createdAt = createdAt.getTime();
7733
8610
  this.id = opts?.id || createEvalId(createdAt);
7734
- this.author = opts?.author;
8611
+ this.author = opts?.author ?? null;
7735
8612
  this.config = config;
7736
8613
  this.results = [];
7737
8614
  this.prompts = opts?.prompts || [];
@@ -8261,7 +9138,7 @@ var Eval = class Eval {
8261
9138
  newConfig.description = copyDescription;
8262
9139
  const newPrompts = structuredClone(this.prompts);
8263
9140
  const newVars = this.vars ? structuredClone(this.vars) : [];
8264
- const author = getUserEmail();
9141
+ const author = getAuthor();
8265
9142
  const db = getDb();
8266
9143
  let copiedCount = 0;
8267
9144
  db.transaction(() => {
@@ -8418,47 +9295,11 @@ function filterPrompts(prompts, filterPromptsOption) {
8418
9295
  //#endregion
8419
9296
  //#region src/commands/eval/filterProviders.ts
8420
9297
  /**
8421
- * Checks if a value is a valid provider ID (non-empty string).
8422
- */
8423
- function isValidProviderId(id) {
8424
- return id !== null && id !== void 0 && typeof id === "string" && id !== "";
8425
- }
8426
- /**
8427
9298
  * Extracts the id and label from a raw provider config without instantiating it.
8428
9299
  * Handles all provider config formats: string, function, ProviderOptions, ProviderOptionsMap.
8429
9300
  */
8430
9301
  function getProviderIdAndLabel(provider, index) {
8431
- if (typeof provider === "string") return { id: provider };
8432
- if (typeof provider === "function") {
8433
- const label = provider.label;
8434
- return {
8435
- id: label ?? `custom-function-${index}`,
8436
- label
8437
- };
8438
- }
8439
- const providerId = provider.id;
8440
- if ("id" in provider && isValidProviderId(providerId)) return {
8441
- id: providerId,
8442
- label: provider.label
8443
- };
8444
- const keys = Object.keys(provider);
8445
- if (keys.length > 0) {
8446
- const id = keys[0];
8447
- const value = provider[id];
8448
- if (typeof value === "object" && value !== null) return {
8449
- id: value.id || id,
8450
- label: value.label
8451
- };
8452
- }
8453
- const label = provider.label;
8454
- if (isValidProviderId(label)) return {
8455
- id: label,
8456
- label
8457
- };
8458
- return {
8459
- id: `unknown-${index}`,
8460
- label
8461
- };
9302
+ return normalizeProviderRef(provider, { index });
8462
9303
  }
8463
9304
  /**
8464
9305
  * Filters raw provider configs BEFORE instantiation.
@@ -10691,25 +11532,45 @@ var AlignedHarmfulPlugin = class extends RedteamPluginBase {
10691
11532
  getAssertions(_prompt) {
10692
11533
  return getHarmfulAssertions(this.harmCategory);
10693
11534
  }
10694
- promptsToTestCases(prompts) {
11535
+ async promptsToTestCases(prompts) {
10695
11536
  const hasMultipleInputs = this.config.inputs && Object.keys(this.config.inputs).length > 0;
10696
11537
  const harmCategoryLabel = HARM_PLUGINS[this.harmCategory] || this.harmCategory;
10697
- return prompts.map(({ __prompt }) => {
11538
+ const pluginId = getShortPluginId(this.harmCategory);
11539
+ return Promise.all([...prompts].sort((a, b) => a.__prompt.localeCompare(b.__prompt)).map(async ({ __prompt }, materializationIndex) => {
10698
11540
  const vars = { [this.injectVar]: __prompt };
10699
- if (hasMultipleInputs) try {
10700
- const parsed = JSON.parse(__prompt);
10701
- Object.assign(vars, extractVariablesFromJson(parsed, this.config.inputs));
10702
- } catch {}
11541
+ let inputMaterialization;
11542
+ if (hasMultipleInputs) {
11543
+ let parsed;
11544
+ try {
11545
+ parsed = JSON.parse(__prompt);
11546
+ } catch (error) {
11547
+ logger.debug("[AlignedHarmful] Could not parse prompt as JSON for multi-input mode", { error });
11548
+ }
11549
+ if (parsed) try {
11550
+ const materializedVars = await extractMaterializedVariablesFromJsonWithMetadata(parsed, this.config.inputs, {
11551
+ materializationIndex,
11552
+ pluginId,
11553
+ provider: this.provider,
11554
+ purpose: this.purpose
11555
+ });
11556
+ Object.assign(vars, materializedVars.vars);
11557
+ inputMaterialization = materializedVars.metadata;
11558
+ } catch (error) {
11559
+ logger.debug("[AlignedHarmful] Failed to materialize prompt inputs", { error });
11560
+ throw error;
11561
+ }
11562
+ }
10703
11563
  return {
10704
11564
  vars,
10705
11565
  metadata: {
10706
11566
  harmCategory: harmCategoryLabel,
10707
- pluginId: getShortPluginId(this.harmCategory),
10708
- pluginConfig: this.config
11567
+ pluginId,
11568
+ pluginConfig: this.config,
11569
+ ...inputMaterialization ? { inputMaterialization } : {}
10709
11570
  },
10710
11571
  assert: getHarmfulAssertions(this.harmCategory)
10711
11572
  };
10712
- });
11573
+ }));
10713
11574
  }
10714
11575
  };
10715
11576
  //#endregion
@@ -10718,20 +11579,37 @@ var AlignedHarmfulPlugin = class extends RedteamPluginBase {
10718
11579
  * Extract content from <Prompt> tags and parse JSON if inputs are defined.
10719
11580
  * Returns the processed prompt and any additional vars extracted from JSON.
10720
11581
  */
10721
- function processPromptForInputs(prompt, _injectVar, inputs) {
11582
+ async function processPromptForInputs(prompt, inputs, plugin, provider, purpose, materializationIndex) {
10722
11583
  let processedPrompt = prompt.trim();
10723
11584
  const additionalVars = {};
11585
+ let additionalMetadata;
10724
11586
  const extractedPrompt = extractPromptFromTags(processedPrompt);
10725
11587
  if (extractedPrompt) processedPrompt = extractedPrompt;
10726
- if (inputs && Object.keys(inputs).length > 0) try {
10727
- const parsed = JSON.parse(processedPrompt);
10728
- Object.assign(additionalVars, extractVariablesFromJson(parsed, inputs));
10729
- } catch {
10730
- logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode");
11588
+ if (inputs && Object.keys(inputs).length > 0) {
11589
+ let parsed;
11590
+ try {
11591
+ parsed = JSON.parse(processedPrompt);
11592
+ } catch (error) {
11593
+ logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode", { error });
11594
+ }
11595
+ if (parsed) try {
11596
+ const materializedVars = await extractMaterializedVariablesFromJsonWithMetadata(parsed, inputs, {
11597
+ materializationIndex,
11598
+ pluginId: plugin,
11599
+ provider,
11600
+ purpose
11601
+ });
11602
+ Object.assign(additionalVars, materializedVars.vars);
11603
+ additionalMetadata = materializedVars.metadata;
11604
+ } catch (error) {
11605
+ logger.debug("[Harmful] Failed to materialize prompt inputs", { error });
11606
+ throw error;
11607
+ }
10731
11608
  }
10732
11609
  return {
10733
11610
  processedPrompt,
10734
- additionalVars
11611
+ additionalVars,
11612
+ additionalMetadata
10735
11613
  };
10736
11614
  }
10737
11615
  async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, plugin) {
@@ -10752,15 +11630,19 @@ async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, p
10752
11630
  };
10753
11631
  const allPrompts = await retryWithDeduplication(generatePrompts, n);
10754
11632
  const inputs = config?.inputs;
10755
- return sampleArray(allPrompts, n).map((prompt) => {
10756
- const { processedPrompt, additionalVars } = processPromptForInputs(prompt, injectVar, inputs);
11633
+ return Promise.all(sampleArray(allPrompts, n).map(async (prompt, materializationIndex) => {
11634
+ const { processedPrompt, additionalVars, additionalMetadata } = await processPromptForInputs(prompt, inputs, plugin, unalignedProvider, purpose, materializationIndex);
10757
11635
  const testCase = createTestCase(injectVar, processedPrompt, plugin);
10758
11636
  if (Object.keys(additionalVars).length > 0) testCase.vars = {
10759
11637
  ...testCase.vars,
10760
11638
  ...additionalVars
10761
11639
  };
11640
+ if (additionalMetadata) testCase.metadata = {
11641
+ ...testCase.metadata,
11642
+ inputMaterialization: additionalMetadata
11643
+ };
10762
11644
  return testCase;
10763
- });
11645
+ }));
10764
11646
  }
10765
11647
  //#endregion
10766
11648
  //#region src/redteam/plugins/teenSafety/graderExamples.ts
@@ -11078,7 +11960,7 @@ const MAX_CHARS_RETRY_MODIFIER_KEY = "__maxCharsPerMessageRetry";
11078
11960
  function computeModifiersFromConfig(config) {
11079
11961
  const modifiers = { ...config?.modifiers };
11080
11962
  if (config?.language && typeof config.language === "string") modifiers.language = config.language;
11081
- if (config?.inputs && Object.keys(config.inputs).length > 0) modifiers.__outputFormat = `Output each test case as JSON wrapped in <Prompt> tags: <Prompt>{${Object.entries(config.inputs).map(([k, description]) => `"${k}": "${description}"`).join(", ")}}</Prompt>`;
11963
+ if (config?.inputs && Object.keys(config.inputs).length > 0) modifiers.__outputFormat = `Output each test case as JSON wrapped in <Prompt> tags: <Prompt>{${Object.entries(buildPromptInputDescriptions(config.inputs) ?? {}).map(([k, description]) => `"${k}": "${description}"`).join(", ")}}</Prompt>`;
11082
11964
  const maxCharsModifier = getMaxCharsPerMessageModifierValue(config?.maxCharsPerMessage);
11083
11965
  if (maxCharsModifier) modifiers[MAX_CHARS_PER_MESSAGE_MODIFIER_KEY] = maxCharsModifier;
11084
11966
  return modifiers;
@@ -11163,9 +12045,10 @@ function dedupeTestCases(testCases) {
11163
12045
  return deduped;
11164
12046
  }
11165
12047
  function buildMaxCharsRetryInstructions(rejectedPromptLengths, limit) {
12048
+ const longestRejectedPromptText = rejectedPromptLengths.length > 0 ? `${Math.max(...rejectedPromptLengths)} characters` : "unknown length";
11166
12049
  return dedent`
11167
12050
  Your previous response included ${rejectedPromptLengths.length} generated prompt${rejectedPromptLengths.length === 1 ? "" : "s"} that exceeded the ${limit ?? "configured"}-character limit.
11168
- The longest rejected prompt was ${Math.max(...rejectedPromptLengths)} characters.
12051
+ The longest rejected prompt was ${longestRejectedPromptText}.
11169
12052
  Generate replacement prompts only, and keep every user message within the character limit.
11170
12053
  `.trim();
11171
12054
  }
@@ -11243,6 +12126,31 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
11243
12126
  return [];
11244
12127
  }
11245
12128
  }
12129
+ async function materializeRemoteTestCaseInputs({ config, injectVar, pluginId, provider, purpose, testCases }) {
12130
+ const inputs = config.inputs;
12131
+ if (!inputs || Object.keys(inputs).length === 0) return testCases;
12132
+ return Promise.all(testCases.map(async (testCase, materializationIndex) => {
12133
+ const inputVars = extractInputVarsFromPrompt(String(testCase.vars?.[injectVar] ?? ""), inputs);
12134
+ if (!inputVars) return testCase;
12135
+ const materializedVars = await materializeInputVariablesWithMetadata(inputVars, inputs, {
12136
+ materializationIndex,
12137
+ pluginId,
12138
+ provider,
12139
+ purpose
12140
+ });
12141
+ return {
12142
+ ...testCase,
12143
+ vars: {
12144
+ ...testCase.vars || {},
12145
+ ...materializedVars.vars
12146
+ },
12147
+ metadata: {
12148
+ ...testCase.metadata || {},
12149
+ ...materializedVars.metadata ? { inputMaterialization: materializedVars.metadata } : {}
12150
+ }
12151
+ };
12152
+ }));
12153
+ }
11246
12154
  function createPluginFactory(PluginClass, key, validate) {
11247
12155
  return {
11248
12156
  key,
@@ -11253,13 +12161,21 @@ function createPluginFactory(PluginClass, key, validate) {
11253
12161
  logger.debug(`Using local redteam generation for ${key}`);
11254
12162
  return new PluginClass(provider, purpose, injectVar, configWithDefaults).generateTests(n, delayMs);
11255
12163
  }
11256
- const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {});
12164
+ const pluginId = getShortPluginId(key);
12165
+ const testCases = await materializeRemoteTestCaseInputs({
12166
+ config: configWithDefaults ?? {},
12167
+ injectVar,
12168
+ pluginId,
12169
+ provider,
12170
+ purpose,
12171
+ testCases: await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {})
12172
+ });
11257
12173
  const computedModifiers = computeModifiersFromConfig(configWithDefaults);
11258
12174
  return testCases.map((testCase) => ({
11259
12175
  ...testCase,
11260
12176
  metadata: {
11261
12177
  ...testCase.metadata,
11262
- pluginId: getShortPluginId(key),
12178
+ pluginId,
11263
12179
  pluginConfig: {
11264
12180
  ...configWithDefaults,
11265
12181
  modifiers: computedModifiers
@@ -11316,7 +12232,7 @@ const pluginFactories = [
11316
12232
  key: category,
11317
12233
  action: async (params) => {
11318
12234
  if (neverGenerateRemote()) {
11319
- logger.error(`${category} plugin requires remote generation to be enabled`);
12235
+ logger.error(getRemoteGenerationExplicitlyDisabledError(`${category} plugin`));
11320
12236
  return [];
11321
12237
  }
11322
12238
  const testCases = await getHarmfulTests(params, category);
@@ -11339,13 +12255,21 @@ const piiPlugins = PII_PLUGINS.map((category) => ({
11339
12255
  key: category,
11340
12256
  action: async (params) => {
11341
12257
  if (shouldGenerateRemote()) {
11342
- const testCases = await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {});
12258
+ const pluginId = getShortPluginId(category);
12259
+ const testCases = await materializeRemoteTestCaseInputs({
12260
+ config: params.config ?? {},
12261
+ injectVar: params.injectVar,
12262
+ pluginId,
12263
+ provider: params.provider,
12264
+ purpose: params.purpose,
12265
+ testCases: await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {})
12266
+ });
11343
12267
  const computedModifiers = computeModifiersFromConfig(params.config);
11344
12268
  return testCases.map((testCase) => ({
11345
12269
  ...testCase,
11346
12270
  metadata: {
11347
12271
  ...testCase.metadata,
11348
- pluginId: getShortPluginId(category),
12272
+ pluginId,
11349
12273
  pluginConfig: {
11350
12274
  ...params.config,
11351
12275
  modifiers: computedModifiers
@@ -11367,16 +12291,24 @@ const biasPlugins = BIAS_PLUGINS.map((category) => ({
11367
12291
  key: category,
11368
12292
  action: async (params) => {
11369
12293
  if (neverGenerateRemote()) {
11370
- logger.error(`${category} plugin requires remote generation to be enabled`);
12294
+ logger.error(getRemoteGenerationExplicitlyDisabledError(`${category} plugin`));
11371
12295
  return [];
11372
12296
  }
11373
- const testCases = await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {});
12297
+ const pluginId = getShortPluginId(category);
12298
+ const testCases = await materializeRemoteTestCaseInputs({
12299
+ config: params.config ?? {},
12300
+ injectVar: params.injectVar,
12301
+ pluginId,
12302
+ provider: params.provider,
12303
+ purpose: params.purpose,
12304
+ testCases: await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {})
12305
+ });
11374
12306
  const computedModifiers = computeModifiersFromConfig(params.config);
11375
12307
  return testCases.map((testCase) => ({
11376
12308
  ...testCase,
11377
12309
  metadata: {
11378
12310
  ...testCase.metadata,
11379
- pluginId: getShortPluginId(category),
12311
+ pluginId,
11380
12312
  pluginConfig: {
11381
12313
  ...params.config,
11382
12314
  modifiers: computedModifiers
@@ -11389,19 +12321,27 @@ function createRemotePlugin(key, validate) {
11389
12321
  return {
11390
12322
  key,
11391
12323
  validate,
11392
- action: async ({ purpose, injectVar, n, config }) => {
12324
+ action: async ({ provider, purpose, injectVar, n, config }) => {
11393
12325
  const configWithDefaults = applyDefaultRemotePluginConfig(key, config);
11394
12326
  if (neverGenerateRemote()) {
11395
- logger.error(`${key} plugin requires remote generation to be enabled`);
12327
+ logger.error(getRemoteGenerationExplicitlyDisabledError(`${key} plugin`));
11396
12328
  return [];
11397
12329
  }
11398
- const testCases = await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {});
12330
+ const pluginId = getShortPluginId(key);
12331
+ const testCases = await materializeRemoteTestCaseInputs({
12332
+ config: configWithDefaults ?? {},
12333
+ injectVar,
12334
+ pluginId,
12335
+ provider,
12336
+ purpose,
12337
+ testCases: await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {})
12338
+ });
11399
12339
  const computedModifiers = computeModifiersFromConfig(configWithDefaults);
11400
12340
  const testsWithMetadata = testCases.map((testCase) => ({
11401
12341
  ...testCase,
11402
12342
  metadata: {
11403
12343
  ...testCase.metadata,
11404
- pluginId: getShortPluginId(key),
12344
+ pluginId,
11405
12345
  pluginConfig: {
11406
12346
  ...configWithDefaults,
11407
12347
  modifiers: computedModifiers
@@ -11471,6 +12411,37 @@ function getPolicyText(metadata) {
11471
12411
  return typeof policyObject.text === "string" && policyObject.text.length > 0 ? policyObject.text : void 0;
11472
12412
  }
11473
12413
  }
12414
+ async function rematerializeStrategyInputVars(testCase, injectVar, provider, purpose, materializationIndex) {
12415
+ const inputs = testCase.metadata?.pluginConfig?.inputs;
12416
+ const inputMaterialization = testCase.metadata?.inputMaterialization;
12417
+ if (!inputs || Object.keys(inputs).length === 0 || !testCase.vars?.[injectVar]) return {
12418
+ inputMaterialization,
12419
+ vars: testCase.vars
12420
+ };
12421
+ try {
12422
+ const materializedVars = await extractMaterializedVariablesFromJsonWithMetadata(JSON.parse(String(testCase.vars[injectVar])), inputs, {
12423
+ materializationIndex,
12424
+ pluginId: String(testCase.metadata?.pluginId || "unknown-plugin"),
12425
+ provider,
12426
+ purpose
12427
+ });
12428
+ return {
12429
+ inputMaterialization: materializedVars.metadata ? {
12430
+ ...inputMaterialization,
12431
+ ...materializedVars.metadata
12432
+ } : inputMaterialization,
12433
+ vars: {
12434
+ ...testCase.vars,
12435
+ ...materializedVars.vars
12436
+ }
12437
+ };
12438
+ } catch {
12439
+ return {
12440
+ inputMaterialization,
12441
+ vars: testCase.vars
12442
+ };
12443
+ }
12444
+ }
11474
12445
  /**
11475
12446
  * Gets the severity level for a plugin based on its ID and configuration.
11476
12447
  * @param pluginId - The ID of the plugin.
@@ -11612,6 +12583,7 @@ const categories = {
11612
12583
  foundation: FOUNDATION_PLUGINS,
11613
12584
  harmful: Object.keys(HARM_PLUGINS),
11614
12585
  "coding-agent:core": CODING_AGENT_CORE_PLUGINS,
12586
+ "coding-agent:all": CODING_AGENT_PLUGINS,
11615
12587
  bias: BIAS_PLUGINS,
11616
12588
  pii: PII_PLUGINS,
11617
12589
  medical: MEDICAL_PLUGINS,
@@ -11706,7 +12678,7 @@ function addLanguageToPluginMetadata(test, lang, plugin, maxCharsPerMessage, tes
11706
12678
  * @param injectVar - The variable to inject.
11707
12679
  * @returns An array of new test cases generated by strategies.
11708
12680
  */
11709
- async function applyStrategies(testCases, strategies, injectVar, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage) {
12681
+ async function applyStrategies(testCases, strategies, injectVar, provider, purpose, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage) {
11710
12682
  const newTestCases = [];
11711
12683
  const strategyResults = {};
11712
12684
  for (const strategy of strategies) {
@@ -11762,14 +12734,8 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
11762
12734
  }
11763
12735
  }
11764
12736
  resultTestCases = filterOversizedTestCases(resultTestCases, injectVar, `Strategy ${strategy.id}`, maxCharsPerMessage);
11765
- newTestCases.push(...resultTestCases.map((t) => {
11766
- const inputs = t?.metadata?.pluginConfig?.inputs;
11767
- let updatedVars = t.vars;
11768
- if (inputs && Object.keys(inputs).length > 0 && t.vars?.[injectVar]) try {
11769
- const parsed = JSON.parse(String(t.vars[injectVar]));
11770
- updatedVars = { ...t.vars };
11771
- Object.assign(updatedVars, extractVariablesFromJson(parsed, inputs));
11772
- } catch {}
12737
+ newTestCases.push(...await Promise.all(resultTestCases.map(async (t, materializationIndex) => {
12738
+ const { inputMaterialization, vars } = await rematerializeStrategyInputVars(t, injectVar, provider, purpose, materializationIndex);
11773
12739
  const strategyConfig = {
11774
12740
  ...strategy.config || {},
11775
12741
  ...maxCharsPerMessage ? { maxCharsPerMessage } : {},
@@ -11777,16 +12743,17 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
11777
12743
  };
11778
12744
  return {
11779
12745
  ...t,
11780
- vars: updatedVars,
12746
+ vars,
11781
12747
  metadata: {
11782
12748
  ...t?.metadata || {},
11783
12749
  ...strategy.id !== "retry" && { strategyId: t?.metadata?.strategyId || strategy.id },
11784
12750
  ...t?.metadata?.pluginId && { pluginId: t.metadata.pluginId },
11785
12751
  ...t?.metadata?.pluginConfig && { pluginConfig: t.metadata.pluginConfig },
12752
+ ...inputMaterialization && { inputMaterialization },
11786
12753
  ...Object.keys(strategyConfig).length > 0 && { strategyConfig }
11787
12754
  }
11788
12755
  };
11789
- }));
12756
+ })));
11790
12757
  const displayId = strategy.id === "layer" && Array.isArray(strategy.config?.steps) ? `layer(${strategy.config.steps.map((st) => typeof st === "string" ? st : st.id).join("→")})` : strategy.id;
11791
12758
  const languagesInResults = new Set(strategyTestCases.map((t) => getLanguageForTestCase(t)).filter((lang) => lang !== void 0));
11792
12759
  const applyNumTestsCap = (calculatedRequested) => {
@@ -12241,7 +13208,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
12241
13208
  targetIds,
12242
13209
  ...retryStrategy.config
12243
13210
  };
12244
- const { testCases: retryTestCases, strategyResults: retryResults } = await applyStrategies(pluginTestCases, [retryStrategy], injectVar, void 0, maxCharsPerMessage);
13211
+ const { testCases: retryTestCases, strategyResults: retryResults } = await applyStrategies(pluginTestCases, [retryStrategy], injectVar, redteamProvider, purpose, void 0, maxCharsPerMessage);
12245
13212
  pluginTestCases.push(...retryTestCases);
12246
13213
  Object.assign(strategyResults, retryResults);
12247
13214
  if (showProgressBar) progressBar?.increment(retryTestCases.length);
@@ -12249,7 +13216,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
12249
13216
  checkAbort();
12250
13217
  const nonBasicStrategies = strategies.filter((s) => !["basic", "retry"].includes(s.id));
12251
13218
  if (showProgressBar && nonBasicStrategies.length > 0) progressBar?.update({ task: "Applying strategies" });
12252
- const { testCases: strategyTestCases, strategyResults: otherStrategyResults } = await applyStrategies(pluginTestCases, nonBasicStrategies, injectVar, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage);
13219
+ const { testCases: strategyTestCases, strategyResults: otherStrategyResults } = await applyStrategies(pluginTestCases, nonBasicStrategies, injectVar, redteamProvider, purpose, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage);
12253
13220
  Object.assign(strategyResults, otherStrategyResults);
12254
13221
  if (showProgressBar && strategyTestCases.length > 0) progressBar?.increment(strategyTestCases.length);
12255
13222
  const finalTestCases = [...includeBasicTests ? pluginTestCases : [], ...strategyTestCases];
@@ -13142,6 +14109,10 @@ function stripAuthFromUrl(urlString) {
13142
14109
  }
13143
14110
  }
13144
14111
  async function handleEmailCollection(evalRecord) {
14112
+ if (evalRecord.author) {
14113
+ logger.debug(`[Share] Skipping email collection because author is already set`, { evalId: evalRecord.id });
14114
+ return;
14115
+ }
13145
14116
  if (!process.stdout.isTTY || isCI() || getEnvBool("PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST")) return;
13146
14117
  let email = getUserEmail();
13147
14118
  if (!email) {
@@ -13214,7 +14185,7 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
13214
14185
  for (const row of evaluateTable.body.slice(0, maxRows)) table.push([...row.vars.map((v) => ellipsize(v, tableCellMaxLength)), ...row.outputs.map(({ pass, text, failureReason: failureType }) => {
13215
14186
  text = ellipsize(text, tableCellMaxLength);
13216
14187
  if (pass) return chalk.green("[PASS] ") + text;
13217
- else return chalk.red(failureType === ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.red.bold(c) : c).join("---");
14188
+ return chalk.red(failureType === ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.red.bold(c) : c).join("---");
13218
14189
  })]);
13219
14190
  return table.toString();
13220
14191
  }
@@ -13285,6 +14256,14 @@ function shouldShareResults(opts) {
13285
14256
  return cloudConfig.isEnabled() && sharing !== false;
13286
14257
  }
13287
14258
  //#endregion
14259
+ //#region src/commands/eval/redteamWarning.ts
14260
+ function warnIfRedteamConfigHasNoTests(config, testSuite) {
14261
+ if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) logger.warn(chalk.yellow(dedent`
14262
+ Warning: Config file has a redteam section but no test cases.
14263
+ Did you mean to run ${chalk.bold("promptfoo redteam generate")} instead?
14264
+ `));
14265
+ }
14266
+ //#endregion
13288
14267
  //#region src/util/formatDuration.ts
13289
14268
  /**
13290
14269
  * Formats a duration in seconds into a human-readable string
@@ -13305,6 +14284,115 @@ function formatDuration(seconds) {
13305
14284
  }
13306
14285
  //#endregion
13307
14286
  //#region src/commands/eval/summary.ts
14287
+ function getCompletionMessage({ completionType, evalId, shareableUrl, wasAborted, writeToDatabase, activelySharing }) {
14288
+ if (wasAborted) {
14289
+ const idSuffix = writeToDatabase ? ` (ID: ${chalk.cyan(evalId)})` : "";
14290
+ return `${chalk.red("✗")} ${completionType} aborted${idSuffix}`;
14291
+ }
14292
+ if (writeToDatabase && shareableUrl) return `${chalk.green("✓")} ${completionType} complete: ${shareableUrl}`;
14293
+ if (writeToDatabase && activelySharing) return `${chalk.green("✓")} ${completionType} complete`;
14294
+ if (writeToDatabase) return `${chalk.green("✓")} ${completionType} complete (ID: ${chalk.cyan(evalId)})`;
14295
+ return `${chalk.green("✓")} ${completionType} complete`;
14296
+ }
14297
+ function getAbortSummaryLines(targetErrorStatus) {
14298
+ if (targetErrorStatus == null) return [];
14299
+ return [
14300
+ "",
14301
+ chalk.red.bold("Scan stopped: Target is unavailable and will not recover on retry."),
14302
+ chalk.red(` Target returned HTTP ${targetErrorStatus}`),
14303
+ "",
14304
+ chalk.yellow("Possible causes:"),
14305
+ chalk.yellow(" • Invalid API key or authentication (401/403)"),
14306
+ chalk.yellow(" • Target endpoint does not exist (404)"),
14307
+ chalk.yellow(" • Server does not support the request (501)"),
14308
+ "",
14309
+ chalk.cyan("To fix: Check your target configuration and credentials.")
14310
+ ];
14311
+ }
14312
+ function getGuidanceLines({ writeToDatabase, shareableUrl, wantsToShare, activelySharing, hasExplicitDisable, cloudEnabled }) {
14313
+ if (!writeToDatabase || shareableUrl || wantsToShare || activelySharing) return [];
14314
+ const lines = ["", `» View results: ${chalk.green.bold("promptfoo view")}`];
14315
+ if (!hasExplicitDisable) lines.push(cloudEnabled ? `» Create shareable URL: ${chalk.green.bold("promptfoo share")}` : `» Share with your team: ${chalk.green.bold("https://promptfoo.app")}`);
14316
+ lines.push(`» Feedback: ${chalk.green.bold("https://promptfoo.dev/feedback")}`);
14317
+ return lines;
14318
+ }
14319
+ function buildUsageDetails(usage, total) {
14320
+ const parts = [];
14321
+ if (usage.prompt && usage.prompt > 0) parts.push(`${usage.prompt.toLocaleString()} prompt`);
14322
+ if (usage.completion && usage.completion > 0) parts.push(`${usage.completion.toLocaleString()} completion`);
14323
+ if (usage.cached && usage.cached > 0) parts.push(usage.cached === total && parts.length === 0 ? "cached" : `${usage.cached.toLocaleString()} cached`);
14324
+ if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) parts.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
14325
+ return parts;
14326
+ }
14327
+ function getTokenUsageLines(tokenUsage, isRedteam, tracker) {
14328
+ const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
14329
+ const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
14330
+ if (!hasEvalTokens && !hasGradingTokens) return [];
14331
+ const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
14332
+ const evalTokens = {
14333
+ prompt: tokenUsage.prompt || 0,
14334
+ completion: tokenUsage.completion || 0,
14335
+ total: tokenUsage.total || combinedTotal,
14336
+ cached: tokenUsage.cached || 0,
14337
+ numRequests: tokenUsage.numRequests || 0,
14338
+ completionDetails: tokenUsage.completionDetails || {
14339
+ reasoning: 0,
14340
+ acceptedPrediction: 0,
14341
+ rejectedPrediction: 0
14342
+ }
14343
+ };
14344
+ const lines = [`${chalk.bold("Total Tokens:")} ${chalk.white.bold((evalTokens.total + (tokenUsage.assertions?.total || 0)).toLocaleString())}`];
14345
+ if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.gray("Probes:")} ${chalk.white(tokenUsage.numRequests.toLocaleString())}`);
14346
+ if (evalTokens.total > 0) {
14347
+ const evalParts = buildUsageDetails(evalTokens, evalTokens.total);
14348
+ lines.push(` ${chalk.gray("Eval:")} ${chalk.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
14349
+ }
14350
+ if (tokenUsage.assertions?.total && tokenUsage.assertions.total > 0) {
14351
+ const gradingParts = buildUsageDetails(tokenUsage.assertions, tokenUsage.assertions.total);
14352
+ lines.push(` ${chalk.gray("Grading:")} ${chalk.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
14353
+ }
14354
+ lines.push(...getProviderUsageLines(tracker));
14355
+ return lines;
14356
+ }
14357
+ function getProviderUsageLines(tracker) {
14358
+ const providerIds = tracker.getProviderIds();
14359
+ if (providerIds.length <= 1) return [];
14360
+ const sortedProviders = providerIds.map((id) => ({
14361
+ id,
14362
+ usage: tracker.getProviderUsage(id)
14363
+ })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
14364
+ const lines = ["", chalk.bold("Providers:")];
14365
+ for (const { id, usage } of sortedProviders) {
14366
+ if ((usage.total || 0) === 0 && (usage.prompt || 0) + (usage.completion || 0) === 0) continue;
14367
+ const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
14368
+ const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
14369
+ const details = buildUsageDetails(usage, displayTotal);
14370
+ const requestInfo = `${usage.numRequests || 0} requests`;
14371
+ const separator = details.length > 0 ? "; " : "";
14372
+ lines.push(` ${chalk.gray(`${displayId}:`)} ${chalk.white(displayTotal.toLocaleString())} (${requestInfo}${separator}${details.join(", ")})`);
14373
+ }
14374
+ return lines;
14375
+ }
14376
+ function formatResultPercentage(count, totalTests) {
14377
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
14378
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
14379
+ }
14380
+ function formatResultLine(count, label, icon, iconColor, totalTests) {
14381
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count, totalTests)})`)}`;
14382
+ }
14383
+ function getResultsLines({ successes, failures, errors, duration, maxConcurrency }) {
14384
+ const totalTests = successes + failures + errors;
14385
+ const errorLabel = errors === 1 ? "error" : "errors";
14386
+ return [
14387
+ "",
14388
+ chalk.bold("Results:"),
14389
+ formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green, totalTests),
14390
+ formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red, totalTests),
14391
+ formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red, totalTests),
14392
+ chalk.gray(`Duration: ${formatDuration(duration)} (concurrency: ${maxConcurrency})`),
14393
+ ""
14394
+ ];
14395
+ }
13308
14396
  /**
13309
14397
  * Generate formatted evaluation summary output for CLI display.
13310
14398
  *
@@ -13343,115 +14431,28 @@ function formatDuration(seconds) {
13343
14431
  * ```
13344
14432
  */
13345
14433
  function generateEvalSummary(params) {
13346
- const { evalId, isRedteam, writeToDatabase, shareableUrl, wantsToShare, hasExplicitDisable, cloudEnabled, activelySharing = false, tokenUsage, successes, failures, errors, duration, maxConcurrency, tracker, targetErrorStatus } = params;
13347
- const lines = [];
13348
- const completionType = isRedteam ? "Red team" : "Eval";
13349
- const wasAborted = targetErrorStatus != null;
13350
- let completionMessage;
13351
- if (wasAborted) {
13352
- completionMessage = `${chalk.red("✗")} ${completionType} aborted`;
13353
- if (writeToDatabase) completionMessage += ` (ID: ${chalk.cyan(evalId)})`;
13354
- } else if (writeToDatabase && shareableUrl) completionMessage = `${chalk.green("✓")} ${completionType} complete: ${shareableUrl}`;
13355
- else if (writeToDatabase && activelySharing) completionMessage = `${chalk.green("✓")} ${completionType} complete`;
13356
- else if (writeToDatabase) completionMessage = `${chalk.green("✓")} ${completionType} complete (ID: ${chalk.cyan(evalId)})`;
13357
- else completionMessage = `${chalk.green("✓")} ${completionType} complete`;
13358
- lines.push(completionMessage);
13359
- if (wasAborted && targetErrorStatus != null) {
13360
- lines.push("");
13361
- lines.push(chalk.red.bold("Scan stopped: Target is unavailable and will not recover on retry."));
13362
- lines.push(chalk.red(` Target returned HTTP ${targetErrorStatus}`));
13363
- lines.push("");
13364
- lines.push(chalk.yellow("Possible causes:"));
13365
- lines.push(chalk.yellow(" • Invalid API key or authentication (401/403)"));
13366
- lines.push(chalk.yellow(" • Target endpoint does not exist (404)"));
13367
- lines.push(chalk.yellow(" • Server does not support the request (501)"));
13368
- lines.push("");
13369
- lines.push(chalk.cyan("To fix: Check your target configuration and credentials."));
13370
- }
13371
- if (writeToDatabase && !shareableUrl && !wantsToShare && !activelySharing) {
13372
- lines.push("");
13373
- lines.push(`» View results: ${chalk.green.bold("promptfoo view")}`);
13374
- if (!hasExplicitDisable) if (cloudEnabled) lines.push(`» Create shareable URL: ${chalk.green.bold("promptfoo share")}`);
13375
- else lines.push(`» Share with your team: ${chalk.green.bold("https://promptfoo.app")}`);
13376
- lines.push(`» Feedback: ${chalk.green.bold("https://promptfoo.dev/feedback")}`);
13377
- }
13378
- lines.push("");
13379
- const hasEvalTokens = (tokenUsage.total || 0) > 0 || (tokenUsage.prompt || 0) + (tokenUsage.completion || 0) > 0;
13380
- const hasGradingTokens = tokenUsage.assertions && (tokenUsage.assertions.total || 0) > 0;
13381
- if (hasEvalTokens || hasGradingTokens) {
13382
- const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
13383
- const evalTokens = {
13384
- prompt: tokenUsage.prompt || 0,
13385
- completion: tokenUsage.completion || 0,
13386
- total: tokenUsage.total || combinedTotal,
13387
- cached: tokenUsage.cached || 0,
13388
- completionDetails: tokenUsage.completionDetails || {
13389
- reasoning: 0,
13390
- acceptedPrediction: 0,
13391
- rejectedPrediction: 0
13392
- }
13393
- };
13394
- const grandTotal = evalTokens.total + (tokenUsage.assertions?.total || 0);
13395
- lines.push(`${chalk.bold("Total Tokens:")} ${chalk.white.bold(grandTotal.toLocaleString())}`);
13396
- if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.gray("Probes:")} ${chalk.white(tokenUsage.numRequests.toLocaleString())}`);
13397
- if (evalTokens.total > 0) {
13398
- const evalParts = [];
13399
- if (evalTokens.prompt > 0) evalParts.push(`${evalTokens.prompt.toLocaleString()} prompt`);
13400
- if (evalTokens.completion > 0) evalParts.push(`${evalTokens.completion.toLocaleString()} completion`);
13401
- if (evalTokens.cached > 0) if (evalTokens.cached === evalTokens.total && evalParts.length === 0) evalParts.push("cached");
13402
- else evalParts.push(`${evalTokens.cached.toLocaleString()} cached`);
13403
- if (evalTokens.completionDetails?.reasoning && evalTokens.completionDetails.reasoning > 0) evalParts.push(`${evalTokens.completionDetails.reasoning.toLocaleString()} reasoning`);
13404
- lines.push(` ${chalk.gray("Eval:")} ${chalk.white(evalTokens.total.toLocaleString())} (${evalParts.join(", ")})`);
13405
- }
13406
- if (tokenUsage.assertions && tokenUsage.assertions.total && tokenUsage.assertions.total > 0) {
13407
- const gradingParts = [];
13408
- if (tokenUsage.assertions.prompt && tokenUsage.assertions.prompt > 0) gradingParts.push(`${tokenUsage.assertions.prompt.toLocaleString()} prompt`);
13409
- if (tokenUsage.assertions.completion && tokenUsage.assertions.completion > 0) gradingParts.push(`${tokenUsage.assertions.completion.toLocaleString()} completion`);
13410
- if (tokenUsage.assertions.cached && tokenUsage.assertions.cached > 0) if (tokenUsage.assertions.cached === tokenUsage.assertions.total && gradingParts.length === 0) gradingParts.push("cached");
13411
- else gradingParts.push(`${tokenUsage.assertions.cached.toLocaleString()} cached`);
13412
- if (tokenUsage.assertions.completionDetails?.reasoning && tokenUsage.assertions.completionDetails.reasoning > 0) gradingParts.push(`${tokenUsage.assertions.completionDetails.reasoning.toLocaleString()} reasoning`);
13413
- lines.push(` ${chalk.gray("Grading:")} ${chalk.white(tokenUsage.assertions.total.toLocaleString())} (${gradingParts.join(", ")})`);
13414
- }
13415
- const providerIds = tracker.getProviderIds();
13416
- if (providerIds.length > 1) {
13417
- lines.push("");
13418
- lines.push(chalk.bold("Providers:"));
13419
- const sortedProviders = providerIds.map((id) => ({
13420
- id,
13421
- usage: tracker.getProviderUsage(id)
13422
- })).filter((p) => p.usage != null).sort((a, b) => (b.usage.total || 0) - (a.usage.total || 0));
13423
- for (const { id, usage } of sortedProviders) if ((usage.total || 0) > 0 || (usage.prompt || 0) + (usage.completion || 0) > 0) {
13424
- const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
13425
- const displayId = id.includes(" (") ? id.substring(0, id.indexOf(" (")) : id;
13426
- const details = [];
13427
- if (usage.prompt && usage.prompt > 0) details.push(`${usage.prompt.toLocaleString()} prompt`);
13428
- if (usage.completion && usage.completion > 0) details.push(`${usage.completion.toLocaleString()} completion`);
13429
- if (usage.cached && usage.cached > 0) if (usage.cached === displayTotal && details.length === 0) details.push("cached");
13430
- else details.push(`${usage.cached.toLocaleString()} cached`);
13431
- if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) details.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
13432
- const breakdown = ` (${`${usage.numRequests || 0} requests`}${details.length > 0 ? "; " : ""}${details.join(", ")})`;
13433
- lines.push(` ${chalk.gray(displayId + ":")} ${chalk.white(displayTotal.toLocaleString())}${breakdown}`);
13434
- }
13435
- }
13436
- }
13437
- lines.push("");
13438
- const totalTests = successes + failures + errors;
13439
- const formatResultPercentage = (count) => {
13440
- const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
13441
- return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
13442
- };
13443
- const formatResultLine = (count, label, icon, iconColor) => {
13444
- return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count)})`)}`;
13445
- };
13446
- const errorLabel = errors === 1 ? "error" : "errors";
13447
- lines.push(chalk.bold("Results:"));
13448
- lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green));
13449
- lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red));
13450
- lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red));
13451
- const durationDisplay = formatDuration(duration);
13452
- lines.push(chalk.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
13453
- lines.push("");
13454
- return lines;
14434
+ return [
14435
+ getCompletionMessage({
14436
+ completionType: params.isRedteam ? "Red team" : "Eval",
14437
+ evalId: params.evalId,
14438
+ shareableUrl: params.shareableUrl,
14439
+ wasAborted: params.targetErrorStatus != null,
14440
+ writeToDatabase: params.writeToDatabase,
14441
+ activelySharing: params.activelySharing ?? false
14442
+ }),
14443
+ ...getAbortSummaryLines(params.targetErrorStatus),
14444
+ ...getGuidanceLines({
14445
+ writeToDatabase: params.writeToDatabase,
14446
+ shareableUrl: params.shareableUrl,
14447
+ wantsToShare: params.wantsToShare,
14448
+ activelySharing: params.activelySharing ?? false,
14449
+ hasExplicitDisable: params.hasExplicitDisable,
14450
+ cloudEnabled: params.cloudEnabled
14451
+ }),
14452
+ "",
14453
+ ...getTokenUsageLines(params.tokenUsage, params.isRedteam, params.tracker),
14454
+ ...getResultsLines(params)
14455
+ ];
13455
14456
  }
13456
14457
  //#endregion
13457
14458
  //#region src/commands/retry.ts
@@ -13703,14 +14704,11 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
13703
14704
  state.resume = true;
13704
14705
  state.retryMode = true;
13705
14706
  } else ({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs(cmdObj, defaultConfig));
13706
- if (!cmdObj.envPath && commandLineOptions?.envPath) {
14707
+ if ((!cmdObj.envPath || cmdObj.envPath.length === 0) && commandLineOptions?.envPath) {
13707
14708
  logger.debug(`Loading additional environment from config: ${commandLineOptions.envPath}`);
13708
14709
  setupEnv(commandLineOptions.envPath);
13709
14710
  }
13710
- if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) logger.warn(chalk.yellow(dedent`
13711
- Warning: Config file has a redteam section but no test cases.
13712
- Did you mean to run ${chalk.bold("promptfoo redteam generate")} instead?
13713
- `));
14711
+ warnIfRedteamConfigHasNoTests(config, testSuite);
13714
14712
  if (config.redteam && Array.isArray(config.providers) && config.providers.length > 0 && typeof config.providers[0] === "object" && config.providers[0].id === "http") {
13715
14713
  const maybeUrl = config.providers[0]?.config?.url;
13716
14714
  if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) telemetry.record("feature_used", { feature: "redteam_run_with_example" });
@@ -13818,7 +14816,14 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
13818
14816
  ${z.prettifyError(testSuiteSchema.error)}
13819
14817
 
13820
14818
  Please review your promptfooconfig.yaml configuration.`));
13821
- const evalRecord = resumeEval ? resumeEval : cmdObj.write ? await Eval.create(config, testSuite.prompts, { runtimeOptions: options }) : new Eval(config, { runtimeOptions: options });
14819
+ const author = getAuthor();
14820
+ const evalRecord = resumeEval ? resumeEval : cmdObj.write ? await Eval.create(config, testSuite.prompts, {
14821
+ author,
14822
+ runtimeOptions: options
14823
+ }) : new Eval(config, {
14824
+ author,
14825
+ runtimeOptions: options
14826
+ });
13822
14827
  const abortController = new AbortController();
13823
14828
  const previousAbortSignal = evaluateOptions.abortSignal;
13824
14829
  evaluateOptions.abortSignal = previousAbortSignal ? AbortSignal.any([previousAbortSignal, abortController.signal]) : abortController.signal;
@@ -14220,65 +15225,175 @@ async function doRedteamRun(options) {
14220
15225
  return evalResult;
14221
15226
  }
14222
15227
  //#endregion
15228
+ //#region src/types/transform.ts
15229
+ /** Runtime type guard for `TransformFunction` values. */
15230
+ function isTransformFunction(value) {
15231
+ return typeof value === "function";
15232
+ }
15233
+ //#endregion
14223
15234
  //#region src/index.ts
15235
+ /**
15236
+ * Shallow-clone a test case so the caller can swap in resolved ApiProvider
15237
+ * instances on `options.provider` / `assert[].provider` without leaking those
15238
+ * mutations back to the input. The input may alias the unified config written
15239
+ * to the Eval record, and a live SDK client (e.g. Bedrock's BedrockRuntime,
15240
+ * Anthropic's client) holds circular references that break drizzle's JSON
15241
+ * serialization on `evalRecord.save()`. Fixes #8687.
15242
+ *
15243
+ * Detaches only `options` and `assert[]`. Other reference fields (`provider`,
15244
+ * `vars`, `metadata`, `providerOutput`) remain aliased — callers must reassign
15245
+ * those by reference rather than mutating in place. `assert-set` children are
15246
+ * not deep-cloned because the resolve loop skips `assert-set`; if that ever
15247
+ * changes, extend this helper.
15248
+ */
15249
+ function cloneTestForResolve(test) {
15250
+ const cloned = { ...test };
15251
+ if (test.options) cloned.options = { ...test.options };
15252
+ if (test.assert) cloned.assert = test.assert.map((assertion) => ({ ...assertion }));
15253
+ return cloned;
15254
+ }
15255
+ function toSerializableProviderRef(provider) {
15256
+ if (isApiProvider(provider)) return sanitizeProvider(provider);
15257
+ if (Array.isArray(provider)) return provider.map(toSerializableProviderRef);
15258
+ return provider;
15259
+ }
15260
+ function isRecord(value) {
15261
+ return Boolean(value && typeof value === "object" && !Array.isArray(value));
15262
+ }
15263
+ function withSerializableProvider(record) {
15264
+ if (!isApiProvider(record.provider)) return record;
15265
+ return {
15266
+ ...record,
15267
+ provider: sanitizeProvider(record.provider)
15268
+ };
15269
+ }
15270
+ /**
15271
+ * Function-valued transforms are first-class at runtime but are silently dropped
15272
+ * by `JSON.stringify`. Persisted eval configs (drizzle-stored) must never retain
15273
+ * a function reference, so replace every `transform`-like field with a
15274
+ * `[inline function]: name` marker. Non-function values pass through unchanged.
15275
+ *
15276
+ * `droppedRef.value` is flipped to `true` the first time a function is replaced
15277
+ * so the caller can emit a single warning instead of logging per field.
15278
+ */
15279
+ function replaceFunctionTransforms(record, droppedRef) {
15280
+ let result;
15281
+ for (const key of TRANSFORM_KEYS) {
15282
+ const value = record[key];
15283
+ if (!isTransformFunction(value)) continue;
15284
+ if (!result) result = { ...record };
15285
+ result[key] = value.name ? `${INLINE_FUNCTION_LABEL}: ${value.name}` : INLINE_FUNCTION_LABEL;
15286
+ droppedRef.value = true;
15287
+ }
15288
+ return result ?? record;
15289
+ }
15290
+ function toSerializableAssertion(assertion, droppedRef) {
15291
+ if (!isRecord(assertion)) return assertion;
15292
+ let sanitizedAssertion = withSerializableProvider(assertion);
15293
+ sanitizedAssertion = replaceFunctionTransforms(sanitizedAssertion, droppedRef);
15294
+ if (Array.isArray(assertion.assert)) sanitizedAssertion = {
15295
+ ...sanitizedAssertion,
15296
+ assert: assertion.assert.map((a) => toSerializableAssertion(a, droppedRef))
15297
+ };
15298
+ return sanitizedAssertion;
15299
+ }
15300
+ function toSerializableTestCase(test, droppedRef) {
15301
+ if (!isRecord(test)) return test;
15302
+ let sanitizedTest = withSerializableProvider(test);
15303
+ if (isRecord(test.options)) {
15304
+ let options = withSerializableProvider(test.options);
15305
+ options = replaceFunctionTransforms(options, droppedRef);
15306
+ if (options !== test.options) sanitizedTest = {
15307
+ ...sanitizedTest,
15308
+ options
15309
+ };
15310
+ }
15311
+ if (Array.isArray(test.assert)) sanitizedTest = {
15312
+ ...sanitizedTest,
15313
+ assert: test.assert.map((a) => toSerializableAssertion(a, droppedRef))
15314
+ };
15315
+ return sanitizedTest;
15316
+ }
15317
+ function toSerializableScenario(scenario, droppedRef) {
15318
+ if (!isRecord(scenario)) return scenario;
15319
+ if (!Array.isArray(scenario.tests)) return scenario;
15320
+ return {
15321
+ ...scenario,
15322
+ tests: scenario.tests.map((t) => toSerializableTestCase(t, droppedRef))
15323
+ };
15324
+ }
15325
+ function createSerializableUnifiedConfig(testSuite, prompts) {
15326
+ const droppedRef = { value: false };
15327
+ const config = {
15328
+ ...testSuite,
15329
+ providers: toSerializableProviderRef(testSuite.providers),
15330
+ defaultTest: toSerializableTestCase(testSuite.defaultTest, droppedRef),
15331
+ tests: Array.isArray(testSuite.tests) ? testSuite.tests.map((t) => toSerializableTestCase(t, droppedRef)) : testSuite.tests,
15332
+ scenarios: Array.isArray(testSuite.scenarios) ? testSuite.scenarios.map((s) => toSerializableScenario(s, droppedRef)) : testSuite.scenarios,
15333
+ prompts
15334
+ };
15335
+ if (droppedRef.value && testSuite.writeLatestResults) logger.warn("Function-valued transform(s) in testSuite were replaced with \"[inline function]\" markers in the persisted config. Re-running the saved eval will not invoke them; use string expressions or file:// references if you need the config to round-trip.");
15336
+ return config;
15337
+ }
14224
15338
  async function evaluate(testSuite, options = {}) {
14225
- if (testSuite.writeLatestResults) await runDbMigrations();
14226
- const loadedProviders = await loadApiProviders(testSuite.providers, { env: testSuite.env });
15339
+ const { author: suiteAuthor, ...testSuiteConfig } = testSuite;
15340
+ if (testSuiteConfig.writeLatestResults) await runDbMigrations();
15341
+ const loadedProviders = await loadApiProviders(testSuiteConfig.providers, { env: testSuiteConfig.env });
14227
15342
  const providerMap = {};
14228
15343
  for (const p of loadedProviders) {
14229
15344
  providerMap[p.id()] = p;
14230
15345
  if (p.label) providerMap[p.label] = p;
14231
15346
  }
14232
- let resolvedDefaultTest = testSuite.defaultTest;
14233
- if (typeof testSuite.defaultTest === "string" && testSuite.defaultTest.startsWith("file://")) resolvedDefaultTest = await maybeLoadFromExternalFile(testSuite.defaultTest);
15347
+ let resolvedDefaultTest = testSuiteConfig.defaultTest;
15348
+ if (typeof testSuiteConfig.defaultTest === "string" && testSuiteConfig.defaultTest.startsWith("file://")) resolvedDefaultTest = await maybeLoadFromExternalFile(testSuiteConfig.defaultTest);
14234
15349
  const constructedTestSuite = {
14235
- ...testSuite,
15350
+ ...testSuiteConfig,
14236
15351
  defaultTest: resolvedDefaultTest,
14237
- scenarios: testSuite.scenarios,
15352
+ scenarios: testSuiteConfig.scenarios,
14238
15353
  providers: loadedProviders,
14239
- tests: await readTests(testSuite.tests),
14240
- nunjucksFilters: await readFilters(testSuite.nunjucksFilters || {}),
14241
- prompts: await processPrompts(testSuite.prompts)
14242
- };
14243
- if (typeof constructedTestSuite.defaultTest === "object") {
14244
- if (constructedTestSuite.defaultTest?.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
14245
- env: testSuite.env,
15354
+ tests: await readTests(testSuiteConfig.tests),
15355
+ nunjucksFilters: await readFilters(testSuiteConfig.nunjucksFilters || {}),
15356
+ prompts: await processPrompts(testSuiteConfig.prompts)
15357
+ };
15358
+ if (typeof constructedTestSuite.defaultTest === "object" && constructedTestSuite.defaultTest) {
15359
+ constructedTestSuite.defaultTest = cloneTestForResolve(constructedTestSuite.defaultTest);
15360
+ if (constructedTestSuite.defaultTest.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
15361
+ env: testSuiteConfig.env,
14246
15362
  basePath: state.basePath
14247
15363
  });
14248
- if (constructedTestSuite.defaultTest?.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
14249
- env: testSuite.env,
15364
+ if (constructedTestSuite.defaultTest.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
15365
+ env: testSuiteConfig.env,
14250
15366
  basePath: state.basePath
14251
15367
  });
14252
15368
  }
14253
- for (const test of constructedTestSuite.tests || []) {
15369
+ constructedTestSuite.tests = (constructedTestSuite.tests || []).map(cloneTestForResolve);
15370
+ for (const test of constructedTestSuite.tests) {
14254
15371
  if (test.options?.provider && !isApiProvider(test.options.provider)) test.options.provider = await resolveProvider(test.options.provider, providerMap, {
14255
- env: testSuite.env,
15372
+ env: testSuiteConfig.env,
14256
15373
  basePath: state.basePath
14257
15374
  });
14258
- if (test.assert) for (const assertion of test.assert) {
15375
+ for (const assertion of test.assert || []) {
14259
15376
  if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
14260
15377
  if (assertion.provider && !isApiProvider(assertion.provider)) assertion.provider = await resolveProvider(assertion.provider, providerMap, {
14261
- env: testSuite.env,
15378
+ env: testSuiteConfig.env,
14262
15379
  basePath: state.basePath
14263
15380
  });
14264
15381
  }
14265
15382
  }
14266
15383
  if (options.cache === false) disableCache();
14267
- const parsedProviderPromptMap = readProviderPromptMap(testSuite, constructedTestSuite.prompts);
14268
- const unifiedConfig = {
14269
- ...testSuite,
14270
- prompts: constructedTestSuite.prompts
14271
- };
14272
- const evalRecord = testSuite.writeLatestResults ? await Eval.create(unifiedConfig, constructedTestSuite.prompts) : new Eval(unifiedConfig);
15384
+ const parsedProviderPromptMap = readProviderPromptMap(testSuiteConfig, constructedTestSuite.prompts);
15385
+ const unifiedConfig = createSerializableUnifiedConfig(testSuiteConfig, constructedTestSuite.prompts);
15386
+ const author = getAuthor(suiteAuthor);
15387
+ const evalRecord = testSuiteConfig.writeLatestResults ? await Eval.create(unifiedConfig, constructedTestSuite.prompts, { author }) : new Eval(unifiedConfig, { author });
14273
15388
  const ret = await evaluate$1({
14274
15389
  ...constructedTestSuite,
14275
15390
  providerPromptMap: parsedProviderPromptMap
14276
15391
  }, evalRecord, {
14277
15392
  eventSource: "library",
14278
- isRedteam: Boolean(testSuite.redteam),
15393
+ isRedteam: Boolean(testSuiteConfig.redteam),
14279
15394
  ...options
14280
15395
  });
14281
- if (testSuite.writeLatestResults && testSuite.sharing) if (isSharingEnabled(ret)) try {
15396
+ if (testSuiteConfig.writeLatestResults && testSuiteConfig.sharing) if (isSharingEnabled(ret)) try {
14282
15397
  const shareableUrl = await createShareableUrl(ret, { silent: true });
14283
15398
  if (shareableUrl) {
14284
15399
  ret.shareableUrl = shareableUrl;
@@ -14289,9 +15404,9 @@ async function evaluate(testSuite, options = {}) {
14289
15404
  logger.warn(`Failed to create shareable URL: ${error}`);
14290
15405
  }
14291
15406
  else logger.debug("Sharing requested but not enabled (check cloud config or sharing settings)");
14292
- if (testSuite.outputPath) {
14293
- if (typeof testSuite.outputPath === "string") await writeOutput(testSuite.outputPath, evalRecord, null);
14294
- else if (Array.isArray(testSuite.outputPath)) await writeMultipleOutputs(testSuite.outputPath, evalRecord, null);
15407
+ if (testSuiteConfig.outputPath) {
15408
+ if (typeof testSuiteConfig.outputPath === "string") await writeOutput(testSuiteConfig.outputPath, evalRecord, null);
15409
+ else if (Array.isArray(testSuiteConfig.outputPath)) await writeMultipleOutputs(testSuiteConfig.outputPath, evalRecord, null);
14295
15410
  }
14296
15411
  return ret;
14297
15412
  }
@@ -14320,6 +15435,6 @@ var src_default = {
14320
15435
  redteam
14321
15436
  };
14322
15437
  //#endregion
14323
- export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, cache_exports as cache, src_default as default, evaluate, generateTable, guardrails, isApiProvider, isGradingResult, isProviderOptions, isResultFailureReason, loadApiProvider, redteam };
15438
+ export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, DocumentMediaInjectionPlacementSchema, DocumentMediaInjectionPlacementValues, DocxInjectionPlacementSchema, DocxInjectionPlacementValues, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputConfigSchema, InputDefinitionObjectSchema, InputDefinitionSchema, InputTypeSchema, InputTypeValues, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, buildInputPromptDescription, cache_exports as cache, src_default as default, evaluate, generateTable, getInputDescription, getInputType, guardrails, isApiProvider, isGradingResult, isProviderOptions, isResultFailureReason, isTransformFunction, loadApiProvider, normalizeInputDefinition, normalizeInputs, redteam };
14324
15439
 
14325
15440
  //# sourceMappingURL=index.js.map