promptfoo 0.121.2 → 0.121.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315):
  1. package/README.md +2 -0
  2. package/dist/src/{accounts-CiBLOnA7.js → accounts-B2XmGjty.js} +5 -5
  3. package/dist/src/{accounts-gtkH-5KX.cjs → accounts-BPyfpSeU.cjs} +5 -5
  4. package/dist/src/{accounts-Bm2D8Db9.js → accounts-CFLK3mnD.js} +6 -6
  5. package/dist/src/{accounts-B0pgC1oV.js → accounts-Xatc0RYb.js} +5 -5
  6. package/dist/src/{agentic-utils-DS1g3GLF.js → agentic-utils-36epdqwB.js} +3 -3
  7. package/dist/src/{cometapi-CUQq3H_a.js → agentic-utils-D8yXo5Lm.js} +4 -61
  8. package/dist/src/{cometapi-C4xSqeID.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
  9. package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
  10. package/dist/src/{agents-CBr9A01V.js → agents-BBVJCIYr.js} +226 -13
  11. package/dist/src/{agents-Di9DKPzn.cjs → agents-BBWxKSM0.cjs} +7 -7
  12. package/dist/src/{agents-DgF2zDag.js → agents-Bqgfdokm.js} +228 -13
  13. package/dist/src/{agents-DbRtpYxR.cjs → agents-CAYbM7qD.cjs} +226 -13
  14. package/dist/src/{agents-9qiOy0ho.js → agents-CLQ-P15P.js} +7 -7
  15. package/dist/src/{agents-cLXA8a_8.js → agents-CgBniSlI.js} +8 -8
  16. package/dist/src/{agents-D__IdAlg.js → agents-DSSTV4bv.js} +226 -15
  17. package/dist/src/{agents-CmvBq8LV.js → agents-wg3ohknq.js} +7 -7
  18. package/dist/src/{aimlapi-BvlNH0gr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
  19. package/dist/src/{aimlapi-DHJU_kcV.js → aimlapi-BwGC1TtS.js} +13 -13
  20. package/dist/src/{aimlapi-CnkC2HqE.js → aimlapi-DaC3qZ-o.js} +14 -14
  21. package/dist/src/{aimlapi-B4rcnZgv.js → aimlapi-MgSLdvy7.js} +13 -13
  22. package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
  23. package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
  24. package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
  25. package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
  26. package/dist/src/app/index.html +3 -3
  27. package/dist/src/{audio-Bkv46et0.js → audio-Bn44pQxv.js} +4 -4
  28. package/dist/src/{audio-ClI_AFre.js → audio-DDA5WHdx.js} +4 -4
  29. package/dist/src/{audio-CGMyULza.cjs → audio-DVFjQ67_.cjs} +4 -4
  30. package/dist/src/{audio-Dz3z7s3J.js → audio-DjU9GswO.js} +5 -5
  31. package/dist/src/{base-CGrhspbK.cjs → base-BboXIF_0.cjs} +3 -3
  32. package/dist/src/{base-Dy1V8--Z.js → base-CKjwebIH.js} +3 -3
  33. package/dist/src/{base-DLKtKMFh.js → base-CqzQ4K8j.js} +3 -3
  34. package/dist/src/{base-CpjcHe4e.js → base-Cz2ZC_iA.js} +3 -3
  35. package/dist/src/{blobs-CMHN0Qcz.js → blobs-B1JriOyi.js} +3 -3
  36. package/dist/src/{blobs-BDbfYdrJ.js → blobs-BUWmKWzo.js} +3 -3
  37. package/dist/src/{blobs-D23XLin-.cjs → blobs-C6j0bvFz.cjs} +3 -3
  38. package/dist/src/{blobs-CBO20krR.js → blobs-DXTl6J3H.js} +3 -3
  39. package/dist/src/{cache-Dh5WtQps.cjs → cache-C5yFZ4gC.cjs} +3 -3
  40. package/dist/src/{cache-C4Nxf52C.js → cache-CaT5tPgo.js} +3 -3
  41. package/dist/src/cache-CyCanoMu.js +6 -0
  42. package/dist/src/{cache-BVeDlD87.js → cache-DSqR6ezl.js} +3 -3
  43. package/dist/src/cache-Df_QFDNu.cjs +5 -0
  44. package/dist/src/{cache-i1P6crbO.js → cache-HP0NP4k3.js} +3 -3
  45. package/dist/src/{chat-CzkrVDfz.js → chat-B-52XYI1.js} +12 -12
  46. package/dist/src/{chat-DJIw17u0.js → chat-B0iaWhoh.js} +14 -14
  47. package/dist/src/{chat-qmatte1u.js → chat-BE0qTA8e.js} +13 -13
  48. package/dist/src/{chat-BiKyneZl.js → chat-BEwdgGEg.js} +14 -14
  49. package/dist/src/{chat-C1Qst7jL.cjs → chat-BtIKkLKx.cjs} +13 -13
  50. package/dist/src/{chat-CgF-J-Jj.cjs → chat-CM8qWR3_.cjs} +15 -15
  51. package/dist/src/{chat-C2jrdPMx.js → chat-DK1U-eZ-.js} +12 -12
  52. package/dist/src/{chat-DqxYYtWA.js → chat-pxmiVpWe.js} +14 -14
  53. package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
  54. package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
  55. package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
  56. package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
  57. package/dist/src/{claude-agent-sdk-D9Z5Pr9X.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
  58. package/dist/src/{claude-agent-sdk-DfCoW0E6.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
  59. package/dist/src/{claude-agent-sdk-Apiy0iaz.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
  60. package/dist/src/{claude-agent-sdk-D2bJee9S.js → claude-agent-sdk-HgbFioFw.js} +33 -15
  61. package/dist/src/cloud-DE3t1-ZI.js +4 -0
  62. package/dist/src/{cloud-C0dlstV_.js → cloud-z8KZpUoa.js} +3 -3
  63. package/dist/src/{cloudflare-ai-g7PB6VHR.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
  64. package/dist/src/{cloudflare-ai-8TDxHR0x.js → cloudflare-ai-Bbp26N0L.js} +13 -13
  65. package/dist/src/{cloudflare-ai-CknbZ5LJ.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
  66. package/dist/src/{cloudflare-ai-BxAGvfju.js → cloudflare-ai-DdKP9TKT.js} +14 -14
  67. package/dist/src/{cloudflare-gateway-CP9QEWYS.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
  68. package/dist/src/{cloudflare-gateway-B9HWA5wf.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
  69. package/dist/src/{cloudflare-gateway-CKDb4dJ8.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
  70. package/dist/src/{cloudflare-gateway-BSnDmHYo.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
  71. package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
  72. package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
  73. package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
  74. package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
  75. package/dist/src/cometapi-BDyV-NNm.js +62 -0
  76. package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
  77. package/dist/src/{cometapi-BL9yvj_f.js → cometapi-hhL4TAh3.js} +14 -14
  78. package/dist/src/{cometapi-DFNiKmSz.js → cometapi-sp7sJpBD.js} +15 -15
  79. package/dist/src/{completion-5MzrpJxT.js → completion-BCimtq-h.js} +6 -6
  80. package/dist/src/{completion-qRoZAYRB.js → completion-DCjv7RZ3.js} +6 -6
  81. package/dist/src/{completion-CM6oK8PS.cjs → completion-DlXUhj5c.cjs} +6 -6
  82. package/dist/src/{completion-DZ083F31.js → completion-DoYy49ti.js} +6 -6
  83. package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
  84. package/dist/src/{docker-DcF2pRrj.cjs → docker-Cqj2-QVi.cjs} +14 -14
  85. package/dist/src/{docker-Bb5dcxr8.js → docker-CxCkwMzc.js} +13 -13
  86. package/dist/src/{docker-BvfL2BrW.js → docker-DpguQj-w.js} +14 -14
  87. package/dist/src/{docker-ExVyLp0S.js → docker-FeBni2dw.js} +13 -13
  88. package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
  89. package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
  90. package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
  91. package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
  92. package/dist/src/eval-7aEqoMs3.js +15 -0
  93. package/dist/src/{eval-Dg2nG4v2.js → eval-BTqTn7lb.js} +10 -10
  94. package/dist/src/{evalResult-BDMqrapS.js → evalResult-BkIhRdTe.js} +7 -7
  95. package/dist/src/evalResult-CYNHkk5A.js +12 -0
  96. package/dist/src/evalResult-CuvJeNiM.js +10 -0
  97. package/dist/src/{evalResult-BBRNtX4I.js → evalResult-DUDShQrm.js} +7 -7
  98. package/dist/src/{evalResult-fuaI8HkH.cjs → evalResult-DpARzUCb.cjs} +7 -7
  99. package/dist/src/evalResult-tGdilrWt.cjs +10 -0
  100. package/dist/src/evaluator-BBUqRhz1.js +36 -0
  101. package/dist/src/{evaluator-BhoWwp5b.js → evaluator-BcvOGaam.js} +823 -73
  102. package/dist/src/{extractor-D25qpmGX.js → extractor-C8XwivI9.js} +6 -6
  103. package/dist/src/{extractor-DReVID0K.js → extractor-CAZ2G3Kh.js} +6 -6
  104. package/dist/src/{extractor-pYLLi3wS.cjs → extractor-DG3sSfXE.cjs} +6 -6
  105. package/dist/src/{extractor-C0EVHewb.js → extractor-D_wd8jxt.js} +6 -6
  106. package/dist/src/{fetch-HaqdX7U1.js → fetch-BiYv2BZc.js} +3 -3
  107. package/dist/src/{fetch-BPkYtG8K.cjs → fetch-BnR9wSnm.cjs} +3 -3
  108. package/dist/src/{fetch-Cwxnd8zz.js → fetch-CVAtKnI3.js} +3 -3
  109. package/dist/src/{fetch-Dxpd4_sr.js → fetch-DoVRJZhJ.js} +4 -4
  110. package/dist/src/fetch-UWU706qb.js +5 -0
  111. package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
  112. package/dist/src/{graders-DU49_J8Y.cjs → graders-BElhu9ZY.cjs} +126 -55
  113. package/dist/src/{graders-DP7KFFo-.js → graders-BXAJ0sbS.js} +120 -55
  114. package/dist/src/graders-BxfEguVY.js +32 -0
  115. package/dist/src/graders-CzVMbEnv.js +34 -0
  116. package/dist/src/{graders-BTeBGqjJ.js → graders-DG7mhg-b.js} +120 -55
  117. package/dist/src/graders-DjCXfj0l.cjs +32 -0
  118. package/dist/src/{graders-Bj_Odv7c.js → graders-RjHF8VfG.js} +120 -55
  119. package/dist/src/graders-kHzIWOKu.js +32 -0
  120. package/dist/src/{image-BLmROtN3.cjs → image--F58eEIn.cjs} +6 -6
  121. package/dist/src/{image-B0h9VEMc.js → image-6WQXK8m8.js} +4 -4
  122. package/dist/src/{image-Dpxa1Jt6.js → image-B8b6f36E.js} +6 -6
  123. package/dist/src/{image-CHfWvljl.js → image-CoxZp9PZ.js} +6 -6
  124. package/dist/src/{image-B02ogr_b.js → image-DO0RYnjH.js} +5 -5
  125. package/dist/src/{image-DS-o-0ph.js → image-PoF6DN3x.js} +6 -6
  126. package/dist/src/{image-C1madmKh.cjs → image-fza3zuKs.cjs} +4 -4
  127. package/dist/src/{image-Bb4vWQLM.js → image-xNbw5ph2.js} +4 -4
  128. package/dist/src/index.cjs +853 -104
  129. package/dist/src/index.d.cts +573 -60
  130. package/dist/src/index.d.ts +573 -60
  131. package/dist/src/index.js +850 -102
  132. package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
  133. package/dist/src/{knowledgeBase-B3OoKIej.js → knowledgeBase-Bi7CmDbx.js} +7 -7
  134. package/dist/src/{knowledgeBase-CYTLHOt1.js → knowledgeBase-Ce3ofVan.js} +8 -8
  135. package/dist/src/{knowledgeBase-D33Ty2l6.js → knowledgeBase-DFRXPZl_.js} +7 -7
  136. package/dist/src/{knowledgeBase-DOO_BM9b.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
  137. package/dist/src/{litellm-AaeZcZQF.js → litellm-Bo2gQXpo.js} +14 -14
  138. package/dist/src/{litellm-NbjknEh6.js → litellm-CKiAxnoM.js} +13 -13
  139. package/dist/src/{litellm-I_hbp_dc.cjs → litellm-CnHI69aj.cjs} +14 -14
  140. package/dist/src/{litellm-TrljxD9G.js → litellm-Tc294Jhj.js} +13 -13
  141. package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
  142. package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
  143. package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
  144. package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
  145. package/dist/src/{luma-ray-BS2_tY8L.js → luma-ray-0ehMPt5N.js} +10 -10
  146. package/dist/src/{luma-ray-DDsjcgZZ.js → luma-ray-C9q8rdQe.js} +9 -9
  147. package/dist/src/{luma-ray-f6I2fft-.js → luma-ray-DP0QA9qn.js} +9 -9
  148. package/dist/src/{luma-ray-Due0n7di.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
  149. package/dist/src/main.js +69 -71
  150. package/dist/src/{messages-D0lx5qK7.js → messages-DJNo37Ko.js} +14 -9
  151. package/dist/src/{messages-BS17jdMx.js → messages-Dy9QecMs.js} +14 -9
  152. package/dist/src/{messages-Bs1kC7P4.cjs → messages-HJsyEh4o.cjs} +15 -10
  153. package/dist/src/{messages-ZJk778GH.js → messages-biC_ex-p.js} +14 -9
  154. package/dist/src/{modelslab-DRb74SP4.js → modelslab-B5J-ZM5c.js} +9 -9
  155. package/dist/src/{modelslab-Bx9IrZfS.js → modelslab-BI458moT.js} +10 -10
  156. package/dist/src/{modelslab-Bmni6skY.js → modelslab-BTOT8FUO.js} +9 -9
  157. package/dist/src/{modelslab-CoUX6Jc_.cjs → modelslab-IQbNg-r7.cjs} +9 -9
  158. package/dist/src/{nova-reel-bgjxilYW.js → nova-reel-BZ9y-Y5s.js} +9 -9
  159. package/dist/src/{nova-reel-C_QM18Xn.cjs → nova-reel-CE5etkv9.cjs} +9 -9
  160. package/dist/src/{nova-reel-D_W1tjMH.js → nova-reel-DEeQlnOJ.js} +10 -10
  161. package/dist/src/{nova-reel-BfPq-0Yk.js → nova-reel-Xw1SXLpg.js} +9 -9
  162. package/dist/src/{nova-sonic-De1HW5fD.js → nova-sonic-DWswpN1E.js} +7 -7
  163. package/dist/src/{nova-sonic-CFb5GYhg.js → nova-sonic-DXTLpi-r.js} +6 -6
  164. package/dist/src/{nova-sonic-zfcljeRp.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
  165. package/dist/src/{nova-sonic-DIGQNR07.js → nova-sonic-Ogqf-csn.js} +6 -6
  166. package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
  167. package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
  168. package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
  169. package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
  170. package/dist/src/{openclaw-tiVYRtr-.js → openclaw-0Sv7AK3O.js} +13 -13
  171. package/dist/src/{openclaw-CSugPYAr.cjs → openclaw-CXxbKgDH.cjs} +14 -14
  172. package/dist/src/{openclaw-DuvJKEW5.js → openclaw-D1FSCps-.js} +13 -13
  173. package/dist/src/{openclaw-DiSz3I5L.js → openclaw-D2ENvu7a.js} +14 -14
  174. package/dist/src/{opencode-sdk-0j6rTWNb.js → opencode-sdk-C71Z0ehR.js} +13 -13
  175. package/dist/src/{opencode-sdk-B3CWY9h_.js → opencode-sdk-CHCs7dEb.js} +12 -12
  176. package/dist/src/{opencode-sdk-C2y6UkP2.js → opencode-sdk-DDxj4QqH.js} +12 -12
  177. package/dist/src/{opencode-sdk-BL764Jdi.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
  178. package/dist/src/{otlpReceiver-C99PPb48.js → otlpReceiver-C9KlUtxh.js} +6 -6
  179. package/dist/src/{otlpReceiver-CdNBdbsk.js → otlpReceiver-CZL48YfC.js} +6 -6
  180. package/dist/src/{otlpReceiver-D89fR-rC.js → otlpReceiver-CavGAA6k.js} +6 -6
  181. package/dist/src/{otlpReceiver-CGq6LspY.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
  182. package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
  183. package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
  184. package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
  185. package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
  186. package/dist/src/providers-BSLEaIQG.js +32 -0
  187. package/dist/src/{providers-CgKOSgTR.cjs → providers-CScd1wN6.cjs} +733 -464
  188. package/dist/src/{providers-BlqUifFg.js → providers-Ch6Mr0gn.js} +795 -526
  189. package/dist/src/{providers-Dk_6ocUX.js → providers-Cn73d5sr.js} +795 -526
  190. package/dist/src/providers-D-FnDg8k.cjs +31 -0
  191. package/dist/src/providers-DEYiFVAo.js +30 -0
  192. package/dist/src/{providers-D8lF1sqW.js → providers-DvddrgxL.js} +795 -526
  193. package/dist/src/providers-sS2WI8YD.js +30 -0
  194. package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
  195. package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
  196. package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
  197. package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
  198. package/dist/src/{quiverai-CIaELU_m.js → quiverai-BeofbLVc.js} +4 -4
  199. package/dist/src/{quiverai-uH-dcTIr.js → quiverai-CCQn73lq.js} +5 -5
  200. package/dist/src/{quiverai-PdShCPox.cjs → quiverai-CcUhPIBg.cjs} +4 -4
  201. package/dist/src/{quiverai-BbOUOn2L.js → quiverai-DVSEqJiq.js} +4 -4
  202. package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
  203. package/dist/src/{responses-WNGNYe3K.js → responses-BKP_WYis.js} +14 -10
  204. package/dist/src/{responses-DIR9Ud3j.js → responses-CQb1Tj69.js} +14 -10
  205. package/dist/src/{responses-CB2jwoAr.js → responses-CgNyTPsY.js} +14 -10
  206. package/dist/src/{responses-D8SBTL64.cjs → responses-mo0KQDbu.cjs} +14 -10
  207. package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
  208. package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
  209. package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
  210. package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
  211. package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
  212. package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
  213. package/dist/src/{sagemaker-CNBxx5CJ.js → sagemaker-CVv8W7so.js} +17 -17
  214. package/dist/src/{sagemaker-CemTFp2h.js → sagemaker-CqeASYE5.js} +17 -17
  215. package/dist/src/{sagemaker-YSyBXQQh.js → sagemaker-MUbD5V3v.js} +18 -18
  216. package/dist/src/{sagemaker-Cl28mZU2.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
  217. package/dist/src/{scanner-BsBlNXNn.js → scanner-DVDeUz1r.js} +10 -10
  218. package/dist/src/server/index.js +854 -106
  219. package/dist/src/server-B0Xh1Gx-.js +7 -0
  220. package/dist/src/{server-C_7Ax-hA.cjs → server-BtoCXeXI.cjs} +4 -4
  221. package/dist/src/{server-VWgWb00X.js → server-CP9qKM40.js} +4 -4
  222. package/dist/src/{server-CuxBbeSY.js → server-Cns05F1j.js} +5 -5
  223. package/dist/src/server-DJTKu9IR.cjs +5 -0
  224. package/dist/src/{server-CqzrVGpF.js → server-DZ9MtCn0.js} +6 -6
  225. package/dist/src/{signal-4U3mfRvL.js → signal-C3ZTsUgi.js} +3 -3
  226. package/dist/src/{slack-DOdy_kyv.js → slack-2sdpGzbt.js} +2 -2
  227. package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
  228. package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
  229. package/dist/src/{slack-DXMKtA-f.js → slack-DCEV-vWP.js} +2 -2
  230. package/dist/src/store-C5u6MgC8.js +6 -0
  231. package/dist/src/{store-DLlFCC4h.cjs → store-CLyU7AtI.cjs} +17 -5
  232. package/dist/src/store-CNHk-De4.cjs +5 -0
  233. package/dist/src/{store-DXilxTl-.js → store-Cj258DgL.js} +17 -5
  234. package/dist/src/{store-Dim__MDd.js → store-P8OKm19S.js} +17 -5
  235. package/dist/src/{store-CXGFv4aR.js → store-VB0GP46K.js} +17 -5
  236. package/dist/src/{tables-DLJPUdUE.js → tables-BEIFz2tM.js} +3 -3
  237. package/dist/src/{tables-DPi7wKeM.cjs → tables-BdZQEpRz.cjs} +3 -3
  238. package/dist/src/{tables-gftXzE9I.js → tables-DmzvLbeZ.js} +3 -3
  239. package/dist/src/{tables-6YKwjN9-.js → tables-kC7R5kiK.js} +3 -3
  240. package/dist/src/{telemetry-CMrFgtPB.js → telemetry-BnH5VJAU.js} +4 -4
  241. package/dist/src/{telemetry-Dthj_BbD.js → telemetry-BugWqKiu.js} +4 -4
  242. package/dist/src/{telemetry-Cps3mIU-.js → telemetry-DPXLd7UE.js} +4 -4
  243. package/dist/src/telemetry-Yig0Tino.js +7 -0
  244. package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
  245. package/dist/src/{telemetry-DaX14Chu.cjs → telemetry-re627Lre.cjs} +4 -4
  246. package/dist/src/{transcription-NLVG9MT1.cjs → transcription-BvtsrzRG.cjs} +13 -13
  247. package/dist/src/{transcription-BNYURcXg.js → transcription-CaMivnjG.js} +13 -13
  248. package/dist/src/{transcription-s6A-bNrZ.js → transcription-DOMMTu01.js} +14 -14
  249. package/dist/src/{transcription-B_OdaHp7.js → transcription-Hb3VnC4M.js} +13 -13
  250. package/dist/src/{transform-DuHvhZpj.cjs → transform-0BwoBsvO.cjs} +19 -5
  251. package/dist/src/{transform-uAytVuyX.js → transform-B2-jIv68.js} +8 -6
  252. package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
  253. package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
  254. package/dist/src/transform-ChNIpHz7.js +6 -0
  255. package/dist/src/{transform-D5HsjduX.js → transform-DrleutM3.js} +8 -6
  256. package/dist/src/{transform-vNucnNr0.js → transform-DyDAwEpE.js} +8 -6
  257. package/dist/src/transform-PtQ6rAE3.cjs +5 -0
  258. package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
  259. package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
  260. package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
  261. package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
  262. package/dist/src/{types-Cbd8uOMq.js → types-CIhFeUC4.js} +7 -1
  263. package/dist/src/{types-CzW2QFyi.js → types-Cd3ygw8W.js} +7 -1
  264. package/dist/src/{types-C_7nyzr1.cjs → types-D8cGDZbL.cjs} +8 -2
  265. package/dist/src/{types-DmyIJ-sR.js → types-q8GXGF65.js} +7 -1
  266. package/dist/src/{util-DGNOS1db.cjs → util--9u9UVCt.cjs} +3 -3
  267. package/dist/src/{util-ZzmqNPlg.js → util-BLvy9qfE.js} +7 -7
  268. package/dist/src/{util-C1CeHl-P.js → util-Bm3E9jpK.js} +7 -7
  269. package/dist/src/{util-BV4XUC0n.js → util-BtoGs5Cb.js} +18 -4
  270. package/dist/src/{util-BzMcevZc.cjs → util-CFj4YKIn.cjs} +18 -4
  271. package/dist/src/{util-BRYkYPTd.js → util-CMMkIxfU.js} +7 -7
  272. package/dist/src/{util-Dnmk2mBQ.js → util-CgDCK4KI.js} +18 -4
  273. package/dist/src/{util-B9vlHIIh.cjs → util-CuLo2pMR.cjs} +7 -7
  274. package/dist/src/{util-CMy69ZgQ.js → util-DM2rTn_6.js} +18 -4
  275. package/dist/src/{util-B3xGByQh.js → util-DMFeUvLz.js} +3 -3
  276. package/dist/src/{util-BHGHw5G1.js → util-DbVG-yZU.js} +3 -3
  277. package/dist/src/{util-Bv6uGDfH.js → util-vNmDL5DT.js} +3 -3
  278. package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
  279. package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
  280. package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
  281. package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
  282. package/dist/tsconfig.tsbuildinfo +1 -1
  283. package/package.json +32 -32
  284. package/dist/src/app/assets/index-4LKxG2CG.js +0 -439
  285. package/dist/src/app/assets/index-C3zcsZFQ.css +0 -1
  286. package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
  287. package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
  288. package/dist/src/app/tsconfig.app.tsbuildinfo +0 -1
  289. package/dist/src/cache-CeUpFm3M.cjs +0 -5
  290. package/dist/src/cache-n-RCJ-hL.js +0 -6
  291. package/dist/src/cloud-BBh91EUK.js +0 -4
  292. package/dist/src/eval-B3r2CVXr.js +0 -15
  293. package/dist/src/evalResult-5xwYnECe.js +0 -12
  294. package/dist/src/evalResult-71lY93Kj.cjs +0 -10
  295. package/dist/src/evalResult-Dx5P5cIv.js +0 -10
  296. package/dist/src/evaluator-Jx6bRZV6.js +0 -36
  297. package/dist/src/fetch-BxNb_Lp3.js +0 -5
  298. package/dist/src/graders-B_pgMLS2.js +0 -34
  299. package/dist/src/graders-DErokPDO.cjs +0 -32
  300. package/dist/src/graders-DR_uNe54.js +0 -32
  301. package/dist/src/graders-w3176Wz-.js +0 -32
  302. package/dist/src/providers-B7V0njNs.js +0 -32
  303. package/dist/src/providers-BEwbhv0X.js +0 -30
  304. package/dist/src/providers-CH3C7zf7.js +0 -30
  305. package/dist/src/providers-zyB6k_38.cjs +0 -31
  306. package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
  307. package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
  308. package/dist/src/server-DA4Cyrrq.js +0 -7
  309. package/dist/src/server-Dulb-4-K.cjs +0 -5
  310. package/dist/src/store-CXS-Q_91.js +0 -6
  311. package/dist/src/store-eYkaKMwq.cjs +0 -5
  312. package/dist/src/telemetry-BpMfhthR.cjs +0 -5
  313. package/dist/src/telemetry-Dw38hanS.js +0 -7
  314. package/dist/src/transform-DTGDnAzW.js +0 -6
  315. package/dist/src/transform-m3qNw4KP.cjs +0 -5
@@ -1,39 +1,39 @@
1
- import { C as isCI, S as getMaxEvalTimeMs, _ as getEnvBool, a as setLogCallback, b as getEnvString, d as getAjv, h as summarizeEvaluateResultForLogging, i as logger, l as extractFirstJsonObject, m as safeJsonStringify, n as isDebugEnabled, o as setLogLevel, p as orderKeys, s as sanitizeObject, t as getLogLevel, u as extractJsonObjects, v as getEnvFloat, w as state, x as getEvalTimeoutMs, y as getEnvInt } from "../logger-DLcq4dWf.js";
2
- import { A as getDefaultShareViewBaseUrl, F as FILE_METADATA_KEY, I as HUMAN_ASSERTION_TYPE, M as getShareViewBaseUrl, O as TERMINAL_MAX_WIDTH, P as VERSION, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareApiBaseUrl, k as getDefaultPort, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "../fetch-HaqdX7U1.js";
1
+ import { C as getEnvString, D as state, E as isCI, S as getEnvInt, T as getMaxEvalTimeMs, _ as safeJsonStringify, a as logger, b as getEnvBool, f as extractFirstJsonObject, g as orderKeys, m as getAjv, n as globalLogCallback, o as setLogCallback, p as extractJsonObjects, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, u as sanitizeObject, v as summarizeEvaluateResultForLogging, w as getEvalTimeoutMs, x as getEnvFloat } from "../logger-BnkjG2jt.js";
2
+ import { A as getDefaultShareViewBaseUrl, F as FILE_METADATA_KEY, I as HUMAN_ASSERTION_TYPE, M as getShareViewBaseUrl, O as TERMINAL_MAX_WIDTH, P as VERSION, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareApiBaseUrl, k as getDefaultPort, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "../fetch-BiYv2BZc.js";
3
3
  import { t as invariant } from "../invariant-vgHWClmd.js";
4
- import { a as getAuthor, c as isLoggedIntoCloud, l as promptForEmailUnverified, n as checkEmailStatusAndMaybeExit, o as getUserEmail, r as clearUserEmail, s as getUserId, t as checkEmailStatus, u as setUserEmail } from "../accounts-B0pgC1oV.js";
5
- import { a as openBrowser, c as getRemoteGenerationUrl, d as neverGenerateRemote, i as checkServerRunning, n as BrowserBehaviorNames, p as shouldGenerateRemote, s as promptYesNo, t as BrowserBehavior, u as getRemoteHealthUrl } from "../server-CuxBbeSY.js";
6
- import { a as evalResultsTable, c as evalsToPromptsTable, d as promptsTable, g as getDbSignalPath, h as getDb, i as datasetsTable, l as evalsToTagsTable, n as blobReferencesTable, o as evalsTable, p as tagsTable, r as configsTable, s as evalsToDatasetsTable, t as blobAssetsTable, u as modelAuditsTable } from "../tables-DLJPUdUE.js";
7
- import { r as importModule, t as getDirectory } from "../esm-CaIwzWR5.js";
8
- import { $ as MULTI_INPUT_VAR, A as STRATEGY_COLLECTIONS, B as ALIASED_PLUGIN_MAPPINGS, E as ALL_STRATEGIES, F as isMultiTurnStrategy, G as FINANCIAL_PLUGINS, H as BIAS_PLUGINS, I as Severity, J as INSURANCE_PLUGINS, K as FOUNDATION_PLUGINS, L as categoryAliases, M as getDefaultNFanout, O as DEFAULT_STRATEGIES, P as isFanoutStrategy, Q as MULTI_INPUT_EXCLUDED_PLUGINS, S as StrategyConfigSchema, U as DATASET_EXEMPT_PLUGINS, V as ALL_PLUGINS, W as DEFAULT_PLUGINS, X as LLAMA_GUARD_REPLICATE_PROVIDER, Y as LLAMA_GUARD_ENABLED_CATEGORIES, Z as MEDICAL_PLUGINS, _ as ProvidersSchema, a as EvaluateOptionsSchema, at as REMOTE_ONLY_PLUGIN_IDS, b as PluginConfigSchema, c as TestSuiteConfigSchema, ct as UNALIGNED_PROVIDER_HARM_PLUGINS, d as isGradingResult, et as PHARMACY_PLUGINS, f as isResultFailureReason, g as ProviderOptionsSchema, h as RedteamConfigSchema, i as EvalResultsFilterMode, it as REDTEAM_PROVIDER_HARM_PLUGINS, j as STRATEGY_COLLECTION_MAPPINGS, l as TestSuiteSchema, lt as PromptSchema, m as isProviderOptions, n as BaseAssertionTypesSchema, nt as PLUGIN_CATEGORIES, p as isApiProvider, q as HARM_PLUGINS, r as CommandLineOptionsSchema, rt as REDTEAM_MODEL, s as ResultFailureReason, st as TELECOM_PLUGINS, t as AssertionOrSetSchema, tt as PII_PLUGINS, u as UnifiedConfigSchema, v as ConversationMessageSchema, w as isUuid, y as PartialGenerationError, z as riskCategorySeverityMap } from "../types-CzW2QFyi.js";
4
+ import { a as getAuthor, c as isLoggedIntoCloud, l as promptForEmailUnverified, n as checkEmailStatusAndMaybeExit, o as getUserEmail, r as clearUserEmail, s as getUserId, t as checkEmailStatus, u as setUserEmail } from "../accounts-Xatc0RYb.js";
5
+ import { a as openBrowser, c as getRemoteGenerationUrl, d as neverGenerateRemote, i as checkServerRunning, n as BrowserBehaviorNames, p as shouldGenerateRemote, s as promptYesNo, t as BrowserBehavior, u as getRemoteHealthUrl } from "../server-Cns05F1j.js";
6
+ import { a as evalResultsTable, c as evalsToPromptsTable, d as promptsTable, g as getDbSignalPath, h as getDb, i as datasetsTable, l as evalsToTagsTable, n as blobReferencesTable, o as evalsTable, p as tagsTable, r as configsTable, s as evalsToDatasetsTable, t as blobAssetsTable, u as modelAuditsTable } from "../tables-BEIFz2tM.js";
7
+ import { r as importModule, t as getDirectory } from "../esm-CKWP3u_P.js";
8
+ import { $ as MULTI_INPUT_VAR, A as STRATEGY_COLLECTIONS, B as ALIASED_PLUGIN_MAPPINGS, E as ALL_STRATEGIES, F as isMultiTurnStrategy, G as FINANCIAL_PLUGINS, H as BIAS_PLUGINS, I as Severity, J as INSURANCE_PLUGINS, K as FOUNDATION_PLUGINS, L as categoryAliases, M as getDefaultNFanout, O as DEFAULT_STRATEGIES, P as isFanoutStrategy, Q as MULTI_INPUT_EXCLUDED_PLUGINS, S as StrategyConfigSchema, U as DATASET_EXEMPT_PLUGINS, V as ALL_PLUGINS, W as DEFAULT_PLUGINS, X as LLAMA_GUARD_REPLICATE_PROVIDER, Y as LLAMA_GUARD_ENABLED_CATEGORIES, Z as MEDICAL_PLUGINS, _ as ProvidersSchema, a as EvaluateOptionsSchema, at as REMOTE_ONLY_PLUGIN_IDS, b as PluginConfigSchema, c as TestSuiteConfigSchema, ct as UNALIGNED_PROVIDER_HARM_PLUGINS, d as isGradingResult, et as PHARMACY_PLUGINS, f as isResultFailureReason, g as ProviderOptionsSchema, h as RedteamConfigSchema, i as EvalResultsFilterMode, it as REDTEAM_PROVIDER_HARM_PLUGINS, j as STRATEGY_COLLECTION_MAPPINGS, l as TestSuiteSchema, lt as PromptSchema, m as isProviderOptions, n as BaseAssertionTypesSchema, nt as PLUGIN_CATEGORIES, p as isApiProvider, q as HARM_PLUGINS, r as CommandLineOptionsSchema, rt as REDTEAM_MODEL, s as ResultFailureReason, st as TELECOM_PLUGINS, t as AssertionOrSetSchema, tt as PII_PLUGINS, u as UnifiedConfigSchema, v as ConversationMessageSchema, w as isUuid, y as PartialGenerationError, z as riskCategorySeverityMap } from "../types-Cd3ygw8W.js";
9
9
  import { i as isJavascriptFile } from "../fileExtensions-LcDYkU4v.js";
10
10
  import { n as sha256, t as randomSequence } from "../createHash-CTQmL3G2.js";
11
- import { a as generateIdFromPrompt, t as hashPrompt } from "../utils-dLokC-eR.js";
12
- import { t as getTraceStore } from "../store-CXGFv4aR.js";
13
- import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "../cache-i1P6crbO.js";
11
+ import { a as generateIdFromPrompt, t as hashPrompt } from "../utils-DOjD4dTC.js";
12
+ import { t as getTraceStore } from "../store-VB0GP46K.js";
13
+ import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "../cache-HP0NP4k3.js";
14
14
  import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "../tokenUsageUtils-BDGe-iyI.js";
15
- import { n as getBlobUrl, t as getBlobByHash } from "../blobs-BDbfYdrJ.js";
16
- import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "../extractor-D25qpmGX.js";
17
- import { B as PromptfooHarmfulCompletionProvider, D as getShortPluginId, E as getSessionId, F as loadFromPackage, I as redteamProviderManager, J as AIStudioChatProvider, L as TokenUsageTracker, M as renderPrompt, N as runExtensionHook, O as isBasicRefusal, P as isPackagePath, R as createRateLimitRegistry, S as extractGoalFromPrompt, T as extractVariablesFromJson, _ as mediaExists, a as resolveProviderConfigs, at as getEvalConfigFromCloud, b as checkExfilTracking, c as MCPProvider, d as createTransformResponse, f as GoogleLiveProvider, g as getMediaStorage, h as validateStrategies, i as resolveProvider, it as getCloudDatabaseId, j as collectFileMetadata, l as HttpProvider, lt as isCloudProvider, m as loadStrategy, n as loadApiProvider, ot as getOrgContext, p as Strategies, q as VertexChatProvider, r as loadApiProviders, rt as checkCloudPermissions, st as getPluginSeverityOverridesFromCloud, t as getProviderIds, u as createTransformRequest, ut as resolveTeamId, v as retrieveMedia, w as extractPromptFromTags, y as pluginMatchesStrategyTargets, z as createProviderRateLimitOptions } from "../providers-D8lF1sqW.js";
18
- import { n as telemetry, t as TelemetryEventSchema } from "../telemetry-Cps3mIU-.js";
15
+ import { n as getBlobUrl, t as getBlobByHash } from "../blobs-BUWmKWzo.js";
16
+ import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "../extractor-C8XwivI9.js";
17
+ import { B as PromptfooHarmfulCompletionProvider, D as getShortPluginId, E as getSessionId, F as loadFromPackage, I as redteamProviderManager, J as AIStudioChatProvider, L as TokenUsageTracker, M as renderPrompt, N as runExtensionHook, O as isBasicRefusal, P as isPackagePath, R as createRateLimitRegistry, S as extractGoalFromPrompt, T as extractVariablesFromJson, _ as mediaExists, a as resolveProviderConfigs, at as getEvalConfigFromCloud, b as checkExfilTracking, c as MCPProvider, d as createTransformResponse, f as GoogleLiveProvider, g as getMediaStorage, h as validateStrategies, i as resolveProvider, it as getCloudDatabaseId, j as collectFileMetadata, l as HttpProvider, lt as isCloudProvider, m as loadStrategy, n as loadApiProvider, ot as getOrgContext, p as Strategies, q as VertexChatProvider, r as loadApiProviders, rt as checkCloudPermissions, st as getPluginSeverityOverridesFromCloud, t as getProviderIds, u as createTransformRequest, ut as resolveTeamId, v as retrieveMedia, w as extractPromptFromTags, y as pluginMatchesStrategyTargets, z as createProviderRateLimitOptions } from "../providers-DvddrgxL.js";
18
+ import { n as telemetry, t as TelemetryEventSchema } from "../telemetry-DPXLd7UE.js";
19
19
  import "../genaiTracer-70Z8BIuV.js";
20
- import { r as runPython } from "../pythonUtils-D6fwaDSg.js";
21
- import { A as readFilters, F as extractVariablesFromTemplates, I as getNunjucksEngine, L as loadFunction, M as renderEnvOnlyInObject, O as maybeLoadToolsFromExternalFile, R as parseFileUrl, T as maybeLoadFromExternalFile, _ as getProviderDescription, a as evalTableToJson, b as isOpenAiProvider, c as fetchCsvFromGoogleSheet, d as extractRuntimeVars, f as filterRuntimeVars, g as doesProviderRefMatch, h as checkProviderApiKeys, i as ComparisonEvalNotFoundError, j as readOutput, l as setupEnv, m as resultIsForTestCase, n as writeMultipleOutputs, o as generateEvalCsv, p as getTestCaseDeduplicationKey, r as writeOutput, s as mergeComparisonTables, t as printBorder, u as deduplicateTestCases, v as isAnthropicProvider, w as maybeLoadConfigFromExternalFile, x as isProviderAllowed, y as isGoogleProvider } from "../util-ZzmqNPlg.js";
22
- import { t as OpenAiChatCompletionProvider } from "../chat-DJIw17u0.js";
23
- import { m as validateFunctionCall } from "../transform-uAytVuyX.js";
24
- import "../messages-ZJk778GH.js";
25
- import "../util-Bv6uGDfH.js";
26
- import { $ as selectMaxScore, A as BeavertailsPlugin, B as matchesClassification, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as callProviderWithContext, G as matchesFactuality, H as matchesContextFaithfulness, I as fail, J as matchesModeration, K as matchesGEval, L as getAndCheckProvider, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as fetchHuggingFaceDataset, Q as matchesSimilarity, R as loadRubricPrompt, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesContextRecall, V as matchesClosedQa, W as matchesContextRelevance, X as matchesSearchRubric, Y as matchesPiScore, Z as matchesSelectBest, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as coerceString, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as processFileReference, d as RbacPlugin, dt as retryWithDeduplication, et as doRemoteGrading, f as PromptExtractionPlugin, ft as sampleArray, g as isValidPolicyObject, h as determinePolicyTypeFromId, i as VLGuardPlugin, it as SUGGEST_PROMPTS_SYSTEM_MESSAGE, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as resolveContext, m as PolicyPlugin, mt as DefaultSuggestionsProvider, n as getGraderById, nt as readPrompts, o as UnsafeBenchPlugin, ot as getFinalTest, p as PoliticsPlugin, pt as getDefaultProviders, q as matchesLlmRubric, r as VLSUPlugin, rt as readProviderPromptMap, s as ToxicChatPlugin, st as loadFromJavaScriptFile, t as GRADERS, tt as processPrompts, u as ShellInjectionPlugin, ut as getCustomPolicies, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as matchesAnswerRelevance } from "../graders-DP7KFFo-.js";
27
- import "../responses-CB2jwoAr.js";
28
- import "../openai-j-sE2O7r.js";
29
- import { l as validateFunctionCall$1 } from "../util-BV4XUC0n.js";
30
- import "../completion-qRoZAYRB.js";
31
- import { i as getProcessShim, n as transform, t as TransformInputType } from "../transform-aa6tmVpZ.js";
20
+ import { r as runPython } from "../pythonUtils-Bzwbgpbg.js";
21
+ import { A as readFilters, F as extractVariablesFromTemplates, I as getNunjucksEngine, L as loadFunction, M as renderEnvOnlyInObject, O as maybeLoadToolsFromExternalFile, R as parseFileUrl, T as maybeLoadFromExternalFile, _ as getProviderDescription, a as evalTableToJson, b as isOpenAiProvider, c as fetchCsvFromGoogleSheet, d as extractRuntimeVars, f as filterRuntimeVars, g as doesProviderRefMatch, h as checkProviderApiKeys, i as ComparisonEvalNotFoundError, j as readOutput, l as setupEnv, m as resultIsForTestCase, n as writeMultipleOutputs, o as generateEvalCsv, p as getTestCaseDeduplicationKey, r as writeOutput, s as mergeComparisonTables, t as printBorder, u as deduplicateTestCases, v as isAnthropicProvider, w as maybeLoadConfigFromExternalFile, x as isProviderAllowed, y as isGoogleProvider } from "../util-BLvy9qfE.js";
22
+ import { t as OpenAiChatCompletionProvider } from "../chat-B0iaWhoh.js";
23
+ import { m as validateFunctionCall } from "../transform-B2-jIv68.js";
24
+ import "../messages-biC_ex-p.js";
25
+ import "../util-DbVG-yZU.js";
26
+ import { $ as matchesTrajectoryGoalSuccess, A as BeavertailsPlugin, B as matchesClassification, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as callProviderWithContext, G as matchesFactuality, H as matchesContextFaithfulness, I as fail, J as matchesModeration, K as matchesGEval, L as getAndCheckProvider, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as fetchHuggingFaceDataset, Q as matchesSimilarity, R as loadRubricPrompt, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesContextRecall, V as matchesClosedQa, W as matchesContextRelevance, X as matchesSearchRubric, Y as matchesPiScore, Z as matchesSelectBest, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as SUGGEST_PROMPTS_SYSTEM_MESSAGE, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as loadFromJavaScriptFile, d as RbacPlugin, dt as getCustomPolicies, et as selectMaxScore, f as PromptExtractionPlugin, ft as retryWithDeduplication, g as isValidPolicyObject, h as determinePolicyTypeFromId, ht as DefaultSuggestionsProvider, i as VLGuardPlugin, it as readProviderPromptMap, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as processFileReference, m as PolicyPlugin, mt as getDefaultProviders, n as getGraderById, nt as processPrompts, o as UnsafeBenchPlugin, ot as coerceString, p as PoliticsPlugin, pt as sampleArray, q as matchesLlmRubric, r as VLSUPlugin, rt as readPrompts, s as ToxicChatPlugin, st as getFinalTest, t as GRADERS, tt as doRemoteGrading, u as ShellInjectionPlugin, ut as resolveContext, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as matchesAnswerRelevance } from "../graders-BXAJ0sbS.js";
27
+ import "../responses-CgNyTPsY.js";
28
+ import "../openai-D6wITiVn.js";
29
+ import { l as validateFunctionCall$1 } from "../util-BtoGs5Cb.js";
30
+ import "../completion-BCimtq-h.js";
31
+ import { i as getProcessShim, n as transform, t as TransformInputType } from "../transform-BzK09Q_9.js";
32
32
  import { t as ellipsize } from "../text-TIv0QYnd.js";
33
- import "../base-CpjcHe4e.js";
34
- import "../image-Dpxa1Jt6.js";
35
- import { t as providerRegistry } from "../providerRegistry-DM8rZYol.js";
36
- import { n as runRuby } from "../rubyUtils-BcuGX77l.js";
33
+ import "../base-Cz2ZC_iA.js";
34
+ import "../image-B8b6f36E.js";
35
+ import { t as providerRegistry } from "../providerRegistry-BkzVH5Ba.js";
36
+ import { n as runRuby } from "../rubyUtils-DECSbsfY.js";
37
37
  import dotenv from "dotenv";
38
38
  import * as fs$2 from "fs";
39
39
  import fs, { createWriteStream, existsSync, readFileSync } from "fs";
@@ -47,6 +47,7 @@ import input from "@inquirer/input";
47
47
  import { z } from "zod";
48
48
  import * as fsPromises from "fs/promises";
49
49
  import util, { promisify } from "util";
50
+ import readline from "readline";
50
51
  import compression from "compression";
51
52
  import cors from "cors";
52
53
  import fs$1 from "node:fs";
@@ -72,6 +73,7 @@ import { LRUCache } from "lru-cache";
72
73
  import { JSDOM } from "jsdom";
73
74
  import { distance } from "fastest-levenshtein";
74
75
  import * as rouge from "js-rouge";
76
+ import { isDeepStrictEqual } from "node:util";
75
77
  import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
76
78
  import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
77
79
  import { resourceFromAttributes } from "@opentelemetry/resources";
@@ -3857,7 +3859,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
3857
3859
  telemetry.record("feature_used", { feature: "tracing" });
3858
3860
  try {
3859
3861
  logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
3860
- const { startOTLPReceiver } = await import("../otlpReceiver-C99PPb48.js");
3862
+ const { startOTLPReceiver } = await import("../otlpReceiver-C9KlUtxh.js");
3861
3863
  const port = testSuite.tracing.otlp.http.port || 4318;
3862
3864
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
3863
3865
  logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
@@ -3880,7 +3882,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
3880
3882
  async function stopOtlpReceiverIfNeeded() {
3881
3883
  if (otlpReceiverStarted) try {
3882
3884
  logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
3883
- const { stopOTLPReceiver } = await import("../otlpReceiver-C99PPb48.js");
3885
+ const { stopOTLPReceiver } = await import("../otlpReceiver-C9KlUtxh.js");
3884
3886
  await stopOTLPReceiver();
3885
3887
  otlpReceiverStarted = false;
3886
3888
  logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -3915,7 +3917,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
3915
3917
  }
3916
3918
  if (!tracingEnabled) return null;
3917
3919
  logger.debug("[EvaluatorTracing] Importing trace store");
3918
- const { getTraceStore } = await import("../store-CXGFv4aR.js").then((n) => n.n);
3920
+ const { getTraceStore } = await import("../store-VB0GP46K.js").then((n) => n.n);
3919
3921
  const traceStore = getTraceStore();
3920
3922
  const traceId = generateTraceId();
3921
3923
  const spanId = generateSpanId();
@@ -4948,7 +4950,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
4948
4950
  pass = result !== inverse;
4949
4951
  score = pass ? 1 : 0;
4950
4952
  } else if (typeof result === "number") {
4951
- pass = assertion.threshold !== void 0 ? result >= assertion.threshold : result > 0;
4953
+ pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
4952
4954
  score = result;
4953
4955
  } else if (typeof result === "object") return result;
4954
4956
  else throw new Error("Custom function must return a boolean or number");
@@ -5172,7 +5174,7 @@ function handlePerplexity({ logProbs, assertion }) {
5172
5174
  if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
5173
5175
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
5174
5176
  const perplexity = Math.exp(-avgLogProb);
5175
- const pass = assertion.threshold !== void 0 ? perplexity <= assertion.threshold : true;
5177
+ const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
5176
5178
  return {
5177
5179
  pass,
5178
5180
  score: pass ? 1 : 0,
@@ -5184,7 +5186,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
5184
5186
  if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
5185
5187
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
5186
5188
  const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
5187
- const pass = assertion.threshold !== void 0 ? perplexityNorm >= assertion.threshold : true;
5189
+ const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
5188
5190
  return {
5189
5191
  pass,
5190
5192
  score: perplexityNorm,
@@ -5299,7 +5301,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
5299
5301
  } else {
5300
5302
  score = Number.parseFloat(String(result));
5301
5303
  if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
5302
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
5304
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
5303
5305
  }
5304
5306
  } catch (err) {
5305
5307
  return {
@@ -5560,7 +5562,7 @@ end
5560
5562
  } else {
5561
5563
  score = Number.parseFloat(String(result));
5562
5564
  if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
5563
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
5565
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
5564
5566
  }
5565
5567
  } catch (err) {
5566
5568
  return {
@@ -5631,6 +5633,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
5631
5633
  };
5632
5634
  };
5633
5635
  //#endregion
5636
+ //#region src/assertions/traceUtils.ts
5637
+ /**
5638
+ * Shared utilities for trace assertions
5639
+ */
5640
+ /**
5641
+ * Match a span name against a glob-like pattern.
5642
+ * Supports * (any characters) and ? (single character) wildcards.
5643
+ *
5644
+ * @param spanName - The span name to match
5645
+ * @param pattern - The glob pattern to match against
5646
+ * @returns true if the span name matches the pattern
5647
+ */
5648
+ function matchesPattern(spanName, pattern) {
5649
+ const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
5650
+ return new RegExp(`^${regexPattern}$`, "i").test(spanName);
5651
+ }
5652
+ //#endregion
5653
+ //#region src/assertions/skill.ts
5654
+ function getSkillCalls(params) {
5655
+ const rawSkillCalls = params.providerResponse?.metadata?.skillCalls;
5656
+ if (!Array.isArray(rawSkillCalls)) return [];
5657
+ return rawSkillCalls.filter((entry) => Boolean(entry) && typeof entry === "object" && typeof entry.name === "string");
5658
+ }
5659
+ function matchesSkill(skillCall, matcher) {
5660
+ if (matcher.name && skillCall.name !== matcher.name) return false;
5661
+ if (matcher.pattern && !matchesPattern(skillCall.name, matcher.pattern)) return false;
5662
+ return true;
5663
+ }
5664
+ function formatSkillCall(skillCall) {
5665
+ const details = [skillCall.source, skillCall.path].filter(Boolean).join(", ");
5666
+ return details ? `${skillCall.name} (${details})` : skillCall.name;
5667
+ }
5668
+ function resolveSkillMatchers(value) {
5669
+ const normalizeText = (text) => typeof text === "string" ? text.trim() : void 0;
5670
+ const validateCount = (field, count) => {
5671
+ if (!Number.isFinite(count) || !Number.isInteger(count) || count < 0) throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
5672
+ };
5673
+ if (typeof value === "string" && value.trim()) return {
5674
+ kind: "list",
5675
+ matchers: [{ name: normalizeText(value) }]
5676
+ };
5677
+ if (Array.isArray(value) && value.length > 0 && value.every((item) => typeof item === "string" && item.trim())) return {
5678
+ kind: "list",
5679
+ matchers: value.map((item) => ({ name: item.trim() }))
5680
+ };
5681
+ if (value && typeof value === "object" && !Array.isArray(value)) {
5682
+ const rawMatcher = value;
5683
+ const matcher = rawMatcher;
5684
+ const name = normalizeText(matcher.name);
5685
+ const pattern = normalizeText(matcher.pattern);
5686
+ if (!name && !pattern) throw new Error("skill-used assertion object must include a name or pattern property");
5687
+ if ("min" in rawMatcher) validateCount("min", matcher.min);
5688
+ if ("max" in rawMatcher) validateCount("max", matcher.max);
5689
+ if (typeof matcher.min === "number" && typeof matcher.max === "number" && matcher.max < matcher.min) throw new Error("skill-used assertion object max must be greater than or equal to min");
5690
+ return {
5691
+ kind: "count",
5692
+ matcher: {
5693
+ max: typeof matcher.max === "number" ? matcher.max : void 0,
5694
+ min: typeof matcher.min === "number" ? matcher.min : void 0,
5695
+ name,
5696
+ pattern
5697
+ }
5698
+ };
5699
+ }
5700
+ throw new Error("skill-used assertion must have a string, string array, or object value");
5701
+ }
5702
+ function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
5703
+ const missing = expected.matchers.filter((matcher) => !skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
5704
+ const matched = expected.matchers.filter((matcher) => skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
5705
+ const pass = params.inverse ? matched.length === 0 : missing.length === 0;
5706
+ const expectedSkills = expected.matchers.map((matcher) => matcher.name);
5707
+ const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
5708
+ let reason;
5709
+ if (params.inverse) reason = pass ? `Forbidden skill(s) were not used: ${expectedSkills.join(", ")}` : `Forbidden skill(s) were used: ${matched.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
5710
+ else if (pass) reason = `Observed required skill(s): ${expectedSkills.join(", ")}. Actual skills: ${actualSummary}`;
5711
+ else reason = `Missing required skill(s): ${missing.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
5712
+ return {
5713
+ pass,
5714
+ score: pass ? 1 : 0,
5715
+ reason,
5716
+ assertion: params.assertion
5717
+ };
5718
+ }
5719
+ function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
5720
+ const hasExplicitMin = matcher.min !== void 0;
5721
+ const hasExplicitMax = matcher.max !== void 0;
5722
+ const min = matcher.min ?? (hasExplicitMax ? 0 : 1);
5723
+ const max = matcher.max;
5724
+ const matchingSkillCalls = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
5725
+ const count = matchingSkillCalls.length;
5726
+ const matcherLabel = matcher.pattern || matcher.name || "*";
5727
+ if (params.inverse) {
5728
+ if (hasExplicitMin || hasExplicitMax && max !== 0) throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
5729
+ const pass = count === 0;
5730
+ const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
5731
+ return {
5732
+ pass,
5733
+ score: pass ? 1 : 0,
5734
+ reason: pass ? `Forbidden skill "${matcherLabel}" was not used. Actual skills: ${actualSummary}` : `Forbidden skill "${matcherLabel}" was used ${count} time(s). Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`,
5735
+ assertion: params.assertion
5736
+ };
5737
+ }
5738
+ const pass = count >= min && (max === void 0 || count <= max);
5739
+ let reason = `Matched skill "${matcherLabel}" ${count} time(s)`;
5740
+ reason += max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
5741
+ if (matchingSkillCalls.length > 0) reason += `. Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`;
5742
+ return {
5743
+ pass,
5744
+ score: pass ? 1 : 0,
5745
+ reason,
5746
+ assertion: params.assertion
5747
+ };
5748
+ }
5749
+ function handleSkillUsed(params) {
5750
+ const skillCalls = getSkillCalls(params);
5751
+ const actualSkills = skillCalls.map(formatSkillCall);
5752
+ const expected = resolveSkillMatchers(params.renderedValue ?? params.assertion.value);
5753
+ if (expected.kind === "list") return handleListSkillAssertion(params, skillCalls, actualSkills, expected);
5754
+ return handleCountSkillAssertion(params, skillCalls, actualSkills, expected.matcher);
5755
+ }
5756
+ //#endregion
5634
5757
  //#region src/assertions/sql.ts
5635
5758
  const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
5636
5759
  let pass = false;
@@ -5863,23 +5986,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
5863
5986
  };
5864
5987
  };
5865
5988
  //#endregion
5866
- //#region src/assertions/traceUtils.ts
5867
- /**
5868
- * Shared utilities for trace assertions
5869
- */
5870
- /**
5871
- * Match a span name against a glob-like pattern.
5872
- * Supports * (any characters) and ? (single character) wildcards.
5873
- *
5874
- * @param spanName - The span name to match
5875
- * @param pattern - The glob pattern to match against
5876
- * @returns true if the span name matches the pattern
5877
- */
5878
- function matchesPattern(spanName, pattern) {
5879
- const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
5880
- return new RegExp(`^${regexPattern}$`, "i").test(spanName);
5881
- }
5882
- //#endregion
5883
5989
  //#region src/assertions/traceErrorSpans.ts
5884
5990
  function isErrorSpan(span) {
5885
5991
  if (span.statusCode && span.statusCode >= 400) return true;
@@ -6048,6 +6154,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
6048
6154
  };
6049
6155
  };
6050
6156
  //#endregion
6157
+ //#region src/assertions/trajectoryUtils.ts
6158
/** Span attribute keys checked, in priority order, for an explicit tool name. */
const TOOL_ATTRIBUTE_KEYS = [
  "tool.name", "tool_name", "tool",
  "function.name", "function_name",
  "gen_ai.tool.name",
  "codex.mcp.tool",
  "agent.tool", "agent.tool_name", "agent.toolName"
];
/** Span attribute keys checked, in priority order, for tool-call arguments. */
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
  "tool.arguments", "tool.args", "tool.input",
  "tool_arguments", "tool_args", "tool_input",
  "function.arguments", "function.args", "function.input",
  "function_arguments", "function_args",
  "gen_ai.tool.arguments", "gen_ai.tool.args", "gen_ai.tool.input",
  "gen_ai.tool.call.arguments", "gen_ai.tool.call.args",
  "agent.tool.arguments", "agent.tool.args", "agent.tool.input",
  "codex.mcp.arguments", "codex.mcp.args", "codex.mcp.input",
  "arguments", "args", "input"
];
/** Span attribute keys checked, in priority order, for an executed command. */
const COMMAND_ATTRIBUTE_KEYS = ["codex.command", "command", "command.name", "command_name"];
/** Span attribute keys that unambiguously carry a search query. */
const SEARCH_ATTRIBUTE_KEYS = ["codex.search.query", "search.query", "search_query"];
/** Ambiguous query keys; only trusted when the span otherwise looks like a search. */
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
/** Matches span names containing a search-like word delimited by separators. */
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
/** Number of leading steps kept when a judge summary is truncated. */
const JUDGE_SUMMARY_HEAD_STEPS = 12;
/** Number of trailing steps kept when a judge summary is truncated. */
const JUDGE_SUMMARY_TAIL_STEPS = 12;
/** Longest summary passed through untruncated (head + tail = 24). */
const MAX_JUDGE_SUMMARY_STEPS = JUDGE_SUMMARY_HEAD_STEPS + JUDGE_SUMMARY_TAIL_STEPS;
6213
/**
 * Return the first non-blank string attribute among `keys`, trimmed.
 *
 * @param {object} attributes - Span attribute bag.
 * @param {string[]} keys - Candidate keys, highest priority first.
 * @returns {string|undefined} The trimmed value, or undefined when no key
 *   holds a non-blank string.
 */
function getStringAttribute(attributes, keys) {
  for (const key of keys) {
    const candidate = attributes[key];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return undefined;
}
6219
/**
 * Normalize a raw span attribute that may carry structured data.
 *
 * - `undefined` / `null` are treated as absent.
 * - Strings are trimmed; blank strings are absent. Non-blank strings are
 *   parsed as JSON when possible, otherwise returned trimmed.
 * - Numbers, booleans and objects are returned unchanged.
 * - Any other type (function, symbol, bigint) is treated as absent.
 *
 * A string that decodes to JSON `null` is treated as absent too, so a
 * serialized null behaves the same as a raw null.
 *
 * @param {unknown} value - Raw attribute value.
 * @returns {unknown} The normalized value, or undefined when absent.
 */
function normalizeStructuredAttribute(value) {
  if (value === undefined || value === null) return undefined;
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (!trimmed) return undefined;
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Not JSON: keep the text as-is.
      return trimmed;
    }
    // Keep parity with the raw-null case above.
    return parsed === null ? undefined : parsed;
  }
  if (typeof value === "number" || typeof value === "boolean" || typeof value === "object") return value;
  return undefined;
}
6232
/**
 * Compare two optional status objects by `code` and `message`.
 * Two missing statuses compare equal.
 *
 * @param {{code?: unknown, message?: unknown}|undefined} left
 * @param {{code?: unknown, message?: unknown}|undefined} right
 * @returns {boolean} true when both fields are strictly equal.
 */
function hasSameStatus(left, right) {
  const sameCode = left?.code === right?.code;
  const sameMessage = left?.message === right?.message;
  return sameCode && sameMessage;
}
6235
/**
 * Heuristic: does this span look like a search/lookup/retrieval operation?
 *
 * True when the span name matches SEARCH_SPAN_NAME_PATTERN or starts with
 * "search ", or when any attribute key other than the bare "query" contains a
 * search-like segment delimited by `.` or `_`.
 *
 * @param {{name: string, attributes?: object}} span
 * @returns {boolean}
 */
function isSearchLikeSpan(span) {
  const spanName = span.name;
  if (SEARCH_SPAN_NAME_PATTERN.test(spanName)) return true;
  if (spanName.startsWith("search ")) return true;
  const searchKeyPattern = /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i;
  for (const key of Object.keys(span.attributes || {})) {
    // "query" alone is too generic to count as evidence of a search.
    if (key !== "query" && searchKeyPattern.test(key)) return true;
  }
  return false;
}
6240
/**
 * Build a compact status object for a trajectory step.
 *
 * Steps whose status code is missing or 0 carry no status. Otherwise the
 * result holds the code and, when truthy, the status message.
 *
 * @param {{statusCode?: number, statusMessage?: string}} step
 * @returns {{code: number, message?: string}|undefined}
 */
function getTrajectoryStepStatus(step) {
  const { statusCode, statusMessage } = step;
  if (statusCode === undefined || statusCode === 0) return undefined;
  const status = { code: statusCode };
  if (statusMessage) status.message = statusMessage;
  return status;
}
6247
/**
 * Return the first whitespace-delimited token of a command line.
 *
 * @param {string} command - Full command string.
 * @returns {string|undefined} The executable token, or undefined for a blank command.
 */
function getCommandExecutable(command) {
  const [executable] = command.trim().split(/\s+/);
  return executable || undefined;
}
6250
/**
 * Derive the tool name for a span, if the span represents a tool call.
 *
 * Resolution order:
 *  1. The first non-blank string among TOOL_ATTRIBUTE_KEYS.
 *  2. Any string attribute whose key looks like a tool/function *name*.
 *  3. Any string attribute whose key contains a `tool` segment, unless the key
 *     describes a result/output or an argument payload
 *     (`arguments` / `args` / `input`). Without the payload exclusion a span
 *     carrying only e.g. `tool.arguments` would report its serialized
 *     arguments as the tool name.
 *  4. For spans named "mcp ...", the text after the last "/" in the name.
 *
 * @param {{name: string, attributes?: object}} span
 * @returns {string|undefined} The trimmed tool name, or undefined when none is found.
 */
function extractToolName(span) {
  const attributes = span.attributes || {};
  const directMatch = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
  if (directMatch) return directMatch;
  const toolSegmentPattern = /(^|[._])tool($|[._])/i;
  const payloadKeyPattern = /result|output|(^|[._])(arguments|args|input)($|[._])/i;
  for (const [key, value] of Object.entries(attributes)) {
    if (typeof value !== "string" || !value.trim()) continue;
    if (/tool.?name|function.?name/i.test(key)) return value.trim();
    if (toolSegmentPattern.test(key) && !payloadKeyPattern.test(key)) return value.trim();
  }
  if (span.name.startsWith("mcp ")) {
    const slashIndex = span.name.lastIndexOf("/");
    // Require at least one character after the slash.
    if (slashIndex !== -1 && slashIndex < span.name.length - 1) return span.name.slice(slashIndex + 1).trim();
  }
  return undefined;
}
6264
/**
 * Extract the arguments of a tool call from a span's attributes.
 *
 * Well-known keys (TOOL_ARGUMENT_ATTRIBUTE_KEYS) are tried first, in order.
 * Failing that, any attribute whose key contains an `arguments` / `args` /
 * `input` segment is used, skipping keys that mention result, output, error
 * or status. Values go through normalizeStructuredAttribute.
 *
 * @param {{attributes?: object}} span
 * @returns {unknown} The first normalized value found, or undefined.
 */
function extractToolArgs(span) {
  const attributes = span.attributes || {};
  for (const key of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
    const known = normalizeStructuredAttribute(attributes[key]);
    if (known !== undefined) return known;
  }
  const argumentKeyPattern = /(^|[._])(arguments|args|input)($|[._])/i;
  const excludedKeyPattern = /result|output|error|status/i;
  for (const key of Object.keys(attributes)) {
    if (excludedKeyPattern.test(key) || !argumentKeyPattern.test(key)) continue;
    const inferred = normalizeStructuredAttribute(attributes[key]);
    if (inferred !== undefined) return inferred;
  }
  return undefined;
}
6277
/**
 * Derive the shell command a span represents, if any.
 *
 * Tries COMMAND_ATTRIBUTE_KEYS first, then any string attribute whose key
 * mentions "command" (but not output/result), and finally the text following
 * an "exec " span-name prefix.
 *
 * @param {{name: string, attributes?: object}} span
 * @returns {string|undefined} The trimmed command, or undefined.
 */
function extractCommand(span) {
  const attributes = span.attributes || {};
  const explicit = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
  if (explicit) return explicit;
  for (const [key, value] of Object.entries(attributes)) {
    const isUsableString = typeof value === "string" && Boolean(value.trim());
    if (!isUsableString) continue;
    const isCommandKey = /command/i.test(key) && !/output|result/i.test(key);
    if (isCommandKey) return value.trim();
  }
  const execPrefix = "exec ";
  if (span.name.startsWith(execPrefix)) return span.name.slice(execPrefix.length).trim();
  return undefined;
}
6287
/**
 * Derive the search query a span represents, if any.
 *
 * Dedicated search keys (SEARCH_ATTRIBUTE_KEYS) win. A generic `query`
 * attribute is only accepted when the span otherwise looks like a search.
 * As a last resort the text after a "search " span-name prefix is used, with
 * one leading and one trailing double quote stripped.
 *
 * @param {{name: string, attributes?: object}} span
 * @returns {string|undefined}
 */
function extractSearchQuery(span) {
  const attributes = span.attributes || {};
  const explicit = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
  if (explicit) return explicit;
  const genericQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
  if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
  const searchPrefix = "search ";
  if (!span.name.startsWith(searchPrefix)) return undefined;
  const remainder = span.name.slice(searchPrefix.length);
  return remainder.replace(/^"|"$/g, "").trim();
}
6295
/**
 * Detect reasoning spans: either tagged with `codex.item.type === "reasoning"`
 * or named "reasoning" (case-insensitively, optionally followed by `_` or
 * whitespace and more text).
 *
 * @param {{name: string, attributes?: object}} span
 * @returns {boolean}
 */
function isReasoningSpan(span) {
  const itemType = (span.attributes || {})["codex.item.type"];
  if (itemType === "reasoning") return true;
  const { name } = span;
  return name === "reasoning" || /^reasoning([_\s]|$)/i.test(name);
}
6299
/**
 * Detect message spans: either tagged with
 * `codex.item.type === "agent_message"` or named exactly "agent response" or
 * "send input".
 *
 * @param {{name: string, attributes?: object}} span
 * @returns {boolean}
 */
function isMessageSpan(span) {
  const itemType = (span.attributes || {})["codex.item.type"];
  if (itemType === "agent_message") return true;
  return ["agent response", "send input"].includes(span.name);
}
6303
/**
 * Convert a trace's spans into an ordered list of trajectory steps.
 *
 * Spans are ordered by start time, then by end time (falling back to the
 * start time when no end time is set), then by original position. Each span
 * is classified with the first rule that applies:
 * tool -> command -> search -> reasoning -> message -> plain "span".
 * `aliases` lists every name the step can be matched by; it always includes
 * the raw span name.
 *
 * @param {{spans?: object[]}} trace
 * @returns {object[]} One step per span.
 */
function extractTrajectorySteps(trace) {
  const ordered = [...(trace.spans || [])].map((span, index) => ({ span, index }));
  ordered.sort((left, right) => {
    const startDelta = left.span.startTime - right.span.startTime;
    if (startDelta !== 0) return startDelta;
    const leftEnd = left.span.endTime ?? left.span.startTime;
    const rightEnd = right.span.endTime ?? right.span.startTime;
    const endDelta = leftEnd - rightEnd;
    // Original position is the final tie-breaker.
    return endDelta !== 0 ? endDelta : left.index - right.index;
  });
  return ordered.map(({ span }) => {
    const toolName = extractToolName(span);
    const command = extractCommand(span);
    const searchQuery = extractSearchQuery(span);
    const aliases = new Set([span.name]);
    let type = "span";
    let name = span.name;
    let args;
    if (toolName) {
      type = "tool";
      name = toolName;
      aliases.add(toolName);
      args = extractToolArgs(span);
    } else if (command) {
      type = "command";
      name = command;
      aliases.add(command);
      // Also allow matching on the bare executable, e.g. "git".
      const executable = getCommandExecutable(command);
      if (executable) aliases.add(executable);
    } else if (searchQuery) {
      type = "search";
      name = searchQuery;
      aliases.add(searchQuery);
    } else if (isReasoningSpan(span)) {
      type = "reasoning";
      aliases.add("reasoning");
    } else if (isMessageSpan(span)) {
      type = "message";
      aliases.add("message");
    }
    return {
      aliases: [...aliases],
      // `args` is only present on the step when something was extracted.
      ...(args !== undefined && { args }),
      attributes: span.attributes || {},
      endTime: span.endTime,
      name,
      spanId: span.spanId,
      spanName: span.name,
      startTime: span.startTime,
      statusCode: span.statusCode,
      statusMessage: span.statusMessage,
      type
    };
  });
}
6360
/**
 * Normalize a step matcher into object form.
 *
 * A string becomes `{ pattern }`. An object is shallow-copied. In both cases
 * `defaultType` is applied as `type` when the matcher has no truthy `type`
 * of its own.
 *
 * @param {string|object} matcher
 * @param {string} [defaultType]
 * @returns {object} A new matcher object.
 */
function normalizeTrajectoryMatcher(matcher, defaultType) {
  const fallbackType = defaultType ? { type: defaultType } : {};
  if (typeof matcher === "string") return { pattern: matcher, ...fallbackType };
  return matcher.type ? { ...matcher } : { ...matcher, ...fallbackType };
}
6370
/**
 * Test whether a trajectory step satisfies a matcher.
 *
 * When the matcher has a type (string or array), the step type must be one of
 * them. When it has a `pattern` (preferred) or `name`, at least one of the
 * step's aliases must match it via matchesPattern. A matcher with neither
 * matches any step of an accepted type.
 *
 * @param {{type: string, aliases: string[]}} step
 * @param {string|object} matcher
 * @param {string} [defaultType] - Type applied when the matcher has none.
 * @returns {boolean}
 */
function matchesTrajectoryStep(step, matcher, defaultType) {
  const normalized = normalizeTrajectoryMatcher(matcher, defaultType);
  const { type } = normalized;
  if (type) {
    const allowedTypes = Array.isArray(type) ? type : [type];
    if (!allowedTypes.includes(step.type)) return false;
  }
  const matchPattern = normalized.pattern || normalized.name;
  if (!matchPattern) return true;
  for (const alias of step.aliases) {
    if (matchesPattern(alias, matchPattern)) return true;
  }
  return false;
}
6379
/**
 * Render a step as "<type>:<name>" for use in assertion messages.
 *
 * @param {{type: string, name: string}} step
 * @returns {string}
 */
function formatTrajectoryStep(step) {
  const { type, name } = step;
  return `${type}:${name}`;
}
6382
/**
 * Render tool arguments for assertion messages.
 *
 * `undefined` renders as "(none)". Otherwise the JSON form is used; when
 * serialization throws (e.g. cycles, BigInt) or yields undefined
 * (e.g. functions), the plain `String(args)` form is used instead.
 *
 * @param {unknown} args
 * @returns {string}
 */
function formatTrajectoryArgs(args) {
  if (args === undefined) return "(none)";
  let serialized;
  try {
    serialized = JSON.stringify(args);
  } catch {
    // Best effort: unserializable values fall through to String() below.
    serialized = undefined;
  }
  return serialized === undefined ? String(args) : serialized;
}
6390
/**
 * Collapse runs of consecutive, equivalent summary steps into one entry.
 *
 * Two neighbouring steps are equivalent when they share `type`, `name`,
 * `spanName` and status. The surviving entry gets a `collapsedCount` holding
 * the size of the run (2 or more); entries that were not collapsed have no
 * `collapsedCount`.
 *
 * The input is never modified: every returned entry is a shallow copy, so the
 * `collapsedCount` annotation cannot leak into the caller's objects.
 *
 * @param {object[]} steps - Summary steps in trajectory order.
 * @returns {object[]} A new, possibly shorter, list of step copies.
 */
function compactJudgeTrajectorySteps(steps) {
  const compacted = [];
  for (const step of steps) {
    const previousStep = compacted[compacted.length - 1];
    const repeatsPrevious = previousStep !== undefined
      && previousStep.type === step.type
      && previousStep.name === step.name
      && previousStep.spanName === step.spanName
      && hasSameStatus(previousStep.status, step.status);
    if (repeatsPrevious) {
      // First repeat turns an implicit count of 1 into 2.
      previousStep.collapsedCount = (previousStep.collapsedCount ?? 1) + 1;
      continue;
    }
    // Copy so the annotation above only ever touches our own objects.
    compacted.push({ ...step });
  }
  return compacted;
}
6402
/**
 * Cap the number of steps shown to the judge.
 *
 * Lists of at most MAX_JUDGE_SUMMARY_STEPS entries are returned as-is.
 * Longer lists keep the first JUDGE_SUMMARY_HEAD_STEPS and the last
 * JUDGE_SUMMARY_TAIL_STEPS entries, with a single `{ omittedCount }` marker
 * in between.
 *
 * @param {object[]} steps
 * @returns {object[]} The same array when short enough, otherwise a new array.
 */
function truncateJudgeTrajectorySteps(steps) {
  const total = steps.length;
  if (total <= MAX_JUDGE_SUMMARY_STEPS) return steps;
  const head = steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS);
  const tail = steps.slice(-JUDGE_SUMMARY_TAIL_STEPS);
  const omission = { omittedCount: total - MAX_JUDGE_SUMMARY_STEPS };
  return [...head, omission, ...tail];
}
6410
/**
 * Produce the JSON trajectory summary handed to the grading model.
 *
 * Each step is reduced to a 1-based index, type and name, plus `spanName`
 * when it differs from the name and `status` when the step has one. Repeated
 * steps are compacted and long lists truncated before serialization.
 *
 * @param {{traceId: string, spans?: object[]}} trace
 * @returns {string} Pretty-printed JSON with `traceId`, `stepCount` (before
 *   compaction), `compactedStepCount` (after compaction, before truncation)
 *   and `steps`.
 */
function summarizeTrajectoryForJudge(trace) {
  const rawSteps = extractTrajectorySteps(trace).map((step, position) => {
    const summary = { index: position + 1, type: step.type, name: step.name };
    if (step.spanName !== step.name) summary.spanName = step.spanName;
    const status = getTrajectoryStepStatus(step);
    if (status) summary.status = status;
    return summary;
  });
  const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
  const payload = {
    traceId: trace.traceId,
    stepCount: rawSteps.length,
    compactedStepCount: compactedSteps.length,
    steps: truncateJudgeTrajectorySteps(compactedSteps)
  };
  return JSON.stringify(payload, null, 2);
}
6427
+ //#endregion
6428
+ //#region src/assertions/trajectory.ts
6429
/**
 * Fetch the trace attached to the assertion context.
 *
 * @param {object} params - Assertion parameters; reads
 *   `assertionValueContext.trace` and `baseType`.
 * @returns {object} The trace, guaranteed to have a truthy `spans` property.
 * @throws {Error} When no trace, or a trace without spans, is available.
 */
function getTraceOrThrow(params) {
  const { trace } = params.assertionValueContext;
  if (trace && trace.spans) return trace;
  throw new Error(`No trace data available for ${params.baseType} assertion`);
}
6434
/**
 * Flip a pass/fail verdict for inverse (`not-`) assertions.
 *
 * @param {boolean} pass - Verdict of the positive assertion.
 * @param {boolean} inverse - Whether the assertion is inverted.
 * @returns {boolean} `!pass` when `inverse` is truthy, otherwise `pass` unchanged.
 */
function applyInverse(pass, inverse) {
  if (inverse) return !pass;
  return pass;
}
6437
/**
 * Join step labels for display, using "(none)" for an empty list.
 *
 * @param {string[]} stepLabels
 * @returns {string}
 */
function formatStepList(stepLabels) {
  if (stepLabels.length > 0) return stepLabels.join(", ");
  return "(none)";
}
6440
/**
 * Ensure a matcher identifies something by `pattern` or `name`.
 *
 * @param {object} matcher - Normalized matcher.
 * @param {string} assertionType - Assertion name used in the error message.
 * @param {number} [index] - Zero-based position within a sequence; when
 *   omitted the message refers to the matcher as "object".
 * @throws {Error} When neither `pattern` nor `name` is truthy.
 */
function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
  const isNamed = Boolean(matcher.pattern || matcher.name);
  if (isNamed) return;
  let stepLabel = "object";
  if (index !== undefined) stepLabel = `step ${index + 1}`;
  throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
}
6445
/**
 * Extract the goal text of a `trajectory:goal-success` assertion.
 *
 * Accepts a non-blank string, or a non-array object with a non-blank string
 * `goal` property.
 *
 * @param {unknown} value - Assertion value.
 * @returns {{goal: string}} The trimmed goal.
 * @throws {Error} For any other shape.
 */
function resolveGoalSuccessValue(value) {
  let goalText;
  if (typeof value === "string") {
    goalText = value;
  } else if (value && typeof value === "object" && !Array.isArray(value) && typeof value.goal === "string") {
    goalText = value.goal;
  }
  const goal = goalText === undefined ? "" : goalText.trim();
  if (goal) return { goal };
  throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
}
6450
/**
 * Interpret the value of a `trajectory:tool-used` assertion.
 *
 * - string or array of strings -> `{ kind: "list", matchers }`, each entry a
 *   tool matcher;
 * - non-array object -> `{ kind: "count", matcher }`, where numeric `min` /
 *   `max` are kept and non-numeric ones become undefined.
 *
 * @param {unknown} value - Assertion value.
 * @returns {{kind: "list", matchers: object[]}|{kind: "count", matcher: object}}
 * @throws {Error} For any other shape (including arrays with non-string items).
 */
function resolveToolMatchers(value) {
  const toToolMatcher = (item) => normalizeTrajectoryMatcher(item, "tool");
  const numberOrUndefined = (candidate) => (typeof candidate === "number" ? candidate : undefined);
  if (typeof value === "string") {
    return { kind: "list", matchers: [toToolMatcher(value)] };
  }
  if (Array.isArray(value)) {
    if (value.every((item) => typeof item === "string")) {
      return { kind: "list", matchers: value.map(toToolMatcher) };
    }
  } else if (value && typeof value === "object") {
    return {
      kind: "count",
      matcher: {
        ...toToolMatcher(value),
        max: numberOrUndefined(value.max),
        min: numberOrUndefined(value.min)
      }
    };
  }
  throw new Error("trajectory:tool-used assertion must have a string, string array, or object value");
}
6469
/**
 * Handler for the `trajectory:tool-used` assertion.
 *
 * List form: every expected tool must appear among the trace's tool steps
 * (inverse: none of them may appear).
 * Count form: the number of matching tool steps must be at least `min`
 * (default 1) and, when `max` is set, at most `max` (inverse: the range must
 * not be satisfied).
 *
 * @param {object} params - Assertion parameters.
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}}
 * @throws {Error} When no trace is available, the value is malformed, the
 *   list is empty, or the count matcher lacks a name/pattern.
 */
const handleTrajectoryToolUsed = (params) => {
  const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
  const labelOf = (matcher) => matcher.pattern || matcher.name || "*";
  const finish = (pass, reason) => ({
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  });

  if (expected.kind === "list") {
    if (expected.matchers.length === 0) throw new Error("trajectory:tool-used assertion requires at least one expected tool");
    const missing = [];
    const matched = [];
    for (const matcher of expected.matchers) {
      const wasUsed = toolSteps.some((step) => matchesTrajectoryStep(step, matcher));
      (wasUsed ? matched : missing).push(matcher);
    }
    const actualTools = toolSteps.map(formatTrajectoryStep);
    const expectedTools = expected.matchers.map(labelOf);
    if (params.inverse) {
      const pass = matched.length === 0;
      return finish(pass, pass
        ? `Forbidden tool(s) were not used: ${expectedTools.join(", ")}`
        : `Forbidden tool(s) were used: ${matched.map(labelOf).join(", ")}. Actual tools: ${formatStepList(actualTools)}`);
    }
    const pass = missing.length === 0;
    return finish(pass, pass
      ? `Observed required tool(s): ${expectedTools.join(", ")}. Actual tools: ${formatStepList(actualTools)}`
      : `Missing required tool(s): ${missing.map(labelOf).join(", ")}. Actual tools: ${formatStepList(actualTools)}`);
  }

  const { matcher } = expected;
  const min = matcher.min ?? 1;
  const max = matcher.max;
  if (!matcher.pattern && !matcher.name) throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
  const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
  const count = matchingSteps.length;
  const basePass = count >= min && (max === undefined || count <= max);
  const pass = applyInverse(basePass, params.inverse);
  const matcherLabel = labelOf(matcher);
  if (params.inverse) {
    return finish(pass, basePass
      ? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion`
      : `Tool "${matcherLabel}" did not satisfy the forbidden match condition`);
  }
  const expectedRange = max === undefined ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
  const matchList = count > 0 ? `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}` : "";
  return finish(pass, `Matched tool "${matcherLabel}" ${count} time(s)${expectedRange}${matchList}`);
};
6511
/**
 * Interpret the value of a `trajectory:tool-sequence` assertion.
 *
 * - An array is a list of steps matched in "in_order" mode.
 * - A non-array object may carry `mode` (defaults to "in_order") and `steps`
 *   (defaults to an empty list; the handler rejects empty sequences).
 *
 * A truthy `steps` that is not an array is rejected here with a descriptive
 * error instead of failing later with an opaque TypeError when the handler
 * iterates it.
 *
 * @param {unknown} value - Assertion value.
 * @returns {{mode: string, steps: Array}} The resolved mode and steps.
 * @throws {Error} When the value is neither an array nor an object, or when
 *   `steps` is present but not an array.
 */
function resolveSequenceValue(value) {
  if (Array.isArray(value)) return { mode: "in_order", steps: value };
  if (value && typeof value === "object") {
    const { mode, steps } = value;
    if (steps && !Array.isArray(steps)) throw new Error("trajectory:tool-sequence assertion steps must be an array");
    return {
      mode: mode || "in_order",
      steps: steps || []
    };
  }
  throw new Error("trajectory:tool-sequence assertion must have an array or object value");
}
6525
/**
 * Type guard for plain "record-like" values: any non-null object that is not
 * an array.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
}
6528
/**
 * Partial (subset) comparison of actual tool arguments against expectations.
 *
 * - Expected arrays must match element by element with the same length.
 * - Expected records only constrain the keys they list; extra keys on the
 *   actual record are ignored, but each listed key must be an own property.
 * - Everything else is compared with deep strict equality.
 *
 * @param {unknown} actual
 * @param {unknown} expected
 * @returns {boolean}
 */
function matchesExpectedArgsPartial(actual, expected) {
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual) || actual.length !== expected.length) return false;
    return expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
  }
  if (!isRecord(expected)) return isDeepStrictEqual(actual, expected);
  if (!isRecord(actual)) return false;
  for (const [key, expectedValue] of Object.entries(expected)) {
    if (!Object.hasOwn(actual, key)) return false;
    if (!matchesExpectedArgsPartial(actual[key], expectedValue)) return false;
  }
  return true;
}
6536
/**
 * Compare tool arguments using the requested strategy.
 *
 * @param {unknown} actual - Arguments captured on the span.
 * @param {unknown} expected - Arguments from the assertion.
 * @param {string} mode - "exact" for deep strict equality; anything else
 *   uses partial (subset) matching.
 * @returns {boolean}
 */
function matchesToolArgs(actual, expected, mode) {
  const compare = mode === "exact" ? isDeepStrictEqual : matchesExpectedArgsPartial;
  return compare(actual, expected);
}
6540
/**
 * Validate the `mode` of a `trajectory:tool-args-match` assertion.
 *
 * @param {unknown} mode - Configured mode, possibly undefined.
 * @returns {"partial"|"exact"} The mode; "partial" when it was not set.
 * @throws {Error} For any value other than undefined, "partial" or "exact".
 */
function resolveToolArgsMatchMode(mode) {
  switch (mode) {
    case undefined:
      return "partial";
    case "partial":
    case "exact":
      return mode;
    default:
      throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
  }
}
6545
/**
 * Interpret the value of a `trajectory:tool-args-match` assertion.
 *
 * The value must be a non-array object naming a tool (`name` or `pattern`)
 * and providing expected arguments under `args` or, when `args` is not an
 * own property, `arguments`.
 *
 * @param {unknown} value - Assertion value.
 * @returns {{matcher: object, expectedArgs: unknown, mode: "partial"|"exact"}}
 * @throws {Error} When the value is not an object, names no tool, has no
 *   expected arguments, or has an invalid mode.
 */
function resolveToolArgsMatchValue(value) {
  const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isObjectValue) throw new Error("trajectory:tool-args-match assertion must have an object value");
  const matcher = normalizeTrajectoryMatcher(value, "tool");
  requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");
  // An own `args` key wins even if its value is undefined.
  const expectedArgs = Object.hasOwn(value, "args") ? value.args : value.arguments;
  if (expectedArgs === undefined) throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");
  const mode = resolveToolArgsMatchMode(value.mode);
  return { matcher, expectedArgs, mode };
}
6557
/**
 * Handler for the `trajectory:tool-sequence` assertion.
 *
 * In "exact" mode the trace's tool steps must match the expected steps one
 * for one, with nothing extra. In any other mode the expected steps must
 * appear in order, with other tool calls allowed in between. Inverse
 * assertions pass when the sequence is NOT observed.
 *
 * @param {object} params - Assertion parameters.
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}}
 * @throws {Error} When no trace is available, the value is malformed, a step
 *   lacks a name/pattern, or no steps are given.
 */
const handleTrajectoryToolSequence = (params) => {
  const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const sequence = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
  const expectedMatchers = sequence.steps.map((step, index) => {
    const matcher = normalizeTrajectoryMatcher(step, "tool");
    requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", index);
    return matcher;
  });
  if (expectedMatchers.length === 0) throw new Error("trajectory:tool-sequence assertion requires at least one expected step");
  const labelOf = (matcher) => matcher?.pattern || matcher?.name || "*";
  const actualTools = toolSteps.map(formatTrajectoryStep);

  let basePass;
  let reason;
  if (sequence.mode === "exact") {
    const sameLength = toolSteps.length === expectedMatchers.length;
    basePass = sameLength && expectedMatchers.every((matcher, index) => matchesTrajectoryStep(toolSteps[index], matcher));
    reason = basePass
      ? `Observed exact tool sequence: ${formatStepList(actualTools)}`
      : `Expected exact tool sequence of ${expectedMatchers.map(labelOf).join(", ")}, but actual tools were ${formatStepList(actualTools)}`;
  } else {
    // Walk the tool calls once, advancing through the expectations as each is met.
    const matchedSteps = [];
    let cursor = 0;
    for (const step of toolSteps) {
      if (cursor >= expectedMatchers.length) break;
      if (!matchesTrajectoryStep(step, expectedMatchers[cursor])) continue;
      matchedSteps.push(formatTrajectoryStep(step));
      cursor += 1;
    }
    basePass = cursor === expectedMatchers.length;
    reason = basePass
      ? `Observed tool sequence in order: ${matchedSteps.join(", ")}. Actual tools: ${formatStepList(actualTools)}`
      : `Expected tool "${labelOf(expectedMatchers[cursor])}" was not observed in order. Actual tools: ${formatStepList(actualTools)}`;
  }

  const pass = applyInverse(basePass, params.inverse);
  if (params.inverse) {
    reason = basePass
      ? `Forbidden tool sequence was observed. Actual tools: ${formatStepList(actualTools)}`
      : `Forbidden tool sequence was not observed`;
  }
  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
6596
/**
 * Handler for the `trajectory:tool-args-match` assertion.
 *
 * Passes when at least one call to the named tool captured arguments that
 * match the expected arguments under the configured mode (partial or exact).
 * Inverse assertions pass when no such call exists.
 *
 * @param {object} params - Assertion parameters.
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}}
 * @throws {Error} When no trace is available or the value is malformed.
 */
const handleTrajectoryToolArgsMatch = (params) => {
  const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
  const matcherLabel = matcher.pattern || matcher.name || "*";
  const actualTools = toolSteps.map(formatTrajectoryStep);
  const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
  const stepsWithArgs = matchingSteps.filter((step) => step.args !== undefined);
  const matchedStep = stepsWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));
  const basePass = matchedStep !== undefined;
  const pass = applyInverse(basePass, params.inverse);
  const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
  const observedArgsLabel = stepsWithArgs.length > 0
    ? stepsWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ")
    : "(none)";

  let reason;
  if (params.inverse) {
    if (basePass) {
      reason = `Forbidden argument match for tool "${matcherLabel}" was observed on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
    } else if (matchingSteps.length === 0) {
      reason = `Forbidden argument match for tool "${matcherLabel}" was not observed because no tool call matched it`;
    } else {
      reason = `Forbidden argument match for tool "${matcherLabel}" was not observed. Observed args: ${observedArgsLabel}`;
    }
  } else if (basePass) {
    reason = `Tool "${matcherLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
  } else if (matchingSteps.length === 0) {
    reason = `No tool call matched "${matcherLabel}". Actual tools: ${formatStepList(actualTools)}`;
  } else if (stepsWithArgs.length === 0) {
    reason = `Tool "${matcherLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(actualTools)}`;
  } else {
    reason = `No call to tool "${matcherLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
  }
  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
6623
/**
 * Interpret the value of a `trajectory:step-count` assertion.
 *
 * The value must be a non-array object. It is normalized as a matcher (with
 * no default type) and its `min` / `max` are kept only when numeric.
 *
 * @param {unknown} value - Assertion value.
 * @returns {object} Matcher fields plus `min` and `max` (number or undefined).
 * @throws {Error} When the value is not a non-array object.
 */
function resolveStepCountValue(value) {
  const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isObjectValue) throw new Error("trajectory:step-count assertion must have an object value");
  const numberOrUndefined = (candidate) => (typeof candidate === "number" ? candidate : undefined);
  const matcher = normalizeTrajectoryMatcher(value);
  return {
    ...matcher,
    max: numberOrUndefined(value.max),
    min: numberOrUndefined(value.min)
  };
}
6631
/**
 * Handler for the `trajectory:step-count` assertion.
 *
 * Counts the trajectory steps (of any type) that satisfy the matcher and
 * checks the count against `min` and/or `max`. Inverse assertions pass when
 * the count falls outside the range.
 *
 * @param {object} params - Assertion parameters.
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}}
 * @throws {Error} When no trace is available, the value is malformed, or
 *   neither `min` nor `max` is given.
 */
const handleTrajectoryStepCount = (params) => {
  const steps = extractTrajectorySteps(getTraceOrThrow(params));
  const matcher = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
  const { min, max } = matcher;
  const hasMin = min !== undefined;
  const hasMax = max !== undefined;
  if (!hasMin && !hasMax) throw new Error("trajectory:step-count assertion must include a min or max property");
  const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
  const count = matchingSteps.length;
  const basePass = (!hasMin || count >= min) && (!hasMax || count <= max);
  const pass = applyInverse(basePass, params.inverse);

  // Describe which filters were applied, for the human-readable reason.
  const filterParts = [];
  if (matcher.type) {
    const types = Array.isArray(matcher.type) ? matcher.type : [matcher.type];
    filterParts.push(`type=${types.join("|")}`);
  }
  const pattern = matcher.pattern || matcher.name;
  if (pattern) filterParts.push(`pattern=${pattern}`);

  let reason;
  if (params.inverse) {
    reason = basePass
      ? `Trajectory step count satisfied the forbidden range`
      : `Trajectory step count did not satisfy the forbidden range`;
  } else {
    const pieces = [`Matched ${count} trajectory step(s)`];
    if (filterParts.length > 0) pieces.push(` for ${filterParts.join(", ")}`);
    if (hasMin && hasMax) pieces.push(` (expected ${min}-${max})`);
    else if (hasMin) pieces.push(` (expected at least ${min})`);
    else pieces.push(` (expected at most ${max})`);
    if (count > 0) pieces.push(`. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`);
    reason = pieces.join("");
  }
  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
6661
/**
 * Handler for the model-graded `trajectory:goal-success` assertion.
 *
 * Summarizes the trace, asks the grader whether the goal was achieved, and
 * returns the grader's result as-is. For inverse assertions the result is
 * copied with `pass` and `score` flipped and a matching reason.
 *
 * @param {object} params - Assertion parameters.
 * @returns {Promise<object>} The grading result.
 * @throws {Error} When no trace is available or the value has no goal.
 */
const handleTrajectoryGoalSuccess = async (params) => {
  const trace = getTraceOrThrow(params);
  const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
  const trajectorySummary = summarizeTrajectoryForJudge(trace);
  const verdict = await matchesTrajectoryGoalSuccess(
    goal,
    trajectorySummary,
    params.outputString,
    params.test.options,
    params.assertionValueContext.vars,
    params.assertion,
    params.providerCallContext
  );
  if (!params.inverse) return verdict;
  const achieved = verdict.pass;
  return {
    ...verdict,
    assertion: params.assertion,
    pass: !achieved,
    score: achieved ? 0 : 1,
    reason: achieved
      ? `Agent unexpectedly achieved the goal: ${goal}`
      : `Agent did not achieve the forbidden goal: ${goal}`
  };
};
6674
+ //#endregion
6051
6675
  //#region src/assertions/webhook.ts
6052
6676
  async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
6053
6677
  invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
@@ -6116,18 +6740,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
6116
6740
  if (pass) reason = "Assertion passed";
6117
6741
  else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
6118
6742
  else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
6119
- } else if (min !== void 0) {
6120
- const basePass = wordCount >= min;
6121
- pass = inverse ? !basePass : basePass;
6122
- if (pass) reason = "Assertion passed";
6123
- else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
6124
- else reason = `Word count ${wordCount} is less than minimum ${min}`;
6125
- } else {
6743
+ } else if (min === void 0) {
6126
6744
  const basePass = wordCount <= max;
6127
6745
  pass = inverse ? !basePass : basePass;
6128
6746
  if (pass) reason = "Assertion passed";
6129
6747
  else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
6130
6748
  else reason = `Word count ${wordCount} is greater than maximum ${max}`;
6749
+ } else {
6750
+ const basePass = wordCount >= min;
6751
+ pass = inverse ? !basePass : basePass;
6752
+ if (pass) reason = "Assertion passed";
6753
+ else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
6754
+ else reason = `Word count ${wordCount} is less than minimum ${min}`;
6131
6755
  }
6132
6756
  } else {
6133
6757
  invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
@@ -6222,6 +6846,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
6222
6846
  //#endregion
6223
6847
  //#region src/assertions/index.ts
6224
6848
  const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
6849
// Trace polling defaults, used when the matching PROMPTFOO_TRACE_FETCH_*
// environment variable is not set.
const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
// Upper bounds applied to the configured values.
const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;
const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5000;
const MAX_TRACE_FETCH_STABLE_POLLS = 10;
6225
6855
  const MODEL_GRADED_ASSERTION_TYPES = new Set([
6226
6856
  "answer-relevance",
6227
6857
  "context-faithfulness",
@@ -6231,8 +6861,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
6231
6861
  "llm-rubric",
6232
6862
  "model-graded-closedqa",
6233
6863
  "model-graded-factuality",
6234
- "search-rubric"
6864
+ "search-rubric",
6865
+ "trajectory:goal-success"
6235
6866
  ]);
6867
/**
 * Base assertion types for which trace data is loaded into the assertion
 * context: script assertions, the trace-* assertions and the trajectory:*
 * assertions.
 */
const TRACE_AWARE_ASSERTION_TYPES = new Set([
  // Script-based assertions
  "javascript",
  "python",
  "ruby",
  // Trace assertions
  "trace-error-spans",
  "trace-span-count",
  "trace-span-duration",
  // Trajectory assertions
  "trajectory:goal-success",
  "trajectory:step-count",
  "trajectory:tool-args-match",
  "trajectory:tool-sequence",
  "trajectory:tool-used"
]);
6880
/**
 * Does this assertion (or, for an assert-set, any nested assertion) have a
 * trace-aware base type?
 *
 * @param {object} assertion
 * @returns {boolean}
 */
function assertionUsesTrace(assertion) {
  if (assertion.type !== "assert-set") {
    return TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
  }
  return assertion.assert.some(assertionUsesTrace);
}
6884
/**
 * Could this assertion need trace data in its context?
 *
 * True for trace-aware assertions, for assert-sets containing such an
 * assertion, and for any assertion whose string value points at an external
 * script (a `file://` reference or a package path).
 *
 * @param {object} assertion
 * @returns {boolean}
 */
function assertionMayNeedTraceContext(assertion) {
  if (assertionUsesTrace(assertion)) return true;
  if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
  const { value } = assertion;
  if (typeof value !== "string") return false;
  return value.startsWith("file://") || isPackagePath(value);
}
6889
/**
 * Does any assertion in the list possibly need trace context?
 *
 * @param {object[]|undefined|null} assertions
 * @returns {boolean} false when the list is missing or empty.
 */
function hasTraceAwareAssertions(assertions) {
  const anyNeedsTrace = assertions?.some(assertionMayNeedTraceContext);
  return Boolean(anyNeedsTrace);
}
6892
/**
 * Load a trace from the trace store, polling until its span count settles.
 *
 * The trace is fetched up to `maxAttempts` times with `retryDelayMs` between
 * fetches. It is returned as soon as a non-zero span count has been seen on
 * `stablePolls` consecutive fetches, or on the last attempt. All three
 * settings come from PROMPTFOO_TRACE_FETCH_* environment variables and are
 * clamped to their allowed ranges.
 *
 * @param {string} traceId
 * @returns {Promise<object|null>} The most recently fetched trace (which may
 *   be whatever falsy value the store returned when nothing was found).
 */
async function loadTraceData(traceId) {
  const traceStore = getTraceStore();
  const clampedEnvInt = (name, fallback, lowest, highest) => Math.min(highest, Math.max(lowest, getEnvInt(name, fallback)));
  const maxAttempts = clampedEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS, 1, MAX_TRACE_FETCH_MAX_ATTEMPTS);
  const retryDelayMs = clampedEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS, 0, MAX_TRACE_FETCH_RETRY_DELAY_MS);
  const stablePolls = clampedEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS, 1, MAX_TRACE_FETCH_STABLE_POLLS);

  let latestTrace = null;
  let lastSpanCount = -1;
  let stableObservations = 0;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const isLastAttempt = attempt === maxAttempts - 1;
    latestTrace = await traceStore.getTrace(traceId);
    const spanCount = latestTrace?.spans?.length ?? 0;
    const hasSpans = spanCount > 0;
    // An empty trace resets the streak; a changed count restarts it at 1.
    if (!hasSpans) stableObservations = 0;
    else if (spanCount === lastSpanCount) stableObservations += 1;
    else stableObservations = 1;
    lastSpanCount = spanCount;
    if (hasSpans && (stableObservations >= stablePolls || isLastAttempt)) return latestTrace;
    if (!isLastAttempt) await sleep(retryDelayMs);
  }
  return latestTrace;
}
6236
6915
  const ASSERTION_HANDLERS = {
6237
6916
  "answer-relevance": handleAnswerRelevance,
6238
6917
  bleu: handleBleuScore,
@@ -6295,12 +6974,18 @@ const ASSERTION_HANDLERS = {
6295
6974
  ruby: handleRuby,
6296
6975
  "rouge-n": handleRougeScore,
6297
6976
  "search-rubric": handleSearchRubric,
6977
+ "skill-used": handleSkillUsed,
6298
6978
  similar: handleSimilar,
6299
6979
  "similar:cosine": handleSimilar,
6300
6980
  "similar:dot": handleSimilar,
6301
6981
  "similar:euclidean": handleSimilar,
6302
6982
  "starts-with": handleStartsWith,
6303
6983
  "tool-call-f1": handleToolCallF1,
6984
+ "trajectory:goal-success": handleTrajectoryGoalSuccess,
6985
+ "trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
6986
+ "trajectory:step-count": handleTrajectoryStepCount,
6987
+ "trajectory:tool-sequence": handleTrajectoryToolSequence,
6988
+ "trajectory:tool-used": handleTrajectoryToolUsed,
6304
6989
  "trace-error-spans": handleTraceErrorSpans,
6305
6990
  "trace-span-count": handleTraceSpanCount,
6306
6991
  "trace-span-duration": handleTraceSpanDuration,
@@ -6343,7 +7028,7 @@ function isAssertionInverse(assertion) {
6343
7028
  function getAssertionBaseType(assertion) {
6344
7029
  return isAssertionInverse(assertion) ? assertion.type.slice(4) : assertion.type;
6345
7030
  }
6346
- async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
7031
+ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
6347
7032
  const resolvedVars = vars || test.vars || {};
6348
7033
  const { cost, logProbs, output: originalOutput } = providerResponse;
6349
7034
  let output = originalOutput;
@@ -6362,14 +7047,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
6362
7047
  providerResponse,
6363
7048
  ...assertion.config ? { config: structuredClone(assertion.config) } : {}
6364
7049
  };
6365
- if (traceId) try {
6366
- const traceData = await getTraceStore().getTrace(traceId);
6367
- if (traceData) context.trace = {
6368
- traceId: traceData.traceId,
6369
- evaluationId: traceData.evaluationId,
6370
- testCaseId: traceData.testCaseId,
6371
- metadata: traceData.metadata,
6372
- spans: traceData.spans || []
7050
+ if (traceId && assertionMayNeedTraceContext(assertion)) try {
7051
+ const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
7052
+ if (resolvedTraceData) context.trace = {
7053
+ traceId: resolvedTraceData.traceId,
7054
+ evaluationId: resolvedTraceData.evaluationId,
7055
+ testCaseId: resolvedTraceData.testCaseId,
7056
+ metadata: resolvedTraceData.metadata,
7057
+ spans: resolvedTraceData.spans || []
6373
7058
  };
6374
7059
  } catch (error) {
6375
7060
  logger.debug(`Failed to fetch trace data for assertion: ${error}`);
@@ -6402,7 +7087,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
6402
7087
  };
6403
7088
  }
6404
7089
  else if (filePath.endsWith(".rb")) try {
6405
- const { runRuby } = await import("../rubyUtils-BcuGX77l.js").then((n) => n.t);
7090
+ const { runRuby } = await import("../rubyUtils-DECSbsfY.js").then((n) => n.t);
6406
7091
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
6407
7092
  logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
6408
7093
  } catch (error) {
@@ -6511,6 +7196,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
6511
7196
  index: i
6512
7197
  };
6513
7198
  }).flat();
7199
+ const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
7200
+ let preloadedTraceData;
7201
+ if (shouldPreloadTrace && traceId) try {
7202
+ preloadedTraceData = await loadTraceData(traceId);
7203
+ } catch (error) {
7204
+ logger.debug(`Failed to preload trace data for assertions: ${error}`);
7205
+ preloadedTraceData = null;
7206
+ }
6514
7207
  await async.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
6515
7208
  if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
6516
7209
  const result = await runAssertion({
@@ -6522,7 +7215,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
6522
7215
  vars,
6523
7216
  latencyMs,
6524
7217
  assertIndex: index,
6525
- traceId
7218
+ traceId,
7219
+ traceData: preloadedTraceData
6526
7220
  });
6527
7221
  assertResult.addResult({
6528
7222
  index,
@@ -6651,7 +7345,7 @@ var CIProgressReporter = class {
6651
7345
  else {
6652
7346
  const eta = remaining / rate;
6653
7347
  if (eta > 1440) etaDisplay = ">24 hours";
6654
- else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
7348
+ else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
6655
7349
  }
6656
7350
  const percentage = Math.floor(this.completedTests / this.totalTests * 100);
6657
7351
  logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
@@ -7052,12 +7746,55 @@ function isPromptAllowed(prompt, allowedPrompts) {
7052
7746
  var ProgressBarManager = class {
7053
7747
  progressBar;
7054
7748
  isWebUI;
7749
+ originalLogCallback = null;
7750
+ installedLogCallback = null;
7751
+ pendingRender = null;
7055
7752
  totalCount = 0;
7056
7753
  completedCount = 0;
7057
7754
  concurrency = 1;
7058
7755
  constructor(isWebUI) {
7059
7756
  this.isWebUI = isWebUI;
7060
7757
  }
7758
+ clearProgressBarLine() {
7759
+ readline.cursorTo(process.stderr, 0);
7760
+ readline.clearLine(process.stderr, 0);
7761
+ }
7762
+ scheduleRender() {
7763
+ if (!this.progressBar || this.pendingRender) return;
7764
+ this.pendingRender = setImmediate(() => {
7765
+ this.pendingRender = null;
7766
+ this.progressBar?.render();
7767
+ });
7768
+ }
7769
+ handleLogMessage() {
7770
+ if (!this.progressBar) return;
7771
+ this.clearProgressBarLine();
7772
+ this.scheduleRender();
7773
+ }
7774
+ /**
7775
+ * Coordinate console logging with the progress bar to prevent visual corruption.
7776
+ */
7777
+ installLogInterceptor() {
7778
+ if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
7779
+ this.originalLogCallback = globalLogCallback;
7780
+ this.installedLogCallback = (message) => {
7781
+ this.originalLogCallback?.(message);
7782
+ this.handleLogMessage();
7783
+ };
7784
+ setLogCallback(this.installedLogCallback);
7785
+ }
7786
+ /**
7787
+ * Remove the log interceptor and restore original logger callback behavior.
7788
+ */
7789
+ removeLogInterceptor() {
7790
+ if (this.pendingRender) {
7791
+ clearImmediate(this.pendingRender);
7792
+ this.pendingRender = null;
7793
+ }
7794
+ if (this.installedLogCallback && globalLogCallback === this.installedLogCallback) setLogCallback(this.originalLogCallback);
7795
+ this.installedLogCallback = null;
7796
+ this.originalLogCallback = null;
7797
+ }
7061
7798
  /**
7062
7799
  * Initialize progress bar
7063
7800
  */
@@ -7077,7 +7814,8 @@ var ProgressBarManager = class {
7077
7814
  return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
7078
7815
  },
7079
7816
  hideCursor: true,
7080
- gracefulExit: true
7817
+ gracefulExit: true,
7818
+ stream: process.stderr
7081
7819
  }, cliProgress.Presets.shades_classic);
7082
7820
  this.progressBar.start(this.totalCount, 0, {
7083
7821
  provider: "",
@@ -7352,6 +8090,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
7352
8090
  const parts = traceContext.traceparent.split("-");
7353
8091
  if (parts.length >= 3) traceId = parts[1];
7354
8092
  }
8093
+ if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
7355
8094
  const checkResult = await runAssertions({
7356
8095
  prompt: renderedPrompt,
7357
8096
  provider,
@@ -7749,7 +8488,7 @@ var Evaluator = class {
7749
8488
  const defaultProvider = testSuite.defaultTest.provider;
7750
8489
  if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
7751
8490
  else if (typeof defaultProvider === "object" && defaultProvider.id) {
7752
- const { loadApiProvider } = await import("../providers-CH3C7zf7.js");
8491
+ const { loadApiProvider } = await import("../providers-DEYiFVAo.js");
7753
8492
  testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
7754
8493
  } else testCase.provider = defaultProvider;
7755
8494
  }
@@ -7833,7 +8572,7 @@ var Evaluator = class {
7833
8572
  if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
7834
8573
  }
7835
8574
  if (state.resume && this.evalRecord.persisted) try {
7836
- const { default: EvalResult } = await import("../evalResult-Dx5P5cIv.js");
8575
+ const { default: EvalResult } = await import("../evalResult-CuvJeNiM.js");
7837
8576
  const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
7838
8577
  const originalCount = runEvalOptions.length;
7839
8578
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -8033,7 +8772,7 @@ var Evaluator = class {
8033
8772
  if (isCI() && !isWebUI) {
8034
8773
  ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
8035
8774
  ciProgressReporter.start();
8036
- } else if (this.options.showProgressBar && process.stdout.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
8775
+ } else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
8037
8776
  this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
8038
8777
  if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
8039
8778
  if (isWebUI) {
@@ -8054,7 +8793,10 @@ var Evaluator = class {
8054
8793
  if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
8055
8794
  if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
8056
8795
  }
8057
- if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
8796
+ if (this.options.showProgressBar && progressBarManager) {
8797
+ await progressBarManager.initialize(runEvalOptions, concurrency, 0);
8798
+ progressBarManager.installLogInterceptor();
8799
+ }
8058
8800
  try {
8059
8801
  if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
8060
8802
  checkAbort();
@@ -8080,7 +8822,10 @@ var Evaluator = class {
8080
8822
  else if (!targetUnavailable) {
8081
8823
  logger.info("Evaluation interrupted, saving progress...");
8082
8824
  if (globalTimeout) clearTimeout(globalTimeout);
8083
- if (progressBarManager) progressBarManager.stop();
8825
+ if (progressBarManager) {
8826
+ progressBarManager.removeLogInterceptor();
8827
+ progressBarManager.stop();
8828
+ }
8084
8829
  if (ciProgressReporter) ciProgressReporter.finish();
8085
8830
  this.evalRecord.setVars(Array.from(vars));
8086
8831
  await this.evalRecord.addPrompts(prompts);
@@ -8088,6 +8833,10 @@ var Evaluator = class {
8088
8833
  return this.evalRecord;
8089
8834
  }
8090
8835
  } else {
8836
+ if (progressBarManager) {
8837
+ progressBarManager.removeLogInterceptor();
8838
+ progressBarManager.stop();
8839
+ }
8091
8840
  if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
8092
8841
  throw err;
8093
8842
  }
@@ -8230,6 +8979,7 @@ var Evaluator = class {
8230
8979
  await this.evalRecord.addPrompts(prompts);
8231
8980
  try {
8232
8981
  if (progressBarManager) {
8982
+ progressBarManager.removeLogInterceptor();
8233
8983
  progressBarManager.complete();
8234
8984
  progressBarManager.stop();
8235
8985
  } else if (ciProgressReporter) ciProgressReporter.finish();
@@ -9034,8 +9784,7 @@ function testCaseFromCsvRow(row) {
9034
9784
  logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
9035
9785
  } else if (key.startsWith("__config:")) {
9036
9786
  const configParts = key.slice(9).split(":");
9037
- if (configParts.length !== 2) logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
9038
- else {
9787
+ if (configParts.length === 2) {
9039
9788
  const [expectedKey, configKey] = configParts;
9040
9789
  let targetIndex;
9041
9790
  if (expectedKey === "__expected") targetIndex = 0;
@@ -9061,7 +9810,7 @@ function testCaseFromCsvRow(row) {
9061
9810
  }
9062
9811
  }
9063
9812
  assertionConfigs[targetIndex][configKey] = parsedValue;
9064
- }
9813
+ } else logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
9065
9814
  } else vars[key] = value;
9066
9815
  }
9067
9816
  for (let i = 0; i < asserts.length; i++) {
@@ -9190,14 +9939,14 @@ async function parseXlsxFile(filePath) {
9190
9939
  const sheetName = typeof sheetOption === "number" ? sheetNames[sheetOption - 1] : sheetOption;
9191
9940
  const rows = await readXlsxFile(actualFilePath, { sheet: sheetOption });
9192
9941
  if (rows.length === 0) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
9193
- const headers = rows[0].map((cell) => cell != null ? String(cell) : "");
9942
+ const headers = rows[0].map((cell) => cell == null ? "" : String(cell));
9194
9943
  if (headers.length === 0 || headers.every((h) => h === "")) throw new Error(`Sheet "${sheetName}" has no valid column headers`);
9195
9944
  if (rows.length === 1) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
9196
9945
  const data = rows.slice(1).map((row) => {
9197
9946
  const obj = {};
9198
9947
  headers.forEach((header, index) => {
9199
9948
  const cellValue = row[index];
9200
- obj[header] = cellValue != null ? String(cellValue) : "";
9949
+ obj[header] = cellValue == null ? "" : String(cellValue);
9201
9950
  });
9202
9951
  return obj;
9203
9952
  });
@@ -12638,20 +13387,19 @@ function generateEvalSummary(params) {
12638
13387
  }
12639
13388
  }
12640
13389
  lines.push("");
12641
- const passRate = successes / (successes + failures + errors) * 100;
12642
- let passRateDisplay;
12643
- if (!Number.isNaN(passRate)) {
12644
- const passRateFormatted = passRate === 0 || passRate === 100 ? `${passRate.toFixed(0)}%` : `${passRate.toFixed(2)}%`;
12645
- if (passRate >= 100) passRateDisplay = chalk.green.bold(passRateFormatted);
12646
- else if (passRate >= 80) passRateDisplay = chalk.yellow.bold(passRateFormatted);
12647
- else passRateDisplay = chalk.red.bold(passRateFormatted);
12648
- }
12649
- const passedPart = successes > 0 ? `${chalk.green("✓")} ${chalk.green.bold(successes.toLocaleString())} passed` : `${chalk.gray.bold(successes.toLocaleString())} passed`;
12650
- const failedPart = failures > 0 ? `${chalk.red("✗")} ${chalk.red.bold(failures.toLocaleString())} failed` : `${chalk.gray.bold(failures.toLocaleString())} failed`;
13390
+ const totalTests = successes + failures + errors;
13391
+ const formatResultPercentage = (count) => {
13392
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
13393
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
13394
+ };
13395
+ const formatResultLine = (count, label, icon, iconColor) => {
13396
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count)})`)}`;
13397
+ };
12651
13398
  const errorLabel = errors === 1 ? "error" : "errors";
12652
- const resultsLine = `${passedPart}, ${failedPart}, ${errors > 0 ? `${chalk.red("✗")} ${chalk.red.bold(errors.toLocaleString())} ${errorLabel}` : `${chalk.gray.bold(errors.toLocaleString())} ${errorLabel}`}`;
12653
- if (Number.isNaN(passRate)) lines.push(`${chalk.bold("Results:")} ${resultsLine}`);
12654
- else lines.push(`${chalk.bold("Results:")} ${resultsLine} (${passRateDisplay})`);
13399
+ lines.push(chalk.bold("Results:"));
13400
+ lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green));
13401
+ lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red));
13402
+ lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red));
12655
13403
  const durationDisplay = formatDuration(duration);
12656
13404
  lines.push(chalk.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
12657
13405
  lines.push("");
@@ -12812,8 +13560,8 @@ var ModelAudit = class ModelAudit {
12812
13560
  this.issues = data.issues || data.results?.issues || null;
12813
13561
  const issues = data.issues || data.results?.issues;
12814
13562
  const resultsHasErrors = data.results?.has_errors ?? false;
12815
- if (data.hasErrors !== void 0) this.hasErrors = data.hasErrors;
12816
- else this.hasErrors = resultsHasErrors || issues && issues.some((issue) => issue.severity === "critical" || issue.severity === "error") || false;
13563
+ if (data.hasErrors === void 0) this.hasErrors = resultsHasErrors || issues && issues.some((issue) => issue.severity === "critical" || issue.severity === "error") || false;
13564
+ else this.hasErrors = data.hasErrors;
12817
13565
  this.totalChecks = data.totalChecks;
12818
13566
  this.passedChecks = data.passedChecks;
12819
13567
  this.failedChecks = data.failedChecks;
@@ -13228,7 +13976,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
13228
13976
  await checkCloudPermissions(config);
13229
13977
  const options = {
13230
13978
  ...evaluateOptions,
13231
- showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar !== void 0 ? cmdObj.progressBar !== false : evaluateOptions.showProgressBar !== void 0 ? evaluateOptions.showProgressBar : true,
13979
+ showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
13232
13980
  repeat,
13233
13981
  delay: !Number.isNaN(delay) && delay > 0 ? delay : void 0,
13234
13982
  maxConcurrency,
@@ -13612,7 +14360,7 @@ async function doRedteamRun(options) {
13612
14360
  redteamConfig = await doGenerateRedteam({
13613
14361
  ...passThroughOptions,
13614
14362
  ...options.liveRedteamConfig?.commandLineOptions || {},
13615
- ...maxConcurrency !== void 0 ? { maxConcurrency } : {},
14363
+ ...maxConcurrency === void 0 ? {} : { maxConcurrency },
13616
14364
  config: configPath,
13617
14365
  output: redteamPath,
13618
14366
  force: options.force,
@@ -14434,7 +15182,7 @@ evalRouter.post("/", async (req, res) => {
14434
15182
  logger.debug("[POST /api/eval] Saving eval results (v4) to database");
14435
15183
  const eval_ = await Eval.create(incEval.config, incEval.prompts || [], {
14436
15184
  author: incEval.author,
14437
- createdAt: incEval.createdAt !== void 0 ? new Date(incEval.createdAt) : void 0,
15185
+ createdAt: incEval.createdAt === void 0 ? void 0 : new Date(incEval.createdAt),
14438
15186
  results: incEval.results,
14439
15187
  vars: incEval.vars
14440
15188
  });
@@ -17308,7 +18056,7 @@ router.get("/", async (_req, res) => {
17308
18056
  };
17309
18057
  } catch (error) {
17310
18058
  logger.debug(`Failed to fetch latest version: ${error}`);
17311
- latestVersion = versionCache.latestVersion ?? "0.121.2";
18059
+ latestVersion = versionCache.latestVersion ?? "0.121.3";
17312
18060
  }
17313
18061
  }
17314
18062
  const selfHosted = getEnvBool("PROMPTFOO_SELF_HOSTED");
@@ -17317,7 +18065,7 @@ router.get("/", async (_req, res) => {
17317
18065
  selfHosted,
17318
18066
  isNpx
17319
18067
  });
17320
- const resolvedLatestVersion = latestVersion ?? "0.121.2";
18068
+ const resolvedLatestVersion = latestVersion ?? "0.121.3";
17321
18069
  const response = {
17322
18070
  currentVersion: VERSION,
17323
18071
  latestVersion: resolvedLatestVersion,