promptfoo 0.121.1 → 0.121.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. package/README.md +2 -0
  2. package/dist/src/{accounts-xrUGFA6n.js → accounts-B2XmGjty.js} +5 -5
  3. package/dist/src/{accounts-Bx-x3bmW.cjs → accounts-BPyfpSeU.cjs} +5 -5
  4. package/dist/src/{accounts-CMqkzrVf.js → accounts-CFLK3mnD.js} +6 -6
  5. package/dist/src/{accounts-BgNJDBE6.js → accounts-Xatc0RYb.js} +5 -5
  6. package/dist/src/{agentic-utils-BKIN5PKu.js → agentic-utils-36epdqwB.js} +3 -3
  7. package/dist/src/{cometapi-DkXrKi5z.js → agentic-utils-D8yXo5Lm.js} +4 -61
  8. package/dist/src/{cometapi-vY6aDZgo.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
  9. package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
  10. package/dist/src/{agents-C-dDThPK.js → agents-BBVJCIYr.js} +226 -13
  11. package/dist/src/{agents-CErsqg5U.cjs → agents-BBWxKSM0.cjs} +7 -7
  12. package/dist/src/{agents-Dy2YpZpa.js → agents-Bqgfdokm.js} +227 -14
  13. package/dist/src/{agents-B0f4HICh.cjs → agents-CAYbM7qD.cjs} +226 -13
  14. package/dist/src/{agents-CVIn-Utx.js → agents-CLQ-P15P.js} +7 -7
  15. package/dist/src/{agents-DeH4Gu94.js → agents-CgBniSlI.js} +8 -8
  16. package/dist/src/{agents-CXknwsFX.js → agents-DSSTV4bv.js} +226 -13
  17. package/dist/src/{agents-aF4-T121.js → agents-wg3ohknq.js} +7 -7
  18. package/dist/src/{aimlapi-tg0Gkcvr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
  19. package/dist/src/{aimlapi-BNfTBexL.js → aimlapi-BwGC1TtS.js} +13 -13
  20. package/dist/src/{aimlapi-BAGZDo5G.js → aimlapi-DaC3qZ-o.js} +14 -14
  21. package/dist/src/{aimlapi-DHRKlBEA.js → aimlapi-MgSLdvy7.js} +13 -13
  22. package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
  23. package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
  24. package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
  25. package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
  26. package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
  27. package/dist/src/app/index.html +4 -4
  28. package/dist/src/{audio-tf_NBjlC.js → audio-Bn44pQxv.js} +4 -4
  29. package/dist/src/{audio-CHQ4r-RV.js → audio-DDA5WHdx.js} +4 -4
  30. package/dist/src/{audio-BWeaWovU.cjs → audio-DVFjQ67_.cjs} +4 -4
  31. package/dist/src/{audio-BRODU0UK.js → audio-DjU9GswO.js} +5 -5
  32. package/dist/src/{base-DBtwl2FR.cjs → base-BboXIF_0.cjs} +3 -3
  33. package/dist/src/{base-B4QJRyFS.js → base-CKjwebIH.js} +3 -3
  34. package/dist/src/{base-B0tcrnq_.js → base-CqzQ4K8j.js} +3 -3
  35. package/dist/src/{base-fEDN28WM.js → base-Cz2ZC_iA.js} +3 -3
  36. package/dist/src/{blobs-BAU-dXan.js → blobs-B1JriOyi.js} +3 -3
  37. package/dist/src/{blobs-qTYm-1PY.js → blobs-BUWmKWzo.js} +3 -3
  38. package/dist/src/{blobs-DvS-O6be.cjs → blobs-C6j0bvFz.cjs} +3 -3
  39. package/dist/src/{blobs-Bpg5rH6i.js → blobs-DXTl6J3H.js} +3 -3
  40. package/dist/src/{cache-COish3-W.cjs → cache-C5yFZ4gC.cjs} +75 -58
  41. package/dist/src/{cache-8XhNqPKW.js → cache-CaT5tPgo.js} +75 -58
  42. package/dist/src/cache-CyCanoMu.js +6 -0
  43. package/dist/src/{cache-CG0SlR1d.js → cache-DSqR6ezl.js} +75 -58
  44. package/dist/src/cache-Df_QFDNu.cjs +5 -0
  45. package/dist/src/{cache-D3eqDYGU.js → cache-HP0NP4k3.js} +75 -58
  46. package/dist/src/{chat-DHMH-N64.js → chat-B-52XYI1.js} +12 -12
  47. package/dist/src/{chat-BKm79wib.js → chat-B0iaWhoh.js} +16 -14
  48. package/dist/src/{chat-DxysjBvt.js → chat-BE0qTA8e.js} +13 -13
  49. package/dist/src/{chat-CRWNNq73.js → chat-BEwdgGEg.js} +16 -14
  50. package/dist/src/{chat-2K608PeQ.cjs → chat-BtIKkLKx.cjs} +13 -13
  51. package/dist/src/{chat-DaqekjFr.cjs → chat-CM8qWR3_.cjs} +17 -15
  52. package/dist/src/{chat-CM_kyI8B.js → chat-DK1U-eZ-.js} +12 -12
  53. package/dist/src/{chat-CznLWr_D.js → chat-pxmiVpWe.js} +16 -14
  54. package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
  55. package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
  56. package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
  57. package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
  58. package/dist/src/{claude-agent-sdk-CJH22shf.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
  59. package/dist/src/{claude-agent-sdk-Dy5lT-Tx.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
  60. package/dist/src/{claude-agent-sdk-BLTu0WBO.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
  61. package/dist/src/{claude-agent-sdk-D6_k9FKA.js → claude-agent-sdk-HgbFioFw.js} +33 -15
  62. package/dist/src/cloud-DE3t1-ZI.js +4 -0
  63. package/dist/src/{cloud-Bc9526yV.js → cloud-z8KZpUoa.js} +3 -3
  64. package/dist/src/{cloudflare-ai-CWWJCRim.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
  65. package/dist/src/{cloudflare-ai-C9r2sRhw.js → cloudflare-ai-Bbp26N0L.js} +13 -13
  66. package/dist/src/{cloudflare-ai-ClWSdor4.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
  67. package/dist/src/{cloudflare-ai-ICsOuD-z.js → cloudflare-ai-DdKP9TKT.js} +14 -14
  68. package/dist/src/{cloudflare-gateway-D6xFc5pa.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
  69. package/dist/src/{cloudflare-gateway-D6O7AlYb.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
  70. package/dist/src/{cloudflare-gateway-pXGHxJ47.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
  71. package/dist/src/{cloudflare-gateway-C2_-KG5o.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
  72. package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
  73. package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
  74. package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
  75. package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
  76. package/dist/src/cometapi-BDyV-NNm.js +62 -0
  77. package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
  78. package/dist/src/{cometapi-Bbjp5V4x.js → cometapi-hhL4TAh3.js} +14 -14
  79. package/dist/src/{cometapi-BasUi7-_.js → cometapi-sp7sJpBD.js} +15 -15
  80. package/dist/src/{completion-C_P3ypkJ.js → completion-BCimtq-h.js} +6 -6
  81. package/dist/src/{completion-6Mx_iXxK.js → completion-DCjv7RZ3.js} +6 -6
  82. package/dist/src/{completion-CDOouNzq.cjs → completion-DlXUhj5c.cjs} +6 -6
  83. package/dist/src/{completion-C5rtR_9P.js → completion-DoYy49ti.js} +6 -6
  84. package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
  85. package/dist/src/{docker-BwsKwxFs.cjs → docker-Cqj2-QVi.cjs} +14 -14
  86. package/dist/src/{docker-CZnqU1XV.js → docker-CxCkwMzc.js} +13 -13
  87. package/dist/src/{docker-DzxyDPIj.js → docker-DpguQj-w.js} +14 -14
  88. package/dist/src/{docker-5KcG-_86.js → docker-FeBni2dw.js} +13 -13
  89. package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
  90. package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
  91. package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
  92. package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
  93. package/dist/src/eval-7aEqoMs3.js +15 -0
  94. package/dist/src/{eval-DmFyWU7i.js → eval-BTqTn7lb.js} +10 -10
  95. package/dist/src/{evalResult-CDQiuUuf.js → evalResult-BkIhRdTe.js} +7 -7
  96. package/dist/src/evalResult-CYNHkk5A.js +12 -0
  97. package/dist/src/evalResult-CuvJeNiM.js +10 -0
  98. package/dist/src/{evalResult-CTG2AHOS.js → evalResult-DUDShQrm.js} +7 -7
  99. package/dist/src/{evalResult-Dap2CekP.cjs → evalResult-DpARzUCb.cjs} +7 -7
  100. package/dist/src/evalResult-tGdilrWt.cjs +10 -0
  101. package/dist/src/evaluator-BBUqRhz1.js +36 -0
  102. package/dist/src/{evaluator-DPFRbFIL.js → evaluator-BcvOGaam.js} +833 -79
  103. package/dist/src/{extractor-YMU_Gvt8.js → extractor-C8XwivI9.js} +6 -6
  104. package/dist/src/{extractor-CFG6bcWJ.js → extractor-CAZ2G3Kh.js} +6 -6
  105. package/dist/src/{extractor-DX36oYEv.cjs → extractor-DG3sSfXE.cjs} +6 -6
  106. package/dist/src/{extractor-M67RUtg6.js → extractor-D_wd8jxt.js} +6 -6
  107. package/dist/src/{fetch-4M3YRaqL.js → fetch-BiYv2BZc.js} +3 -3
  108. package/dist/src/{fetch-BxUk8odA.cjs → fetch-BnR9wSnm.cjs} +3 -3
  109. package/dist/src/{fetch-60Gzydls.js → fetch-CVAtKnI3.js} +3 -3
  110. package/dist/src/{fetch-BMv0O527.js → fetch-DoVRJZhJ.js} +4 -4
  111. package/dist/src/fetch-UWU706qb.js +5 -0
  112. package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
  113. package/dist/src/{graders-DOXycdlG.cjs → graders-BElhu9ZY.cjs} +126 -55
  114. package/dist/src/{graders-R9rYUM0d.js → graders-BXAJ0sbS.js} +120 -55
  115. package/dist/src/graders-BxfEguVY.js +32 -0
  116. package/dist/src/graders-CzVMbEnv.js +34 -0
  117. package/dist/src/{graders-CpdqD9PI.js → graders-DG7mhg-b.js} +120 -55
  118. package/dist/src/graders-DjCXfj0l.cjs +32 -0
  119. package/dist/src/{graders-CHO8EPM4.js → graders-RjHF8VfG.js} +120 -55
  120. package/dist/src/graders-kHzIWOKu.js +32 -0
  121. package/dist/src/{image-DTedmQPg.cjs → image--F58eEIn.cjs} +6 -6
  122. package/dist/src/{image-DJEvKveK.js → image-6WQXK8m8.js} +4 -4
  123. package/dist/src/{image-pAX56tPG.js → image-B8b6f36E.js} +6 -6
  124. package/dist/src/{image-BmEZqVmk.js → image-CoxZp9PZ.js} +6 -6
  125. package/dist/src/{image-gvmivTEe.js → image-DO0RYnjH.js} +5 -5
  126. package/dist/src/{image-CBBVXWuT.js → image-PoF6DN3x.js} +6 -6
  127. package/dist/src/{image-CDLQOcqT.cjs → image-fza3zuKs.cjs} +4 -4
  128. package/dist/src/{image-tL5hIOFh.js → image-xNbw5ph2.js} +4 -4
  129. package/dist/src/index.cjs +863 -110
  130. package/dist/src/index.d.cts +833 -60
  131. package/dist/src/index.d.ts +833 -60
  132. package/dist/src/index.js +860 -108
  133. package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
  134. package/dist/src/{knowledgeBase-CoU-UQBg.js → knowledgeBase-Bi7CmDbx.js} +7 -7
  135. package/dist/src/{knowledgeBase-CLJybhnF.js → knowledgeBase-Ce3ofVan.js} +8 -8
  136. package/dist/src/{knowledgeBase-DjWPVqSb.js → knowledgeBase-DFRXPZl_.js} +7 -7
  137. package/dist/src/{knowledgeBase-wkxuRFhA.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
  138. package/dist/src/{litellm-B9Hysuri.js → litellm-Bo2gQXpo.js} +16 -15
  139. package/dist/src/{litellm-ePxtr9F1.js → litellm-CKiAxnoM.js} +15 -14
  140. package/dist/src/{litellm-NYpQ8RQu.cjs → litellm-CnHI69aj.cjs} +16 -15
  141. package/dist/src/{litellm-CTfa0hqi.js → litellm-Tc294Jhj.js} +15 -14
  142. package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
  143. package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
  144. package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
  145. package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
  146. package/dist/src/{luma-ray-BW9IRGIc.js → luma-ray-0ehMPt5N.js} +10 -10
  147. package/dist/src/{luma-ray-BE2mOt6N.js → luma-ray-C9q8rdQe.js} +9 -9
  148. package/dist/src/{luma-ray-Cm1KZBhs.js → luma-ray-DP0QA9qn.js} +9 -9
  149. package/dist/src/{luma-ray-B0GGNRc1.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
  150. package/dist/src/main.js +69 -71
  151. package/dist/src/{messages-1x9atZmP.js → messages-DJNo37Ko.js} +14 -9
  152. package/dist/src/{messages-BLbWdsyt.js → messages-Dy9QecMs.js} +14 -9
  153. package/dist/src/{messages-1JrJs91T.cjs → messages-HJsyEh4o.cjs} +15 -10
  154. package/dist/src/{messages-D8EA0oDc.js → messages-biC_ex-p.js} +14 -9
  155. package/dist/src/{modelslab-C1OLRmVX.js → modelslab-B5J-ZM5c.js} +9 -9
  156. package/dist/src/{modelslab-CqXBy3U8.js → modelslab-BI458moT.js} +10 -10
  157. package/dist/src/{modelslab-X5-4LroM.js → modelslab-BTOT8FUO.js} +9 -9
  158. package/dist/src/{modelslab-DcOSFwKh.cjs → modelslab-IQbNg-r7.cjs} +9 -9
  159. package/dist/src/{nova-reel-DihqLeol.js → nova-reel-BZ9y-Y5s.js} +9 -9
  160. package/dist/src/{nova-reel-D9xfaMBs.cjs → nova-reel-CE5etkv9.cjs} +9 -9
  161. package/dist/src/{nova-reel-D2ZkOSyr.js → nova-reel-DEeQlnOJ.js} +10 -10
  162. package/dist/src/{nova-reel-BgS1ZWuK.js → nova-reel-Xw1SXLpg.js} +9 -9
  163. package/dist/src/{nova-sonic-Q3BOJeig.js → nova-sonic-DWswpN1E.js} +7 -7
  164. package/dist/src/{nova-sonic-DezhVUYT.js → nova-sonic-DXTLpi-r.js} +6 -6
  165. package/dist/src/{nova-sonic-DVu3mMIy.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
  166. package/dist/src/{nova-sonic-P-CdUMlV.js → nova-sonic-Ogqf-csn.js} +6 -6
  167. package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
  168. package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
  169. package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
  170. package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
  171. package/dist/src/{openclaw-Bv1DINsX.js → openclaw-0Sv7AK3O.js} +172 -109
  172. package/dist/src/{openclaw-DAfWQn-o.cjs → openclaw-CXxbKgDH.cjs} +174 -110
  173. package/dist/src/{openclaw-BiSZPL7J.js → openclaw-D1FSCps-.js} +172 -109
  174. package/dist/src/{openclaw-D1D_ej1z.js → openclaw-D2ENvu7a.js} +173 -110
  175. package/dist/src/{opencode-sdk-D95s6SnR.js → opencode-sdk-C71Z0ehR.js} +13 -13
  176. package/dist/src/{opencode-sdk-DxUPkLT7.js → opencode-sdk-CHCs7dEb.js} +12 -12
  177. package/dist/src/{opencode-sdk-C7m-wRfI.js → opencode-sdk-DDxj4QqH.js} +12 -12
  178. package/dist/src/{opencode-sdk-CfaLN8PY.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
  179. package/dist/src/{otlpReceiver-g3ByGaXs.js → otlpReceiver-C9KlUtxh.js} +6 -6
  180. package/dist/src/{otlpReceiver--AIRW_S4.js → otlpReceiver-CZL48YfC.js} +6 -6
  181. package/dist/src/{otlpReceiver-Bn5wGB1v.js → otlpReceiver-CavGAA6k.js} +6 -6
  182. package/dist/src/{otlpReceiver-Diec4cln.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
  183. package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
  184. package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
  185. package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
  186. package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
  187. package/dist/src/providers-BSLEaIQG.js +32 -0
  188. package/dist/src/{providers-CFu-TZl-.cjs → providers-CScd1wN6.cjs} +733 -464
  189. package/dist/src/{providers-CFLy1_ji.js → providers-Ch6Mr0gn.js} +795 -526
  190. package/dist/src/{providers-BKRJTjBz.js → providers-Cn73d5sr.js} +795 -526
  191. package/dist/src/providers-D-FnDg8k.cjs +31 -0
  192. package/dist/src/providers-DEYiFVAo.js +30 -0
  193. package/dist/src/{providers-B3HvufyI.js → providers-DvddrgxL.js} +795 -526
  194. package/dist/src/providers-sS2WI8YD.js +30 -0
  195. package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
  196. package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
  197. package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
  198. package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
  199. package/dist/src/{quiverai-CI6gYJVI.js → quiverai-BeofbLVc.js} +4 -4
  200. package/dist/src/{quiverai-MHSxbmmZ.js → quiverai-CCQn73lq.js} +5 -5
  201. package/dist/src/{quiverai-CLkWkyZc.cjs → quiverai-CcUhPIBg.cjs} +4 -4
  202. package/dist/src/{quiverai-C2jVwbH1.js → quiverai-DVSEqJiq.js} +4 -4
  203. package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
  204. package/dist/src/{responses-CGw0DCzh.js → responses-BKP_WYis.js} +16 -12
  205. package/dist/src/{responses-BKqJmhhc.js → responses-CQb1Tj69.js} +16 -12
  206. package/dist/src/{responses-jxdehPkC.js → responses-CgNyTPsY.js} +16 -12
  207. package/dist/src/{responses-tD4Bd4dc.cjs → responses-mo0KQDbu.cjs} +16 -12
  208. package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
  209. package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
  210. package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
  211. package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
  212. package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
  213. package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
  214. package/dist/src/{sagemaker-BK4Zb993.js → sagemaker-CVv8W7so.js} +17 -17
  215. package/dist/src/{sagemaker-D2Q1c-sD.js → sagemaker-CqeASYE5.js} +17 -17
  216. package/dist/src/{sagemaker-BfiWTmvn.js → sagemaker-MUbD5V3v.js} +18 -18
  217. package/dist/src/{sagemaker-CcQHM1jV.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
  218. package/dist/src/{scanner-J8CA3LsV.js → scanner-DVDeUz1r.js} +10 -10
  219. package/dist/src/server/index.js +864 -112
  220. package/dist/src/server-B0Xh1Gx-.js +7 -0
  221. package/dist/src/{server-B0PPuDw-.cjs → server-BtoCXeXI.cjs} +4 -4
  222. package/dist/src/{server-BC7XJFgr.js → server-CP9qKM40.js} +4 -4
  223. package/dist/src/{server-OAs3nBRT.js → server-Cns05F1j.js} +5 -5
  224. package/dist/src/server-DJTKu9IR.cjs +5 -0
  225. package/dist/src/{server-DbFphssR.js → server-DZ9MtCn0.js} +6 -6
  226. package/dist/src/{signal-BOTbd53Z.js → signal-C3ZTsUgi.js} +3 -3
  227. package/dist/src/{slack-DXMKtA-f.js → slack-2sdpGzbt.js} +2 -2
  228. package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
  229. package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
  230. package/dist/src/{slack-DOdy_kyv.js → slack-DCEV-vWP.js} +2 -2
  231. package/dist/src/store-C5u6MgC8.js +6 -0
  232. package/dist/src/{store-BSc-TF2w.cjs → store-CLyU7AtI.cjs} +17 -5
  233. package/dist/src/store-CNHk-De4.cjs +5 -0
  234. package/dist/src/{store-DQLEjuEO.js → store-Cj258DgL.js} +17 -5
  235. package/dist/src/{store-D1tv90v3.js → store-P8OKm19S.js} +17 -5
  236. package/dist/src/{store-Ub2vaGJ1.js → store-VB0GP46K.js} +17 -5
  237. package/dist/src/{tables-xKANLRBD.js → tables-BEIFz2tM.js} +3 -3
  238. package/dist/src/{tables-C7K-XKWp.cjs → tables-BdZQEpRz.cjs} +3 -3
  239. package/dist/src/{tables-D36WTqKX.js → tables-DmzvLbeZ.js} +3 -3
  240. package/dist/src/{tables-5EvT_Bwn.js → tables-kC7R5kiK.js} +3 -3
  241. package/dist/src/{telemetry-C2YDkUQH.js → telemetry-BnH5VJAU.js} +4 -4
  242. package/dist/src/{telemetry-C15ziL8u.js → telemetry-BugWqKiu.js} +4 -4
  243. package/dist/src/{telemetry-DMb2Mpfm.js → telemetry-DPXLd7UE.js} +4 -4
  244. package/dist/src/telemetry-Yig0Tino.js +7 -0
  245. package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
  246. package/dist/src/{telemetry-CbrnxHp_.cjs → telemetry-re627Lre.cjs} +4 -4
  247. package/dist/src/{transcription-CL78qbOU.cjs → transcription-BvtsrzRG.cjs} +13 -13
  248. package/dist/src/{transcription-DAtxHhAM.js → transcription-CaMivnjG.js} +13 -13
  249. package/dist/src/{transcription-QHh3AH6Z.js → transcription-DOMMTu01.js} +14 -14
  250. package/dist/src/{transcription-LNZTNUUL.js → transcription-Hb3VnC4M.js} +13 -13
  251. package/dist/src/{transform-DOcQeLld.cjs → transform-0BwoBsvO.cjs} +19 -5
  252. package/dist/src/{transform-DGxXocjk.js → transform-B2-jIv68.js} +8 -6
  253. package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
  254. package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
  255. package/dist/src/transform-ChNIpHz7.js +6 -0
  256. package/dist/src/{transform-Cgi24fJ7.js → transform-DrleutM3.js} +8 -6
  257. package/dist/src/{transform-DGLazrMm.js → transform-DyDAwEpE.js} +8 -6
  258. package/dist/src/transform-PtQ6rAE3.cjs +5 -0
  259. package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
  260. package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
  261. package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
  262. package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
  263. package/dist/src/{types-CH3Ge2sE.js → types-CIhFeUC4.js} +45 -11
  264. package/dist/src/{types-CN_TZ2GJ.js → types-Cd3ygw8W.js} +45 -11
  265. package/dist/src/{types-LJ0r3wbR.cjs → types-D8cGDZbL.cjs} +46 -12
  266. package/dist/src/{types-CLKiCBW3.js → types-q8GXGF65.js} +45 -11
  267. package/dist/src/{util-CchiqXh_.cjs → util--9u9UVCt.cjs} +3 -3
  268. package/dist/src/{util-5cB-L7U3.js → util-BLvy9qfE.js} +7 -11
  269. package/dist/src/{util-YT5HPZaS.js → util-Bm3E9jpK.js} +7 -11
  270. package/dist/src/{util-6-GqIvzS.js → util-BtoGs5Cb.js} +18 -4
  271. package/dist/src/{util-Db0a0AFH.cjs → util-CFj4YKIn.cjs} +18 -4
  272. package/dist/src/{util-Dlz_Wvgm.js → util-CMMkIxfU.js} +7 -11
  273. package/dist/src/{util-Betm42rL.js → util-CgDCK4KI.js} +18 -4
  274. package/dist/src/{util-Yz-1aEhW.cjs → util-CuLo2pMR.cjs} +7 -11
  275. package/dist/src/{util-C-PPYSMq.js → util-DM2rTn_6.js} +18 -4
  276. package/dist/src/{util-B7T3SiBS.js → util-DMFeUvLz.js} +3 -3
  277. package/dist/src/{util-ZZH-3QZz.js → util-DbVG-yZU.js} +3 -3
  278. package/dist/src/{util-DaWTWKBK.js → util-vNmDL5DT.js} +3 -3
  279. package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
  280. package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
  281. package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
  282. package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
  283. package/dist/tsconfig.tsbuildinfo +1 -1
  284. package/package.json +38 -38
  285. package/dist/src/app/assets/index-BFCZg7hQ.js +0 -439
  286. package/dist/src/app/assets/index-NCn4eVBv.css +0 -1
  287. package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
  288. package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
  289. package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
  290. package/dist/src/cache-Bbn1Nyrd.cjs +0 -5
  291. package/dist/src/cache-BwsMSda7.js +0 -6
  292. package/dist/src/cloud-DmE0EwsY.js +0 -4
  293. package/dist/src/eval-17JizQIv.js +0 -15
  294. package/dist/src/evalResult-Cqj8pldJ.js +0 -12
  295. package/dist/src/evalResult-DvcJAWJU.cjs +0 -10
  296. package/dist/src/evalResult-Hftn-S_i.js +0 -10
  297. package/dist/src/evaluator-B2CFNt-P.js +0 -36
  298. package/dist/src/fetch-KV5kNASw.js +0 -5
  299. package/dist/src/graders-Bu0H9nXi.js +0 -32
  300. package/dist/src/graders-Cfhkvx-e.js +0 -34
  301. package/dist/src/graders-DClJVpGP.cjs +0 -32
  302. package/dist/src/graders-DcnJsrMO.js +0 -32
  303. package/dist/src/providers-C1rOSHiR.js +0 -32
  304. package/dist/src/providers-CxmDwEFf.cjs +0 -31
  305. package/dist/src/providers-Dodakqr0.js +0 -30
  306. package/dist/src/providers-GIQ2TcsA.js +0 -30
  307. package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
  308. package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
  309. package/dist/src/server-B1vi21hA.js +0 -7
  310. package/dist/src/server-Cm9Kai_h.cjs +0 -5
  311. package/dist/src/store-BNmZ1KAz.cjs +0 -5
  312. package/dist/src/store-BltJg2cd.js +0 -6
  313. package/dist/src/telemetry-5BCRNBbe.cjs +0 -5
  314. package/dist/src/telemetry-D4W5hboe.js +0 -7
  315. package/dist/src/transform-DTGDnAzW.js +0 -6
  316. package/dist/src/transform-m3qNw4KP.cjs +0 -5
package/dist/src/index.js CHANGED
@@ -1,40 +1,40 @@
1
- import { C as isCI, S as getMaxEvalTimeMs, _ as getEnvBool, a as setLogCallback, b as getEnvString, d as getAjv, h as summarizeEvaluateResultForLogging, i as logger, m as safeJsonStringify, n as isDebugEnabled, o as setLogLevel, p as orderKeys, t as getLogLevel, u as extractJsonObjects, v as getEnvFloat, w as state, x as getEvalTimeoutMs, y as getEnvInt } from "./logger-CT3IKMKA.js";
1
+ import { C as getEnvString, D as state, E as isCI, S as getEnvInt, T as getMaxEvalTimeMs, _ as safeJsonStringify, a as logger, b as getEnvBool, g as orderKeys, m as getAjv, n as globalLogCallback, o as setLogCallback, p as extractJsonObjects, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as summarizeEvaluateResultForLogging, w as getEvalTimeoutMs, x as getEnvFloat } from "./logger-DO8_zM18.js";
2
2
  import { t as invariant } from "./invariant-Ddh24eXh.js";
3
- import { r as importModule, t as getDirectory } from "./esm-Cd1AjG1D.js";
4
- import { r as runPython } from "./pythonUtils-D5nxkQ0P.js";
3
+ import { r as importModule, t as getDirectory } from "./esm-SUNIX1x3.js";
4
+ import { r as runPython } from "./pythonUtils-Cpo0Ez1p.js";
5
5
  import { i as isJavascriptFile } from "./fileExtensions-DnqA1y9x.js";
6
- import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-DECvGmzp.js";
7
- import { $ as matchesSearchRubric, A as BeavertailsPlugin, B as getAndCheckProvider, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as retryWithDeduplication, G as matchesContextFaithfulness, H as matchesAnswerRelevance, I as sampleArray, J as matchesFactuality, K as matchesContextRecall, L as fetchHuggingFaceDataset, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as getCustomPolicies, Q as matchesPiScore, R as callProviderWithContext, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as processPrompts, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as SUGGEST_PROMPTS_SYSTEM_MESSAGE, d as RbacPlugin, dt as loadFromJavaScriptFile, et as matchesSelectBest, f as PromptExtractionPlugin, ft as processFileReference, g as isValidPolicyObject, h as determinePolicyTypeFromId, i as VLGuardPlugin, it as DefaultSuggestionsProvider, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as coerceString, m as PolicyPlugin, n as getGraderById, nt as selectMaxScore, o as UnsafeBenchPlugin, ot as readPrompts, p as PoliticsPlugin, pt as resolveContext, q as matchesContextRelevance, r as VLSUPlugin, rt as getDefaultProviders, s as ToxicChatPlugin, st as readProviderPromptMap, t as GRADERS, tt as matchesSimilarity, u as ShellInjectionPlugin, ut as getFinalTest, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as fail } from "./graders-CpdqD9PI.js";
8
- import { A as isApiProvider, C as TestGeneratorConfigSchema, Ct as BaseTokenUsageSchema, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as PromptSchema, T as TestSuiteSchema, Tt as InputsSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TELECOM_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as isProviderOptions, k as isResultFailureReason, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CompletionTokenDetailsSchema, x as TestCasesWithMetadataPromptSchema, xt as UNALIGNED_PROVIDER_HARM_PLUGINS, y as TestCaseSchema, z as StrategyConfigSchema } from "./types-CLKiCBW3.js";
9
- import { A as getProviderDescription, C as deduplicateTestCases, D as resultIsForTestCase, E as getTestCaseDeduplicationKey, M as isGoogleProvider, N as isOpenAiProvider, O as checkProviderApiKeys, P as isProviderAllowed, S as setupEnv, T as filterRuntimeVars, b as loadFunction, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, h as renderEnvOnlyInObject, i as fetchCsvFromGoogleSheet, j as isAnthropicProvider, k as doesProviderRefMatch, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as extractVariablesFromTemplates, w as extractRuntimeVars, x as parseFileUrl, y as getNunjucksEngine } from "./util-Dlz_Wvgm.js";
10
- import { A as getShareApiBaseUrl, F as HUMAN_ASSERTION_TYPE, N as VERSION, O as TERMINAL_MAX_WIDTH, P as FILE_METADATA_KEY, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareViewBaseUrl, k as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "./fetch-60Gzydls.js";
11
- import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "./cache-8XhNqPKW.js";
12
- import { A as createRateLimitRegistry, B as isCloudProvider, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getCloudDatabaseId, I as getEvalConfigFromCloud, J as AIStudioChatProvider, L as getOrgContext, M as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as checkCloudPermissions, R as getPluginSeverityOverridesFromCloud, T as runExtensionHook, V as resolveTeamId, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, q as VertexChatProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId } from "./providers-BKRJTjBz.js";
13
- import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-XiOAgly5.js";
6
+ import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-BqPkNPYm.js";
7
+ import { $ as matchesSearchRubric, A as BeavertailsPlugin, B as getAndCheckProvider, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as retryWithDeduplication, G as matchesContextFaithfulness, H as matchesAnswerRelevance, I as sampleArray, J as matchesFactuality, K as matchesContextRecall, L as fetchHuggingFaceDataset, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as getCustomPolicies, Q as matchesPiScore, R as callProviderWithContext, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as DefaultSuggestionsProvider, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as readProviderPromptMap, d as RbacPlugin, dt as getFinalTest, et as matchesSelectBest, f as PromptExtractionPlugin, ft as loadFromJavaScriptFile, g as isValidPolicyObject, h as determinePolicyTypeFromId, i as VLGuardPlugin, it as getDefaultProviders, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, m as PolicyPlugin, mt as resolveContext, n as getGraderById, nt as matchesTrajectoryGoalSuccess, o as UnsafeBenchPlugin, ot as processPrompts, p as PoliticsPlugin, pt as processFileReference, q as matchesContextRelevance, r as VLSUPlugin, rt as selectMaxScore, s as ToxicChatPlugin, st as readPrompts, t as GRADERS, tt as matchesSimilarity, u as ShellInjectionPlugin, ut as coerceString, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as fail } from "./graders-DG7mhg-b.js";
8
+ import { A as isApiProvider, C as TestGeneratorConfigSchema, Ct as BaseTokenUsageSchema, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as PromptSchema, T as TestSuiteSchema, Tt as InputsSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TELECOM_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as isProviderOptions, k as isResultFailureReason, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CompletionTokenDetailsSchema, x as TestCasesWithMetadataPromptSchema, xt as UNALIGNED_PROVIDER_HARM_PLUGINS, y as TestCaseSchema, z as StrategyConfigSchema } from "./types-q8GXGF65.js";
9
+ import { A as getProviderDescription, C as deduplicateTestCases, D as resultIsForTestCase, E as getTestCaseDeduplicationKey, M as isGoogleProvider, N as isOpenAiProvider, O as checkProviderApiKeys, P as isProviderAllowed, S as setupEnv, T as filterRuntimeVars, b as loadFunction, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, h as renderEnvOnlyInObject, i as fetchCsvFromGoogleSheet, j as isAnthropicProvider, k as doesProviderRefMatch, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as extractVariablesFromTemplates, w as extractRuntimeVars, x as parseFileUrl, y as getNunjucksEngine } from "./util-CMMkIxfU.js";
10
+ import { A as getShareApiBaseUrl, F as HUMAN_ASSERTION_TYPE, N as VERSION, O as TERMINAL_MAX_WIDTH, P as FILE_METADATA_KEY, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareViewBaseUrl, k as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "./fetch-CVAtKnI3.js";
11
+ import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "./cache-CaT5tPgo.js";
12
+ import { A as createRateLimitRegistry, B as isCloudProvider, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getCloudDatabaseId, I as getEvalConfigFromCloud, J as AIStudioChatProvider, L as getOrgContext, M as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as checkCloudPermissions, R as getPluginSeverityOverridesFromCloud, T as runExtensionHook, V as resolveTeamId, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, q as VertexChatProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId } from "./providers-Cn73d5sr.js";
13
+ import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-CFxO9KGo.js";
14
14
  import { n as sha256, t as randomSequence } from "./createHash-DmPQkvBh.js";
15
15
  import "./genaiTracer-D3fD9dNV.js";
16
- import { t as OpenAiChatCompletionProvider } from "./chat-CznLWr_D.js";
16
+ import { t as OpenAiChatCompletionProvider } from "./chat-pxmiVpWe.js";
17
17
  import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-NYT-WKS6.js";
18
- import { m as validateFunctionCall } from "./transform-DGLazrMm.js";
19
- import "./messages-BLbWdsyt.js";
20
- import "./util-DaWTWKBK.js";
21
- import "./responses-BKqJmhhc.js";
22
- import "./openai-DElQ-fPX.js";
23
- import { l as validateFunctionCall$1 } from "./util-Betm42rL.js";
24
- import "./completion-C_P3ypkJ.js";
25
- import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-xrUGFA6n.js";
26
- import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-BC7XJFgr.js";
27
- import { t as getBlobByHash } from "./blobs-Bpg5rH6i.js";
28
- import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-5EvT_Bwn.js";
29
- import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-M67RUtg6.js";
30
- import { t as telemetry } from "./telemetry-C15ziL8u.js";
18
+ import { m as validateFunctionCall } from "./transform-DyDAwEpE.js";
19
+ import "./messages-Dy9QecMs.js";
20
+ import "./util-vNmDL5DT.js";
21
+ import "./responses-CQb1Tj69.js";
22
+ import "./openai-BcB5KlTk.js";
23
+ import { l as validateFunctionCall$1 } from "./util-CgDCK4KI.js";
24
+ import "./completion-DCjv7RZ3.js";
25
+ import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-B2XmGjty.js";
26
+ import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-CP9qKM40.js";
27
+ import { t as getBlobByHash } from "./blobs-DXTl6J3H.js";
28
+ import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-kC7R5kiK.js";
29
+ import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-D_wd8jxt.js";
30
+ import { t as telemetry } from "./telemetry-BugWqKiu.js";
31
31
  import { t as ellipsize } from "./text-B_UCRPp2.js";
32
- import { t as getTraceStore } from "./store-DQLEjuEO.js";
33
- import "./base-B0tcrnq_.js";
34
- import "./image-BmEZqVmk.js";
35
- import { t as providerRegistry } from "./providerRegistry-CD8MEar9.js";
36
- import { n as runRuby } from "./rubyUtils-BUVePouc.js";
37
- import { t as EvalResult } from "./evalResult-CDQiuUuf.js";
32
+ import { t as getTraceStore } from "./store-Cj258DgL.js";
33
+ import "./base-CqzQ4K8j.js";
34
+ import "./image-CoxZp9PZ.js";
35
+ import { t as providerRegistry } from "./providerRegistry-CUWki5mQ.js";
36
+ import { n as runRuby } from "./rubyUtils-PgU-gHmx.js";
37
+ import { t as EvalResult } from "./evalResult-BkIhRdTe.js";
38
38
  import * as fs$1 from "fs";
39
39
  import fs, { createWriteStream } from "fs";
40
40
  import * as path$2 from "path";
@@ -56,11 +56,13 @@ import { XMLParser } from "fast-xml-parser";
56
56
  import crypto$1, { createHash, randomBytes } from "crypto";
57
57
  import { DiagConsoleLogger, DiagLogLevel, diag, propagation } from "@opentelemetry/api";
58
58
  import input from "@inquirer/input";
59
+ import readline from "readline";
59
60
  import { and, desc, eq, inArray, sql } from "drizzle-orm";
60
61
  import cliProgress from "cli-progress";
61
62
  import { JSDOM } from "jsdom";
62
63
  import { distance } from "fastest-levenshtein";
63
64
  import * as rouge from "js-rouge";
65
+ import { isDeepStrictEqual } from "node:util";
64
66
  import "debounce";
65
67
  import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
66
68
  import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
@@ -288,7 +290,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
288
290
  telemetry.record("feature_used", { feature: "tracing" });
289
291
  try {
290
292
  logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
291
- const { startOTLPReceiver } = await import("./otlpReceiver--AIRW_S4.js");
293
+ const { startOTLPReceiver } = await import("./otlpReceiver-CZL48YfC.js");
292
294
  const port = testSuite.tracing.otlp.http.port || 4318;
293
295
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
294
296
  logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
@@ -311,7 +313,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
311
313
  async function stopOtlpReceiverIfNeeded() {
312
314
  if (otlpReceiverStarted) try {
313
315
  logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
314
- const { stopOTLPReceiver } = await import("./otlpReceiver--AIRW_S4.js");
316
+ const { stopOTLPReceiver } = await import("./otlpReceiver-CZL48YfC.js");
315
317
  await stopOTLPReceiver();
316
318
  otlpReceiverStarted = false;
317
319
  logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -346,7 +348,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
346
348
  }
347
349
  if (!tracingEnabled) return null;
348
350
  logger.debug("[EvaluatorTracing] Importing trace store");
349
- const { getTraceStore } = await import("./store-DQLEjuEO.js").then((n) => n.n);
351
+ const { getTraceStore } = await import("./store-Cj258DgL.js").then((n) => n.n);
350
352
  const traceStore = getTraceStore();
351
353
  const traceId = generateTraceId();
352
354
  const spanId = generateSpanId();
@@ -1379,7 +1381,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
1379
1381
  pass = result !== inverse;
1380
1382
  score = pass ? 1 : 0;
1381
1383
  } else if (typeof result === "number") {
1382
- pass = assertion.threshold !== void 0 ? result >= assertion.threshold : result > 0;
1384
+ pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
1383
1385
  score = result;
1384
1386
  } else if (typeof result === "object") return result;
1385
1387
  else throw new Error("Custom function must return a boolean or number");
@@ -1412,7 +1414,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
1412
1414
  } catch {
1413
1415
  pass = inverse;
1414
1416
  }
1415
- if (pass && renderedValue) {
1417
+ if (parsedJson !== void 0 && renderedValue) {
1416
1418
  let validate;
1417
1419
  if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
1418
1420
  const schema = valueFromScript;
@@ -1424,11 +1426,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
1424
1426
  }
1425
1427
  else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
1426
1428
  else throw new Error("is-json assertion must have a string or object value");
1427
- pass = validate(parsedJson);
1429
+ const valid = validate(parsedJson);
1430
+ pass = inverse ? !valid : valid;
1428
1431
  if (!pass) return {
1429
1432
  pass,
1430
1433
  score: 0,
1431
- reason: `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
1434
+ reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
1432
1435
  assertion
1433
1436
  };
1434
1437
  }
@@ -1455,9 +1458,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
1455
1458
  }
1456
1459
  else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
1457
1460
  else throw new Error("contains-json assertion must have a string or object value");
1458
- pass = validate(jsonObject);
1459
- if (pass) break;
1460
- else errorMessage = `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`;
1461
+ const valid = validate(jsonObject);
1462
+ pass = inverse ? !valid : valid;
1463
+ if (valid) {
1464
+ if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
1465
+ break;
1466
+ } else errorMessage = `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`;
1461
1467
  }
1462
1468
  return {
1463
1469
  pass,
@@ -1641,7 +1647,7 @@ function handlePerplexity({ logProbs, assertion }) {
1641
1647
  if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
1642
1648
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
1643
1649
  const perplexity = Math.exp(-avgLogProb);
1644
- const pass = assertion.threshold !== void 0 ? perplexity <= assertion.threshold : true;
1650
+ const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
1645
1651
  return {
1646
1652
  pass,
1647
1653
  score: pass ? 1 : 0,
@@ -1653,7 +1659,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
1653
1659
  if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
1654
1660
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
1655
1661
  const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
1656
- const pass = assertion.threshold !== void 0 ? perplexityNorm >= assertion.threshold : true;
1662
+ const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
1657
1663
  return {
1658
1664
  pass,
1659
1665
  score: perplexityNorm,
@@ -1768,7 +1774,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
1768
1774
  } else {
1769
1775
  score = Number.parseFloat(String(result));
1770
1776
  if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
1771
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
1777
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
1772
1778
  }
1773
1779
  } catch (err) {
1774
1780
  return {
@@ -2029,7 +2035,7 @@ end
2029
2035
  } else {
2030
2036
  score = Number.parseFloat(String(result));
2031
2037
  if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
2032
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
2038
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
2033
2039
  }
2034
2040
  } catch (err) {
2035
2041
  return {
@@ -2100,6 +2106,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
2100
2106
  };
2101
2107
  };
2102
2108
  //#endregion
2109
+ //#region src/assertions/traceUtils.ts
2110
+ /**
2111
+ * Shared utilities for trace assertions
2112
+ */
2113
+ /**
2114
+ * Match a span name against a glob-like pattern.
2115
+ * Supports * (any characters) and ? (single character) wildcards.
2116
+ *
2117
+ * @param spanName - The span name to match
2118
+ * @param pattern - The glob pattern to match against
2119
+ * @returns true if the span name matches the pattern
2120
+ */
2121
+ function matchesPattern(spanName, pattern) {
2122
+ const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
2123
+ return new RegExp(`^${regexPattern}$`, "i").test(spanName);
2124
+ }
2125
+ //#endregion
2126
+ //#region src/assertions/skill.ts
2127
+ function getSkillCalls(params) {
2128
+ const rawSkillCalls = params.providerResponse?.metadata?.skillCalls;
2129
+ if (!Array.isArray(rawSkillCalls)) return [];
2130
+ return rawSkillCalls.filter((entry) => Boolean(entry) && typeof entry === "object" && typeof entry.name === "string");
2131
+ }
2132
+ function matchesSkill(skillCall, matcher) {
2133
+ if (matcher.name && skillCall.name !== matcher.name) return false;
2134
+ if (matcher.pattern && !matchesPattern(skillCall.name, matcher.pattern)) return false;
2135
+ return true;
2136
+ }
2137
+ function formatSkillCall(skillCall) {
2138
+ const details = [skillCall.source, skillCall.path].filter(Boolean).join(", ");
2139
+ return details ? `${skillCall.name} (${details})` : skillCall.name;
2140
+ }
2141
+ function resolveSkillMatchers(value) {
2142
+ const normalizeText = (text) => typeof text === "string" ? text.trim() : void 0;
2143
+ const validateCount = (field, count) => {
2144
+ if (!Number.isFinite(count) || !Number.isInteger(count) || count < 0) throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
2145
+ };
2146
+ if (typeof value === "string" && value.trim()) return {
2147
+ kind: "list",
2148
+ matchers: [{ name: normalizeText(value) }]
2149
+ };
2150
+ if (Array.isArray(value) && value.length > 0 && value.every((item) => typeof item === "string" && item.trim())) return {
2151
+ kind: "list",
2152
+ matchers: value.map((item) => ({ name: item.trim() }))
2153
+ };
2154
+ if (value && typeof value === "object" && !Array.isArray(value)) {
2155
+ const rawMatcher = value;
2156
+ const matcher = rawMatcher;
2157
+ const name = normalizeText(matcher.name);
2158
+ const pattern = normalizeText(matcher.pattern);
2159
+ if (!name && !pattern) throw new Error("skill-used assertion object must include a name or pattern property");
2160
+ if ("min" in rawMatcher) validateCount("min", matcher.min);
2161
+ if ("max" in rawMatcher) validateCount("max", matcher.max);
2162
+ if (typeof matcher.min === "number" && typeof matcher.max === "number" && matcher.max < matcher.min) throw new Error("skill-used assertion object max must be greater than or equal to min");
2163
+ return {
2164
+ kind: "count",
2165
+ matcher: {
2166
+ max: typeof matcher.max === "number" ? matcher.max : void 0,
2167
+ min: typeof matcher.min === "number" ? matcher.min : void 0,
2168
+ name,
2169
+ pattern
2170
+ }
2171
+ };
2172
+ }
2173
+ throw new Error("skill-used assertion must have a string, string array, or object value");
2174
+ }
2175
+ function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
2176
+ const missing = expected.matchers.filter((matcher) => !skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
2177
+ const matched = expected.matchers.filter((matcher) => skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
2178
+ const pass = params.inverse ? matched.length === 0 : missing.length === 0;
2179
+ const expectedSkills = expected.matchers.map((matcher) => matcher.name);
2180
+ const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
2181
+ let reason;
2182
+ if (params.inverse) reason = pass ? `Forbidden skill(s) were not used: ${expectedSkills.join(", ")}` : `Forbidden skill(s) were used: ${matched.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
2183
+ else if (pass) reason = `Observed required skill(s): ${expectedSkills.join(", ")}. Actual skills: ${actualSummary}`;
2184
+ else reason = `Missing required skill(s): ${missing.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
2185
+ return {
2186
+ pass,
2187
+ score: pass ? 1 : 0,
2188
+ reason,
2189
+ assertion: params.assertion
2190
+ };
2191
+ }
2192
+ function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
2193
+ const hasExplicitMin = matcher.min !== void 0;
2194
+ const hasExplicitMax = matcher.max !== void 0;
2195
+ const min = matcher.min ?? (hasExplicitMax ? 0 : 1);
2196
+ const max = matcher.max;
2197
+ const matchingSkillCalls = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
2198
+ const count = matchingSkillCalls.length;
2199
+ const matcherLabel = matcher.pattern || matcher.name || "*";
2200
+ if (params.inverse) {
2201
+ if (hasExplicitMin || hasExplicitMax && max !== 0) throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
2202
+ const pass = count === 0;
2203
+ const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
2204
+ return {
2205
+ pass,
2206
+ score: pass ? 1 : 0,
2207
+ reason: pass ? `Forbidden skill "${matcherLabel}" was not used. Actual skills: ${actualSummary}` : `Forbidden skill "${matcherLabel}" was used ${count} time(s). Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`,
2208
+ assertion: params.assertion
2209
+ };
2210
+ }
2211
+ const pass = count >= min && (max === void 0 || count <= max);
2212
+ let reason = `Matched skill "${matcherLabel}" ${count} time(s)`;
2213
+ reason += max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
2214
+ if (matchingSkillCalls.length > 0) reason += `. Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`;
2215
+ return {
2216
+ pass,
2217
+ score: pass ? 1 : 0,
2218
+ reason,
2219
+ assertion: params.assertion
2220
+ };
2221
+ }
2222
+ function handleSkillUsed(params) {
2223
+ const skillCalls = getSkillCalls(params);
2224
+ const actualSkills = skillCalls.map(formatSkillCall);
2225
+ const expected = resolveSkillMatchers(params.renderedValue ?? params.assertion.value);
2226
+ if (expected.kind === "list") return handleListSkillAssertion(params, skillCalls, actualSkills, expected);
2227
+ return handleCountSkillAssertion(params, skillCalls, actualSkills, expected.matcher);
2228
+ }
2229
+ //#endregion
2103
2230
  //#region src/assertions/sql.ts
2104
2231
  const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
2105
2232
  let pass = false;
@@ -2332,23 +2459,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
2332
2459
  };
2333
2460
  };
2334
2461
  //#endregion
2335
- //#region src/assertions/traceUtils.ts
2336
- /**
2337
- * Shared utilities for trace assertions
2338
- */
2339
- /**
2340
- * Match a span name against a glob-like pattern.
2341
- * Supports * (any characters) and ? (single character) wildcards.
2342
- *
2343
- * @param spanName - The span name to match
2344
- * @param pattern - The glob pattern to match against
2345
- * @returns true if the span name matches the pattern
2346
- */
2347
- function matchesPattern(spanName, pattern) {
2348
- const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
2349
- return new RegExp(`^${regexPattern}$`, "i").test(spanName);
2350
- }
2351
- //#endregion
2352
2462
  //#region src/assertions/traceErrorSpans.ts
2353
2463
  function isErrorSpan(span) {
2354
2464
  if (span.statusCode && span.statusCode >= 400) return true;
@@ -2517,6 +2627,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
2517
2627
  };
2518
2628
  };
2519
2629
  //#endregion
2630
+ //#region src/assertions/trajectoryUtils.ts
2631
// Span attribute keys probed, in order, for an explicit tool name.
const TOOL_ATTRIBUTE_KEYS = [
	"tool.name", "tool_name", "tool",
	"function.name", "function_name",
	"gen_ai.tool.name",
	"codex.mcp.tool",
	"agent.tool", "agent.tool_name", "agent.toolName"
];
// Span attribute keys probed, in order, for tool call arguments.
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
	"tool.arguments", "tool.args", "tool.input",
	"tool_arguments", "tool_args", "tool_input",
	"function.arguments", "function.args", "function.input",
	"function_arguments", "function_args",
	"gen_ai.tool.arguments", "gen_ai.tool.args", "gen_ai.tool.input",
	"gen_ai.tool.call.arguments", "gen_ai.tool.call.args",
	"agent.tool.arguments", "agent.tool.args", "agent.tool.input",
	"codex.mcp.arguments", "codex.mcp.args", "codex.mcp.input",
	"arguments", "args", "input"
];
// Span attribute keys probed, in order, for an executed command.
const COMMAND_ATTRIBUTE_KEYS = ["codex.command", "command", "command.name", "command_name"];
// Span attribute keys probed, in order, for an explicit search query.
const SEARCH_ATTRIBUTE_KEYS = ["codex.search.query", "search.query", "search_query"];
// Generic key that only counts as a search query when the span itself looks search-like.
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
// Matches span names that contain a search-style verb as a delimited token.
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
// Judge summaries keep this many leading and trailing steps once truncated.
const JUDGE_SUMMARY_HEAD_STEPS = 12;
const JUDGE_SUMMARY_TAIL_STEPS = 12;
// Step lists longer than this are truncated; kept equal to head + tail.
const MAX_JUDGE_SUMMARY_STEPS = JUDGE_SUMMARY_HEAD_STEPS + JUDGE_SUMMARY_TAIL_STEPS;
2686
/**
 * Return the first non-blank string attribute found under any of `keys`.
 *
 * @param attributes - Attribute bag to read from.
 * @param keys - Candidate keys, checked in the given order.
 * @returns The trimmed string value, or undefined when no key holds a non-blank string.
 */
function getStringAttribute(attributes, keys) {
	for (const key of keys) {
		const candidate = attributes[key];
		if (typeof candidate !== "string") continue;
		const trimmed = candidate.trim();
		if (trimmed) return trimmed;
	}
	return undefined;
}
2692
/**
 * Normalise a raw attribute value into structured data.
 *
 * Strings are trimmed and JSON-parsed when possible (falling back to the
 * trimmed text); numbers, booleans and objects pass through unchanged.
 *
 * @param value - Raw attribute value.
 * @returns The normalised value, or undefined for nullish input, blank strings
 *   and unsupported types.
 */
function normalizeStructuredAttribute(value) {
	if (value === undefined || value === null) return undefined;
	switch (typeof value) {
		case "string": {
			const text = value.trim();
			if (text === "") return undefined;
			try {
				return JSON.parse(text);
			} catch {
				// Not JSON: keep the plain trimmed text.
				return text;
			}
		}
		case "number":
		case "boolean":
		case "object":
			return value;
		default:
			return undefined;
	}
}
2705
/**
 * Compare two optional status objects by `code` and `message`.
 *
 * @param left - First status (may be nullish).
 * @param right - Second status (may be nullish).
 * @returns true when both code and message are strictly equal.
 */
function hasSameStatus(left, right) {
	if (left?.code !== right?.code) return false;
	return left?.message === right?.message;
}
2708
/**
 * Decide whether a span looks like a search/lookup/retrieval operation.
 *
 * A span qualifies when its name matches SEARCH_SPAN_NAME_PATTERN or starts
 * with "search ", or when any attribute key other than the bare "query"
 * contains a search/lookup/retrieve(al) token delimited by "." or "_".
 *
 * @param span - Span with a `name` and optional `attributes`.
 * @returns true when the span is considered search-like.
 */
function isSearchLikeSpan(span) {
	const attributeKeys = Object.keys(span.attributes || {});
	if (SEARCH_SPAN_NAME_PATTERN.test(span.name) || span.name.startsWith("search ")) return true;
	const searchKeyPattern = /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i;
	for (const key of attributeKeys) {
		if (key === "query") continue;
		if (searchKeyPattern.test(key)) return true;
	}
	return false;
}
2713
/**
 * Build the status summary for a trajectory step.
 *
 * A missing status code (undefined or null) and a code of 0 are both treated
 * as "nothing to report", so the summary is omitted. Treating null like
 * undefined prevents a meaningless `{ code: null }` entry from reaching the
 * judge summary when a span has no recorded status.
 *
 * @param step - Trajectory step with optional `statusCode` / `statusMessage`.
 * @returns `{ code, message? }` for a non-zero status code, otherwise undefined.
 */
function getTrajectoryStepStatus(step) {
	const { statusCode, statusMessage } = step;
	if (statusCode == null || statusCode === 0) return undefined;
	return {
		code: statusCode,
		// Only attach a message when one is present.
		...(statusMessage ? { message: statusMessage } : {})
	};
}
2720
/**
 * Extract the first whitespace-delimited token of a command line.
 *
 * @param command - Full command string.
 * @returns The leading token, or undefined when the command is blank.
 */
function getCommandExecutable(command) {
	const [executable] = command.trim().split(/\s+/);
	return executable || undefined;
}
2723
/**
 * Derive the tool name represented by a span, if any.
 *
 * Lookup order:
 *  1. the well-known keys in TOOL_ATTRIBUTE_KEYS;
 *  2. the first non-blank string attribute whose key looks like a tool or
 *     function name, or contains a delimited "tool" token that is not a
 *     result/output key;
 *  3. for span names starting with "mcp ", the text after the last "/".
 *
 * @param span - Span with a `name` and optional `attributes`.
 * @returns The trimmed tool name, or undefined when none can be derived.
 */
function extractToolName(span) {
	const attributes = span.attributes || {};
	const explicitName = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
	if (explicitName) return explicitName;
	const looksLikeToolKey = (key) => /tool.?name|function.?name/i.test(key) || /(^|[._])tool($|[._])/i.test(key) && !/result|output/i.test(key);
	const heuristicEntry = Object.entries(attributes).find(([key, value]) => typeof value === "string" && Boolean(value.trim()) && looksLikeToolKey(key));
	if (heuristicEntry) return heuristicEntry[1].trim();
	if (!span.name.startsWith("mcp ")) return undefined;
	const slashIndex = span.name.lastIndexOf("/");
	// Require at least one character after the final slash.
	if (slashIndex === -1 || slashIndex >= span.name.length - 1) return undefined;
	return span.name.slice(slashIndex + 1).trim();
}
2737
/**
 * Derive the arguments of a tool call from a span's attributes.
 *
 * The well-known keys in TOOL_ARGUMENT_ATTRIBUTE_KEYS are tried first; after
 * that any attribute whose key contains a delimited arguments/args/input
 * token (and is not a result/output/error/status key) is accepted.
 *
 * @param span - Span with optional `attributes`.
 * @returns The normalised arguments, or undefined when none are found.
 */
function extractToolArgs(span) {
	const attributes = span.attributes || {};
	for (const knownKey of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
		const normalized = normalizeStructuredAttribute(attributes[knownKey]);
		if (normalized !== undefined) return normalized;
	}
	for (const [key, rawValue] of Object.entries(attributes)) {
		const isOutcomeKey = /result|output|error|status/i.test(key);
		if (isOutcomeKey || !/(^|[._])(arguments|args|input)($|[._])/i.test(key)) continue;
		const normalized = normalizeStructuredAttribute(rawValue);
		if (normalized !== undefined) return normalized;
	}
	return undefined;
}
2750
/**
 * Derive the command executed by a span, if any.
 *
 * Tries COMMAND_ATTRIBUTE_KEYS, then any non-blank string attribute whose key
 * mentions "command" (excluding output/result keys), and finally the text
 * following an "exec " span-name prefix.
 *
 * @param span - Span with a `name` and optional `attributes`.
 * @returns The trimmed command, or undefined when none can be derived.
 */
function extractCommand(span) {
	const attributes = span.attributes || {};
	const explicitCommand = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
	if (explicitCommand) return explicitCommand;
	const heuristicEntry = Object.entries(attributes).find(([key, value]) => {
		if (typeof value !== "string" || !value.trim()) return false;
		return /command/i.test(key) && !/output|result/i.test(key);
	});
	if (heuristicEntry) return heuristicEntry[1].trim();
	const execPrefix = "exec ";
	return span.name.startsWith(execPrefix) ? span.name.slice(execPrefix.length).trim() : undefined;
}
2760
/**
 * Derive the search query represented by a span, if any.
 *
 * Tries SEARCH_ATTRIBUTE_KEYS, then the generic "query" attribute (only when
 * the span is search-like), and finally the text following a "search "
 * span-name prefix with one leading/trailing double quote removed.
 *
 * @param span - Span with a `name` and optional `attributes`.
 * @returns The query string, or undefined when none can be derived.
 */
function extractSearchQuery(span) {
	const attributes = span.attributes || {};
	const explicitQuery = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
	if (explicitQuery) return explicitQuery;
	const genericQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
	if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
	const searchPrefix = "search ";
	if (!span.name.startsWith(searchPrefix)) return undefined;
	const remainder = span.name.slice(searchPrefix.length);
	return remainder.replace(/^"|"$/g, "").trim();
}
2768
/**
 * Tell whether a span represents a reasoning step.
 *
 * @param span - Span with a `name` and optional `attributes`.
 * @returns true when the "codex.item.type" attribute is "reasoning" or the
 *   span name starts with "reasoning" followed by "_", whitespace or the end.
 */
function isReasoningSpan(span) {
	const itemType = (span.attributes || {})["codex.item.type"];
	if (itemType === "reasoning") return true;
	const { name } = span;
	return /^reasoning([_\s]|$)/i.test(name) || name === "reasoning";
}
2772
/**
 * Tell whether a span represents an agent message.
 *
 * @param span - Span with a `name` and optional `attributes`.
 * @returns true when the "codex.item.type" attribute is "agent_message" or
 *   the span name is exactly "agent response" or "send input".
 */
function isMessageSpan(span) {
	const itemType = (span.attributes || {})["codex.item.type"];
	if (itemType === "agent_message") return true;
	switch (span.name) {
		case "agent response":
		case "send input":
			return true;
		default:
			return false;
	}
}
2776
/**
 * Convert a trace's spans into an ordered list of trajectory steps.
 *
 * Spans are ordered by start time, then end time (falling back to start time
 * when the end is missing), then original position. Each span is classified
 * with the first matching rule: tool, command, search, reasoning, message,
 * or the generic "span" type.
 *
 * @param trace - Trace object with an optional `spans` array.
 * @returns Steps carrying type, display name, match aliases, optional tool
 *   args and the original span timing/status fields.
 */
function extractTrajectorySteps(trace) {
	const compareSpans = (left, right) => {
		const startDelta = left.span.startTime - right.span.startTime;
		if (startDelta !== 0) return startDelta;
		const leftEnd = left.span.endTime ?? left.span.startTime;
		const rightEnd = right.span.endTime ?? right.span.startTime;
		const endDelta = leftEnd - rightEnd;
		if (endDelta !== 0) return endDelta;
		// Fall back to input order so the result is deterministic.
		return left.index - right.index;
	};
	const toStep = (span) => {
		// All three extractors run up front, mirroring the classification priority below.
		const toolName = extractToolName(span);
		const command = extractCommand(span);
		const searchQuery = extractSearchQuery(span);
		const aliases = new Set([span.name]);
		let type = "span";
		let name = span.name;
		let args;
		if (toolName) {
			type = "tool";
			name = toolName;
			aliases.add(toolName);
			args = extractToolArgs(span);
		} else if (command) {
			type = "command";
			name = command;
			aliases.add(command);
			// Also allow matching on the bare executable (first token).
			const executable = getCommandExecutable(command);
			if (executable) aliases.add(executable);
		} else if (searchQuery) {
			type = "search";
			name = searchQuery;
			aliases.add(searchQuery);
		} else if (isReasoningSpan(span)) {
			type = "reasoning";
			aliases.add("reasoning");
		} else if (isMessageSpan(span)) {
			type = "message";
			aliases.add("message");
		}
		return {
			aliases: [...aliases],
			...(args === undefined ? {} : { args }),
			attributes: span.attributes || {},
			endTime: span.endTime,
			name,
			spanId: span.spanId,
			spanName: span.name,
			startTime: span.startTime,
			statusCode: span.statusCode,
			statusMessage: span.statusMessage,
			type
		};
	};
	const indexedSpans = Array.from(trace.spans || [], (span, index) => ({ span, index }));
	indexedSpans.sort(compareSpans);
	return indexedSpans.map((entry) => toStep(entry.span));
}
2833
/**
 * Normalise a matcher given as a pattern string or a matcher object.
 *
 * @param matcher - Pattern string or matcher object.
 * @param defaultType - Step type applied when the matcher has no truthy `type`.
 * @returns A new matcher object; the input object is not modified.
 */
function normalizeTrajectoryMatcher(matcher, defaultType) {
	if (typeof matcher === "string") {
		const fromPattern = { pattern: matcher };
		if (defaultType) fromPattern.type = defaultType;
		return fromPattern;
	}
	const normalized = { ...matcher };
	if (!matcher.type && defaultType) normalized.type = defaultType;
	return normalized;
}
2843
/**
 * Test whether a trajectory step satisfies a matcher.
 *
 * The step type must be one of the matcher's types (when a type is set), and
 * at least one of the step's aliases must match the matcher's pattern (or
 * name). A matcher with neither pattern nor name matches on type alone.
 *
 * @param step - Trajectory step with `type` and `aliases`.
 * @param matcher - Pattern string or matcher object.
 * @param defaultType - Type applied when the matcher does not specify one.
 * @returns true when the step matches.
 */
function matchesTrajectoryStep(step, matcher, defaultType) {
	const normalized = normalizeTrajectoryMatcher(matcher, defaultType);
	if (normalized.type) {
		const allowedTypes = Array.isArray(normalized.type) ? normalized.type : [normalized.type];
		if (!allowedTypes.includes(step.type)) return false;
	}
	const matchPattern = normalized.pattern || normalized.name;
	if (!matchPattern) return true;
	for (const alias of step.aliases) {
		if (matchesPattern(alias, matchPattern)) return true;
	}
	return false;
}
2852
/**
 * Render a trajectory step as "<type>:<name>" for messages.
 *
 * @param step - Trajectory step with `type` and `name`.
 * @returns The formatted label.
 */
function formatTrajectoryStep(step) {
	const { type, name } = step;
	return `${type}:${name}`;
}
2855
/**
 * Render tool arguments for inclusion in an assertion reason.
 *
 * @param args - Arguments value (any type).
 * @returns "(none)" for undefined, the JSON text when serialisable, otherwise
 *   the `String(...)` conversion of the value.
 */
function formatTrajectoryArgs(args) {
	if (args === undefined) return "(none)";
	let serialized;
	try {
		serialized = JSON.stringify(args);
	} catch {
		// Unserialisable input (e.g. cycles): fall back to String() below.
		serialized = undefined;
	}
	return serialized === undefined ? String(args) : serialized;
}
2863
/**
 * Collapse runs of consecutive identical steps into a single entry.
 *
 * Two steps are identical when type, name, spanName and status all agree. The
 * first step of a run is kept and its `collapsedCount` records the run length.
 * Note that the kept step object is the caller's object and is updated in place.
 *
 * @param steps - Summarised steps in execution order.
 * @returns A new array with consecutive duplicates collapsed.
 */
function compactJudgeTrajectorySteps(steps) {
	const compacted = [];
	for (const step of steps) {
		const lastKept = compacted.at(-1);
		const repeatsLastKept = Boolean(lastKept) && lastKept.type === step.type && lastKept.name === step.name && lastKept.spanName === step.spanName && hasSameStatus(lastKept.status, step.status);
		if (!repeatsLastKept) {
			compacted.push(step);
			continue;
		}
		// The first occurrence counts as 1, so the first repeat yields 2.
		lastKept.collapsedCount = (lastKept.collapsedCount ?? 1) + 1;
	}
	return compacted;
}
2875
/**
 * Limit a step list to the configured head and tail windows.
 *
 * Lists no longer than MAX_JUDGE_SUMMARY_STEPS are returned unchanged;
 * otherwise the middle is replaced by a single `{ omittedCount }` marker.
 *
 * @param steps - Compacted steps in execution order.
 * @returns The original array, or a new head + marker + tail array.
 */
function truncateJudgeTrajectorySteps(steps) {
	const overflow = steps.length - MAX_JUDGE_SUMMARY_STEPS;
	if (overflow <= 0) return steps;
	const head = steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS);
	const tail = steps.slice(-JUDGE_SUMMARY_TAIL_STEPS);
	return [...head, { omittedCount: overflow }, ...tail];
}
2883
/**
 * Produce the JSON trajectory summary handed to the grading model.
 *
 * Steps are numbered from 1, reduced to type/name (plus spanName when it
 * differs from the name, and status when present), compacted and truncated.
 *
 * @param trace - Trace whose spans are summarised.
 * @returns Pretty-printed JSON with traceId, stepCount, compactedStepCount and steps.
 */
function summarizeTrajectoryForJudge(trace) {
	const rawSteps = extractTrajectorySteps(trace).map((step, position) => {
		const summary = {
			index: position + 1,
			type: step.type,
			name: step.name
		};
		if (step.spanName !== step.name) summary.spanName = step.spanName;
		const status = getTrajectoryStepStatus(step);
		if (status) summary.status = status;
		return summary;
	});
	const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
	const summary = {
		traceId: trace.traceId,
		stepCount: rawSteps.length,
		compactedStepCount: compactedSteps.length,
		steps: truncateJudgeTrajectorySteps(compactedSteps)
	};
	return JSON.stringify(summary, null, 2);
}
2900
+ //#endregion
2901
+ //#region src/assertions/trajectory.ts
2902
/**
 * Fetch the trace from the assertion context or fail loudly.
 *
 * @param params - Assertion parameters; reads `assertionValueContext.trace` and `baseType`.
 * @returns The trace object.
 * @throws {Error} When no trace, or a trace without `spans`, is available.
 */
function getTraceOrThrow(params) {
	const { trace } = params.assertionValueContext;
	if (trace?.spans) return trace;
	throw new Error(`No trace data available for ${params.baseType} assertion`);
}
2907
/**
 * Flip a pass result when the assertion is inverted.
 *
 * @param pass - Result of the non-inverted check.
 * @param inverse - Truthy when the assertion is inverted.
 * @returns `!pass` for inverted assertions, otherwise `pass` unchanged.
 */
function applyInverse(pass, inverse) {
	if (inverse) return !pass;
	return pass;
}
2910
/**
 * Join step labels for display.
 *
 * @param stepLabels - Array of label strings.
 * @returns The labels joined with ", ", or "(none)" for an empty list.
 */
function formatStepList(stepLabels) {
	if (stepLabels.length > 0) return stepLabels.join(", ");
	return "(none)";
}
2913
/**
 * Ensure a matcher identifies its target by name or pattern.
 *
 * @param matcher - Normalised matcher object.
 * @param assertionType - Assertion type used in the error message.
 * @param index - Zero-based step index for sequence matchers; omit for a
 *   single object matcher.
 * @throws {Error} When the matcher has neither a `pattern` nor a `name`.
 */
function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
	const isNamed = Boolean(matcher.pattern || matcher.name);
	if (isNamed) return;
	// Report steps 1-based so the message lines up with the user's config.
	const stepLabel = index === undefined ? "object" : `step ${index + 1}`;
	throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
}
2918
/**
 * Extract the goal text for a trajectory:goal-success assertion.
 *
 * @param value - A non-blank string, or a non-array object with a non-blank string `goal`.
 * @returns `{ goal }` with the trimmed goal text.
 * @throws {Error} When no usable goal can be found.
 */
function resolveGoalSuccessValue(value) {
	let candidate;
	if (typeof value === "string") candidate = value;
	else if (value && typeof value === "object" && !Array.isArray(value)) candidate = value.goal;
	if (typeof candidate === "string") {
		const goal = candidate.trim();
		if (goal) return { goal };
	}
	throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
}
2923
/**
 * Interpret the value of a trajectory:tool-used assertion.
 *
 * @param value - A tool pattern string, an array of pattern strings, or a
 *   matcher object with optional numeric `min` / `max`.
 * @returns `{ kind: "list", matchers }` for string inputs, or
 *   `{ kind: "count", matcher }` for object input (non-numeric bounds become undefined).
 * @throws {Error} For any other value, including arrays with non-string items.
 */
function resolveToolMatchers(value) {
	const toToolMatcher = (item) => normalizeTrajectoryMatcher(item, "tool");
	const numberOrUndefined = (candidate) => typeof candidate === "number" ? candidate : undefined;
	if (typeof value === "string") return {
		kind: "list",
		matchers: [toToolMatcher(value)]
	};
	if (Array.isArray(value)) {
		if (value.every((item) => typeof item === "string")) return {
			kind: "list",
			matchers: value.map((item) => toToolMatcher(item))
		};
	} else if (value && typeof value === "object") return {
		kind: "count",
		matcher: {
			...toToolMatcher(value),
			max: numberOrUndefined(value.max),
			min: numberOrUndefined(value.min)
		}
	};
	throw new Error("trajectory:tool-used assertion must have a string, string array, or object value");
}
2942
/**
 * Handler for trajectory:tool-used (and its inverse).
 *
 * List form: every listed tool must appear among the trace's tool steps; when
 * inverted, none of them may appear. Count form: the number of matching tool
 * steps must be within [min, max], with min defaulting to 1 and max optional;
 * the inverse flips that result.
 *
 * @param params - Assertion parameters (assertion, renderedValue, inverse, assertionValueContext, ...).
 * @returns Grading result with pass, score (1 or 0), reason and assertion.
 * @throws {Error} When the list is empty or the object matcher lacks name/pattern.
 */
const handleTrajectoryToolUsed = (params) => {
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
	const labelOf = (matcher) => matcher.pattern || matcher.name || "*";
	const toResult = (pass, reason) => ({
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	});
	if (expected.kind === "list") {
		if (expected.matchers.length === 0) throw new Error("trajectory:tool-used assertion requires at least one expected tool");
		// Split the expected tools into those observed and those not observed.
		const matched = [];
		const missing = [];
		for (const matcher of expected.matchers) {
			const wasObserved = toolSteps.some((step) => matchesTrajectoryStep(step, matcher));
			(wasObserved ? matched : missing).push(matcher);
		}
		const pass = params.inverse ? matched.length === 0 : missing.length === 0;
		const actualTools = toolSteps.map(formatTrajectoryStep);
		const expectedTools = expected.matchers.map((matcher) => labelOf(matcher));
		let reason;
		if (params.inverse) {
			if (pass) reason = `Forbidden tool(s) were not used: ${expectedTools.join(", ")}`;
			else reason = `Forbidden tool(s) were used: ${matched.map((matcher) => labelOf(matcher)).join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
		} else if (pass) {
			reason = `Observed required tool(s): ${expectedTools.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
		} else {
			reason = `Missing required tool(s): ${missing.map((matcher) => labelOf(matcher)).join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
		}
		return toResult(pass, reason);
	}
	const { matcher } = expected;
	const min = matcher.min ?? 1;
	const max = matcher.max;
	if (!matcher.pattern && !matcher.name) throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
	const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
	const count = matchingSteps.length;
	const basePass = count >= min && (max === undefined || count <= max);
	const pass = applyInverse(basePass, params.inverse);
	const matcherLabel = labelOf(matcher);
	let reason;
	if (params.inverse) {
		reason = basePass ? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion` : `Tool "${matcherLabel}" did not satisfy the forbidden match condition`;
	} else {
		const expectation = max === undefined ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
		reason = `Matched tool "${matcherLabel}" ${count} time(s)${expectation}`;
		if (count > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
	}
	return toResult(pass, reason);
};
2984
/**
 * Interpret the value of a trajectory:tool-sequence assertion.
 *
 * Accepts either a bare array of steps (implying "in_order" mode) or an
 * object `{ mode?, steps? }`. A missing mode defaults to "in_order" and
 * missing steps default to an empty list.
 *
 * A `steps` value that is present but not an array is rejected here with a
 * descriptive error; previously it was passed through and only failed later
 * with an opaque TypeError when the caller tried to iterate it.
 *
 * @param value - Array of steps, or an object with `mode` and `steps`.
 * @returns `{ mode, steps }`.
 * @throws {Error} When the value is neither an array nor an object, or when
 *   `steps` is not an array.
 */
function resolveSequenceValue(value) {
	if (Array.isArray(value)) return {
		mode: "in_order",
		steps: value
	};
	if (value && typeof value === "object") {
		const steps = value.steps || [];
		if (!Array.isArray(steps)) throw new Error("trajectory:tool-sequence assertion steps must be an array");
		return {
			mode: value.mode || "in_order",
			steps
		};
	}
	throw new Error("trajectory:tool-sequence assertion must have an array or object value");
}
2998
/**
 * Tell whether a value is a non-null, non-array object.
 *
 * @param value - Value to inspect.
 * @returns true for objects other than null and arrays.
 */
function isRecord(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
3001
/**
 * Partial (subset) comparison of actual tool arguments against expected ones.
 *
 * - Expected arrays must match element-by-element and have the same length.
 * - Expected objects only constrain the keys they list; the actual value may
 *   carry extra keys. Every listed key must be an own property of the actual.
 * - Everything else is compared with deep strict equality.
 *
 * @param actual - Arguments captured from the trace.
 * @param expected - Arguments required by the assertion.
 * @returns true when `actual` satisfies `expected`.
 */
function matchesExpectedArgsPartial(actual, expected) {
	const isPlainRecord = (candidate) => typeof candidate === "object" && candidate !== null && !Array.isArray(candidate);
	if (Array.isArray(expected)) {
		if (!Array.isArray(actual) || actual.length !== expected.length) return false;
		return expected.every((item, position) => matchesExpectedArgsPartial(actual[position], item));
	}
	if (!isPlainRecord(expected)) return isDeepStrictEqual(actual, expected);
	if (!isPlainRecord(actual)) return false;
	for (const [key, expectedValue] of Object.entries(expected)) {
		if (!Object.prototype.hasOwnProperty.call(actual, key)) return false;
		if (!matchesExpectedArgsPartial(actual[key], expectedValue)) return false;
	}
	return true;
}
3009
/**
 * Compare captured tool arguments with the expected ones.
 *
 * @param actual - Arguments captured from the trace.
 * @param expected - Arguments required by the assertion.
 * @param mode - "exact" for deep strict equality; any other value uses the partial comparison.
 * @returns true when the arguments match under the selected mode.
 */
function matchesToolArgs(actual, expected, mode) {
	return mode === "exact" ? isDeepStrictEqual(actual, expected) : matchesExpectedArgsPartial(actual, expected);
}
3013
/**
 * Validate the `mode` option of a trajectory:tool-args-match assertion.
 *
 * @param mode - "partial", "exact" or undefined.
 * @returns The given mode, or "partial" when it is undefined.
 * @throws {Error} For any other value.
 */
function resolveToolArgsMatchMode(mode) {
	switch (mode) {
		case undefined:
			return "partial";
		case "partial":
		case "exact":
			return mode;
		default:
			throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
	}
}
3018
/**
 * Interpret the value of a trajectory:tool-args-match assertion.
 *
 * The expected arguments come from `args` when that key is an own property of
 * the value, otherwise from `arguments`.
 *
 * @param value - Object with name/pattern, `args` or `arguments`, and optional `mode`.
 * @returns `{ matcher, expectedArgs, mode }`.
 * @throws {Error} When the value is not an object, lacks a name/pattern,
 *   lacks expected arguments, or has an invalid mode.
 */
function resolveToolArgsMatchValue(value) {
	const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
	if (!isObjectValue) throw new Error("trajectory:tool-args-match assertion must have an object value");
	const matcher = normalizeTrajectoryMatcher(value, "tool");
	requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");
	const hasArgsKey = Object.prototype.hasOwnProperty.call(value, "args");
	const expectedArgs = hasArgsKey ? value.args : value.arguments;
	if (expectedArgs === undefined) throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");
	const mode = resolveToolArgsMatchMode(value.mode);
	return {
		matcher,
		expectedArgs,
		mode
	};
}
3030
/**
 * Handler for trajectory:tool-sequence (and its inverse).
 *
 * In "exact" mode the trace's tool steps must match the expected matchers
 * one-to-one with the same length. In any other mode the expected matchers
 * must appear in order among the tool steps, with other tool steps allowed in
 * between.
 *
 * @param params - Assertion parameters (assertion, renderedValue, inverse, assertionValueContext, ...).
 * @returns Grading result with pass, score (1 or 0), reason and assertion.
 * @throws {Error} When no expected steps are given or a step lacks name/pattern.
 */
const handleTrajectoryToolSequence = (params) => {
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const sequence = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
	const expectedMatchers = sequence.steps.map((step, index) => {
		const matcher = normalizeTrajectoryMatcher(step, "tool");
		requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", index);
		return matcher;
	});
	if (expectedMatchers.length === 0) throw new Error("trajectory:tool-sequence assertion requires at least one expected step");
	const actualTools = toolSteps.map(formatTrajectoryStep);
	let basePass;
	let reason;
	if (sequence.mode === "exact") {
		const sameLength = toolSteps.length === expectedMatchers.length;
		basePass = sameLength && expectedMatchers.every((matcher, index) => matchesTrajectoryStep(toolSteps[index], matcher));
		reason = basePass ? `Observed exact tool sequence: ${formatStepList(actualTools)}` : `Expected exact tool sequence of ${expectedMatchers.map((matcher) => matcher.pattern || matcher.name || "*").join(", ")}, but actual tools were ${formatStepList(actualTools)}`;
	} else {
		// The number of matches so far doubles as the index of the next expected matcher.
		const matchedSteps = [];
		for (const step of toolSteps) {
			if (matchedSteps.length >= expectedMatchers.length) break;
			if (matchesTrajectoryStep(step, expectedMatchers[matchedSteps.length])) matchedSteps.push(formatTrajectoryStep(step));
		}
		basePass = matchedSteps.length === expectedMatchers.length;
		if (basePass) {
			reason = `Observed tool sequence in order: ${matchedSteps.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
		} else {
			const unmet = expectedMatchers[matchedSteps.length];
			reason = `Expected tool "${unmet?.pattern || unmet?.name || "*"}" was not observed in order. Actual tools: ${formatStepList(actualTools)}`;
		}
	}
	const pass = applyInverse(basePass, params.inverse);
	if (params.inverse) {
		reason = basePass ? `Forbidden tool sequence was observed. Actual tools: ${formatStepList(actualTools)}` : `Forbidden tool sequence was not observed`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
3069
/**
 * Handler for trajectory:tool-args-match (and its inverse).
 *
 * Passes when at least one tool step matching the name/pattern has captured
 * arguments that satisfy the expected arguments under the chosen mode.
 *
 * @param params - Assertion parameters (assertion, renderedValue, inverse, assertionValueContext, ...).
 * @returns Grading result with pass, score (1 or 0), reason and assertion.
 */
const handleTrajectoryToolArgsMatch = (params) => {
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
	const matcherLabel = matcher.pattern || matcher.name || "*";
	const actualTools = toolSteps.map(formatTrajectoryStep);
	const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
	// Steps without captured args can never satisfy the comparison.
	const stepsWithArgs = matchingSteps.filter((step) => step.args !== undefined);
	const matchedStep = stepsWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));
	const basePass = matchedStep !== undefined;
	const pass = applyInverse(basePass, params.inverse);
	const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
	const observedArgsLabel = stepsWithArgs.length > 0 ? stepsWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ") : "(none)";
	const describeOutcome = () => {
		if (params.inverse) {
			if (basePass) return `Forbidden argument match for tool "${matcherLabel}" was observed on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
			if (matchingSteps.length === 0) return `Forbidden argument match for tool "${matcherLabel}" was not observed because no tool call matched it`;
			return `Forbidden argument match for tool "${matcherLabel}" was not observed. Observed args: ${observedArgsLabel}`;
		}
		if (basePass) return `Tool "${matcherLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
		if (matchingSteps.length === 0) return `No tool call matched "${matcherLabel}". Actual tools: ${formatStepList(actualTools)}`;
		if (stepsWithArgs.length === 0) return `Tool "${matcherLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(actualTools)}`;
		return `No call to tool "${matcherLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
	};
	const reason = describeOutcome();
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
3096
/**
 * Interpret the value of a trajectory:step-count assertion.
 *
 * @param value - Matcher object with optional numeric `min` / `max`.
 * @returns The normalised matcher with `max` and `min` set to the numeric
 *   bounds, or undefined where a bound is not a number.
 * @throws {Error} When the value is not a non-array object.
 */
function resolveStepCountValue(value) {
	const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
	if (!isObjectValue) throw new Error("trajectory:step-count assertion must have an object value");
	const numberOrUndefined = (candidate) => typeof candidate === "number" ? candidate : undefined;
	return {
		...normalizeTrajectoryMatcher(value),
		max: numberOrUndefined(value.max),
		min: numberOrUndefined(value.min)
	};
}
3104
/**
 * Handler for trajectory:step-count (and its inverse).
 *
 * Counts the trajectory steps matching the optional type/pattern filter and
 * checks the count against the given min and/or max bound.
 *
 * @param params - Assertion parameters (assertion, renderedValue, inverse, assertionValueContext, ...).
 * @returns Grading result with pass, score (1 or 0), reason and assertion.
 * @throws {Error} When neither `min` nor `max` is provided.
 */
const handleTrajectoryStepCount = (params) => {
	const steps = extractTrajectorySteps(getTraceOrThrow(params));
	const matcher = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
	const { min, max } = matcher;
	const hasMin = min !== undefined;
	const hasMax = max !== undefined;
	if (!hasMin && !hasMax) throw new Error("trajectory:step-count assertion must include a min or max property");
	const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
	const count = matchingSteps.length;
	const basePass = (!hasMin || count >= min) && (!hasMax || count <= max);
	const pass = applyInverse(basePass, params.inverse);
	// Describe the active filter so the reason says what was counted.
	const filterParts = [];
	if (matcher.type) {
		const types = Array.isArray(matcher.type) ? matcher.type : [matcher.type];
		filterParts.push(`type=${types.join("|")}`);
	}
	const pattern = matcher.pattern || matcher.name;
	if (pattern) filterParts.push(`pattern=${pattern}`);
	let reason;
	if (params.inverse) {
		reason = basePass ? `Trajectory step count satisfied the forbidden range` : `Trajectory step count did not satisfy the forbidden range`;
	} else {
		reason = `Matched ${count} trajectory step(s)`;
		if (filterParts.length > 0) reason += ` for ${filterParts.join(", ")}`;
		if (hasMin && hasMax) reason += ` (expected ${min}-${max})`;
		else if (hasMin) reason += ` (expected at least ${min})`;
		else reason += ` (expected at most ${max})`;
		if (count > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
3134
/**
 * Handler for trajectory:goal-success (and its inverse).
 *
 * Summarises the trace and delegates the judgement to
 * `matchesTrajectoryGoalSuccess`. For inverted assertions the returned
 * pass/score are flipped and the reason is replaced.
 *
 * @param params - Assertion parameters (assertion, renderedValue, inverse,
 *   outputString, test, assertionValueContext, providerCallContext, ...).
 * @returns Promise of the grading result.
 */
const handleTrajectoryGoalSuccess = async (params) => {
	const trace = getTraceOrThrow(params);
	const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
	const trajectorySummary = summarizeTrajectoryForJudge(trace);
	const result = await matchesTrajectoryGoalSuccess(goal, trajectorySummary, params.outputString, params.test.options, params.assertionValueContext.vars, params.assertion, params.providerCallContext);
	if (!params.inverse) return result;
	const goalAchieved = result.pass;
	return {
		...result,
		assertion: params.assertion,
		pass: !goalAchieved,
		score: goalAchieved ? 0 : 1,
		reason: goalAchieved ? `Agent unexpectedly achieved the goal: ${goal}` : `Agent did not achieve the forbidden goal: ${goal}`
	};
};
3147
+ //#endregion
2520
3148
  //#region src/assertions/webhook.ts
2521
3149
  async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
2522
3150
  invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
@@ -2585,18 +3213,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
2585
3213
  if (pass) reason = "Assertion passed";
2586
3214
  else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
2587
3215
  else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
2588
- } else if (min !== void 0) {
2589
- const basePass = wordCount >= min;
2590
- pass = inverse ? !basePass : basePass;
2591
- if (pass) reason = "Assertion passed";
2592
- else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
2593
- else reason = `Word count ${wordCount} is less than minimum ${min}`;
2594
- } else {
3216
+ } else if (min === void 0) {
2595
3217
  const basePass = wordCount <= max;
2596
3218
  pass = inverse ? !basePass : basePass;
2597
3219
  if (pass) reason = "Assertion passed";
2598
3220
  else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
2599
3221
  else reason = `Word count ${wordCount} is greater than maximum ${max}`;
3222
+ } else {
3223
+ const basePass = wordCount >= min;
3224
+ pass = inverse ? !basePass : basePass;
3225
+ if (pass) reason = "Assertion passed";
3226
+ else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
3227
+ else reason = `Word count ${wordCount} is less than minimum ${min}`;
2600
3228
  }
2601
3229
  } else {
2602
3230
  invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
@@ -2691,6 +3319,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
2691
3319
  //#endregion
2692
3320
  //#region src/assertions/index.ts
2693
3321
  const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
3322
// Fallbacks used when the matching PROMPTFOO_TRACE_FETCH_* setting is not provided.
const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
// Upper bounds applied to the configured trace-fetch settings.
const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;
const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5000;
const MAX_TRACE_FETCH_STABLE_POLLS = 10;
2694
3328
  const MODEL_GRADED_ASSERTION_TYPES = new Set([
2695
3329
  "answer-relevance",
2696
3330
  "context-faithfulness",
@@ -2700,8 +3334,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
2700
3334
  "llm-rubric",
2701
3335
  "model-graded-closedqa",
2702
3336
  "model-graded-factuality",
2703
- "search-rubric"
3337
+ "search-rubric",
3338
+ "trajectory:goal-success"
2704
3339
  ]);
3340
// Base assertion types that are treated as trace consumers by assertionUsesTrace.
const TRACE_AWARE_ASSERTION_TYPES = new Set([
	// Script assertions
	"javascript",
	"python",
	"ruby",
	// Span-level trace assertions
	"trace-error-spans",
	"trace-span-count",
	"trace-span-duration",
	// Trajectory assertions
	"trajectory:goal-success",
	"trajectory:step-count",
	"trajectory:tool-args-match",
	"trajectory:tool-sequence",
	"trajectory:tool-used"
]);
3353
/**
 * Tell whether an assertion (or any member of an assert-set) has a
 * trace-aware base type.
 *
 * @param assertion - Assertion definition; assert-sets are searched recursively.
 * @returns true when a trace-aware assertion type is found.
 */
function assertionUsesTrace(assertion) {
	const isAssertSet = assertion.type === "assert-set";
	return isAssertSet ? assertion.assert.some(assertionUsesTrace) : TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
}
3357
/**
 * Tell whether an assertion might need trace data in its context.
 *
 * True for trace-aware assertions, for assert-sets containing one, and for
 * assertions whose string value is a "file://" reference or a package path.
 *
 * @param assertion - Assertion definition.
 * @returns true when trace context may be required.
 */
function assertionMayNeedTraceContext(assertion) {
	if (assertionUsesTrace(assertion)) return true;
	if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
	const { value } = assertion;
	if (typeof value !== "string") return false;
	return value.startsWith("file://") || isPackagePath(value);
}
3362
/**
 * Tell whether any assertion in the list may need trace context.
 *
 * @param assertions - Optional array of assertion definitions.
 * @returns false for a missing list, otherwise whether any entry qualifies.
 */
function hasTraceAwareAssertions(assertions) {
	const anyNeedsTrace = assertions?.some(assertionMayNeedTraceContext);
	return Boolean(anyNeedsTrace);
}
3365
/**
 * Load a trace, polling until its span count stops changing.
 *
 * The trace is fetched up to `maxAttempts` times with `retryDelayMs` between
 * fetches. It is returned as soon as a non-zero span count has been observed
 * `stablePolls` times in a row, or on the final attempt. The three settings
 * are read through `getEnvInt` and clamped to the module-level bounds.
 *
 * @param traceId - Identifier of the trace to load.
 * @returns Promise of the most recently fetched trace (possibly null or without spans).
 */
async function loadTraceData(traceId) {
	const traceStore = getTraceStore();
	const readClampedEnvInt = (name, fallback, lowerBound, upperBound) => Math.min(upperBound, Math.max(lowerBound, getEnvInt(name, fallback)));
	const maxAttempts = readClampedEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS, 1, MAX_TRACE_FETCH_MAX_ATTEMPTS);
	const retryDelayMs = readClampedEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS, 0, MAX_TRACE_FETCH_RETRY_DELAY_MS);
	const stablePolls = readClampedEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS, 1, MAX_TRACE_FETCH_STABLE_POLLS);
	let previousSpanCount = -1;
	let stableObservations = 0;
	let latestTrace = null;
	for (let attempt = 1; attempt <= maxAttempts; attempt++) {
		const isFinalAttempt = attempt === maxAttempts;
		latestTrace = await traceStore.getTrace(traceId);
		const spanCount = latestTrace?.spans?.length ?? 0;
		const hasSpans = spanCount > 0;
		// An empty trace resets the streak; a changed count restarts it at 1.
		if (!hasSpans) stableObservations = 0;
		else if (spanCount === previousSpanCount) stableObservations += 1;
		else stableObservations = 1;
		previousSpanCount = spanCount;
		if (hasSpans && (stableObservations >= stablePolls || isFinalAttempt)) return latestTrace;
		if (!isFinalAttempt) await sleep(retryDelayMs);
	}
	return latestTrace;
}
2705
3388
  const ASSERTION_HANDLERS = {
2706
3389
  "answer-relevance": handleAnswerRelevance,
2707
3390
  bleu: handleBleuScore,
@@ -2764,12 +3447,18 @@ const ASSERTION_HANDLERS = {
2764
3447
  ruby: handleRuby,
2765
3448
  "rouge-n": handleRougeScore,
2766
3449
  "search-rubric": handleSearchRubric,
3450
+ "skill-used": handleSkillUsed,
2767
3451
  similar: handleSimilar,
2768
3452
  "similar:cosine": handleSimilar,
2769
3453
  "similar:dot": handleSimilar,
2770
3454
  "similar:euclidean": handleSimilar,
2771
3455
  "starts-with": handleStartsWith,
2772
3456
  "tool-call-f1": handleToolCallF1,
3457
+ "trajectory:goal-success": handleTrajectoryGoalSuccess,
3458
+ "trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
3459
+ "trajectory:step-count": handleTrajectoryStepCount,
3460
+ "trajectory:tool-sequence": handleTrajectoryToolSequence,
3461
+ "trajectory:tool-used": handleTrajectoryToolUsed,
2773
3462
  "trace-error-spans": handleTraceErrorSpans,
2774
3463
  "trace-span-count": handleTraceSpanCount,
2775
3464
  "trace-span-duration": handleTraceSpanDuration,
@@ -2812,7 +3501,7 @@ function isAssertionInverse(assertion) {
2812
3501
  function getAssertionBaseType(assertion) {
2813
3502
  return isAssertionInverse(assertion) ? assertion.type.slice(4) : assertion.type;
2814
3503
  }
2815
- async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
3504
+ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
2816
3505
  const resolvedVars = vars || test.vars || {};
2817
3506
  const { cost, logProbs, output: originalOutput } = providerResponse;
2818
3507
  let output = originalOutput;
@@ -2831,14 +3520,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
2831
3520
  providerResponse,
2832
3521
  ...assertion.config ? { config: structuredClone(assertion.config) } : {}
2833
3522
  };
2834
- if (traceId) try {
2835
- const traceData = await getTraceStore().getTrace(traceId);
2836
- if (traceData) context.trace = {
2837
- traceId: traceData.traceId,
2838
- evaluationId: traceData.evaluationId,
2839
- testCaseId: traceData.testCaseId,
2840
- metadata: traceData.metadata,
2841
- spans: traceData.spans || []
3523
+ if (traceId && assertionMayNeedTraceContext(assertion)) try {
3524
+ const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
3525
+ if (resolvedTraceData) context.trace = {
3526
+ traceId: resolvedTraceData.traceId,
3527
+ evaluationId: resolvedTraceData.evaluationId,
3528
+ testCaseId: resolvedTraceData.testCaseId,
3529
+ metadata: resolvedTraceData.metadata,
3530
+ spans: resolvedTraceData.spans || []
2842
3531
  };
2843
3532
  } catch (error) {
2844
3533
  logger.debug(`Failed to fetch trace data for assertion: ${error}`);
@@ -2871,7 +3560,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
2871
3560
  };
2872
3561
  }
2873
3562
  else if (filePath.endsWith(".rb")) try {
2874
- const { runRuby } = await import("./rubyUtils-BUVePouc.js").then((n) => n.t);
3563
+ const { runRuby } = await import("./rubyUtils-PgU-gHmx.js").then((n) => n.t);
2875
3564
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
2876
3565
  logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
2877
3566
  } catch (error) {
@@ -2980,6 +3669,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
2980
3669
  index: i
2981
3670
  };
2982
3671
  }).flat();
3672
+ const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
3673
+ let preloadedTraceData;
3674
+ if (shouldPreloadTrace && traceId) try {
3675
+ preloadedTraceData = await loadTraceData(traceId);
3676
+ } catch (error) {
3677
+ logger.debug(`Failed to preload trace data for assertions: ${error}`);
3678
+ preloadedTraceData = null;
3679
+ }
2983
3680
  await async.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
2984
3681
  if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
2985
3682
  const result = await runAssertion({
@@ -2991,7 +3688,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
2991
3688
  vars,
2992
3689
  latencyMs,
2993
3690
  assertIndex: index,
2994
- traceId
3691
+ traceId,
3692
+ traceData: preloadedTraceData
2995
3693
  });
2996
3694
  assertResult.addResult({
2997
3695
  index,
@@ -3137,7 +3835,7 @@ var CIProgressReporter = class {
3137
3835
  else {
3138
3836
  const eta = remaining / rate;
3139
3837
  if (eta > 1440) etaDisplay = ">24 hours";
3140
- else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
3838
+ else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
3141
3839
  }
3142
3840
  const percentage = Math.floor(this.completedTests / this.totalTests * 100);
3143
3841
  logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
@@ -3538,12 +4236,55 @@ function isPromptAllowed(prompt, allowedPrompts) {
3538
4236
  var ProgressBarManager = class {
3539
4237
  progressBar;
3540
4238
  isWebUI;
4239
+ originalLogCallback = null;
4240
+ installedLogCallback = null;
4241
+ pendingRender = null;
3541
4242
  totalCount = 0;
3542
4243
  completedCount = 0;
3543
4244
  concurrency = 1;
3544
4245
  constructor(isWebUI) {
3545
4246
  this.isWebUI = isWebUI;
3546
4247
  }
4248
+ clearProgressBarLine() {
4249
+ readline.cursorTo(process.stderr, 0);
4250
+ readline.clearLine(process.stderr, 0);
4251
+ }
4252
+ scheduleRender() {
4253
+ if (!this.progressBar || this.pendingRender) return;
4254
+ this.pendingRender = setImmediate(() => {
4255
+ this.pendingRender = null;
4256
+ this.progressBar?.render();
4257
+ });
4258
+ }
4259
+ handleLogMessage() {
4260
+ if (!this.progressBar) return;
4261
+ this.clearProgressBarLine();
4262
+ this.scheduleRender();
4263
+ }
4264
+ /**
4265
+ * Coordinate console logging with the progress bar to prevent visual corruption.
4266
+ */
4267
+ installLogInterceptor() {
4268
+ if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
4269
+ this.originalLogCallback = globalLogCallback;
4270
+ this.installedLogCallback = (message) => {
4271
+ this.originalLogCallback?.(message);
4272
+ this.handleLogMessage();
4273
+ };
4274
+ setLogCallback(this.installedLogCallback);
4275
+ }
4276
+ /**
4277
+ * Remove the log interceptor and restore original logger callback behavior.
4278
+ */
4279
+ removeLogInterceptor() {
4280
+ if (this.pendingRender) {
4281
+ clearImmediate(this.pendingRender);
4282
+ this.pendingRender = null;
4283
+ }
4284
+ if (this.installedLogCallback && globalLogCallback === this.installedLogCallback) setLogCallback(this.originalLogCallback);
4285
+ this.installedLogCallback = null;
4286
+ this.originalLogCallback = null;
4287
+ }
3547
4288
  /**
3548
4289
  * Initialize progress bar
3549
4290
  */
@@ -3563,7 +4304,8 @@ var ProgressBarManager = class {
3563
4304
  return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
3564
4305
  },
3565
4306
  hideCursor: true,
3566
- gracefulExit: true
4307
+ gracefulExit: true,
4308
+ stream: process.stderr
3567
4309
  }, cliProgress.Presets.shades_classic);
3568
4310
  this.progressBar.start(this.totalCount, 0, {
3569
4311
  provider: "",
@@ -3838,6 +4580,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
3838
4580
  const parts = traceContext.traceparent.split("-");
3839
4581
  if (parts.length >= 3) traceId = parts[1];
3840
4582
  }
4583
+ if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
3841
4584
  const checkResult = await runAssertions({
3842
4585
  prompt: renderedPrompt,
3843
4586
  provider,
@@ -4235,7 +4978,7 @@ var Evaluator = class {
4235
4978
  const defaultProvider = testSuite.defaultTest.provider;
4236
4979
  if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
4237
4980
  else if (typeof defaultProvider === "object" && defaultProvider.id) {
4238
- const { loadApiProvider } = await import("./providers-GIQ2TcsA.js");
4981
+ const { loadApiProvider } = await import("./providers-sS2WI8YD.js");
4239
4982
  testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
4240
4983
  } else testCase.provider = defaultProvider;
4241
4984
  }
@@ -4319,7 +5062,7 @@ var Evaluator = class {
4319
5062
  if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
4320
5063
  }
4321
5064
  if (state.resume && this.evalRecord.persisted) try {
4322
- const { default: EvalResult } = await import("./evalResult-CDQiuUuf.js").then((n) => n.n);
5065
+ const { default: EvalResult } = await import("./evalResult-BkIhRdTe.js").then((n) => n.n);
4323
5066
  const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
4324
5067
  const originalCount = runEvalOptions.length;
4325
5068
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -4519,7 +5262,7 @@ var Evaluator = class {
4519
5262
  if (isCI() && !isWebUI) {
4520
5263
  ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
4521
5264
  ciProgressReporter.start();
4522
- } else if (this.options.showProgressBar && process.stdout.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
5265
+ } else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
4523
5266
  this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
4524
5267
  if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
4525
5268
  if (isWebUI) {
@@ -4540,7 +5283,10 @@ var Evaluator = class {
4540
5283
  if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
4541
5284
  if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
4542
5285
  }
4543
- if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
5286
+ if (this.options.showProgressBar && progressBarManager) {
5287
+ await progressBarManager.initialize(runEvalOptions, concurrency, 0);
5288
+ progressBarManager.installLogInterceptor();
5289
+ }
4544
5290
  try {
4545
5291
  if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
4546
5292
  checkAbort();
@@ -4566,7 +5312,10 @@ var Evaluator = class {
4566
5312
  else if (!targetUnavailable) {
4567
5313
  logger.info("Evaluation interrupted, saving progress...");
4568
5314
  if (globalTimeout) clearTimeout(globalTimeout);
4569
- if (progressBarManager) progressBarManager.stop();
5315
+ if (progressBarManager) {
5316
+ progressBarManager.removeLogInterceptor();
5317
+ progressBarManager.stop();
5318
+ }
4570
5319
  if (ciProgressReporter) ciProgressReporter.finish();
4571
5320
  this.evalRecord.setVars(Array.from(vars));
4572
5321
  await this.evalRecord.addPrompts(prompts);
@@ -4574,6 +5323,10 @@ var Evaluator = class {
4574
5323
  return this.evalRecord;
4575
5324
  }
4576
5325
  } else {
5326
+ if (progressBarManager) {
5327
+ progressBarManager.removeLogInterceptor();
5328
+ progressBarManager.stop();
5329
+ }
4577
5330
  if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
4578
5331
  throw err;
4579
5332
  }
@@ -4716,6 +5469,7 @@ var Evaluator = class {
4716
5469
  await this.evalRecord.addPrompts(prompts);
4717
5470
  try {
4718
5471
  if (progressBarManager) {
5472
+ progressBarManager.removeLogInterceptor();
4719
5473
  progressBarManager.complete();
4720
5474
  progressBarManager.stop();
4721
5475
  } else if (ciProgressReporter) ciProgressReporter.finish();
@@ -7073,8 +7827,7 @@ function testCaseFromCsvRow(row) {
7073
7827
  logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
7074
7828
  } else if (key.startsWith("__config:")) {
7075
7829
  const configParts = key.slice(9).split(":");
7076
- if (configParts.length !== 2) logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
7077
- else {
7830
+ if (configParts.length === 2) {
7078
7831
  const [expectedKey, configKey] = configParts;
7079
7832
  let targetIndex;
7080
7833
  if (expectedKey === "__expected") targetIndex = 0;
@@ -7100,7 +7853,7 @@ function testCaseFromCsvRow(row) {
7100
7853
  }
7101
7854
  }
7102
7855
  assertionConfigs[targetIndex][configKey] = parsedValue;
7103
- }
7856
+ } else logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
7104
7857
  } else vars[key] = value;
7105
7858
  }
7106
7859
  for (let i = 0; i < asserts.length; i++) {
@@ -7229,14 +7982,14 @@ async function parseXlsxFile(filePath) {
7229
7982
  const sheetName = typeof sheetOption === "number" ? sheetNames[sheetOption - 1] : sheetOption;
7230
7983
  const rows = await readXlsxFile(actualFilePath, { sheet: sheetOption });
7231
7984
  if (rows.length === 0) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
7232
- const headers = rows[0].map((cell) => cell != null ? String(cell) : "");
7985
+ const headers = rows[0].map((cell) => cell == null ? "" : String(cell));
7233
7986
  if (headers.length === 0 || headers.every((h) => h === "")) throw new Error(`Sheet "${sheetName}" has no valid column headers`);
7234
7987
  if (rows.length === 1) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
7235
7988
  const data = rows.slice(1).map((row) => {
7236
7989
  const obj = {};
7237
7990
  headers.forEach((header, index) => {
7238
7991
  const cellValue = row[index];
7239
- obj[header] = cellValue != null ? String(cellValue) : "";
7992
+ obj[header] = cellValue == null ? "" : String(cellValue);
7240
7993
  });
7241
7994
  return obj;
7242
7995
  });
@@ -11183,20 +11936,19 @@ function generateEvalSummary(params) {
11183
11936
  }
11184
11937
  }
11185
11938
  lines.push("");
11186
- const passRate = successes / (successes + failures + errors) * 100;
11187
- let passRateDisplay;
11188
- if (!Number.isNaN(passRate)) {
11189
- const passRateFormatted = passRate === 0 || passRate === 100 ? `${passRate.toFixed(0)}%` : `${passRate.toFixed(2)}%`;
11190
- if (passRate >= 100) passRateDisplay = chalk.green.bold(passRateFormatted);
11191
- else if (passRate >= 80) passRateDisplay = chalk.yellow.bold(passRateFormatted);
11192
- else passRateDisplay = chalk.red.bold(passRateFormatted);
11193
- }
11194
- const passedPart = successes > 0 ? `${chalk.green("✓")} ${chalk.green.bold(successes.toLocaleString())} passed` : `${chalk.gray.bold(successes.toLocaleString())} passed`;
11195
- const failedPart = failures > 0 ? `${chalk.red("✗")} ${chalk.red.bold(failures.toLocaleString())} failed` : `${chalk.gray.bold(failures.toLocaleString())} failed`;
11939
+ const totalTests = successes + failures + errors;
11940
+ const formatResultPercentage = (count) => {
11941
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
11942
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
11943
+ };
11944
+ const formatResultLine = (count, label, icon, iconColor) => {
11945
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count)})`)}`;
11946
+ };
11196
11947
  const errorLabel = errors === 1 ? "error" : "errors";
11197
- const resultsLine = `${passedPart}, ${failedPart}, ${errors > 0 ? `${chalk.red("✗")} ${chalk.red.bold(errors.toLocaleString())} ${errorLabel}` : `${chalk.gray.bold(errors.toLocaleString())} ${errorLabel}`}`;
11198
- if (Number.isNaN(passRate)) lines.push(`${chalk.bold("Results:")} ${resultsLine}`);
11199
- else lines.push(`${chalk.bold("Results:")} ${resultsLine} (${passRateDisplay})`);
11948
+ lines.push(chalk.bold("Results:"));
11949
+ lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green));
11950
+ lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red));
11951
+ lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red));
11200
11952
  const durationDisplay = formatDuration(duration);
11201
11953
  lines.push(chalk.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
11202
11954
  lines.push("");
@@ -11530,7 +12282,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
11530
12282
  await checkCloudPermissions(config);
11531
12283
  const options = {
11532
12284
  ...evaluateOptions,
11533
- showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar !== void 0 ? cmdObj.progressBar !== false : evaluateOptions.showProgressBar !== void 0 ? evaluateOptions.showProgressBar : true,
12285
+ showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
11534
12286
  repeat,
11535
12287
  delay: !Number.isNaN(delay) && delay > 0 ? delay : void 0,
11536
12288
  maxConcurrency,
@@ -11914,7 +12666,7 @@ async function doRedteamRun(options) {
11914
12666
  redteamConfig = await doGenerateRedteam({
11915
12667
  ...passThroughOptions,
11916
12668
  ...options.liveRedteamConfig?.commandLineOptions || {},
11917
- ...maxConcurrency !== void 0 ? { maxConcurrency } : {},
12669
+ ...maxConcurrency === void 0 ? {} : { maxConcurrency },
11918
12670
  config: configPath,
11919
12671
  output: redteamPath,
11920
12672
  force: options.force,