promptfoo 0.121.1 → 0.121.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. package/README.md +2 -0
  2. package/dist/src/{accounts-xrUGFA6n.js → accounts-B2XmGjty.js} +5 -5
  3. package/dist/src/{accounts-Bx-x3bmW.cjs → accounts-BPyfpSeU.cjs} +5 -5
  4. package/dist/src/{accounts-CMqkzrVf.js → accounts-CFLK3mnD.js} +6 -6
  5. package/dist/src/{accounts-BgNJDBE6.js → accounts-Xatc0RYb.js} +5 -5
  6. package/dist/src/{agentic-utils-BKIN5PKu.js → agentic-utils-36epdqwB.js} +3 -3
  7. package/dist/src/{cometapi-DkXrKi5z.js → agentic-utils-D8yXo5Lm.js} +4 -61
  8. package/dist/src/{cometapi-vY6aDZgo.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
  9. package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
  10. package/dist/src/{agents-C-dDThPK.js → agents-BBVJCIYr.js} +226 -13
  11. package/dist/src/{agents-CErsqg5U.cjs → agents-BBWxKSM0.cjs} +7 -7
  12. package/dist/src/{agents-Dy2YpZpa.js → agents-Bqgfdokm.js} +227 -14
  13. package/dist/src/{agents-B0f4HICh.cjs → agents-CAYbM7qD.cjs} +226 -13
  14. package/dist/src/{agents-CVIn-Utx.js → agents-CLQ-P15P.js} +7 -7
  15. package/dist/src/{agents-DeH4Gu94.js → agents-CgBniSlI.js} +8 -8
  16. package/dist/src/{agents-CXknwsFX.js → agents-DSSTV4bv.js} +226 -13
  17. package/dist/src/{agents-aF4-T121.js → agents-wg3ohknq.js} +7 -7
  18. package/dist/src/{aimlapi-tg0Gkcvr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
  19. package/dist/src/{aimlapi-BNfTBexL.js → aimlapi-BwGC1TtS.js} +13 -13
  20. package/dist/src/{aimlapi-BAGZDo5G.js → aimlapi-DaC3qZ-o.js} +14 -14
  21. package/dist/src/{aimlapi-DHRKlBEA.js → aimlapi-MgSLdvy7.js} +13 -13
  22. package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
  23. package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
  24. package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
  25. package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
  26. package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
  27. package/dist/src/app/index.html +4 -4
  28. package/dist/src/{audio-tf_NBjlC.js → audio-Bn44pQxv.js} +4 -4
  29. package/dist/src/{audio-CHQ4r-RV.js → audio-DDA5WHdx.js} +4 -4
  30. package/dist/src/{audio-BWeaWovU.cjs → audio-DVFjQ67_.cjs} +4 -4
  31. package/dist/src/{audio-BRODU0UK.js → audio-DjU9GswO.js} +5 -5
  32. package/dist/src/{base-DBtwl2FR.cjs → base-BboXIF_0.cjs} +3 -3
  33. package/dist/src/{base-B4QJRyFS.js → base-CKjwebIH.js} +3 -3
  34. package/dist/src/{base-B0tcrnq_.js → base-CqzQ4K8j.js} +3 -3
  35. package/dist/src/{base-fEDN28WM.js → base-Cz2ZC_iA.js} +3 -3
  36. package/dist/src/{blobs-BAU-dXan.js → blobs-B1JriOyi.js} +3 -3
  37. package/dist/src/{blobs-qTYm-1PY.js → blobs-BUWmKWzo.js} +3 -3
  38. package/dist/src/{blobs-DvS-O6be.cjs → blobs-C6j0bvFz.cjs} +3 -3
  39. package/dist/src/{blobs-Bpg5rH6i.js → blobs-DXTl6J3H.js} +3 -3
  40. package/dist/src/{cache-COish3-W.cjs → cache-C5yFZ4gC.cjs} +75 -58
  41. package/dist/src/{cache-8XhNqPKW.js → cache-CaT5tPgo.js} +75 -58
  42. package/dist/src/cache-CyCanoMu.js +6 -0
  43. package/dist/src/{cache-CG0SlR1d.js → cache-DSqR6ezl.js} +75 -58
  44. package/dist/src/cache-Df_QFDNu.cjs +5 -0
  45. package/dist/src/{cache-D3eqDYGU.js → cache-HP0NP4k3.js} +75 -58
  46. package/dist/src/{chat-DHMH-N64.js → chat-B-52XYI1.js} +12 -12
  47. package/dist/src/{chat-BKm79wib.js → chat-B0iaWhoh.js} +16 -14
  48. package/dist/src/{chat-DxysjBvt.js → chat-BE0qTA8e.js} +13 -13
  49. package/dist/src/{chat-CRWNNq73.js → chat-BEwdgGEg.js} +16 -14
  50. package/dist/src/{chat-2K608PeQ.cjs → chat-BtIKkLKx.cjs} +13 -13
  51. package/dist/src/{chat-DaqekjFr.cjs → chat-CM8qWR3_.cjs} +17 -15
  52. package/dist/src/{chat-CM_kyI8B.js → chat-DK1U-eZ-.js} +12 -12
  53. package/dist/src/{chat-CznLWr_D.js → chat-pxmiVpWe.js} +16 -14
  54. package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
  55. package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
  56. package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
  57. package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
  58. package/dist/src/{claude-agent-sdk-CJH22shf.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
  59. package/dist/src/{claude-agent-sdk-Dy5lT-Tx.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
  60. package/dist/src/{claude-agent-sdk-BLTu0WBO.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
  61. package/dist/src/{claude-agent-sdk-D6_k9FKA.js → claude-agent-sdk-HgbFioFw.js} +33 -15
  62. package/dist/src/cloud-DE3t1-ZI.js +4 -0
  63. package/dist/src/{cloud-Bc9526yV.js → cloud-z8KZpUoa.js} +3 -3
  64. package/dist/src/{cloudflare-ai-CWWJCRim.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
  65. package/dist/src/{cloudflare-ai-C9r2sRhw.js → cloudflare-ai-Bbp26N0L.js} +13 -13
  66. package/dist/src/{cloudflare-ai-ClWSdor4.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
  67. package/dist/src/{cloudflare-ai-ICsOuD-z.js → cloudflare-ai-DdKP9TKT.js} +14 -14
  68. package/dist/src/{cloudflare-gateway-D6xFc5pa.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
  69. package/dist/src/{cloudflare-gateway-D6O7AlYb.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
  70. package/dist/src/{cloudflare-gateway-pXGHxJ47.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
  71. package/dist/src/{cloudflare-gateway-C2_-KG5o.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
  72. package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
  73. package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
  74. package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
  75. package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
  76. package/dist/src/cometapi-BDyV-NNm.js +62 -0
  77. package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
  78. package/dist/src/{cometapi-Bbjp5V4x.js → cometapi-hhL4TAh3.js} +14 -14
  79. package/dist/src/{cometapi-BasUi7-_.js → cometapi-sp7sJpBD.js} +15 -15
  80. package/dist/src/{completion-C_P3ypkJ.js → completion-BCimtq-h.js} +6 -6
  81. package/dist/src/{completion-6Mx_iXxK.js → completion-DCjv7RZ3.js} +6 -6
  82. package/dist/src/{completion-CDOouNzq.cjs → completion-DlXUhj5c.cjs} +6 -6
  83. package/dist/src/{completion-C5rtR_9P.js → completion-DoYy49ti.js} +6 -6
  84. package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
  85. package/dist/src/{docker-BwsKwxFs.cjs → docker-Cqj2-QVi.cjs} +14 -14
  86. package/dist/src/{docker-CZnqU1XV.js → docker-CxCkwMzc.js} +13 -13
  87. package/dist/src/{docker-DzxyDPIj.js → docker-DpguQj-w.js} +14 -14
  88. package/dist/src/{docker-5KcG-_86.js → docker-FeBni2dw.js} +13 -13
  89. package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
  90. package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
  91. package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
  92. package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
  93. package/dist/src/eval-7aEqoMs3.js +15 -0
  94. package/dist/src/{eval-DmFyWU7i.js → eval-BTqTn7lb.js} +10 -10
  95. package/dist/src/{evalResult-CDQiuUuf.js → evalResult-BkIhRdTe.js} +7 -7
  96. package/dist/src/evalResult-CYNHkk5A.js +12 -0
  97. package/dist/src/evalResult-CuvJeNiM.js +10 -0
  98. package/dist/src/{evalResult-CTG2AHOS.js → evalResult-DUDShQrm.js} +7 -7
  99. package/dist/src/{evalResult-Dap2CekP.cjs → evalResult-DpARzUCb.cjs} +7 -7
  100. package/dist/src/evalResult-tGdilrWt.cjs +10 -0
  101. package/dist/src/evaluator-BBUqRhz1.js +36 -0
  102. package/dist/src/{evaluator-DPFRbFIL.js → evaluator-BcvOGaam.js} +833 -79
  103. package/dist/src/{extractor-YMU_Gvt8.js → extractor-C8XwivI9.js} +6 -6
  104. package/dist/src/{extractor-CFG6bcWJ.js → extractor-CAZ2G3Kh.js} +6 -6
  105. package/dist/src/{extractor-DX36oYEv.cjs → extractor-DG3sSfXE.cjs} +6 -6
  106. package/dist/src/{extractor-M67RUtg6.js → extractor-D_wd8jxt.js} +6 -6
  107. package/dist/src/{fetch-4M3YRaqL.js → fetch-BiYv2BZc.js} +3 -3
  108. package/dist/src/{fetch-BxUk8odA.cjs → fetch-BnR9wSnm.cjs} +3 -3
  109. package/dist/src/{fetch-60Gzydls.js → fetch-CVAtKnI3.js} +3 -3
  110. package/dist/src/{fetch-BMv0O527.js → fetch-DoVRJZhJ.js} +4 -4
  111. package/dist/src/fetch-UWU706qb.js +5 -0
  112. package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
  113. package/dist/src/{graders-DOXycdlG.cjs → graders-BElhu9ZY.cjs} +126 -55
  114. package/dist/src/{graders-R9rYUM0d.js → graders-BXAJ0sbS.js} +120 -55
  115. package/dist/src/graders-BxfEguVY.js +32 -0
  116. package/dist/src/graders-CzVMbEnv.js +34 -0
  117. package/dist/src/{graders-CpdqD9PI.js → graders-DG7mhg-b.js} +120 -55
  118. package/dist/src/graders-DjCXfj0l.cjs +32 -0
  119. package/dist/src/{graders-CHO8EPM4.js → graders-RjHF8VfG.js} +120 -55
  120. package/dist/src/graders-kHzIWOKu.js +32 -0
  121. package/dist/src/{image-DTedmQPg.cjs → image--F58eEIn.cjs} +6 -6
  122. package/dist/src/{image-DJEvKveK.js → image-6WQXK8m8.js} +4 -4
  123. package/dist/src/{image-pAX56tPG.js → image-B8b6f36E.js} +6 -6
  124. package/dist/src/{image-BmEZqVmk.js → image-CoxZp9PZ.js} +6 -6
  125. package/dist/src/{image-gvmivTEe.js → image-DO0RYnjH.js} +5 -5
  126. package/dist/src/{image-CBBVXWuT.js → image-PoF6DN3x.js} +6 -6
  127. package/dist/src/{image-CDLQOcqT.cjs → image-fza3zuKs.cjs} +4 -4
  128. package/dist/src/{image-tL5hIOFh.js → image-xNbw5ph2.js} +4 -4
  129. package/dist/src/index.cjs +863 -110
  130. package/dist/src/index.d.cts +833 -60
  131. package/dist/src/index.d.ts +833 -60
  132. package/dist/src/index.js +860 -108
  133. package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
  134. package/dist/src/{knowledgeBase-CoU-UQBg.js → knowledgeBase-Bi7CmDbx.js} +7 -7
  135. package/dist/src/{knowledgeBase-CLJybhnF.js → knowledgeBase-Ce3ofVan.js} +8 -8
  136. package/dist/src/{knowledgeBase-DjWPVqSb.js → knowledgeBase-DFRXPZl_.js} +7 -7
  137. package/dist/src/{knowledgeBase-wkxuRFhA.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
  138. package/dist/src/{litellm-B9Hysuri.js → litellm-Bo2gQXpo.js} +16 -15
  139. package/dist/src/{litellm-ePxtr9F1.js → litellm-CKiAxnoM.js} +15 -14
  140. package/dist/src/{litellm-NYpQ8RQu.cjs → litellm-CnHI69aj.cjs} +16 -15
  141. package/dist/src/{litellm-CTfa0hqi.js → litellm-Tc294Jhj.js} +15 -14
  142. package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
  143. package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
  144. package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
  145. package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
  146. package/dist/src/{luma-ray-BW9IRGIc.js → luma-ray-0ehMPt5N.js} +10 -10
  147. package/dist/src/{luma-ray-BE2mOt6N.js → luma-ray-C9q8rdQe.js} +9 -9
  148. package/dist/src/{luma-ray-Cm1KZBhs.js → luma-ray-DP0QA9qn.js} +9 -9
  149. package/dist/src/{luma-ray-B0GGNRc1.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
  150. package/dist/src/main.js +69 -71
  151. package/dist/src/{messages-1x9atZmP.js → messages-DJNo37Ko.js} +14 -9
  152. package/dist/src/{messages-BLbWdsyt.js → messages-Dy9QecMs.js} +14 -9
  153. package/dist/src/{messages-1JrJs91T.cjs → messages-HJsyEh4o.cjs} +15 -10
  154. package/dist/src/{messages-D8EA0oDc.js → messages-biC_ex-p.js} +14 -9
  155. package/dist/src/{modelslab-C1OLRmVX.js → modelslab-B5J-ZM5c.js} +9 -9
  156. package/dist/src/{modelslab-CqXBy3U8.js → modelslab-BI458moT.js} +10 -10
  157. package/dist/src/{modelslab-X5-4LroM.js → modelslab-BTOT8FUO.js} +9 -9
  158. package/dist/src/{modelslab-DcOSFwKh.cjs → modelslab-IQbNg-r7.cjs} +9 -9
  159. package/dist/src/{nova-reel-DihqLeol.js → nova-reel-BZ9y-Y5s.js} +9 -9
  160. package/dist/src/{nova-reel-D9xfaMBs.cjs → nova-reel-CE5etkv9.cjs} +9 -9
  161. package/dist/src/{nova-reel-D2ZkOSyr.js → nova-reel-DEeQlnOJ.js} +10 -10
  162. package/dist/src/{nova-reel-BgS1ZWuK.js → nova-reel-Xw1SXLpg.js} +9 -9
  163. package/dist/src/{nova-sonic-Q3BOJeig.js → nova-sonic-DWswpN1E.js} +7 -7
  164. package/dist/src/{nova-sonic-DezhVUYT.js → nova-sonic-DXTLpi-r.js} +6 -6
  165. package/dist/src/{nova-sonic-DVu3mMIy.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
  166. package/dist/src/{nova-sonic-P-CdUMlV.js → nova-sonic-Ogqf-csn.js} +6 -6
  167. package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
  168. package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
  169. package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
  170. package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
  171. package/dist/src/{openclaw-Bv1DINsX.js → openclaw-0Sv7AK3O.js} +172 -109
  172. package/dist/src/{openclaw-DAfWQn-o.cjs → openclaw-CXxbKgDH.cjs} +174 -110
  173. package/dist/src/{openclaw-BiSZPL7J.js → openclaw-D1FSCps-.js} +172 -109
  174. package/dist/src/{openclaw-D1D_ej1z.js → openclaw-D2ENvu7a.js} +173 -110
  175. package/dist/src/{opencode-sdk-D95s6SnR.js → opencode-sdk-C71Z0ehR.js} +13 -13
  176. package/dist/src/{opencode-sdk-DxUPkLT7.js → opencode-sdk-CHCs7dEb.js} +12 -12
  177. package/dist/src/{opencode-sdk-C7m-wRfI.js → opencode-sdk-DDxj4QqH.js} +12 -12
  178. package/dist/src/{opencode-sdk-CfaLN8PY.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
  179. package/dist/src/{otlpReceiver-g3ByGaXs.js → otlpReceiver-C9KlUtxh.js} +6 -6
  180. package/dist/src/{otlpReceiver--AIRW_S4.js → otlpReceiver-CZL48YfC.js} +6 -6
  181. package/dist/src/{otlpReceiver-Bn5wGB1v.js → otlpReceiver-CavGAA6k.js} +6 -6
  182. package/dist/src/{otlpReceiver-Diec4cln.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
  183. package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
  184. package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
  185. package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
  186. package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
  187. package/dist/src/providers-BSLEaIQG.js +32 -0
  188. package/dist/src/{providers-CFu-TZl-.cjs → providers-CScd1wN6.cjs} +733 -464
  189. package/dist/src/{providers-CFLy1_ji.js → providers-Ch6Mr0gn.js} +795 -526
  190. package/dist/src/{providers-BKRJTjBz.js → providers-Cn73d5sr.js} +795 -526
  191. package/dist/src/providers-D-FnDg8k.cjs +31 -0
  192. package/dist/src/providers-DEYiFVAo.js +30 -0
  193. package/dist/src/{providers-B3HvufyI.js → providers-DvddrgxL.js} +795 -526
  194. package/dist/src/providers-sS2WI8YD.js +30 -0
  195. package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
  196. package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
  197. package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
  198. package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
  199. package/dist/src/{quiverai-CI6gYJVI.js → quiverai-BeofbLVc.js} +4 -4
  200. package/dist/src/{quiverai-MHSxbmmZ.js → quiverai-CCQn73lq.js} +5 -5
  201. package/dist/src/{quiverai-CLkWkyZc.cjs → quiverai-CcUhPIBg.cjs} +4 -4
  202. package/dist/src/{quiverai-C2jVwbH1.js → quiverai-DVSEqJiq.js} +4 -4
  203. package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
  204. package/dist/src/{responses-CGw0DCzh.js → responses-BKP_WYis.js} +16 -12
  205. package/dist/src/{responses-BKqJmhhc.js → responses-CQb1Tj69.js} +16 -12
  206. package/dist/src/{responses-jxdehPkC.js → responses-CgNyTPsY.js} +16 -12
  207. package/dist/src/{responses-tD4Bd4dc.cjs → responses-mo0KQDbu.cjs} +16 -12
  208. package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
  209. package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
  210. package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
  211. package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
  212. package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
  213. package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
  214. package/dist/src/{sagemaker-BK4Zb993.js → sagemaker-CVv8W7so.js} +17 -17
  215. package/dist/src/{sagemaker-D2Q1c-sD.js → sagemaker-CqeASYE5.js} +17 -17
  216. package/dist/src/{sagemaker-BfiWTmvn.js → sagemaker-MUbD5V3v.js} +18 -18
  217. package/dist/src/{sagemaker-CcQHM1jV.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
  218. package/dist/src/{scanner-J8CA3LsV.js → scanner-DVDeUz1r.js} +10 -10
  219. package/dist/src/server/index.js +864 -112
  220. package/dist/src/server-B0Xh1Gx-.js +7 -0
  221. package/dist/src/{server-B0PPuDw-.cjs → server-BtoCXeXI.cjs} +4 -4
  222. package/dist/src/{server-BC7XJFgr.js → server-CP9qKM40.js} +4 -4
  223. package/dist/src/{server-OAs3nBRT.js → server-Cns05F1j.js} +5 -5
  224. package/dist/src/server-DJTKu9IR.cjs +5 -0
  225. package/dist/src/{server-DbFphssR.js → server-DZ9MtCn0.js} +6 -6
  226. package/dist/src/{signal-BOTbd53Z.js → signal-C3ZTsUgi.js} +3 -3
  227. package/dist/src/{slack-DXMKtA-f.js → slack-2sdpGzbt.js} +2 -2
  228. package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
  229. package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
  230. package/dist/src/{slack-DOdy_kyv.js → slack-DCEV-vWP.js} +2 -2
  231. package/dist/src/store-C5u6MgC8.js +6 -0
  232. package/dist/src/{store-BSc-TF2w.cjs → store-CLyU7AtI.cjs} +17 -5
  233. package/dist/src/store-CNHk-De4.cjs +5 -0
  234. package/dist/src/{store-DQLEjuEO.js → store-Cj258DgL.js} +17 -5
  235. package/dist/src/{store-D1tv90v3.js → store-P8OKm19S.js} +17 -5
  236. package/dist/src/{store-Ub2vaGJ1.js → store-VB0GP46K.js} +17 -5
  237. package/dist/src/{tables-xKANLRBD.js → tables-BEIFz2tM.js} +3 -3
  238. package/dist/src/{tables-C7K-XKWp.cjs → tables-BdZQEpRz.cjs} +3 -3
  239. package/dist/src/{tables-D36WTqKX.js → tables-DmzvLbeZ.js} +3 -3
  240. package/dist/src/{tables-5EvT_Bwn.js → tables-kC7R5kiK.js} +3 -3
  241. package/dist/src/{telemetry-C2YDkUQH.js → telemetry-BnH5VJAU.js} +4 -4
  242. package/dist/src/{telemetry-C15ziL8u.js → telemetry-BugWqKiu.js} +4 -4
  243. package/dist/src/{telemetry-DMb2Mpfm.js → telemetry-DPXLd7UE.js} +4 -4
  244. package/dist/src/telemetry-Yig0Tino.js +7 -0
  245. package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
  246. package/dist/src/{telemetry-CbrnxHp_.cjs → telemetry-re627Lre.cjs} +4 -4
  247. package/dist/src/{transcription-CL78qbOU.cjs → transcription-BvtsrzRG.cjs} +13 -13
  248. package/dist/src/{transcription-DAtxHhAM.js → transcription-CaMivnjG.js} +13 -13
  249. package/dist/src/{transcription-QHh3AH6Z.js → transcription-DOMMTu01.js} +14 -14
  250. package/dist/src/{transcription-LNZTNUUL.js → transcription-Hb3VnC4M.js} +13 -13
  251. package/dist/src/{transform-DOcQeLld.cjs → transform-0BwoBsvO.cjs} +19 -5
  252. package/dist/src/{transform-DGxXocjk.js → transform-B2-jIv68.js} +8 -6
  253. package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
  254. package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
  255. package/dist/src/transform-ChNIpHz7.js +6 -0
  256. package/dist/src/{transform-Cgi24fJ7.js → transform-DrleutM3.js} +8 -6
  257. package/dist/src/{transform-DGLazrMm.js → transform-DyDAwEpE.js} +8 -6
  258. package/dist/src/transform-PtQ6rAE3.cjs +5 -0
  259. package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
  260. package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
  261. package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
  262. package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
  263. package/dist/src/{types-CH3Ge2sE.js → types-CIhFeUC4.js} +45 -11
  264. package/dist/src/{types-CN_TZ2GJ.js → types-Cd3ygw8W.js} +45 -11
  265. package/dist/src/{types-LJ0r3wbR.cjs → types-D8cGDZbL.cjs} +46 -12
  266. package/dist/src/{types-CLKiCBW3.js → types-q8GXGF65.js} +45 -11
  267. package/dist/src/{util-CchiqXh_.cjs → util--9u9UVCt.cjs} +3 -3
  268. package/dist/src/{util-5cB-L7U3.js → util-BLvy9qfE.js} +7 -11
  269. package/dist/src/{util-YT5HPZaS.js → util-Bm3E9jpK.js} +7 -11
  270. package/dist/src/{util-6-GqIvzS.js → util-BtoGs5Cb.js} +18 -4
  271. package/dist/src/{util-Db0a0AFH.cjs → util-CFj4YKIn.cjs} +18 -4
  272. package/dist/src/{util-Dlz_Wvgm.js → util-CMMkIxfU.js} +7 -11
  273. package/dist/src/{util-Betm42rL.js → util-CgDCK4KI.js} +18 -4
  274. package/dist/src/{util-Yz-1aEhW.cjs → util-CuLo2pMR.cjs} +7 -11
  275. package/dist/src/{util-C-PPYSMq.js → util-DM2rTn_6.js} +18 -4
  276. package/dist/src/{util-B7T3SiBS.js → util-DMFeUvLz.js} +3 -3
  277. package/dist/src/{util-ZZH-3QZz.js → util-DbVG-yZU.js} +3 -3
  278. package/dist/src/{util-DaWTWKBK.js → util-vNmDL5DT.js} +3 -3
  279. package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
  280. package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
  281. package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
  282. package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
  283. package/dist/tsconfig.tsbuildinfo +1 -1
  284. package/package.json +38 -38
  285. package/dist/src/app/assets/index-BFCZg7hQ.js +0 -439
  286. package/dist/src/app/assets/index-NCn4eVBv.css +0 -1
  287. package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
  288. package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
  289. package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
  290. package/dist/src/cache-Bbn1Nyrd.cjs +0 -5
  291. package/dist/src/cache-BwsMSda7.js +0 -6
  292. package/dist/src/cloud-DmE0EwsY.js +0 -4
  293. package/dist/src/eval-17JizQIv.js +0 -15
  294. package/dist/src/evalResult-Cqj8pldJ.js +0 -12
  295. package/dist/src/evalResult-DvcJAWJU.cjs +0 -10
  296. package/dist/src/evalResult-Hftn-S_i.js +0 -10
  297. package/dist/src/evaluator-B2CFNt-P.js +0 -36
  298. package/dist/src/fetch-KV5kNASw.js +0 -5
  299. package/dist/src/graders-Bu0H9nXi.js +0 -32
  300. package/dist/src/graders-Cfhkvx-e.js +0 -34
  301. package/dist/src/graders-DClJVpGP.cjs +0 -32
  302. package/dist/src/graders-DcnJsrMO.js +0 -32
  303. package/dist/src/providers-C1rOSHiR.js +0 -32
  304. package/dist/src/providers-CxmDwEFf.cjs +0 -31
  305. package/dist/src/providers-Dodakqr0.js +0 -30
  306. package/dist/src/providers-GIQ2TcsA.js +0 -30
  307. package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
  308. package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
  309. package/dist/src/server-B1vi21hA.js +0 -7
  310. package/dist/src/server-Cm9Kai_h.cjs +0 -5
  311. package/dist/src/store-BNmZ1KAz.cjs +0 -5
  312. package/dist/src/store-BltJg2cd.js +0 -6
  313. package/dist/src/telemetry-5BCRNBbe.cjs +0 -5
  314. package/dist/src/telemetry-D4W5hboe.js +0 -7
  315. package/dist/src/transform-DTGDnAzW.js +0 -6
  316. package/dist/src/transform-m3qNw4KP.cjs +0 -5
@@ -2,43 +2,43 @@ Object.defineProperties(exports, {
2
2
  __esModule: { value: true },
3
3
  [Symbol.toStringTag]: { value: "Module" }
4
4
  });
5
- const require_logger = require("./logger-Cp1GPUjj.cjs");
5
+ const require_logger = require("./logger-D5iKBpu_.cjs");
6
6
  const require_invariant = require("./invariant-kfQ8Bu82.cjs");
7
- const require_esm = require("./esm-CnNt7sI4.cjs");
8
- const require_pythonUtils = require("./pythonUtils-CTU3Y3lw.cjs");
7
+ const require_esm = require("./esm-CipptfDu.cjs");
8
+ const require_pythonUtils = require("./pythonUtils-dAVigVK-.cjs");
9
9
  const require_fileExtensions = require("./fileExtensions-bYh77CN8.cjs");
10
- const require_transform = require("./transform-CzK1Q0zl.cjs");
11
- const require_graders = require("./graders-DOXycdlG.cjs");
12
- const require_types = require("./types-LJ0r3wbR.cjs");
13
- const require_util = require("./util-Yz-1aEhW.cjs");
14
- const require_fetch = require("./fetch-BxUk8odA.cjs");
15
- const require_cache = require("./cache-COish3-W.cjs");
16
- const require_providers = require("./providers-CFu-TZl-.cjs");
17
- const require_utils = require("./utils-Cz9qXqII.cjs");
18
- const require_createHash = require("./createHash-CfZSc0b4.cjs");
19
- require("./genaiTracer-DN4dQywX.cjs");
20
- const require_chat = require("./chat-DaqekjFr.cjs");
10
+ const require_transform = require("./transform-ZrG2dvlo.cjs");
11
+ const require_graders = require("./graders-BElhu9ZY.cjs");
12
+ const require_types = require("./types-D8cGDZbL.cjs");
13
+ const require_util = require("./util-CuLo2pMR.cjs");
14
+ const require_fetch = require("./fetch-BnR9wSnm.cjs");
15
+ const require_cache = require("./cache-C5yFZ4gC.cjs");
16
+ const require_providers = require("./providers-CScd1wN6.cjs");
17
+ const require_utils = require("./utils-DKw8mrgr.cjs");
18
+ const require_createHash = require("./createHash-BYwImsYv.cjs");
19
+ require("./genaiTracer-BfxrvSUb.cjs");
20
+ const require_chat = require("./chat-CM8qWR3_.cjs");
21
21
  const require_tokenUsageUtils = require("./tokenUsageUtils-bVa1ga6f.cjs");
22
- const require_transform$1 = require("./transform-DOcQeLld.cjs");
23
- require("./messages-1JrJs91T.cjs");
24
- require("./util-CchiqXh_.cjs");
25
- require("./responses-tD4Bd4dc.cjs");
26
- require("./openai-Cuif0GEt.cjs");
27
- const require_util$2 = require("./util-Db0a0AFH.cjs");
28
- require("./completion-CDOouNzq.cjs");
29
- const require_accounts = require("./accounts-Bx-x3bmW.cjs");
30
- const require_server = require("./server-B0PPuDw-.cjs");
31
- const require_blobs = require("./blobs-DvS-O6be.cjs");
32
- const require_tables = require("./tables-C7K-XKWp.cjs");
33
- const require_extractor = require("./extractor-DX36oYEv.cjs");
34
- const require_telemetry = require("./telemetry-CbrnxHp_.cjs");
22
+ const require_transform$1 = require("./transform-0BwoBsvO.cjs");
23
+ require("./messages-HJsyEh4o.cjs");
24
+ require("./util--9u9UVCt.cjs");
25
+ require("./responses-mo0KQDbu.cjs");
26
+ require("./openai-CoxGAQwn.cjs");
27
+ const require_util$2 = require("./util-CFj4YKIn.cjs");
28
+ require("./completion-DlXUhj5c.cjs");
29
+ const require_accounts = require("./accounts-BPyfpSeU.cjs");
30
+ const require_server = require("./server-BtoCXeXI.cjs");
31
+ const require_blobs = require("./blobs-C6j0bvFz.cjs");
32
+ const require_tables = require("./tables-BdZQEpRz.cjs");
33
+ const require_extractor = require("./extractor-DG3sSfXE.cjs");
34
+ const require_telemetry = require("./telemetry-re627Lre.cjs");
35
35
  const require_text = require("./text-CW1cyrwj.cjs");
36
- const require_store = require("./store-BSc-TF2w.cjs");
37
- require("./base-DBtwl2FR.cjs");
38
- require("./image-DTedmQPg.cjs");
39
- const require_providerRegistry = require("./providerRegistry-Civky8Ar.cjs");
40
- const require_rubyUtils = require("./rubyUtils-DhCAlxZr.cjs");
41
- const require_evalResult = require("./evalResult-Dap2CekP.cjs");
36
+ const require_store = require("./store-CLyU7AtI.cjs");
37
+ require("./base-BboXIF_0.cjs");
38
+ require("./image--F58eEIn.cjs");
39
+ const require_providerRegistry = require("./providerRegistry-BTDgfV5h.cjs");
40
+ const require_rubyUtils = require("./rubyUtils-CGeUtCfW.cjs");
41
+ const require_evalResult = require("./evalResult-DpARzUCb.cjs");
42
42
  let fs = require("fs");
43
43
  fs = require_logger.__toESM(fs);
44
44
  let path = require("path");
@@ -68,6 +68,8 @@ crypto$1 = require_logger.__toESM(crypto$1);
68
68
  let _opentelemetry_api = require("@opentelemetry/api");
69
69
  let _inquirer_input = require("@inquirer/input");
70
70
  _inquirer_input = require_logger.__toESM(_inquirer_input);
71
+ let readline = require("readline");
72
+ readline = require_logger.__toESM(readline);
71
73
  let drizzle_orm = require("drizzle-orm");
72
74
  let cli_progress = require("cli-progress");
73
75
  cli_progress = require_logger.__toESM(cli_progress);
@@ -75,6 +77,7 @@ let jsdom = require("jsdom");
75
77
  let fastest_levenshtein = require("fastest-levenshtein");
76
78
  let js_rouge = require("js-rouge");
77
79
  js_rouge = require_logger.__toESM(js_rouge);
80
+ let node_util = require("node:util");
78
81
  require("debounce");
79
82
  let _opentelemetry_core = require("@opentelemetry/core");
80
83
  let _opentelemetry_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http");
@@ -307,7 +310,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
307
310
  require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
308
311
  try {
309
312
  require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
310
- const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-Diec4cln.cjs"));
313
+ const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DHKqJlsz.cjs"));
311
314
  const port = testSuite.tracing.otlp.http.port || 4318;
312
315
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
313
316
  require_logger.logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
@@ -330,7 +333,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
330
333
  async function stopOtlpReceiverIfNeeded() {
331
334
  if (otlpReceiverStarted) try {
332
335
  require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
333
- const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-Diec4cln.cjs"));
336
+ const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DHKqJlsz.cjs"));
334
337
  await stopOTLPReceiver();
335
338
  otlpReceiverStarted = false;
336
339
  require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -365,7 +368,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
365
368
  }
366
369
  if (!tracingEnabled) return null;
367
370
  require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
368
- const { getTraceStore } = await Promise.resolve().then(() => require("./store-BNmZ1KAz.cjs"));
371
+ const { getTraceStore } = await Promise.resolve().then(() => require("./store-CNHk-De4.cjs"));
369
372
  const traceStore = getTraceStore();
370
373
  const traceId = generateTraceId();
371
374
  const spanId = generateSpanId();
@@ -1398,7 +1401,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
1398
1401
  pass = result !== inverse;
1399
1402
  score = pass ? 1 : 0;
1400
1403
  } else if (typeof result === "number") {
1401
- pass = assertion.threshold !== void 0 ? result >= assertion.threshold : result > 0;
1404
+ pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
1402
1405
  score = result;
1403
1406
  } else if (typeof result === "object") return result;
1404
1407
  else throw new Error("Custom function must return a boolean or number");
@@ -1431,7 +1434,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
1431
1434
  } catch {
1432
1435
  pass = inverse;
1433
1436
  }
1434
- if (pass && renderedValue) {
1437
+ if (parsedJson !== void 0 && renderedValue) {
1435
1438
  let validate;
1436
1439
  if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
1437
1440
  const schema = valueFromScript;
@@ -1443,11 +1446,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
1443
1446
  }
1444
1447
  else if (typeof renderedValue === "object") validate = require_logger.getAjv().compile(renderedValue);
1445
1448
  else throw new Error("is-json assertion must have a string or object value");
1446
- pass = validate(parsedJson);
1449
+ const valid = validate(parsedJson);
1450
+ pass = inverse ? !valid : valid;
1447
1451
  if (!pass) return {
1448
1452
  pass,
1449
1453
  score: 0,
1450
- reason: `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`,
1454
+ reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`,
1451
1455
  assertion
1452
1456
  };
1453
1457
  }
@@ -1474,9 +1478,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
1474
1478
  }
1475
1479
  else if (typeof renderedValue === "object") validate = require_logger.getAjv().compile(renderedValue);
1476
1480
  else throw new Error("contains-json assertion must have a string or object value");
1477
- pass = validate(jsonObject);
1478
- if (pass) break;
1479
- else errorMessage = `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`;
1481
+ const valid = validate(jsonObject);
1482
+ pass = inverse ? !valid : valid;
1483
+ if (valid) {
1484
+ if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
1485
+ break;
1486
+ } else errorMessage = `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`;
1480
1487
  }
1481
1488
  return {
1482
1489
  pass,
@@ -1660,7 +1667,7 @@ function handlePerplexity({ logProbs, assertion }) {
1660
1667
  if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
1661
1668
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
1662
1669
  const perplexity = Math.exp(-avgLogProb);
1663
- const pass = assertion.threshold !== void 0 ? perplexity <= assertion.threshold : true;
1670
+ const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
1664
1671
  return {
1665
1672
  pass,
1666
1673
  score: pass ? 1 : 0,
@@ -1672,7 +1679,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
1672
1679
  if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
1673
1680
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
1674
1681
  const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
1675
- const pass = assertion.threshold !== void 0 ? perplexityNorm >= assertion.threshold : true;
1682
+ const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
1676
1683
  return {
1677
1684
  pass,
1678
1685
  score: perplexityNorm,
@@ -1787,7 +1794,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
1787
1794
  } else {
1788
1795
  score = Number.parseFloat(String(result));
1789
1796
  if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
1790
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
1797
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
1791
1798
  }
1792
1799
  } catch (err) {
1793
1800
  return {
@@ -2048,7 +2055,7 @@ end
2048
2055
  } else {
2049
2056
  score = Number.parseFloat(String(result));
2050
2057
  if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
2051
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
2058
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
2052
2059
  }
2053
2060
  } catch (err) {
2054
2061
  return {
@@ -2119,6 +2126,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
2119
2126
  };
2120
2127
  };
2121
2128
  //#endregion
2129
+ //#region src/assertions/traceUtils.ts
2130
+ /**
2131
+ * Shared utilities for trace assertions
2132
+ */
2133
/**
 * Match a span name against a glob-like pattern.
 * Supports * (any characters, including line breaks) and ? (single character) wildcards.
 * Matching is case-insensitive and anchored to the whole name.
 *
 * @param spanName - The span name to match
 * @param pattern - The glob pattern to match against
 * @returns true if the span name matches the pattern
 */
function matchesPattern(spanName, pattern) {
  // Escape every regex metacharacter except the two glob wildcards, then
  // translate the wildcards into their regex equivalents.
  const regexPattern = pattern
    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
    .replace(/\*/g, ".*")
    .replace(/\?/g, ".");
  // "s" (dotAll) lets the wildcards span line breaks, so a glob such as
  // "git*" still matches a multi-line value; without it "." stops at "\n".
  return new RegExp(`^${regexPattern}$`, "is").test(spanName);
}
2145
+ //#endregion
2146
+ //#region src/assertions/skill.ts
2147
/**
 * Collect the skill calls reported in `params.providerResponse.metadata.skillCalls`.
 *
 * @param params - Assertion parameters carrying the provider response
 * @returns the entries that are objects with a string `name`; an empty array
 *   when the metadata field is missing or not an array
 */
function getSkillCalls(params) {
  const reported = params.providerResponse?.metadata?.skillCalls;
  if (!Array.isArray(reported)) {
    return [];
  }
  const usable = [];
  for (const candidate of reported) {
    // Drop anything that is not a non-null object with a string name.
    if (candidate && typeof candidate === "object" && typeof candidate.name === "string") {
      usable.push(candidate);
    }
  }
  return usable;
}
2152
/**
 * Decide whether a skill call satisfies a matcher.
 * A matcher may carry an exact `name`, a glob `pattern`, or both; every
 * criterion that is present must hold.
 *
 * @param skillCall - Skill call with a `name`
 * @param matcher - Matcher with optional `name` and `pattern`
 * @returns true when all present criteria match
 */
function matchesSkill(skillCall, matcher) {
  const nameMismatch = Boolean(matcher.name) && skillCall.name !== matcher.name;
  if (nameMismatch) {
    return false;
  }
  return !matcher.pattern || matchesPattern(skillCall.name, matcher.pattern);
}
2157
/**
 * Render a skill call as a label: the name, followed by its `source` and
 * `path` in parentheses when either is present.
 *
 * @param skillCall - Skill call with `name` and optional `source` / `path`
 * @returns e.g. `name (source, path)` or just `name`
 */
function formatSkillCall(skillCall) {
  const { name, source, path } = skillCall;
  const extras = [];
  if (source) {
    extras.push(source);
  }
  if (path) {
    extras.push(path);
  }
  const detailText = extras.join(", ");
  return detailText ? `${name} (${detailText})` : name;
}
2161
/**
 * Normalize the value of a skill-used assertion.
 *
 * Accepted shapes:
 *  - a non-blank string              -> `{ kind: "list", matchers: [{ name }] }`
 *  - a non-empty array of non-blank strings -> `{ kind: "list", matchers }`
 *  - an object with `name` and/or `pattern`, plus optional `min` / `max`
 *                                    -> `{ kind: "count", matcher }`
 *
 * @param value - Raw assertion value
 * @returns the normalized matcher description
 * @throws Error when the value has an unsupported shape, the object names
 *   neither `name` nor `pattern`, a present `min`/`max` is not a non-negative
 *   integer, or `max < min`
 */
function resolveSkillMatchers(value) {
  const trimmedOrUndefined = (text) => (typeof text === "string" ? text.trim() : void 0);
  const isNonBlankString = (item) => typeof item === "string" && item.trim() !== "";
  const assertCount = (field, count) => {
    // Number.isInteger already rejects NaN, Infinity and non-numbers.
    const acceptable = Number.isInteger(count) && count >= 0;
    if (!acceptable) {
      throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
    }
  };

  if (isNonBlankString(value)) {
    return {
      kind: "list",
      matchers: [{ name: value.trim() }]
    };
  }

  if (Array.isArray(value) && value.length > 0 && value.every(isNonBlankString)) {
    return {
      kind: "list",
      matchers: value.map((item) => ({ name: item.trim() }))
    };
  }

  const isPlainObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isPlainObject) {
    throw new Error("skill-used assertion must have a string, string array, or object value");
  }

  const name = trimmedOrUndefined(value.name);
  const pattern = trimmedOrUndefined(value.pattern);
  if (!name && !pattern) {
    throw new Error("skill-used assertion object must include a name or pattern property");
  }
  // Bounds are validated whenever the key is present, even if its value is undefined.
  if ("min" in value) {
    assertCount("min", value.min);
  }
  if ("max" in value) {
    assertCount("max", value.max);
  }
  const min = typeof value.min === "number" ? value.min : void 0;
  const max = typeof value.max === "number" ? value.max : void 0;
  if (min !== void 0 && max !== void 0 && max < min) {
    throw new Error("skill-used assertion object max must be greater than or equal to min");
  }
  return {
    kind: "count",
    matcher: {
      max,
      min,
      name,
      pattern
    }
  };
}
2195
/**
 * Grade a list-style skill-used assertion.
 * Normal mode passes when every expected skill was used; inverse mode passes
 * when none of them was used.
 *
 * @param params - Assertion parameters (`inverse`, `assertion`)
 * @param skillCalls - Skill calls observed in the response
 * @param actualSkills - Pre-formatted labels of the observed skill calls
 * @param expected - `{ kind: "list", matchers }` as produced by resolveSkillMatchers
 * @returns grading result `{ pass, score, reason, assertion }`
 */
function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
  // Partition the expected matchers in a single pass.
  const matched = [];
  const missing = [];
  for (const matcher of expected.matchers) {
    const wasUsed = skillCalls.some((skillCall) => matchesSkill(skillCall, matcher));
    (wasUsed ? matched : missing).push(matcher);
  }

  const namesOf = (matchers) => matchers.map((matcher) => matcher.name).join(", ");
  const expectedNames = namesOf(expected.matchers);
  const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";

  let pass;
  let reason;
  if (params.inverse) {
    pass = matched.length === 0;
    reason = pass
      ? `Forbidden skill(s) were not used: ${expectedNames}`
      : `Forbidden skill(s) were used: ${namesOf(matched)}. Actual skills: ${actualSummary}`;
  } else {
    pass = missing.length === 0;
    reason = pass
      ? `Observed required skill(s): ${expectedNames}. Actual skills: ${actualSummary}`
      : `Missing required skill(s): ${namesOf(missing)}. Actual skills: ${actualSummary}`;
  }
  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
}
2212
/**
 * Grade a count-style skill-used assertion.
 *
 * Normal mode: the number of matching calls must lie in `[min, max]`, where
 * `min` defaults to 0 when only `max` is given and to 1 otherwise.
 * Inverse mode: only "no bounds" or `max: 0` is accepted, and the assertion
 * passes when no call matches.
 *
 * @param params - Assertion parameters (`inverse`, `assertion`)
 * @param skillCalls - Skill calls observed in the response
 * @param actualSkills - Pre-formatted labels of the observed skill calls
 * @param matcher - Count matcher with optional `name`, `pattern`, `min`, `max`
 * @returns grading result `{ pass, score, reason, assertion }`
 * @throws Error in inverse mode when `min` is set or `max` is set to a non-zero value
 */
function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
  const { min: explicitMin, max } = matcher;
  const hasExplicitMin = explicitMin !== void 0;
  const hasExplicitMax = max !== void 0;
  const min = explicitMin ?? (hasExplicitMax ? 0 : 1);
  const matchingSkillCalls = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
  const count = matchingSkillCalls.length;
  const matcherLabel = matcher.pattern || matcher.name || "*";
  const finish = (pass, reason) => ({
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  });

  if (params.inverse) {
    const hasUnsupportedBounds = hasExplicitMin || (hasExplicitMax && max !== 0);
    if (hasUnsupportedBounds) {
      throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
    }
    const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
    if (count === 0) {
      return finish(true, `Forbidden skill "${matcherLabel}" was not used. Actual skills: ${actualSummary}`);
    }
    return finish(false, `Forbidden skill "${matcherLabel}" was used ${count} time(s). Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`);
  }

  const withinBounds = count >= min && (max === void 0 || count <= max);
  const expectation = max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
  const matchList = count > 0 ? `. Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}` : "";
  return finish(withinBounds, `Matched skill "${matcherLabel}" ${count} time(s)${expectation}${matchList}`);
}
2242
/**
 * Entry point for the skill-used assertion: gathers the observed skill calls,
 * normalizes the expected value (rendered value first, raw assertion value as
 * fallback) and dispatches to the list or count grader.
 *
 * @param params - Assertion parameters
 * @returns grading result from the selected grader
 */
function handleSkillUsed(params) {
  const skillCalls = getSkillCalls(params);
  const actualSkills = skillCalls.map(formatSkillCall);
  const expected = resolveSkillMatchers(params.renderedValue ?? params.assertion.value);
  switch (expected.kind) {
    case "list":
      return handleListSkillAssertion(params, skillCalls, actualSkills, expected);
    default:
      return handleCountSkillAssertion(params, skillCalls, actualSkills, expected.matcher);
  }
}
2249
+ //#endregion
2122
2250
  //#region src/assertions/sql.ts
2123
2251
  const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
2124
2252
  let pass = false;
@@ -2351,23 +2479,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
2351
2479
  };
2352
2480
  };
2353
2481
  //#endregion
2354
- //#region src/assertions/traceUtils.ts
2355
- /**
2356
- * Shared utilities for trace assertions
2357
- */
2358
- /**
2359
- * Match a span name against a glob-like pattern.
2360
- * Supports * (any characters) and ? (single character) wildcards.
2361
- *
2362
- * @param spanName - The span name to match
2363
- * @param pattern - The glob pattern to match against
2364
- * @returns true if the span name matches the pattern
2365
- */
2366
- function matchesPattern(spanName, pattern) {
2367
- const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
2368
- return new RegExp(`^${regexPattern}$`, "i").test(spanName);
2369
- }
2370
- //#endregion
2371
2482
  //#region src/assertions/traceErrorSpans.ts
2372
2483
  function isErrorSpan(span) {
2373
2484
  if (span.statusCode && span.statusCode >= 400) return true;
@@ -2536,6 +2647,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
2536
2647
  };
2537
2648
  };
2538
2649
  //#endregion
2650
+ //#region src/assertions/trajectoryUtils.ts
2651
// Span attribute keys checked, in this order, for a tool name.
const TOOL_ATTRIBUTE_KEYS = [
  "tool.name", "tool_name", "tool",
  "function.name", "function_name",
  "gen_ai.tool.name",
  "codex.mcp.tool",
  "agent.tool", "agent.tool_name", "agent.toolName"
];
// Span attribute keys checked, in this order, for tool call arguments.
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
  "tool.arguments", "tool.args", "tool.input",
  "tool_arguments", "tool_args", "tool_input",
  "function.arguments", "function.args", "function.input",
  "function_arguments", "function_args",
  "gen_ai.tool.arguments", "gen_ai.tool.args", "gen_ai.tool.input",
  "gen_ai.tool.call.arguments", "gen_ai.tool.call.args",
  "agent.tool.arguments", "agent.tool.args", "agent.tool.input",
  "codex.mcp.arguments", "codex.mcp.args", "codex.mcp.input",
  "arguments", "args", "input"
];
// Span attribute keys checked, in this order, for an executed command.
const COMMAND_ATTRIBUTE_KEYS = ["codex.command", "command", "command.name", "command_name"];
// Span attribute keys that explicitly carry a search query.
const SEARCH_ATTRIBUTE_KEYS = ["codex.search.query", "search.query", "search_query"];
// Generic query key; only treated as a search query on search-like spans.
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
// Matches span names containing search/find/lookup/retrieve/retrieval as a
// delimited word (delimiters: whitespace . _ : / -), case-insensitively.
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
// Judge summaries longer than MAX keep HEAD leading and TAIL trailing steps.
const MAX_JUDGE_SUMMARY_STEPS = 24;
const JUDGE_SUMMARY_HEAD_STEPS = 12;
const JUDGE_SUMMARY_TAIL_STEPS = 12;
2706
/**
 * Return the first non-blank string attribute found under the given keys.
 *
 * @param attributes - Attribute map to read from
 * @param keys - Candidate keys, tried in order
 * @returns the trimmed string value, or undefined when no key holds one
 */
function getStringAttribute(attributes, keys) {
  for (const key of keys) {
    const candidate = attributes[key];
    if (typeof candidate !== "string") {
      continue;
    }
    const trimmed = candidate.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return void 0;
}
2712
/**
 * Normalize an attribute value that may hold structured data.
 * Strings are trimmed and parsed as JSON when possible (falling back to the
 * trimmed text); numbers, booleans and non-null objects pass through.
 *
 * @param value - Raw attribute value
 * @returns the normalized value, or undefined for null/undefined, blank
 *   strings and unsupported types
 */
function normalizeStructuredAttribute(value) {
  switch (typeof value) {
    case "string": {
      const text = value.trim();
      if (text === "") {
        return void 0;
      }
      try {
        return JSON.parse(text);
      } catch {
        // Not JSON: keep the plain text.
        return text;
      }
    }
    case "number":
    case "boolean":
      return value;
    case "object":
      return value === null ? void 0 : value;
    default:
      return void 0;
  }
}
2725
/**
 * Compare two optional status objects by `code` and `message`.
 * A missing status compares equal to another missing status.
 *
 * @param left - First status (may be undefined)
 * @param right - Second status (may be undefined)
 * @returns true when both code and message are strictly equal
 */
function hasSameStatus(left, right) {
  if (left?.code !== right?.code) {
    return false;
  }
  return left?.message === right?.message;
}
2728
/**
 * Heuristic: does this span look like a search operation?
 * True when the span name matches SEARCH_SPAN_NAME_PATTERN or starts with
 * "search ", or when any attribute key other than "query" contains
 * search/lookup/retrieve/retrieval as a dot- or underscore-delimited segment.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true when the span is search-like
 */
function isSearchLikeSpan(span) {
  const nameLooksLikeSearch = SEARCH_SPAN_NAME_PATTERN.test(span.name) || span.name.startsWith("search ");
  if (nameLooksLikeSearch) {
    return true;
  }
  const searchKeySegment = /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i;
  for (const key of Object.keys(span.attributes || {})) {
    if (key !== "query" && searchKeySegment.test(key)) {
      return true;
    }
  }
  return false;
}
2733
/**
 * Build a compact status object for a trajectory step.
 *
 * @param step - Step with optional `statusCode` and `statusMessage`
 * @returns `{ code, message? }`, or undefined when the status code is
 *   undefined or 0
 */
function getTrajectoryStepStatus(step) {
  const { statusCode, statusMessage } = step;
  if (statusCode === void 0 || statusCode === 0) {
    return void 0;
  }
  const status = { code: statusCode };
  if (statusMessage) {
    status.message = statusMessage;
  }
  return status;
}
2740
/**
 * Extract the first whitespace-delimited token of a command line.
 *
 * @param command - Full command string
 * @returns the first token, or undefined when the command is blank
 */
function getCommandExecutable(command) {
  const [firstToken] = command.trim().split(/\s+/);
  return firstToken || void 0;
}
2743
/**
 * Determine the tool name a span refers to, if any.
 * Resolution order:
 *  1. the well-known keys in TOOL_ATTRIBUTE_KEYS;
 *  2. the first non-blank string attribute whose key looks like a tool or
 *     function name, or contains "tool" as a delimited segment without
 *     mentioning result/output;
 *  3. for span names starting with "mcp ", the text after the last "/".
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns the trimmed tool name, or undefined when none is found
 */
function extractToolName(span) {
  const attributes = span.attributes || {};
  const known = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
  if (known) {
    return known;
  }

  const nameLikeKey = /tool.?name|function.?name/i;
  const toolSegmentKey = /(^|[._])tool($|[._])/i;
  const resultLikeKey = /result|output/i;
  const inferred = Object.entries(attributes).find(([key, value]) => {
    if (typeof value !== "string" || !value.trim()) {
      return false;
    }
    return nameLikeKey.test(key) || (toolSegmentKey.test(key) && !resultLikeKey.test(key));
  });
  if (inferred) {
    return inferred[1].trim();
  }

  if (!span.name.startsWith("mcp ")) {
    return void 0;
  }
  const slashIndex = span.name.lastIndexOf("/");
  const hasTextAfterSlash = slashIndex !== -1 && slashIndex < span.name.length - 1;
  return hasTextAfterSlash ? span.name.slice(slashIndex + 1).trim() : void 0;
}
2757
/**
 * Find the arguments of a tool call recorded on a span.
 * The well-known keys in TOOL_ARGUMENT_ATTRIBUTE_KEYS are tried first; after
 * that, any attribute whose key contains arguments/args/input as a delimited
 * segment and does not mention result/output/error/status.
 *
 * @param span - Span with optional `attributes`
 * @returns the first value that normalizes to something other than
 *   undefined, or undefined when none does
 */
function extractToolArgs(span) {
  const attributes = span.attributes || {};
  for (const knownKey of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
    const parsed = normalizeStructuredAttribute(attributes[knownKey]);
    if (parsed !== void 0) {
      return parsed;
    }
  }

  const excludedKey = /result|output|error|status/i;
  const argumentLikeKey = /(^|[._])(arguments|args|input)($|[._])/i;
  for (const [key, rawValue] of Object.entries(attributes)) {
    const isCandidate = !excludedKey.test(key) && argumentLikeKey.test(key);
    if (!isCandidate) {
      continue;
    }
    const parsed = normalizeStructuredAttribute(rawValue);
    if (parsed !== void 0) {
      return parsed;
    }
  }
  return void 0;
}
2770
/**
 * Determine the command a span executed, if any.
 * Resolution order: the keys in COMMAND_ATTRIBUTE_KEYS, then the first
 * non-blank string attribute whose key mentions "command" but not
 * output/result, then the remainder of a span name starting with "exec ".
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns the trimmed command, or undefined when none is found
 */
function extractCommand(span) {
  const attributes = span.attributes || {};
  const known = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
  if (known) {
    return known;
  }

  const inferred = Object.entries(attributes).find(([key, value]) => {
    if (typeof value !== "string" || !value.trim()) {
      return false;
    }
    return /command/i.test(key) && !/output|result/i.test(key);
  });
  if (inferred) {
    return inferred[1].trim();
  }

  const execPrefix = "exec ";
  return span.name.startsWith(execPrefix) ? span.name.slice(execPrefix.length).trim() : void 0;
}
2780
/**
 * Determine the search query a span represents, if any.
 * Resolution order: the keys in SEARCH_ATTRIBUTE_KEYS, then a generic "query"
 * attribute when the span is search-like, then the remainder of a span name
 * starting with "search " (one leading and one trailing double quote removed).
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns the query text, or undefined when none is found
 */
function extractSearchQuery(span) {
  const attributes = span.attributes || {};
  const explicit = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
  if (explicit) {
    return explicit;
  }
  const generic = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
  if (generic && isSearchLikeSpan(span)) {
    return generic;
  }
  const searchPrefix = "search ";
  if (!span.name.startsWith(searchPrefix)) {
    return void 0;
  }
  return span.name.slice(searchPrefix.length).replace(/^"|"$/g, "").trim();
}
2788
/**
 * Is this span a reasoning step?
 * True when the "codex.item.type" attribute equals "reasoning", or when the
 * span name is "reasoning" (any case), optionally followed by "_" or whitespace.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true for reasoning spans
 */
function isReasoningSpan(span) {
  const itemType = (span.attributes || {})["codex.item.type"];
  if (itemType === "reasoning") {
    return true;
  }
  // The "$" alternative already covers the bare name "reasoning".
  return /^reasoning([_\s]|$)/i.test(span.name);
}
2792
/**
 * Is this span an agent message?
 * True when the "codex.item.type" attribute equals "agent_message", or when
 * the span name is exactly "agent response" or "send input".
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true for message spans
 */
function isMessageSpan(span) {
  const itemType = (span.attributes || {})["codex.item.type"];
  if (itemType === "agent_message") {
    return true;
  }
  switch (span.name) {
    case "agent response":
    case "send input":
      return true;
    default:
      return false;
  }
}
2796
/**
 * Convert a trace's spans into an ordered list of trajectory steps.
 *
 * Spans are ordered by start time, then end time (start time when the end is
 * missing), then original position. Each span is classified by the first rule
 * that applies: tool, command, search, reasoning, message, otherwise "span".
 *
 * @param trace - Trace with an optional `spans` collection
 * @returns one step per span, carrying `aliases`, optional `args`,
 *   `attributes`, timing, `name`, `spanId`, `spanName`, status fields and `type`
 */
function extractTrajectorySteps(trace) {
  const indexedSpans = Array.from(trace.spans || [], (span, index) => ({ span, index }));
  indexedSpans.sort((left, right) => {
    const startDelta = left.span.startTime - right.span.startTime;
    if (startDelta !== 0) {
      return startDelta;
    }
    const leftEnd = left.span.endTime ?? left.span.startTime;
    const rightEnd = right.span.endTime ?? right.span.startTime;
    const endDelta = leftEnd - rightEnd;
    // The original position is the final tie-breaker.
    return endDelta !== 0 ? endDelta : left.index - right.index;
  });

  return indexedSpans.map(({ span }) => {
    const toolName = extractToolName(span);
    const command = extractCommand(span);
    const searchQuery = extractSearchQuery(span);

    // The raw span name is always an alias, so matchers can target it directly.
    const aliases = new Set([span.name]);
    let type = "span";
    let name = span.name;
    let args;

    if (toolName) {
      type = "tool";
      name = toolName;
      aliases.add(toolName);
      args = extractToolArgs(span);
    } else if (command) {
      type = "command";
      name = command;
      aliases.add(command);
      const executable = getCommandExecutable(command);
      if (executable) {
        aliases.add(executable);
      }
    } else if (searchQuery) {
      type = "search";
      name = searchQuery;
      aliases.add(searchQuery);
    } else if (isReasoningSpan(span)) {
      type = "reasoning";
      aliases.add("reasoning");
    } else if (isMessageSpan(span)) {
      type = "message";
      aliases.add("message");
    }

    return {
      aliases: [...aliases],
      ...(args === void 0 ? {} : { args }),
      attributes: span.attributes || {},
      endTime: span.endTime,
      name,
      spanId: span.spanId,
      spanName: span.name,
      startTime: span.startTime,
      statusCode: span.statusCode,
      statusMessage: span.statusMessage,
      type
    };
  });
}
2853
/**
 * Turn a matcher given as a string or object into object form.
 * A string becomes `{ pattern }`; an object is shallow-copied. In both cases
 * `defaultType` fills in `type` when the matcher does not already have one.
 *
 * @param matcher - Glob string or matcher object
 * @param defaultType - Optional step type used when the matcher has none
 * @returns a new matcher object
 */
function normalizeTrajectoryMatcher(matcher, defaultType) {
  if (typeof matcher === "string") {
    const fromString = { pattern: matcher };
    if (defaultType) {
      fromString.type = defaultType;
    }
    return fromString;
  }
  const copy = { ...matcher };
  if (!matcher.type && defaultType) {
    copy.type = defaultType;
  }
  return copy;
}
2863
/**
 * Check whether a trajectory step satisfies a matcher.
 * When the matcher has a type (one value or an array), the step's type must
 * be among them. When it has a pattern or name, at least one of the step's
 * aliases must match it as a glob; with neither, any step passes.
 *
 * @param step - Trajectory step with `type` and `aliases`
 * @param matcher - Glob string or matcher object
 * @param defaultType - Optional type applied when the matcher has none
 * @returns true when the step matches
 */
function matchesTrajectoryStep(step, matcher, defaultType) {
  const normalized = normalizeTrajectoryMatcher(matcher, defaultType);
  if (normalized.type) {
    const allowedTypes = Array.isArray(normalized.type) ? normalized.type : [normalized.type];
    if (!allowedTypes.includes(step.type)) {
      return false;
    }
  }
  const glob = normalized.pattern || normalized.name;
  return !glob || step.aliases.some((alias) => matchesPattern(alias, glob));
}
2872
/**
 * Render a trajectory step as `type:name`.
 *
 * @param step - Trajectory step with `type` and `name`
 * @returns the formatted label
 */
function formatTrajectoryStep(step) {
  const { type: kind, name: label } = step;
  return `${kind}:${label}`;
}
2875
/**
 * Render tool arguments for use in a reason message.
 *
 * @param args - Arguments value of any type
 * @returns "(none)" for undefined; otherwise the JSON form, falling back to
 *   `String(args)` when JSON serialization throws or yields undefined
 */
function formatTrajectoryArgs(args) {
  if (args === void 0) {
    return "(none)";
  }
  let json;
  try {
    json = JSON.stringify(args);
  } catch {
    // Values JSON cannot serialize fall back to String() below.
    json = void 0;
  }
  return json === void 0 ? String(args) : json;
}
2883
/**
 * Collapse runs of consecutive, equivalent steps into a single entry.
 * Two steps are equivalent when their `type`, `name`, `spanName` and status
 * are all equal. The surviving entry gets a `collapsedCount` holding the
 * length of the run.
 *
 * The input steps are never modified: each surviving entry is a shallow copy,
 * so `collapsedCount` is written to the copy rather than to the caller's object.
 *
 * @param steps - Judge summary steps, in order
 * @returns a new array of new step objects with consecutive duplicates merged
 */
function compactJudgeTrajectorySteps(steps) {
  const compacted = [];
  for (const step of steps) {
    const previous = compacted[compacted.length - 1];
    const repeatsPrevious =
      previous !== void 0 &&
      previous.type === step.type &&
      previous.name === step.name &&
      previous.spanName === step.spanName &&
      hasSameStatus(previous.status, step.status);
    if (repeatsPrevious) {
      // The first step of a run counts as 1, so the second brings it to 2.
      previous.collapsedCount = (previous.collapsedCount ?? 1) + 1;
      continue;
    }
    compacted.push({ ...step });
  }
  return compacted;
}
2895
/**
 * Cap a judge summary at MAX_JUDGE_SUMMARY_STEPS entries.
 * Longer lists keep the first JUDGE_SUMMARY_HEAD_STEPS and the last
 * JUDGE_SUMMARY_TAIL_STEPS steps, with an `{ omittedCount }` marker between.
 *
 * @param steps - Compacted judge summary steps
 * @returns the same array when it is short enough, otherwise a new truncated array
 */
function truncateJudgeTrajectorySteps(steps) {
  const overflow = steps.length - MAX_JUDGE_SUMMARY_STEPS;
  if (overflow <= 0) {
    return steps;
  }
  const head = steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS);
  const tail = steps.slice(-JUDGE_SUMMARY_TAIL_STEPS);
  return [...head, { omittedCount: overflow }, ...tail];
}
2903
/**
 * Produce a pretty-printed JSON summary of a trace's trajectory.
 * Steps are numbered from 1, reduced to type/name (plus `spanName` when it
 * differs from the name, and `status` when one exists), compacted, and then
 * truncated.
 *
 * @param trace - Trace with `traceId` and spans
 * @returns JSON string with `traceId`, `stepCount`, `compactedStepCount` and `steps`
 */
function summarizeTrajectoryForJudge(trace) {
  const rawSteps = extractTrajectorySteps(trace).map((step, position) => {
    const summary = {
      index: position + 1,
      type: step.type,
      name: step.name
    };
    if (step.spanName !== step.name) {
      summary.spanName = step.spanName;
    }
    const status = getTrajectoryStepStatus(step);
    if (status) {
      summary.status = status;
    }
    return summary;
  });
  const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
  const payload = {
    traceId: trace.traceId,
    stepCount: rawSteps.length,
    compactedStepCount: compactedSteps.length,
    steps: truncateJudgeTrajectorySteps(compactedSteps)
  };
  return JSON.stringify(payload, null, 2);
}
2920
+ //#endregion
2921
+ //#region src/assertions/trajectory.ts
2922
/**
 * Fetch the trace from the assertion context, insisting that it has spans.
 *
 * @param params - Assertion parameters with `assertionValueContext` and `baseType`
 * @returns the trace
 * @throws Error when the trace or its `spans` is missing
 */
function getTraceOrThrow(params) {
  const trace = params.assertionValueContext.trace;
  if (trace?.spans) {
    return trace;
  }
  throw new Error(`No trace data available for ${params.baseType} assertion`);
}
2927
/**
 * Flip a pass value when the assertion is inverted.
 *
 * @param pass - Result of the non-inverted check
 * @param inverse - Whether the assertion is negated
 * @returns `!pass` when inverse is truthy, otherwise `pass` unchanged
 */
function applyInverse(pass, inverse) {
  if (inverse) {
    return !pass;
  }
  return pass;
}
2930
/**
 * Join step labels for a reason message.
 *
 * @param stepLabels - Array of formatted step labels
 * @returns the labels joined with ", ", or "(none)" for an empty array
 */
function formatStepList(stepLabels) {
  if (stepLabels.length > 0) {
    return stepLabels.join(", ");
  }
  return "(none)";
}
2933
/**
 * Ensure a matcher identifies what to match via `pattern` or `name`.
 *
 * @param matcher - Normalized matcher object
 * @param assertionType - Assertion type used as the error message prefix
 * @param index - Optional zero-based step index; when given, the error refers
 *   to "step N" (one-based) instead of "object"
 * @throws Error when the matcher has neither a pattern nor a name
 */
function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
  const isNamed = Boolean(matcher.pattern || matcher.name);
  if (isNamed) {
    return;
  }
  let stepLabel = "object";
  if (index !== void 0) {
    stepLabel = `step ${index + 1}`;
  }
  throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
}
2938
/**
 * Normalize the value of a trajectory:goal-success assertion.
 * Accepts a non-blank string, or a non-array object whose `goal` is a
 * non-blank string.
 *
 * @param value - Raw assertion value
 * @returns `{ goal }` with the goal trimmed
 * @throws Error when no non-blank goal can be extracted
 */
function resolveGoalSuccessValue(value) {
  let candidate;
  if (typeof value === "string") {
    candidate = value;
  } else if (value && typeof value === "object" && !Array.isArray(value)) {
    candidate = value.goal;
  }
  const goal = typeof candidate === "string" ? candidate.trim() : "";
  if (!goal) {
    throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
  }
  return { goal };
}
2943
/**
 * Normalize the value of a trajectory:tool-used assertion.
 *
 * Accepted shapes:
 *  - a string           -> `{ kind: "list", matchers: [matcher] }`
 *  - an array of strings -> `{ kind: "list", matchers }`
 *  - an object          -> `{ kind: "count", matcher }`, where `min` / `max`
 *    are kept only when they are numbers
 * Every matcher defaults to type "tool".
 *
 * @param value - Raw assertion value
 * @returns the normalized matcher description
 * @throws Error for any other shape
 */
function resolveToolMatchers(value) {
  const asToolMatcher = (item) => normalizeTrajectoryMatcher(item, "tool");
  const numberOrUndefined = (candidate) => (typeof candidate === "number" ? candidate : void 0);

  if (typeof value === "string") {
    return {
      kind: "list",
      matchers: [asToolMatcher(value)]
    };
  }
  if (Array.isArray(value)) {
    const allStrings = value.every((item) => typeof item === "string");
    if (allStrings) {
      return {
        kind: "list",
        matchers: value.map(asToolMatcher)
      };
    }
  } else if (value && typeof value === "object") {
    return {
      kind: "count",
      matcher: {
        ...asToolMatcher(value),
        max: numberOrUndefined(value.max),
        min: numberOrUndefined(value.min)
      }
    };
  }
  throw new Error("trajectory:tool-used assertion must have a string, string array, or object value");
}
2962
/**
 * Grade a trajectory:tool-used assertion against the tool steps of the trace.
 *
 * List form: passes when every expected tool was used (inverse: when none was).
 * Count form: passes when the number of matching tool steps lies in
 * `[min, max]` with `min` defaulting to 1 (inverse: when it does not).
 *
 * @param params - Assertion parameters
 * @returns grading result `{ pass, score, reason, assertion }`
 * @throws Error when the list is empty or the count matcher has neither a
 *   name nor a pattern
 */
const handleTrajectoryToolUsed = (params) => {
  const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
  const labelOf = (matcher) => matcher.pattern || matcher.name || "*";
  const finish = (pass, reason) => ({
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  });

  if (expected.kind === "list") {
    if (expected.matchers.length === 0) {
      throw new Error("trajectory:tool-used assertion requires at least one expected tool");
    }
    // Partition the expected matchers in a single pass.
    const matched = [];
    const missing = [];
    for (const matcher of expected.matchers) {
      const wasUsed = toolSteps.some((step) => matchesTrajectoryStep(step, matcher));
      (wasUsed ? matched : missing).push(matcher);
    }
    const labelsOf = (matchers) => matchers.map(labelOf).join(", ");
    const actualTools = toolSteps.map(formatTrajectoryStep);
    const expectedLabels = labelsOf(expected.matchers);

    if (params.inverse) {
      const pass = matched.length === 0;
      return finish(pass, pass
        ? `Forbidden tool(s) were not used: ${expectedLabels}`
        : `Forbidden tool(s) were used: ${labelsOf(matched)}. Actual tools: ${formatStepList(actualTools)}`);
    }
    const pass = missing.length === 0;
    return finish(pass, pass
      ? `Observed required tool(s): ${expectedLabels}. Actual tools: ${formatStepList(actualTools)}`
      : `Missing required tool(s): ${labelsOf(missing)}. Actual tools: ${formatStepList(actualTools)}`);
  }

  const matcher = expected.matcher;
  const min = matcher.min ?? 1;
  const max = matcher.max;
  if (!matcher.pattern && !matcher.name) {
    throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
  }
  const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
  const count = matchingSteps.length;
  const withinBounds = count >= min && (max === void 0 || count <= max);
  const matcherLabel = labelOf(matcher);

  if (params.inverse) {
    return finish(!withinBounds, withinBounds
      ? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion`
      : `Tool "${matcherLabel}" did not satisfy the forbidden match condition`);
  }
  const expectation = max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
  const matchList = count > 0 ? `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}` : "";
  return finish(withinBounds, `Matched tool "${matcherLabel}" ${count} time(s)${expectation}${matchList}`);
};
3004
/**
 * Normalize the value of a trajectory:tool-sequence assertion.
 *
 * Accepted shapes:
 *  - an array of steps   -> `{ mode: "in_order", steps }`
 *  - an object `{ mode?, steps? }` -> mode defaults to "in_order", steps to []
 *
 * @param value - Raw assertion value
 * @returns `{ mode, steps }` where `steps` is always an array
 * @throws Error when the value is neither an array nor an object, or when
 *   `steps` is present but not an array. Rejecting it here gives a readable
 *   message instead of a TypeError once the steps are iterated.
 */
function resolveSequenceValue(value) {
  if (Array.isArray(value)) {
    return {
      mode: "in_order",
      steps: value
    };
  }
  if (value && typeof value === "object") {
    const { mode, steps } = value;
    if (steps !== void 0 && steps !== null && !Array.isArray(steps)) {
      throw new Error("trajectory:tool-sequence assertion steps must be an array");
    }
    return {
      mode: mode || "in_order",
      steps: steps || []
    };
  }
  throw new Error("trajectory:tool-sequence assertion must have an array or object value");
}
3018
/**
 * Is the value a non-null, non-array object?
 *
 * @param value - Any value
 * @returns true for objects other than null and arrays
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
3021
/**
 * Partial (subset) comparison of actual tool arguments against expected ones.
 *  - Expected arrays must match element by element and in length.
 *  - Expected records only need their own keys present and matching in the
 *    actual record; extra actual keys are ignored.
 *  - Anything else is compared with deep strict equality.
 *
 * @param actual - Arguments observed on the tool call
 * @param expected - Arguments required by the assertion
 * @returns true when `actual` satisfies `expected`
 */
function matchesExpectedArgsPartial(actual, expected) {
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual) || actual.length !== expected.length) {
      return false;
    }
    return expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
  }
  if (!isRecord(expected)) {
    return node_util.isDeepStrictEqual(actual, expected);
  }
  if (!isRecord(actual)) {
    return false;
  }
  for (const [key, expectedValue] of Object.entries(expected)) {
    if (!Object.prototype.hasOwnProperty.call(actual, key)) {
      return false;
    }
    if (!matchesExpectedArgsPartial(actual[key], expectedValue)) {
      return false;
    }
  }
  return true;
}
3029
/**
 * Compare tool arguments using the requested mode.
 *
 * @param actual - Arguments observed on the tool call
 * @param expected - Arguments required by the assertion
 * @param mode - "exact" for deep strict equality; anything else uses partial matching
 * @returns true when the arguments match
 */
function matchesToolArgs(actual, expected, mode) {
  return mode === "exact"
    ? node_util.isDeepStrictEqual(actual, expected)
    : matchesExpectedArgsPartial(actual, expected);
}
3033
/**
 * Validate the `mode` option of a trajectory:tool-args-match assertion.
 *
 * @param mode - undefined, "partial" or "exact"
 * @returns "partial" when omitted, otherwise the given mode
 * @throws Error for any other value
 */
function resolveToolArgsMatchMode(mode) {
  switch (mode) {
    case void 0:
      return "partial";
    case "partial":
    case "exact":
      return mode;
    default:
      throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
  }
}
3038
/**
 * Normalize the value of a trajectory:tool-args-match assertion.
 * The value must be an object naming a tool (`name` or `pattern`) and giving
 * the expected arguments under `args` (preferred when the key exists) or
 * `arguments`, plus an optional `mode`.
 *
 * @param value - Raw assertion value
 * @returns `{ matcher, expectedArgs, mode }`
 * @throws Error when the value is not an object, names no tool, provides no
 *   expected arguments, or has an invalid mode
 */
function resolveToolArgsMatchValue(value) {
  const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isObjectValue) {
    throw new Error("trajectory:tool-args-match assertion must have an object value");
  }
  const matcher = normalizeTrajectoryMatcher(value, "tool");
  requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");

  // An own "args" key wins even when "arguments" is also present.
  const hasArgsKey = Object.prototype.hasOwnProperty.call(value, "args");
  const expectedArgs = hasArgsKey ? value.args : value.arguments;
  if (expectedArgs === void 0) {
    throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");
  }
  const mode = resolveToolArgsMatchMode(value.mode);
  return {
    matcher,
    expectedArgs,
    mode
  };
}
3050
/**
 * Grade a trajectory:tool-sequence assertion against the tool steps of the trace.
 *
 * Mode "exact": the tool steps must match the expected steps one-to-one, in
 * order, with no extras. Any other mode: the expected steps must appear in
 * order, with other tool steps allowed in between.
 * With `inverse`, the outcome is flipped and the reason is rewritten.
 *
 * @param params - Assertion parameters
 * @returns grading result `{ pass, score, reason, assertion }`
 * @throws Error when a step has neither name nor pattern, or no steps are given
 */
const handleTrajectoryToolSequence = (params) => {
  const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const value = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
  const expectedMatchers = value.steps.map((step, index) => {
    const matcher = normalizeTrajectoryMatcher(step, "tool");
    requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", index);
    return matcher;
  });
  if (expectedMatchers.length === 0) {
    throw new Error("trajectory:tool-sequence assertion requires at least one expected step");
  }

  const labelOf = (matcher) => matcher?.pattern || matcher?.name || "*";
  const actualList = formatStepList(toolSteps.map(formatTrajectoryStep));
  let basePass;
  let reason;

  if (value.mode === "exact") {
    const sameLength = toolSteps.length === expectedMatchers.length;
    basePass = sameLength && expectedMatchers.every((matcher, index) => matchesTrajectoryStep(toolSteps[index], matcher));
    reason = basePass
      ? `Observed exact tool sequence: ${actualList}`
      : `Expected exact tool sequence of ${expectedMatchers.map(labelOf).join(", ")}, but actual tools were ${actualList}`;
  } else {
    // Walk the tool steps once, advancing through the expected matchers as
    // each is found in order.
    const matchedLabels = [];
    let nextExpected = 0;
    for (let i = 0; i < toolSteps.length && nextExpected < expectedMatchers.length; i += 1) {
      if (matchesTrajectoryStep(toolSteps[i], expectedMatchers[nextExpected])) {
        matchedLabels.push(formatTrajectoryStep(toolSteps[i]));
        nextExpected += 1;
      }
    }
    basePass = nextExpected === expectedMatchers.length;
    reason = basePass
      ? `Observed tool sequence in order: ${matchedLabels.join(", ")}. Actual tools: ${actualList}`
      : `Expected tool "${labelOf(expectedMatchers[nextExpected])}" was not observed in order. Actual tools: ${actualList}`;
  }

  if (params.inverse) {
    reason = basePass
      ? `Forbidden tool sequence was observed. Actual tools: ${actualList}`
      : `Forbidden tool sequence was not observed`;
  }
  const pass = applyInverse(basePass, params.inverse);
  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
3089
/**
 * Grade a trajectory:tool-args-match assertion: at least one call to the
 * named tool must carry arguments matching the expected ones (partial or
 * exact mode). With `inverse`, the assertion passes when no such call exists.
 *
 * @param params - Assertion parameters
 * @returns grading result `{ pass, score, reason, assertion }`
 */
const handleTrajectoryToolArgsMatch = (params) => {
  const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
  const matcherLabel = matcher.pattern || matcher.name || "*";
  const actualTools = toolSteps.map(formatTrajectoryStep);

  const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
  // Only calls that captured arguments can be compared.
  const stepsWithArgs = matchingSteps.filter((step) => step.args !== void 0);
  const matchedStep = stepsWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));
  const basePass = matchedStep !== void 0;
  const pass = applyInverse(basePass, params.inverse);

  const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
  const observedArgsLabel = stepsWithArgs.length > 0
    ? stepsWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ")
    : "(none)";

  let reason;
  if (params.inverse) {
    if (basePass) {
      reason = `Forbidden argument match for tool "${matcherLabel}" was observed on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
    } else if (matchingSteps.length === 0) {
      reason = `Forbidden argument match for tool "${matcherLabel}" was not observed because no tool call matched it`;
    } else {
      reason = `Forbidden argument match for tool "${matcherLabel}" was not observed. Observed args: ${observedArgsLabel}`;
    }
  } else if (basePass) {
    reason = `Tool "${matcherLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
  } else if (matchingSteps.length === 0) {
    reason = `No tool call matched "${matcherLabel}". Actual tools: ${formatStepList(actualTools)}`;
  } else if (stepsWithArgs.length === 0) {
    reason = `Tool "${matcherLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(actualTools)}`;
  } else {
    reason = `No call to tool "${matcherLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
  }

  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
3116
+ function resolveStepCountValue(value) {
3117
+ if (!value || typeof value !== "object" || Array.isArray(value)) throw new Error("trajectory:step-count assertion must have an object value");
3118
+ return {
3119
+ ...normalizeTrajectoryMatcher(value),
3120
+ max: typeof value.max === "number" ? value.max : void 0,
3121
+ min: typeof value.min === "number" ? value.min : void 0
3122
+ };
3123
+ }
3124
+ const handleTrajectoryStepCount = (params) => {
3125
+ const steps = extractTrajectorySteps(getTraceOrThrow(params));
3126
+ const matcher = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
3127
+ const { min, max } = matcher;
3128
+ if (min === void 0 && max === void 0) throw new Error("trajectory:step-count assertion must include a min or max property");
3129
+ const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
3130
+ const count = matchingSteps.length;
3131
+ const basePass = (min === void 0 || count >= min) && (max === void 0 || count <= max);
3132
+ const pass = applyInverse(basePass, params.inverse);
3133
+ const filterParts = [];
3134
+ if (matcher.type) {
3135
+ const types = Array.isArray(matcher.type) ? matcher.type : [matcher.type];
3136
+ filterParts.push(`type=${types.join("|")}`);
3137
+ }
3138
+ const pattern = matcher.pattern || matcher.name;
3139
+ if (pattern) filterParts.push(`pattern=${pattern}`);
3140
+ let reason = `Matched ${count} trajectory step(s)`;
3141
+ if (filterParts.length > 0) reason += ` for ${filterParts.join(", ")}`;
3142
+ if (min !== void 0 && max !== void 0) reason += ` (expected ${min}-${max})`;
3143
+ else if (min !== void 0) reason += ` (expected at least ${min})`;
3144
+ else if (max !== void 0) reason += ` (expected at most ${max})`;
3145
+ if (matchingSteps.length > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
3146
+ if (params.inverse) reason = basePass ? `Trajectory step count satisfied the forbidden range` : `Trajectory step count did not satisfy the forbidden range`;
3147
+ return {
3148
+ pass,
3149
+ score: pass ? 1 : 0,
3150
+ reason,
3151
+ assertion: params.assertion
3152
+ };
3153
+ };
3154
+ const handleTrajectoryGoalSuccess = async (params) => {
3155
+ const trace = getTraceOrThrow(params);
3156
+ const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
3157
+ const result = await require_graders.matchesTrajectoryGoalSuccess(goal, summarizeTrajectoryForJudge(trace), params.outputString, params.test.options, params.assertionValueContext.vars, params.assertion, params.providerCallContext);
3158
+ if (!params.inverse) return result;
3159
+ return {
3160
+ ...result,
3161
+ assertion: params.assertion,
3162
+ pass: !result.pass,
3163
+ score: result.pass ? 0 : 1,
3164
+ reason: result.pass ? `Agent unexpectedly achieved the goal: ${goal}` : `Agent did not achieve the forbidden goal: ${goal}`
3165
+ };
3166
+ };
3167
+ //#endregion
2539
3168
  //#region src/assertions/webhook.ts
2540
3169
  async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
2541
3170
  require_invariant.invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
@@ -2604,18 +3233,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
2604
3233
  if (pass) reason = "Assertion passed";
2605
3234
  else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
2606
3235
  else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
2607
- } else if (min !== void 0) {
2608
- const basePass = wordCount >= min;
2609
- pass = inverse ? !basePass : basePass;
2610
- if (pass) reason = "Assertion passed";
2611
- else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
2612
- else reason = `Word count ${wordCount} is less than minimum ${min}`;
2613
- } else {
3236
+ } else if (min === void 0) {
2614
3237
  const basePass = wordCount <= max;
2615
3238
  pass = inverse ? !basePass : basePass;
2616
3239
  if (pass) reason = "Assertion passed";
2617
3240
  else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
2618
3241
  else reason = `Word count ${wordCount} is greater than maximum ${max}`;
3242
+ } else {
3243
+ const basePass = wordCount >= min;
3244
+ pass = inverse ? !basePass : basePass;
3245
+ if (pass) reason = "Assertion passed";
3246
+ else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
3247
+ else reason = `Word count ${wordCount} is less than minimum ${min}`;
2619
3248
  }
2620
3249
  } else {
2621
3250
  require_invariant.invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
@@ -2710,6 +3339,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
2710
3339
  //#endregion
2711
3340
  //#region src/assertions/index.ts
2712
3341
  const ASSERTIONS_MAX_CONCURRENCY = require_logger.getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
3342
+ const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
3343
+ const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
3344
+ const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
3345
+ const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;
3346
+ const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5e3;
3347
+ const MAX_TRACE_FETCH_STABLE_POLLS = 10;
2713
3348
  const MODEL_GRADED_ASSERTION_TYPES = new Set([
2714
3349
  "answer-relevance",
2715
3350
  "context-faithfulness",
@@ -2719,8 +3354,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
2719
3354
  "llm-rubric",
2720
3355
  "model-graded-closedqa",
2721
3356
  "model-graded-factuality",
2722
- "search-rubric"
3357
+ "search-rubric",
3358
+ "trajectory:goal-success"
3359
+ ]);
3360
+ const TRACE_AWARE_ASSERTION_TYPES = new Set([
3361
+ "javascript",
3362
+ "python",
3363
+ "ruby",
3364
+ "trace-error-spans",
3365
+ "trace-span-count",
3366
+ "trace-span-duration",
3367
+ "trajectory:goal-success",
3368
+ "trajectory:step-count",
3369
+ "trajectory:tool-args-match",
3370
+ "trajectory:tool-sequence",
3371
+ "trajectory:tool-used"
2723
3372
  ]);
3373
+ function assertionUsesTrace(assertion) {
3374
+ if (assertion.type === "assert-set") return assertion.assert.some(assertionUsesTrace);
3375
+ return TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
3376
+ }
3377
+ function assertionMayNeedTraceContext(assertion) {
3378
+ if (assertionUsesTrace(assertion)) return true;
3379
+ if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
3380
+ return typeof assertion.value === "string" ? assertion.value.startsWith("file://") || require_providers.isPackagePath(assertion.value) : false;
3381
+ }
3382
+ function hasTraceAwareAssertions(assertions) {
3383
+ return Boolean(assertions?.some(assertionMayNeedTraceContext));
3384
+ }
3385
+ async function loadTraceData(traceId) {
3386
+ const traceStore = require_store.getTraceStore();
3387
+ const maxAttempts = Math.min(MAX_TRACE_FETCH_MAX_ATTEMPTS, Math.max(1, require_logger.getEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS)));
3388
+ const retryDelayMs = Math.min(MAX_TRACE_FETCH_RETRY_DELAY_MS, Math.max(0, require_logger.getEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS)));
3389
+ const stablePolls = Math.min(MAX_TRACE_FETCH_STABLE_POLLS, Math.max(1, require_logger.getEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS)));
3390
+ let lastSpanCount = -1;
3391
+ let stableObservations = 0;
3392
+ let latestTrace = null;
3393
+ for (let attempt = 0; attempt < maxAttempts; attempt++) {
3394
+ latestTrace = await traceStore.getTrace(traceId);
3395
+ const spanCount = latestTrace?.spans?.length ?? 0;
3396
+ if (spanCount > 0) {
3397
+ stableObservations = spanCount === lastSpanCount ? stableObservations + 1 : 1;
3398
+ lastSpanCount = spanCount;
3399
+ if (stableObservations >= stablePolls || attempt === maxAttempts - 1) return latestTrace;
3400
+ } else {
3401
+ stableObservations = 0;
3402
+ lastSpanCount = spanCount;
3403
+ }
3404
+ if (attempt < maxAttempts - 1) await require_fetch.sleep(retryDelayMs);
3405
+ }
3406
+ return latestTrace;
3407
+ }
2724
3408
  const ASSERTION_HANDLERS = {
2725
3409
  "answer-relevance": handleAnswerRelevance,
2726
3410
  bleu: handleBleuScore,
@@ -2783,12 +3467,18 @@ const ASSERTION_HANDLERS = {
2783
3467
  ruby: handleRuby,
2784
3468
  "rouge-n": handleRougeScore,
2785
3469
  "search-rubric": handleSearchRubric,
3470
+ "skill-used": handleSkillUsed,
2786
3471
  similar: handleSimilar,
2787
3472
  "similar:cosine": handleSimilar,
2788
3473
  "similar:dot": handleSimilar,
2789
3474
  "similar:euclidean": handleSimilar,
2790
3475
  "starts-with": handleStartsWith,
2791
3476
  "tool-call-f1": handleToolCallF1,
3477
+ "trajectory:goal-success": handleTrajectoryGoalSuccess,
3478
+ "trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
3479
+ "trajectory:step-count": handleTrajectoryStepCount,
3480
+ "trajectory:tool-sequence": handleTrajectoryToolSequence,
3481
+ "trajectory:tool-used": handleTrajectoryToolUsed,
2792
3482
  "trace-error-spans": handleTraceErrorSpans,
2793
3483
  "trace-span-count": handleTraceSpanCount,
2794
3484
  "trace-span-duration": handleTraceSpanDuration,
@@ -2831,7 +3521,7 @@ function isAssertionInverse(assertion) {
2831
3521
  function getAssertionBaseType(assertion) {
2832
3522
  return isAssertionInverse(assertion) ? assertion.type.slice(4) : assertion.type;
2833
3523
  }
2834
- async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
3524
+ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
2835
3525
  const resolvedVars = vars || test.vars || {};
2836
3526
  const { cost, logProbs, output: originalOutput } = providerResponse;
2837
3527
  let output = originalOutput;
@@ -2850,14 +3540,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
2850
3540
  providerResponse,
2851
3541
  ...assertion.config ? { config: structuredClone(assertion.config) } : {}
2852
3542
  };
2853
- if (traceId) try {
2854
- const traceData = await require_store.getTraceStore().getTrace(traceId);
2855
- if (traceData) context.trace = {
2856
- traceId: traceData.traceId,
2857
- evaluationId: traceData.evaluationId,
2858
- testCaseId: traceData.testCaseId,
2859
- metadata: traceData.metadata,
2860
- spans: traceData.spans || []
3543
+ if (traceId && assertionMayNeedTraceContext(assertion)) try {
3544
+ const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
3545
+ if (resolvedTraceData) context.trace = {
3546
+ traceId: resolvedTraceData.traceId,
3547
+ evaluationId: resolvedTraceData.evaluationId,
3548
+ testCaseId: resolvedTraceData.testCaseId,
3549
+ metadata: resolvedTraceData.metadata,
3550
+ spans: resolvedTraceData.spans || []
2861
3551
  };
2862
3552
  } catch (error) {
2863
3553
  require_logger.logger.debug(`Failed to fetch trace data for assertion: ${error}`);
@@ -2890,7 +3580,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
2890
3580
  };
2891
3581
  }
2892
3582
  else if (filePath.endsWith(".rb")) try {
2893
- const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-CP42kMvq.cjs"));
3583
+ const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-B1HXG4ej.cjs"));
2894
3584
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
2895
3585
  require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
2896
3586
  } catch (error) {
@@ -2999,6 +3689,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
2999
3689
  index: i
3000
3690
  };
3001
3691
  }).flat();
3692
+ const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
3693
+ let preloadedTraceData;
3694
+ if (shouldPreloadTrace && traceId) try {
3695
+ preloadedTraceData = await loadTraceData(traceId);
3696
+ } catch (error) {
3697
+ require_logger.logger.debug(`Failed to preload trace data for assertions: ${error}`);
3698
+ preloadedTraceData = null;
3699
+ }
3002
3700
  await async.default.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
3003
3701
  if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
3004
3702
  const result = await runAssertion({
@@ -3010,7 +3708,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
3010
3708
  vars,
3011
3709
  latencyMs,
3012
3710
  assertIndex: index,
3013
- traceId
3711
+ traceId,
3712
+ traceData: preloadedTraceData
3014
3713
  });
3015
3714
  assertResult.addResult({
3016
3715
  index,
@@ -3156,7 +3855,7 @@ var CIProgressReporter = class {
3156
3855
  else {
3157
3856
  const eta = remaining / rate;
3158
3857
  if (eta > 1440) etaDisplay = ">24 hours";
3159
- else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
3858
+ else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
3160
3859
  }
3161
3860
  const percentage = Math.floor(this.completedTests / this.totalTests * 100);
3162
3861
  require_logger.logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
@@ -3557,12 +4256,55 @@ function isPromptAllowed(prompt, allowedPrompts) {
3557
4256
  var ProgressBarManager = class {
3558
4257
  progressBar;
3559
4258
  isWebUI;
4259
+ originalLogCallback = null;
4260
+ installedLogCallback = null;
4261
+ pendingRender = null;
3560
4262
  totalCount = 0;
3561
4263
  completedCount = 0;
3562
4264
  concurrency = 1;
3563
4265
  constructor(isWebUI) {
3564
4266
  this.isWebUI = isWebUI;
3565
4267
  }
4268
+ clearProgressBarLine() {
4269
+ readline.default.cursorTo(process.stderr, 0);
4270
+ readline.default.clearLine(process.stderr, 0);
4271
+ }
4272
+ scheduleRender() {
4273
+ if (!this.progressBar || this.pendingRender) return;
4274
+ this.pendingRender = setImmediate(() => {
4275
+ this.pendingRender = null;
4276
+ this.progressBar?.render();
4277
+ });
4278
+ }
4279
+ handleLogMessage() {
4280
+ if (!this.progressBar) return;
4281
+ this.clearProgressBarLine();
4282
+ this.scheduleRender();
4283
+ }
4284
+ /**
4285
+ * Coordinate console logging with the progress bar to prevent visual corruption.
4286
+ */
4287
+ installLogInterceptor() {
4288
+ if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
4289
+ this.originalLogCallback = require_logger.globalLogCallback;
4290
+ this.installedLogCallback = (message) => {
4291
+ this.originalLogCallback?.(message);
4292
+ this.handleLogMessage();
4293
+ };
4294
+ require_logger.setLogCallback(this.installedLogCallback);
4295
+ }
4296
+ /**
4297
+ * Remove the log interceptor and restore original logger callback behavior.
4298
+ */
4299
+ removeLogInterceptor() {
4300
+ if (this.pendingRender) {
4301
+ clearImmediate(this.pendingRender);
4302
+ this.pendingRender = null;
4303
+ }
4304
+ if (this.installedLogCallback && require_logger.globalLogCallback === this.installedLogCallback) require_logger.setLogCallback(this.originalLogCallback);
4305
+ this.installedLogCallback = null;
4306
+ this.originalLogCallback = null;
4307
+ }
3566
4308
  /**
3567
4309
  * Initialize progress bar
3568
4310
  */
@@ -3582,7 +4324,8 @@ var ProgressBarManager = class {
3582
4324
  return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
3583
4325
  },
3584
4326
  hideCursor: true,
3585
- gracefulExit: true
4327
+ gracefulExit: true,
4328
+ stream: process.stderr
3586
4329
  }, cli_progress.default.Presets.shades_classic);
3587
4330
  this.progressBar.start(this.totalCount, 0, {
3588
4331
  provider: "",
@@ -3857,6 +4600,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
3857
4600
  const parts = traceContext.traceparent.split("-");
3858
4601
  if (parts.length >= 3) traceId = parts[1];
3859
4602
  }
4603
+ if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
3860
4604
  const checkResult = await runAssertions({
3861
4605
  prompt: renderedPrompt,
3862
4606
  provider,
@@ -4254,7 +4998,7 @@ var Evaluator = class {
4254
4998
  const defaultProvider = testSuite.defaultTest.provider;
4255
4999
  if (require_types.isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
4256
5000
  else if (typeof defaultProvider === "object" && defaultProvider.id) {
4257
- const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-CxmDwEFf.cjs"));
5001
+ const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-D-FnDg8k.cjs"));
4258
5002
  testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
4259
5003
  } else testCase.provider = defaultProvider;
4260
5004
  }
@@ -4338,7 +5082,7 @@ var Evaluator = class {
4338
5082
  if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
4339
5083
  }
4340
5084
  if (require_logger.state.resume && this.evalRecord.persisted) try {
4341
- const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-DvcJAWJU.cjs"));
5085
+ const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-tGdilrWt.cjs"));
4342
5086
  const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: require_logger.state.retryMode });
4343
5087
  const originalCount = runEvalOptions.length;
4344
5088
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -4538,7 +5282,7 @@ var Evaluator = class {
4538
5282
  if (require_logger.isCI() && !isWebUI) {
4539
5283
  ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
4540
5284
  ciProgressReporter.start();
4541
- } else if (this.options.showProgressBar && process.stdout.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
5285
+ } else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
4542
5286
  this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
4543
5287
  if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
4544
5288
  if (isWebUI) {
@@ -4559,7 +5303,10 @@ var Evaluator = class {
4559
5303
  if (serialRunEvalOptions.length > 0) require_logger.logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
4560
5304
  if (concurrentRunEvalOptions.length > 0) require_logger.logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
4561
5305
  }
4562
- if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
5306
+ if (this.options.showProgressBar && progressBarManager) {
5307
+ await progressBarManager.initialize(runEvalOptions, concurrency, 0);
5308
+ progressBarManager.installLogInterceptor();
5309
+ }
4563
5310
  try {
4564
5311
  if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
4565
5312
  checkAbort();
@@ -4585,7 +5332,10 @@ var Evaluator = class {
4585
5332
  else if (!targetUnavailable) {
4586
5333
  require_logger.logger.info("Evaluation interrupted, saving progress...");
4587
5334
  if (globalTimeout) clearTimeout(globalTimeout);
4588
- if (progressBarManager) progressBarManager.stop();
5335
+ if (progressBarManager) {
5336
+ progressBarManager.removeLogInterceptor();
5337
+ progressBarManager.stop();
5338
+ }
4589
5339
  if (ciProgressReporter) ciProgressReporter.finish();
4590
5340
  this.evalRecord.setVars(Array.from(vars));
4591
5341
  await this.evalRecord.addPrompts(prompts);
@@ -4593,6 +5343,10 @@ var Evaluator = class {
4593
5343
  return this.evalRecord;
4594
5344
  }
4595
5345
  } else {
5346
+ if (progressBarManager) {
5347
+ progressBarManager.removeLogInterceptor();
5348
+ progressBarManager.stop();
5349
+ }
4596
5350
  if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
4597
5351
  throw err;
4598
5352
  }
@@ -4735,6 +5489,7 @@ var Evaluator = class {
4735
5489
  await this.evalRecord.addPrompts(prompts);
4736
5490
  try {
4737
5491
  if (progressBarManager) {
5492
+ progressBarManager.removeLogInterceptor();
4738
5493
  progressBarManager.complete();
4739
5494
  progressBarManager.stop();
4740
5495
  } else if (ciProgressReporter) ciProgressReporter.finish();
@@ -7088,8 +7843,7 @@ function testCaseFromCsvRow(row) {
7088
7843
  require_logger.logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
7089
7844
  } else if (key.startsWith("__config:")) {
7090
7845
  const configParts = key.slice(9).split(":");
7091
- if (configParts.length !== 2) require_logger.logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
7092
- else {
7846
+ if (configParts.length === 2) {
7093
7847
  const [expectedKey, configKey] = configParts;
7094
7848
  let targetIndex;
7095
7849
  if (expectedKey === "__expected") targetIndex = 0;
@@ -7115,7 +7869,7 @@ function testCaseFromCsvRow(row) {
7115
7869
  }
7116
7870
  }
7117
7871
  assertionConfigs[targetIndex][configKey] = parsedValue;
7118
- }
7872
+ } else require_logger.logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
7119
7873
  } else vars[key] = value;
7120
7874
  }
7121
7875
  for (let i = 0; i < asserts.length; i++) {
@@ -7244,14 +7998,14 @@ async function parseXlsxFile(filePath) {
7244
7998
  const sheetName = typeof sheetOption === "number" ? sheetNames[sheetOption - 1] : sheetOption;
7245
7999
  const rows = await readXlsxFile(actualFilePath, { sheet: sheetOption });
7246
8000
  if (rows.length === 0) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
7247
- const headers = rows[0].map((cell) => cell != null ? String(cell) : "");
8001
+ const headers = rows[0].map((cell) => cell == null ? "" : String(cell));
7248
8002
  if (headers.length === 0 || headers.every((h) => h === "")) throw new Error(`Sheet "${sheetName}" has no valid column headers`);
7249
8003
  if (rows.length === 1) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
7250
8004
  const data = rows.slice(1).map((row) => {
7251
8005
  const obj = {};
7252
8006
  headers.forEach((header, index) => {
7253
8007
  const cellValue = row[index];
7254
- obj[header] = cellValue != null ? String(cellValue) : "";
8008
+ obj[header] = cellValue == null ? "" : String(cellValue);
7255
8009
  });
7256
8010
  return obj;
7257
8011
  });
@@ -11198,20 +11952,19 @@ function generateEvalSummary(params) {
11198
11952
  }
11199
11953
  }
11200
11954
  lines.push("");
11201
- const passRate = successes / (successes + failures + errors) * 100;
11202
- let passRateDisplay;
11203
- if (!Number.isNaN(passRate)) {
11204
- const passRateFormatted = passRate === 0 || passRate === 100 ? `${passRate.toFixed(0)}%` : `${passRate.toFixed(2)}%`;
11205
- if (passRate >= 100) passRateDisplay = chalk.default.green.bold(passRateFormatted);
11206
- else if (passRate >= 80) passRateDisplay = chalk.default.yellow.bold(passRateFormatted);
11207
- else passRateDisplay = chalk.default.red.bold(passRateFormatted);
11208
- }
11209
- const passedPart = successes > 0 ? `${chalk.default.green("✓")} ${chalk.default.green.bold(successes.toLocaleString())} passed` : `${chalk.default.gray.bold(successes.toLocaleString())} passed`;
11210
- const failedPart = failures > 0 ? `${chalk.default.red("✗")} ${chalk.default.red.bold(failures.toLocaleString())} failed` : `${chalk.default.gray.bold(failures.toLocaleString())} failed`;
11955
+ const totalTests = successes + failures + errors;
11956
+ const formatResultPercentage = (count) => {
11957
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
11958
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
11959
+ };
11960
+ const formatResultLine = (count, label, icon, iconColor) => {
11961
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count)})`)}`;
11962
+ };
11211
11963
  const errorLabel = errors === 1 ? "error" : "errors";
11212
- const resultsLine = `${passedPart}, ${failedPart}, ${errors > 0 ? `${chalk.default.red("✗")} ${chalk.default.red.bold(errors.toLocaleString())} ${errorLabel}` : `${chalk.default.gray.bold(errors.toLocaleString())} ${errorLabel}`}`;
11213
- if (Number.isNaN(passRate)) lines.push(`${chalk.default.bold("Results:")} ${resultsLine}`);
11214
- else lines.push(`${chalk.default.bold("Results:")} ${resultsLine} (${passRateDisplay})`);
11964
+ lines.push(chalk.default.bold("Results:"));
11965
+ lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green));
11966
+ lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red));
11967
+ lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red));
11215
11968
  const durationDisplay = formatDuration(duration);
11216
11969
  lines.push(chalk.default.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
11217
11970
  lines.push("");
@@ -11545,7 +12298,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
11545
12298
  await require_providers.checkCloudPermissions(config);
11546
12299
  const options = {
11547
12300
  ...evaluateOptions,
11548
- showProgressBar: require_logger.getLogLevel() === "debug" ? false : cmdObj.progressBar !== void 0 ? cmdObj.progressBar !== false : evaluateOptions.showProgressBar !== void 0 ? evaluateOptions.showProgressBar : true,
12301
+ showProgressBar: require_logger.getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
11549
12302
  repeat,
11550
12303
  delay: !Number.isNaN(delay) && delay > 0 ? delay : void 0,
11551
12304
  maxConcurrency,
@@ -11929,7 +12682,7 @@ async function doRedteamRun(options) {
11929
12682
  redteamConfig = await doGenerateRedteam({
11930
12683
  ...passThroughOptions,
11931
12684
  ...options.liveRedteamConfig?.commandLineOptions || {},
11932
- ...maxConcurrency !== void 0 ? { maxConcurrency } : {},
12685
+ ...maxConcurrency === void 0 ? {} : { maxConcurrency },
11933
12686
  config: configPath,
11934
12687
  output: redteamPath,
11935
12688
  force: options.force,