promptfoo 0.121.2 → 0.121.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315) hide show
  1. package/README.md +2 -0
  2. package/dist/src/{accounts-CiBLOnA7.js → accounts-B2XmGjty.js} +5 -5
  3. package/dist/src/{accounts-gtkH-5KX.cjs → accounts-BPyfpSeU.cjs} +5 -5
  4. package/dist/src/{accounts-Bm2D8Db9.js → accounts-CFLK3mnD.js} +6 -6
  5. package/dist/src/{accounts-B0pgC1oV.js → accounts-Xatc0RYb.js} +5 -5
  6. package/dist/src/{agentic-utils-DS1g3GLF.js → agentic-utils-36epdqwB.js} +3 -3
  7. package/dist/src/{cometapi-CUQq3H_a.js → agentic-utils-D8yXo5Lm.js} +4 -61
  8. package/dist/src/{cometapi-C4xSqeID.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
  9. package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
  10. package/dist/src/{agents-CBr9A01V.js → agents-BBVJCIYr.js} +226 -13
  11. package/dist/src/{agents-Di9DKPzn.cjs → agents-BBWxKSM0.cjs} +7 -7
  12. package/dist/src/{agents-DgF2zDag.js → agents-Bqgfdokm.js} +228 -13
  13. package/dist/src/{agents-DbRtpYxR.cjs → agents-CAYbM7qD.cjs} +226 -13
  14. package/dist/src/{agents-9qiOy0ho.js → agents-CLQ-P15P.js} +7 -7
  15. package/dist/src/{agents-cLXA8a_8.js → agents-CgBniSlI.js} +8 -8
  16. package/dist/src/{agents-D__IdAlg.js → agents-DSSTV4bv.js} +226 -15
  17. package/dist/src/{agents-CmvBq8LV.js → agents-wg3ohknq.js} +7 -7
  18. package/dist/src/{aimlapi-BvlNH0gr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
  19. package/dist/src/{aimlapi-DHJU_kcV.js → aimlapi-BwGC1TtS.js} +13 -13
  20. package/dist/src/{aimlapi-CnkC2HqE.js → aimlapi-DaC3qZ-o.js} +14 -14
  21. package/dist/src/{aimlapi-B4rcnZgv.js → aimlapi-MgSLdvy7.js} +13 -13
  22. package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
  23. package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
  24. package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
  25. package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
  26. package/dist/src/app/index.html +3 -3
  27. package/dist/src/{audio-Bkv46et0.js → audio-Bn44pQxv.js} +4 -4
  28. package/dist/src/{audio-ClI_AFre.js → audio-DDA5WHdx.js} +4 -4
  29. package/dist/src/{audio-CGMyULza.cjs → audio-DVFjQ67_.cjs} +4 -4
  30. package/dist/src/{audio-Dz3z7s3J.js → audio-DjU9GswO.js} +5 -5
  31. package/dist/src/{base-CGrhspbK.cjs → base-BboXIF_0.cjs} +3 -3
  32. package/dist/src/{base-Dy1V8--Z.js → base-CKjwebIH.js} +3 -3
  33. package/dist/src/{base-DLKtKMFh.js → base-CqzQ4K8j.js} +3 -3
  34. package/dist/src/{base-CpjcHe4e.js → base-Cz2ZC_iA.js} +3 -3
  35. package/dist/src/{blobs-CMHN0Qcz.js → blobs-B1JriOyi.js} +3 -3
  36. package/dist/src/{blobs-BDbfYdrJ.js → blobs-BUWmKWzo.js} +3 -3
  37. package/dist/src/{blobs-D23XLin-.cjs → blobs-C6j0bvFz.cjs} +3 -3
  38. package/dist/src/{blobs-CBO20krR.js → blobs-DXTl6J3H.js} +3 -3
  39. package/dist/src/{cache-Dh5WtQps.cjs → cache-C5yFZ4gC.cjs} +3 -3
  40. package/dist/src/{cache-C4Nxf52C.js → cache-CaT5tPgo.js} +3 -3
  41. package/dist/src/cache-CyCanoMu.js +6 -0
  42. package/dist/src/{cache-BVeDlD87.js → cache-DSqR6ezl.js} +3 -3
  43. package/dist/src/cache-Df_QFDNu.cjs +5 -0
  44. package/dist/src/{cache-i1P6crbO.js → cache-HP0NP4k3.js} +3 -3
  45. package/dist/src/{chat-CzkrVDfz.js → chat-B-52XYI1.js} +12 -12
  46. package/dist/src/{chat-DJIw17u0.js → chat-B0iaWhoh.js} +14 -14
  47. package/dist/src/{chat-qmatte1u.js → chat-BE0qTA8e.js} +13 -13
  48. package/dist/src/{chat-BiKyneZl.js → chat-BEwdgGEg.js} +14 -14
  49. package/dist/src/{chat-C1Qst7jL.cjs → chat-BtIKkLKx.cjs} +13 -13
  50. package/dist/src/{chat-CgF-J-Jj.cjs → chat-CM8qWR3_.cjs} +15 -15
  51. package/dist/src/{chat-C2jrdPMx.js → chat-DK1U-eZ-.js} +12 -12
  52. package/dist/src/{chat-DqxYYtWA.js → chat-pxmiVpWe.js} +14 -14
  53. package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
  54. package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
  55. package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
  56. package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
  57. package/dist/src/{claude-agent-sdk-D9Z5Pr9X.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
  58. package/dist/src/{claude-agent-sdk-DfCoW0E6.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
  59. package/dist/src/{claude-agent-sdk-Apiy0iaz.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
  60. package/dist/src/{claude-agent-sdk-D2bJee9S.js → claude-agent-sdk-HgbFioFw.js} +33 -15
  61. package/dist/src/cloud-DE3t1-ZI.js +4 -0
  62. package/dist/src/{cloud-C0dlstV_.js → cloud-z8KZpUoa.js} +3 -3
  63. package/dist/src/{cloudflare-ai-g7PB6VHR.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
  64. package/dist/src/{cloudflare-ai-8TDxHR0x.js → cloudflare-ai-Bbp26N0L.js} +13 -13
  65. package/dist/src/{cloudflare-ai-CknbZ5LJ.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
  66. package/dist/src/{cloudflare-ai-BxAGvfju.js → cloudflare-ai-DdKP9TKT.js} +14 -14
  67. package/dist/src/{cloudflare-gateway-CP9QEWYS.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
  68. package/dist/src/{cloudflare-gateway-B9HWA5wf.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
  69. package/dist/src/{cloudflare-gateway-CKDb4dJ8.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
  70. package/dist/src/{cloudflare-gateway-BSnDmHYo.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
  71. package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
  72. package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
  73. package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
  74. package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
  75. package/dist/src/cometapi-BDyV-NNm.js +62 -0
  76. package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
  77. package/dist/src/{cometapi-BL9yvj_f.js → cometapi-hhL4TAh3.js} +14 -14
  78. package/dist/src/{cometapi-DFNiKmSz.js → cometapi-sp7sJpBD.js} +15 -15
  79. package/dist/src/{completion-5MzrpJxT.js → completion-BCimtq-h.js} +6 -6
  80. package/dist/src/{completion-qRoZAYRB.js → completion-DCjv7RZ3.js} +6 -6
  81. package/dist/src/{completion-CM6oK8PS.cjs → completion-DlXUhj5c.cjs} +6 -6
  82. package/dist/src/{completion-DZ083F31.js → completion-DoYy49ti.js} +6 -6
  83. package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
  84. package/dist/src/{docker-DcF2pRrj.cjs → docker-Cqj2-QVi.cjs} +14 -14
  85. package/dist/src/{docker-Bb5dcxr8.js → docker-CxCkwMzc.js} +13 -13
  86. package/dist/src/{docker-BvfL2BrW.js → docker-DpguQj-w.js} +14 -14
  87. package/dist/src/{docker-ExVyLp0S.js → docker-FeBni2dw.js} +13 -13
  88. package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
  89. package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
  90. package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
  91. package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
  92. package/dist/src/eval-7aEqoMs3.js +15 -0
  93. package/dist/src/{eval-Dg2nG4v2.js → eval-BTqTn7lb.js} +10 -10
  94. package/dist/src/{evalResult-BDMqrapS.js → evalResult-BkIhRdTe.js} +7 -7
  95. package/dist/src/evalResult-CYNHkk5A.js +12 -0
  96. package/dist/src/evalResult-CuvJeNiM.js +10 -0
  97. package/dist/src/{evalResult-BBRNtX4I.js → evalResult-DUDShQrm.js} +7 -7
  98. package/dist/src/{evalResult-fuaI8HkH.cjs → evalResult-DpARzUCb.cjs} +7 -7
  99. package/dist/src/evalResult-tGdilrWt.cjs +10 -0
  100. package/dist/src/evaluator-BBUqRhz1.js +36 -0
  101. package/dist/src/{evaluator-BhoWwp5b.js → evaluator-BcvOGaam.js} +823 -73
  102. package/dist/src/{extractor-D25qpmGX.js → extractor-C8XwivI9.js} +6 -6
  103. package/dist/src/{extractor-DReVID0K.js → extractor-CAZ2G3Kh.js} +6 -6
  104. package/dist/src/{extractor-pYLLi3wS.cjs → extractor-DG3sSfXE.cjs} +6 -6
  105. package/dist/src/{extractor-C0EVHewb.js → extractor-D_wd8jxt.js} +6 -6
  106. package/dist/src/{fetch-HaqdX7U1.js → fetch-BiYv2BZc.js} +3 -3
  107. package/dist/src/{fetch-BPkYtG8K.cjs → fetch-BnR9wSnm.cjs} +3 -3
  108. package/dist/src/{fetch-Cwxnd8zz.js → fetch-CVAtKnI3.js} +3 -3
  109. package/dist/src/{fetch-Dxpd4_sr.js → fetch-DoVRJZhJ.js} +4 -4
  110. package/dist/src/fetch-UWU706qb.js +5 -0
  111. package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
  112. package/dist/src/{graders-DU49_J8Y.cjs → graders-BElhu9ZY.cjs} +126 -55
  113. package/dist/src/{graders-DP7KFFo-.js → graders-BXAJ0sbS.js} +120 -55
  114. package/dist/src/graders-BxfEguVY.js +32 -0
  115. package/dist/src/graders-CzVMbEnv.js +34 -0
  116. package/dist/src/{graders-BTeBGqjJ.js → graders-DG7mhg-b.js} +120 -55
  117. package/dist/src/graders-DjCXfj0l.cjs +32 -0
  118. package/dist/src/{graders-Bj_Odv7c.js → graders-RjHF8VfG.js} +120 -55
  119. package/dist/src/graders-kHzIWOKu.js +32 -0
  120. package/dist/src/{image-BLmROtN3.cjs → image--F58eEIn.cjs} +6 -6
  121. package/dist/src/{image-B0h9VEMc.js → image-6WQXK8m8.js} +4 -4
  122. package/dist/src/{image-Dpxa1Jt6.js → image-B8b6f36E.js} +6 -6
  123. package/dist/src/{image-CHfWvljl.js → image-CoxZp9PZ.js} +6 -6
  124. package/dist/src/{image-B02ogr_b.js → image-DO0RYnjH.js} +5 -5
  125. package/dist/src/{image-DS-o-0ph.js → image-PoF6DN3x.js} +6 -6
  126. package/dist/src/{image-C1madmKh.cjs → image-fza3zuKs.cjs} +4 -4
  127. package/dist/src/{image-Bb4vWQLM.js → image-xNbw5ph2.js} +4 -4
  128. package/dist/src/index.cjs +853 -104
  129. package/dist/src/index.d.cts +573 -60
  130. package/dist/src/index.d.ts +573 -60
  131. package/dist/src/index.js +850 -102
  132. package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
  133. package/dist/src/{knowledgeBase-B3OoKIej.js → knowledgeBase-Bi7CmDbx.js} +7 -7
  134. package/dist/src/{knowledgeBase-CYTLHOt1.js → knowledgeBase-Ce3ofVan.js} +8 -8
  135. package/dist/src/{knowledgeBase-D33Ty2l6.js → knowledgeBase-DFRXPZl_.js} +7 -7
  136. package/dist/src/{knowledgeBase-DOO_BM9b.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
  137. package/dist/src/{litellm-AaeZcZQF.js → litellm-Bo2gQXpo.js} +14 -14
  138. package/dist/src/{litellm-NbjknEh6.js → litellm-CKiAxnoM.js} +13 -13
  139. package/dist/src/{litellm-I_hbp_dc.cjs → litellm-CnHI69aj.cjs} +14 -14
  140. package/dist/src/{litellm-TrljxD9G.js → litellm-Tc294Jhj.js} +13 -13
  141. package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
  142. package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
  143. package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
  144. package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
  145. package/dist/src/{luma-ray-BS2_tY8L.js → luma-ray-0ehMPt5N.js} +10 -10
  146. package/dist/src/{luma-ray-DDsjcgZZ.js → luma-ray-C9q8rdQe.js} +9 -9
  147. package/dist/src/{luma-ray-f6I2fft-.js → luma-ray-DP0QA9qn.js} +9 -9
  148. package/dist/src/{luma-ray-Due0n7di.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
  149. package/dist/src/main.js +69 -71
  150. package/dist/src/{messages-D0lx5qK7.js → messages-DJNo37Ko.js} +14 -9
  151. package/dist/src/{messages-BS17jdMx.js → messages-Dy9QecMs.js} +14 -9
  152. package/dist/src/{messages-Bs1kC7P4.cjs → messages-HJsyEh4o.cjs} +15 -10
  153. package/dist/src/{messages-ZJk778GH.js → messages-biC_ex-p.js} +14 -9
  154. package/dist/src/{modelslab-DRb74SP4.js → modelslab-B5J-ZM5c.js} +9 -9
  155. package/dist/src/{modelslab-Bx9IrZfS.js → modelslab-BI458moT.js} +10 -10
  156. package/dist/src/{modelslab-Bmni6skY.js → modelslab-BTOT8FUO.js} +9 -9
  157. package/dist/src/{modelslab-CoUX6Jc_.cjs → modelslab-IQbNg-r7.cjs} +9 -9
  158. package/dist/src/{nova-reel-bgjxilYW.js → nova-reel-BZ9y-Y5s.js} +9 -9
  159. package/dist/src/{nova-reel-C_QM18Xn.cjs → nova-reel-CE5etkv9.cjs} +9 -9
  160. package/dist/src/{nova-reel-D_W1tjMH.js → nova-reel-DEeQlnOJ.js} +10 -10
  161. package/dist/src/{nova-reel-BfPq-0Yk.js → nova-reel-Xw1SXLpg.js} +9 -9
  162. package/dist/src/{nova-sonic-De1HW5fD.js → nova-sonic-DWswpN1E.js} +7 -7
  163. package/dist/src/{nova-sonic-CFb5GYhg.js → nova-sonic-DXTLpi-r.js} +6 -6
  164. package/dist/src/{nova-sonic-zfcljeRp.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
  165. package/dist/src/{nova-sonic-DIGQNR07.js → nova-sonic-Ogqf-csn.js} +6 -6
  166. package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
  167. package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
  168. package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
  169. package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
  170. package/dist/src/{openclaw-tiVYRtr-.js → openclaw-0Sv7AK3O.js} +13 -13
  171. package/dist/src/{openclaw-CSugPYAr.cjs → openclaw-CXxbKgDH.cjs} +14 -14
  172. package/dist/src/{openclaw-DuvJKEW5.js → openclaw-D1FSCps-.js} +13 -13
  173. package/dist/src/{openclaw-DiSz3I5L.js → openclaw-D2ENvu7a.js} +14 -14
  174. package/dist/src/{opencode-sdk-0j6rTWNb.js → opencode-sdk-C71Z0ehR.js} +13 -13
  175. package/dist/src/{opencode-sdk-B3CWY9h_.js → opencode-sdk-CHCs7dEb.js} +12 -12
  176. package/dist/src/{opencode-sdk-C2y6UkP2.js → opencode-sdk-DDxj4QqH.js} +12 -12
  177. package/dist/src/{opencode-sdk-BL764Jdi.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
  178. package/dist/src/{otlpReceiver-C99PPb48.js → otlpReceiver-C9KlUtxh.js} +6 -6
  179. package/dist/src/{otlpReceiver-CdNBdbsk.js → otlpReceiver-CZL48YfC.js} +6 -6
  180. package/dist/src/{otlpReceiver-D89fR-rC.js → otlpReceiver-CavGAA6k.js} +6 -6
  181. package/dist/src/{otlpReceiver-CGq6LspY.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
  182. package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
  183. package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
  184. package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
  185. package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
  186. package/dist/src/providers-BSLEaIQG.js +32 -0
  187. package/dist/src/{providers-CgKOSgTR.cjs → providers-CScd1wN6.cjs} +733 -464
  188. package/dist/src/{providers-BlqUifFg.js → providers-Ch6Mr0gn.js} +795 -526
  189. package/dist/src/{providers-Dk_6ocUX.js → providers-Cn73d5sr.js} +795 -526
  190. package/dist/src/providers-D-FnDg8k.cjs +31 -0
  191. package/dist/src/providers-DEYiFVAo.js +30 -0
  192. package/dist/src/{providers-D8lF1sqW.js → providers-DvddrgxL.js} +795 -526
  193. package/dist/src/providers-sS2WI8YD.js +30 -0
  194. package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
  195. package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
  196. package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
  197. package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
  198. package/dist/src/{quiverai-CIaELU_m.js → quiverai-BeofbLVc.js} +4 -4
  199. package/dist/src/{quiverai-uH-dcTIr.js → quiverai-CCQn73lq.js} +5 -5
  200. package/dist/src/{quiverai-PdShCPox.cjs → quiverai-CcUhPIBg.cjs} +4 -4
  201. package/dist/src/{quiverai-BbOUOn2L.js → quiverai-DVSEqJiq.js} +4 -4
  202. package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
  203. package/dist/src/{responses-WNGNYe3K.js → responses-BKP_WYis.js} +14 -10
  204. package/dist/src/{responses-DIR9Ud3j.js → responses-CQb1Tj69.js} +14 -10
  205. package/dist/src/{responses-CB2jwoAr.js → responses-CgNyTPsY.js} +14 -10
  206. package/dist/src/{responses-D8SBTL64.cjs → responses-mo0KQDbu.cjs} +14 -10
  207. package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
  208. package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
  209. package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
  210. package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
  211. package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
  212. package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
  213. package/dist/src/{sagemaker-CNBxx5CJ.js → sagemaker-CVv8W7so.js} +17 -17
  214. package/dist/src/{sagemaker-CemTFp2h.js → sagemaker-CqeASYE5.js} +17 -17
  215. package/dist/src/{sagemaker-YSyBXQQh.js → sagemaker-MUbD5V3v.js} +18 -18
  216. package/dist/src/{sagemaker-Cl28mZU2.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
  217. package/dist/src/{scanner-BsBlNXNn.js → scanner-DVDeUz1r.js} +10 -10
  218. package/dist/src/server/index.js +854 -106
  219. package/dist/src/server-B0Xh1Gx-.js +7 -0
  220. package/dist/src/{server-C_7Ax-hA.cjs → server-BtoCXeXI.cjs} +4 -4
  221. package/dist/src/{server-VWgWb00X.js → server-CP9qKM40.js} +4 -4
  222. package/dist/src/{server-CuxBbeSY.js → server-Cns05F1j.js} +5 -5
  223. package/dist/src/server-DJTKu9IR.cjs +5 -0
  224. package/dist/src/{server-CqzrVGpF.js → server-DZ9MtCn0.js} +6 -6
  225. package/dist/src/{signal-4U3mfRvL.js → signal-C3ZTsUgi.js} +3 -3
  226. package/dist/src/{slack-DOdy_kyv.js → slack-2sdpGzbt.js} +2 -2
  227. package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
  228. package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
  229. package/dist/src/{slack-DXMKtA-f.js → slack-DCEV-vWP.js} +2 -2
  230. package/dist/src/store-C5u6MgC8.js +6 -0
  231. package/dist/src/{store-DLlFCC4h.cjs → store-CLyU7AtI.cjs} +17 -5
  232. package/dist/src/store-CNHk-De4.cjs +5 -0
  233. package/dist/src/{store-DXilxTl-.js → store-Cj258DgL.js} +17 -5
  234. package/dist/src/{store-Dim__MDd.js → store-P8OKm19S.js} +17 -5
  235. package/dist/src/{store-CXGFv4aR.js → store-VB0GP46K.js} +17 -5
  236. package/dist/src/{tables-DLJPUdUE.js → tables-BEIFz2tM.js} +3 -3
  237. package/dist/src/{tables-DPi7wKeM.cjs → tables-BdZQEpRz.cjs} +3 -3
  238. package/dist/src/{tables-gftXzE9I.js → tables-DmzvLbeZ.js} +3 -3
  239. package/dist/src/{tables-6YKwjN9-.js → tables-kC7R5kiK.js} +3 -3
  240. package/dist/src/{telemetry-CMrFgtPB.js → telemetry-BnH5VJAU.js} +4 -4
  241. package/dist/src/{telemetry-Dthj_BbD.js → telemetry-BugWqKiu.js} +4 -4
  242. package/dist/src/{telemetry-Cps3mIU-.js → telemetry-DPXLd7UE.js} +4 -4
  243. package/dist/src/telemetry-Yig0Tino.js +7 -0
  244. package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
  245. package/dist/src/{telemetry-DaX14Chu.cjs → telemetry-re627Lre.cjs} +4 -4
  246. package/dist/src/{transcription-NLVG9MT1.cjs → transcription-BvtsrzRG.cjs} +13 -13
  247. package/dist/src/{transcription-BNYURcXg.js → transcription-CaMivnjG.js} +13 -13
  248. package/dist/src/{transcription-s6A-bNrZ.js → transcription-DOMMTu01.js} +14 -14
  249. package/dist/src/{transcription-B_OdaHp7.js → transcription-Hb3VnC4M.js} +13 -13
  250. package/dist/src/{transform-DuHvhZpj.cjs → transform-0BwoBsvO.cjs} +19 -5
  251. package/dist/src/{transform-uAytVuyX.js → transform-B2-jIv68.js} +8 -6
  252. package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
  253. package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
  254. package/dist/src/transform-ChNIpHz7.js +6 -0
  255. package/dist/src/{transform-D5HsjduX.js → transform-DrleutM3.js} +8 -6
  256. package/dist/src/{transform-vNucnNr0.js → transform-DyDAwEpE.js} +8 -6
  257. package/dist/src/transform-PtQ6rAE3.cjs +5 -0
  258. package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
  259. package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
  260. package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
  261. package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
  262. package/dist/src/{types-Cbd8uOMq.js → types-CIhFeUC4.js} +7 -1
  263. package/dist/src/{types-CzW2QFyi.js → types-Cd3ygw8W.js} +7 -1
  264. package/dist/src/{types-C_7nyzr1.cjs → types-D8cGDZbL.cjs} +8 -2
  265. package/dist/src/{types-DmyIJ-sR.js → types-q8GXGF65.js} +7 -1
  266. package/dist/src/{util-DGNOS1db.cjs → util--9u9UVCt.cjs} +3 -3
  267. package/dist/src/{util-ZzmqNPlg.js → util-BLvy9qfE.js} +7 -7
  268. package/dist/src/{util-C1CeHl-P.js → util-Bm3E9jpK.js} +7 -7
  269. package/dist/src/{util-BV4XUC0n.js → util-BtoGs5Cb.js} +18 -4
  270. package/dist/src/{util-BzMcevZc.cjs → util-CFj4YKIn.cjs} +18 -4
  271. package/dist/src/{util-BRYkYPTd.js → util-CMMkIxfU.js} +7 -7
  272. package/dist/src/{util-Dnmk2mBQ.js → util-CgDCK4KI.js} +18 -4
  273. package/dist/src/{util-B9vlHIIh.cjs → util-CuLo2pMR.cjs} +7 -7
  274. package/dist/src/{util-CMy69ZgQ.js → util-DM2rTn_6.js} +18 -4
  275. package/dist/src/{util-B3xGByQh.js → util-DMFeUvLz.js} +3 -3
  276. package/dist/src/{util-BHGHw5G1.js → util-DbVG-yZU.js} +3 -3
  277. package/dist/src/{util-Bv6uGDfH.js → util-vNmDL5DT.js} +3 -3
  278. package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
  279. package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
  280. package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
  281. package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
  282. package/dist/tsconfig.tsbuildinfo +1 -1
  283. package/package.json +32 -32
  284. package/dist/src/app/assets/index-4LKxG2CG.js +0 -439
  285. package/dist/src/app/assets/index-C3zcsZFQ.css +0 -1
  286. package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
  287. package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
  288. package/dist/src/app/tsconfig.app.tsbuildinfo +0 -1
  289. package/dist/src/cache-CeUpFm3M.cjs +0 -5
  290. package/dist/src/cache-n-RCJ-hL.js +0 -6
  291. package/dist/src/cloud-BBh91EUK.js +0 -4
  292. package/dist/src/eval-B3r2CVXr.js +0 -15
  293. package/dist/src/evalResult-5xwYnECe.js +0 -12
  294. package/dist/src/evalResult-71lY93Kj.cjs +0 -10
  295. package/dist/src/evalResult-Dx5P5cIv.js +0 -10
  296. package/dist/src/evaluator-Jx6bRZV6.js +0 -36
  297. package/dist/src/fetch-BxNb_Lp3.js +0 -5
  298. package/dist/src/graders-B_pgMLS2.js +0 -34
  299. package/dist/src/graders-DErokPDO.cjs +0 -32
  300. package/dist/src/graders-DR_uNe54.js +0 -32
  301. package/dist/src/graders-w3176Wz-.js +0 -32
  302. package/dist/src/providers-B7V0njNs.js +0 -32
  303. package/dist/src/providers-BEwbhv0X.js +0 -30
  304. package/dist/src/providers-CH3C7zf7.js +0 -30
  305. package/dist/src/providers-zyB6k_38.cjs +0 -31
  306. package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
  307. package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
  308. package/dist/src/server-DA4Cyrrq.js +0 -7
  309. package/dist/src/server-Dulb-4-K.cjs +0 -5
  310. package/dist/src/store-CXS-Q_91.js +0 -6
  311. package/dist/src/store-eYkaKMwq.cjs +0 -5
  312. package/dist/src/telemetry-BpMfhthR.cjs +0 -5
  313. package/dist/src/telemetry-Dw38hanS.js +0 -7
  314. package/dist/src/transform-DTGDnAzW.js +0 -6
  315. package/dist/src/transform-m3qNw4KP.cjs +0 -5
@@ -2,43 +2,43 @@ Object.defineProperties(exports, {
2
2
  __esModule: { value: true },
3
3
  [Symbol.toStringTag]: { value: "Module" }
4
4
  });
5
- const require_logger = require("./logger-Cp1GPUjj.cjs");
5
+ const require_logger = require("./logger-D5iKBpu_.cjs");
6
6
  const require_invariant = require("./invariant-kfQ8Bu82.cjs");
7
- const require_esm = require("./esm-CnNt7sI4.cjs");
8
- const require_pythonUtils = require("./pythonUtils-CTU3Y3lw.cjs");
7
+ const require_esm = require("./esm-CipptfDu.cjs");
8
+ const require_pythonUtils = require("./pythonUtils-dAVigVK-.cjs");
9
9
  const require_fileExtensions = require("./fileExtensions-bYh77CN8.cjs");
10
- const require_transform = require("./transform-CzK1Q0zl.cjs");
11
- const require_graders = require("./graders-DU49_J8Y.cjs");
12
- const require_types = require("./types-C_7nyzr1.cjs");
13
- const require_util = require("./util-B9vlHIIh.cjs");
14
- const require_fetch = require("./fetch-BPkYtG8K.cjs");
15
- const require_cache = require("./cache-Dh5WtQps.cjs");
16
- const require_providers = require("./providers-CgKOSgTR.cjs");
17
- const require_utils = require("./utils-Cz9qXqII.cjs");
18
- const require_createHash = require("./createHash-CfZSc0b4.cjs");
19
- require("./genaiTracer-DN4dQywX.cjs");
20
- const require_chat = require("./chat-CgF-J-Jj.cjs");
10
+ const require_transform = require("./transform-ZrG2dvlo.cjs");
11
+ const require_graders = require("./graders-BElhu9ZY.cjs");
12
+ const require_types = require("./types-D8cGDZbL.cjs");
13
+ const require_util = require("./util-CuLo2pMR.cjs");
14
+ const require_fetch = require("./fetch-BnR9wSnm.cjs");
15
+ const require_cache = require("./cache-C5yFZ4gC.cjs");
16
+ const require_providers = require("./providers-CScd1wN6.cjs");
17
+ const require_utils = require("./utils-DKw8mrgr.cjs");
18
+ const require_createHash = require("./createHash-BYwImsYv.cjs");
19
+ require("./genaiTracer-BfxrvSUb.cjs");
20
+ const require_chat = require("./chat-CM8qWR3_.cjs");
21
21
  const require_tokenUsageUtils = require("./tokenUsageUtils-bVa1ga6f.cjs");
22
- const require_transform$1 = require("./transform-DuHvhZpj.cjs");
23
- require("./messages-Bs1kC7P4.cjs");
24
- require("./util-DGNOS1db.cjs");
25
- require("./responses-D8SBTL64.cjs");
26
- require("./openai-Cuif0GEt.cjs");
27
- const require_util$2 = require("./util-BzMcevZc.cjs");
28
- require("./completion-CM6oK8PS.cjs");
29
- const require_accounts = require("./accounts-gtkH-5KX.cjs");
30
- const require_server = require("./server-C_7Ax-hA.cjs");
31
- const require_blobs = require("./blobs-D23XLin-.cjs");
32
- const require_tables = require("./tables-DPi7wKeM.cjs");
33
- const require_extractor = require("./extractor-pYLLi3wS.cjs");
34
- const require_telemetry = require("./telemetry-DaX14Chu.cjs");
22
+ const require_transform$1 = require("./transform-0BwoBsvO.cjs");
23
+ require("./messages-HJsyEh4o.cjs");
24
+ require("./util--9u9UVCt.cjs");
25
+ require("./responses-mo0KQDbu.cjs");
26
+ require("./openai-CoxGAQwn.cjs");
27
+ const require_util$2 = require("./util-CFj4YKIn.cjs");
28
+ require("./completion-DlXUhj5c.cjs");
29
+ const require_accounts = require("./accounts-BPyfpSeU.cjs");
30
+ const require_server = require("./server-BtoCXeXI.cjs");
31
+ const require_blobs = require("./blobs-C6j0bvFz.cjs");
32
+ const require_tables = require("./tables-BdZQEpRz.cjs");
33
+ const require_extractor = require("./extractor-DG3sSfXE.cjs");
34
+ const require_telemetry = require("./telemetry-re627Lre.cjs");
35
35
  const require_text = require("./text-CW1cyrwj.cjs");
36
- const require_store = require("./store-DLlFCC4h.cjs");
37
- require("./base-CGrhspbK.cjs");
38
- require("./image-BLmROtN3.cjs");
39
- const require_providerRegistry = require("./providerRegistry-Civky8Ar.cjs");
40
- const require_rubyUtils = require("./rubyUtils-DhCAlxZr.cjs");
41
- const require_evalResult = require("./evalResult-fuaI8HkH.cjs");
36
+ const require_store = require("./store-CLyU7AtI.cjs");
37
+ require("./base-BboXIF_0.cjs");
38
+ require("./image--F58eEIn.cjs");
39
+ const require_providerRegistry = require("./providerRegistry-BTDgfV5h.cjs");
40
+ const require_rubyUtils = require("./rubyUtils-CGeUtCfW.cjs");
41
+ const require_evalResult = require("./evalResult-DpARzUCb.cjs");
42
42
  let fs = require("fs");
43
43
  fs = require_logger.__toESM(fs);
44
44
  let path = require("path");
@@ -68,6 +68,8 @@ crypto$1 = require_logger.__toESM(crypto$1);
68
68
  let _opentelemetry_api = require("@opentelemetry/api");
69
69
  let _inquirer_input = require("@inquirer/input");
70
70
  _inquirer_input = require_logger.__toESM(_inquirer_input);
71
+ let readline = require("readline");
72
+ readline = require_logger.__toESM(readline);
71
73
  let drizzle_orm = require("drizzle-orm");
72
74
  let cli_progress = require("cli-progress");
73
75
  cli_progress = require_logger.__toESM(cli_progress);
@@ -75,6 +77,7 @@ let jsdom = require("jsdom");
75
77
  let fastest_levenshtein = require("fastest-levenshtein");
76
78
  let js_rouge = require("js-rouge");
77
79
  js_rouge = require_logger.__toESM(js_rouge);
80
+ let node_util = require("node:util");
78
81
  require("debounce");
79
82
  let _opentelemetry_core = require("@opentelemetry/core");
80
83
  let _opentelemetry_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http");
@@ -307,7 +310,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
307
310
  require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
308
311
  try {
309
312
  require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
310
- const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-CGq6LspY.cjs"));
313
+ const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DHKqJlsz.cjs"));
311
314
  const port = testSuite.tracing.otlp.http.port || 4318;
312
315
  const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
313
316
  require_logger.logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
@@ -330,7 +333,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
330
333
  async function stopOtlpReceiverIfNeeded() {
331
334
  if (otlpReceiverStarted) try {
332
335
  require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
333
- const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-CGq6LspY.cjs"));
336
+ const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DHKqJlsz.cjs"));
334
337
  await stopOTLPReceiver();
335
338
  otlpReceiverStarted = false;
336
339
  require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
@@ -365,7 +368,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
365
368
  }
366
369
  if (!tracingEnabled) return null;
367
370
  require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
368
- const { getTraceStore } = await Promise.resolve().then(() => require("./store-eYkaKMwq.cjs"));
371
+ const { getTraceStore } = await Promise.resolve().then(() => require("./store-CNHk-De4.cjs"));
369
372
  const traceStore = getTraceStore();
370
373
  const traceId = generateTraceId();
371
374
  const spanId = generateSpanId();
@@ -1398,7 +1401,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
1398
1401
  pass = result !== inverse;
1399
1402
  score = pass ? 1 : 0;
1400
1403
  } else if (typeof result === "number") {
1401
- pass = assertion.threshold !== void 0 ? result >= assertion.threshold : result > 0;
1404
+ pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
1402
1405
  score = result;
1403
1406
  } else if (typeof result === "object") return result;
1404
1407
  else throw new Error("Custom function must return a boolean or number");
@@ -1664,7 +1667,7 @@ function handlePerplexity({ logProbs, assertion }) {
1664
1667
  if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
1665
1668
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
1666
1669
  const perplexity = Math.exp(-avgLogProb);
1667
- const pass = assertion.threshold !== void 0 ? perplexity <= assertion.threshold : true;
1670
+ const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
1668
1671
  return {
1669
1672
  pass,
1670
1673
  score: pass ? 1 : 0,
@@ -1676,7 +1679,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
1676
1679
  if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
1677
1680
  const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
1678
1681
  const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
1679
- const pass = assertion.threshold !== void 0 ? perplexityNorm >= assertion.threshold : true;
1682
+ const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
1680
1683
  return {
1681
1684
  pass,
1682
1685
  score: perplexityNorm,
@@ -1791,7 +1794,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
1791
1794
  } else {
1792
1795
  score = Number.parseFloat(String(result));
1793
1796
  if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
1794
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
1797
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
1795
1798
  }
1796
1799
  } catch (err) {
1797
1800
  return {
@@ -2052,7 +2055,7 @@ end
2052
2055
  } else {
2053
2056
  score = Number.parseFloat(String(result));
2054
2057
  if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
2055
- pass = assertion.threshold !== void 0 ? score >= assertion.threshold : score > 0;
2058
+ pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
2056
2059
  }
2057
2060
  } catch (err) {
2058
2061
  return {
@@ -2123,6 +2126,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
2123
2126
  };
2124
2127
  };
2125
2128
  //#endregion
2129
+ //#region src/assertions/traceUtils.ts
2130
+ /**
2131
+ * Shared utilities for trace assertions
2132
+ */
2133
+ /**
2134
+ * Match a span name against a glob-like pattern.
2135
+ * Supports * (any characters) and ? (single character) wildcards.
2136
+ *
2137
+ * @param spanName - The span name to match
2138
+ * @param pattern - The glob pattern to match against
2139
+ * @returns true if the span name matches the pattern
2140
+ */
2141
+ function matchesPattern(spanName, pattern) {
2142
+ const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
2143
+ return new RegExp(`^${regexPattern}$`, "i").test(spanName);
2144
+ }
2145
+ //#endregion
2146
+ //#region src/assertions/skill.ts
2147
+ function getSkillCalls(params) {
2148
+ const rawSkillCalls = params.providerResponse?.metadata?.skillCalls;
2149
+ if (!Array.isArray(rawSkillCalls)) return [];
2150
+ return rawSkillCalls.filter((entry) => Boolean(entry) && typeof entry === "object" && typeof entry.name === "string");
2151
+ }
2152
+ function matchesSkill(skillCall, matcher) {
2153
+ if (matcher.name && skillCall.name !== matcher.name) return false;
2154
+ if (matcher.pattern && !matchesPattern(skillCall.name, matcher.pattern)) return false;
2155
+ return true;
2156
+ }
2157
+ function formatSkillCall(skillCall) {
2158
+ const details = [skillCall.source, skillCall.path].filter(Boolean).join(", ");
2159
+ return details ? `${skillCall.name} (${details})` : skillCall.name;
2160
+ }
2161
+ function resolveSkillMatchers(value) {
2162
+ const normalizeText = (text) => typeof text === "string" ? text.trim() : void 0;
2163
+ const validateCount = (field, count) => {
2164
+ if (!Number.isFinite(count) || !Number.isInteger(count) || count < 0) throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
2165
+ };
2166
+ if (typeof value === "string" && value.trim()) return {
2167
+ kind: "list",
2168
+ matchers: [{ name: normalizeText(value) }]
2169
+ };
2170
+ if (Array.isArray(value) && value.length > 0 && value.every((item) => typeof item === "string" && item.trim())) return {
2171
+ kind: "list",
2172
+ matchers: value.map((item) => ({ name: item.trim() }))
2173
+ };
2174
+ if (value && typeof value === "object" && !Array.isArray(value)) {
2175
+ const rawMatcher = value;
2176
+ const matcher = rawMatcher;
2177
+ const name = normalizeText(matcher.name);
2178
+ const pattern = normalizeText(matcher.pattern);
2179
+ if (!name && !pattern) throw new Error("skill-used assertion object must include a name or pattern property");
2180
+ if ("min" in rawMatcher) validateCount("min", matcher.min);
2181
+ if ("max" in rawMatcher) validateCount("max", matcher.max);
2182
+ if (typeof matcher.min === "number" && typeof matcher.max === "number" && matcher.max < matcher.min) throw new Error("skill-used assertion object max must be greater than or equal to min");
2183
+ return {
2184
+ kind: "count",
2185
+ matcher: {
2186
+ max: typeof matcher.max === "number" ? matcher.max : void 0,
2187
+ min: typeof matcher.min === "number" ? matcher.min : void 0,
2188
+ name,
2189
+ pattern
2190
+ }
2191
+ };
2192
+ }
2193
+ throw new Error("skill-used assertion must have a string, string array, or object value");
2194
+ }
2195
+ function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
2196
+ const missing = expected.matchers.filter((matcher) => !skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
2197
+ const matched = expected.matchers.filter((matcher) => skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
2198
+ const pass = params.inverse ? matched.length === 0 : missing.length === 0;
2199
+ const expectedSkills = expected.matchers.map((matcher) => matcher.name);
2200
+ const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
2201
+ let reason;
2202
+ if (params.inverse) reason = pass ? `Forbidden skill(s) were not used: ${expectedSkills.join(", ")}` : `Forbidden skill(s) were used: ${matched.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
2203
+ else if (pass) reason = `Observed required skill(s): ${expectedSkills.join(", ")}. Actual skills: ${actualSummary}`;
2204
+ else reason = `Missing required skill(s): ${missing.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
2205
+ return {
2206
+ pass,
2207
+ score: pass ? 1 : 0,
2208
+ reason,
2209
+ assertion: params.assertion
2210
+ };
2211
+ }
2212
+ function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
2213
+ const hasExplicitMin = matcher.min !== void 0;
2214
+ const hasExplicitMax = matcher.max !== void 0;
2215
+ const min = matcher.min ?? (hasExplicitMax ? 0 : 1);
2216
+ const max = matcher.max;
2217
+ const matchingSkillCalls = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
2218
+ const count = matchingSkillCalls.length;
2219
+ const matcherLabel = matcher.pattern || matcher.name || "*";
2220
+ if (params.inverse) {
2221
+ if (hasExplicitMin || hasExplicitMax && max !== 0) throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
2222
+ const pass = count === 0;
2223
+ const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
2224
+ return {
2225
+ pass,
2226
+ score: pass ? 1 : 0,
2227
+ reason: pass ? `Forbidden skill "${matcherLabel}" was not used. Actual skills: ${actualSummary}` : `Forbidden skill "${matcherLabel}" was used ${count} time(s). Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`,
2228
+ assertion: params.assertion
2229
+ };
2230
+ }
2231
+ const pass = count >= min && (max === void 0 || count <= max);
2232
+ let reason = `Matched skill "${matcherLabel}" ${count} time(s)`;
2233
+ reason += max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
2234
+ if (matchingSkillCalls.length > 0) reason += `. Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`;
2235
+ return {
2236
+ pass,
2237
+ score: pass ? 1 : 0,
2238
+ reason,
2239
+ assertion: params.assertion
2240
+ };
2241
+ }
2242
+ function handleSkillUsed(params) {
2243
+ const skillCalls = getSkillCalls(params);
2244
+ const actualSkills = skillCalls.map(formatSkillCall);
2245
+ const expected = resolveSkillMatchers(params.renderedValue ?? params.assertion.value);
2246
+ if (expected.kind === "list") return handleListSkillAssertion(params, skillCalls, actualSkills, expected);
2247
+ return handleCountSkillAssertion(params, skillCalls, actualSkills, expected.matcher);
2248
+ }
2249
+ //#endregion
2126
2250
  //#region src/assertions/sql.ts
2127
2251
  const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
2128
2252
  let pass = false;
@@ -2355,23 +2479,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
2355
2479
  };
2356
2480
  };
2357
2481
  //#endregion
2358
- //#region src/assertions/traceUtils.ts
2359
- /**
2360
- * Shared utilities for trace assertions
2361
- */
2362
- /**
2363
- * Match a span name against a glob-like pattern.
2364
- * Supports * (any characters) and ? (single character) wildcards.
2365
- *
2366
- * @param spanName - The span name to match
2367
- * @param pattern - The glob pattern to match against
2368
- * @returns true if the span name matches the pattern
2369
- */
2370
- function matchesPattern(spanName, pattern) {
2371
- const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
2372
- return new RegExp(`^${regexPattern}$`, "i").test(spanName);
2373
- }
2374
- //#endregion
2375
2482
  //#region src/assertions/traceErrorSpans.ts
2376
2483
  function isErrorSpan(span) {
2377
2484
  if (span.statusCode && span.statusCode >= 400) return true;
@@ -2540,6 +2647,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
2540
2647
  };
2541
2648
  };
2542
2649
  //#endregion
2650
+ //#region src/assertions/trajectoryUtils.ts
2651
+ const TOOL_ATTRIBUTE_KEYS = [
2652
+ "tool.name",
2653
+ "tool_name",
2654
+ "tool",
2655
+ "function.name",
2656
+ "function_name",
2657
+ "gen_ai.tool.name",
2658
+ "codex.mcp.tool",
2659
+ "agent.tool",
2660
+ "agent.tool_name",
2661
+ "agent.toolName"
2662
+ ];
2663
+ const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
2664
+ "tool.arguments",
2665
+ "tool.args",
2666
+ "tool.input",
2667
+ "tool_arguments",
2668
+ "tool_args",
2669
+ "tool_input",
2670
+ "function.arguments",
2671
+ "function.args",
2672
+ "function.input",
2673
+ "function_arguments",
2674
+ "function_args",
2675
+ "gen_ai.tool.arguments",
2676
+ "gen_ai.tool.args",
2677
+ "gen_ai.tool.input",
2678
+ "gen_ai.tool.call.arguments",
2679
+ "gen_ai.tool.call.args",
2680
+ "agent.tool.arguments",
2681
+ "agent.tool.args",
2682
+ "agent.tool.input",
2683
+ "codex.mcp.arguments",
2684
+ "codex.mcp.args",
2685
+ "codex.mcp.input",
2686
+ "arguments",
2687
+ "args",
2688
+ "input"
2689
+ ];
2690
+ const COMMAND_ATTRIBUTE_KEYS = [
2691
+ "codex.command",
2692
+ "command",
2693
+ "command.name",
2694
+ "command_name"
2695
+ ];
2696
+ const SEARCH_ATTRIBUTE_KEYS = [
2697
+ "codex.search.query",
2698
+ "search.query",
2699
+ "search_query"
2700
+ ];
2701
+ const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
2702
+ const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
2703
+ const MAX_JUDGE_SUMMARY_STEPS = 24;
2704
+ const JUDGE_SUMMARY_HEAD_STEPS = 12;
2705
+ const JUDGE_SUMMARY_TAIL_STEPS = 12;
2706
+ function getStringAttribute(attributes, keys) {
2707
+ for (const key of keys) {
2708
+ const value = attributes[key];
2709
+ if (typeof value === "string" && value.trim()) return value.trim();
2710
+ }
2711
+ }
2712
+ function normalizeStructuredAttribute(value) {
2713
+ if (value === void 0 || value === null) return;
2714
+ if (typeof value === "string") {
2715
+ const trimmed = value.trim();
2716
+ if (!trimmed) return;
2717
+ try {
2718
+ return JSON.parse(trimmed);
2719
+ } catch {
2720
+ return trimmed;
2721
+ }
2722
+ }
2723
+ if (typeof value === "number" || typeof value === "boolean" || typeof value === "object") return value;
2724
+ }
2725
+ function hasSameStatus(left, right) {
2726
+ return left?.code === right?.code && left?.message === right?.message;
2727
+ }
2728
+ function isSearchLikeSpan(span) {
2729
+ const attributes = span.attributes || {};
2730
+ if (SEARCH_SPAN_NAME_PATTERN.test(span.name) || span.name.startsWith("search ")) return true;
2731
+ return Object.keys(attributes).some((key) => key !== "query" && /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i.test(key));
2732
+ }
2733
+ function getTrajectoryStepStatus(step) {
2734
+ if (step.statusCode === void 0 || step.statusCode === 0) return;
2735
+ return {
2736
+ code: step.statusCode,
2737
+ ...step.statusMessage ? { message: step.statusMessage } : {}
2738
+ };
2739
+ }
2740
+ function getCommandExecutable(command) {
2741
+ return command.trim().split(/\s+/)[0] || void 0;
2742
+ }
2743
+ function extractToolName(span) {
2744
+ const attributes = span.attributes || {};
2745
+ const directMatch = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
2746
+ if (directMatch) return directMatch;
2747
+ for (const [key, value] of Object.entries(attributes)) {
2748
+ if (typeof value !== "string" || !value.trim()) continue;
2749
+ if (/tool.?name|function.?name/i.test(key)) return value.trim();
2750
+ if (/(^|[._])tool($|[._])/i.test(key) && !/result|output/i.test(key)) return value.trim();
2751
+ }
2752
+ if (span.name.startsWith("mcp ")) {
2753
+ const slashIndex = span.name.lastIndexOf("/");
2754
+ if (slashIndex !== -1 && slashIndex < span.name.length - 1) return span.name.slice(slashIndex + 1).trim();
2755
+ }
2756
+ }
2757
+ function extractToolArgs(span) {
2758
+ const attributes = span.attributes || {};
2759
+ for (const key of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
2760
+ const value = normalizeStructuredAttribute(attributes[key]);
2761
+ if (value !== void 0) return value;
2762
+ }
2763
+ for (const [key, rawValue] of Object.entries(attributes)) {
2764
+ if (/result|output|error|status/i.test(key)) continue;
2765
+ if (!/(^|[._])(arguments|args|input)($|[._])/i.test(key)) continue;
2766
+ const value = normalizeStructuredAttribute(rawValue);
2767
+ if (value !== void 0) return value;
2768
+ }
2769
+ }
2770
+ function extractCommand(span) {
2771
+ const attributes = span.attributes || {};
2772
+ const directMatch = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
2773
+ if (directMatch) return directMatch;
2774
+ for (const [key, value] of Object.entries(attributes)) {
2775
+ if (typeof value !== "string" || !value.trim()) continue;
2776
+ if (/command/i.test(key) && !/output|result/i.test(key)) return value.trim();
2777
+ }
2778
+ if (span.name.startsWith("exec ")) return span.name.slice(5).trim();
2779
+ }
2780
+ function extractSearchQuery(span) {
2781
+ const attributes = span.attributes || {};
2782
+ const directMatch = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
2783
+ if (directMatch) return directMatch;
2784
+ const genericQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
2785
+ if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
2786
+ if (span.name.startsWith("search ")) return span.name.slice(7).replace(/^"|"$/g, "").trim();
2787
+ }
2788
+ function isReasoningSpan(span) {
2789
+ if ((span.attributes || {})["codex.item.type"] === "reasoning") return true;
2790
+ return /^reasoning([_\s]|$)/i.test(span.name) || span.name === "reasoning";
2791
+ }
2792
+ function isMessageSpan(span) {
2793
+ if ((span.attributes || {})["codex.item.type"] === "agent_message") return true;
2794
+ return span.name === "agent response" || span.name === "send input";
2795
+ }
2796
+ function extractTrajectorySteps(trace) {
2797
+ return [...trace.spans || []].map((span, index) => ({
2798
+ span,
2799
+ index
2800
+ })).sort((left, right) => {
2801
+ const timeDiff = left.span.startTime - right.span.startTime;
2802
+ if (timeDiff !== 0) return timeDiff;
2803
+ const endDiff = (left.span.endTime ?? left.span.startTime) - (right.span.endTime ?? right.span.startTime);
2804
+ if (endDiff !== 0) return endDiff;
2805
+ return left.index - right.index;
2806
+ }).map(({ span }) => {
2807
+ const toolName = extractToolName(span);
2808
+ const command = extractCommand(span);
2809
+ const searchQuery = extractSearchQuery(span);
2810
+ let type = "span";
2811
+ let name = span.name;
2812
+ const aliases = new Set([span.name]);
2813
+ let args;
2814
+ if (toolName) {
2815
+ type = "tool";
2816
+ name = toolName;
2817
+ aliases.add(toolName);
2818
+ args = extractToolArgs(span);
2819
+ } else if (command) {
2820
+ type = "command";
2821
+ name = command;
2822
+ aliases.add(command);
2823
+ const executable = getCommandExecutable(command);
2824
+ if (executable) aliases.add(executable);
2825
+ } else if (searchQuery) {
2826
+ type = "search";
2827
+ name = searchQuery;
2828
+ aliases.add(searchQuery);
2829
+ } else if (isReasoningSpan(span)) {
2830
+ type = "reasoning";
2831
+ name = span.name;
2832
+ aliases.add("reasoning");
2833
+ } else if (isMessageSpan(span)) {
2834
+ type = "message";
2835
+ name = span.name;
2836
+ aliases.add("message");
2837
+ }
2838
+ return {
2839
+ aliases: [...aliases],
2840
+ ...args === void 0 ? {} : { args },
2841
+ attributes: span.attributes || {},
2842
+ endTime: span.endTime,
2843
+ name,
2844
+ spanId: span.spanId,
2845
+ spanName: span.name,
2846
+ startTime: span.startTime,
2847
+ statusCode: span.statusCode,
2848
+ statusMessage: span.statusMessage,
2849
+ type
2850
+ };
2851
+ });
2852
+ }
2853
+ function normalizeTrajectoryMatcher(matcher, defaultType) {
2854
+ if (typeof matcher === "string") return {
2855
+ pattern: matcher,
2856
+ ...defaultType ? { type: defaultType } : {}
2857
+ };
2858
+ return {
2859
+ ...matcher,
2860
+ ...matcher.type ? {} : defaultType ? { type: defaultType } : {}
2861
+ };
2862
+ }
2863
+ function matchesTrajectoryStep(step, matcher, defaultType) {
2864
+ const { type, pattern, name } = normalizeTrajectoryMatcher(matcher, defaultType);
2865
+ if (type) {
2866
+ if (!(Array.isArray(type) ? type : [type]).includes(step.type)) return false;
2867
+ }
2868
+ const matchPattern = pattern || name;
2869
+ if (!matchPattern) return true;
2870
+ return step.aliases.some((alias) => matchesPattern(alias, matchPattern));
2871
+ }
2872
+ function formatTrajectoryStep(step) {
2873
+ return `${step.type}:${step.name}`;
2874
+ }
2875
+ function formatTrajectoryArgs(args) {
2876
+ if (args === void 0) return "(none)";
2877
+ try {
2878
+ const serialized = JSON.stringify(args);
2879
+ if (serialized !== void 0) return serialized;
2880
+ } catch {}
2881
+ return String(args);
2882
+ }
2883
+ function compactJudgeTrajectorySteps(steps) {
2884
+ const compacted = [];
2885
+ for (const step of steps) {
2886
+ const previousStep = compacted[compacted.length - 1];
2887
+ if (previousStep && previousStep.type === step.type && previousStep.name === step.name && previousStep.spanName === step.spanName && hasSameStatus(previousStep.status, step.status)) {
2888
+ previousStep.collapsedCount = (previousStep.collapsedCount ?? 1) + 1;
2889
+ continue;
2890
+ }
2891
+ compacted.push(step);
2892
+ }
2893
+ return compacted;
2894
+ }
2895
+ function truncateJudgeTrajectorySteps(steps) {
2896
+ if (steps.length <= MAX_JUDGE_SUMMARY_STEPS) return steps;
2897
+ return [
2898
+ ...steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS),
2899
+ { omittedCount: steps.length - MAX_JUDGE_SUMMARY_STEPS },
2900
+ ...steps.slice(-JUDGE_SUMMARY_TAIL_STEPS)
2901
+ ];
2902
+ }
2903
+ function summarizeTrajectoryForJudge(trace) {
2904
+ const rawSteps = extractTrajectorySteps(trace).map((step, index) => ({
2905
+ index: index + 1,
2906
+ type: step.type,
2907
+ name: step.name,
2908
+ ...step.spanName === step.name ? {} : { spanName: step.spanName },
2909
+ ...getTrajectoryStepStatus(step) ? { status: getTrajectoryStepStatus(step) } : {}
2910
+ }));
2911
+ const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
2912
+ const steps = truncateJudgeTrajectorySteps(compactedSteps);
2913
+ return JSON.stringify({
2914
+ traceId: trace.traceId,
2915
+ stepCount: rawSteps.length,
2916
+ compactedStepCount: compactedSteps.length,
2917
+ steps
2918
+ }, null, 2);
2919
+ }
2920
+ //#endregion
2921
+ //#region src/assertions/trajectory.ts
2922
+ function getTraceOrThrow(params) {
2923
+ const trace = params.assertionValueContext.trace;
2924
+ if (!trace || !trace.spans) throw new Error(`No trace data available for ${params.baseType} assertion`);
2925
+ return trace;
2926
+ }
2927
+ function applyInverse(pass, inverse) {
2928
+ return inverse ? !pass : pass;
2929
+ }
2930
+ function formatStepList(stepLabels) {
2931
+ return stepLabels.length > 0 ? stepLabels.join(", ") : "(none)";
2932
+ }
2933
+ function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
2934
+ if (matcher.pattern || matcher.name) return;
2935
+ const stepLabel = index === void 0 ? "object" : `step ${index + 1}`;
2936
+ throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
2937
+ }
2938
+ function resolveGoalSuccessValue(value) {
2939
+ if (typeof value === "string" && value.trim()) return { goal: value.trim() };
2940
+ if (value && typeof value === "object" && !Array.isArray(value) && typeof value.goal === "string" && value.goal.trim()) return { goal: value.goal.trim() };
2941
+ throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
2942
+ }
2943
+ function resolveToolMatchers(value) {
2944
+ if (typeof value === "string") return {
2945
+ kind: "list",
2946
+ matchers: [normalizeTrajectoryMatcher(value, "tool")]
2947
+ };
2948
+ if (Array.isArray(value) && value.every((item) => typeof item === "string")) return {
2949
+ kind: "list",
2950
+ matchers: value.map((item) => normalizeTrajectoryMatcher(item, "tool"))
2951
+ };
2952
+ if (value && typeof value === "object" && !Array.isArray(value)) return {
2953
+ kind: "count",
2954
+ matcher: {
2955
+ ...normalizeTrajectoryMatcher(value, "tool"),
2956
+ max: typeof value.max === "number" ? value.max : void 0,
2957
+ min: typeof value.min === "number" ? value.min : void 0
2958
+ }
2959
+ };
2960
+ throw new Error("trajectory:tool-used assertion must have a string, string array, or object value");
2961
+ }
2962
+ const handleTrajectoryToolUsed = (params) => {
2963
+ const steps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
2964
+ const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
2965
+ if (expected.kind === "list") {
2966
+ if (expected.matchers.length === 0) throw new Error("trajectory:tool-used assertion requires at least one expected tool");
2967
+ const missing = expected.matchers.filter((matcher) => !steps.some((step) => matchesTrajectoryStep(step, matcher)));
2968
+ const matched = expected.matchers.filter((matcher) => steps.some((step) => matchesTrajectoryStep(step, matcher)));
2969
+ const pass = params.inverse ? matched.length === 0 : missing.length === 0;
2970
+ const actualTools = steps.map(formatTrajectoryStep);
2971
+ const expectedTools = expected.matchers.map((matcher) => matcher.pattern || matcher.name || "*");
2972
+ let reason;
2973
+ if (params.inverse) reason = pass ? `Forbidden tool(s) were not used: ${expectedTools.join(", ")}` : `Forbidden tool(s) were used: ${matched.map((matcher) => matcher.pattern || matcher.name || "*").join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
2974
+ else if (pass) reason = `Observed required tool(s): ${expectedTools.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
2975
+ else reason = `Missing required tool(s): ${missing.map((matcher) => matcher.pattern || matcher.name || "*").join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
2976
+ return {
2977
+ pass,
2978
+ score: pass ? 1 : 0,
2979
+ reason,
2980
+ assertion: params.assertion
2981
+ };
2982
+ }
2983
+ const matcher = expected.matcher;
2984
+ const min = matcher.min ?? 1;
2985
+ const max = matcher.max;
2986
+ if (!matcher.pattern && !matcher.name) throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
2987
+ const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
2988
+ const count = matchingSteps.length;
2989
+ const basePass = count >= min && (max === void 0 || count <= max);
2990
+ const pass = applyInverse(basePass, params.inverse);
2991
+ const matcherLabel = matcher.pattern || matcher.name || "*";
2992
+ let reason = `Matched tool "${matcherLabel}" ${count} time(s)`;
2993
+ if (max === void 0) reason += ` (expected at least ${min})`;
2994
+ else reason += ` (expected ${min}-${max})`;
2995
+ if (matchingSteps.length > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
2996
+ if (params.inverse) reason = basePass ? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion` : `Tool "${matcherLabel}" did not satisfy the forbidden match condition`;
2997
+ return {
2998
+ pass,
2999
+ score: pass ? 1 : 0,
3000
+ reason,
3001
+ assertion: params.assertion
3002
+ };
3003
+ };
3004
+ function resolveSequenceValue(value) {
3005
+ if (Array.isArray(value)) return {
3006
+ mode: "in_order",
3007
+ steps: value
3008
+ };
3009
+ if (value && typeof value === "object" && !Array.isArray(value)) {
3010
+ const sequenceValue = value;
3011
+ return {
3012
+ mode: sequenceValue.mode || "in_order",
3013
+ steps: sequenceValue.steps || []
3014
+ };
3015
+ }
3016
+ throw new Error("trajectory:tool-sequence assertion must have an array or object value");
3017
+ }
3018
+ function isRecord(value) {
3019
+ return typeof value === "object" && value !== null && !Array.isArray(value);
3020
+ }
3021
+ function matchesExpectedArgsPartial(actual, expected) {
3022
+ if (Array.isArray(expected)) return Array.isArray(actual) && actual.length === expected.length && expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
3023
+ if (isRecord(expected)) {
3024
+ if (!isRecord(actual)) return false;
3025
+ return Object.entries(expected).every(([key, expectedValue]) => Object.prototype.hasOwnProperty.call(actual, key) && matchesExpectedArgsPartial(actual[key], expectedValue));
3026
+ }
3027
+ return (0, node_util.isDeepStrictEqual)(actual, expected);
3028
+ }
3029
+ function matchesToolArgs(actual, expected, mode) {
3030
+ if (mode === "exact") return (0, node_util.isDeepStrictEqual)(actual, expected);
3031
+ return matchesExpectedArgsPartial(actual, expected);
3032
+ }
3033
+ function resolveToolArgsMatchMode(mode) {
3034
+ if (mode === void 0) return "partial";
3035
+ if (mode === "partial" || mode === "exact") return mode;
3036
+ throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
3037
+ }
3038
+ function resolveToolArgsMatchValue(value) {
3039
+ if (!value || typeof value !== "object" || Array.isArray(value)) throw new Error("trajectory:tool-args-match assertion must have an object value");
3040
+ const matcher = normalizeTrajectoryMatcher(value, "tool");
3041
+ requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");
3042
+ const expectedArgs = Object.prototype.hasOwnProperty.call(value, "args") ? value.args : value.arguments;
3043
+ if (expectedArgs === void 0) throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");
3044
+ return {
3045
+ matcher,
3046
+ expectedArgs,
3047
+ mode: resolveToolArgsMatchMode(value.mode)
3048
+ };
3049
+ }
3050
+ const handleTrajectoryToolSequence = (params) => {
3051
+ const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
3052
+ const value = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
3053
+ const expectedMatchers = value.steps.map((step, index) => {
3054
+ const matcher = normalizeTrajectoryMatcher(step, "tool");
3055
+ requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", index);
3056
+ return matcher;
3057
+ });
3058
+ if (expectedMatchers.length === 0) throw new Error("trajectory:tool-sequence assertion requires at least one expected step");
3059
+ const actualTools = toolSteps.map(formatTrajectoryStep);
3060
+ let basePass = false;
3061
+ let reason = "";
3062
+ if (value.mode === "exact") {
3063
+ basePass = toolSteps.length === expectedMatchers.length && expectedMatchers.every((matcher, index) => matchesTrajectoryStep(toolSteps[index], matcher));
3064
+ if (basePass) reason = `Observed exact tool sequence: ${formatStepList(actualTools)}`;
3065
+ else reason = `Expected exact tool sequence of ${expectedMatchers.map((matcher) => matcher.pattern || matcher.name || "*").join(", ")}, but actual tools were ${formatStepList(actualTools)}`;
3066
+ } else {
3067
+ let expectedIndex = 0;
3068
+ const matchedSteps = [];
3069
+ for (const step of toolSteps) {
3070
+ if (expectedIndex >= expectedMatchers.length) break;
3071
+ if (matchesTrajectoryStep(step, expectedMatchers[expectedIndex])) {
3072
+ matchedSteps.push(formatTrajectoryStep(step));
3073
+ expectedIndex += 1;
3074
+ }
3075
+ }
3076
+ basePass = expectedIndex === expectedMatchers.length;
3077
+ if (basePass) reason = `Observed tool sequence in order: ${matchedSteps.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
3078
+ else reason = `Expected tool "${expectedMatchers[expectedIndex]?.pattern || expectedMatchers[expectedIndex]?.name || "*"}" was not observed in order. Actual tools: ${formatStepList(actualTools)}`;
3079
+ }
3080
+ const pass = applyInverse(basePass, params.inverse);
3081
+ if (params.inverse) reason = basePass ? `Forbidden tool sequence was observed. Actual tools: ${formatStepList(actualTools)}` : `Forbidden tool sequence was not observed`;
3082
+ return {
3083
+ pass,
3084
+ score: pass ? 1 : 0,
3085
+ reason,
3086
+ assertion: params.assertion
3087
+ };
3088
+ };
3089
+ const handleTrajectoryToolArgsMatch = (params) => {
3090
+ const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
3091
+ const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
3092
+ const matcherLabel = matcher.pattern || matcher.name || "*";
3093
+ const actualTools = toolSteps.map(formatTrajectoryStep);
3094
+ const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
3095
+ const stepsWithArgs = matchingSteps.filter((step) => step.args !== void 0);
3096
+ const matchedStep = stepsWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));
3097
+ const basePass = matchedStep !== void 0;
3098
+ const pass = applyInverse(basePass, params.inverse);
3099
+ const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
3100
+ const observedArgsLabel = stepsWithArgs.length > 0 ? stepsWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ") : "(none)";
3101
+ let reason;
3102
+ if (params.inverse) if (basePass) reason = `Forbidden argument match for tool "${matcherLabel}" was observed on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
3103
+ else if (matchingSteps.length === 0) reason = `Forbidden argument match for tool "${matcherLabel}" was not observed because no tool call matched it`;
3104
+ else reason = `Forbidden argument match for tool "${matcherLabel}" was not observed. Observed args: ${observedArgsLabel}`;
3105
+ else if (basePass) reason = `Tool "${matcherLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
3106
+ else if (matchingSteps.length === 0) reason = `No tool call matched "${matcherLabel}". Actual tools: ${formatStepList(actualTools)}`;
3107
+ else if (stepsWithArgs.length === 0) reason = `Tool "${matcherLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(actualTools)}`;
3108
+ else reason = `No call to tool "${matcherLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
3109
+ return {
3110
+ pass,
3111
+ score: pass ? 1 : 0,
3112
+ reason,
3113
+ assertion: params.assertion
3114
+ };
3115
+ };
3116
+ function resolveStepCountValue(value) {
3117
+ if (!value || typeof value !== "object" || Array.isArray(value)) throw new Error("trajectory:step-count assertion must have an object value");
3118
+ return {
3119
+ ...normalizeTrajectoryMatcher(value),
3120
+ max: typeof value.max === "number" ? value.max : void 0,
3121
+ min: typeof value.min === "number" ? value.min : void 0
3122
+ };
3123
+ }
3124
+ const handleTrajectoryStepCount = (params) => {
3125
+ const steps = extractTrajectorySteps(getTraceOrThrow(params));
3126
+ const matcher = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
3127
+ const { min, max } = matcher;
3128
+ if (min === void 0 && max === void 0) throw new Error("trajectory:step-count assertion must include a min or max property");
3129
+ const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
3130
+ const count = matchingSteps.length;
3131
+ const basePass = (min === void 0 || count >= min) && (max === void 0 || count <= max);
3132
+ const pass = applyInverse(basePass, params.inverse);
3133
+ const filterParts = [];
3134
+ if (matcher.type) {
3135
+ const types = Array.isArray(matcher.type) ? matcher.type : [matcher.type];
3136
+ filterParts.push(`type=${types.join("|")}`);
3137
+ }
3138
+ const pattern = matcher.pattern || matcher.name;
3139
+ if (pattern) filterParts.push(`pattern=${pattern}`);
3140
+ let reason = `Matched ${count} trajectory step(s)`;
3141
+ if (filterParts.length > 0) reason += ` for ${filterParts.join(", ")}`;
3142
+ if (min !== void 0 && max !== void 0) reason += ` (expected ${min}-${max})`;
3143
+ else if (min !== void 0) reason += ` (expected at least ${min})`;
3144
+ else if (max !== void 0) reason += ` (expected at most ${max})`;
3145
+ if (matchingSteps.length > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
3146
+ if (params.inverse) reason = basePass ? `Trajectory step count satisfied the forbidden range` : `Trajectory step count did not satisfy the forbidden range`;
3147
+ return {
3148
+ pass,
3149
+ score: pass ? 1 : 0,
3150
+ reason,
3151
+ assertion: params.assertion
3152
+ };
3153
+ };
3154
+ const handleTrajectoryGoalSuccess = async (params) => {
3155
+ const trace = getTraceOrThrow(params);
3156
+ const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
3157
+ const result = await require_graders.matchesTrajectoryGoalSuccess(goal, summarizeTrajectoryForJudge(trace), params.outputString, params.test.options, params.assertionValueContext.vars, params.assertion, params.providerCallContext);
3158
+ if (!params.inverse) return result;
3159
+ return {
3160
+ ...result,
3161
+ assertion: params.assertion,
3162
+ pass: !result.pass,
3163
+ score: result.pass ? 0 : 1,
3164
+ reason: result.pass ? `Agent unexpectedly achieved the goal: ${goal}` : `Agent did not achieve the forbidden goal: ${goal}`
3165
+ };
3166
+ };
3167
+ //#endregion
2543
3168
  //#region src/assertions/webhook.ts
2544
3169
  async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
2545
3170
  require_invariant.invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
@@ -2608,18 +3233,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
2608
3233
  if (pass) reason = "Assertion passed";
2609
3234
  else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
2610
3235
  else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
2611
- } else if (min !== void 0) {
2612
- const basePass = wordCount >= min;
2613
- pass = inverse ? !basePass : basePass;
2614
- if (pass) reason = "Assertion passed";
2615
- else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
2616
- else reason = `Word count ${wordCount} is less than minimum ${min}`;
2617
- } else {
3236
+ } else if (min === void 0) {
2618
3237
  const basePass = wordCount <= max;
2619
3238
  pass = inverse ? !basePass : basePass;
2620
3239
  if (pass) reason = "Assertion passed";
2621
3240
  else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
2622
3241
  else reason = `Word count ${wordCount} is greater than maximum ${max}`;
3242
+ } else {
3243
+ const basePass = wordCount >= min;
3244
+ pass = inverse ? !basePass : basePass;
3245
+ if (pass) reason = "Assertion passed";
3246
+ else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
3247
+ else reason = `Word count ${wordCount} is less than minimum ${min}`;
2623
3248
  }
2624
3249
  } else {
2625
3250
  require_invariant.invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
@@ -2714,6 +3339,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
2714
3339
  //#endregion
2715
3340
  //#region src/assertions/index.ts
2716
3341
  const ASSERTIONS_MAX_CONCURRENCY = require_logger.getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
3342
+ const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
3343
+ const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
3344
+ const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
3345
+ const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;
3346
+ const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5e3;
3347
+ const MAX_TRACE_FETCH_STABLE_POLLS = 10;
2717
3348
  const MODEL_GRADED_ASSERTION_TYPES = new Set([
2718
3349
  "answer-relevance",
2719
3350
  "context-faithfulness",
@@ -2723,8 +3354,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
2723
3354
  "llm-rubric",
2724
3355
  "model-graded-closedqa",
2725
3356
  "model-graded-factuality",
2726
- "search-rubric"
3357
+ "search-rubric",
3358
+ "trajectory:goal-success"
3359
+ ]);
3360
+ const TRACE_AWARE_ASSERTION_TYPES = new Set([
3361
+ "javascript",
3362
+ "python",
3363
+ "ruby",
3364
+ "trace-error-spans",
3365
+ "trace-span-count",
3366
+ "trace-span-duration",
3367
+ "trajectory:goal-success",
3368
+ "trajectory:step-count",
3369
+ "trajectory:tool-args-match",
3370
+ "trajectory:tool-sequence",
3371
+ "trajectory:tool-used"
2727
3372
  ]);
3373
+ function assertionUsesTrace(assertion) {
3374
+ if (assertion.type === "assert-set") return assertion.assert.some(assertionUsesTrace);
3375
+ return TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
3376
+ }
3377
+ function assertionMayNeedTraceContext(assertion) {
3378
+ if (assertionUsesTrace(assertion)) return true;
3379
+ if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
3380
+ return typeof assertion.value === "string" ? assertion.value.startsWith("file://") || require_providers.isPackagePath(assertion.value) : false;
3381
+ }
3382
+ function hasTraceAwareAssertions(assertions) {
3383
+ return Boolean(assertions?.some(assertionMayNeedTraceContext));
3384
+ }
3385
+ async function loadTraceData(traceId) {
3386
+ const traceStore = require_store.getTraceStore();
3387
+ const maxAttempts = Math.min(MAX_TRACE_FETCH_MAX_ATTEMPTS, Math.max(1, require_logger.getEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS)));
3388
+ const retryDelayMs = Math.min(MAX_TRACE_FETCH_RETRY_DELAY_MS, Math.max(0, require_logger.getEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS)));
3389
+ const stablePolls = Math.min(MAX_TRACE_FETCH_STABLE_POLLS, Math.max(1, require_logger.getEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS)));
3390
+ let lastSpanCount = -1;
3391
+ let stableObservations = 0;
3392
+ let latestTrace = null;
3393
+ for (let attempt = 0; attempt < maxAttempts; attempt++) {
3394
+ latestTrace = await traceStore.getTrace(traceId);
3395
+ const spanCount = latestTrace?.spans?.length ?? 0;
3396
+ if (spanCount > 0) {
3397
+ stableObservations = spanCount === lastSpanCount ? stableObservations + 1 : 1;
3398
+ lastSpanCount = spanCount;
3399
+ if (stableObservations >= stablePolls || attempt === maxAttempts - 1) return latestTrace;
3400
+ } else {
3401
+ stableObservations = 0;
3402
+ lastSpanCount = spanCount;
3403
+ }
3404
+ if (attempt < maxAttempts - 1) await require_fetch.sleep(retryDelayMs);
3405
+ }
3406
+ return latestTrace;
3407
+ }
2728
3408
  const ASSERTION_HANDLERS = {
2729
3409
  "answer-relevance": handleAnswerRelevance,
2730
3410
  bleu: handleBleuScore,
@@ -2787,12 +3467,18 @@ const ASSERTION_HANDLERS = {
2787
3467
  ruby: handleRuby,
2788
3468
  "rouge-n": handleRougeScore,
2789
3469
  "search-rubric": handleSearchRubric,
3470
+ "skill-used": handleSkillUsed,
2790
3471
  similar: handleSimilar,
2791
3472
  "similar:cosine": handleSimilar,
2792
3473
  "similar:dot": handleSimilar,
2793
3474
  "similar:euclidean": handleSimilar,
2794
3475
  "starts-with": handleStartsWith,
2795
3476
  "tool-call-f1": handleToolCallF1,
3477
+ "trajectory:goal-success": handleTrajectoryGoalSuccess,
3478
+ "trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
3479
+ "trajectory:step-count": handleTrajectoryStepCount,
3480
+ "trajectory:tool-sequence": handleTrajectoryToolSequence,
3481
+ "trajectory:tool-used": handleTrajectoryToolUsed,
2796
3482
  "trace-error-spans": handleTraceErrorSpans,
2797
3483
  "trace-span-count": handleTraceSpanCount,
2798
3484
  "trace-span-duration": handleTraceSpanDuration,
@@ -2835,7 +3521,7 @@ function isAssertionInverse(assertion) {
2835
3521
  function getAssertionBaseType(assertion) {
2836
3522
  return isAssertionInverse(assertion) ? assertion.type.slice(4) : assertion.type;
2837
3523
  }
2838
- async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
3524
+ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
2839
3525
  const resolvedVars = vars || test.vars || {};
2840
3526
  const { cost, logProbs, output: originalOutput } = providerResponse;
2841
3527
  let output = originalOutput;
@@ -2854,14 +3540,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
2854
3540
  providerResponse,
2855
3541
  ...assertion.config ? { config: structuredClone(assertion.config) } : {}
2856
3542
  };
2857
- if (traceId) try {
2858
- const traceData = await require_store.getTraceStore().getTrace(traceId);
2859
- if (traceData) context.trace = {
2860
- traceId: traceData.traceId,
2861
- evaluationId: traceData.evaluationId,
2862
- testCaseId: traceData.testCaseId,
2863
- metadata: traceData.metadata,
2864
- spans: traceData.spans || []
3543
+ if (traceId && assertionMayNeedTraceContext(assertion)) try {
3544
+ const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
3545
+ if (resolvedTraceData) context.trace = {
3546
+ traceId: resolvedTraceData.traceId,
3547
+ evaluationId: resolvedTraceData.evaluationId,
3548
+ testCaseId: resolvedTraceData.testCaseId,
3549
+ metadata: resolvedTraceData.metadata,
3550
+ spans: resolvedTraceData.spans || []
2865
3551
  };
2866
3552
  } catch (error) {
2867
3553
  require_logger.logger.debug(`Failed to fetch trace data for assertion: ${error}`);
@@ -2894,7 +3580,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
2894
3580
  };
2895
3581
  }
2896
3582
  else if (filePath.endsWith(".rb")) try {
2897
- const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-CP42kMvq.cjs"));
3583
+ const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-B1HXG4ej.cjs"));
2898
3584
  valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
2899
3585
  require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
2900
3586
  } catch (error) {
@@ -3003,6 +3689,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
3003
3689
  index: i
3004
3690
  };
3005
3691
  }).flat();
3692
+ const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
3693
+ let preloadedTraceData;
3694
+ if (shouldPreloadTrace && traceId) try {
3695
+ preloadedTraceData = await loadTraceData(traceId);
3696
+ } catch (error) {
3697
+ require_logger.logger.debug(`Failed to preload trace data for assertions: ${error}`);
3698
+ preloadedTraceData = null;
3699
+ }
3006
3700
  await async.default.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
3007
3701
  if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
3008
3702
  const result = await runAssertion({
@@ -3014,7 +3708,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
3014
3708
  vars,
3015
3709
  latencyMs,
3016
3710
  assertIndex: index,
3017
- traceId
3711
+ traceId,
3712
+ traceData: preloadedTraceData
3018
3713
  });
3019
3714
  assertResult.addResult({
3020
3715
  index,
@@ -3160,7 +3855,7 @@ var CIProgressReporter = class {
3160
3855
  else {
3161
3856
  const eta = remaining / rate;
3162
3857
  if (eta > 1440) etaDisplay = ">24 hours";
3163
- else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) !== 1 ? "s" : ""}`;
3858
+ else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
3164
3859
  }
3165
3860
  const percentage = Math.floor(this.completedTests / this.totalTests * 100);
3166
3861
  require_logger.logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
@@ -3561,12 +4256,55 @@ function isPromptAllowed(prompt, allowedPrompts) {
3561
4256
  var ProgressBarManager = class {
3562
4257
  progressBar;
3563
4258
  isWebUI;
4259
+ originalLogCallback = null;
4260
+ installedLogCallback = null;
4261
+ pendingRender = null;
3564
4262
  totalCount = 0;
3565
4263
  completedCount = 0;
3566
4264
  concurrency = 1;
3567
4265
  constructor(isWebUI) {
3568
4266
  this.isWebUI = isWebUI;
3569
4267
  }
4268
+ clearProgressBarLine() {
4269
+ readline.default.cursorTo(process.stderr, 0);
4270
+ readline.default.clearLine(process.stderr, 0);
4271
+ }
4272
+ scheduleRender() {
4273
+ if (!this.progressBar || this.pendingRender) return;
4274
+ this.pendingRender = setImmediate(() => {
4275
+ this.pendingRender = null;
4276
+ this.progressBar?.render();
4277
+ });
4278
+ }
4279
+ handleLogMessage() {
4280
+ if (!this.progressBar) return;
4281
+ this.clearProgressBarLine();
4282
+ this.scheduleRender();
4283
+ }
4284
+ /**
4285
+ * Coordinate console logging with the progress bar to prevent visual corruption.
4286
+ */
4287
+ installLogInterceptor() {
4288
+ if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
4289
+ this.originalLogCallback = require_logger.globalLogCallback;
4290
+ this.installedLogCallback = (message) => {
4291
+ this.originalLogCallback?.(message);
4292
+ this.handleLogMessage();
4293
+ };
4294
+ require_logger.setLogCallback(this.installedLogCallback);
4295
+ }
4296
+ /**
4297
+ * Remove the log interceptor and restore original logger callback behavior.
4298
+ */
4299
+ removeLogInterceptor() {
4300
+ if (this.pendingRender) {
4301
+ clearImmediate(this.pendingRender);
4302
+ this.pendingRender = null;
4303
+ }
4304
+ if (this.installedLogCallback && require_logger.globalLogCallback === this.installedLogCallback) require_logger.setLogCallback(this.originalLogCallback);
4305
+ this.installedLogCallback = null;
4306
+ this.originalLogCallback = null;
4307
+ }
3570
4308
  /**
3571
4309
  * Initialize progress bar
3572
4310
  */
@@ -3586,7 +4324,8 @@ var ProgressBarManager = class {
3586
4324
  return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
3587
4325
  },
3588
4326
  hideCursor: true,
3589
- gracefulExit: true
4327
+ gracefulExit: true,
4328
+ stream: process.stderr
3590
4329
  }, cli_progress.default.Presets.shades_classic);
3591
4330
  this.progressBar.start(this.totalCount, 0, {
3592
4331
  provider: "",
@@ -3861,6 +4600,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
3861
4600
  const parts = traceContext.traceparent.split("-");
3862
4601
  if (parts.length >= 3) traceId = parts[1];
3863
4602
  }
4603
+ if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
3864
4604
  const checkResult = await runAssertions({
3865
4605
  prompt: renderedPrompt,
3866
4606
  provider,
@@ -4258,7 +4998,7 @@ var Evaluator = class {
4258
4998
  const defaultProvider = testSuite.defaultTest.provider;
4259
4999
  if (require_types.isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
4260
5000
  else if (typeof defaultProvider === "object" && defaultProvider.id) {
4261
- const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-zyB6k_38.cjs"));
5001
+ const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-D-FnDg8k.cjs"));
4262
5002
  testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
4263
5003
  } else testCase.provider = defaultProvider;
4264
5004
  }
@@ -4342,7 +5082,7 @@ var Evaluator = class {
4342
5082
  if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
4343
5083
  }
4344
5084
  if (require_logger.state.resume && this.evalRecord.persisted) try {
4345
- const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-71lY93Kj.cjs"));
5085
+ const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-tGdilrWt.cjs"));
4346
5086
  const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: require_logger.state.retryMode });
4347
5087
  const originalCount = runEvalOptions.length;
4348
5088
  for (let i = runEvalOptions.length - 1; i >= 0; i--) {
@@ -4542,7 +5282,7 @@ var Evaluator = class {
4542
5282
  if (require_logger.isCI() && !isWebUI) {
4543
5283
  ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
4544
5284
  ciProgressReporter.start();
4545
- } else if (this.options.showProgressBar && process.stdout.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
5285
+ } else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
4546
5286
  this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
4547
5287
  if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
4548
5288
  if (isWebUI) {
@@ -4563,7 +5303,10 @@ var Evaluator = class {
4563
5303
  if (serialRunEvalOptions.length > 0) require_logger.logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
4564
5304
  if (concurrentRunEvalOptions.length > 0) require_logger.logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
4565
5305
  }
4566
- if (this.options.showProgressBar && progressBarManager) await progressBarManager.initialize(runEvalOptions, concurrency, 0);
5306
+ if (this.options.showProgressBar && progressBarManager) {
5307
+ await progressBarManager.initialize(runEvalOptions, concurrency, 0);
5308
+ progressBarManager.installLogInterceptor();
5309
+ }
4567
5310
  try {
4568
5311
  if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
4569
5312
  checkAbort();
@@ -4589,7 +5332,10 @@ var Evaluator = class {
4589
5332
  else if (!targetUnavailable) {
4590
5333
  require_logger.logger.info("Evaluation interrupted, saving progress...");
4591
5334
  if (globalTimeout) clearTimeout(globalTimeout);
4592
- if (progressBarManager) progressBarManager.stop();
5335
+ if (progressBarManager) {
5336
+ progressBarManager.removeLogInterceptor();
5337
+ progressBarManager.stop();
5338
+ }
4593
5339
  if (ciProgressReporter) ciProgressReporter.finish();
4594
5340
  this.evalRecord.setVars(Array.from(vars));
4595
5341
  await this.evalRecord.addPrompts(prompts);
@@ -4597,6 +5343,10 @@ var Evaluator = class {
4597
5343
  return this.evalRecord;
4598
5344
  }
4599
5345
  } else {
5346
+ if (progressBarManager) {
5347
+ progressBarManager.removeLogInterceptor();
5348
+ progressBarManager.stop();
5349
+ }
4600
5350
  if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
4601
5351
  throw err;
4602
5352
  }
@@ -4739,6 +5489,7 @@ var Evaluator = class {
4739
5489
  await this.evalRecord.addPrompts(prompts);
4740
5490
  try {
4741
5491
  if (progressBarManager) {
5492
+ progressBarManager.removeLogInterceptor();
4742
5493
  progressBarManager.complete();
4743
5494
  progressBarManager.stop();
4744
5495
  } else if (ciProgressReporter) ciProgressReporter.finish();
@@ -7092,8 +7843,7 @@ function testCaseFromCsvRow(row) {
7092
7843
  require_logger.logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
7093
7844
  } else if (key.startsWith("__config:")) {
7094
7845
  const configParts = key.slice(9).split(":");
7095
- if (configParts.length !== 2) require_logger.logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
7096
- else {
7846
+ if (configParts.length === 2) {
7097
7847
  const [expectedKey, configKey] = configParts;
7098
7848
  let targetIndex;
7099
7849
  if (expectedKey === "__expected") targetIndex = 0;
@@ -7119,7 +7869,7 @@ function testCaseFromCsvRow(row) {
7119
7869
  }
7120
7870
  }
7121
7871
  assertionConfigs[targetIndex][configKey] = parsedValue;
7122
- }
7872
+ } else require_logger.logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
7123
7873
  } else vars[key] = value;
7124
7874
  }
7125
7875
  for (let i = 0; i < asserts.length; i++) {
@@ -7248,14 +7998,14 @@ async function parseXlsxFile(filePath) {
7248
7998
  const sheetName = typeof sheetOption === "number" ? sheetNames[sheetOption - 1] : sheetOption;
7249
7999
  const rows = await readXlsxFile(actualFilePath, { sheet: sheetOption });
7250
8000
  if (rows.length === 0) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
7251
- const headers = rows[0].map((cell) => cell != null ? String(cell) : "");
8001
+ const headers = rows[0].map((cell) => cell == null ? "" : String(cell));
7252
8002
  if (headers.length === 0 || headers.every((h) => h === "")) throw new Error(`Sheet "${sheetName}" has no valid column headers`);
7253
8003
  if (rows.length === 1) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
7254
8004
  const data = rows.slice(1).map((row) => {
7255
8005
  const obj = {};
7256
8006
  headers.forEach((header, index) => {
7257
8007
  const cellValue = row[index];
7258
- obj[header] = cellValue != null ? String(cellValue) : "";
8008
+ obj[header] = cellValue == null ? "" : String(cellValue);
7259
8009
  });
7260
8010
  return obj;
7261
8011
  });
@@ -11202,20 +11952,19 @@ function generateEvalSummary(params) {
11202
11952
  }
11203
11953
  }
11204
11954
  lines.push("");
11205
- const passRate = successes / (successes + failures + errors) * 100;
11206
- let passRateDisplay;
11207
- if (!Number.isNaN(passRate)) {
11208
- const passRateFormatted = passRate === 0 || passRate === 100 ? `${passRate.toFixed(0)}%` : `${passRate.toFixed(2)}%`;
11209
- if (passRate >= 100) passRateDisplay = chalk.default.green.bold(passRateFormatted);
11210
- else if (passRate >= 80) passRateDisplay = chalk.default.yellow.bold(passRateFormatted);
11211
- else passRateDisplay = chalk.default.red.bold(passRateFormatted);
11212
- }
11213
- const passedPart = successes > 0 ? `${chalk.default.green("✓")} ${chalk.default.green.bold(successes.toLocaleString())} passed` : `${chalk.default.gray.bold(successes.toLocaleString())} passed`;
11214
- const failedPart = failures > 0 ? `${chalk.default.red("✗")} ${chalk.default.red.bold(failures.toLocaleString())} failed` : `${chalk.default.gray.bold(failures.toLocaleString())} failed`;
11955
+ const totalTests = successes + failures + errors;
11956
+ const formatResultPercentage = (count) => {
11957
+ const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
11958
+ return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
11959
+ };
11960
+ const formatResultLine = (count, label, icon, iconColor) => {
11961
+ return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count)})`)}`;
11962
+ };
11215
11963
  const errorLabel = errors === 1 ? "error" : "errors";
11216
- const resultsLine = `${passedPart}, ${failedPart}, ${errors > 0 ? `${chalk.default.red("✗")} ${chalk.default.red.bold(errors.toLocaleString())} ${errorLabel}` : `${chalk.default.gray.bold(errors.toLocaleString())} ${errorLabel}`}`;
11217
- if (Number.isNaN(passRate)) lines.push(`${chalk.default.bold("Results:")} ${resultsLine}`);
11218
- else lines.push(`${chalk.default.bold("Results:")} ${resultsLine} (${passRateDisplay})`);
11964
+ lines.push(chalk.default.bold("Results:"));
11965
+ lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green));
11966
+ lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red));
11967
+ lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red));
11219
11968
  const durationDisplay = formatDuration(duration);
11220
11969
  lines.push(chalk.default.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
11221
11970
  lines.push("");
@@ -11549,7 +12298,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
11549
12298
  await require_providers.checkCloudPermissions(config);
11550
12299
  const options = {
11551
12300
  ...evaluateOptions,
11552
- showProgressBar: require_logger.getLogLevel() === "debug" ? false : cmdObj.progressBar !== void 0 ? cmdObj.progressBar !== false : evaluateOptions.showProgressBar !== void 0 ? evaluateOptions.showProgressBar : true,
12301
+ showProgressBar: require_logger.getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
11553
12302
  repeat,
11554
12303
  delay: !Number.isNaN(delay) && delay > 0 ? delay : void 0,
11555
12304
  maxConcurrency,
@@ -11933,7 +12682,7 @@ async function doRedteamRun(options) {
11933
12682
  redteamConfig = await doGenerateRedteam({
11934
12683
  ...passThroughOptions,
11935
12684
  ...options.liveRedteamConfig?.commandLineOptions || {},
11936
- ...maxConcurrency !== void 0 ? { maxConcurrency } : {},
12685
+ ...maxConcurrency === void 0 ? {} : { maxConcurrency },
11937
12686
  config: configPath,
11938
12687
  output: redteamPath,
11939
12688
  force: options.force,