promptfoo 0.121.1 → 0.121.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/src/{accounts-xrUGFA6n.js → accounts-B2XmGjty.js} +5 -5
- package/dist/src/{accounts-Bx-x3bmW.cjs → accounts-BPyfpSeU.cjs} +5 -5
- package/dist/src/{accounts-CMqkzrVf.js → accounts-CFLK3mnD.js} +6 -6
- package/dist/src/{accounts-BgNJDBE6.js → accounts-Xatc0RYb.js} +5 -5
- package/dist/src/{agentic-utils-BKIN5PKu.js → agentic-utils-36epdqwB.js} +3 -3
- package/dist/src/{cometapi-DkXrKi5z.js → agentic-utils-D8yXo5Lm.js} +4 -61
- package/dist/src/{cometapi-vY6aDZgo.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
- package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
- package/dist/src/{agents-C-dDThPK.js → agents-BBVJCIYr.js} +226 -13
- package/dist/src/{agents-CErsqg5U.cjs → agents-BBWxKSM0.cjs} +7 -7
- package/dist/src/{agents-Dy2YpZpa.js → agents-Bqgfdokm.js} +227 -14
- package/dist/src/{agents-B0f4HICh.cjs → agents-CAYbM7qD.cjs} +226 -13
- package/dist/src/{agents-CVIn-Utx.js → agents-CLQ-P15P.js} +7 -7
- package/dist/src/{agents-DeH4Gu94.js → agents-CgBniSlI.js} +8 -8
- package/dist/src/{agents-CXknwsFX.js → agents-DSSTV4bv.js} +226 -13
- package/dist/src/{agents-aF4-T121.js → agents-wg3ohknq.js} +7 -7
- package/dist/src/{aimlapi-tg0Gkcvr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
- package/dist/src/{aimlapi-BNfTBexL.js → aimlapi-BwGC1TtS.js} +13 -13
- package/dist/src/{aimlapi-BAGZDo5G.js → aimlapi-DaC3qZ-o.js} +14 -14
- package/dist/src/{aimlapi-DHRKlBEA.js → aimlapi-MgSLdvy7.js} +13 -13
- package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
- package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
- package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
- package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
- package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
- package/dist/src/app/index.html +4 -4
- package/dist/src/{audio-tf_NBjlC.js → audio-Bn44pQxv.js} +4 -4
- package/dist/src/{audio-CHQ4r-RV.js → audio-DDA5WHdx.js} +4 -4
- package/dist/src/{audio-BWeaWovU.cjs → audio-DVFjQ67_.cjs} +4 -4
- package/dist/src/{audio-BRODU0UK.js → audio-DjU9GswO.js} +5 -5
- package/dist/src/{base-DBtwl2FR.cjs → base-BboXIF_0.cjs} +3 -3
- package/dist/src/{base-B4QJRyFS.js → base-CKjwebIH.js} +3 -3
- package/dist/src/{base-B0tcrnq_.js → base-CqzQ4K8j.js} +3 -3
- package/dist/src/{base-fEDN28WM.js → base-Cz2ZC_iA.js} +3 -3
- package/dist/src/{blobs-BAU-dXan.js → blobs-B1JriOyi.js} +3 -3
- package/dist/src/{blobs-qTYm-1PY.js → blobs-BUWmKWzo.js} +3 -3
- package/dist/src/{blobs-DvS-O6be.cjs → blobs-C6j0bvFz.cjs} +3 -3
- package/dist/src/{blobs-Bpg5rH6i.js → blobs-DXTl6J3H.js} +3 -3
- package/dist/src/{cache-COish3-W.cjs → cache-C5yFZ4gC.cjs} +75 -58
- package/dist/src/{cache-8XhNqPKW.js → cache-CaT5tPgo.js} +75 -58
- package/dist/src/cache-CyCanoMu.js +6 -0
- package/dist/src/{cache-CG0SlR1d.js → cache-DSqR6ezl.js} +75 -58
- package/dist/src/cache-Df_QFDNu.cjs +5 -0
- package/dist/src/{cache-D3eqDYGU.js → cache-HP0NP4k3.js} +75 -58
- package/dist/src/{chat-DHMH-N64.js → chat-B-52XYI1.js} +12 -12
- package/dist/src/{chat-BKm79wib.js → chat-B0iaWhoh.js} +16 -14
- package/dist/src/{chat-DxysjBvt.js → chat-BE0qTA8e.js} +13 -13
- package/dist/src/{chat-CRWNNq73.js → chat-BEwdgGEg.js} +16 -14
- package/dist/src/{chat-2K608PeQ.cjs → chat-BtIKkLKx.cjs} +13 -13
- package/dist/src/{chat-DaqekjFr.cjs → chat-CM8qWR3_.cjs} +17 -15
- package/dist/src/{chat-CM_kyI8B.js → chat-DK1U-eZ-.js} +12 -12
- package/dist/src/{chat-CznLWr_D.js → chat-pxmiVpWe.js} +16 -14
- package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
- package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
- package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
- package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
- package/dist/src/{claude-agent-sdk-CJH22shf.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
- package/dist/src/{claude-agent-sdk-Dy5lT-Tx.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
- package/dist/src/{claude-agent-sdk-BLTu0WBO.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
- package/dist/src/{claude-agent-sdk-D6_k9FKA.js → claude-agent-sdk-HgbFioFw.js} +33 -15
- package/dist/src/cloud-DE3t1-ZI.js +4 -0
- package/dist/src/{cloud-Bc9526yV.js → cloud-z8KZpUoa.js} +3 -3
- package/dist/src/{cloudflare-ai-CWWJCRim.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
- package/dist/src/{cloudflare-ai-C9r2sRhw.js → cloudflare-ai-Bbp26N0L.js} +13 -13
- package/dist/src/{cloudflare-ai-ClWSdor4.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
- package/dist/src/{cloudflare-ai-ICsOuD-z.js → cloudflare-ai-DdKP9TKT.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6xFc5pa.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6O7AlYb.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
- package/dist/src/{cloudflare-gateway-pXGHxJ47.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
- package/dist/src/{cloudflare-gateway-C2_-KG5o.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
- package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
- package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
- package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
- package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
- package/dist/src/cometapi-BDyV-NNm.js +62 -0
- package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
- package/dist/src/{cometapi-Bbjp5V4x.js → cometapi-hhL4TAh3.js} +14 -14
- package/dist/src/{cometapi-BasUi7-_.js → cometapi-sp7sJpBD.js} +15 -15
- package/dist/src/{completion-C_P3ypkJ.js → completion-BCimtq-h.js} +6 -6
- package/dist/src/{completion-6Mx_iXxK.js → completion-DCjv7RZ3.js} +6 -6
- package/dist/src/{completion-CDOouNzq.cjs → completion-DlXUhj5c.cjs} +6 -6
- package/dist/src/{completion-C5rtR_9P.js → completion-DoYy49ti.js} +6 -6
- package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
- package/dist/src/{docker-BwsKwxFs.cjs → docker-Cqj2-QVi.cjs} +14 -14
- package/dist/src/{docker-CZnqU1XV.js → docker-CxCkwMzc.js} +13 -13
- package/dist/src/{docker-DzxyDPIj.js → docker-DpguQj-w.js} +14 -14
- package/dist/src/{docker-5KcG-_86.js → docker-FeBni2dw.js} +13 -13
- package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
- package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
- package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
- package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
- package/dist/src/eval-7aEqoMs3.js +15 -0
- package/dist/src/{eval-DmFyWU7i.js → eval-BTqTn7lb.js} +10 -10
- package/dist/src/{evalResult-CDQiuUuf.js → evalResult-BkIhRdTe.js} +7 -7
- package/dist/src/evalResult-CYNHkk5A.js +12 -0
- package/dist/src/evalResult-CuvJeNiM.js +10 -0
- package/dist/src/{evalResult-CTG2AHOS.js → evalResult-DUDShQrm.js} +7 -7
- package/dist/src/{evalResult-Dap2CekP.cjs → evalResult-DpARzUCb.cjs} +7 -7
- package/dist/src/evalResult-tGdilrWt.cjs +10 -0
- package/dist/src/evaluator-BBUqRhz1.js +36 -0
- package/dist/src/{evaluator-DPFRbFIL.js → evaluator-BcvOGaam.js} +833 -79
- package/dist/src/{extractor-YMU_Gvt8.js → extractor-C8XwivI9.js} +6 -6
- package/dist/src/{extractor-CFG6bcWJ.js → extractor-CAZ2G3Kh.js} +6 -6
- package/dist/src/{extractor-DX36oYEv.cjs → extractor-DG3sSfXE.cjs} +6 -6
- package/dist/src/{extractor-M67RUtg6.js → extractor-D_wd8jxt.js} +6 -6
- package/dist/src/{fetch-4M3YRaqL.js → fetch-BiYv2BZc.js} +3 -3
- package/dist/src/{fetch-BxUk8odA.cjs → fetch-BnR9wSnm.cjs} +3 -3
- package/dist/src/{fetch-60Gzydls.js → fetch-CVAtKnI3.js} +3 -3
- package/dist/src/{fetch-BMv0O527.js → fetch-DoVRJZhJ.js} +4 -4
- package/dist/src/fetch-UWU706qb.js +5 -0
- package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
- package/dist/src/{graders-DOXycdlG.cjs → graders-BElhu9ZY.cjs} +126 -55
- package/dist/src/{graders-R9rYUM0d.js → graders-BXAJ0sbS.js} +120 -55
- package/dist/src/graders-BxfEguVY.js +32 -0
- package/dist/src/graders-CzVMbEnv.js +34 -0
- package/dist/src/{graders-CpdqD9PI.js → graders-DG7mhg-b.js} +120 -55
- package/dist/src/graders-DjCXfj0l.cjs +32 -0
- package/dist/src/{graders-CHO8EPM4.js → graders-RjHF8VfG.js} +120 -55
- package/dist/src/graders-kHzIWOKu.js +32 -0
- package/dist/src/{image-DTedmQPg.cjs → image--F58eEIn.cjs} +6 -6
- package/dist/src/{image-DJEvKveK.js → image-6WQXK8m8.js} +4 -4
- package/dist/src/{image-pAX56tPG.js → image-B8b6f36E.js} +6 -6
- package/dist/src/{image-BmEZqVmk.js → image-CoxZp9PZ.js} +6 -6
- package/dist/src/{image-gvmivTEe.js → image-DO0RYnjH.js} +5 -5
- package/dist/src/{image-CBBVXWuT.js → image-PoF6DN3x.js} +6 -6
- package/dist/src/{image-CDLQOcqT.cjs → image-fza3zuKs.cjs} +4 -4
- package/dist/src/{image-tL5hIOFh.js → image-xNbw5ph2.js} +4 -4
- package/dist/src/index.cjs +863 -110
- package/dist/src/index.d.cts +833 -60
- package/dist/src/index.d.ts +833 -60
- package/dist/src/index.js +860 -108
- package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
- package/dist/src/{knowledgeBase-CoU-UQBg.js → knowledgeBase-Bi7CmDbx.js} +7 -7
- package/dist/src/{knowledgeBase-CLJybhnF.js → knowledgeBase-Ce3ofVan.js} +8 -8
- package/dist/src/{knowledgeBase-DjWPVqSb.js → knowledgeBase-DFRXPZl_.js} +7 -7
- package/dist/src/{knowledgeBase-wkxuRFhA.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
- package/dist/src/{litellm-B9Hysuri.js → litellm-Bo2gQXpo.js} +16 -15
- package/dist/src/{litellm-ePxtr9F1.js → litellm-CKiAxnoM.js} +15 -14
- package/dist/src/{litellm-NYpQ8RQu.cjs → litellm-CnHI69aj.cjs} +16 -15
- package/dist/src/{litellm-CTfa0hqi.js → litellm-Tc294Jhj.js} +15 -14
- package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
- package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
- package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
- package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
- package/dist/src/{luma-ray-BW9IRGIc.js → luma-ray-0ehMPt5N.js} +10 -10
- package/dist/src/{luma-ray-BE2mOt6N.js → luma-ray-C9q8rdQe.js} +9 -9
- package/dist/src/{luma-ray-Cm1KZBhs.js → luma-ray-DP0QA9qn.js} +9 -9
- package/dist/src/{luma-ray-B0GGNRc1.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
- package/dist/src/main.js +69 -71
- package/dist/src/{messages-1x9atZmP.js → messages-DJNo37Ko.js} +14 -9
- package/dist/src/{messages-BLbWdsyt.js → messages-Dy9QecMs.js} +14 -9
- package/dist/src/{messages-1JrJs91T.cjs → messages-HJsyEh4o.cjs} +15 -10
- package/dist/src/{messages-D8EA0oDc.js → messages-biC_ex-p.js} +14 -9
- package/dist/src/{modelslab-C1OLRmVX.js → modelslab-B5J-ZM5c.js} +9 -9
- package/dist/src/{modelslab-CqXBy3U8.js → modelslab-BI458moT.js} +10 -10
- package/dist/src/{modelslab-X5-4LroM.js → modelslab-BTOT8FUO.js} +9 -9
- package/dist/src/{modelslab-DcOSFwKh.cjs → modelslab-IQbNg-r7.cjs} +9 -9
- package/dist/src/{nova-reel-DihqLeol.js → nova-reel-BZ9y-Y5s.js} +9 -9
- package/dist/src/{nova-reel-D9xfaMBs.cjs → nova-reel-CE5etkv9.cjs} +9 -9
- package/dist/src/{nova-reel-D2ZkOSyr.js → nova-reel-DEeQlnOJ.js} +10 -10
- package/dist/src/{nova-reel-BgS1ZWuK.js → nova-reel-Xw1SXLpg.js} +9 -9
- package/dist/src/{nova-sonic-Q3BOJeig.js → nova-sonic-DWswpN1E.js} +7 -7
- package/dist/src/{nova-sonic-DezhVUYT.js → nova-sonic-DXTLpi-r.js} +6 -6
- package/dist/src/{nova-sonic-DVu3mMIy.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
- package/dist/src/{nova-sonic-P-CdUMlV.js → nova-sonic-Ogqf-csn.js} +6 -6
- package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
- package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
- package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
- package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
- package/dist/src/{openclaw-Bv1DINsX.js → openclaw-0Sv7AK3O.js} +172 -109
- package/dist/src/{openclaw-DAfWQn-o.cjs → openclaw-CXxbKgDH.cjs} +174 -110
- package/dist/src/{openclaw-BiSZPL7J.js → openclaw-D1FSCps-.js} +172 -109
- package/dist/src/{openclaw-D1D_ej1z.js → openclaw-D2ENvu7a.js} +173 -110
- package/dist/src/{opencode-sdk-D95s6SnR.js → opencode-sdk-C71Z0ehR.js} +13 -13
- package/dist/src/{opencode-sdk-DxUPkLT7.js → opencode-sdk-CHCs7dEb.js} +12 -12
- package/dist/src/{opencode-sdk-C7m-wRfI.js → opencode-sdk-DDxj4QqH.js} +12 -12
- package/dist/src/{opencode-sdk-CfaLN8PY.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
- package/dist/src/{otlpReceiver-g3ByGaXs.js → otlpReceiver-C9KlUtxh.js} +6 -6
- package/dist/src/{otlpReceiver--AIRW_S4.js → otlpReceiver-CZL48YfC.js} +6 -6
- package/dist/src/{otlpReceiver-Bn5wGB1v.js → otlpReceiver-CavGAA6k.js} +6 -6
- package/dist/src/{otlpReceiver-Diec4cln.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
- package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
- package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
- package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
- package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
- package/dist/src/providers-BSLEaIQG.js +32 -0
- package/dist/src/{providers-CFu-TZl-.cjs → providers-CScd1wN6.cjs} +733 -464
- package/dist/src/{providers-CFLy1_ji.js → providers-Ch6Mr0gn.js} +795 -526
- package/dist/src/{providers-BKRJTjBz.js → providers-Cn73d5sr.js} +795 -526
- package/dist/src/providers-D-FnDg8k.cjs +31 -0
- package/dist/src/providers-DEYiFVAo.js +30 -0
- package/dist/src/{providers-B3HvufyI.js → providers-DvddrgxL.js} +795 -526
- package/dist/src/providers-sS2WI8YD.js +30 -0
- package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
- package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
- package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
- package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
- package/dist/src/{quiverai-CI6gYJVI.js → quiverai-BeofbLVc.js} +4 -4
- package/dist/src/{quiverai-MHSxbmmZ.js → quiverai-CCQn73lq.js} +5 -5
- package/dist/src/{quiverai-CLkWkyZc.cjs → quiverai-CcUhPIBg.cjs} +4 -4
- package/dist/src/{quiverai-C2jVwbH1.js → quiverai-DVSEqJiq.js} +4 -4
- package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
- package/dist/src/{responses-CGw0DCzh.js → responses-BKP_WYis.js} +16 -12
- package/dist/src/{responses-BKqJmhhc.js → responses-CQb1Tj69.js} +16 -12
- package/dist/src/{responses-jxdehPkC.js → responses-CgNyTPsY.js} +16 -12
- package/dist/src/{responses-tD4Bd4dc.cjs → responses-mo0KQDbu.cjs} +16 -12
- package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
- package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
- package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
- package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
- package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
- package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
- package/dist/src/{sagemaker-BK4Zb993.js → sagemaker-CVv8W7so.js} +17 -17
- package/dist/src/{sagemaker-D2Q1c-sD.js → sagemaker-CqeASYE5.js} +17 -17
- package/dist/src/{sagemaker-BfiWTmvn.js → sagemaker-MUbD5V3v.js} +18 -18
- package/dist/src/{sagemaker-CcQHM1jV.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
- package/dist/src/{scanner-J8CA3LsV.js → scanner-DVDeUz1r.js} +10 -10
- package/dist/src/server/index.js +864 -112
- package/dist/src/server-B0Xh1Gx-.js +7 -0
- package/dist/src/{server-B0PPuDw-.cjs → server-BtoCXeXI.cjs} +4 -4
- package/dist/src/{server-BC7XJFgr.js → server-CP9qKM40.js} +4 -4
- package/dist/src/{server-OAs3nBRT.js → server-Cns05F1j.js} +5 -5
- package/dist/src/server-DJTKu9IR.cjs +5 -0
- package/dist/src/{server-DbFphssR.js → server-DZ9MtCn0.js} +6 -6
- package/dist/src/{signal-BOTbd53Z.js → signal-C3ZTsUgi.js} +3 -3
- package/dist/src/{slack-DXMKtA-f.js → slack-2sdpGzbt.js} +2 -2
- package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
- package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
- package/dist/src/{slack-DOdy_kyv.js → slack-DCEV-vWP.js} +2 -2
- package/dist/src/store-C5u6MgC8.js +6 -0
- package/dist/src/{store-BSc-TF2w.cjs → store-CLyU7AtI.cjs} +17 -5
- package/dist/src/store-CNHk-De4.cjs +5 -0
- package/dist/src/{store-DQLEjuEO.js → store-Cj258DgL.js} +17 -5
- package/dist/src/{store-D1tv90v3.js → store-P8OKm19S.js} +17 -5
- package/dist/src/{store-Ub2vaGJ1.js → store-VB0GP46K.js} +17 -5
- package/dist/src/{tables-xKANLRBD.js → tables-BEIFz2tM.js} +3 -3
- package/dist/src/{tables-C7K-XKWp.cjs → tables-BdZQEpRz.cjs} +3 -3
- package/dist/src/{tables-D36WTqKX.js → tables-DmzvLbeZ.js} +3 -3
- package/dist/src/{tables-5EvT_Bwn.js → tables-kC7R5kiK.js} +3 -3
- package/dist/src/{telemetry-C2YDkUQH.js → telemetry-BnH5VJAU.js} +4 -4
- package/dist/src/{telemetry-C15ziL8u.js → telemetry-BugWqKiu.js} +4 -4
- package/dist/src/{telemetry-DMb2Mpfm.js → telemetry-DPXLd7UE.js} +4 -4
- package/dist/src/telemetry-Yig0Tino.js +7 -0
- package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
- package/dist/src/{telemetry-CbrnxHp_.cjs → telemetry-re627Lre.cjs} +4 -4
- package/dist/src/{transcription-CL78qbOU.cjs → transcription-BvtsrzRG.cjs} +13 -13
- package/dist/src/{transcription-DAtxHhAM.js → transcription-CaMivnjG.js} +13 -13
- package/dist/src/{transcription-QHh3AH6Z.js → transcription-DOMMTu01.js} +14 -14
- package/dist/src/{transcription-LNZTNUUL.js → transcription-Hb3VnC4M.js} +13 -13
- package/dist/src/{transform-DOcQeLld.cjs → transform-0BwoBsvO.cjs} +19 -5
- package/dist/src/{transform-DGxXocjk.js → transform-B2-jIv68.js} +8 -6
- package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
- package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
- package/dist/src/transform-ChNIpHz7.js +6 -0
- package/dist/src/{transform-Cgi24fJ7.js → transform-DrleutM3.js} +8 -6
- package/dist/src/{transform-DGLazrMm.js → transform-DyDAwEpE.js} +8 -6
- package/dist/src/transform-PtQ6rAE3.cjs +5 -0
- package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
- package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
- package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
- package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
- package/dist/src/{types-CH3Ge2sE.js → types-CIhFeUC4.js} +45 -11
- package/dist/src/{types-CN_TZ2GJ.js → types-Cd3ygw8W.js} +45 -11
- package/dist/src/{types-LJ0r3wbR.cjs → types-D8cGDZbL.cjs} +46 -12
- package/dist/src/{types-CLKiCBW3.js → types-q8GXGF65.js} +45 -11
- package/dist/src/{util-CchiqXh_.cjs → util--9u9UVCt.cjs} +3 -3
- package/dist/src/{util-5cB-L7U3.js → util-BLvy9qfE.js} +7 -11
- package/dist/src/{util-YT5HPZaS.js → util-Bm3E9jpK.js} +7 -11
- package/dist/src/{util-6-GqIvzS.js → util-BtoGs5Cb.js} +18 -4
- package/dist/src/{util-Db0a0AFH.cjs → util-CFj4YKIn.cjs} +18 -4
- package/dist/src/{util-Dlz_Wvgm.js → util-CMMkIxfU.js} +7 -11
- package/dist/src/{util-Betm42rL.js → util-CgDCK4KI.js} +18 -4
- package/dist/src/{util-Yz-1aEhW.cjs → util-CuLo2pMR.cjs} +7 -11
- package/dist/src/{util-C-PPYSMq.js → util-DM2rTn_6.js} +18 -4
- package/dist/src/{util-B7T3SiBS.js → util-DMFeUvLz.js} +3 -3
- package/dist/src/{util-ZZH-3QZz.js → util-DbVG-yZU.js} +3 -3
- package/dist/src/{util-DaWTWKBK.js → util-vNmDL5DT.js} +3 -3
- package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
- package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
- package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
- package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -38
- package/dist/src/app/assets/index-BFCZg7hQ.js +0 -439
- package/dist/src/app/assets/index-NCn4eVBv.css +0 -1
- package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
- package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
- package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
- package/dist/src/cache-Bbn1Nyrd.cjs +0 -5
- package/dist/src/cache-BwsMSda7.js +0 -6
- package/dist/src/cloud-DmE0EwsY.js +0 -4
- package/dist/src/eval-17JizQIv.js +0 -15
- package/dist/src/evalResult-Cqj8pldJ.js +0 -12
- package/dist/src/evalResult-DvcJAWJU.cjs +0 -10
- package/dist/src/evalResult-Hftn-S_i.js +0 -10
- package/dist/src/evaluator-B2CFNt-P.js +0 -36
- package/dist/src/fetch-KV5kNASw.js +0 -5
- package/dist/src/graders-Bu0H9nXi.js +0 -32
- package/dist/src/graders-Cfhkvx-e.js +0 -34
- package/dist/src/graders-DClJVpGP.cjs +0 -32
- package/dist/src/graders-DcnJsrMO.js +0 -32
- package/dist/src/providers-C1rOSHiR.js +0 -32
- package/dist/src/providers-CxmDwEFf.cjs +0 -31
- package/dist/src/providers-Dodakqr0.js +0 -30
- package/dist/src/providers-GIQ2TcsA.js +0 -30
- package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
- package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
- package/dist/src/server-B1vi21hA.js +0 -7
- package/dist/src/server-Cm9Kai_h.cjs +0 -5
- package/dist/src/store-BNmZ1KAz.cjs +0 -5
- package/dist/src/store-BltJg2cd.js +0 -6
- package/dist/src/telemetry-5BCRNBbe.cjs +0 -5
- package/dist/src/telemetry-D4W5hboe.js +0 -7
- package/dist/src/transform-DTGDnAzW.js +0 -6
- package/dist/src/transform-m3qNw4KP.cjs +0 -5
package/dist/src/server/index.js
CHANGED
|
@@ -1,39 +1,39 @@
|
|
|
1
|
-
import { C as
|
|
2
|
-
import { A as getDefaultShareViewBaseUrl, F as FILE_METADATA_KEY, I as HUMAN_ASSERTION_TYPE, M as getShareViewBaseUrl, O as TERMINAL_MAX_WIDTH, P as VERSION, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareApiBaseUrl, k as getDefaultPort, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "../fetch-
|
|
1
|
+
import { C as getEnvString, D as state, E as isCI, S as getEnvInt, T as getMaxEvalTimeMs, _ as safeJsonStringify, a as logger, b as getEnvBool, f as extractFirstJsonObject, g as orderKeys, m as getAjv, n as globalLogCallback, o as setLogCallback, p as extractJsonObjects, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, u as sanitizeObject, v as summarizeEvaluateResultForLogging, w as getEvalTimeoutMs, x as getEnvFloat } from "../logger-BnkjG2jt.js";
|
|
2
|
+
import { A as getDefaultShareViewBaseUrl, F as FILE_METADATA_KEY, I as HUMAN_ASSERTION_TYPE, M as getShareViewBaseUrl, O as TERMINAL_MAX_WIDTH, P as VERSION, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareApiBaseUrl, k as getDefaultPort, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "../fetch-BiYv2BZc.js";
|
|
3
3
|
import { t as invariant } from "../invariant-vgHWClmd.js";
|
|
4
|
-
import { a as getAuthor, c as isLoggedIntoCloud, l as promptForEmailUnverified, n as checkEmailStatusAndMaybeExit, o as getUserEmail, r as clearUserEmail, s as getUserId, t as checkEmailStatus, u as setUserEmail } from "../accounts-
|
|
5
|
-
import { a as openBrowser, c as getRemoteGenerationUrl, d as neverGenerateRemote, i as checkServerRunning, n as BrowserBehaviorNames, p as shouldGenerateRemote, s as promptYesNo, t as BrowserBehavior, u as getRemoteHealthUrl } from "../server-
|
|
6
|
-
import { a as evalResultsTable, c as evalsToPromptsTable, d as promptsTable, g as getDbSignalPath, h as getDb, i as datasetsTable, l as evalsToTagsTable, n as blobReferencesTable, o as evalsTable, p as tagsTable, r as configsTable, s as evalsToDatasetsTable, t as blobAssetsTable, u as modelAuditsTable } from "../tables-
|
|
7
|
-
import { r as importModule, t as getDirectory } from "../esm-
|
|
8
|
-
import { $ as MULTI_INPUT_VAR, A as STRATEGY_COLLECTIONS, B as ALIASED_PLUGIN_MAPPINGS, E as ALL_STRATEGIES, F as isMultiTurnStrategy, G as FINANCIAL_PLUGINS, H as BIAS_PLUGINS, I as Severity, J as INSURANCE_PLUGINS, K as FOUNDATION_PLUGINS, L as categoryAliases, M as getDefaultNFanout, O as DEFAULT_STRATEGIES, P as isFanoutStrategy, Q as MULTI_INPUT_EXCLUDED_PLUGINS, S as StrategyConfigSchema, U as DATASET_EXEMPT_PLUGINS, V as ALL_PLUGINS, W as DEFAULT_PLUGINS, X as LLAMA_GUARD_REPLICATE_PROVIDER, Y as LLAMA_GUARD_ENABLED_CATEGORIES, Z as MEDICAL_PLUGINS, _ as ProvidersSchema, a as EvaluateOptionsSchema, at as REMOTE_ONLY_PLUGIN_IDS, b as PluginConfigSchema, c as TestSuiteConfigSchema, ct as UNALIGNED_PROVIDER_HARM_PLUGINS, d as isGradingResult, et as PHARMACY_PLUGINS, f as isResultFailureReason, g as ProviderOptionsSchema, h as RedteamConfigSchema, i as EvalResultsFilterMode, it as REDTEAM_PROVIDER_HARM_PLUGINS, j as STRATEGY_COLLECTION_MAPPINGS, l as TestSuiteSchema, lt as PromptSchema, m as isProviderOptions, n as BaseAssertionTypesSchema, nt as PLUGIN_CATEGORIES, p as isApiProvider, q as HARM_PLUGINS, r as CommandLineOptionsSchema, rt as REDTEAM_MODEL, s as ResultFailureReason, st as TELECOM_PLUGINS, t as AssertionOrSetSchema, tt as PII_PLUGINS, u as UnifiedConfigSchema, v as ConversationMessageSchema, w as isUuid, y as PartialGenerationError, z as riskCategorySeverityMap } from "../types-
|
|
4
|
+
import { a as getAuthor, c as isLoggedIntoCloud, l as promptForEmailUnverified, n as checkEmailStatusAndMaybeExit, o as getUserEmail, r as clearUserEmail, s as getUserId, t as checkEmailStatus, u as setUserEmail } from "../accounts-Xatc0RYb.js";
|
|
5
|
+
import { a as openBrowser, c as getRemoteGenerationUrl, d as neverGenerateRemote, i as checkServerRunning, n as BrowserBehaviorNames, p as shouldGenerateRemote, s as promptYesNo, t as BrowserBehavior, u as getRemoteHealthUrl } from "../server-Cns05F1j.js";
|
|
6
|
+
import { a as evalResultsTable, c as evalsToPromptsTable, d as promptsTable, g as getDbSignalPath, h as getDb, i as datasetsTable, l as evalsToTagsTable, n as blobReferencesTable, o as evalsTable, p as tagsTable, r as configsTable, s as evalsToDatasetsTable, t as blobAssetsTable, u as modelAuditsTable } from "../tables-BEIFz2tM.js";
|
|
7
|
+
import { r as importModule, t as getDirectory } from "../esm-CKWP3u_P.js";
|
|
8
|
+
import { $ as MULTI_INPUT_VAR, A as STRATEGY_COLLECTIONS, B as ALIASED_PLUGIN_MAPPINGS, E as ALL_STRATEGIES, F as isMultiTurnStrategy, G as FINANCIAL_PLUGINS, H as BIAS_PLUGINS, I as Severity, J as INSURANCE_PLUGINS, K as FOUNDATION_PLUGINS, L as categoryAliases, M as getDefaultNFanout, O as DEFAULT_STRATEGIES, P as isFanoutStrategy, Q as MULTI_INPUT_EXCLUDED_PLUGINS, S as StrategyConfigSchema, U as DATASET_EXEMPT_PLUGINS, V as ALL_PLUGINS, W as DEFAULT_PLUGINS, X as LLAMA_GUARD_REPLICATE_PROVIDER, Y as LLAMA_GUARD_ENABLED_CATEGORIES, Z as MEDICAL_PLUGINS, _ as ProvidersSchema, a as EvaluateOptionsSchema, at as REMOTE_ONLY_PLUGIN_IDS, b as PluginConfigSchema, c as TestSuiteConfigSchema, ct as UNALIGNED_PROVIDER_HARM_PLUGINS, d as isGradingResult, et as PHARMACY_PLUGINS, f as isResultFailureReason, g as ProviderOptionsSchema, h as RedteamConfigSchema, i as EvalResultsFilterMode, it as REDTEAM_PROVIDER_HARM_PLUGINS, j as STRATEGY_COLLECTION_MAPPINGS, l as TestSuiteSchema, lt as PromptSchema, m as isProviderOptions, n as BaseAssertionTypesSchema, nt as PLUGIN_CATEGORIES, p as isApiProvider, q as HARM_PLUGINS, r as CommandLineOptionsSchema, rt as REDTEAM_MODEL, s as ResultFailureReason, st as TELECOM_PLUGINS, t as AssertionOrSetSchema, tt as PII_PLUGINS, u as UnifiedConfigSchema, v as ConversationMessageSchema, w as isUuid, y as PartialGenerationError, z as riskCategorySeverityMap } from "../types-Cd3ygw8W.js";
|
|
9
9
|
import { i as isJavascriptFile } from "../fileExtensions-LcDYkU4v.js";
|
|
10
10
|
import { n as sha256, t as randomSequence } from "../createHash-CTQmL3G2.js";
|
|
11
|
-
import { a as generateIdFromPrompt, t as hashPrompt } from "../utils-
|
|
12
|
-
import { t as getTraceStore } from "../store-
|
|
13
|
-
import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "../cache-
|
|
11
|
+
import { a as generateIdFromPrompt, t as hashPrompt } from "../utils-DOjD4dTC.js";
|
|
12
|
+
import { t as getTraceStore } from "../store-VB0GP46K.js";
|
|
13
|
+
import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "../cache-HP0NP4k3.js";
|
|
14
14
|
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "../tokenUsageUtils-BDGe-iyI.js";
|
|
15
|
-
import { n as getBlobUrl, t as getBlobByHash } from "../blobs-
|
|
16
|
-
import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "../extractor-
|
|
17
|
-
import { B as PromptfooHarmfulCompletionProvider, D as getShortPluginId, E as getSessionId, F as loadFromPackage, I as redteamProviderManager, J as AIStudioChatProvider, L as TokenUsageTracker, M as renderPrompt, N as runExtensionHook, O as isBasicRefusal, P as isPackagePath, R as createRateLimitRegistry, S as extractGoalFromPrompt, T as extractVariablesFromJson, _ as mediaExists, a as resolveProviderConfigs, at as getEvalConfigFromCloud, b as checkExfilTracking, c as MCPProvider, d as createTransformResponse, f as GoogleLiveProvider, g as getMediaStorage, h as validateStrategies, i as resolveProvider, it as getCloudDatabaseId, j as collectFileMetadata, l as HttpProvider, lt as isCloudProvider, m as loadStrategy, n as loadApiProvider, ot as getOrgContext, p as Strategies, q as VertexChatProvider, r as loadApiProviders, rt as checkCloudPermissions, st as getPluginSeverityOverridesFromCloud, t as getProviderIds, u as createTransformRequest, ut as resolveTeamId, v as retrieveMedia, w as extractPromptFromTags, y as pluginMatchesStrategyTargets, z as createProviderRateLimitOptions } from "../providers-
|
|
18
|
-
import { n as telemetry, t as TelemetryEventSchema } from "../telemetry-
|
|
15
|
+
import { n as getBlobUrl, t as getBlobByHash } from "../blobs-BUWmKWzo.js";
|
|
16
|
+
import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "../extractor-C8XwivI9.js";
|
|
17
|
+
import { B as PromptfooHarmfulCompletionProvider, D as getShortPluginId, E as getSessionId, F as loadFromPackage, I as redteamProviderManager, J as AIStudioChatProvider, L as TokenUsageTracker, M as renderPrompt, N as runExtensionHook, O as isBasicRefusal, P as isPackagePath, R as createRateLimitRegistry, S as extractGoalFromPrompt, T as extractVariablesFromJson, _ as mediaExists, a as resolveProviderConfigs, at as getEvalConfigFromCloud, b as checkExfilTracking, c as MCPProvider, d as createTransformResponse, f as GoogleLiveProvider, g as getMediaStorage, h as validateStrategies, i as resolveProvider, it as getCloudDatabaseId, j as collectFileMetadata, l as HttpProvider, lt as isCloudProvider, m as loadStrategy, n as loadApiProvider, ot as getOrgContext, p as Strategies, q as VertexChatProvider, r as loadApiProviders, rt as checkCloudPermissions, st as getPluginSeverityOverridesFromCloud, t as getProviderIds, u as createTransformRequest, ut as resolveTeamId, v as retrieveMedia, w as extractPromptFromTags, y as pluginMatchesStrategyTargets, z as createProviderRateLimitOptions } from "../providers-DvddrgxL.js";
|
|
18
|
+
import { n as telemetry, t as TelemetryEventSchema } from "../telemetry-DPXLd7UE.js";
|
|
19
19
|
import "../genaiTracer-70Z8BIuV.js";
|
|
20
|
-
import { r as runPython } from "../pythonUtils-
|
|
21
|
-
import { A as readFilters, F as extractVariablesFromTemplates, I as getNunjucksEngine, L as loadFunction, M as renderEnvOnlyInObject, O as maybeLoadToolsFromExternalFile, R as parseFileUrl, T as maybeLoadFromExternalFile, _ as getProviderDescription, a as evalTableToJson, b as isOpenAiProvider, c as fetchCsvFromGoogleSheet, d as extractRuntimeVars, f as filterRuntimeVars, g as doesProviderRefMatch, h as checkProviderApiKeys, i as ComparisonEvalNotFoundError, j as readOutput, l as setupEnv, m as resultIsForTestCase, n as writeMultipleOutputs, o as generateEvalCsv, p as getTestCaseDeduplicationKey, r as writeOutput, s as mergeComparisonTables, t as printBorder, u as deduplicateTestCases, v as isAnthropicProvider, w as maybeLoadConfigFromExternalFile, x as isProviderAllowed, y as isGoogleProvider } from "../util-
|
|
22
|
-
import { t as OpenAiChatCompletionProvider } from "../chat-
|
|
23
|
-
import { m as validateFunctionCall } from "../transform-
|
|
24
|
-
import "../messages-
|
|
25
|
-
import "../util-
|
|
26
|
-
import { $ as
|
|
27
|
-
import "../responses-
|
|
28
|
-
import "../openai-
|
|
29
|
-
import { l as validateFunctionCall$1 } from "../util-
|
|
30
|
-
import "../completion-
|
|
31
|
-
import { i as getProcessShim, n as transform, t as TransformInputType } from "../transform-
|
|
20
|
+
import { r as runPython } from "../pythonUtils-Bzwbgpbg.js";
|
|
21
|
+
import { A as readFilters, F as extractVariablesFromTemplates, I as getNunjucksEngine, L as loadFunction, M as renderEnvOnlyInObject, O as maybeLoadToolsFromExternalFile, R as parseFileUrl, T as maybeLoadFromExternalFile, _ as getProviderDescription, a as evalTableToJson, b as isOpenAiProvider, c as fetchCsvFromGoogleSheet, d as extractRuntimeVars, f as filterRuntimeVars, g as doesProviderRefMatch, h as checkProviderApiKeys, i as ComparisonEvalNotFoundError, j as readOutput, l as setupEnv, m as resultIsForTestCase, n as writeMultipleOutputs, o as generateEvalCsv, p as getTestCaseDeduplicationKey, r as writeOutput, s as mergeComparisonTables, t as printBorder, u as deduplicateTestCases, v as isAnthropicProvider, w as maybeLoadConfigFromExternalFile, x as isProviderAllowed, y as isGoogleProvider } from "../util-BLvy9qfE.js";
|
|
22
|
+
import { t as OpenAiChatCompletionProvider } from "../chat-B0iaWhoh.js";
|
|
23
|
+
import { m as validateFunctionCall } from "../transform-B2-jIv68.js";
|
|
24
|
+
import "../messages-biC_ex-p.js";
|
|
25
|
+
import "../util-DbVG-yZU.js";
|
|
26
|
+
import { $ as matchesTrajectoryGoalSuccess, A as BeavertailsPlugin, B as matchesClassification, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as callProviderWithContext, G as matchesFactuality, H as matchesContextFaithfulness, I as fail, J as matchesModeration, K as matchesGEval, L as getAndCheckProvider, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as fetchHuggingFaceDataset, Q as matchesSimilarity, R as loadRubricPrompt, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesContextRecall, V as matchesClosedQa, W as matchesContextRelevance, X as matchesSearchRubric, Y as matchesPiScore, Z as matchesSelectBest, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as SUGGEST_PROMPTS_SYSTEM_MESSAGE, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as loadFromJavaScriptFile, d as RbacPlugin, dt as getCustomPolicies, et as selectMaxScore, f as PromptExtractionPlugin, ft as retryWithDeduplication, g as isValidPolicyObject, h as determinePolicyTypeFromId, ht as DefaultSuggestionsProvider, i as VLGuardPlugin, it as readProviderPromptMap, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as processFileReference, m as PolicyPlugin, mt as getDefaultProviders, n as getGraderById, nt as processPrompts, o as UnsafeBenchPlugin, ot as coerceString, p as PoliticsPlugin, pt as sampleArray, q as matchesLlmRubric, r as VLSUPlugin, rt as readPrompts, s as ToxicChatPlugin, st as getFinalTest, t as GRADERS, tt as doRemoteGrading, u as ShellInjectionPlugin, ut as resolveContext, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as matchesAnswerRelevance } from "../graders-BXAJ0sbS.js";
|
|
27
|
+
import "../responses-CgNyTPsY.js";
|
|
28
|
+
import "../openai-D6wITiVn.js";
|
|
29
|
+
import { l as validateFunctionCall$1 } from "../util-BtoGs5Cb.js";
|
|
30
|
+
import "../completion-BCimtq-h.js";
|
|
31
|
+
import { i as getProcessShim, n as transform, t as TransformInputType } from "../transform-BzK09Q_9.js";
|
|
32
32
|
import { t as ellipsize } from "../text-TIv0QYnd.js";
|
|
33
|
-
import "../base-
|
|
34
|
-
import "../image-
|
|
35
|
-
import { t as providerRegistry } from "../providerRegistry-
|
|
36
|
-
import { n as runRuby } from "../rubyUtils-
|
|
33
|
+
import "../base-Cz2ZC_iA.js";
|
|
34
|
+
import "../image-B8b6f36E.js";
|
|
35
|
+
import { t as providerRegistry } from "../providerRegistry-BkzVH5Ba.js";
|
|
36
|
+
import { n as runRuby } from "../rubyUtils-DECSbsfY.js";
|
|
37
37
|
import dotenv from "dotenv";
|
|
38
38
|
import * as fs$2 from "fs";
|
|
39
39
|
import fs, { createWriteStream, existsSync, readFileSync } from "fs";
|
|
@@ -47,6 +47,7 @@ import input from "@inquirer/input";
|
|
|
47
47
|
import { z } from "zod";
|
|
48
48
|
import * as fsPromises from "fs/promises";
|
|
49
49
|
import util, { promisify } from "util";
|
|
50
|
+
import readline from "readline";
|
|
50
51
|
import compression from "compression";
|
|
51
52
|
import cors from "cors";
|
|
52
53
|
import fs$1 from "node:fs";
|
|
@@ -72,6 +73,7 @@ import { LRUCache } from "lru-cache";
|
|
|
72
73
|
import { JSDOM } from "jsdom";
|
|
73
74
|
import { distance } from "fastest-levenshtein";
|
|
74
75
|
import * as rouge from "js-rouge";
|
|
76
|
+
import { isDeepStrictEqual } from "node:util";
|
|
75
77
|
import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
|
|
76
78
|
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
77
79
|
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
@@ -3857,7 +3859,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
3857
3859
|
telemetry.record("feature_used", { feature: "tracing" });
|
|
3858
3860
|
try {
|
|
3859
3861
|
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
3860
|
-
const { startOTLPReceiver } = await import("../otlpReceiver-
|
|
3862
|
+
const { startOTLPReceiver } = await import("../otlpReceiver-C9KlUtxh.js");
|
|
3861
3863
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
3862
3864
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
3863
3865
|
logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
@@ -3880,7 +3882,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
3880
3882
|
async function stopOtlpReceiverIfNeeded() {
|
|
3881
3883
|
if (otlpReceiverStarted) try {
|
|
3882
3884
|
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
3883
|
-
const { stopOTLPReceiver } = await import("../otlpReceiver-
|
|
3885
|
+
const { stopOTLPReceiver } = await import("../otlpReceiver-C9KlUtxh.js");
|
|
3884
3886
|
await stopOTLPReceiver();
|
|
3885
3887
|
otlpReceiverStarted = false;
|
|
3886
3888
|
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -3915,7 +3917,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
3915
3917
|
}
|
|
3916
3918
|
if (!tracingEnabled) return null;
|
|
3917
3919
|
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
3918
|
-
const { getTraceStore } = await import("../store-
|
|
3920
|
+
const { getTraceStore } = await import("../store-VB0GP46K.js").then((n) => n.n);
|
|
3919
3921
|
const traceStore = getTraceStore();
|
|
3920
3922
|
const traceId = generateTraceId();
|
|
3921
3923
|
const spanId = generateSpanId();
|
|
@@ -4948,7 +4950,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
|
|
|
4948
4950
|
pass = result !== inverse;
|
|
4949
4951
|
score = pass ? 1 : 0;
|
|
4950
4952
|
} else if (typeof result === "number") {
|
|
4951
|
-
pass = assertion.threshold
|
|
4953
|
+
pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
|
|
4952
4954
|
score = result;
|
|
4953
4955
|
} else if (typeof result === "object") return result;
|
|
4954
4956
|
else throw new Error("Custom function must return a boolean or number");
|
|
@@ -4981,7 +4983,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
4981
4983
|
} catch {
|
|
4982
4984
|
pass = inverse;
|
|
4983
4985
|
}
|
|
4984
|
-
if (
|
|
4986
|
+
if (parsedJson !== void 0 && renderedValue) {
|
|
4985
4987
|
let validate;
|
|
4986
4988
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
4987
4989
|
const schema = valueFromScript;
|
|
@@ -4993,11 +4995,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
4993
4995
|
}
|
|
4994
4996
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
4995
4997
|
else throw new Error("is-json assertion must have a string or object value");
|
|
4996
|
-
|
|
4998
|
+
const valid = validate(parsedJson);
|
|
4999
|
+
pass = inverse ? !valid : valid;
|
|
4997
5000
|
if (!pass) return {
|
|
4998
5001
|
pass,
|
|
4999
5002
|
score: 0,
|
|
5000
|
-
reason: `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
5003
|
+
reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
5001
5004
|
assertion
|
|
5002
5005
|
};
|
|
5003
5006
|
}
|
|
@@ -5024,9 +5027,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
5024
5027
|
}
|
|
5025
5028
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
5026
5029
|
else throw new Error("contains-json assertion must have a string or object value");
|
|
5027
|
-
|
|
5028
|
-
|
|
5029
|
-
|
|
5030
|
+
const valid = validate(jsonObject);
|
|
5031
|
+
pass = inverse ? !valid : valid;
|
|
5032
|
+
if (valid) {
|
|
5033
|
+
if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
|
|
5034
|
+
break;
|
|
5035
|
+
} else errorMessage = `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`;
|
|
5030
5036
|
}
|
|
5031
5037
|
return {
|
|
5032
5038
|
pass,
|
|
@@ -5168,7 +5174,7 @@ function handlePerplexity({ logProbs, assertion }) {
|
|
|
5168
5174
|
if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
|
|
5169
5175
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
5170
5176
|
const perplexity = Math.exp(-avgLogProb);
|
|
5171
|
-
const pass = assertion.threshold
|
|
5177
|
+
const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
|
|
5172
5178
|
return {
|
|
5173
5179
|
pass,
|
|
5174
5180
|
score: pass ? 1 : 0,
|
|
@@ -5180,7 +5186,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
5180
5186
|
if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
|
|
5181
5187
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
5182
5188
|
const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
|
|
5183
|
-
const pass = assertion.threshold
|
|
5189
|
+
const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
|
|
5184
5190
|
return {
|
|
5185
5191
|
pass,
|
|
5186
5192
|
score: perplexityNorm,
|
|
@@ -5295,7 +5301,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
5295
5301
|
} else {
|
|
5296
5302
|
score = Number.parseFloat(String(result));
|
|
5297
5303
|
if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
5298
|
-
pass = assertion.threshold
|
|
5304
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
5299
5305
|
}
|
|
5300
5306
|
} catch (err) {
|
|
5301
5307
|
return {
|
|
@@ -5556,7 +5562,7 @@ end
|
|
|
5556
5562
|
} else {
|
|
5557
5563
|
score = Number.parseFloat(String(result));
|
|
5558
5564
|
if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
5559
|
-
pass = assertion.threshold
|
|
5565
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
5560
5566
|
}
|
|
5561
5567
|
} catch (err) {
|
|
5562
5568
|
return {
|
|
@@ -5627,6 +5633,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
5627
5633
|
};
|
|
5628
5634
|
};
|
|
5629
5635
|
//#endregion
|
|
5636
|
+
//#region src/assertions/traceUtils.ts
|
|
5637
|
+
/**
|
|
5638
|
+
* Shared utilities for trace assertions
|
|
5639
|
+
*/
|
|
5640
|
+
/**
 * Test a span name against a glob-style pattern.
 *
 * `*` stands for any run of characters and `?` for exactly one character;
 * every other regex metacharacter in the pattern is matched literally.
 * The comparison is anchored at both ends and ignores case.
 *
 * @param spanName - The span name to test
 * @param pattern - The glob pattern
 * @returns true when the whole span name matches the pattern
 */
function matchesPattern(spanName, pattern) {
  // Escape regex metacharacters first so only the glob wildcards stay special.
  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  const source = escaped.replace(/\*/g, ".*").replace(/\?/g, ".");
  const matcher = new RegExp(`^${source}$`, "i");
  return matcher.test(spanName);
}
|
|
5652
|
+
//#endregion
|
|
5653
|
+
//#region src/assertions/skill.ts
|
|
5654
|
+
/**
 * Read the skill calls recorded at `params.providerResponse.metadata.skillCalls`.
 *
 * @param params - Assertion params; only `providerResponse` is read here
 * @returns The entries that are objects with a string `name`, or an empty
 *   array when the metadata value is not an array
 */
function getSkillCalls(params) {
  const candidates = params.providerResponse?.metadata?.skillCalls;
  if (!Array.isArray(candidates)) {
    return [];
  }
  const isNamedRecord = (candidate) => {
    if (!candidate || typeof candidate !== "object") return false;
    return typeof candidate.name === "string";
  };
  return candidates.filter((candidate) => isNamedRecord(candidate));
}
|
|
5659
|
+
/**
 * Decide whether a recorded skill call satisfies a matcher.
 *
 * A truthy `matcher.name` must equal the call's name exactly; a truthy
 * `matcher.pattern` must match it via `matchesPattern`. Criteria that are
 * absent (falsy) are not checked.
 *
 * @param skillCall - Recorded skill call with a `name`
 * @param matcher - Object with optional `name` and `pattern`
 * @returns true when every supplied criterion is satisfied
 */
function matchesSkill(skillCall, matcher) {
  const nameMismatch = Boolean(matcher.name) && skillCall.name !== matcher.name;
  if (nameMismatch) {
    return false;
  }
  const patternMismatch = Boolean(matcher.pattern) && !matchesPattern(skillCall.name, matcher.pattern);
  return !patternMismatch;
}
|
|
5664
|
+
/**
 * Render a skill call for display in assertion reasons.
 *
 * @param skillCall - Recorded skill call with `name` and optional `source` / `path`
 * @returns `name (source, path)` using whichever of source/path are truthy,
 *   or just `name` when neither is
 */
function formatSkillCall(skillCall) {
  const extras = [];
  for (const part of [skillCall.source, skillCall.path]) {
    if (part) extras.push(part);
  }
  const suffix = extras.join(", ");
  if (!suffix) {
    return skillCall.name;
  }
  return `${skillCall.name} (${suffix})`;
}
|
|
5668
|
+
/**
 * Normalize the value of a skill-used assertion into one of two shapes.
 *
 * - A non-blank string or a non-empty array of non-blank strings becomes
 *   `{ kind: "list", matchers: [{ name }, ...] }` with trimmed names.
 * - A plain object becomes `{ kind: "count", matcher: { max, min, name, pattern } }`.
 *
 * @param value - Raw assertion value
 * @returns The normalized list or count descriptor
 * @throws Error when the value has an unsupported shape, when an object has
 *   neither a name nor a pattern, when `min` / `max` is present but is not a
 *   non-negative integer, or when `max` is smaller than `min`
 */
function resolveSkillMatchers(value) {
  const trimmedOrUndefined = (text) => {
    if (typeof text !== "string") return void 0;
    return text.trim();
  };
  const assertCount = (field, count) => {
    const acceptable = Number.isFinite(count) && Number.isInteger(count) && count >= 0;
    if (!acceptable) {
      throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
    }
  };

  if (typeof value === "string") {
    const single = value.trim();
    if (single) {
      return {
        kind: "list",
        matchers: [{ name: single }]
      };
    }
  } else if (Array.isArray(value)) {
    const allNamed = value.length > 0 && value.every((item) => typeof item === "string" && item.trim());
    if (allNamed) {
      return {
        kind: "list",
        matchers: value.map((item) => ({ name: item.trim() }))
      };
    }
  } else if (value && typeof value === "object") {
    const name = trimmedOrUndefined(value.name);
    const pattern = trimmedOrUndefined(value.pattern);
    if (!name && !pattern) {
      throw new Error("skill-used assertion object must include a name or pattern property");
    }
    // A bound is validated whenever its key is present, even if the value is not a number.
    for (const field of ["min", "max"]) {
      if (field in value) assertCount(field, value[field]);
    }
    const min = typeof value.min === "number" ? value.min : void 0;
    const max = typeof value.max === "number" ? value.max : void 0;
    if (min !== void 0 && max !== void 0 && max < min) {
      throw new Error("skill-used assertion object max must be greater than or equal to min");
    }
    return {
      kind: "count",
      matcher: {
        max,
        min,
        name,
        pattern
      }
    };
  }
  throw new Error("skill-used assertion must have a string, string array, or object value");
}
|
|
5702
|
+
/**
 * Grade a list-style skill assertion.
 *
 * In normal mode every expected matcher must be satisfied by at least one
 * recorded skill call; in inverse mode none of them may be.
 *
 * @param params - Assertion params; `inverse` and `assertion` are read
 * @param skillCalls - Recorded skill calls
 * @param actualSkills - Display strings for the recorded calls
 * @param expected - `{ matchers }` list descriptor
 * @returns `{ pass, score, reason, assertion }` with score 1 on pass, 0 otherwise
 */
function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
  // Split the expected matchers into those some call satisfied and those none did.
  const matched = [];
  const missing = [];
  expected.matchers.forEach((matcher) => {
    const wasUsed = skillCalls.some((skillCall) => matchesSkill(skillCall, matcher));
    if (wasUsed) matched.push(matcher);
    else missing.push(matcher);
  });

  const joinNames = (matchers) => matchers.map((matcher) => matcher.name).join(", ");
  const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";

  let pass;
  let reason;
  if (params.inverse) {
    pass = matched.length === 0;
    reason = pass
      ? `Forbidden skill(s) were not used: ${joinNames(expected.matchers)}`
      : `Forbidden skill(s) were used: ${joinNames(matched)}. Actual skills: ${actualSummary}`;
  } else {
    pass = missing.length === 0;
    reason = pass
      ? `Observed required skill(s): ${joinNames(expected.matchers)}. Actual skills: ${actualSummary}`
      : `Missing required skill(s): ${joinNames(missing)}. Actual skills: ${actualSummary}`;
  }
  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
}
|
|
5719
|
+
/**
 * Grade a count-style skill assertion.
 *
 * Normal mode: the number of matching calls must be at least `min` and, when
 * `max` is set, at most `max`. When `min` is not given it defaults to 0 if
 * `max` was given and to 1 otherwise.
 * Inverse mode: passes only when there are no matching calls.
 *
 * @param params - Assertion params; `inverse` and `assertion` are read
 * @param skillCalls - Recorded skill calls
 * @param actualSkills - Display strings for the recorded calls
 * @param matcher - `{ name, pattern, min, max }` descriptor
 * @returns `{ pass, score, reason, assertion }` with score 1 on pass, 0 otherwise
 * @throws Error in inverse mode when `min` is set, or `max` is set to anything but 0
 */
function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
  const { min: requestedMin, max } = matcher;
  const minWasGiven = requestedMin !== void 0;
  const maxWasGiven = max !== void 0;
  const min = requestedMin ?? (maxWasGiven ? 0 : 1);

  const hits = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
  const count = hits.length;
  const label = matcher.pattern || matcher.name || "*";
  const describeHits = () => hits.map(formatSkillCall).join(", ");
  const buildResult = (pass, reason) => ({
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  });

  if (params.inverse) {
    const boundsUnsupported = minWasGiven || (maxWasGiven && max !== 0);
    if (boundsUnsupported) {
      throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
    }
    const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
    if (count === 0) {
      return buildResult(true, `Forbidden skill "${label}" was not used. Actual skills: ${actualSummary}`);
    }
    return buildResult(false, `Forbidden skill "${label}" was used ${count} time(s). Matches: ${describeHits()}`);
  }

  const withinBounds = count >= min && (max === void 0 || count <= max);
  const expectation = max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
  const matchesNote = count > 0 ? `. Matches: ${describeHits()}` : "";
  return buildResult(withinBounds, `Matched skill "${label}" ${count} time(s)${expectation}${matchesNote}`);
}
|
|
5749
|
+
/**
 * Entry point for the skill-used assertion.
 *
 * Collects the recorded skill calls, normalizes the expected value (the
 * rendered value when present, otherwise `params.assertion.value`), and
 * dispatches to the list or count grader depending on the normalized kind.
 *
 * @param params - Assertion params
 * @returns The grading result produced by the selected grader
 */
function handleSkillUsed(params) {
  const observedCalls = getSkillCalls(params);
  const observedLabels = observedCalls.map(formatSkillCall);
  const rawExpectation = params.renderedValue ?? params.assertion.value;
  const expectation = resolveSkillMatchers(rawExpectation);
  switch (expectation.kind) {
    case "list":
      return handleListSkillAssertion(params, observedCalls, observedLabels, expectation);
    default:
      return handleCountSkillAssertion(params, observedCalls, observedLabels, expectation.matcher);
  }
}
|
|
5756
|
+
//#endregion
|
|
5630
5757
|
//#region src/assertions/sql.ts
|
|
5631
5758
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
5632
5759
|
let pass = false;
|
|
@@ -5859,23 +5986,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
5859
5986
|
};
|
|
5860
5987
|
};
|
|
5861
5988
|
//#endregion
|
|
5862
|
-
//#region src/assertions/traceUtils.ts
|
|
5863
|
-
/**
|
|
5864
|
-
* Shared utilities for trace assertions
|
|
5865
|
-
*/
|
|
5866
|
-
/**
 * Check a span name against a glob-like pattern.
 *
 * The pattern is translated to an anchored, case-insensitive regular
 * expression: `*` becomes "any characters", `?` becomes "one character",
 * and all other regex metacharacters are escaped.
 *
 * @param spanName - The span name to match
 * @param pattern - The glob pattern to match against
 * @returns true if the span name matches the pattern
 */
function matchesPattern(spanName, pattern) {
  const literalSafe = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  const withWildcards = literalSafe.replace(/\*/g, ".*").replace(/\?/g, ".");
  const anchored = `^${withWildcards}$`;
  return new RegExp(anchored, "i").test(spanName);
}
|
|
5878
|
-
//#endregion
|
|
5879
5989
|
//#region src/assertions/traceErrorSpans.ts
|
|
5880
5990
|
function isErrorSpan(span) {
|
|
5881
5991
|
if (span.statusCode && span.statusCode >= 400) return true;
|
|
@@ -6044,6 +6154,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
6044
6154
|
};
|
|
6045
6155
|
};
|
|
6046
6156
|
//#endregion
|
|
6157
|
+
//#region src/assertions/trajectoryUtils.ts
|
|
6158
|
+
// Span attribute keys checked, in this order, for a tool name.
const TOOL_ATTRIBUTE_KEYS = [
  "tool.name",
  "tool_name",
  "tool",
  "function.name",
  "function_name",
  "gen_ai.tool.name",
  "codex.mcp.tool",
  "agent.tool",
  "agent.tool_name",
  "agent.toolName"
];

// Span attribute keys checked, in this order, for tool arguments.
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
  // tool.* / tool_*
  "tool.arguments",
  "tool.args",
  "tool.input",
  "tool_arguments",
  "tool_args",
  "tool_input",
  // function.* / function_*
  "function.arguments",
  "function.args",
  "function.input",
  "function_arguments",
  "function_args",
  // gen_ai.tool.*
  "gen_ai.tool.arguments",
  "gen_ai.tool.args",
  "gen_ai.tool.input",
  "gen_ai.tool.call.arguments",
  "gen_ai.tool.call.args",
  // agent.tool.*
  "agent.tool.arguments",
  "agent.tool.args",
  "agent.tool.input",
  // codex.mcp.*
  "codex.mcp.arguments",
  "codex.mcp.args",
  "codex.mcp.input",
  // bare keys, tried last
  "arguments",
  "args",
  "input"
];

// Span attribute keys checked, in this order, for a command string.
const COMMAND_ATTRIBUTE_KEYS = [
  "codex.command",
  "command",
  "command.name",
  "command_name"
];

// Span attribute keys checked, in this order, for an explicit search query.
const SEARCH_ATTRIBUTE_KEYS = [
  "codex.search.query",
  "search.query",
  "search_query"
];

// Generic query key, kept separate from the search-specific keys above.
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];

// Matches a search-related word delimited by start/end of string or one of: whitespace . _ : / -
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;

// Step budget for the judge summary, expressed as head + tail.
const JUDGE_SUMMARY_HEAD_STEPS = 12;
const JUDGE_SUMMARY_TAIL_STEPS = 12;
const MAX_JUDGE_SUMMARY_STEPS = JUDGE_SUMMARY_HEAD_STEPS + JUDGE_SUMMARY_TAIL_STEPS;
|
|
6213
|
+
/**
 * Return the first non-blank string attribute among the given keys.
 *
 * @param attributes - Attribute bag to read from
 * @param keys - Keys to try, in priority order
 * @returns The trimmed value of the first key holding a non-blank string,
 *   or undefined when no key qualifies
 */
function getStringAttribute(attributes, keys) {
  for (const key of keys) {
    const candidate = attributes[key];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return void 0;
}
|
|
6219
|
+
/**
 * Normalize a raw attribute value that may carry structured data.
 *
 * - null / undefined and blank strings yield undefined.
 * - Other strings are trimmed and parsed as JSON; when parsing fails the
 *   trimmed text itself is returned.
 * - Numbers, booleans and objects are returned unchanged.
 * - Any other type yields undefined.
 *
 * @param value - Raw attribute value
 * @returns The normalized value, or undefined when nothing usable is present
 */
function normalizeStructuredAttribute(value) {
  if (value == null) {
    return void 0;
  }
  switch (typeof value) {
    case "string": {
      const text = value.trim();
      if (text === "") return void 0;
      try {
        return JSON.parse(text);
      } catch {
        // Not JSON: fall back to the plain trimmed text.
        return text;
      }
    }
    case "number":
    case "boolean":
    case "object":
      return value;
    default:
      return void 0;
  }
}
|
|
6232
|
+
/**
 * Compare two optional status objects by `code` and `message`.
 *
 * @param left - First status, may be null/undefined
 * @param right - Second status, may be null/undefined
 * @returns true when both `code` and `message` are strictly equal
 *   (a missing status reads as undefined for both fields)
 */
function hasSameStatus(left, right) {
  if (left?.code !== right?.code) {
    return false;
  }
  return left?.message === right?.message;
}
|
|
6235
|
+
/**
 * Heuristically decide whether a span represents a search operation.
 *
 * A span qualifies when its name matches SEARCH_SPAN_NAME_PATTERN or starts
 * with "search ", or when any attribute key other than "query" contains a
 * search / lookup / retrieve / retrieval segment delimited by `.` or `_`.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true when the span looks search-related
 */
function isSearchLikeSpan(span) {
  const attributes = span.attributes || {};
  const nameLooksLikeSearch = SEARCH_SPAN_NAME_PATTERN.test(span.name) || span.name.startsWith("search ");
  if (nameLooksLikeSearch) {
    return true;
  }
  const searchKeyPattern = /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i;
  for (const key of Object.keys(attributes)) {
    if (key === "query") continue;
    if (searchKeyPattern.test(key)) return true;
  }
  return false;
}
|
|
6240
|
+
/**
 * Build the status summary for a trajectory step.
 *
 * @param step - Step with optional `statusCode` and `statusMessage`
 * @returns undefined when the status code is undefined or 0; otherwise
 *   `{ code }`, plus `message` when the step has a truthy status message
 */
function getTrajectoryStepStatus(step) {
  const { statusCode, statusMessage } = step;
  if (statusCode === void 0 || statusCode === 0) {
    return void 0;
  }
  const status = { code: statusCode };
  if (statusMessage) {
    status.message = statusMessage;
  }
  return status;
}
|
|
6247
|
+
/**
 * Extract the first whitespace-delimited token of a command line.
 *
 * @param command - Command string
 * @returns The first token after trimming, or undefined for a blank command
 */
function getCommandExecutable(command) {
  const [executable] = command.trim().split(/\s+/);
  return executable || void 0;
}
|
|
6250
|
+
/**
 * Derive a tool name from a span.
 *
 * Lookup order:
 *   1. The known keys in TOOL_ATTRIBUTE_KEYS.
 *   2. The first non-blank string attribute whose key looks like a
 *      tool/function name, or contains a `tool` segment and does not mention
 *      result/output.
 *   3. For span names starting with "mcp ", the text after the last "/".
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns The trimmed tool name, or undefined when none is found
 */
function extractToolName(span) {
  const attributes = span.attributes || {};
  const knownKeyValue = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
  if (knownKeyValue) {
    return knownKeyValue;
  }

  const keyNamesTool = (key) => {
    if (/tool.?name|function.?name/i.test(key)) return true;
    return /(^|[._])tool($|[._])/i.test(key) && !/result|output/i.test(key);
  };
  for (const [key, value] of Object.entries(attributes)) {
    if (typeof value !== "string") continue;
    const trimmed = value.trim();
    if (trimmed && keyNamesTool(key)) return trimmed;
  }

  const spanName = span.name;
  if (!spanName.startsWith("mcp ")) {
    return void 0;
  }
  const slashIndex = spanName.lastIndexOf("/");
  // Only use the suffix when a slash exists and is not the final character.
  const hasSuffix = slashIndex !== -1 && slashIndex < spanName.length - 1;
  return hasSuffix ? spanName.slice(slashIndex + 1).trim() : void 0;
}
|
|
6264
|
+
/**
 * Derive tool arguments from a span's attributes.
 *
 * The known keys in TOOL_ARGUMENT_ATTRIBUTE_KEYS are tried first; after that,
 * any attribute whose key contains an arguments / args / input segment and
 * does not mention result, output, error or status. Each candidate is passed
 * through normalizeStructuredAttribute and the first defined result wins.
 *
 * @param span - Span with optional `attributes`
 * @returns The normalized arguments, or undefined when none are found
 */
function extractToolArgs(span) {
  const attributes = span.attributes || {};
  for (const knownKey of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
    const normalized = normalizeStructuredAttribute(attributes[knownKey]);
    if (normalized !== void 0) return normalized;
  }

  const looksLikeArgsKey = (key) => {
    if (/result|output|error|status/i.test(key)) return false;
    return /(^|[._])(arguments|args|input)($|[._])/i.test(key);
  };
  for (const [key, rawValue] of Object.entries(attributes)) {
    if (!looksLikeArgsKey(key)) continue;
    const normalized = normalizeStructuredAttribute(rawValue);
    if (normalized !== void 0) return normalized;
  }
  return void 0;
}
|
|
6277
|
+
/**
 * Derive a command string from a span.
 *
 * Lookup order:
 *   1. The known keys in COMMAND_ATTRIBUTE_KEYS.
 *   2. The first non-blank string attribute whose key contains "command"
 *      and does not mention output/result.
 *   3. For span names starting with "exec ", the rest of the name.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns The trimmed command, or undefined when none is found
 */
function extractCommand(span) {
  const attributes = span.attributes || {};
  const knownKeyValue = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
  if (knownKeyValue) {
    return knownKeyValue;
  }

  for (const [key, value] of Object.entries(attributes)) {
    if (typeof value !== "string") continue;
    const trimmed = value.trim();
    if (!trimmed) continue;
    const keyNamesCommand = /command/i.test(key) && !/output|result/i.test(key);
    if (keyNamesCommand) return trimmed;
  }

  const execPrefix = "exec ";
  if (span.name.startsWith(execPrefix)) {
    return span.name.slice(execPrefix.length).trim();
  }
  return void 0;
}
|
|
6287
|
+
/**
 * Derive a search query from a span.
 *
 * Lookup order:
 *   1. The known keys in SEARCH_ATTRIBUTE_KEYS.
 *   2. The generic query key, but only when the span looks search-related.
 *   3. For span names starting with "search ", the rest of the name with one
 *      leading and one trailing double quote removed, then trimmed.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns The query string, or undefined when none is found
 */
function extractSearchQuery(span) {
  const attributes = span.attributes || {};
  const explicitQuery = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
  if (explicitQuery) {
    return explicitQuery;
  }
  const looseQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
  if (looseQuery && isSearchLikeSpan(span)) {
    return looseQuery;
  }
  const searchPrefix = "search ";
  if (!span.name.startsWith(searchPrefix)) {
    return void 0;
  }
  return span.name.slice(searchPrefix.length).replace(/^"|"$/g, "").trim();
}
|
|
6295
|
+
/**
 * Decide whether a span represents a reasoning step.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true when the `codex.item.type` attribute is "reasoning", or the
 *   name is "reasoning" (any case) optionally followed by `_` or whitespace
 */
function isReasoningSpan(span) {
  const attributes = span.attributes || {};
  if (attributes["codex.item.type"] === "reasoning") {
    return true;
  }
  if (span.name === "reasoning") {
    return true;
  }
  return /^reasoning([_\s]|$)/i.test(span.name);
}
|
|
6299
|
+
/**
 * Decide whether a span represents a message step.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true when the `codex.item.type` attribute is "agent_message", or
 *   the name is exactly "agent response" or "send input"
 */
function isMessageSpan(span) {
  const attributes = span.attributes || {};
  if (attributes["codex.item.type"] === "agent_message") {
    return true;
  }
  const messageSpanNames = ["agent response", "send input"];
  return messageSpanNames.includes(span.name);
}
|
|
6303
|
+
/**
 * Convert a trace's spans into an ordered list of trajectory steps.
 *
 * Spans are sorted by start time, then by end time (falling back to start
 * time when an end time is missing), then by original position. Each span is
 * classified as tool, command, search, reasoning, message, or plain span, in
 * that priority order, and tagged with the aliases it can be matched under.
 *
 * @param trace - Trace with an optional `spans` collection
 * @returns One step object per span, in chronological order
 */
function extractTrajectorySteps(trace) {
  const compareChronologically = (left, right) => {
    const startDelta = left.span.startTime - right.span.startTime;
    if (startDelta !== 0) return startDelta;
    const leftEnd = left.span.endTime ?? left.span.startTime;
    const rightEnd = right.span.endTime ?? right.span.startTime;
    const endDelta = leftEnd - rightEnd;
    return endDelta !== 0 ? endDelta : left.index - right.index;
  };

  // Work out the step type, display name, extra aliases and (tools only) arguments.
  const classify = (span) => {
    const toolName = extractToolName(span);
    const command = extractCommand(span);
    const searchQuery = extractSearchQuery(span);
    if (toolName) {
      return { type: "tool", name: toolName, extraAliases: [toolName], args: extractToolArgs(span) };
    }
    if (command) {
      const executable = getCommandExecutable(command);
      return { type: "command", name: command, extraAliases: executable ? [command, executable] : [command] };
    }
    if (searchQuery) {
      return { type: "search", name: searchQuery, extraAliases: [searchQuery] };
    }
    if (isReasoningSpan(span)) {
      return { type: "reasoning", name: span.name, extraAliases: ["reasoning"] };
    }
    if (isMessageSpan(span)) {
      return { type: "message", name: span.name, extraAliases: ["message"] };
    }
    return { type: "span", name: span.name, extraAliases: [] };
  };

  const toStep = ({ span }) => {
    const { type, name, extraAliases, args } = classify(span);
    // The Set removes duplicates while keeping the span name first.
    const step = { aliases: [...new Set([span.name, ...extraAliases])] };
    if (args !== void 0) step.args = args;
    step.attributes = span.attributes || {};
    step.endTime = span.endTime;
    step.name = name;
    step.spanId = span.spanId;
    step.spanName = span.name;
    step.startTime = span.startTime;
    step.statusCode = span.statusCode;
    step.statusMessage = span.statusMessage;
    step.type = type;
    return step;
  };

  const indexedSpans = [...(trace.spans || [])].map((span, index) => ({
    span,
    index
  }));
  indexedSpans.sort(compareChronologically);
  return indexedSpans.map((entry) => toStep(entry));
}
|
|
6360
|
+
/**
 * Normalize a trajectory matcher into object form.
 *
 * A string becomes `{ pattern }`; an object is shallow-copied. When the
 * matcher has no truthy `type` and a default type is supplied, the default
 * is applied.
 *
 * @param matcher - Pattern string or matcher object
 * @param defaultType - Optional type to apply when the matcher sets none
 * @returns A new matcher object
 */
function normalizeTrajectoryMatcher(matcher, defaultType) {
  let normalized;
  let lacksType;
  if (typeof matcher === "string") {
    normalized = { pattern: matcher };
    lacksType = true;
  } else {
    normalized = { ...matcher };
    lacksType = !matcher.type;
  }
  if (lacksType && defaultType) {
    normalized.type = defaultType;
  }
  return normalized;
}
|
|
6370
|
+
/**
 * Decide whether a trajectory step satisfies a matcher.
 *
 * The matcher is normalized first. If it specifies a type (single value or
 * array), the step's type must be among them. If it specifies a pattern
 * (or, failing that, a name), at least one of the step's aliases must match
 * it as a glob; with neither, any step of an accepted type matches.
 *
 * @param step - Trajectory step with `type` and `aliases`
 * @param matcher - Pattern string or matcher object
 * @param defaultType - Optional type applied when the matcher sets none
 * @returns true when the step satisfies the matcher
 */
function matchesTrajectoryStep(step, matcher, defaultType) {
  const normalized = normalizeTrajectoryMatcher(matcher, defaultType);
  const allowedTypes = normalized.type;
  if (allowedTypes) {
    const typeList = Array.isArray(allowedTypes) ? allowedTypes : [allowedTypes];
    if (!typeList.includes(step.type)) return false;
  }
  const glob = normalized.pattern || normalized.name;
  if (!glob) {
    return true;
  }
  return step.aliases.some((alias) => matchesPattern(alias, glob));
}
|
|
6379
|
+
function formatTrajectoryStep(step) {
|
|
6380
|
+
return `${step.type}:${step.name}`;
|
|
6381
|
+
}
|
|
6382
|
+
function formatTrajectoryArgs(args) {
|
|
6383
|
+
if (args === void 0) return "(none)";
|
|
6384
|
+
try {
|
|
6385
|
+
const serialized = JSON.stringify(args);
|
|
6386
|
+
if (serialized !== void 0) return serialized;
|
|
6387
|
+
} catch {}
|
|
6388
|
+
return String(args);
|
|
6389
|
+
}
|
|
6390
|
+
function compactJudgeTrajectorySteps(steps) {
|
|
6391
|
+
const compacted = [];
|
|
6392
|
+
for (const step of steps) {
|
|
6393
|
+
const previousStep = compacted[compacted.length - 1];
|
|
6394
|
+
if (previousStep && previousStep.type === step.type && previousStep.name === step.name && previousStep.spanName === step.spanName && hasSameStatus(previousStep.status, step.status)) {
|
|
6395
|
+
previousStep.collapsedCount = (previousStep.collapsedCount ?? 1) + 1;
|
|
6396
|
+
continue;
|
|
6397
|
+
}
|
|
6398
|
+
compacted.push(step);
|
|
6399
|
+
}
|
|
6400
|
+
return compacted;
|
|
6401
|
+
}
|
|
6402
|
+
function truncateJudgeTrajectorySteps(steps) {
|
|
6403
|
+
if (steps.length <= MAX_JUDGE_SUMMARY_STEPS) return steps;
|
|
6404
|
+
return [
|
|
6405
|
+
...steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS),
|
|
6406
|
+
{ omittedCount: steps.length - MAX_JUDGE_SUMMARY_STEPS },
|
|
6407
|
+
...steps.slice(-JUDGE_SUMMARY_TAIL_STEPS)
|
|
6408
|
+
];
|
|
6409
|
+
}
|
|
6410
|
+
function summarizeTrajectoryForJudge(trace) {
|
|
6411
|
+
const rawSteps = extractTrajectorySteps(trace).map((step, index) => ({
|
|
6412
|
+
index: index + 1,
|
|
6413
|
+
type: step.type,
|
|
6414
|
+
name: step.name,
|
|
6415
|
+
...step.spanName === step.name ? {} : { spanName: step.spanName },
|
|
6416
|
+
...getTrajectoryStepStatus(step) ? { status: getTrajectoryStepStatus(step) } : {}
|
|
6417
|
+
}));
|
|
6418
|
+
const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
|
|
6419
|
+
const steps = truncateJudgeTrajectorySteps(compactedSteps);
|
|
6420
|
+
return JSON.stringify({
|
|
6421
|
+
traceId: trace.traceId,
|
|
6422
|
+
stepCount: rawSteps.length,
|
|
6423
|
+
compactedStepCount: compactedSteps.length,
|
|
6424
|
+
steps
|
|
6425
|
+
}, null, 2);
|
|
6426
|
+
}
|
|
6427
|
+
//#endregion
|
|
6428
|
+
//#region src/assertions/trajectory.ts
|
|
6429
|
+
function getTraceOrThrow(params) {
|
|
6430
|
+
const trace = params.assertionValueContext.trace;
|
|
6431
|
+
if (!trace || !trace.spans) throw new Error(`No trace data available for ${params.baseType} assertion`);
|
|
6432
|
+
return trace;
|
|
6433
|
+
}
|
|
6434
|
+
function applyInverse(pass, inverse) {
|
|
6435
|
+
return inverse ? !pass : pass;
|
|
6436
|
+
}
|
|
6437
|
+
function formatStepList(stepLabels) {
|
|
6438
|
+
return stepLabels.length > 0 ? stepLabels.join(", ") : "(none)";
|
|
6439
|
+
}
|
|
6440
|
+
function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
|
|
6441
|
+
if (matcher.pattern || matcher.name) return;
|
|
6442
|
+
const stepLabel = index === void 0 ? "object" : `step ${index + 1}`;
|
|
6443
|
+
throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
|
|
6444
|
+
}
|
|
6445
|
+
function resolveGoalSuccessValue(value) {
|
|
6446
|
+
if (typeof value === "string" && value.trim()) return { goal: value.trim() };
|
|
6447
|
+
if (value && typeof value === "object" && !Array.isArray(value) && typeof value.goal === "string" && value.goal.trim()) return { goal: value.goal.trim() };
|
|
6448
|
+
throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
|
|
6449
|
+
}
|
|
6450
|
+
function resolveToolMatchers(value) {
|
|
6451
|
+
if (typeof value === "string") return {
|
|
6452
|
+
kind: "list",
|
|
6453
|
+
matchers: [normalizeTrajectoryMatcher(value, "tool")]
|
|
6454
|
+
};
|
|
6455
|
+
if (Array.isArray(value) && value.every((item) => typeof item === "string")) return {
|
|
6456
|
+
kind: "list",
|
|
6457
|
+
matchers: value.map((item) => normalizeTrajectoryMatcher(item, "tool"))
|
|
6458
|
+
};
|
|
6459
|
+
if (value && typeof value === "object" && !Array.isArray(value)) return {
|
|
6460
|
+
kind: "count",
|
|
6461
|
+
matcher: {
|
|
6462
|
+
...normalizeTrajectoryMatcher(value, "tool"),
|
|
6463
|
+
max: typeof value.max === "number" ? value.max : void 0,
|
|
6464
|
+
min: typeof value.min === "number" ? value.min : void 0
|
|
6465
|
+
}
|
|
6466
|
+
};
|
|
6467
|
+
throw new Error("trajectory:tool-used assertion must have a string, string array, or object value");
|
|
6468
|
+
}
|
|
6469
|
+
const handleTrajectoryToolUsed = (params) => {
|
|
6470
|
+
const steps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
|
|
6471
|
+
const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
|
|
6472
|
+
if (expected.kind === "list") {
|
|
6473
|
+
if (expected.matchers.length === 0) throw new Error("trajectory:tool-used assertion requires at least one expected tool");
|
|
6474
|
+
const missing = expected.matchers.filter((matcher) => !steps.some((step) => matchesTrajectoryStep(step, matcher)));
|
|
6475
|
+
const matched = expected.matchers.filter((matcher) => steps.some((step) => matchesTrajectoryStep(step, matcher)));
|
|
6476
|
+
const pass = params.inverse ? matched.length === 0 : missing.length === 0;
|
|
6477
|
+
const actualTools = steps.map(formatTrajectoryStep);
|
|
6478
|
+
const expectedTools = expected.matchers.map((matcher) => matcher.pattern || matcher.name || "*");
|
|
6479
|
+
let reason;
|
|
6480
|
+
if (params.inverse) reason = pass ? `Forbidden tool(s) were not used: ${expectedTools.join(", ")}` : `Forbidden tool(s) were used: ${matched.map((matcher) => matcher.pattern || matcher.name || "*").join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
|
|
6481
|
+
else if (pass) reason = `Observed required tool(s): ${expectedTools.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
|
|
6482
|
+
else reason = `Missing required tool(s): ${missing.map((matcher) => matcher.pattern || matcher.name || "*").join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
|
|
6483
|
+
return {
|
|
6484
|
+
pass,
|
|
6485
|
+
score: pass ? 1 : 0,
|
|
6486
|
+
reason,
|
|
6487
|
+
assertion: params.assertion
|
|
6488
|
+
};
|
|
6489
|
+
}
|
|
6490
|
+
const matcher = expected.matcher;
|
|
6491
|
+
const min = matcher.min ?? 1;
|
|
6492
|
+
const max = matcher.max;
|
|
6493
|
+
if (!matcher.pattern && !matcher.name) throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
|
|
6494
|
+
const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
|
|
6495
|
+
const count = matchingSteps.length;
|
|
6496
|
+
const basePass = count >= min && (max === void 0 || count <= max);
|
|
6497
|
+
const pass = applyInverse(basePass, params.inverse);
|
|
6498
|
+
const matcherLabel = matcher.pattern || matcher.name || "*";
|
|
6499
|
+
let reason = `Matched tool "${matcherLabel}" ${count} time(s)`;
|
|
6500
|
+
if (max === void 0) reason += ` (expected at least ${min})`;
|
|
6501
|
+
else reason += ` (expected ${min}-${max})`;
|
|
6502
|
+
if (matchingSteps.length > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
|
|
6503
|
+
if (params.inverse) reason = basePass ? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion` : `Tool "${matcherLabel}" did not satisfy the forbidden match condition`;
|
|
6504
|
+
return {
|
|
6505
|
+
pass,
|
|
6506
|
+
score: pass ? 1 : 0,
|
|
6507
|
+
reason,
|
|
6508
|
+
assertion: params.assertion
|
|
6509
|
+
};
|
|
6510
|
+
};
|
|
6511
|
+
function resolveSequenceValue(value) {
|
|
6512
|
+
if (Array.isArray(value)) return {
|
|
6513
|
+
mode: "in_order",
|
|
6514
|
+
steps: value
|
|
6515
|
+
};
|
|
6516
|
+
if (value && typeof value === "object" && !Array.isArray(value)) {
|
|
6517
|
+
const sequenceValue = value;
|
|
6518
|
+
return {
|
|
6519
|
+
mode: sequenceValue.mode || "in_order",
|
|
6520
|
+
steps: sequenceValue.steps || []
|
|
6521
|
+
};
|
|
6522
|
+
}
|
|
6523
|
+
throw new Error("trajectory:tool-sequence assertion must have an array or object value");
|
|
6524
|
+
}
|
|
6525
|
+
function isRecord(value) {
|
|
6526
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
6527
|
+
}
|
|
6528
|
+
function matchesExpectedArgsPartial(actual, expected) {
|
|
6529
|
+
if (Array.isArray(expected)) return Array.isArray(actual) && actual.length === expected.length && expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
|
|
6530
|
+
if (isRecord(expected)) {
|
|
6531
|
+
if (!isRecord(actual)) return false;
|
|
6532
|
+
return Object.entries(expected).every(([key, expectedValue]) => Object.prototype.hasOwnProperty.call(actual, key) && matchesExpectedArgsPartial(actual[key], expectedValue));
|
|
6533
|
+
}
|
|
6534
|
+
return isDeepStrictEqual(actual, expected);
|
|
6535
|
+
}
|
|
6536
|
+
function matchesToolArgs(actual, expected, mode) {
|
|
6537
|
+
if (mode === "exact") return isDeepStrictEqual(actual, expected);
|
|
6538
|
+
return matchesExpectedArgsPartial(actual, expected);
|
|
6539
|
+
}
|
|
6540
|
+
function resolveToolArgsMatchMode(mode) {
|
|
6541
|
+
if (mode === void 0) return "partial";
|
|
6542
|
+
if (mode === "partial" || mode === "exact") return mode;
|
|
6543
|
+
throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
|
|
6544
|
+
}
|
|
6545
|
+
function resolveToolArgsMatchValue(value) {
|
|
6546
|
+
if (!value || typeof value !== "object" || Array.isArray(value)) throw new Error("trajectory:tool-args-match assertion must have an object value");
|
|
6547
|
+
const matcher = normalizeTrajectoryMatcher(value, "tool");
|
|
6548
|
+
requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");
|
|
6549
|
+
const expectedArgs = Object.prototype.hasOwnProperty.call(value, "args") ? value.args : value.arguments;
|
|
6550
|
+
if (expectedArgs === void 0) throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");
|
|
6551
|
+
return {
|
|
6552
|
+
matcher,
|
|
6553
|
+
expectedArgs,
|
|
6554
|
+
mode: resolveToolArgsMatchMode(value.mode)
|
|
6555
|
+
};
|
|
6556
|
+
}
|
|
6557
|
+
const handleTrajectoryToolSequence = (params) => {
|
|
6558
|
+
const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
|
|
6559
|
+
const value = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
|
|
6560
|
+
const expectedMatchers = value.steps.map((step, index) => {
|
|
6561
|
+
const matcher = normalizeTrajectoryMatcher(step, "tool");
|
|
6562
|
+
requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", index);
|
|
6563
|
+
return matcher;
|
|
6564
|
+
});
|
|
6565
|
+
if (expectedMatchers.length === 0) throw new Error("trajectory:tool-sequence assertion requires at least one expected step");
|
|
6566
|
+
const actualTools = toolSteps.map(formatTrajectoryStep);
|
|
6567
|
+
let basePass = false;
|
|
6568
|
+
let reason = "";
|
|
6569
|
+
if (value.mode === "exact") {
|
|
6570
|
+
basePass = toolSteps.length === expectedMatchers.length && expectedMatchers.every((matcher, index) => matchesTrajectoryStep(toolSteps[index], matcher));
|
|
6571
|
+
if (basePass) reason = `Observed exact tool sequence: ${formatStepList(actualTools)}`;
|
|
6572
|
+
else reason = `Expected exact tool sequence of ${expectedMatchers.map((matcher) => matcher.pattern || matcher.name || "*").join(", ")}, but actual tools were ${formatStepList(actualTools)}`;
|
|
6573
|
+
} else {
|
|
6574
|
+
let expectedIndex = 0;
|
|
6575
|
+
const matchedSteps = [];
|
|
6576
|
+
for (const step of toolSteps) {
|
|
6577
|
+
if (expectedIndex >= expectedMatchers.length) break;
|
|
6578
|
+
if (matchesTrajectoryStep(step, expectedMatchers[expectedIndex])) {
|
|
6579
|
+
matchedSteps.push(formatTrajectoryStep(step));
|
|
6580
|
+
expectedIndex += 1;
|
|
6581
|
+
}
|
|
6582
|
+
}
|
|
6583
|
+
basePass = expectedIndex === expectedMatchers.length;
|
|
6584
|
+
if (basePass) reason = `Observed tool sequence in order: ${matchedSteps.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
|
|
6585
|
+
else reason = `Expected tool "${expectedMatchers[expectedIndex]?.pattern || expectedMatchers[expectedIndex]?.name || "*"}" was not observed in order. Actual tools: ${formatStepList(actualTools)}`;
|
|
6586
|
+
}
|
|
6587
|
+
const pass = applyInverse(basePass, params.inverse);
|
|
6588
|
+
if (params.inverse) reason = basePass ? `Forbidden tool sequence was observed. Actual tools: ${formatStepList(actualTools)}` : `Forbidden tool sequence was not observed`;
|
|
6589
|
+
return {
|
|
6590
|
+
pass,
|
|
6591
|
+
score: pass ? 1 : 0,
|
|
6592
|
+
reason,
|
|
6593
|
+
assertion: params.assertion
|
|
6594
|
+
};
|
|
6595
|
+
};
|
|
6596
|
+
const handleTrajectoryToolArgsMatch = (params) => {
|
|
6597
|
+
const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
|
|
6598
|
+
const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
|
|
6599
|
+
const matcherLabel = matcher.pattern || matcher.name || "*";
|
|
6600
|
+
const actualTools = toolSteps.map(formatTrajectoryStep);
|
|
6601
|
+
const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
|
|
6602
|
+
const stepsWithArgs = matchingSteps.filter((step) => step.args !== void 0);
|
|
6603
|
+
const matchedStep = stepsWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));
|
|
6604
|
+
const basePass = matchedStep !== void 0;
|
|
6605
|
+
const pass = applyInverse(basePass, params.inverse);
|
|
6606
|
+
const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
|
|
6607
|
+
const observedArgsLabel = stepsWithArgs.length > 0 ? stepsWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ") : "(none)";
|
|
6608
|
+
let reason;
|
|
6609
|
+
if (params.inverse) if (basePass) reason = `Forbidden argument match for tool "${matcherLabel}" was observed on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
|
|
6610
|
+
else if (matchingSteps.length === 0) reason = `Forbidden argument match for tool "${matcherLabel}" was not observed because no tool call matched it`;
|
|
6611
|
+
else reason = `Forbidden argument match for tool "${matcherLabel}" was not observed. Observed args: ${observedArgsLabel}`;
|
|
6612
|
+
else if (basePass) reason = `Tool "${matcherLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
|
|
6613
|
+
else if (matchingSteps.length === 0) reason = `No tool call matched "${matcherLabel}". Actual tools: ${formatStepList(actualTools)}`;
|
|
6614
|
+
else if (stepsWithArgs.length === 0) reason = `Tool "${matcherLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(actualTools)}`;
|
|
6615
|
+
else reason = `No call to tool "${matcherLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
|
|
6616
|
+
return {
|
|
6617
|
+
pass,
|
|
6618
|
+
score: pass ? 1 : 0,
|
|
6619
|
+
reason,
|
|
6620
|
+
assertion: params.assertion
|
|
6621
|
+
};
|
|
6622
|
+
};
|
|
6623
|
+
function resolveStepCountValue(value) {
|
|
6624
|
+
if (!value || typeof value !== "object" || Array.isArray(value)) throw new Error("trajectory:step-count assertion must have an object value");
|
|
6625
|
+
return {
|
|
6626
|
+
...normalizeTrajectoryMatcher(value),
|
|
6627
|
+
max: typeof value.max === "number" ? value.max : void 0,
|
|
6628
|
+
min: typeof value.min === "number" ? value.min : void 0
|
|
6629
|
+
};
|
|
6630
|
+
}
|
|
6631
|
+
const handleTrajectoryStepCount = (params) => {
|
|
6632
|
+
const steps = extractTrajectorySteps(getTraceOrThrow(params));
|
|
6633
|
+
const matcher = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
|
|
6634
|
+
const { min, max } = matcher;
|
|
6635
|
+
if (min === void 0 && max === void 0) throw new Error("trajectory:step-count assertion must include a min or max property");
|
|
6636
|
+
const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
|
|
6637
|
+
const count = matchingSteps.length;
|
|
6638
|
+
const basePass = (min === void 0 || count >= min) && (max === void 0 || count <= max);
|
|
6639
|
+
const pass = applyInverse(basePass, params.inverse);
|
|
6640
|
+
const filterParts = [];
|
|
6641
|
+
if (matcher.type) {
|
|
6642
|
+
const types = Array.isArray(matcher.type) ? matcher.type : [matcher.type];
|
|
6643
|
+
filterParts.push(`type=${types.join("|")}`);
|
|
6644
|
+
}
|
|
6645
|
+
const pattern = matcher.pattern || matcher.name;
|
|
6646
|
+
if (pattern) filterParts.push(`pattern=${pattern}`);
|
|
6647
|
+
let reason = `Matched ${count} trajectory step(s)`;
|
|
6648
|
+
if (filterParts.length > 0) reason += ` for ${filterParts.join(", ")}`;
|
|
6649
|
+
if (min !== void 0 && max !== void 0) reason += ` (expected ${min}-${max})`;
|
|
6650
|
+
else if (min !== void 0) reason += ` (expected at least ${min})`;
|
|
6651
|
+
else if (max !== void 0) reason += ` (expected at most ${max})`;
|
|
6652
|
+
if (matchingSteps.length > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
|
|
6653
|
+
if (params.inverse) reason = basePass ? `Trajectory step count satisfied the forbidden range` : `Trajectory step count did not satisfy the forbidden range`;
|
|
6654
|
+
return {
|
|
6655
|
+
pass,
|
|
6656
|
+
score: pass ? 1 : 0,
|
|
6657
|
+
reason,
|
|
6658
|
+
assertion: params.assertion
|
|
6659
|
+
};
|
|
6660
|
+
};
|
|
6661
|
+
const handleTrajectoryGoalSuccess = async (params) => {
|
|
6662
|
+
const trace = getTraceOrThrow(params);
|
|
6663
|
+
const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
|
|
6664
|
+
const result = await matchesTrajectoryGoalSuccess(goal, summarizeTrajectoryForJudge(trace), params.outputString, params.test.options, params.assertionValueContext.vars, params.assertion, params.providerCallContext);
|
|
6665
|
+
if (!params.inverse) return result;
|
|
6666
|
+
return {
|
|
6667
|
+
...result,
|
|
6668
|
+
assertion: params.assertion,
|
|
6669
|
+
pass: !result.pass,
|
|
6670
|
+
score: result.pass ? 0 : 1,
|
|
6671
|
+
reason: result.pass ? `Agent unexpectedly achieved the goal: ${goal}` : `Agent did not achieve the forbidden goal: ${goal}`
|
|
6672
|
+
};
|
|
6673
|
+
};
|
|
6674
|
+
//#endregion
|
|
6047
6675
|
//#region src/assertions/webhook.ts
|
|
6048
6676
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
6049
6677
|
invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
|
|
@@ -6112,18 +6740,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
6112
6740
|
if (pass) reason = "Assertion passed";
|
|
6113
6741
|
else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
|
|
6114
6742
|
else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
|
|
6115
|
-
} else if (min
|
|
6116
|
-
const basePass = wordCount >= min;
|
|
6117
|
-
pass = inverse ? !basePass : basePass;
|
|
6118
|
-
if (pass) reason = "Assertion passed";
|
|
6119
|
-
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
6120
|
-
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
6121
|
-
} else {
|
|
6743
|
+
} else if (min === void 0) {
|
|
6122
6744
|
const basePass = wordCount <= max;
|
|
6123
6745
|
pass = inverse ? !basePass : basePass;
|
|
6124
6746
|
if (pass) reason = "Assertion passed";
|
|
6125
6747
|
else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
|
|
6126
6748
|
else reason = `Word count ${wordCount} is greater than maximum ${max}`;
|
|
6749
|
+
} else {
|
|
6750
|
+
const basePass = wordCount >= min;
|
|
6751
|
+
pass = inverse ? !basePass : basePass;
|
|
6752
|
+
if (pass) reason = "Assertion passed";
|
|
6753
|
+
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
6754
|
+
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
6127
6755
|
}
|
|
6128
6756
|
} else {
|
|
6129
6757
|
invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
|
|
@@ -6218,6 +6846,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
6218
6846
|
//#endregion
|
|
6219
6847
|
//#region src/assertions/index.ts
|
|
6220
6848
|
const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
6849
|
+
const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
|
|
6850
|
+
const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
|
|
6851
|
+
const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
|
|
6852
|
+
const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;
|
|
6853
|
+
const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5e3;
|
|
6854
|
+
const MAX_TRACE_FETCH_STABLE_POLLS = 10;
|
|
6221
6855
|
const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
6222
6856
|
"answer-relevance",
|
|
6223
6857
|
"context-faithfulness",
|
|
@@ -6227,8 +6861,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
|
6227
6861
|
"llm-rubric",
|
|
6228
6862
|
"model-graded-closedqa",
|
|
6229
6863
|
"model-graded-factuality",
|
|
6230
|
-
"search-rubric"
|
|
6864
|
+
"search-rubric",
|
|
6865
|
+
"trajectory:goal-success"
|
|
6231
6866
|
]);
|
|
6867
|
+
const TRACE_AWARE_ASSERTION_TYPES = new Set([
|
|
6868
|
+
"javascript",
|
|
6869
|
+
"python",
|
|
6870
|
+
"ruby",
|
|
6871
|
+
"trace-error-spans",
|
|
6872
|
+
"trace-span-count",
|
|
6873
|
+
"trace-span-duration",
|
|
6874
|
+
"trajectory:goal-success",
|
|
6875
|
+
"trajectory:step-count",
|
|
6876
|
+
"trajectory:tool-args-match",
|
|
6877
|
+
"trajectory:tool-sequence",
|
|
6878
|
+
"trajectory:tool-used"
|
|
6879
|
+
]);
|
|
6880
|
+
function assertionUsesTrace(assertion) {
|
|
6881
|
+
if (assertion.type === "assert-set") return assertion.assert.some(assertionUsesTrace);
|
|
6882
|
+
return TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
|
|
6883
|
+
}
|
|
6884
|
+
function assertionMayNeedTraceContext(assertion) {
|
|
6885
|
+
if (assertionUsesTrace(assertion)) return true;
|
|
6886
|
+
if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
|
|
6887
|
+
return typeof assertion.value === "string" ? assertion.value.startsWith("file://") || isPackagePath(assertion.value) : false;
|
|
6888
|
+
}
|
|
6889
|
+
function hasTraceAwareAssertions(assertions) {
|
|
6890
|
+
return Boolean(assertions?.some(assertionMayNeedTraceContext));
|
|
6891
|
+
}
|
|
6892
|
+
async function loadTraceData(traceId) {
|
|
6893
|
+
const traceStore = getTraceStore();
|
|
6894
|
+
const maxAttempts = Math.min(MAX_TRACE_FETCH_MAX_ATTEMPTS, Math.max(1, getEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS)));
|
|
6895
|
+
const retryDelayMs = Math.min(MAX_TRACE_FETCH_RETRY_DELAY_MS, Math.max(0, getEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS)));
|
|
6896
|
+
const stablePolls = Math.min(MAX_TRACE_FETCH_STABLE_POLLS, Math.max(1, getEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS)));
|
|
6897
|
+
let lastSpanCount = -1;
|
|
6898
|
+
let stableObservations = 0;
|
|
6899
|
+
let latestTrace = null;
|
|
6900
|
+
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
6901
|
+
latestTrace = await traceStore.getTrace(traceId);
|
|
6902
|
+
const spanCount = latestTrace?.spans?.length ?? 0;
|
|
6903
|
+
if (spanCount > 0) {
|
|
6904
|
+
stableObservations = spanCount === lastSpanCount ? stableObservations + 1 : 1;
|
|
6905
|
+
lastSpanCount = spanCount;
|
|
6906
|
+
if (stableObservations >= stablePolls || attempt === maxAttempts - 1) return latestTrace;
|
|
6907
|
+
} else {
|
|
6908
|
+
stableObservations = 0;
|
|
6909
|
+
lastSpanCount = spanCount;
|
|
6910
|
+
}
|
|
6911
|
+
if (attempt < maxAttempts - 1) await sleep(retryDelayMs);
|
|
6912
|
+
}
|
|
6913
|
+
return latestTrace;
|
|
6914
|
+
}
|
|
6232
6915
|
const ASSERTION_HANDLERS = {
|
|
6233
6916
|
"answer-relevance": handleAnswerRelevance,
|
|
6234
6917
|
bleu: handleBleuScore,
|
|
@@ -6291,12 +6974,18 @@ const ASSERTION_HANDLERS = {
|
|
|
6291
6974
|
ruby: handleRuby,
|
|
6292
6975
|
"rouge-n": handleRougeScore,
|
|
6293
6976
|
"search-rubric": handleSearchRubric,
|
|
6977
|
+
"skill-used": handleSkillUsed,
|
|
6294
6978
|
similar: handleSimilar,
|
|
6295
6979
|
"similar:cosine": handleSimilar,
|
|
6296
6980
|
"similar:dot": handleSimilar,
|
|
6297
6981
|
"similar:euclidean": handleSimilar,
|
|
6298
6982
|
"starts-with": handleStartsWith,
|
|
6299
6983
|
"tool-call-f1": handleToolCallF1,
|
|
6984
|
+
"trajectory:goal-success": handleTrajectoryGoalSuccess,
|
|
6985
|
+
"trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
|
|
6986
|
+
"trajectory:step-count": handleTrajectoryStepCount,
|
|
6987
|
+
"trajectory:tool-sequence": handleTrajectoryToolSequence,
|
|
6988
|
+
"trajectory:tool-used": handleTrajectoryToolUsed,
|
|
6300
6989
|
"trace-error-spans": handleTraceErrorSpans,
|
|
6301
6990
|
"trace-span-count": handleTraceSpanCount,
|
|
6302
6991
|
"trace-span-duration": handleTraceSpanDuration,
|
|
@@ -6339,7 +7028,7 @@ function isAssertionInverse(assertion) {
|
|
|
6339
7028
|
function getAssertionBaseType(assertion) {
|
|
6340
7029
|
return isAssertionInverse(assertion) ? assertion.type.slice(4) : assertion.type;
|
|
6341
7030
|
}
|
|
6342
|
-
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
|
|
7031
|
+
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
|
|
6343
7032
|
const resolvedVars = vars || test.vars || {};
|
|
6344
7033
|
const { cost, logProbs, output: originalOutput } = providerResponse;
|
|
6345
7034
|
let output = originalOutput;
|
|
@@ -6358,14 +7047,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
6358
7047
|
providerResponse,
|
|
6359
7048
|
...assertion.config ? { config: structuredClone(assertion.config) } : {}
|
|
6360
7049
|
};
|
|
6361
|
-
if (traceId) try {
|
|
6362
|
-
const
|
|
6363
|
-
if (
|
|
6364
|
-
traceId:
|
|
6365
|
-
evaluationId:
|
|
6366
|
-
testCaseId:
|
|
6367
|
-
metadata:
|
|
6368
|
-
spans:
|
|
7050
|
+
if (traceId && assertionMayNeedTraceContext(assertion)) try {
|
|
7051
|
+
const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
|
|
7052
|
+
if (resolvedTraceData) context.trace = {
|
|
7053
|
+
traceId: resolvedTraceData.traceId,
|
|
7054
|
+
evaluationId: resolvedTraceData.evaluationId,
|
|
7055
|
+
testCaseId: resolvedTraceData.testCaseId,
|
|
7056
|
+
metadata: resolvedTraceData.metadata,
|
|
7057
|
+
spans: resolvedTraceData.spans || []
|
|
6369
7058
|
};
|
|
6370
7059
|
} catch (error) {
|
|
6371
7060
|
logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
@@ -6398,7 +7087,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
6398
7087
|
};
|
|
6399
7088
|
}
|
|
6400
7089
|
else if (filePath.endsWith(".rb")) try {
|
|
6401
|
-
const { runRuby } = await import("../rubyUtils-
|
|
7090
|
+
const { runRuby } = await import("../rubyUtils-DECSbsfY.js").then((n) => n.t);
|
|
6402
7091
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
6403
7092
|
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
6404
7093
|
} catch (error) {
|
|
@@ -6507,6 +7196,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
6507
7196
|
index: i
|
|
6508
7197
|
};
|
|
6509
7198
|
}).flat();
|
|
7199
|
+
const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
|
|
7200
|
+
let preloadedTraceData;
|
|
7201
|
+
if (shouldPreloadTrace && traceId) try {
|
|
7202
|
+
preloadedTraceData = await loadTraceData(traceId);
|
|
7203
|
+
} catch (error) {
|
|
7204
|
+
logger.debug(`Failed to preload trace data for assertions: ${error}`);
|
|
7205
|
+
preloadedTraceData = null;
|
|
7206
|
+
}
|
|
6510
7207
|
await async.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
|
|
6511
7208
|
if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
|
|
6512
7209
|
const result = await runAssertion({
|
|
@@ -6518,7 +7215,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
6518
7215
|
vars,
|
|
6519
7216
|
latencyMs,
|
|
6520
7217
|
assertIndex: index,
|
|
6521
|
-
traceId
|
|
7218
|
+
traceId,
|
|
7219
|
+
traceData: preloadedTraceData
|
|
6522
7220
|
});
|
|
6523
7221
|
assertResult.addResult({
|
|
6524
7222
|
index,
|
|
@@ -6647,7 +7345,7 @@ var CIProgressReporter = class {
|
|
|
6647
7345
|
else {
|
|
6648
7346
|
const eta = remaining / rate;
|
|
6649
7347
|
if (eta > 1440) etaDisplay = ">24 hours";
|
|
6650
|
-
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta)
|
|
7348
|
+
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
|
|
6651
7349
|
}
|
|
6652
7350
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
6653
7351
|
logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
@@ -7048,12 +7746,55 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
7048
7746
|
var ProgressBarManager = class {
|
|
7049
7747
|
progressBar;
|
|
7050
7748
|
isWebUI;
|
|
7749
|
+
originalLogCallback = null;
|
|
7750
|
+
installedLogCallback = null;
|
|
7751
|
+
pendingRender = null;
|
|
7051
7752
|
totalCount = 0;
|
|
7052
7753
|
completedCount = 0;
|
|
7053
7754
|
concurrency = 1;
|
|
7054
7755
|
constructor(isWebUI) {
|
|
7055
7756
|
this.isWebUI = isWebUI;
|
|
7056
7757
|
}
|
|
7758
|
+
clearProgressBarLine() {
|
|
7759
|
+
readline.cursorTo(process.stderr, 0);
|
|
7760
|
+
readline.clearLine(process.stderr, 0);
|
|
7761
|
+
}
|
|
7762
|
+
scheduleRender() {
|
|
7763
|
+
if (!this.progressBar || this.pendingRender) return;
|
|
7764
|
+
this.pendingRender = setImmediate(() => {
|
|
7765
|
+
this.pendingRender = null;
|
|
7766
|
+
this.progressBar?.render();
|
|
7767
|
+
});
|
|
7768
|
+
}
|
|
7769
|
+
handleLogMessage() {
|
|
7770
|
+
if (!this.progressBar) return;
|
|
7771
|
+
this.clearProgressBarLine();
|
|
7772
|
+
this.scheduleRender();
|
|
7773
|
+
}
|
|
7774
|
+
/**
|
|
7775
|
+
* Coordinate console logging with the progress bar to prevent visual corruption.
|
|
7776
|
+
*/
|
|
7777
|
+
installLogInterceptor() {
|
|
7778
|
+
if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
|
|
7779
|
+
this.originalLogCallback = globalLogCallback;
|
|
7780
|
+
this.installedLogCallback = (message) => {
|
|
7781
|
+
this.originalLogCallback?.(message);
|
|
7782
|
+
this.handleLogMessage();
|
|
7783
|
+
};
|
|
7784
|
+
setLogCallback(this.installedLogCallback);
|
|
7785
|
+
}
|
|
7786
|
+
/**
|
|
7787
|
+
* Remove the log interceptor and restore original logger callback behavior.
|
|
7788
|
+
*/
|
|
7789
|
+
removeLogInterceptor() {
|
|
7790
|
+
if (this.pendingRender) {
|
|
7791
|
+
clearImmediate(this.pendingRender);
|
|
7792
|
+
this.pendingRender = null;
|
|
7793
|
+
}
|
|
7794
|
+
if (this.installedLogCallback && globalLogCallback === this.installedLogCallback) setLogCallback(this.originalLogCallback);
|
|
7795
|
+
this.installedLogCallback = null;
|
|
7796
|
+
this.originalLogCallback = null;
|
|
7797
|
+
}
|
|
7057
7798
|
/**
|
|
7058
7799
|
* Initialize progress bar
|
|
7059
7800
|
*/
|
|
@@ -7073,7 +7814,8 @@ var ProgressBarManager = class {
|
|
|
7073
7814
|
return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
|
|
7074
7815
|
},
|
|
7075
7816
|
hideCursor: true,
|
|
7076
|
-
gracefulExit: true
|
|
7817
|
+
gracefulExit: true,
|
|
7818
|
+
stream: process.stderr
|
|
7077
7819
|
}, cliProgress.Presets.shades_classic);
|
|
7078
7820
|
this.progressBar.start(this.totalCount, 0, {
|
|
7079
7821
|
provider: "",
|
|
@@ -7348,6 +8090,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
7348
8090
|
const parts = traceContext.traceparent.split("-");
|
|
7349
8091
|
if (parts.length >= 3) traceId = parts[1];
|
|
7350
8092
|
}
|
|
8093
|
+
if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
|
|
7351
8094
|
const checkResult = await runAssertions({
|
|
7352
8095
|
prompt: renderedPrompt,
|
|
7353
8096
|
provider,
|
|
@@ -7745,7 +8488,7 @@ var Evaluator = class {
|
|
|
7745
8488
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
7746
8489
|
if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
7747
8490
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
7748
|
-
const { loadApiProvider } = await import("../providers-
|
|
8491
|
+
const { loadApiProvider } = await import("../providers-DEYiFVAo.js");
|
|
7749
8492
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
7750
8493
|
} else testCase.provider = defaultProvider;
|
|
7751
8494
|
}
|
|
@@ -7829,7 +8572,7 @@ var Evaluator = class {
|
|
|
7829
8572
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
7830
8573
|
}
|
|
7831
8574
|
if (state.resume && this.evalRecord.persisted) try {
|
|
7832
|
-
const { default: EvalResult } = await import("../evalResult-
|
|
8575
|
+
const { default: EvalResult } = await import("../evalResult-CuvJeNiM.js");
|
|
7833
8576
|
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
|
|
7834
8577
|
const originalCount = runEvalOptions.length;
|
|
7835
8578
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -8029,7 +8772,7 @@ var Evaluator = class {
|
|
|
8029
8772
|
if (isCI() && !isWebUI) {
|
|
8030
8773
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
8031
8774
|
ciProgressReporter.start();
|
|
8032
|
-
} else if (this.options.showProgressBar && process.
|
|
8775
|
+
} else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
|
|
8033
8776
|
this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
|
|
8034
8777
|
if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
|
|
8035
8778
|
if (isWebUI) {
|
|
@@ -8050,7 +8793,10 @@ var Evaluator = class {
|
|
|
8050
8793
|
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
8051
8794
|
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
8052
8795
|
}
|
|
8053
|
-
if (this.options.showProgressBar && progressBarManager)
|
|
8796
|
+
if (this.options.showProgressBar && progressBarManager) {
|
|
8797
|
+
await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
8798
|
+
progressBarManager.installLogInterceptor();
|
|
8799
|
+
}
|
|
8054
8800
|
try {
|
|
8055
8801
|
if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
|
|
8056
8802
|
checkAbort();
|
|
@@ -8076,7 +8822,10 @@ var Evaluator = class {
|
|
|
8076
8822
|
else if (!targetUnavailable) {
|
|
8077
8823
|
logger.info("Evaluation interrupted, saving progress...");
|
|
8078
8824
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
8079
|
-
if (progressBarManager)
|
|
8825
|
+
if (progressBarManager) {
|
|
8826
|
+
progressBarManager.removeLogInterceptor();
|
|
8827
|
+
progressBarManager.stop();
|
|
8828
|
+
}
|
|
8080
8829
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
8081
8830
|
this.evalRecord.setVars(Array.from(vars));
|
|
8082
8831
|
await this.evalRecord.addPrompts(prompts);
|
|
@@ -8084,6 +8833,10 @@ var Evaluator = class {
|
|
|
8084
8833
|
return this.evalRecord;
|
|
8085
8834
|
}
|
|
8086
8835
|
} else {
|
|
8836
|
+
if (progressBarManager) {
|
|
8837
|
+
progressBarManager.removeLogInterceptor();
|
|
8838
|
+
progressBarManager.stop();
|
|
8839
|
+
}
|
|
8087
8840
|
if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
|
|
8088
8841
|
throw err;
|
|
8089
8842
|
}
|
|
@@ -8226,6 +8979,7 @@ var Evaluator = class {
|
|
|
8226
8979
|
await this.evalRecord.addPrompts(prompts);
|
|
8227
8980
|
try {
|
|
8228
8981
|
if (progressBarManager) {
|
|
8982
|
+
progressBarManager.removeLogInterceptor();
|
|
8229
8983
|
progressBarManager.complete();
|
|
8230
8984
|
progressBarManager.stop();
|
|
8231
8985
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -9030,8 +9784,7 @@ function testCaseFromCsvRow(row) {
|
|
|
9030
9784
|
logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
|
|
9031
9785
|
} else if (key.startsWith("__config:")) {
|
|
9032
9786
|
const configParts = key.slice(9).split(":");
|
|
9033
|
-
if (configParts.length
|
|
9034
|
-
else {
|
|
9787
|
+
if (configParts.length === 2) {
|
|
9035
9788
|
const [expectedKey, configKey] = configParts;
|
|
9036
9789
|
let targetIndex;
|
|
9037
9790
|
if (expectedKey === "__expected") targetIndex = 0;
|
|
@@ -9057,7 +9810,7 @@ function testCaseFromCsvRow(row) {
|
|
|
9057
9810
|
}
|
|
9058
9811
|
}
|
|
9059
9812
|
assertionConfigs[targetIndex][configKey] = parsedValue;
|
|
9060
|
-
}
|
|
9813
|
+
} else logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
|
|
9061
9814
|
} else vars[key] = value;
|
|
9062
9815
|
}
|
|
9063
9816
|
for (let i = 0; i < asserts.length; i++) {
|
|
@@ -9186,14 +9939,14 @@ async function parseXlsxFile(filePath) {
|
|
|
9186
9939
|
const sheetName = typeof sheetOption === "number" ? sheetNames[sheetOption - 1] : sheetOption;
|
|
9187
9940
|
const rows = await readXlsxFile(actualFilePath, { sheet: sheetOption });
|
|
9188
9941
|
if (rows.length === 0) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
|
|
9189
|
-
const headers = rows[0].map((cell) => cell
|
|
9942
|
+
const headers = rows[0].map((cell) => cell == null ? "" : String(cell));
|
|
9190
9943
|
if (headers.length === 0 || headers.every((h) => h === "")) throw new Error(`Sheet "${sheetName}" has no valid column headers`);
|
|
9191
9944
|
if (rows.length === 1) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
|
|
9192
9945
|
const data = rows.slice(1).map((row) => {
|
|
9193
9946
|
const obj = {};
|
|
9194
9947
|
headers.forEach((header, index) => {
|
|
9195
9948
|
const cellValue = row[index];
|
|
9196
|
-
obj[header] = cellValue
|
|
9949
|
+
obj[header] = cellValue == null ? "" : String(cellValue);
|
|
9197
9950
|
});
|
|
9198
9951
|
return obj;
|
|
9199
9952
|
});
|
|
@@ -12634,20 +13387,19 @@ function generateEvalSummary(params) {
|
|
|
12634
13387
|
}
|
|
12635
13388
|
}
|
|
12636
13389
|
lines.push("");
|
|
12637
|
-
const
|
|
12638
|
-
|
|
12639
|
-
|
|
12640
|
-
|
|
12641
|
-
|
|
12642
|
-
|
|
12643
|
-
|
|
12644
|
-
}
|
|
12645
|
-
const passedPart = successes > 0 ? `${chalk.green("✓")} ${chalk.green.bold(successes.toLocaleString())} passed` : `${chalk.gray.bold(successes.toLocaleString())} passed`;
|
|
12646
|
-
const failedPart = failures > 0 ? `${chalk.red("✗")} ${chalk.red.bold(failures.toLocaleString())} failed` : `${chalk.gray.bold(failures.toLocaleString())} failed`;
|
|
13390
|
+
const totalTests = successes + failures + errors;
|
|
13391
|
+
const formatResultPercentage = (count) => {
|
|
13392
|
+
const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
|
|
13393
|
+
return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
|
|
13394
|
+
};
|
|
13395
|
+
const formatResultLine = (count, label, icon, iconColor) => {
|
|
13396
|
+
return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count)})`)}`;
|
|
13397
|
+
};
|
|
12647
13398
|
const errorLabel = errors === 1 ? "error" : "errors";
|
|
12648
|
-
|
|
12649
|
-
|
|
12650
|
-
|
|
13399
|
+
lines.push(chalk.bold("Results:"));
|
|
13400
|
+
lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green));
|
|
13401
|
+
lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red));
|
|
13402
|
+
lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red));
|
|
12651
13403
|
const durationDisplay = formatDuration(duration);
|
|
12652
13404
|
lines.push(chalk.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
|
|
12653
13405
|
lines.push("");
|
|
@@ -12808,8 +13560,8 @@ var ModelAudit = class ModelAudit {
|
|
|
12808
13560
|
this.issues = data.issues || data.results?.issues || null;
|
|
12809
13561
|
const issues = data.issues || data.results?.issues;
|
|
12810
13562
|
const resultsHasErrors = data.results?.has_errors ?? false;
|
|
12811
|
-
if (data.hasErrors
|
|
12812
|
-
else this.hasErrors =
|
|
13563
|
+
if (data.hasErrors === void 0) this.hasErrors = resultsHasErrors || issues && issues.some((issue) => issue.severity === "critical" || issue.severity === "error") || false;
|
|
13564
|
+
else this.hasErrors = data.hasErrors;
|
|
12813
13565
|
this.totalChecks = data.totalChecks;
|
|
12814
13566
|
this.passedChecks = data.passedChecks;
|
|
12815
13567
|
this.failedChecks = data.failedChecks;
|
|
@@ -13224,7 +13976,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13224
13976
|
await checkCloudPermissions(config);
|
|
13225
13977
|
const options = {
|
|
13226
13978
|
...evaluateOptions,
|
|
13227
|
-
showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar
|
|
13979
|
+
showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
|
|
13228
13980
|
repeat,
|
|
13229
13981
|
delay: !Number.isNaN(delay) && delay > 0 ? delay : void 0,
|
|
13230
13982
|
maxConcurrency,
|
|
@@ -13608,7 +14360,7 @@ async function doRedteamRun(options) {
|
|
|
13608
14360
|
redteamConfig = await doGenerateRedteam({
|
|
13609
14361
|
...passThroughOptions,
|
|
13610
14362
|
...options.liveRedteamConfig?.commandLineOptions || {},
|
|
13611
|
-
...maxConcurrency
|
|
14363
|
+
...maxConcurrency === void 0 ? {} : { maxConcurrency },
|
|
13612
14364
|
config: configPath,
|
|
13613
14365
|
output: redteamPath,
|
|
13614
14366
|
force: options.force,
|
|
@@ -14430,7 +15182,7 @@ evalRouter.post("/", async (req, res) => {
|
|
|
14430
15182
|
logger.debug("[POST /api/eval] Saving eval results (v4) to database");
|
|
14431
15183
|
const eval_ = await Eval.create(incEval.config, incEval.prompts || [], {
|
|
14432
15184
|
author: incEval.author,
|
|
14433
|
-
createdAt: incEval.createdAt
|
|
15185
|
+
createdAt: incEval.createdAt === void 0 ? void 0 : new Date(incEval.createdAt),
|
|
14434
15186
|
results: incEval.results,
|
|
14435
15187
|
vars: incEval.vars
|
|
14436
15188
|
});
|
|
@@ -17304,7 +18056,7 @@ router.get("/", async (_req, res) => {
|
|
|
17304
18056
|
};
|
|
17305
18057
|
} catch (error) {
|
|
17306
18058
|
logger.debug(`Failed to fetch latest version: ${error}`);
|
|
17307
|
-
latestVersion = versionCache.latestVersion ?? "0.121.
|
|
18059
|
+
latestVersion = versionCache.latestVersion ?? "0.121.3";
|
|
17308
18060
|
}
|
|
17309
18061
|
}
|
|
17310
18062
|
const selfHosted = getEnvBool("PROMPTFOO_SELF_HOSTED");
|
|
@@ -17313,7 +18065,7 @@ router.get("/", async (_req, res) => {
|
|
|
17313
18065
|
selfHosted,
|
|
17314
18066
|
isNpx
|
|
17315
18067
|
});
|
|
17316
|
-
const resolvedLatestVersion = latestVersion ?? "0.121.
|
|
18068
|
+
const resolvedLatestVersion = latestVersion ?? "0.121.3";
|
|
17317
18069
|
const response = {
|
|
17318
18070
|
currentVersion: VERSION,
|
|
17319
18071
|
latestVersion: resolvedLatestVersion,
|