promptfoo 0.121.1 → 0.121.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/src/{accounts-xrUGFA6n.js → accounts-B2XmGjty.js} +5 -5
- package/dist/src/{accounts-Bx-x3bmW.cjs → accounts-BPyfpSeU.cjs} +5 -5
- package/dist/src/{accounts-CMqkzrVf.js → accounts-CFLK3mnD.js} +6 -6
- package/dist/src/{accounts-BgNJDBE6.js → accounts-Xatc0RYb.js} +5 -5
- package/dist/src/{agentic-utils-BKIN5PKu.js → agentic-utils-36epdqwB.js} +3 -3
- package/dist/src/{cometapi-DkXrKi5z.js → agentic-utils-D8yXo5Lm.js} +4 -61
- package/dist/src/{cometapi-vY6aDZgo.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
- package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
- package/dist/src/{agents-C-dDThPK.js → agents-BBVJCIYr.js} +226 -13
- package/dist/src/{agents-CErsqg5U.cjs → agents-BBWxKSM0.cjs} +7 -7
- package/dist/src/{agents-Dy2YpZpa.js → agents-Bqgfdokm.js} +227 -14
- package/dist/src/{agents-B0f4HICh.cjs → agents-CAYbM7qD.cjs} +226 -13
- package/dist/src/{agents-CVIn-Utx.js → agents-CLQ-P15P.js} +7 -7
- package/dist/src/{agents-DeH4Gu94.js → agents-CgBniSlI.js} +8 -8
- package/dist/src/{agents-CXknwsFX.js → agents-DSSTV4bv.js} +226 -13
- package/dist/src/{agents-aF4-T121.js → agents-wg3ohknq.js} +7 -7
- package/dist/src/{aimlapi-tg0Gkcvr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
- package/dist/src/{aimlapi-BNfTBexL.js → aimlapi-BwGC1TtS.js} +13 -13
- package/dist/src/{aimlapi-BAGZDo5G.js → aimlapi-DaC3qZ-o.js} +14 -14
- package/dist/src/{aimlapi-DHRKlBEA.js → aimlapi-MgSLdvy7.js} +13 -13
- package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
- package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
- package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
- package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
- package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
- package/dist/src/app/index.html +4 -4
- package/dist/src/{audio-tf_NBjlC.js → audio-Bn44pQxv.js} +4 -4
- package/dist/src/{audio-CHQ4r-RV.js → audio-DDA5WHdx.js} +4 -4
- package/dist/src/{audio-BWeaWovU.cjs → audio-DVFjQ67_.cjs} +4 -4
- package/dist/src/{audio-BRODU0UK.js → audio-DjU9GswO.js} +5 -5
- package/dist/src/{base-DBtwl2FR.cjs → base-BboXIF_0.cjs} +3 -3
- package/dist/src/{base-B4QJRyFS.js → base-CKjwebIH.js} +3 -3
- package/dist/src/{base-B0tcrnq_.js → base-CqzQ4K8j.js} +3 -3
- package/dist/src/{base-fEDN28WM.js → base-Cz2ZC_iA.js} +3 -3
- package/dist/src/{blobs-BAU-dXan.js → blobs-B1JriOyi.js} +3 -3
- package/dist/src/{blobs-qTYm-1PY.js → blobs-BUWmKWzo.js} +3 -3
- package/dist/src/{blobs-DvS-O6be.cjs → blobs-C6j0bvFz.cjs} +3 -3
- package/dist/src/{blobs-Bpg5rH6i.js → blobs-DXTl6J3H.js} +3 -3
- package/dist/src/{cache-COish3-W.cjs → cache-C5yFZ4gC.cjs} +75 -58
- package/dist/src/{cache-8XhNqPKW.js → cache-CaT5tPgo.js} +75 -58
- package/dist/src/cache-CyCanoMu.js +6 -0
- package/dist/src/{cache-CG0SlR1d.js → cache-DSqR6ezl.js} +75 -58
- package/dist/src/cache-Df_QFDNu.cjs +5 -0
- package/dist/src/{cache-D3eqDYGU.js → cache-HP0NP4k3.js} +75 -58
- package/dist/src/{chat-DHMH-N64.js → chat-B-52XYI1.js} +12 -12
- package/dist/src/{chat-BKm79wib.js → chat-B0iaWhoh.js} +16 -14
- package/dist/src/{chat-DxysjBvt.js → chat-BE0qTA8e.js} +13 -13
- package/dist/src/{chat-CRWNNq73.js → chat-BEwdgGEg.js} +16 -14
- package/dist/src/{chat-2K608PeQ.cjs → chat-BtIKkLKx.cjs} +13 -13
- package/dist/src/{chat-DaqekjFr.cjs → chat-CM8qWR3_.cjs} +17 -15
- package/dist/src/{chat-CM_kyI8B.js → chat-DK1U-eZ-.js} +12 -12
- package/dist/src/{chat-CznLWr_D.js → chat-pxmiVpWe.js} +16 -14
- package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
- package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
- package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
- package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
- package/dist/src/{claude-agent-sdk-CJH22shf.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
- package/dist/src/{claude-agent-sdk-Dy5lT-Tx.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
- package/dist/src/{claude-agent-sdk-BLTu0WBO.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
- package/dist/src/{claude-agent-sdk-D6_k9FKA.js → claude-agent-sdk-HgbFioFw.js} +33 -15
- package/dist/src/cloud-DE3t1-ZI.js +4 -0
- package/dist/src/{cloud-Bc9526yV.js → cloud-z8KZpUoa.js} +3 -3
- package/dist/src/{cloudflare-ai-CWWJCRim.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
- package/dist/src/{cloudflare-ai-C9r2sRhw.js → cloudflare-ai-Bbp26N0L.js} +13 -13
- package/dist/src/{cloudflare-ai-ClWSdor4.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
- package/dist/src/{cloudflare-ai-ICsOuD-z.js → cloudflare-ai-DdKP9TKT.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6xFc5pa.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6O7AlYb.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
- package/dist/src/{cloudflare-gateway-pXGHxJ47.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
- package/dist/src/{cloudflare-gateway-C2_-KG5o.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
- package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
- package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
- package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
- package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
- package/dist/src/cometapi-BDyV-NNm.js +62 -0
- package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
- package/dist/src/{cometapi-Bbjp5V4x.js → cometapi-hhL4TAh3.js} +14 -14
- package/dist/src/{cometapi-BasUi7-_.js → cometapi-sp7sJpBD.js} +15 -15
- package/dist/src/{completion-C_P3ypkJ.js → completion-BCimtq-h.js} +6 -6
- package/dist/src/{completion-6Mx_iXxK.js → completion-DCjv7RZ3.js} +6 -6
- package/dist/src/{completion-CDOouNzq.cjs → completion-DlXUhj5c.cjs} +6 -6
- package/dist/src/{completion-C5rtR_9P.js → completion-DoYy49ti.js} +6 -6
- package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
- package/dist/src/{docker-BwsKwxFs.cjs → docker-Cqj2-QVi.cjs} +14 -14
- package/dist/src/{docker-CZnqU1XV.js → docker-CxCkwMzc.js} +13 -13
- package/dist/src/{docker-DzxyDPIj.js → docker-DpguQj-w.js} +14 -14
- package/dist/src/{docker-5KcG-_86.js → docker-FeBni2dw.js} +13 -13
- package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
- package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
- package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
- package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
- package/dist/src/eval-7aEqoMs3.js +15 -0
- package/dist/src/{eval-DmFyWU7i.js → eval-BTqTn7lb.js} +10 -10
- package/dist/src/{evalResult-CDQiuUuf.js → evalResult-BkIhRdTe.js} +7 -7
- package/dist/src/evalResult-CYNHkk5A.js +12 -0
- package/dist/src/evalResult-CuvJeNiM.js +10 -0
- package/dist/src/{evalResult-CTG2AHOS.js → evalResult-DUDShQrm.js} +7 -7
- package/dist/src/{evalResult-Dap2CekP.cjs → evalResult-DpARzUCb.cjs} +7 -7
- package/dist/src/evalResult-tGdilrWt.cjs +10 -0
- package/dist/src/evaluator-BBUqRhz1.js +36 -0
- package/dist/src/{evaluator-DPFRbFIL.js → evaluator-BcvOGaam.js} +833 -79
- package/dist/src/{extractor-YMU_Gvt8.js → extractor-C8XwivI9.js} +6 -6
- package/dist/src/{extractor-CFG6bcWJ.js → extractor-CAZ2G3Kh.js} +6 -6
- package/dist/src/{extractor-DX36oYEv.cjs → extractor-DG3sSfXE.cjs} +6 -6
- package/dist/src/{extractor-M67RUtg6.js → extractor-D_wd8jxt.js} +6 -6
- package/dist/src/{fetch-4M3YRaqL.js → fetch-BiYv2BZc.js} +3 -3
- package/dist/src/{fetch-BxUk8odA.cjs → fetch-BnR9wSnm.cjs} +3 -3
- package/dist/src/{fetch-60Gzydls.js → fetch-CVAtKnI3.js} +3 -3
- package/dist/src/{fetch-BMv0O527.js → fetch-DoVRJZhJ.js} +4 -4
- package/dist/src/fetch-UWU706qb.js +5 -0
- package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
- package/dist/src/{graders-DOXycdlG.cjs → graders-BElhu9ZY.cjs} +126 -55
- package/dist/src/{graders-R9rYUM0d.js → graders-BXAJ0sbS.js} +120 -55
- package/dist/src/graders-BxfEguVY.js +32 -0
- package/dist/src/graders-CzVMbEnv.js +34 -0
- package/dist/src/{graders-CpdqD9PI.js → graders-DG7mhg-b.js} +120 -55
- package/dist/src/graders-DjCXfj0l.cjs +32 -0
- package/dist/src/{graders-CHO8EPM4.js → graders-RjHF8VfG.js} +120 -55
- package/dist/src/graders-kHzIWOKu.js +32 -0
- package/dist/src/{image-DTedmQPg.cjs → image--F58eEIn.cjs} +6 -6
- package/dist/src/{image-DJEvKveK.js → image-6WQXK8m8.js} +4 -4
- package/dist/src/{image-pAX56tPG.js → image-B8b6f36E.js} +6 -6
- package/dist/src/{image-BmEZqVmk.js → image-CoxZp9PZ.js} +6 -6
- package/dist/src/{image-gvmivTEe.js → image-DO0RYnjH.js} +5 -5
- package/dist/src/{image-CBBVXWuT.js → image-PoF6DN3x.js} +6 -6
- package/dist/src/{image-CDLQOcqT.cjs → image-fza3zuKs.cjs} +4 -4
- package/dist/src/{image-tL5hIOFh.js → image-xNbw5ph2.js} +4 -4
- package/dist/src/index.cjs +863 -110
- package/dist/src/index.d.cts +833 -60
- package/dist/src/index.d.ts +833 -60
- package/dist/src/index.js +860 -108
- package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
- package/dist/src/{knowledgeBase-CoU-UQBg.js → knowledgeBase-Bi7CmDbx.js} +7 -7
- package/dist/src/{knowledgeBase-CLJybhnF.js → knowledgeBase-Ce3ofVan.js} +8 -8
- package/dist/src/{knowledgeBase-DjWPVqSb.js → knowledgeBase-DFRXPZl_.js} +7 -7
- package/dist/src/{knowledgeBase-wkxuRFhA.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
- package/dist/src/{litellm-B9Hysuri.js → litellm-Bo2gQXpo.js} +16 -15
- package/dist/src/{litellm-ePxtr9F1.js → litellm-CKiAxnoM.js} +15 -14
- package/dist/src/{litellm-NYpQ8RQu.cjs → litellm-CnHI69aj.cjs} +16 -15
- package/dist/src/{litellm-CTfa0hqi.js → litellm-Tc294Jhj.js} +15 -14
- package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
- package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
- package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
- package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
- package/dist/src/{luma-ray-BW9IRGIc.js → luma-ray-0ehMPt5N.js} +10 -10
- package/dist/src/{luma-ray-BE2mOt6N.js → luma-ray-C9q8rdQe.js} +9 -9
- package/dist/src/{luma-ray-Cm1KZBhs.js → luma-ray-DP0QA9qn.js} +9 -9
- package/dist/src/{luma-ray-B0GGNRc1.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
- package/dist/src/main.js +69 -71
- package/dist/src/{messages-1x9atZmP.js → messages-DJNo37Ko.js} +14 -9
- package/dist/src/{messages-BLbWdsyt.js → messages-Dy9QecMs.js} +14 -9
- package/dist/src/{messages-1JrJs91T.cjs → messages-HJsyEh4o.cjs} +15 -10
- package/dist/src/{messages-D8EA0oDc.js → messages-biC_ex-p.js} +14 -9
- package/dist/src/{modelslab-C1OLRmVX.js → modelslab-B5J-ZM5c.js} +9 -9
- package/dist/src/{modelslab-CqXBy3U8.js → modelslab-BI458moT.js} +10 -10
- package/dist/src/{modelslab-X5-4LroM.js → modelslab-BTOT8FUO.js} +9 -9
- package/dist/src/{modelslab-DcOSFwKh.cjs → modelslab-IQbNg-r7.cjs} +9 -9
- package/dist/src/{nova-reel-DihqLeol.js → nova-reel-BZ9y-Y5s.js} +9 -9
- package/dist/src/{nova-reel-D9xfaMBs.cjs → nova-reel-CE5etkv9.cjs} +9 -9
- package/dist/src/{nova-reel-D2ZkOSyr.js → nova-reel-DEeQlnOJ.js} +10 -10
- package/dist/src/{nova-reel-BgS1ZWuK.js → nova-reel-Xw1SXLpg.js} +9 -9
- package/dist/src/{nova-sonic-Q3BOJeig.js → nova-sonic-DWswpN1E.js} +7 -7
- package/dist/src/{nova-sonic-DezhVUYT.js → nova-sonic-DXTLpi-r.js} +6 -6
- package/dist/src/{nova-sonic-DVu3mMIy.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
- package/dist/src/{nova-sonic-P-CdUMlV.js → nova-sonic-Ogqf-csn.js} +6 -6
- package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
- package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
- package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
- package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
- package/dist/src/{openclaw-Bv1DINsX.js → openclaw-0Sv7AK3O.js} +172 -109
- package/dist/src/{openclaw-DAfWQn-o.cjs → openclaw-CXxbKgDH.cjs} +174 -110
- package/dist/src/{openclaw-BiSZPL7J.js → openclaw-D1FSCps-.js} +172 -109
- package/dist/src/{openclaw-D1D_ej1z.js → openclaw-D2ENvu7a.js} +173 -110
- package/dist/src/{opencode-sdk-D95s6SnR.js → opencode-sdk-C71Z0ehR.js} +13 -13
- package/dist/src/{opencode-sdk-DxUPkLT7.js → opencode-sdk-CHCs7dEb.js} +12 -12
- package/dist/src/{opencode-sdk-C7m-wRfI.js → opencode-sdk-DDxj4QqH.js} +12 -12
- package/dist/src/{opencode-sdk-CfaLN8PY.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
- package/dist/src/{otlpReceiver-g3ByGaXs.js → otlpReceiver-C9KlUtxh.js} +6 -6
- package/dist/src/{otlpReceiver--AIRW_S4.js → otlpReceiver-CZL48YfC.js} +6 -6
- package/dist/src/{otlpReceiver-Bn5wGB1v.js → otlpReceiver-CavGAA6k.js} +6 -6
- package/dist/src/{otlpReceiver-Diec4cln.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
- package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
- package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
- package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
- package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
- package/dist/src/providers-BSLEaIQG.js +32 -0
- package/dist/src/{providers-CFu-TZl-.cjs → providers-CScd1wN6.cjs} +733 -464
- package/dist/src/{providers-CFLy1_ji.js → providers-Ch6Mr0gn.js} +795 -526
- package/dist/src/{providers-BKRJTjBz.js → providers-Cn73d5sr.js} +795 -526
- package/dist/src/providers-D-FnDg8k.cjs +31 -0
- package/dist/src/providers-DEYiFVAo.js +30 -0
- package/dist/src/{providers-B3HvufyI.js → providers-DvddrgxL.js} +795 -526
- package/dist/src/providers-sS2WI8YD.js +30 -0
- package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
- package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
- package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
- package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
- package/dist/src/{quiverai-CI6gYJVI.js → quiverai-BeofbLVc.js} +4 -4
- package/dist/src/{quiverai-MHSxbmmZ.js → quiverai-CCQn73lq.js} +5 -5
- package/dist/src/{quiverai-CLkWkyZc.cjs → quiverai-CcUhPIBg.cjs} +4 -4
- package/dist/src/{quiverai-C2jVwbH1.js → quiverai-DVSEqJiq.js} +4 -4
- package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
- package/dist/src/{responses-CGw0DCzh.js → responses-BKP_WYis.js} +16 -12
- package/dist/src/{responses-BKqJmhhc.js → responses-CQb1Tj69.js} +16 -12
- package/dist/src/{responses-jxdehPkC.js → responses-CgNyTPsY.js} +16 -12
- package/dist/src/{responses-tD4Bd4dc.cjs → responses-mo0KQDbu.cjs} +16 -12
- package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
- package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
- package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
- package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
- package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
- package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
- package/dist/src/{sagemaker-BK4Zb993.js → sagemaker-CVv8W7so.js} +17 -17
- package/dist/src/{sagemaker-D2Q1c-sD.js → sagemaker-CqeASYE5.js} +17 -17
- package/dist/src/{sagemaker-BfiWTmvn.js → sagemaker-MUbD5V3v.js} +18 -18
- package/dist/src/{sagemaker-CcQHM1jV.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
- package/dist/src/{scanner-J8CA3LsV.js → scanner-DVDeUz1r.js} +10 -10
- package/dist/src/server/index.js +864 -112
- package/dist/src/server-B0Xh1Gx-.js +7 -0
- package/dist/src/{server-B0PPuDw-.cjs → server-BtoCXeXI.cjs} +4 -4
- package/dist/src/{server-BC7XJFgr.js → server-CP9qKM40.js} +4 -4
- package/dist/src/{server-OAs3nBRT.js → server-Cns05F1j.js} +5 -5
- package/dist/src/server-DJTKu9IR.cjs +5 -0
- package/dist/src/{server-DbFphssR.js → server-DZ9MtCn0.js} +6 -6
- package/dist/src/{signal-BOTbd53Z.js → signal-C3ZTsUgi.js} +3 -3
- package/dist/src/{slack-DXMKtA-f.js → slack-2sdpGzbt.js} +2 -2
- package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
- package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
- package/dist/src/{slack-DOdy_kyv.js → slack-DCEV-vWP.js} +2 -2
- package/dist/src/store-C5u6MgC8.js +6 -0
- package/dist/src/{store-BSc-TF2w.cjs → store-CLyU7AtI.cjs} +17 -5
- package/dist/src/store-CNHk-De4.cjs +5 -0
- package/dist/src/{store-DQLEjuEO.js → store-Cj258DgL.js} +17 -5
- package/dist/src/{store-D1tv90v3.js → store-P8OKm19S.js} +17 -5
- package/dist/src/{store-Ub2vaGJ1.js → store-VB0GP46K.js} +17 -5
- package/dist/src/{tables-xKANLRBD.js → tables-BEIFz2tM.js} +3 -3
- package/dist/src/{tables-C7K-XKWp.cjs → tables-BdZQEpRz.cjs} +3 -3
- package/dist/src/{tables-D36WTqKX.js → tables-DmzvLbeZ.js} +3 -3
- package/dist/src/{tables-5EvT_Bwn.js → tables-kC7R5kiK.js} +3 -3
- package/dist/src/{telemetry-C2YDkUQH.js → telemetry-BnH5VJAU.js} +4 -4
- package/dist/src/{telemetry-C15ziL8u.js → telemetry-BugWqKiu.js} +4 -4
- package/dist/src/{telemetry-DMb2Mpfm.js → telemetry-DPXLd7UE.js} +4 -4
- package/dist/src/telemetry-Yig0Tino.js +7 -0
- package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
- package/dist/src/{telemetry-CbrnxHp_.cjs → telemetry-re627Lre.cjs} +4 -4
- package/dist/src/{transcription-CL78qbOU.cjs → transcription-BvtsrzRG.cjs} +13 -13
- package/dist/src/{transcription-DAtxHhAM.js → transcription-CaMivnjG.js} +13 -13
- package/dist/src/{transcription-QHh3AH6Z.js → transcription-DOMMTu01.js} +14 -14
- package/dist/src/{transcription-LNZTNUUL.js → transcription-Hb3VnC4M.js} +13 -13
- package/dist/src/{transform-DOcQeLld.cjs → transform-0BwoBsvO.cjs} +19 -5
- package/dist/src/{transform-DGxXocjk.js → transform-B2-jIv68.js} +8 -6
- package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
- package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
- package/dist/src/transform-ChNIpHz7.js +6 -0
- package/dist/src/{transform-Cgi24fJ7.js → transform-DrleutM3.js} +8 -6
- package/dist/src/{transform-DGLazrMm.js → transform-DyDAwEpE.js} +8 -6
- package/dist/src/transform-PtQ6rAE3.cjs +5 -0
- package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
- package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
- package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
- package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
- package/dist/src/{types-CH3Ge2sE.js → types-CIhFeUC4.js} +45 -11
- package/dist/src/{types-CN_TZ2GJ.js → types-Cd3ygw8W.js} +45 -11
- package/dist/src/{types-LJ0r3wbR.cjs → types-D8cGDZbL.cjs} +46 -12
- package/dist/src/{types-CLKiCBW3.js → types-q8GXGF65.js} +45 -11
- package/dist/src/{util-CchiqXh_.cjs → util--9u9UVCt.cjs} +3 -3
- package/dist/src/{util-5cB-L7U3.js → util-BLvy9qfE.js} +7 -11
- package/dist/src/{util-YT5HPZaS.js → util-Bm3E9jpK.js} +7 -11
- package/dist/src/{util-6-GqIvzS.js → util-BtoGs5Cb.js} +18 -4
- package/dist/src/{util-Db0a0AFH.cjs → util-CFj4YKIn.cjs} +18 -4
- package/dist/src/{util-Dlz_Wvgm.js → util-CMMkIxfU.js} +7 -11
- package/dist/src/{util-Betm42rL.js → util-CgDCK4KI.js} +18 -4
- package/dist/src/{util-Yz-1aEhW.cjs → util-CuLo2pMR.cjs} +7 -11
- package/dist/src/{util-C-PPYSMq.js → util-DM2rTn_6.js} +18 -4
- package/dist/src/{util-B7T3SiBS.js → util-DMFeUvLz.js} +3 -3
- package/dist/src/{util-ZZH-3QZz.js → util-DbVG-yZU.js} +3 -3
- package/dist/src/{util-DaWTWKBK.js → util-vNmDL5DT.js} +3 -3
- package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
- package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
- package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
- package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -38
- package/dist/src/app/assets/index-BFCZg7hQ.js +0 -439
- package/dist/src/app/assets/index-NCn4eVBv.css +0 -1
- package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
- package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
- package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
- package/dist/src/cache-Bbn1Nyrd.cjs +0 -5
- package/dist/src/cache-BwsMSda7.js +0 -6
- package/dist/src/cloud-DmE0EwsY.js +0 -4
- package/dist/src/eval-17JizQIv.js +0 -15
- package/dist/src/evalResult-Cqj8pldJ.js +0 -12
- package/dist/src/evalResult-DvcJAWJU.cjs +0 -10
- package/dist/src/evalResult-Hftn-S_i.js +0 -10
- package/dist/src/evaluator-B2CFNt-P.js +0 -36
- package/dist/src/fetch-KV5kNASw.js +0 -5
- package/dist/src/graders-Bu0H9nXi.js +0 -32
- package/dist/src/graders-Cfhkvx-e.js +0 -34
- package/dist/src/graders-DClJVpGP.cjs +0 -32
- package/dist/src/graders-DcnJsrMO.js +0 -32
- package/dist/src/providers-C1rOSHiR.js +0 -32
- package/dist/src/providers-CxmDwEFf.cjs +0 -31
- package/dist/src/providers-Dodakqr0.js +0 -30
- package/dist/src/providers-GIQ2TcsA.js +0 -30
- package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
- package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
- package/dist/src/server-B1vi21hA.js +0 -7
- package/dist/src/server-Cm9Kai_h.cjs +0 -5
- package/dist/src/store-BNmZ1KAz.cjs +0 -5
- package/dist/src/store-BltJg2cd.js +0 -6
- package/dist/src/telemetry-5BCRNBbe.cjs +0 -5
- package/dist/src/telemetry-D4W5hboe.js +0 -7
- package/dist/src/transform-DTGDnAzW.js +0 -6
- package/dist/src/transform-m3qNw4KP.cjs +0 -5
|
@@ -1,34 +1,35 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { C as
|
|
3
|
-
import { N as VERSION, P as FILE_METADATA_KEY, g as isPromptfooSampleTarget, l as sleep, r as fetchWithRetries, y as parseChatPrompt } from "./fetch-
|
|
2
|
+
import { C as getEnvBool, D as getEvalTimeoutMs, E as getEnvString, O as getMaxEvalTimeMs, T as getEnvInt, b as summarizeEvaluateResultForLogging, c as setLogCallback, g as getAjv, h as extractJsonObjects, j as state, k as isCI, r as globalLogCallback, s as logger, y as safeJsonStringify } from "./logger-BcJBzSSA.js";
|
|
3
|
+
import { N as VERSION, P as FILE_METADATA_KEY, g as isPromptfooSampleTarget, l as sleep, r as fetchWithRetries, y as parseChatPrompt } from "./fetch-DoVRJZhJ.js";
|
|
4
4
|
import { t as invariant } from "./invariant-BtWWVVhl.js";
|
|
5
|
-
import { r as telemetry } from "./telemetry-
|
|
6
|
-
import { d as isGradingResult, p as isApiProvider, s as ResultFailureReason } from "./types-
|
|
7
|
-
import { c as promptYesNo } from "./server-
|
|
8
|
-
import { A as renderPrompt, E as isBasicRefusal, F as TokenUsageTracker, G as VertexChatProvider, I as createRateLimitRegistry, K as AIStudioChatProvider, L as createProviderRateLimitOptions, M as isPackagePath, N as loadFromPackage, P as redteamProviderManager, j as runExtensionHook, k as collectFileMetadata, u as GoogleLiveProvider, v as checkExfilTracking, w as getSessionId } from "./providers-
|
|
9
|
-
import { o as getCache } from "./cache-
|
|
5
|
+
import { r as telemetry } from "./telemetry-BnH5VJAU.js";
|
|
6
|
+
import { d as isGradingResult, p as isApiProvider, s as ResultFailureReason } from "./types-CIhFeUC4.js";
|
|
7
|
+
import { c as promptYesNo } from "./server-DZ9MtCn0.js";
|
|
8
|
+
import { A as renderPrompt, E as isBasicRefusal, F as TokenUsageTracker, G as VertexChatProvider, I as createRateLimitRegistry, K as AIStudioChatProvider, L as createProviderRateLimitOptions, M as isPackagePath, N as loadFromPackage, P as redteamProviderManager, j as runExtensionHook, k as collectFileMetadata, u as GoogleLiveProvider, v as checkExfilTracking, w as getSessionId } from "./providers-Ch6Mr0gn.js";
|
|
9
|
+
import { o as getCache } from "./cache-DSqR6ezl.js";
|
|
10
10
|
import { n as isNonTransientHttpStatus } from "./errors-P6ll7XSJ.js";
|
|
11
11
|
import { i as isJavascriptFile } from "./fileExtensions-Ds-foDzt.js";
|
|
12
|
-
import { E as parseFileUrl, I as isAnthropicProvider, L as isGoogleProvider, R as isOpenAiProvider, T as loadFunction, g as maybeLoadToolsFromExternalFile, w as getNunjucksEngine, z as isProviderAllowed } from "./util-
|
|
13
|
-
import { r as runPython } from "./pythonUtils-
|
|
14
|
-
import { n as transform, r as getProcessShim, t as TransformInputType } from "./transform-
|
|
15
|
-
import { $ as matchesSearchRubric, B as getAndCheckProvider, G as matchesContextFaithfulness, H as matchesAnswerRelevance, J as matchesFactuality, K as matchesContextRecall, Q as matchesPiScore, R as callProviderWithContext, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, at as
|
|
16
|
-
import { i as generateIdFromPrompt } from "./utils-
|
|
17
|
-
import { t as OpenAiChatCompletionProvider } from "./chat-
|
|
12
|
+
import { E as parseFileUrl, I as isAnthropicProvider, L as isGoogleProvider, R as isOpenAiProvider, T as loadFunction, g as maybeLoadToolsFromExternalFile, w as getNunjucksEngine, z as isProviderAllowed } from "./util-Bm3E9jpK.js";
|
|
13
|
+
import { r as runPython } from "./pythonUtils-wIqk7zAf.js";
|
|
14
|
+
import { n as transform, r as getProcessShim, t as TransformInputType } from "./transform-ljLYHEPh.js";
|
|
15
|
+
import { $ as matchesSearchRubric, B as getAndCheckProvider, G as matchesContextFaithfulness, H as matchesAnswerRelevance, J as matchesFactuality, K as matchesContextRecall, Q as matchesPiScore, R as callProviderWithContext, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, at as getDefaultProviders, dt as coerceString, et as matchesSelectBest, ft as getFinalTest, ht as resolveContext, mt as processFileReference, n as getGraderById, nt as matchesTrajectoryGoalSuccess, ot as DefaultSuggestionsProvider, pt as loadFromJavaScriptFile, q as matchesContextRelevance, rt as selectMaxScore, tt as matchesSimilarity, ut as SUGGEST_PROMPTS_SYSTEM_MESSAGE, z as fail } from "./graders-RjHF8VfG.js";
|
|
16
|
+
import { i as generateIdFromPrompt } from "./utils-DEuL4VNB.js";
|
|
17
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-BEwdgGEg.js";
|
|
18
18
|
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-DflFMjS0.js";
|
|
19
|
-
import { m as validateFunctionCall } from "./transform-
|
|
20
|
-
import { l as validateFunctionCall$1 } from "./util-
|
|
21
|
-
import { t as extractAndStoreBinaryData } from "./extractor-
|
|
22
|
-
import { n as getTraceStore } from "./store-
|
|
23
|
-
import { t as providerRegistry } from "./providerRegistry-
|
|
24
|
-
import { n as runRuby } from "./rubyUtils-
|
|
25
|
-
import { a as getActualPromptWithFallback, r as updateSignalFile } from "./signal-
|
|
19
|
+
import { m as validateFunctionCall } from "./transform-DrleutM3.js";
|
|
20
|
+
import { l as validateFunctionCall$1 } from "./util-DM2rTn_6.js";
|
|
21
|
+
import { t as extractAndStoreBinaryData } from "./extractor-CAZ2G3Kh.js";
|
|
22
|
+
import { n as getTraceStore } from "./store-P8OKm19S.js";
|
|
23
|
+
import { t as providerRegistry } from "./providerRegistry-B9lh-_tx.js";
|
|
24
|
+
import { n as runRuby } from "./rubyUtils-CiVfln3g.js";
|
|
25
|
+
import { a as getActualPromptWithFallback, r as updateSignalFile } from "./signal-C3ZTsUgi.js";
|
|
26
26
|
import chalk from "chalk";
|
|
27
27
|
import fs, { createWriteStream } from "fs";
|
|
28
28
|
import path from "path";
|
|
29
29
|
import os from "os";
|
|
30
30
|
import yaml from "js-yaml";
|
|
31
31
|
import util from "util";
|
|
32
|
+
import readline from "readline";
|
|
32
33
|
import { randomBytes } from "crypto";
|
|
33
34
|
import { globSync } from "glob";
|
|
34
35
|
import { XMLParser } from "fast-xml-parser";
|
|
@@ -38,6 +39,7 @@ import cliProgress from "cli-progress";
|
|
|
38
39
|
import { JSDOM } from "jsdom";
|
|
39
40
|
import { distance } from "fastest-levenshtein";
|
|
40
41
|
import * as rouge from "js-rouge";
|
|
42
|
+
import { isDeepStrictEqual } from "node:util";
|
|
41
43
|
import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
|
|
42
44
|
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
43
45
|
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
@@ -256,7 +258,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
256
258
|
telemetry.record("feature_used", { feature: "tracing" });
|
|
257
259
|
try {
|
|
258
260
|
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
259
|
-
const { startOTLPReceiver } = await import("./otlpReceiver-
|
|
261
|
+
const { startOTLPReceiver } = await import("./otlpReceiver-CavGAA6k.js");
|
|
260
262
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
261
263
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
262
264
|
logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
@@ -279,7 +281,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
279
281
|
async function stopOtlpReceiverIfNeeded() {
|
|
280
282
|
if (otlpReceiverStarted) try {
|
|
281
283
|
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
282
|
-
const { stopOTLPReceiver } = await import("./otlpReceiver-
|
|
284
|
+
const { stopOTLPReceiver } = await import("./otlpReceiver-CavGAA6k.js");
|
|
283
285
|
await stopOTLPReceiver();
|
|
284
286
|
otlpReceiverStarted = false;
|
|
285
287
|
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -314,7 +316,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
314
316
|
}
|
|
315
317
|
if (!tracingEnabled) return null;
|
|
316
318
|
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
317
|
-
const { getTraceStore } = await import("./store-
|
|
319
|
+
const { getTraceStore } = await import("./store-C5u6MgC8.js");
|
|
318
320
|
const traceStore = getTraceStore();
|
|
319
321
|
const traceId = generateTraceId();
|
|
320
322
|
const spanId = generateSpanId();
|
|
@@ -1347,7 +1349,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
|
|
|
1347
1349
|
pass = result !== inverse;
|
|
1348
1350
|
score = pass ? 1 : 0;
|
|
1349
1351
|
} else if (typeof result === "number") {
|
|
1350
|
-
pass = assertion.threshold
|
|
1352
|
+
pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
|
|
1351
1353
|
score = result;
|
|
1352
1354
|
} else if (typeof result === "object") return result;
|
|
1353
1355
|
else throw new Error("Custom function must return a boolean or number");
|
|
@@ -1380,7 +1382,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1380
1382
|
} catch {
|
|
1381
1383
|
pass = inverse;
|
|
1382
1384
|
}
|
|
1383
|
-
if (
|
|
1385
|
+
if (parsedJson !== void 0 && renderedValue) {
|
|
1384
1386
|
let validate;
|
|
1385
1387
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
1386
1388
|
const schema = valueFromScript;
|
|
@@ -1392,11 +1394,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1392
1394
|
}
|
|
1393
1395
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1394
1396
|
else throw new Error("is-json assertion must have a string or object value");
|
|
1395
|
-
|
|
1397
|
+
const valid = validate(parsedJson);
|
|
1398
|
+
pass = inverse ? !valid : valid;
|
|
1396
1399
|
if (!pass) return {
|
|
1397
1400
|
pass,
|
|
1398
1401
|
score: 0,
|
|
1399
|
-
reason: `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1402
|
+
reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1400
1403
|
assertion
|
|
1401
1404
|
};
|
|
1402
1405
|
}
|
|
@@ -1423,9 +1426,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1423
1426
|
}
|
|
1424
1427
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1425
1428
|
else throw new Error("contains-json assertion must have a string or object value");
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
+
const valid = validate(jsonObject);
|
|
1430
|
+
pass = inverse ? !valid : valid;
|
|
1431
|
+
if (valid) {
|
|
1432
|
+
if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
|
|
1433
|
+
break;
|
|
1434
|
+
} else errorMessage = `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`;
|
|
1429
1435
|
}
|
|
1430
1436
|
return {
|
|
1431
1437
|
pass,
|
|
@@ -1567,7 +1573,7 @@ function handlePerplexity({ logProbs, assertion }) {
|
|
|
1567
1573
|
if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
|
|
1568
1574
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
1569
1575
|
const perplexity = Math.exp(-avgLogProb);
|
|
1570
|
-
const pass = assertion.threshold
|
|
1576
|
+
const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
|
|
1571
1577
|
return {
|
|
1572
1578
|
pass,
|
|
1573
1579
|
score: pass ? 1 : 0,
|
|
@@ -1579,7 +1585,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1579
1585
|
if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
|
|
1580
1586
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
1581
1587
|
const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
|
|
1582
|
-
const pass = assertion.threshold
|
|
1588
|
+
const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
|
|
1583
1589
|
return {
|
|
1584
1590
|
pass,
|
|
1585
1591
|
score: perplexityNorm,
|
|
@@ -1694,7 +1700,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1694
1700
|
} else {
|
|
1695
1701
|
score = Number.parseFloat(String(result));
|
|
1696
1702
|
if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
1697
|
-
pass = assertion.threshold
|
|
1703
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
1698
1704
|
}
|
|
1699
1705
|
} catch (err) {
|
|
1700
1706
|
return {
|
|
@@ -1955,7 +1961,7 @@ end
|
|
|
1955
1961
|
} else {
|
|
1956
1962
|
score = Number.parseFloat(String(result));
|
|
1957
1963
|
if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
1958
|
-
pass = assertion.threshold
|
|
1964
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
1959
1965
|
}
|
|
1960
1966
|
} catch (err) {
|
|
1961
1967
|
return {
|
|
@@ -2026,6 +2032,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2026
2032
|
};
|
|
2027
2033
|
};
|
|
2028
2034
|
//#endregion
|
|
2035
|
+
//#region src/assertions/traceUtils.ts
|
|
2036
|
+
/**
|
|
2037
|
+
* Shared utilities for trace assertions
|
|
2038
|
+
*/
|
|
2039
|
+
/**
 * Match a span name against a glob-like pattern.
 * Supports * (any characters) and ? (single character) wildcards.
 * Matching is case-insensitive and anchored to the whole string.
 *
 * @param spanName - The span name to match
 * @param pattern - The glob pattern to match against
 * @returns true if the span name matches the pattern
 */
function matchesPattern(spanName, pattern) {
  // Escape every regex metacharacter except the two glob wildcards, then
  // translate the wildcards into their regex equivalents.
  const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
  // The "s" (dotAll) flag lets the wildcards span line terminators, so values
  // containing newlines (e.g. multi-line commands) can still match a glob.
  return new RegExp(`^${regexPattern}$`, "is").test(spanName);
}
|
|
2051
|
+
//#endregion
|
|
2052
|
+
//#region src/assertions/skill.ts
|
|
2053
|
+
/**
 * Collect the well-formed skill-call records from a provider response.
 *
 * Reads `params.providerResponse.metadata.skillCalls` and keeps only entries
 * that are non-null objects with a string `name`.
 *
 * @param params - Assertion params; `providerResponse` may be missing.
 * @returns Array of valid skill-call entries (empty when none are present).
 */
function getSkillCalls(params) {
  const candidates = params.providerResponse?.metadata?.skillCalls;
  if (!Array.isArray(candidates)) return [];
  const valid = [];
  for (const candidate of candidates) {
    if (!candidate || typeof candidate !== "object") continue;
    if (typeof candidate.name === "string") valid.push(candidate);
  }
  return valid;
}
|
|
2058
|
+
/**
 * Decide whether a skill call satisfies a matcher.
 *
 * A matcher may carry an exact `name`, a glob `pattern`, both, or neither;
 * every constraint that is present must hold.
 *
 * @param skillCall - Skill-call record with a `name`.
 * @param matcher - Object with optional `name` and `pattern`.
 * @returns true when all present constraints match.
 */
function matchesSkill(skillCall, matcher) {
  const nameOk = !matcher.name || skillCall.name === matcher.name;
  if (!nameOk) return false;
  return !matcher.pattern || matchesPattern(skillCall.name, matcher.pattern);
}
|
|
2063
|
+
/**
 * Render a skill call for human-readable assertion messages.
 *
 * @param skillCall - Record with `name` and optional `source` / `path`.
 * @returns `name (source, path)` when any detail is present, else just `name`.
 */
function formatSkillCall(skillCall) {
  const detailParts = [];
  for (const part of [skillCall.source, skillCall.path]) {
    if (part) detailParts.push(part);
  }
  const details = detailParts.join(", ");
  if (!details) return skillCall.name;
  return `${skillCall.name} (${details})`;
}
|
|
2067
|
+
/**
 * Normalize a `skill-used` assertion value into one of two shapes.
 *
 * - A non-blank string or a non-empty array of non-blank strings becomes
 *   `{ kind: "list", matchers: [{ name }, ...] }` with trimmed names.
 * - A plain object becomes `{ kind: "count", matcher: { max, min, name, pattern } }`.
 *
 * @param value - Raw assertion value.
 * @returns The normalized list or count descriptor.
 * @throws Error when the value has an unsupported shape, when an object has
 *   neither `name` nor `pattern`, when `min`/`max` is present but not a
 *   non-negative integer, or when `max < min`.
 */
function resolveSkillMatchers(value) {
  const trimIfString = (text) => {
    if (typeof text !== "string") return void 0;
    return text.trim();
  };
  const assertCount = (field, count) => {
    const isValid = Number.isFinite(count) && Number.isInteger(count) && count >= 0;
    if (!isValid) throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
  };
  const isNonBlankString = (item) => typeof item === "string" && Boolean(item.trim());

  if (isNonBlankString(value)) {
    return {
      kind: "list",
      matchers: [{ name: trimIfString(value) }]
    };
  }

  if (Array.isArray(value)) {
    if (value.length > 0 && value.every(isNonBlankString)) {
      const matchers = [];
      for (const item of value) matchers.push({ name: item.trim() });
      return { kind: "list", matchers };
    }
  } else if (value && typeof value === "object") {
    const name = trimIfString(value.name);
    const pattern = trimIfString(value.pattern);
    if (!name && !pattern) throw new Error("skill-used assertion object must include a name or pattern property");
    // A bound is validated whenever its key is present, even if the value is undefined.
    if ("min" in value) assertCount("min", value.min);
    if ("max" in value) assertCount("max", value.max);
    const minIsNumber = typeof value.min === "number";
    const maxIsNumber = typeof value.max === "number";
    if (minIsNumber && maxIsNumber && value.max < value.min) throw new Error("skill-used assertion object max must be greater than or equal to min");
    return {
      kind: "count",
      matcher: {
        max: maxIsNumber ? value.max : void 0,
        min: minIsNumber ? value.min : void 0,
        name,
        pattern
      }
    };
  }

  throw new Error("skill-used assertion must have a string, string array, or object value");
}
|
|
2101
|
+
/**
 * Grade a list-style `skill-used` assertion.
 *
 * Normal mode passes when every expected skill was observed; inverse mode
 * passes when none of them was.
 *
 * @param params - Assertion params (`inverse`, `assertion`).
 * @param skillCalls - Observed skill-call records.
 * @param actualSkills - Pre-formatted labels for the observed skill calls.
 * @param expected - `{ kind: "list", matchers }` descriptor.
 * @returns Grading result `{ pass, score, reason, assertion }`.
 */
function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
  // Partition the expected matchers by whether any observed call satisfies them.
  const matched = [];
  const missing = [];
  for (const matcher of expected.matchers) {
    const wasObserved = skillCalls.some((skillCall) => matchesSkill(skillCall, matcher));
    (wasObserved ? matched : missing).push(matcher);
  }
  const namesOf = (matchers) => matchers.map((matcher) => matcher.name).join(", ");
  const pass = params.inverse ? matched.length === 0 : missing.length === 0;
  const expectedSummary = expected.matchers.map((matcher) => matcher.name).join(", ");
  const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";

  let reason;
  if (params.inverse) {
    if (pass) reason = `Forbidden skill(s) were not used: ${expectedSummary}`;
    else reason = `Forbidden skill(s) were used: ${namesOf(matched)}. Actual skills: ${actualSummary}`;
  } else if (pass) {
    reason = `Observed required skill(s): ${expectedSummary}. Actual skills: ${actualSummary}`;
  } else {
    reason = `Missing required skill(s): ${namesOf(missing)}. Actual skills: ${actualSummary}`;
  }

  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
}
|
|
2118
|
+
/**
 * Grade a count-style `skill-used` assertion (object value with name/pattern
 * and optional `min` / `max`).
 *
 * When `min` is omitted it defaults to 0 if `max` was given, otherwise to 1.
 * In inverse mode only "never used" is expressible: count bounds other than
 * `max: 0` are rejected.
 *
 * @param params - Assertion params (`inverse`, `assertion`).
 * @param skillCalls - Observed skill-call records.
 * @param actualSkills - Pre-formatted labels for the observed skill calls.
 * @param matcher - Normalized matcher `{ name, pattern, min, max }`.
 * @returns Grading result `{ pass, score, reason, assertion }`.
 * @throws Error in inverse mode when unsupported count bounds are supplied.
 */
function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
  const { max } = matcher;
  const minWasGiven = matcher.min !== void 0;
  const maxWasGiven = max !== void 0;
  const min = matcher.min ?? (maxWasGiven ? 0 : 1);
  const hits = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
  const count = hits.length;
  const label = matcher.pattern || matcher.name || "*";
  const describeHits = () => hits.map(formatSkillCall).join(", ");
  const finish = (pass, reason) => ({
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  });

  if (params.inverse) {
    const hasUnsupportedBounds = minWasGiven || (maxWasGiven && max !== 0);
    if (hasUnsupportedBounds) throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
    const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
    if (count === 0) return finish(true, `Forbidden skill "${label}" was not used. Actual skills: ${actualSummary}`);
    return finish(false, `Forbidden skill "${label}" was used ${count} time(s). Matches: ${describeHits()}`);
  }

  const withinMax = !maxWasGiven || count <= max;
  const expectation = maxWasGiven ? ` (expected ${min}-${max})` : ` (expected at least ${min})`;
  let reason = `Matched skill "${label}" ${count} time(s)${expectation}`;
  if (count > 0) reason += `. Matches: ${describeHits()}`;
  return finish(count >= min && withinMax, reason);
}
|
|
2148
|
+
/**
 * Entry point for the `skill-used` assertion.
 *
 * Resolves the expected value (rendered value first, raw assertion value as
 * fallback) and dispatches to the list or count grader.
 *
 * @param params - Assertion params.
 * @returns Grading result from the selected grader.
 */
function handleSkillUsed(params) {
  const observedCalls = getSkillCalls(params);
  const observedLabels = observedCalls.map(formatSkillCall);
  const rawExpectation = params.renderedValue ?? params.assertion.value;
  const expectation = resolveSkillMatchers(rawExpectation);
  return expectation.kind === "list"
    ? handleListSkillAssertion(params, observedCalls, observedLabels, expectation)
    : handleCountSkillAssertion(params, observedCalls, observedLabels, expectation.matcher);
}
|
|
2155
|
+
//#endregion
|
|
2029
2156
|
//#region src/assertions/sql.ts
|
|
2030
2157
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
2031
2158
|
let pass = false;
|
|
@@ -2258,23 +2385,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2258
2385
|
};
|
|
2259
2386
|
};
|
|
2260
2387
|
//#endregion
|
|
2261
|
-
//#region src/assertions/traceUtils.ts
|
|
2262
|
-
/**
|
|
2263
|
-
* Shared utilities for trace assertions
|
|
2264
|
-
*/
|
|
2265
|
-
/**
 * Match a span name against a glob-like pattern.
 * Supports * (any characters) and ? (single character) wildcards.
 * Matching is case-insensitive and anchored to the whole string.
 *
 * @param spanName - The span name to match
 * @param pattern - The glob pattern to match against
 * @returns true if the span name matches the pattern
 */
function matchesPattern(spanName, pattern) {
  // Escape every regex metacharacter except the two glob wildcards, then
  // translate the wildcards into their regex equivalents.
  const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
  // The "s" (dotAll) flag lets the wildcards span line terminators, so values
  // containing newlines can still match a glob.
  return new RegExp(`^${regexPattern}$`, "is").test(spanName);
}
|
|
2277
|
-
//#endregion
|
|
2278
2388
|
//#region src/assertions/traceErrorSpans.ts
|
|
2279
2389
|
function isErrorSpan(span) {
|
|
2280
2390
|
if (span.statusCode && span.statusCode >= 400) return true;
|
|
@@ -2443,6 +2553,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2443
2553
|
};
|
|
2444
2554
|
};
|
|
2445
2555
|
//#endregion
|
|
2556
|
+
//#region src/assertions/trajectoryUtils.ts
|
|
2557
|
+
// Span attribute keys checked, in order, for a tool name.
const TOOL_ATTRIBUTE_KEYS = [
  "tool.name", "tool_name", "tool",
  "function.name", "function_name",
  "gen_ai.tool.name",
  "codex.mcp.tool",
  "agent.tool", "agent.tool_name", "agent.toolName"
];

// Span attribute keys checked, in order, for tool-call arguments.
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
  "tool.arguments", "tool.args", "tool.input",
  "tool_arguments", "tool_args", "tool_input",
  "function.arguments", "function.args", "function.input",
  "function_arguments", "function_args",
  "gen_ai.tool.arguments", "gen_ai.tool.args", "gen_ai.tool.input",
  "gen_ai.tool.call.arguments", "gen_ai.tool.call.args",
  "agent.tool.arguments", "agent.tool.args", "agent.tool.input",
  "codex.mcp.arguments", "codex.mcp.args", "codex.mcp.input",
  "arguments", "args", "input"
];

// Span attribute keys checked, in order, for an executed command.
const COMMAND_ATTRIBUTE_KEYS = ["codex.command", "command", "command.name", "command_name"];

// Span attribute keys checked, in order, for an explicit search query.
const SEARCH_ATTRIBUTE_KEYS = ["codex.search.query", "search.query", "search_query"];

// Generic query key; only treated as a search query on search-like spans.
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];

// Matches span names containing a search-related word as a delimited token.
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;

// Limits for the trajectory summary handed to a judge: total cap, plus how
// many leading and trailing steps are kept when the cap is exceeded.
const MAX_JUDGE_SUMMARY_STEPS = 24;
const JUDGE_SUMMARY_HEAD_STEPS = 12;
const JUDGE_SUMMARY_TAIL_STEPS = 12;
|
|
2612
|
+
/**
 * Return the first non-blank string attribute among the given keys.
 *
 * @param attributes - Attribute map to read from.
 * @param keys - Candidate keys, in priority order.
 * @returns The trimmed value of the first matching key, or undefined.
 */
function getStringAttribute(attributes, keys) {
  for (const key of keys) {
    const candidate = attributes[key];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return void 0;
}
|
|
2618
|
+
/**
 * Normalize an attribute value that may hold structured data.
 *
 * Strings are trimmed and parsed as JSON when possible (falling back to the
 * trimmed text); numbers, booleans and objects are returned as-is.
 *
 * @param value - Raw attribute value.
 * @returns The normalized value, or undefined for null/undefined, blank
 *   strings and unsupported types.
 */
function normalizeStructuredAttribute(value) {
  if (value === null) return void 0;
  switch (typeof value) {
    case "string": {
      const text = value.trim();
      if (text === "") return void 0;
      try {
        return JSON.parse(text);
      } catch {
        // Not valid JSON: keep the plain trimmed text.
        return text;
      }
    }
    case "number":
    case "boolean":
    case "object":
      return value;
    default:
      return void 0;
  }
}
|
|
2631
|
+
/**
 * Compare two optional status objects by `code` and `message`.
 *
 * @param left - Status object or undefined.
 * @param right - Status object or undefined.
 * @returns true when both fields are strictly equal (two missing statuses are equal).
 */
function hasSameStatus(left, right) {
  if (left?.code !== right?.code) return false;
  return left?.message === right?.message;
}
|
|
2634
|
+
/**
 * Heuristically decide whether a span represents a search operation.
 *
 * A span qualifies when its name matches the search-name pattern or starts
 * with "search ", or when any attribute key other than the bare "query"
 * contains a search/lookup/retrieve(al) token.
 *
 * @param span - Span with `name` and optional `attributes`.
 * @returns true when the span looks search-related.
 */
function isSearchLikeSpan(span) {
  const spanName = span.name;
  if (SEARCH_SPAN_NAME_PATTERN.test(spanName) || spanName.startsWith("search ")) return true;
  const searchKeyPattern = /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i;
  for (const key of Object.keys(span.attributes || {})) {
    if (key !== "query" && searchKeyPattern.test(key)) return true;
  }
  return false;
}
|
|
2639
|
+
/**
 * Build a compact status object for a trajectory step.
 *
 * @param step - Step with optional `statusCode` / `statusMessage`.
 * @returns `{ code, message? }`, or undefined when the status code is
 *   missing or 0.
 */
function getTrajectoryStepStatus(step) {
  const code = step.statusCode;
  if (code === void 0 || code === 0) return void 0;
  const status = { code };
  if (step.statusMessage) status.message = step.statusMessage;
  return status;
}
|
|
2646
|
+
/**
 * Extract the first whitespace-delimited token of a command line.
 *
 * @param command - Command string.
 * @returns The leading token, or undefined when the command is blank.
 */
function getCommandExecutable(command) {
  const [executable] = command.trim().split(/\s+/);
  return executable ? executable : void 0;
}
|
|
2649
|
+
/**
 * Derive a tool name from a span.
 *
 * Lookup order: the known tool-name attribute keys; then any non-blank string
 * attribute whose key looks like a tool/function name (or contains a
 * delimited "tool" token and is not a result/output key); finally the segment
 * after the last "/" of a span name starting with "mcp ".
 *
 * @param span - Span with `name` and optional `attributes`.
 * @returns The trimmed tool name, or undefined when none is found.
 */
function extractToolName(span) {
  const attributes = span.attributes || {};
  const knownKeyMatch = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
  if (knownKeyMatch) return knownKeyMatch;

  const nameKeyPattern = /tool.?name|function.?name/i;
  const toolKeyPattern = /(^|[._])tool($|[._])/i;
  const resultKeyPattern = /result|output/i;
  for (const key of Object.keys(attributes)) {
    const value = attributes[key];
    if (typeof value !== "string") continue;
    const trimmed = value.trim();
    if (!trimmed) continue;
    if (nameKeyPattern.test(key)) return trimmed;
    if (toolKeyPattern.test(key) && !resultKeyPattern.test(key)) return trimmed;
  }

  if (!span.name.startsWith("mcp ")) return void 0;
  const lastSlash = span.name.lastIndexOf("/");
  const hasTrailingSegment = lastSlash !== -1 && lastSlash < span.name.length - 1;
  return hasTrailingSegment ? span.name.slice(lastSlash + 1).trim() : void 0;
}
|
|
2663
|
+
/**
 * Derive tool-call arguments from a span's attributes.
 *
 * The known argument keys are tried first, in order. Otherwise the first
 * attribute whose key contains a delimited arguments/args/input token (and no
 * result/output/error/status marker) and normalizes to a defined value wins.
 *
 * @param span - Span with optional `attributes`.
 * @returns The normalized arguments, or undefined when none are found.
 */
function extractToolArgs(span) {
  const attributes = span.attributes || {};

  for (const knownKey of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
    const normalized = normalizeStructuredAttribute(attributes[knownKey]);
    if (normalized !== void 0) return normalized;
  }

  const excludedKeyPattern = /result|output|error|status/i;
  const argumentKeyPattern = /(^|[._])(arguments|args|input)($|[._])/i;
  for (const key of Object.keys(attributes)) {
    const looksLikeArguments = !excludedKeyPattern.test(key) && argumentKeyPattern.test(key);
    if (!looksLikeArguments) continue;
    const normalized = normalizeStructuredAttribute(attributes[key]);
    if (normalized !== void 0) return normalized;
  }
  return void 0;
}
|
|
2676
|
+
/**
 * Derive an executed command from a span.
 *
 * Lookup order: the known command attribute keys; then any non-blank string
 * attribute whose key contains "command" (excluding output/result keys);
 * finally the remainder of a span name starting with "exec ".
 *
 * @param span - Span with `name` and optional `attributes`.
 * @returns The trimmed command, or undefined when none is found.
 */
function extractCommand(span) {
  const attributes = span.attributes || {};
  const knownKeyMatch = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
  if (knownKeyMatch) return knownKeyMatch;

  const commandKeyPattern = /command/i;
  const resultKeyPattern = /output|result/i;
  for (const key of Object.keys(attributes)) {
    const value = attributes[key];
    if (typeof value !== "string") continue;
    const trimmed = value.trim();
    if (!trimmed) continue;
    if (commandKeyPattern.test(key) && !resultKeyPattern.test(key)) return trimmed;
  }

  const execPrefix = "exec ";
  return span.name.startsWith(execPrefix) ? span.name.slice(execPrefix.length).trim() : void 0;
}
|
|
2686
|
+
/**
 * Derive a search query from a span.
 *
 * Lookup order: the dedicated search attribute keys; then the generic
 * "query" attribute, but only on search-like spans; finally the remainder of
 * a span name starting with "search ", with one leading/trailing double quote
 * stripped.
 *
 * @param span - Span with `name` and optional `attributes`.
 * @returns The query text, or undefined when none is found.
 */
function extractSearchQuery(span) {
  const attributes = span.attributes || {};

  const explicitQuery = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
  if (explicitQuery) return explicitQuery;

  const genericQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
  if (genericQuery && isSearchLikeSpan(span)) return genericQuery;

  const searchPrefix = "search ";
  if (!span.name.startsWith(searchPrefix)) return void 0;
  return span.name.slice(searchPrefix.length).replace(/^"|"$/g, "").trim();
}
|
|
2694
|
+
/**
 * Decide whether a span represents a reasoning step.
 *
 * @param span - Span with `name` and optional `attributes`.
 * @returns true when `codex.item.type` is "reasoning" or the name is
 *   "reasoning" / starts with "reasoning" followed by "_" or whitespace
 *   (case-insensitive).
 */
function isReasoningSpan(span) {
  const itemType = (span.attributes || {})["codex.item.type"];
  if (itemType === "reasoning") return true;
  if (span.name === "reasoning") return true;
  return /^reasoning([_\s]|$)/i.test(span.name);
}
|
|
2698
|
+
/**
 * Decide whether a span represents an agent message.
 *
 * @param span - Span with `name` and optional `attributes`.
 * @returns true when `codex.item.type` is "agent_message" or the name is
 *   exactly "agent response" or "send input".
 */
function isMessageSpan(span) {
  const itemType = (span.attributes || {})["codex.item.type"];
  if (itemType === "agent_message") return true;
  return ["agent response", "send input"].includes(span.name);
}
|
|
2702
|
+
/**
 * Convert a trace's spans into an ordered list of trajectory steps.
 *
 * Spans are ordered by start time, then end time (falling back to start time
 * when absent), then original position. Each span is classified with this
 * precedence: tool, command, search, reasoning, message, plain span.
 *
 * @param trace - Trace with an optional `spans` array.
 * @returns Steps with `aliases`, optional `args` (tool steps only, when
 *   found), `attributes`, timing, `name`, ids, status fields and `type`.
 */
function extractTrajectorySteps(trace) {
  // Classify one span: step type, display name, extra aliases and tool args.
  const classify = (span) => {
    const toolName = extractToolName(span);
    const command = extractCommand(span);
    const searchQuery = extractSearchQuery(span);
    if (toolName) {
      return { type: "tool", name: toolName, extraAliases: [toolName], args: extractToolArgs(span) };
    }
    if (command) {
      const executable = getCommandExecutable(command);
      return { type: "command", name: command, extraAliases: executable ? [command, executable] : [command] };
    }
    if (searchQuery) return { type: "search", name: searchQuery, extraAliases: [searchQuery] };
    if (isReasoningSpan(span)) return { type: "reasoning", name: span.name, extraAliases: ["reasoning"] };
    if (isMessageSpan(span)) return { type: "message", name: span.name, extraAliases: ["message"] };
    return { type: "span", name: span.name, extraAliases: [] };
  };

  const chronological = (left, right) => {
    const startDelta = left.span.startTime - right.span.startTime;
    if (startDelta !== 0) return startDelta;
    const leftEnd = left.span.endTime ?? left.span.startTime;
    const rightEnd = right.span.endTime ?? right.span.startTime;
    const endDelta = leftEnd - rightEnd;
    // Original position is the final tie-breaker.
    return endDelta !== 0 ? endDelta : left.index - right.index;
  };

  const indexedSpans = Array.from(trace.spans || [], (span, index) => ({ span, index }));
  indexedSpans.sort(chronological);

  return indexedSpans.map(({ span }) => {
    const { type, name, extraAliases, args } = classify(span);
    // The Set de-duplicates while keeping the span name first.
    const aliases = [...new Set([span.name, ...extraAliases])];
    return {
      aliases,
      ...(args === void 0 ? {} : { args }),
      attributes: span.attributes || {},
      endTime: span.endTime,
      name,
      spanId: span.spanId,
      spanName: span.name,
      startTime: span.startTime,
      statusCode: span.statusCode,
      statusMessage: span.statusMessage,
      type
    };
  });
}
|
|
2759
|
+
/**
 * Normalize a trajectory matcher into object form.
 *
 * A string becomes `{ pattern }`; an object is shallow-copied. When the
 * matcher has no truthy `type` of its own and `defaultType` is truthy, the
 * default type is applied.
 *
 * @param matcher - Pattern string or matcher object.
 * @param defaultType - Optional step type to apply when none is set.
 * @returns A new matcher object.
 */
function normalizeTrajectoryMatcher(matcher, defaultType) {
  const isPatternString = typeof matcher === "string";
  const normalized = isPatternString ? { pattern: matcher } : { ...matcher };
  const ownType = isPatternString ? void 0 : matcher.type;
  if (!ownType && defaultType) normalized.type = defaultType;
  return normalized;
}
|
|
2769
|
+
/**
 * Decide whether a trajectory step satisfies a matcher.
 *
 * The step type must be among the matcher's type(s) when a type is set. The
 * glob (`pattern`, falling back to `name`) must match at least one of the
 * step's aliases; a matcher without a glob matches any step of the right type.
 *
 * @param step - Step with `type` and `aliases`.
 * @param matcher - Pattern string or matcher object.
 * @param defaultType - Optional type applied when the matcher has none.
 * @returns true when the step matches.
 */
function matchesTrajectoryStep(step, matcher, defaultType) {
  const normalized = normalizeTrajectoryMatcher(matcher, defaultType);
  const { type } = normalized;
  if (type) {
    const allowedTypes = Array.isArray(type) ? type : [type];
    if (!allowedTypes.includes(step.type)) return false;
  }
  const glob = normalized.pattern || normalized.name;
  if (!glob) return true;
  for (const alias of step.aliases) {
    if (matchesPattern(alias, glob)) return true;
  }
  return false;
}
|
|
2778
|
+
/**
 * Render a trajectory step as `type:name` for assertion messages.
 *
 * @param step - Step with `type` and `name`.
 * @returns The formatted label.
 */
function formatTrajectoryStep(step) {
  const { type, name } = step;
  return `${type}:${name}`;
}
|
|
2781
|
+
/**
 * Render tool arguments for assertion messages.
 *
 * @param args - Any argument value.
 * @returns "(none)" for undefined; otherwise the JSON form, falling back to
 *   `String(args)` when JSON serialization throws or yields undefined.
 */
function formatTrajectoryArgs(args) {
  if (args === void 0) return "(none)";
  let json;
  try {
    json = JSON.stringify(args);
  } catch {
    // Serialization failed (e.g. a cycle or BigInt): use the string fallback.
    json = void 0;
  }
  return json === void 0 ? String(args) : json;
}
|
|
2789
|
+
/**
 * Collapse runs of consecutive, equivalent steps into a single entry.
 *
 * Two adjacent steps are equivalent when `type`, `name`, `spanName` and
 * status all agree. The surviving entry gets a `collapsedCount` holding the
 * run length (absent for runs of one).
 *
 * The input steps are never modified: each kept entry is a shallow copy, so
 * `collapsedCount` is written only onto objects owned by the returned array.
 *
 * @param steps - Summary steps in trajectory order.
 * @returns A new array of compacted step copies.
 */
function compactJudgeTrajectorySteps(steps) {
  const compacted = [];
  for (const step of steps) {
    const previousStep = compacted[compacted.length - 1];
    const repeatsPrevious = previousStep !== void 0
      && previousStep.type === step.type
      && previousStep.name === step.name
      && previousStep.spanName === step.spanName
      && hasSameStatus(previousStep.status, step.status);
    if (repeatsPrevious) {
      // The first occurrence counts as 1, so the first repeat makes it 2.
      previousStep.collapsedCount = (previousStep.collapsedCount ?? 1) + 1;
      continue;
    }
    // Copy so the counter update above never leaks into the caller's objects.
    compacted.push({ ...step });
  }
  return compacted;
}
|
|
2801
|
+
/**
 * Bound the number of steps included in a judge summary.
 *
 * Lists within the maximum are returned unchanged. Longer lists keep the
 * configured number of leading and trailing steps, separated by an
 * `{ omittedCount }` marker.
 *
 * @param steps - Compacted summary steps.
 * @returns The original array, or a new truncated array.
 */
function truncateJudgeTrajectorySteps(steps) {
  const total = steps.length;
  if (total <= MAX_JUDGE_SUMMARY_STEPS) return steps;
  const head = steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS);
  const tail = steps.slice(-JUDGE_SUMMARY_TAIL_STEPS);
  const omissionMarker = { omittedCount: total - MAX_JUDGE_SUMMARY_STEPS };
  return head.concat([omissionMarker], tail);
}
|
|
2809
|
+
/**
 * Produce a pretty-printed JSON summary of a trace's trajectory.
 *
 * Each step is reduced to a 1-based index, type, name, the span name (only
 * when it differs from the name) and a status (only when one exists). The
 * steps are then compacted and truncated.
 *
 * @param trace - Trace with `traceId` and spans.
 * @returns JSON string with `traceId`, `stepCount`, `compactedStepCount`
 *   and `steps`, indented by two spaces.
 */
function summarizeTrajectoryForJudge(trace) {
  const rawSteps = extractTrajectorySteps(trace).map((step, position) => {
    const summary = {
      index: position + 1,
      type: step.type,
      name: step.name
    };
    if (step.spanName !== step.name) summary.spanName = step.spanName;
    const status = getTrajectoryStepStatus(step);
    if (status) summary.status = status;
    return summary;
  });
  const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
  const summary = {
    traceId: trace.traceId,
    stepCount: rawSteps.length,
    compactedStepCount: compactedSteps.length,
    steps: truncateJudgeTrajectorySteps(compactedSteps)
  };
  return JSON.stringify(summary, null, 2);
}
|
|
2826
|
+
//#endregion
|
|
2827
|
+
//#region src/assertions/trajectory.ts
|
|
2828
|
+
/**
 * Fetch the trace attached to the assertion context.
 *
 * @param params - Assertion params with `assertionValueContext` and `baseType`.
 * @returns The trace object.
 * @throws Error when the trace or its `spans` is missing.
 */
function getTraceOrThrow(params) {
  const trace = params.assertionValueContext.trace;
  if (trace?.spans) return trace;
  throw new Error(`No trace data available for ${params.baseType} assertion`);
}
|
|
2833
|
+
/**
 * Apply assertion negation to a pass value.
 *
 * @param pass - Result of the positive check.
 * @param inverse - Whether the assertion is negated.
 * @returns `!pass` when negated, otherwise `pass` unchanged.
 */
function applyInverse(pass, inverse) {
  if (inverse) return !pass;
  return pass;
}
|
|
2836
|
+
/**
 * Join step labels for display.
 *
 * @param stepLabels - Array of label strings.
 * @returns The labels joined by ", ", or "(none)" when the list is empty.
 */
function formatStepList(stepLabels) {
  const hasLabels = stepLabels.length > 0;
  if (!hasLabels) return "(none)";
  return stepLabels.join(", ");
}
|
|
2839
|
+
/**
 * Ensure a trajectory matcher identifies its target.
 *
 * @param matcher - Matcher object.
 * @param assertionType - Assertion type used in the error message.
 * @param index - Optional zero-based step index; reported one-based.
 * @throws Error when the matcher has neither `pattern` nor `name`.
 */
function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
  const hasIdentifier = Boolean(matcher.pattern || matcher.name);
  if (hasIdentifier) return;
  let stepLabel = "object";
  if (index !== void 0) stepLabel = `step ${index + 1}`;
  throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
}
|
|
2844
|
+
/**
 * Normalize a `trajectory:goal-success` assertion value.
 *
 * @param value - A goal string, or an object with a string `goal` property.
 * @returns `{ goal }` with the goal text trimmed.
 * @throws Error when no non-blank goal can be extracted.
 */
function resolveGoalSuccessValue(value) {
  let goalText;
  if (typeof value === "string") {
    goalText = value;
  } else if (value && typeof value === "object" && !Array.isArray(value) && typeof value.goal === "string") {
    goalText = value.goal;
  }
  const goal = goalText === void 0 ? "" : goalText.trim();
  if (goal) return { goal };
  throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
}
|
|
2849
|
+
/**
 * Normalize a `trajectory:tool-used` assertion value.
 *
 * - A string or an array of strings becomes `{ kind: "list", matchers }`.
 * - A plain object becomes `{ kind: "count", matcher }`, where `min` / `max`
 *   are kept only when they are numbers.
 * Matchers default to the "tool" step type.
 *
 * @param value - Raw assertion value.
 * @returns The normalized list or count descriptor.
 * @throws Error when the value has an unsupported shape.
 */
function resolveToolMatchers(value) {
  const asToolMatcher = (item) => normalizeTrajectoryMatcher(item, "tool");
  const numberOrUndefined = (candidate) => (typeof candidate === "number" ? candidate : void 0);

  if (typeof value === "string") {
    return { kind: "list", matchers: [asToolMatcher(value)] };
  }
  if (Array.isArray(value)) {
    if (value.every((item) => typeof item === "string")) {
      return { kind: "list", matchers: value.map((item) => asToolMatcher(item)) };
    }
  } else if (value && typeof value === "object") {
    return {
      kind: "count",
      matcher: {
        ...asToolMatcher(value),
        max: numberOrUndefined(value.max),
        min: numberOrUndefined(value.min)
      }
    };
  }
  throw new Error("trajectory:tool-used assertion must have a string, string array, or object value");
}
|
|
2868
|
+
/**
 * Grade a `trajectory:tool-used` assertion against the trace's tool steps.
 *
 * List values (string / string array): pass when every listed tool was
 * observed; in inverse mode, pass when none was.
 *
 * Count values (object with name/pattern and optional `min` / `max`): pass
 * when the number of matching tool steps lies in `[min, max]`; inverse mode
 * negates that result. `min` defaults to 1, except when `max` is below 1,
 * where it defaults to 0. Without that exception `{ name, max: 0 }` could
 * never pass, because no count satisfies both `>= 1` and `<= 0`.
 *
 * @param params - Assertion params (`renderedValue`, `assertion`, `inverse`,
 *   plus the trace context).
 * @returns Grading result `{ pass, score, reason, assertion }`.
 * @throws Error when no trace is available, the expected list is empty, the
 *   value shape is unsupported, or a count object lacks name and pattern.
 */
const handleTrajectoryToolUsed = (params) => {
  const steps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
  const labelOf = (matcher) => matcher.pattern || matcher.name || "*";

  if (expected.kind === "list") {
    if (expected.matchers.length === 0) throw new Error("trajectory:tool-used assertion requires at least one expected tool");
    // Partition the expected tools by whether any observed tool step matches.
    const matched = [];
    const missing = [];
    for (const matcher of expected.matchers) {
      const wasObserved = steps.some((step) => matchesTrajectoryStep(step, matcher));
      (wasObserved ? matched : missing).push(matcher);
    }
    const pass = params.inverse ? matched.length === 0 : missing.length === 0;
    const actualTools = steps.map(formatTrajectoryStep);
    const expectedTools = expected.matchers.map(labelOf);
    let reason;
    if (params.inverse) reason = pass ? `Forbidden tool(s) were not used: ${expectedTools.join(", ")}` : `Forbidden tool(s) were used: ${matched.map(labelOf).join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
    else if (pass) reason = `Observed required tool(s): ${expectedTools.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
    else reason = `Missing required tool(s): ${missing.map(labelOf).join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
    return {
      pass,
      score: pass ? 1 : 0,
      reason,
      assertion: params.assertion
    };
  }

  const matcher = expected.matcher;
  const max = matcher.max;
  // Keep the "at least once" default unless the explicit max makes it unsatisfiable.
  const min = matcher.min ?? (max !== void 0 && max < 1 ? 0 : 1);
  if (!matcher.pattern && !matcher.name) throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
  const matchingSteps = steps.filter((step) => matchesTrajectoryStep(step, matcher));
  const count = matchingSteps.length;
  const basePass = count >= min && (max === void 0 || count <= max);
  const pass = applyInverse(basePass, params.inverse);
  const matcherLabel = labelOf(matcher);
  let reason = `Matched tool "${matcherLabel}" ${count} time(s)`;
  if (max === void 0) reason += ` (expected at least ${min})`;
  else reason += ` (expected ${min}-${max})`;
  if (matchingSteps.length > 0) reason += `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`;
  // Inverse mode replaces the positive-form explanation entirely.
  if (params.inverse) reason = basePass ? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion` : `Tool "${matcherLabel}" did not satisfy the forbidden match condition`;
  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
|
|
2910
|
+
/**
 * Normalize a `trajectory:tool-sequence` assertion value.
 *
 * @param value - An array of steps, or an object with optional `mode` and `steps`.
 * @returns `{ mode, steps }`; `mode` defaults to "in_order" and `steps` to [].
 * @throws Error when the value is neither an array nor an object.
 */
function resolveSequenceValue(value) {
  if (Array.isArray(value)) return { mode: "in_order", steps: value };
  const isObjectValue = Boolean(value) && typeof value === "object";
  if (!isObjectValue) throw new Error("trajectory:tool-sequence assertion must have an array or object value");
  const { mode, steps } = value;
  return {
    mode: mode || "in_order",
    steps: steps || []
  };
}
|
|
2924
|
+
/**
 * Check whether a value is a non-null, non-array object.
 *
 * @param value - Any value.
 * @returns true for objects other than null and arrays.
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
}
|
|
2927
|
+
/**
 * Recursively check that `actual` contains everything `expected` specifies.
 *
 * - Arrays must have equal length and match element by element.
 * - Records must have every expected key as an own property with a matching
 *   value; extra keys in `actual` are allowed.
 * - Anything else is compared with `isDeepStrictEqual`.
 *
 * @param actual - Observed arguments.
 * @param expected - Expected (possibly partial) arguments.
 * @returns true when `actual` satisfies `expected`.
 */
function matchesExpectedArgsPartial(actual, expected) {
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual) || actual.length !== expected.length) return false;
    return expected.every((item, position) => matchesExpectedArgsPartial(actual[position], item));
  }
  if (!isRecord(expected)) return isDeepStrictEqual(actual, expected);
  if (!isRecord(actual)) return false;
  for (const [key, expectedValue] of Object.entries(expected)) {
    if (!Object.prototype.hasOwnProperty.call(actual, key)) return false;
    if (!matchesExpectedArgsPartial(actual[key], expectedValue)) return false;
  }
  return true;
}
|
|
2935
|
+
/**
 * Compare observed tool arguments with the expected ones.
 *
 * @param actual - Observed arguments.
 * @param expected - Expected arguments.
 * @param mode - "exact" for deep strict equality; any other value uses
 *   partial matching.
 * @returns true when the arguments match under the chosen mode.
 */
function matchesToolArgs(actual, expected, mode) {
  const compare = mode === "exact" ? isDeepStrictEqual : matchesExpectedArgsPartial;
  return compare(actual, expected);
}
|
|
2939
|
+
/**
 * Validate the `mode` of a `trajectory:tool-args-match` assertion.
 *
 * @param mode - undefined, "partial" or "exact".
 * @returns "partial" when omitted, otherwise the given mode.
 * @throws Error for any other value.
 */
function resolveToolArgsMatchMode(mode) {
  switch (mode) {
    case void 0:
      return "partial";
    case "partial":
    case "exact":
      return mode;
    default:
      throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
  }
}
|
|
2944
|
+
/**
 * Normalize a `trajectory:tool-args-match` assertion value.
 *
 * The expected arguments come from `args` when that key is an own property
 * of the value, otherwise from `arguments`.
 *
 * @param value - Object with name/pattern, `args` or `arguments`, and an
 *   optional `mode`.
 * @returns `{ matcher, expectedArgs, mode }`.
 * @throws Error when the value is not a plain object, lacks a name/pattern,
 *   has no expected arguments, or has an invalid mode.
 */
function resolveToolArgsMatchValue(value) {
  const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isObjectValue) throw new Error("trajectory:tool-args-match assertion must have an object value");

  const matcher = normalizeTrajectoryMatcher(value, "tool");
  requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");

  const hasArgsKey = Object.prototype.hasOwnProperty.call(value, "args");
  const expectedArgs = hasArgsKey ? value.args : value.arguments;
  if (expectedArgs === void 0) throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");

  const mode = resolveToolArgsMatchMode(value.mode);
  return { matcher, expectedArgs, mode };
}
|
|
2956
|
+
/**
 * Assertion handler for `trajectory:tool-sequence`.
 *
 * Only trajectory steps whose `type` is `"tool"` are considered. In `"exact"`
 * mode the observed tool steps must have the same length as the expected list
 * and match it position by position. In any other mode the expected matchers
 * must be satisfied in order, with unrelated tool steps allowed in between.
 *
 * @param {object} params - Assertion parameters (`renderedValue`, `assertion`,
 *   `inverse`, plus whatever `getTraceOrThrow` reads).
 * @returns {{ pass: boolean, score: number, reason: string, assertion: object }}
 *   Grading result; `score` is 1 on pass and 0 otherwise.
 * @throws {Error} When the assertion lists no expected steps.
 */
const handleTrajectoryToolSequence = (params) => {
  const observedSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const sequence = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
  const expectedMatchers = sequence.steps.map((expectedStep, position) => {
    const matcher = normalizeTrajectoryMatcher(expectedStep, "tool");
    requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", position);
    return matcher;
  });
  if (expectedMatchers.length === 0) {
    throw new Error("trajectory:tool-sequence assertion requires at least one expected step");
  }

  const actualTools = observedSteps.map(formatTrajectoryStep);
  let basePass;
  let reason;

  if (sequence.mode === "exact") {
    basePass = observedSteps.length === expectedMatchers.length && expectedMatchers.every((matcher, position) => matchesTrajectoryStep(observedSteps[position], matcher));
    if (basePass) {
      reason = `Observed exact tool sequence: ${formatStepList(actualTools)}`;
    } else {
      const expectedLabels = expectedMatchers.map((matcher) => matcher.pattern || matcher.name || "*").join(", ");
      reason = `Expected exact tool sequence of ${expectedLabels}, but actual tools were ${formatStepList(actualTools)}`;
    }
  } else {
    // Walk the observed steps once, advancing through the expected matchers
    // each time the next expected matcher is satisfied.
    let cursor = 0;
    const matchedLabels = [];
    for (let i = 0; i < observedSteps.length && cursor < expectedMatchers.length; i += 1) {
      const step = observedSteps[i];
      if (matchesTrajectoryStep(step, expectedMatchers[cursor])) {
        matchedLabels.push(formatTrajectoryStep(step));
        cursor += 1;
      }
    }
    basePass = cursor === expectedMatchers.length;
    if (basePass) {
      reason = `Observed tool sequence in order: ${matchedLabels.join(", ")}. Actual tools: ${formatStepList(actualTools)}`;
    } else {
      const missing = expectedMatchers[cursor];
      reason = `Expected tool "${missing?.pattern || missing?.name || "*"}" was not observed in order. Actual tools: ${formatStepList(actualTools)}`;
    }
  }

  const pass = applyInverse(basePass, params.inverse);
  if (params.inverse) {
    // Inverted assertions replace the explanation computed above.
    reason = basePass ? `Forbidden tool sequence was observed. Actual tools: ${formatStepList(actualTools)}` : `Forbidden tool sequence was not observed`;
  }

  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
|
|
2995
|
+
/**
 * Assertion handler for `trajectory:tool-args-match`.
 *
 * Looks at trajectory steps of type `"tool"` that satisfy the assertion's
 * matcher, keeps those whose `args` are not `undefined`, and passes when at
 * least one of them satisfies `matchesToolArgs` for the resolved mode.
 *
 * @param {object} params - Assertion parameters (`renderedValue`, `assertion`,
 *   `inverse`, plus whatever `getTraceOrThrow` reads).
 * @returns {{ pass: boolean, score: number, reason: string, assertion: object }}
 *   Grading result; `score` is 1 on pass and 0 otherwise.
 */
const handleTrajectoryToolArgsMatch = (params) => {
  const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
  const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
  const matcherLabel = matcher.pattern || matcher.name || "*";
  const actualTools = toolSteps.map(formatTrajectoryStep);

  const candidateSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
  const candidatesWithArgs = candidateSteps.filter((step) => step.args !== void 0);
  const matchedStep = candidatesWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));

  const basePass = matchedStep !== void 0;
  const pass = applyInverse(basePass, params.inverse);

  const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
  const observedArgsLabel = candidatesWithArgs.length > 0 ? candidatesWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ") : "(none)";
  const noToolMatched = candidateSteps.length === 0;

  let reason;
  if (params.inverse) {
    if (basePass) {
      reason = `Forbidden argument match for tool "${matcherLabel}" was observed on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
    } else if (noToolMatched) {
      reason = `Forbidden argument match for tool "${matcherLabel}" was not observed because no tool call matched it`;
    } else {
      reason = `Forbidden argument match for tool "${matcherLabel}" was not observed. Observed args: ${observedArgsLabel}`;
    }
  } else if (basePass) {
    reason = `Tool "${matcherLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
  } else if (noToolMatched) {
    reason = `No tool call matched "${matcherLabel}". Actual tools: ${formatStepList(actualTools)}`;
  } else if (candidatesWithArgs.length === 0) {
    reason = `Tool "${matcherLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(actualTools)}`;
  } else {
    reason = `No call to tool "${matcherLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
  }

  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
|
|
3022
|
+
/**
 * Normalize the value of a `trajectory:step-count` assertion.
 *
 * The result is the matcher produced by `normalizeTrajectoryMatcher(value)`
 * with `min` and `max` overridden by the parsed bounds.
 *
 * Bounds are accepted as numbers or as non-empty numeric strings (for example
 * a quoted YAML scalar), mirroring the word-count assertion, which also
 * accepts numeric strings. Previously a string bound was silently discarded,
 * which could drop a constraint without any error. Any other type still
 * yields `undefined`, and number inputs are passed through unchanged.
 *
 * @param {unknown} value - Raw assertion value; must be a non-array object.
 * @returns {object} Matcher fields plus `min` and `max` (each a number or `undefined`).
 * @throws {Error} When `value` is not a non-array object.
 */
function resolveStepCountValue(value) {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    throw new Error("trajectory:step-count assertion must have an object value");
  }

  // Convert one bound to a number, or `undefined` when it is absent or unusable.
  const toBound = (raw) => {
    if (typeof raw === "number") {
      return raw;
    }
    // `Number("")` is 0, so blank strings are rejected explicitly.
    if (typeof raw === "string" && raw.trim() !== "" && !Number.isNaN(Number(raw))) {
      return Number(raw);
    }
    return void 0;
  };

  return {
    ...normalizeTrajectoryMatcher(value),
    max: toBound(value.max),
    min: toBound(value.min)
  };
}
|
|
3030
|
+
/**
 * Assertion handler for `trajectory:step-count`.
 *
 * Counts the trajectory steps that satisfy the assertion's matcher and checks
 * the count against the inclusive `min` / `max` bounds (either may be omitted,
 * but not both).
 *
 * @param {object} params - Assertion parameters (`renderedValue`, `assertion`,
 *   `inverse`, plus whatever `getTraceOrThrow` reads).
 * @returns {{ pass: boolean, score: number, reason: string, assertion: object }}
 *   Grading result; `score` is 1 on pass and 0 otherwise.
 * @throws {Error} When neither `min` nor `max` is provided.
 */
const handleTrajectoryStepCount = (params) => {
  const allSteps = extractTrajectorySteps(getTraceOrThrow(params));
  const matcher = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
  const lower = matcher.min;
  const upper = matcher.max;
  const hasLower = lower !== void 0;
  const hasUpper = upper !== void 0;
  if (!hasLower && !hasUpper) {
    throw new Error("trajectory:step-count assertion must include a min or max property");
  }

  const matchingSteps = allSteps.filter((step) => matchesTrajectoryStep(step, matcher));
  const count = matchingSteps.length;
  const meetsLower = !hasLower || count >= lower;
  const meetsUpper = !hasUpper || count <= upper;
  const basePass = meetsLower && meetsUpper;
  const pass = applyInverse(basePass, params.inverse);

  // Describe the filter that selected the counted steps.
  const filterParts = [];
  if (matcher.type) {
    const types = Array.isArray(matcher.type) ? matcher.type : [matcher.type];
    filterParts.push(`type=${types.join("|")}`);
  }
  const pattern = matcher.pattern || matcher.name;
  if (pattern) {
    filterParts.push(`pattern=${pattern}`);
  }

  // Assemble the explanation piece by piece, in the order it is read.
  const reasonParts = [`Matched ${count} trajectory step(s)`];
  if (filterParts.length > 0) {
    reasonParts.push(` for ${filterParts.join(", ")}`);
  }
  if (hasLower && hasUpper) {
    reasonParts.push(` (expected ${lower}-${upper})`);
  } else if (hasLower) {
    reasonParts.push(` (expected at least ${lower})`);
  } else if (hasUpper) {
    reasonParts.push(` (expected at most ${upper})`);
  }
  if (matchingSteps.length > 0) {
    reasonParts.push(`. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`);
  }

  let reason = reasonParts.join("");
  if (params.inverse) {
    // Inverted assertions replace the explanation computed above.
    reason = basePass ? `Trajectory step count satisfied the forbidden range` : `Trajectory step count did not satisfy the forbidden range`;
  }

  return {
    pass,
    score: pass ? 1 : 0,
    reason,
    assertion: params.assertion
  };
};
|
|
3060
|
+
/**
 * Assertion handler for `trajectory:goal-success`.
 *
 * Delegates the judgement to `matchesTrajectoryGoalSuccess`, passing the goal,
 * a judge-oriented summary of the trace and the usual grading context. For a
 * non-inverted assertion the delegate's result is returned untouched; for an
 * inverted one, `pass`, `score` and `reason` are flipped.
 *
 * @param {object} params - Assertion parameters (`renderedValue`, `assertion`,
 *   `inverse`, `outputString`, `test`, `assertionValueContext`,
 *   `providerCallContext`, plus whatever `getTraceOrThrow` reads).
 * @returns {Promise<object>} The grading result.
 */
const handleTrajectoryGoalSuccess = async (params) => {
  const trace = getTraceOrThrow(params);
  const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
  const result = await matchesTrajectoryGoalSuccess(
    goal,
    summarizeTrajectoryForJudge(trace),
    params.outputString,
    params.test.options,
    params.assertionValueContext.vars,
    params.assertion,
    params.providerCallContext
  );

  if (params.inverse) {
    const achieved = result.pass;
    return {
      ...result,
      assertion: params.assertion,
      pass: !achieved,
      score: achieved ? 0 : 1,
      reason: achieved ? `Agent unexpectedly achieved the goal: ${goal}` : `Agent did not achieve the forbidden goal: ${goal}`
    };
  }
  return result;
};
|
|
3073
|
+
//#endregion
|
|
2446
3074
|
//#region src/assertions/webhook.ts
|
|
2447
3075
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
2448
3076
|
invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
|
|
@@ -2511,18 +3139,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2511
3139
|
if (pass) reason = "Assertion passed";
|
|
2512
3140
|
else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
|
|
2513
3141
|
else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
|
|
2514
|
-
} else if (min
|
|
2515
|
-
const basePass = wordCount >= min;
|
|
2516
|
-
pass = inverse ? !basePass : basePass;
|
|
2517
|
-
if (pass) reason = "Assertion passed";
|
|
2518
|
-
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
2519
|
-
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
2520
|
-
} else {
|
|
3142
|
+
} else if (min === void 0) {
|
|
2521
3143
|
const basePass = wordCount <= max;
|
|
2522
3144
|
pass = inverse ? !basePass : basePass;
|
|
2523
3145
|
if (pass) reason = "Assertion passed";
|
|
2524
3146
|
else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
|
|
2525
3147
|
else reason = `Word count ${wordCount} is greater than maximum ${max}`;
|
|
3148
|
+
} else {
|
|
3149
|
+
const basePass = wordCount >= min;
|
|
3150
|
+
pass = inverse ? !basePass : basePass;
|
|
3151
|
+
if (pass) reason = "Assertion passed";
|
|
3152
|
+
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
3153
|
+
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
2526
3154
|
}
|
|
2527
3155
|
} else {
|
|
2528
3156
|
invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
|
|
@@ -2617,6 +3245,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2617
3245
|
//#endregion
|
|
2618
3246
|
//#region src/assertions/index.ts
|
|
2619
3247
|
const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
3248
|
+
// Trace-fetch polling settings. The DEFAULT_* values are the fallbacks handed
// to getEnvInt by loadTraceData; the MAX_* values are the upper clamps it
// applies to whatever the environment provides.

// Number of polling attempts.
const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;

// Delay between attempts, in milliseconds.
const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5000;

// Consecutive polls with an unchanged span count required before returning.
const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
const MAX_TRACE_FETCH_STABLE_POLLS = 10;
|
|
2620
3254
|
const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
2621
3255
|
"answer-relevance",
|
|
2622
3256
|
"context-faithfulness",
|
|
@@ -2626,8 +3260,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
|
2626
3260
|
"llm-rubric",
|
|
2627
3261
|
"model-graded-closedqa",
|
|
2628
3262
|
"model-graded-factuality",
|
|
2629
|
-
"search-rubric"
|
|
3263
|
+
"search-rubric",
|
|
3264
|
+
"trajectory:goal-success"
|
|
2630
3265
|
]);
|
|
3266
|
+
// Assertion base types that assertionUsesTrace treats as trace-dependent.
const TRACE_AWARE_ASSERTION_TYPES = new Set([
  // Script-based assertions
  "javascript", "python", "ruby",
  // Span-level trace assertions
  "trace-error-spans", "trace-span-count", "trace-span-duration",
  // Trajectory assertions
  "trajectory:goal-success",
  "trajectory:step-count",
  "trajectory:tool-args-match",
  "trajectory:tool-sequence",
  "trajectory:tool-used"
]);
|
|
3279
|
+
/**
 * Report whether an assertion is of a trace-aware type.
 *
 * An `assert-set` qualifies when any of its nested assertions does; any other
 * assertion qualifies when its base type is in `TRACE_AWARE_ASSERTION_TYPES`.
 *
 * @param {object} assertion - Assertion to inspect.
 * @returns {boolean} Whether the assertion (or a nested one) is trace-aware.
 */
function assertionUsesTrace(assertion) {
  return assertion.type === "assert-set"
    ? assertion.assert.some(assertionUsesTrace)
    : TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
}
|
|
3283
|
+
/**
 * Report whether an assertion might need trace data in its context.
 *
 * True when the assertion is trace-aware (per `assertionUsesTrace`), when any
 * member of an `assert-set` qualifies, or when its string value starts with
 * `file://` or is accepted by `isPackagePath`.
 *
 * @param {object} assertion - Assertion to inspect.
 * @returns {boolean} Whether trace context may be required.
 */
function assertionMayNeedTraceContext(assertion) {
  if (assertionUsesTrace(assertion)) {
    return true;
  }
  if (assertion.type === "assert-set") {
    return assertion.assert.some(assertionMayNeedTraceContext);
  }
  const { value } = assertion;
  if (typeof value !== "string") {
    return false;
  }
  return value.startsWith("file://") || isPackagePath(value);
}
|
|
3288
|
+
/**
 * Report whether any assertion in a list may need trace context.
 *
 * @param {object[]|null|undefined} assertions - Assertions to scan; may be absent.
 * @returns {boolean} `false` for a missing list, otherwise whether any entry
 *   satisfies `assertionMayNeedTraceContext`.
 */
function hasTraceAwareAssertions(assertions) {
  if (assertions === null || assertions === void 0) {
    return false;
  }
  return Boolean(assertions.some(assertionMayNeedTraceContext));
}
|
|
3291
|
+
/**
 * Poll the trace store for a trace until its span count looks settled.
 *
 * The attempt budget, delay between attempts and required number of stable
 * polls come from the `PROMPTFOO_TRACE_FETCH_*` environment settings, each
 * clamped between a lower bound and its `MAX_*` cap. A trace is returned as
 * soon as a non-zero span count has been seen unchanged for the required
 * number of consecutive polls, or on the final attempt if it has any spans.
 * Otherwise the last fetched value is returned once the attempts run out.
 *
 * @param {string} traceId - Identifier passed to `traceStore.getTrace`.
 * @returns {Promise<object|null>} The most recently fetched trace.
 */
async function loadTraceData(traceId) {
  // Read an integer setting and clamp it into [lowerBound, upperBound].
  const clampedEnvInt = (envName, fallback, lowerBound, upperBound) => Math.min(upperBound, Math.max(lowerBound, getEnvInt(envName, fallback)));

  const traceStore = getTraceStore();
  const maxAttempts = clampedEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS, 1, MAX_TRACE_FETCH_MAX_ATTEMPTS);
  const retryDelayMs = clampedEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS, 0, MAX_TRACE_FETCH_RETRY_DELAY_MS);
  const stablePolls = clampedEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS, 1, MAX_TRACE_FETCH_STABLE_POLLS);

  let previousSpanCount = -1;
  let stableStreak = 0;
  let trace = null;

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const isFinalAttempt = attempt === maxAttempts - 1;
    trace = await traceStore.getTrace(traceId);
    const spanCount = trace?.spans?.length ?? 0;

    if (spanCount > 0) {
      // A repeated count extends the streak; a changed count restarts it at one.
      stableStreak = spanCount === previousSpanCount ? stableStreak + 1 : 1;
      if (stableStreak >= stablePolls || isFinalAttempt) {
        return trace;
      }
    } else {
      stableStreak = 0;
    }
    previousSpanCount = spanCount;

    if (!isFinalAttempt) {
      await sleep(retryDelayMs);
    }
  }
  return trace;
}
|
|
2631
3314
|
const ASSERTION_HANDLERS = {
|
|
2632
3315
|
"answer-relevance": handleAnswerRelevance,
|
|
2633
3316
|
bleu: handleBleuScore,
|
|
@@ -2690,12 +3373,18 @@ const ASSERTION_HANDLERS = {
|
|
|
2690
3373
|
ruby: handleRuby,
|
|
2691
3374
|
"rouge-n": handleRougeScore,
|
|
2692
3375
|
"search-rubric": handleSearchRubric,
|
|
3376
|
+
"skill-used": handleSkillUsed,
|
|
2693
3377
|
similar: handleSimilar,
|
|
2694
3378
|
"similar:cosine": handleSimilar,
|
|
2695
3379
|
"similar:dot": handleSimilar,
|
|
2696
3380
|
"similar:euclidean": handleSimilar,
|
|
2697
3381
|
"starts-with": handleStartsWith,
|
|
2698
3382
|
"tool-call-f1": handleToolCallF1,
|
|
3383
|
+
"trajectory:goal-success": handleTrajectoryGoalSuccess,
|
|
3384
|
+
"trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
|
|
3385
|
+
"trajectory:step-count": handleTrajectoryStepCount,
|
|
3386
|
+
"trajectory:tool-sequence": handleTrajectoryToolSequence,
|
|
3387
|
+
"trajectory:tool-used": handleTrajectoryToolUsed,
|
|
2699
3388
|
"trace-error-spans": handleTraceErrorSpans,
|
|
2700
3389
|
"trace-span-count": handleTraceSpanCount,
|
|
2701
3390
|
"trace-span-duration": handleTraceSpanDuration,
|
|
@@ -2738,7 +3427,7 @@ function isAssertionInverse(assertion) {
|
|
|
2738
3427
|
/**
 * Return the assertion type without its inversion prefix.
 *
 * @param {object} assertion - Assertion whose `type` is inspected.
 * @returns {string} `assertion.type` with its first four characters removed
 *   when `isAssertionInverse` reports the assertion as inverted (presumably a
 *   `not-` prefix; confirm against `isAssertionInverse`), otherwise the type unchanged.
 */
function getAssertionBaseType(assertion) {
  if (!isAssertionInverse(assertion)) {
    return assertion.type;
  }
  return assertion.type.slice(4);
}
|
|
2741
|
-
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
|
|
3430
|
+
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
|
|
2742
3431
|
const resolvedVars = vars || test.vars || {};
|
|
2743
3432
|
const { cost, logProbs, output: originalOutput } = providerResponse;
|
|
2744
3433
|
let output = originalOutput;
|
|
@@ -2757,14 +3446,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2757
3446
|
providerResponse,
|
|
2758
3447
|
...assertion.config ? { config: structuredClone(assertion.config) } : {}
|
|
2759
3448
|
};
|
|
2760
|
-
if (traceId) try {
|
|
2761
|
-
const
|
|
2762
|
-
if (
|
|
2763
|
-
traceId:
|
|
2764
|
-
evaluationId:
|
|
2765
|
-
testCaseId:
|
|
2766
|
-
metadata:
|
|
2767
|
-
spans:
|
|
3449
|
+
if (traceId && assertionMayNeedTraceContext(assertion)) try {
|
|
3450
|
+
const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
|
|
3451
|
+
if (resolvedTraceData) context.trace = {
|
|
3452
|
+
traceId: resolvedTraceData.traceId,
|
|
3453
|
+
evaluationId: resolvedTraceData.evaluationId,
|
|
3454
|
+
testCaseId: resolvedTraceData.testCaseId,
|
|
3455
|
+
metadata: resolvedTraceData.metadata,
|
|
3456
|
+
spans: resolvedTraceData.spans || []
|
|
2768
3457
|
};
|
|
2769
3458
|
} catch (error) {
|
|
2770
3459
|
logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
@@ -2797,7 +3486,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2797
3486
|
};
|
|
2798
3487
|
}
|
|
2799
3488
|
else if (filePath.endsWith(".rb")) try {
|
|
2800
|
-
const { runRuby } = await import("./rubyUtils-
|
|
3489
|
+
const { runRuby } = await import("./rubyUtils-Rt6pKA96.js");
|
|
2801
3490
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2802
3491
|
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2803
3492
|
} catch (error) {
|
|
@@ -2906,6 +3595,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
2906
3595
|
index: i
|
|
2907
3596
|
};
|
|
2908
3597
|
}).flat();
|
|
3598
|
+
const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
|
|
3599
|
+
let preloadedTraceData;
|
|
3600
|
+
if (shouldPreloadTrace && traceId) try {
|
|
3601
|
+
preloadedTraceData = await loadTraceData(traceId);
|
|
3602
|
+
} catch (error) {
|
|
3603
|
+
logger.debug(`Failed to preload trace data for assertions: ${error}`);
|
|
3604
|
+
preloadedTraceData = null;
|
|
3605
|
+
}
|
|
2909
3606
|
await async.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
|
|
2910
3607
|
if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
|
|
2911
3608
|
const result = await runAssertion({
|
|
@@ -2917,7 +3614,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
2917
3614
|
vars,
|
|
2918
3615
|
latencyMs,
|
|
2919
3616
|
assertIndex: index,
|
|
2920
|
-
traceId
|
|
3617
|
+
traceId,
|
|
3618
|
+
traceData: preloadedTraceData
|
|
2921
3619
|
});
|
|
2922
3620
|
assertResult.addResult({
|
|
2923
3621
|
index,
|
|
@@ -3083,7 +3781,7 @@ var CIProgressReporter = class {
|
|
|
3083
3781
|
else {
|
|
3084
3782
|
const eta = remaining / rate;
|
|
3085
3783
|
if (eta > 1440) etaDisplay = ">24 hours";
|
|
3086
|
-
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta)
|
|
3784
|
+
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
|
|
3087
3785
|
}
|
|
3088
3786
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3089
3787
|
logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
@@ -3447,12 +4145,55 @@ var JsonlFileWriter = class {
|
|
|
3447
4145
|
var ProgressBarManager = class {
|
|
3448
4146
|
progressBar;
|
|
3449
4147
|
isWebUI;
|
|
4148
|
+
originalLogCallback = null;
|
|
4149
|
+
installedLogCallback = null;
|
|
4150
|
+
pendingRender = null;
|
|
3450
4151
|
totalCount = 0;
|
|
3451
4152
|
completedCount = 0;
|
|
3452
4153
|
concurrency = 1;
|
|
3453
4154
|
constructor(isWebUI) {
|
|
3454
4155
|
this.isWebUI = isWebUI;
|
|
3455
4156
|
}
|
|
4157
|
+
clearProgressBarLine() {
|
|
4158
|
+
readline.cursorTo(process.stderr, 0);
|
|
4159
|
+
readline.clearLine(process.stderr, 0);
|
|
4160
|
+
}
|
|
4161
|
+
scheduleRender() {
|
|
4162
|
+
if (!this.progressBar || this.pendingRender) return;
|
|
4163
|
+
this.pendingRender = setImmediate(() => {
|
|
4164
|
+
this.pendingRender = null;
|
|
4165
|
+
this.progressBar?.render();
|
|
4166
|
+
});
|
|
4167
|
+
}
|
|
4168
|
+
handleLogMessage() {
|
|
4169
|
+
if (!this.progressBar) return;
|
|
4170
|
+
this.clearProgressBarLine();
|
|
4171
|
+
this.scheduleRender();
|
|
4172
|
+
}
|
|
4173
|
+
/**
|
|
4174
|
+
* Coordinate console logging with the progress bar to prevent visual corruption.
|
|
4175
|
+
*/
|
|
4176
|
+
installLogInterceptor() {
|
|
4177
|
+
if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
|
|
4178
|
+
this.originalLogCallback = globalLogCallback;
|
|
4179
|
+
this.installedLogCallback = (message) => {
|
|
4180
|
+
this.originalLogCallback?.(message);
|
|
4181
|
+
this.handleLogMessage();
|
|
4182
|
+
};
|
|
4183
|
+
setLogCallback(this.installedLogCallback);
|
|
4184
|
+
}
|
|
4185
|
+
/**
|
|
4186
|
+
* Remove the log interceptor and restore original logger callback behavior.
|
|
4187
|
+
*/
|
|
4188
|
+
removeLogInterceptor() {
|
|
4189
|
+
if (this.pendingRender) {
|
|
4190
|
+
clearImmediate(this.pendingRender);
|
|
4191
|
+
this.pendingRender = null;
|
|
4192
|
+
}
|
|
4193
|
+
if (this.installedLogCallback && globalLogCallback === this.installedLogCallback) setLogCallback(this.originalLogCallback);
|
|
4194
|
+
this.installedLogCallback = null;
|
|
4195
|
+
this.originalLogCallback = null;
|
|
4196
|
+
}
|
|
3456
4197
|
/**
|
|
3457
4198
|
* Initialize progress bar
|
|
3458
4199
|
*/
|
|
@@ -3472,7 +4213,8 @@ var ProgressBarManager = class {
|
|
|
3472
4213
|
return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
|
|
3473
4214
|
},
|
|
3474
4215
|
hideCursor: true,
|
|
3475
|
-
gracefulExit: true
|
|
4216
|
+
gracefulExit: true,
|
|
4217
|
+
stream: process.stderr
|
|
3476
4218
|
}, cliProgress.Presets.shades_classic);
|
|
3477
4219
|
this.progressBar.start(this.totalCount, 0, {
|
|
3478
4220
|
provider: "",
|
|
@@ -3747,6 +4489,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3747
4489
|
const parts = traceContext.traceparent.split("-");
|
|
3748
4490
|
if (parts.length >= 3) traceId = parts[1];
|
|
3749
4491
|
}
|
|
4492
|
+
if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
|
|
3750
4493
|
const checkResult = await runAssertions({
|
|
3751
4494
|
prompt: renderedPrompt,
|
|
3752
4495
|
provider,
|
|
@@ -4144,7 +4887,7 @@ var Evaluator = class {
|
|
|
4144
4887
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4145
4888
|
if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4146
4889
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4147
|
-
const { loadApiProvider } = await import("./providers-
|
|
4890
|
+
const { loadApiProvider } = await import("./providers-BSLEaIQG.js");
|
|
4148
4891
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4149
4892
|
} else testCase.provider = defaultProvider;
|
|
4150
4893
|
}
|
|
@@ -4228,7 +4971,7 @@ var Evaluator = class {
|
|
|
4228
4971
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4229
4972
|
}
|
|
4230
4973
|
if (state.resume && this.evalRecord.persisted) try {
|
|
4231
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
4974
|
+
const { default: EvalResult } = await import("./evalResult-CYNHkk5A.js");
|
|
4232
4975
|
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
|
|
4233
4976
|
const originalCount = runEvalOptions.length;
|
|
4234
4977
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -4428,7 +5171,7 @@ var Evaluator = class {
|
|
|
4428
5171
|
if (isCI() && !isWebUI) {
|
|
4429
5172
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4430
5173
|
ciProgressReporter.start();
|
|
4431
|
-
} else if (this.options.showProgressBar && process.
|
|
5174
|
+
} else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
|
|
4432
5175
|
this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
|
|
4433
5176
|
if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
|
|
4434
5177
|
if (isWebUI) {
|
|
@@ -4449,7 +5192,10 @@ var Evaluator = class {
|
|
|
4449
5192
|
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4450
5193
|
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4451
5194
|
}
|
|
4452
|
-
if (this.options.showProgressBar && progressBarManager)
|
|
5195
|
+
if (this.options.showProgressBar && progressBarManager) {
|
|
5196
|
+
await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
5197
|
+
progressBarManager.installLogInterceptor();
|
|
5198
|
+
}
|
|
4453
5199
|
try {
|
|
4454
5200
|
if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
|
|
4455
5201
|
checkAbort();
|
|
@@ -4475,7 +5221,10 @@ var Evaluator = class {
|
|
|
4475
5221
|
else if (!targetUnavailable) {
|
|
4476
5222
|
logger.info("Evaluation interrupted, saving progress...");
|
|
4477
5223
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4478
|
-
if (progressBarManager)
|
|
5224
|
+
if (progressBarManager) {
|
|
5225
|
+
progressBarManager.removeLogInterceptor();
|
|
5226
|
+
progressBarManager.stop();
|
|
5227
|
+
}
|
|
4479
5228
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
4480
5229
|
this.evalRecord.setVars(Array.from(vars));
|
|
4481
5230
|
await this.evalRecord.addPrompts(prompts);
|
|
@@ -4483,6 +5232,10 @@ var Evaluator = class {
|
|
|
4483
5232
|
return this.evalRecord;
|
|
4484
5233
|
}
|
|
4485
5234
|
} else {
|
|
5235
|
+
if (progressBarManager) {
|
|
5236
|
+
progressBarManager.removeLogInterceptor();
|
|
5237
|
+
progressBarManager.stop();
|
|
5238
|
+
}
|
|
4486
5239
|
if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
|
|
4487
5240
|
throw err;
|
|
4488
5241
|
}
|
|
@@ -4625,6 +5378,7 @@ var Evaluator = class {
|
|
|
4625
5378
|
await this.evalRecord.addPrompts(prompts);
|
|
4626
5379
|
try {
|
|
4627
5380
|
if (progressBarManager) {
|
|
5381
|
+
progressBarManager.removeLogInterceptor();
|
|
4628
5382
|
progressBarManager.complete();
|
|
4629
5383
|
progressBarManager.stop();
|
|
4630
5384
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -4790,6 +5544,6 @@ function evaluate(testSuite, evalRecord, options) {
|
|
|
4790
5544
|
return new Evaluator(testSuite, evalRecord, options).evaluate();
|
|
4791
5545
|
}
|
|
4792
5546
|
//#endregion
|
|
4793
|
-
export {
|
|
5547
|
+
export { isAllowedPrompt as a, assertions_default as c, runAssertions as d, generateVarCombinations as i, readAssertions as l, evaluate as n, runEval as o, formatVarsForDisplay as r, doesPromptRefMatch as s, ProgressBarManager as t, renderMetricName as u };
|
|
4794
5548
|
|
|
4795
|
-
//# sourceMappingURL=evaluator-
|
|
5549
|
+
//# sourceMappingURL=evaluator-BcvOGaam.js.map
|