promptfoo 0.121.1 → 0.121.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/src/{accounts-xrUGFA6n.js → accounts-B2XmGjty.js} +5 -5
- package/dist/src/{accounts-Bx-x3bmW.cjs → accounts-BPyfpSeU.cjs} +5 -5
- package/dist/src/{accounts-CMqkzrVf.js → accounts-CFLK3mnD.js} +6 -6
- package/dist/src/{accounts-BgNJDBE6.js → accounts-Xatc0RYb.js} +5 -5
- package/dist/src/{agentic-utils-BKIN5PKu.js → agentic-utils-36epdqwB.js} +3 -3
- package/dist/src/{cometapi-DkXrKi5z.js → agentic-utils-D8yXo5Lm.js} +4 -61
- package/dist/src/{cometapi-vY6aDZgo.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
- package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
- package/dist/src/{agents-C-dDThPK.js → agents-BBVJCIYr.js} +226 -13
- package/dist/src/{agents-CErsqg5U.cjs → agents-BBWxKSM0.cjs} +7 -7
- package/dist/src/{agents-Dy2YpZpa.js → agents-Bqgfdokm.js} +227 -14
- package/dist/src/{agents-B0f4HICh.cjs → agents-CAYbM7qD.cjs} +226 -13
- package/dist/src/{agents-CVIn-Utx.js → agents-CLQ-P15P.js} +7 -7
- package/dist/src/{agents-DeH4Gu94.js → agents-CgBniSlI.js} +8 -8
- package/dist/src/{agents-CXknwsFX.js → agents-DSSTV4bv.js} +226 -13
- package/dist/src/{agents-aF4-T121.js → agents-wg3ohknq.js} +7 -7
- package/dist/src/{aimlapi-tg0Gkcvr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
- package/dist/src/{aimlapi-BNfTBexL.js → aimlapi-BwGC1TtS.js} +13 -13
- package/dist/src/{aimlapi-BAGZDo5G.js → aimlapi-DaC3qZ-o.js} +14 -14
- package/dist/src/{aimlapi-DHRKlBEA.js → aimlapi-MgSLdvy7.js} +13 -13
- package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
- package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
- package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
- package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
- package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
- package/dist/src/app/index.html +4 -4
- package/dist/src/{audio-tf_NBjlC.js → audio-Bn44pQxv.js} +4 -4
- package/dist/src/{audio-CHQ4r-RV.js → audio-DDA5WHdx.js} +4 -4
- package/dist/src/{audio-BWeaWovU.cjs → audio-DVFjQ67_.cjs} +4 -4
- package/dist/src/{audio-BRODU0UK.js → audio-DjU9GswO.js} +5 -5
- package/dist/src/{base-DBtwl2FR.cjs → base-BboXIF_0.cjs} +3 -3
- package/dist/src/{base-B4QJRyFS.js → base-CKjwebIH.js} +3 -3
- package/dist/src/{base-B0tcrnq_.js → base-CqzQ4K8j.js} +3 -3
- package/dist/src/{base-fEDN28WM.js → base-Cz2ZC_iA.js} +3 -3
- package/dist/src/{blobs-BAU-dXan.js → blobs-B1JriOyi.js} +3 -3
- package/dist/src/{blobs-qTYm-1PY.js → blobs-BUWmKWzo.js} +3 -3
- package/dist/src/{blobs-DvS-O6be.cjs → blobs-C6j0bvFz.cjs} +3 -3
- package/dist/src/{blobs-Bpg5rH6i.js → blobs-DXTl6J3H.js} +3 -3
- package/dist/src/{cache-COish3-W.cjs → cache-C5yFZ4gC.cjs} +75 -58
- package/dist/src/{cache-8XhNqPKW.js → cache-CaT5tPgo.js} +75 -58
- package/dist/src/cache-CyCanoMu.js +6 -0
- package/dist/src/{cache-CG0SlR1d.js → cache-DSqR6ezl.js} +75 -58
- package/dist/src/cache-Df_QFDNu.cjs +5 -0
- package/dist/src/{cache-D3eqDYGU.js → cache-HP0NP4k3.js} +75 -58
- package/dist/src/{chat-DHMH-N64.js → chat-B-52XYI1.js} +12 -12
- package/dist/src/{chat-BKm79wib.js → chat-B0iaWhoh.js} +16 -14
- package/dist/src/{chat-DxysjBvt.js → chat-BE0qTA8e.js} +13 -13
- package/dist/src/{chat-CRWNNq73.js → chat-BEwdgGEg.js} +16 -14
- package/dist/src/{chat-2K608PeQ.cjs → chat-BtIKkLKx.cjs} +13 -13
- package/dist/src/{chat-DaqekjFr.cjs → chat-CM8qWR3_.cjs} +17 -15
- package/dist/src/{chat-CM_kyI8B.js → chat-DK1U-eZ-.js} +12 -12
- package/dist/src/{chat-CznLWr_D.js → chat-pxmiVpWe.js} +16 -14
- package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
- package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
- package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
- package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
- package/dist/src/{claude-agent-sdk-CJH22shf.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
- package/dist/src/{claude-agent-sdk-Dy5lT-Tx.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
- package/dist/src/{claude-agent-sdk-BLTu0WBO.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
- package/dist/src/{claude-agent-sdk-D6_k9FKA.js → claude-agent-sdk-HgbFioFw.js} +33 -15
- package/dist/src/cloud-DE3t1-ZI.js +4 -0
- package/dist/src/{cloud-Bc9526yV.js → cloud-z8KZpUoa.js} +3 -3
- package/dist/src/{cloudflare-ai-CWWJCRim.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
- package/dist/src/{cloudflare-ai-C9r2sRhw.js → cloudflare-ai-Bbp26N0L.js} +13 -13
- package/dist/src/{cloudflare-ai-ClWSdor4.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
- package/dist/src/{cloudflare-ai-ICsOuD-z.js → cloudflare-ai-DdKP9TKT.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6xFc5pa.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6O7AlYb.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
- package/dist/src/{cloudflare-gateway-pXGHxJ47.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
- package/dist/src/{cloudflare-gateway-C2_-KG5o.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
- package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
- package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
- package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
- package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
- package/dist/src/cometapi-BDyV-NNm.js +62 -0
- package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
- package/dist/src/{cometapi-Bbjp5V4x.js → cometapi-hhL4TAh3.js} +14 -14
- package/dist/src/{cometapi-BasUi7-_.js → cometapi-sp7sJpBD.js} +15 -15
- package/dist/src/{completion-C_P3ypkJ.js → completion-BCimtq-h.js} +6 -6
- package/dist/src/{completion-6Mx_iXxK.js → completion-DCjv7RZ3.js} +6 -6
- package/dist/src/{completion-CDOouNzq.cjs → completion-DlXUhj5c.cjs} +6 -6
- package/dist/src/{completion-C5rtR_9P.js → completion-DoYy49ti.js} +6 -6
- package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
- package/dist/src/{docker-BwsKwxFs.cjs → docker-Cqj2-QVi.cjs} +14 -14
- package/dist/src/{docker-CZnqU1XV.js → docker-CxCkwMzc.js} +13 -13
- package/dist/src/{docker-DzxyDPIj.js → docker-DpguQj-w.js} +14 -14
- package/dist/src/{docker-5KcG-_86.js → docker-FeBni2dw.js} +13 -13
- package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
- package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
- package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
- package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
- package/dist/src/eval-7aEqoMs3.js +15 -0
- package/dist/src/{eval-DmFyWU7i.js → eval-BTqTn7lb.js} +10 -10
- package/dist/src/{evalResult-CDQiuUuf.js → evalResult-BkIhRdTe.js} +7 -7
- package/dist/src/evalResult-CYNHkk5A.js +12 -0
- package/dist/src/evalResult-CuvJeNiM.js +10 -0
- package/dist/src/{evalResult-CTG2AHOS.js → evalResult-DUDShQrm.js} +7 -7
- package/dist/src/{evalResult-Dap2CekP.cjs → evalResult-DpARzUCb.cjs} +7 -7
- package/dist/src/evalResult-tGdilrWt.cjs +10 -0
- package/dist/src/evaluator-BBUqRhz1.js +36 -0
- package/dist/src/{evaluator-DPFRbFIL.js → evaluator-BcvOGaam.js} +833 -79
- package/dist/src/{extractor-YMU_Gvt8.js → extractor-C8XwivI9.js} +6 -6
- package/dist/src/{extractor-CFG6bcWJ.js → extractor-CAZ2G3Kh.js} +6 -6
- package/dist/src/{extractor-DX36oYEv.cjs → extractor-DG3sSfXE.cjs} +6 -6
- package/dist/src/{extractor-M67RUtg6.js → extractor-D_wd8jxt.js} +6 -6
- package/dist/src/{fetch-4M3YRaqL.js → fetch-BiYv2BZc.js} +3 -3
- package/dist/src/{fetch-BxUk8odA.cjs → fetch-BnR9wSnm.cjs} +3 -3
- package/dist/src/{fetch-60Gzydls.js → fetch-CVAtKnI3.js} +3 -3
- package/dist/src/{fetch-BMv0O527.js → fetch-DoVRJZhJ.js} +4 -4
- package/dist/src/fetch-UWU706qb.js +5 -0
- package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
- package/dist/src/{graders-DOXycdlG.cjs → graders-BElhu9ZY.cjs} +126 -55
- package/dist/src/{graders-R9rYUM0d.js → graders-BXAJ0sbS.js} +120 -55
- package/dist/src/graders-BxfEguVY.js +32 -0
- package/dist/src/graders-CzVMbEnv.js +34 -0
- package/dist/src/{graders-CpdqD9PI.js → graders-DG7mhg-b.js} +120 -55
- package/dist/src/graders-DjCXfj0l.cjs +32 -0
- package/dist/src/{graders-CHO8EPM4.js → graders-RjHF8VfG.js} +120 -55
- package/dist/src/graders-kHzIWOKu.js +32 -0
- package/dist/src/{image-DTedmQPg.cjs → image--F58eEIn.cjs} +6 -6
- package/dist/src/{image-DJEvKveK.js → image-6WQXK8m8.js} +4 -4
- package/dist/src/{image-pAX56tPG.js → image-B8b6f36E.js} +6 -6
- package/dist/src/{image-BmEZqVmk.js → image-CoxZp9PZ.js} +6 -6
- package/dist/src/{image-gvmivTEe.js → image-DO0RYnjH.js} +5 -5
- package/dist/src/{image-CBBVXWuT.js → image-PoF6DN3x.js} +6 -6
- package/dist/src/{image-CDLQOcqT.cjs → image-fza3zuKs.cjs} +4 -4
- package/dist/src/{image-tL5hIOFh.js → image-xNbw5ph2.js} +4 -4
- package/dist/src/index.cjs +863 -110
- package/dist/src/index.d.cts +833 -60
- package/dist/src/index.d.ts +833 -60
- package/dist/src/index.js +860 -108
- package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
- package/dist/src/{knowledgeBase-CoU-UQBg.js → knowledgeBase-Bi7CmDbx.js} +7 -7
- package/dist/src/{knowledgeBase-CLJybhnF.js → knowledgeBase-Ce3ofVan.js} +8 -8
- package/dist/src/{knowledgeBase-DjWPVqSb.js → knowledgeBase-DFRXPZl_.js} +7 -7
- package/dist/src/{knowledgeBase-wkxuRFhA.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
- package/dist/src/{litellm-B9Hysuri.js → litellm-Bo2gQXpo.js} +16 -15
- package/dist/src/{litellm-ePxtr9F1.js → litellm-CKiAxnoM.js} +15 -14
- package/dist/src/{litellm-NYpQ8RQu.cjs → litellm-CnHI69aj.cjs} +16 -15
- package/dist/src/{litellm-CTfa0hqi.js → litellm-Tc294Jhj.js} +15 -14
- package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
- package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
- package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
- package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
- package/dist/src/{luma-ray-BW9IRGIc.js → luma-ray-0ehMPt5N.js} +10 -10
- package/dist/src/{luma-ray-BE2mOt6N.js → luma-ray-C9q8rdQe.js} +9 -9
- package/dist/src/{luma-ray-Cm1KZBhs.js → luma-ray-DP0QA9qn.js} +9 -9
- package/dist/src/{luma-ray-B0GGNRc1.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
- package/dist/src/main.js +69 -71
- package/dist/src/{messages-1x9atZmP.js → messages-DJNo37Ko.js} +14 -9
- package/dist/src/{messages-BLbWdsyt.js → messages-Dy9QecMs.js} +14 -9
- package/dist/src/{messages-1JrJs91T.cjs → messages-HJsyEh4o.cjs} +15 -10
- package/dist/src/{messages-D8EA0oDc.js → messages-biC_ex-p.js} +14 -9
- package/dist/src/{modelslab-C1OLRmVX.js → modelslab-B5J-ZM5c.js} +9 -9
- package/dist/src/{modelslab-CqXBy3U8.js → modelslab-BI458moT.js} +10 -10
- package/dist/src/{modelslab-X5-4LroM.js → modelslab-BTOT8FUO.js} +9 -9
- package/dist/src/{modelslab-DcOSFwKh.cjs → modelslab-IQbNg-r7.cjs} +9 -9
- package/dist/src/{nova-reel-DihqLeol.js → nova-reel-BZ9y-Y5s.js} +9 -9
- package/dist/src/{nova-reel-D9xfaMBs.cjs → nova-reel-CE5etkv9.cjs} +9 -9
- package/dist/src/{nova-reel-D2ZkOSyr.js → nova-reel-DEeQlnOJ.js} +10 -10
- package/dist/src/{nova-reel-BgS1ZWuK.js → nova-reel-Xw1SXLpg.js} +9 -9
- package/dist/src/{nova-sonic-Q3BOJeig.js → nova-sonic-DWswpN1E.js} +7 -7
- package/dist/src/{nova-sonic-DezhVUYT.js → nova-sonic-DXTLpi-r.js} +6 -6
- package/dist/src/{nova-sonic-DVu3mMIy.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
- package/dist/src/{nova-sonic-P-CdUMlV.js → nova-sonic-Ogqf-csn.js} +6 -6
- package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
- package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
- package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
- package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
- package/dist/src/{openclaw-Bv1DINsX.js → openclaw-0Sv7AK3O.js} +172 -109
- package/dist/src/{openclaw-DAfWQn-o.cjs → openclaw-CXxbKgDH.cjs} +174 -110
- package/dist/src/{openclaw-BiSZPL7J.js → openclaw-D1FSCps-.js} +172 -109
- package/dist/src/{openclaw-D1D_ej1z.js → openclaw-D2ENvu7a.js} +173 -110
- package/dist/src/{opencode-sdk-D95s6SnR.js → opencode-sdk-C71Z0ehR.js} +13 -13
- package/dist/src/{opencode-sdk-DxUPkLT7.js → opencode-sdk-CHCs7dEb.js} +12 -12
- package/dist/src/{opencode-sdk-C7m-wRfI.js → opencode-sdk-DDxj4QqH.js} +12 -12
- package/dist/src/{opencode-sdk-CfaLN8PY.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
- package/dist/src/{otlpReceiver-g3ByGaXs.js → otlpReceiver-C9KlUtxh.js} +6 -6
- package/dist/src/{otlpReceiver--AIRW_S4.js → otlpReceiver-CZL48YfC.js} +6 -6
- package/dist/src/{otlpReceiver-Bn5wGB1v.js → otlpReceiver-CavGAA6k.js} +6 -6
- package/dist/src/{otlpReceiver-Diec4cln.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
- package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
- package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
- package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
- package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
- package/dist/src/providers-BSLEaIQG.js +32 -0
- package/dist/src/{providers-CFu-TZl-.cjs → providers-CScd1wN6.cjs} +733 -464
- package/dist/src/{providers-CFLy1_ji.js → providers-Ch6Mr0gn.js} +795 -526
- package/dist/src/{providers-BKRJTjBz.js → providers-Cn73d5sr.js} +795 -526
- package/dist/src/providers-D-FnDg8k.cjs +31 -0
- package/dist/src/providers-DEYiFVAo.js +30 -0
- package/dist/src/{providers-B3HvufyI.js → providers-DvddrgxL.js} +795 -526
- package/dist/src/providers-sS2WI8YD.js +30 -0
- package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
- package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
- package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
- package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
- package/dist/src/{quiverai-CI6gYJVI.js → quiverai-BeofbLVc.js} +4 -4
- package/dist/src/{quiverai-MHSxbmmZ.js → quiverai-CCQn73lq.js} +5 -5
- package/dist/src/{quiverai-CLkWkyZc.cjs → quiverai-CcUhPIBg.cjs} +4 -4
- package/dist/src/{quiverai-C2jVwbH1.js → quiverai-DVSEqJiq.js} +4 -4
- package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
- package/dist/src/{responses-CGw0DCzh.js → responses-BKP_WYis.js} +16 -12
- package/dist/src/{responses-BKqJmhhc.js → responses-CQb1Tj69.js} +16 -12
- package/dist/src/{responses-jxdehPkC.js → responses-CgNyTPsY.js} +16 -12
- package/dist/src/{responses-tD4Bd4dc.cjs → responses-mo0KQDbu.cjs} +16 -12
- package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
- package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
- package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
- package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
- package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
- package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
- package/dist/src/{sagemaker-BK4Zb993.js → sagemaker-CVv8W7so.js} +17 -17
- package/dist/src/{sagemaker-D2Q1c-sD.js → sagemaker-CqeASYE5.js} +17 -17
- package/dist/src/{sagemaker-BfiWTmvn.js → sagemaker-MUbD5V3v.js} +18 -18
- package/dist/src/{sagemaker-CcQHM1jV.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
- package/dist/src/{scanner-J8CA3LsV.js → scanner-DVDeUz1r.js} +10 -10
- package/dist/src/server/index.js +864 -112
- package/dist/src/server-B0Xh1Gx-.js +7 -0
- package/dist/src/{server-B0PPuDw-.cjs → server-BtoCXeXI.cjs} +4 -4
- package/dist/src/{server-BC7XJFgr.js → server-CP9qKM40.js} +4 -4
- package/dist/src/{server-OAs3nBRT.js → server-Cns05F1j.js} +5 -5
- package/dist/src/server-DJTKu9IR.cjs +5 -0
- package/dist/src/{server-DbFphssR.js → server-DZ9MtCn0.js} +6 -6
- package/dist/src/{signal-BOTbd53Z.js → signal-C3ZTsUgi.js} +3 -3
- package/dist/src/{slack-DXMKtA-f.js → slack-2sdpGzbt.js} +2 -2
- package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
- package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
- package/dist/src/{slack-DOdy_kyv.js → slack-DCEV-vWP.js} +2 -2
- package/dist/src/store-C5u6MgC8.js +6 -0
- package/dist/src/{store-BSc-TF2w.cjs → store-CLyU7AtI.cjs} +17 -5
- package/dist/src/store-CNHk-De4.cjs +5 -0
- package/dist/src/{store-DQLEjuEO.js → store-Cj258DgL.js} +17 -5
- package/dist/src/{store-D1tv90v3.js → store-P8OKm19S.js} +17 -5
- package/dist/src/{store-Ub2vaGJ1.js → store-VB0GP46K.js} +17 -5
- package/dist/src/{tables-xKANLRBD.js → tables-BEIFz2tM.js} +3 -3
- package/dist/src/{tables-C7K-XKWp.cjs → tables-BdZQEpRz.cjs} +3 -3
- package/dist/src/{tables-D36WTqKX.js → tables-DmzvLbeZ.js} +3 -3
- package/dist/src/{tables-5EvT_Bwn.js → tables-kC7R5kiK.js} +3 -3
- package/dist/src/{telemetry-C2YDkUQH.js → telemetry-BnH5VJAU.js} +4 -4
- package/dist/src/{telemetry-C15ziL8u.js → telemetry-BugWqKiu.js} +4 -4
- package/dist/src/{telemetry-DMb2Mpfm.js → telemetry-DPXLd7UE.js} +4 -4
- package/dist/src/telemetry-Yig0Tino.js +7 -0
- package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
- package/dist/src/{telemetry-CbrnxHp_.cjs → telemetry-re627Lre.cjs} +4 -4
- package/dist/src/{transcription-CL78qbOU.cjs → transcription-BvtsrzRG.cjs} +13 -13
- package/dist/src/{transcription-DAtxHhAM.js → transcription-CaMivnjG.js} +13 -13
- package/dist/src/{transcription-QHh3AH6Z.js → transcription-DOMMTu01.js} +14 -14
- package/dist/src/{transcription-LNZTNUUL.js → transcription-Hb3VnC4M.js} +13 -13
- package/dist/src/{transform-DOcQeLld.cjs → transform-0BwoBsvO.cjs} +19 -5
- package/dist/src/{transform-DGxXocjk.js → transform-B2-jIv68.js} +8 -6
- package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
- package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
- package/dist/src/transform-ChNIpHz7.js +6 -0
- package/dist/src/{transform-Cgi24fJ7.js → transform-DrleutM3.js} +8 -6
- package/dist/src/{transform-DGLazrMm.js → transform-DyDAwEpE.js} +8 -6
- package/dist/src/transform-PtQ6rAE3.cjs +5 -0
- package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
- package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
- package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
- package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
- package/dist/src/{types-CH3Ge2sE.js → types-CIhFeUC4.js} +45 -11
- package/dist/src/{types-CN_TZ2GJ.js → types-Cd3ygw8W.js} +45 -11
- package/dist/src/{types-LJ0r3wbR.cjs → types-D8cGDZbL.cjs} +46 -12
- package/dist/src/{types-CLKiCBW3.js → types-q8GXGF65.js} +45 -11
- package/dist/src/{util-CchiqXh_.cjs → util--9u9UVCt.cjs} +3 -3
- package/dist/src/{util-5cB-L7U3.js → util-BLvy9qfE.js} +7 -11
- package/dist/src/{util-YT5HPZaS.js → util-Bm3E9jpK.js} +7 -11
- package/dist/src/{util-6-GqIvzS.js → util-BtoGs5Cb.js} +18 -4
- package/dist/src/{util-Db0a0AFH.cjs → util-CFj4YKIn.cjs} +18 -4
- package/dist/src/{util-Dlz_Wvgm.js → util-CMMkIxfU.js} +7 -11
- package/dist/src/{util-Betm42rL.js → util-CgDCK4KI.js} +18 -4
- package/dist/src/{util-Yz-1aEhW.cjs → util-CuLo2pMR.cjs} +7 -11
- package/dist/src/{util-C-PPYSMq.js → util-DM2rTn_6.js} +18 -4
- package/dist/src/{util-B7T3SiBS.js → util-DMFeUvLz.js} +3 -3
- package/dist/src/{util-ZZH-3QZz.js → util-DbVG-yZU.js} +3 -3
- package/dist/src/{util-DaWTWKBK.js → util-vNmDL5DT.js} +3 -3
- package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
- package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
- package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
- package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -38
- package/dist/src/app/assets/index-BFCZg7hQ.js +0 -439
- package/dist/src/app/assets/index-NCn4eVBv.css +0 -1
- package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
- package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
- package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
- package/dist/src/cache-Bbn1Nyrd.cjs +0 -5
- package/dist/src/cache-BwsMSda7.js +0 -6
- package/dist/src/cloud-DmE0EwsY.js +0 -4
- package/dist/src/eval-17JizQIv.js +0 -15
- package/dist/src/evalResult-Cqj8pldJ.js +0 -12
- package/dist/src/evalResult-DvcJAWJU.cjs +0 -10
- package/dist/src/evalResult-Hftn-S_i.js +0 -10
- package/dist/src/evaluator-B2CFNt-P.js +0 -36
- package/dist/src/fetch-KV5kNASw.js +0 -5
- package/dist/src/graders-Bu0H9nXi.js +0 -32
- package/dist/src/graders-Cfhkvx-e.js +0 -34
- package/dist/src/graders-DClJVpGP.cjs +0 -32
- package/dist/src/graders-DcnJsrMO.js +0 -32
- package/dist/src/providers-C1rOSHiR.js +0 -32
- package/dist/src/providers-CxmDwEFf.cjs +0 -31
- package/dist/src/providers-Dodakqr0.js +0 -30
- package/dist/src/providers-GIQ2TcsA.js +0 -30
- package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
- package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
- package/dist/src/server-B1vi21hA.js +0 -7
- package/dist/src/server-Cm9Kai_h.cjs +0 -5
- package/dist/src/store-BNmZ1KAz.cjs +0 -5
- package/dist/src/store-BltJg2cd.js +0 -6
- package/dist/src/telemetry-5BCRNBbe.cjs +0 -5
- package/dist/src/telemetry-D4W5hboe.js +0 -7
- package/dist/src/transform-DTGDnAzW.js +0 -6
- package/dist/src/transform-m3qNw4KP.cjs +0 -5
package/dist/src/index.js
CHANGED
|
@@ -1,40 +1,40 @@
|
|
|
1
|
-
import { C as
|
|
1
|
+
import { C as getEnvString, D as state, E as isCI, S as getEnvInt, T as getMaxEvalTimeMs, _ as safeJsonStringify, a as logger, b as getEnvBool, g as orderKeys, m as getAjv, n as globalLogCallback, o as setLogCallback, p as extractJsonObjects, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as summarizeEvaluateResultForLogging, w as getEvalTimeoutMs, x as getEnvFloat } from "./logger-DO8_zM18.js";
|
|
2
2
|
import { t as invariant } from "./invariant-Ddh24eXh.js";
|
|
3
|
-
import { r as importModule, t as getDirectory } from "./esm-
|
|
4
|
-
import { r as runPython } from "./pythonUtils-
|
|
3
|
+
import { r as importModule, t as getDirectory } from "./esm-SUNIX1x3.js";
|
|
4
|
+
import { r as runPython } from "./pythonUtils-Cpo0Ez1p.js";
|
|
5
5
|
import { i as isJavascriptFile } from "./fileExtensions-DnqA1y9x.js";
|
|
6
|
-
import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-
|
|
7
|
-
import { $ as matchesSearchRubric, A as BeavertailsPlugin, B as getAndCheckProvider, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as retryWithDeduplication, G as matchesContextFaithfulness, H as matchesAnswerRelevance, I as sampleArray, J as matchesFactuality, K as matchesContextRecall, L as fetchHuggingFaceDataset, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as getCustomPolicies, Q as matchesPiScore, R as callProviderWithContext, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as
|
|
8
|
-
import { A as isApiProvider, C as TestGeneratorConfigSchema, Ct as BaseTokenUsageSchema, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as PromptSchema, T as TestSuiteSchema, Tt as InputsSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TELECOM_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as isProviderOptions, k as isResultFailureReason, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CompletionTokenDetailsSchema, x as TestCasesWithMetadataPromptSchema, xt as UNALIGNED_PROVIDER_HARM_PLUGINS, y as TestCaseSchema, z as StrategyConfigSchema } from "./types-
|
|
9
|
-
import { A as getProviderDescription, C as deduplicateTestCases, D as resultIsForTestCase, E as getTestCaseDeduplicationKey, M as isGoogleProvider, N as isOpenAiProvider, O as checkProviderApiKeys, P as isProviderAllowed, S as setupEnv, T as filterRuntimeVars, b as loadFunction, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, h as renderEnvOnlyInObject, i as fetchCsvFromGoogleSheet, j as isAnthropicProvider, k as doesProviderRefMatch, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as extractVariablesFromTemplates, w as extractRuntimeVars, x as parseFileUrl, y as getNunjucksEngine } from "./util-
|
|
10
|
-
import { A as getShareApiBaseUrl, F as HUMAN_ASSERTION_TYPE, N as VERSION, O as TERMINAL_MAX_WIDTH, P as FILE_METADATA_KEY, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareViewBaseUrl, k as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "./fetch-
|
|
11
|
-
import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "./cache-
|
|
12
|
-
import { A as createRateLimitRegistry, B as isCloudProvider, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getCloudDatabaseId, I as getEvalConfigFromCloud, J as AIStudioChatProvider, L as getOrgContext, M as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as checkCloudPermissions, R as getPluginSeverityOverridesFromCloud, T as runExtensionHook, V as resolveTeamId, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, q as VertexChatProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId } from "./providers-
|
|
13
|
-
import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-
|
|
6
|
+
import { i as getProcessShim, n as transform, t as TransformInputType } from "./transform-BqPkNPYm.js";
|
|
7
|
+
import { $ as matchesSearchRubric, A as BeavertailsPlugin, B as getAndCheckProvider, C as HarmbenchPlugin, D as DebugAccessPlugin, E as DivergentRepetitionPlugin, F as retryWithDeduplication, G as matchesContextFaithfulness, H as matchesAnswerRelevance, I as sampleArray, J as matchesFactuality, K as matchesContextRecall, L as fetchHuggingFaceDataset, M as RedteamGraderBase, N as RedteamPluginBase, O as CrossSessionLeakPlugin, P as getCustomPolicies, Q as matchesPiScore, R as callProviderWithContext, S as ImitationPlugin, T as ExcessiveAgencyPlugin, U as matchesClassification, V as loadRubricPrompt, W as matchesClosedQa, X as matchesLlmRubric, Y as matchesGEval, Z as matchesModeration, _ as makeInlinePolicyIdSync, a as UnverifiableClaimsPlugin, at as DefaultSuggestionsProvider, b as OverreliancePlugin, c as ToolDiscoveryPlugin, ct as readProviderPromptMap, d as RbacPlugin, dt as getFinalTest, et as matchesSelectBest, f as PromptExtractionPlugin, ft as loadFromJavaScriptFile, g as isValidPolicyObject, h as determinePolicyTypeFromId, i as VLGuardPlugin, it as getDefaultProviders, j as AegisPlugin, k as ContractPlugin, l as SqlInjectionPlugin, lt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, m as PolicyPlugin, mt as resolveContext, n as getGraderById, nt as matchesTrajectoryGoalSuccess, o as UnsafeBenchPlugin, ot as processPrompts, p as PoliticsPlugin, pt as processFileReference, q as matchesContextRelevance, r as VLSUPlugin, rt as selectMaxScore, s as ToxicChatPlugin, st as readPrompts, t as GRADERS, tt as matchesSimilarity, u as ShellInjectionPlugin, ut as coerceString, v as PlinyPlugin, w as HallucinationPlugin, x as IntentPlugin, y as getPiiLeakTestsForCategory, z as fail } from "./graders-DG7mhg-b.js";
|
|
8
|
+
import { A as isApiProvider, C as TestGeneratorConfigSchema, Ct as BaseTokenUsageSchema, D as VarsSchema, E as UnifiedConfigSchema, F as ConversationMessageSchema, I as PartialGenerationError, J as getDefaultNFanout, K as STRATEGY_COLLECTIONS, L as PluginConfigSchema, M as RedteamConfigSchema, O as isGradingResult, P as ProvidersSchema, Q as categoryAliases, R as PolicyObjectSchema, S as TestCasesWithMetadataSchema, St as PromptSchema, T as TestSuiteSchema, Tt as InputsSchema, V as isUuid, W as DEFAULT_STRATEGIES, X as isFanoutStrategy, Z as Severity, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TELECOM_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as riskCategorySeverityMap, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as isProviderOptions, k as isResultFailureReason, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as BIAS_PLUGINS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as STRATEGY_COLLECTION_MAPPINGS, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as ALIASED_PLUGIN_MAPPINGS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CompletionTokenDetailsSchema, x as TestCasesWithMetadataPromptSchema, xt as UNALIGNED_PROVIDER_HARM_PLUGINS, y as TestCaseSchema, z as StrategyConfigSchema } from "./types-q8GXGF65.js";
|
|
9
|
+
import { A as getProviderDescription, C as deduplicateTestCases, D as resultIsForTestCase, E as getTestCaseDeduplicationKey, M as isGoogleProvider, N as isOpenAiProvider, O as checkProviderApiKeys, P as isProviderAllowed, S as setupEnv, T as filterRuntimeVars, b as loadFunction, c as maybeLoadFromExternalFile, d as maybeLoadToolsFromExternalFile, h as renderEnvOnlyInObject, i as fetchCsvFromGoogleSheet, j as isAnthropicProvider, k as doesProviderRefMatch, m as readOutput, n as writeMultipleOutputs, p as readFilters, r as writeOutput, s as maybeLoadConfigFromExternalFile, t as printBorder, v as extractVariablesFromTemplates, w as extractRuntimeVars, x as parseFileUrl, y as getNunjucksEngine } from "./util-CMMkIxfU.js";
|
|
10
|
+
import { A as getShareApiBaseUrl, F as HUMAN_ASSERTION_TYPE, N as VERSION, O as TERMINAL_MAX_WIDTH, P as FILE_METADATA_KEY, _ as isPromptfooSampleTarget, a as CloudConfig, b as parseChatPrompt, d as sleep, j as getShareViewBaseUrl, k as getDefaultShareViewBaseUrl, n as fetchWithRetries, o as cloudConfig, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, u as getCurrentTimestamp } from "./fetch-CVAtKnI3.js";
|
|
11
|
+
import { i as getCache, n as disableCache, o as NON_TRANSIENT_HTTP_STATUSES, r as fetchWithCache, s as isNonTransientHttpStatus, t as cache_exports } from "./cache-CaT5tPgo.js";
|
|
12
|
+
import { A as createRateLimitRegistry, B as isCloudProvider, C as collectFileMetadata, D as loadFromPackage, E as isPackagePath, F as getCloudDatabaseId, I as getEvalConfigFromCloud, J as AIStudioChatProvider, L as getOrgContext, M as PromptfooHarmfulCompletionProvider, O as redteamProviderManager, P as checkCloudPermissions, R as getPluginSeverityOverridesFromCloud, T as runExtensionHook, V as resolveTeamId, _ as extractVariablesFromJson, a as resolveProviderConfigs, b as isBasicRefusal, c as Strategies, d as pluginMatchesStrategyTargets, f as checkExfilTracking, g as extractPromptFromTags, i as resolveProvider, j as createProviderRateLimitOptions, k as TokenUsageTracker, l as loadStrategy, m as extractGoalFromPrompt, n as loadApiProvider, o as MCPProvider, q as VertexChatProvider, r as loadApiProviders, s as GoogleLiveProvider, t as getProviderIds, u as validateStrategies, v as getSessionId, w as renderPrompt, y as getShortPluginId } from "./providers-Cn73d5sr.js";
|
|
13
|
+
import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-CFxO9KGo.js";
|
|
14
14
|
import { n as sha256, t as randomSequence } from "./createHash-DmPQkvBh.js";
|
|
15
15
|
import "./genaiTracer-D3fD9dNV.js";
|
|
16
|
-
import { t as OpenAiChatCompletionProvider } from "./chat-
|
|
16
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-pxmiVpWe.js";
|
|
17
17
|
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-NYT-WKS6.js";
|
|
18
|
-
import { m as validateFunctionCall } from "./transform-
|
|
19
|
-
import "./messages-
|
|
20
|
-
import "./util-
|
|
21
|
-
import "./responses-
|
|
22
|
-
import "./openai-
|
|
23
|
-
import { l as validateFunctionCall$1 } from "./util-
|
|
24
|
-
import "./completion-
|
|
25
|
-
import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-
|
|
26
|
-
import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-
|
|
27
|
-
import { t as getBlobByHash } from "./blobs-
|
|
28
|
-
import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-
|
|
29
|
-
import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-
|
|
30
|
-
import { t as telemetry } from "./telemetry-
|
|
18
|
+
import { m as validateFunctionCall } from "./transform-DyDAwEpE.js";
|
|
19
|
+
import "./messages-Dy9QecMs.js";
|
|
20
|
+
import "./util-vNmDL5DT.js";
|
|
21
|
+
import "./responses-CQb1Tj69.js";
|
|
22
|
+
import "./openai-BcB5KlTk.js";
|
|
23
|
+
import { l as validateFunctionCall$1 } from "./util-CgDCK4KI.js";
|
|
24
|
+
import "./completion-DCjv7RZ3.js";
|
|
25
|
+
import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-B2XmGjty.js";
|
|
26
|
+
import { i as getRemoteGenerationUrl, l as shouldGenerateRemote, o as getRemoteHealthUrl, r as promptYesNo, s as neverGenerateRemote } from "./server-CP9qKM40.js";
|
|
27
|
+
import { t as getBlobByHash } from "./blobs-DXTl6J3H.js";
|
|
28
|
+
import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-kC7R5kiK.js";
|
|
29
|
+
import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-D_wd8jxt.js";
|
|
30
|
+
import { t as telemetry } from "./telemetry-BugWqKiu.js";
|
|
31
31
|
import { t as ellipsize } from "./text-B_UCRPp2.js";
|
|
32
|
-
import { t as getTraceStore } from "./store-
|
|
33
|
-
import "./base-
|
|
34
|
-
import "./image-
|
|
35
|
-
import { t as providerRegistry } from "./providerRegistry-
|
|
36
|
-
import { n as runRuby } from "./rubyUtils-
|
|
37
|
-
import { t as EvalResult } from "./evalResult-
|
|
32
|
+
import { t as getTraceStore } from "./store-Cj258DgL.js";
|
|
33
|
+
import "./base-CqzQ4K8j.js";
|
|
34
|
+
import "./image-CoxZp9PZ.js";
|
|
35
|
+
import { t as providerRegistry } from "./providerRegistry-CUWki5mQ.js";
|
|
36
|
+
import { n as runRuby } from "./rubyUtils-PgU-gHmx.js";
|
|
37
|
+
import { t as EvalResult } from "./evalResult-BkIhRdTe.js";
|
|
38
38
|
import * as fs$1 from "fs";
|
|
39
39
|
import fs, { createWriteStream } from "fs";
|
|
40
40
|
import * as path$2 from "path";
|
|
@@ -56,11 +56,13 @@ import { XMLParser } from "fast-xml-parser";
|
|
|
56
56
|
import crypto$1, { createHash, randomBytes } from "crypto";
|
|
57
57
|
import { DiagConsoleLogger, DiagLogLevel, diag, propagation } from "@opentelemetry/api";
|
|
58
58
|
import input from "@inquirer/input";
|
|
59
|
+
import readline from "readline";
|
|
59
60
|
import { and, desc, eq, inArray, sql } from "drizzle-orm";
|
|
60
61
|
import cliProgress from "cli-progress";
|
|
61
62
|
import { JSDOM } from "jsdom";
|
|
62
63
|
import { distance } from "fastest-levenshtein";
|
|
63
64
|
import * as rouge from "js-rouge";
|
|
65
|
+
import { isDeepStrictEqual } from "node:util";
|
|
64
66
|
import "debounce";
|
|
65
67
|
import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
|
|
66
68
|
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
@@ -288,7 +290,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
288
290
|
telemetry.record("feature_used", { feature: "tracing" });
|
|
289
291
|
try {
|
|
290
292
|
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
291
|
-
const { startOTLPReceiver } = await import("./otlpReceiver
|
|
293
|
+
const { startOTLPReceiver } = await import("./otlpReceiver-CZL48YfC.js");
|
|
292
294
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
293
295
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
294
296
|
logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
@@ -311,7 +313,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
311
313
|
async function stopOtlpReceiverIfNeeded() {
|
|
312
314
|
if (otlpReceiverStarted) try {
|
|
313
315
|
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
314
|
-
const { stopOTLPReceiver } = await import("./otlpReceiver
|
|
316
|
+
const { stopOTLPReceiver } = await import("./otlpReceiver-CZL48YfC.js");
|
|
315
317
|
await stopOTLPReceiver();
|
|
316
318
|
otlpReceiverStarted = false;
|
|
317
319
|
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -346,7 +348,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
346
348
|
}
|
|
347
349
|
if (!tracingEnabled) return null;
|
|
348
350
|
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
349
|
-
const { getTraceStore } = await import("./store-
|
|
351
|
+
const { getTraceStore } = await import("./store-Cj258DgL.js").then((n) => n.n);
|
|
350
352
|
const traceStore = getTraceStore();
|
|
351
353
|
const traceId = generateTraceId();
|
|
352
354
|
const spanId = generateSpanId();
|
|
@@ -1379,7 +1381,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
|
|
|
1379
1381
|
pass = result !== inverse;
|
|
1380
1382
|
score = pass ? 1 : 0;
|
|
1381
1383
|
} else if (typeof result === "number") {
|
|
1382
|
-
pass = assertion.threshold
|
|
1384
|
+
pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
|
|
1383
1385
|
score = result;
|
|
1384
1386
|
} else if (typeof result === "object") return result;
|
|
1385
1387
|
else throw new Error("Custom function must return a boolean or number");
|
|
@@ -1412,7 +1414,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1412
1414
|
} catch {
|
|
1413
1415
|
pass = inverse;
|
|
1414
1416
|
}
|
|
1415
|
-
if (
|
|
1417
|
+
if (parsedJson !== void 0 && renderedValue) {
|
|
1416
1418
|
let validate;
|
|
1417
1419
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
1418
1420
|
const schema = valueFromScript;
|
|
@@ -1424,11 +1426,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1424
1426
|
}
|
|
1425
1427
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1426
1428
|
else throw new Error("is-json assertion must have a string or object value");
|
|
1427
|
-
|
|
1429
|
+
const valid = validate(parsedJson);
|
|
1430
|
+
pass = inverse ? !valid : valid;
|
|
1428
1431
|
if (!pass) return {
|
|
1429
1432
|
pass,
|
|
1430
1433
|
score: 0,
|
|
1431
|
-
reason: `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1434
|
+
reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`,
|
|
1432
1435
|
assertion
|
|
1433
1436
|
};
|
|
1434
1437
|
}
|
|
@@ -1455,9 +1458,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1455
1458
|
}
|
|
1456
1459
|
else if (typeof renderedValue === "object") validate = getAjv().compile(renderedValue);
|
|
1457
1460
|
else throw new Error("contains-json assertion must have a string or object value");
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
+
const valid = validate(jsonObject);
|
|
1462
|
+
pass = inverse ? !valid : valid;
|
|
1463
|
+
if (valid) {
|
|
1464
|
+
if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
|
|
1465
|
+
break;
|
|
1466
|
+
} else errorMessage = `JSON does not conform to the provided schema. Errors: ${getAjv().errorsText(validate.errors)}`;
|
|
1461
1467
|
}
|
|
1462
1468
|
return {
|
|
1463
1469
|
pass,
|
|
@@ -1641,7 +1647,7 @@ function handlePerplexity({ logProbs, assertion }) {
|
|
|
1641
1647
|
if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
|
|
1642
1648
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
1643
1649
|
const perplexity = Math.exp(-avgLogProb);
|
|
1644
|
-
const pass = assertion.threshold
|
|
1650
|
+
const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
|
|
1645
1651
|
return {
|
|
1646
1652
|
pass,
|
|
1647
1653
|
score: pass ? 1 : 0,
|
|
@@ -1653,7 +1659,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1653
1659
|
if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
|
|
1654
1660
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
1655
1661
|
const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
|
|
1656
|
-
const pass = assertion.threshold
|
|
1662
|
+
const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
|
|
1657
1663
|
return {
|
|
1658
1664
|
pass,
|
|
1659
1665
|
score: perplexityNorm,
|
|
@@ -1768,7 +1774,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1768
1774
|
} else {
|
|
1769
1775
|
score = Number.parseFloat(String(result));
|
|
1770
1776
|
if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
1771
|
-
pass = assertion.threshold
|
|
1777
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
1772
1778
|
}
|
|
1773
1779
|
} catch (err) {
|
|
1774
1780
|
return {
|
|
@@ -2029,7 +2035,7 @@ end
|
|
|
2029
2035
|
} else {
|
|
2030
2036
|
score = Number.parseFloat(String(result));
|
|
2031
2037
|
if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
2032
|
-
pass = assertion.threshold
|
|
2038
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
2033
2039
|
}
|
|
2034
2040
|
} catch (err) {
|
|
2035
2041
|
return {
|
|
@@ -2100,6 +2106,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2100
2106
|
};
|
|
2101
2107
|
};
|
|
2102
2108
|
//#endregion
|
|
2109
|
+
//#region src/assertions/traceUtils.ts
|
|
2110
|
+
/**
|
|
2111
|
+
* Shared utilities for trace assertions
|
|
2112
|
+
*/
|
|
2113
|
+
/**
|
|
2114
|
+
* Match a span name against a glob-like pattern.
|
|
2115
|
+
* Supports * (any characters) and ? (single character) wildcards.
|
|
2116
|
+
*
|
|
2117
|
+
* @param spanName - The span name to match
|
|
2118
|
+
* @param pattern - The glob pattern to match against
|
|
2119
|
+
* @returns true if the span name matches the pattern
|
|
2120
|
+
*/
|
|
2121
|
+
function matchesPattern(spanName, pattern) {
|
|
2122
|
+
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2123
|
+
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2124
|
+
}
|
|
2125
|
+
//#endregion
|
|
2126
|
+
//#region src/assertions/skill.ts
|
|
2127
|
+
function getSkillCalls(params) {
|
|
2128
|
+
const rawSkillCalls = params.providerResponse?.metadata?.skillCalls;
|
|
2129
|
+
if (!Array.isArray(rawSkillCalls)) return [];
|
|
2130
|
+
return rawSkillCalls.filter((entry) => Boolean(entry) && typeof entry === "object" && typeof entry.name === "string");
|
|
2131
|
+
}
|
|
2132
|
+
function matchesSkill(skillCall, matcher) {
|
|
2133
|
+
if (matcher.name && skillCall.name !== matcher.name) return false;
|
|
2134
|
+
if (matcher.pattern && !matchesPattern(skillCall.name, matcher.pattern)) return false;
|
|
2135
|
+
return true;
|
|
2136
|
+
}
|
|
2137
|
+
function formatSkillCall(skillCall) {
|
|
2138
|
+
const details = [skillCall.source, skillCall.path].filter(Boolean).join(", ");
|
|
2139
|
+
return details ? `${skillCall.name} (${details})` : skillCall.name;
|
|
2140
|
+
}
|
|
2141
|
+
function resolveSkillMatchers(value) {
|
|
2142
|
+
const normalizeText = (text) => typeof text === "string" ? text.trim() : void 0;
|
|
2143
|
+
const validateCount = (field, count) => {
|
|
2144
|
+
if (!Number.isFinite(count) || !Number.isInteger(count) || count < 0) throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
|
|
2145
|
+
};
|
|
2146
|
+
if (typeof value === "string" && value.trim()) return {
|
|
2147
|
+
kind: "list",
|
|
2148
|
+
matchers: [{ name: normalizeText(value) }]
|
|
2149
|
+
};
|
|
2150
|
+
if (Array.isArray(value) && value.length > 0 && value.every((item) => typeof item === "string" && item.trim())) return {
|
|
2151
|
+
kind: "list",
|
|
2152
|
+
matchers: value.map((item) => ({ name: item.trim() }))
|
|
2153
|
+
};
|
|
2154
|
+
if (value && typeof value === "object" && !Array.isArray(value)) {
|
|
2155
|
+
const rawMatcher = value;
|
|
2156
|
+
const matcher = rawMatcher;
|
|
2157
|
+
const name = normalizeText(matcher.name);
|
|
2158
|
+
const pattern = normalizeText(matcher.pattern);
|
|
2159
|
+
if (!name && !pattern) throw new Error("skill-used assertion object must include a name or pattern property");
|
|
2160
|
+
if ("min" in rawMatcher) validateCount("min", matcher.min);
|
|
2161
|
+
if ("max" in rawMatcher) validateCount("max", matcher.max);
|
|
2162
|
+
if (typeof matcher.min === "number" && typeof matcher.max === "number" && matcher.max < matcher.min) throw new Error("skill-used assertion object max must be greater than or equal to min");
|
|
2163
|
+
return {
|
|
2164
|
+
kind: "count",
|
|
2165
|
+
matcher: {
|
|
2166
|
+
max: typeof matcher.max === "number" ? matcher.max : void 0,
|
|
2167
|
+
min: typeof matcher.min === "number" ? matcher.min : void 0,
|
|
2168
|
+
name,
|
|
2169
|
+
pattern
|
|
2170
|
+
}
|
|
2171
|
+
};
|
|
2172
|
+
}
|
|
2173
|
+
throw new Error("skill-used assertion must have a string, string array, or object value");
|
|
2174
|
+
}
|
|
2175
|
+
function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
|
|
2176
|
+
const missing = expected.matchers.filter((matcher) => !skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
|
|
2177
|
+
const matched = expected.matchers.filter((matcher) => skillCalls.some((skillCall) => matchesSkill(skillCall, matcher)));
|
|
2178
|
+
const pass = params.inverse ? matched.length === 0 : missing.length === 0;
|
|
2179
|
+
const expectedSkills = expected.matchers.map((matcher) => matcher.name);
|
|
2180
|
+
const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
|
|
2181
|
+
let reason;
|
|
2182
|
+
if (params.inverse) reason = pass ? `Forbidden skill(s) were not used: ${expectedSkills.join(", ")}` : `Forbidden skill(s) were used: ${matched.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
|
|
2183
|
+
else if (pass) reason = `Observed required skill(s): ${expectedSkills.join(", ")}. Actual skills: ${actualSummary}`;
|
|
2184
|
+
else reason = `Missing required skill(s): ${missing.map((matcher) => matcher.name).join(", ")}. Actual skills: ${actualSummary}`;
|
|
2185
|
+
return {
|
|
2186
|
+
pass,
|
|
2187
|
+
score: pass ? 1 : 0,
|
|
2188
|
+
reason,
|
|
2189
|
+
assertion: params.assertion
|
|
2190
|
+
};
|
|
2191
|
+
}
|
|
2192
|
+
function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
|
|
2193
|
+
const hasExplicitMin = matcher.min !== void 0;
|
|
2194
|
+
const hasExplicitMax = matcher.max !== void 0;
|
|
2195
|
+
const min = matcher.min ?? (hasExplicitMax ? 0 : 1);
|
|
2196
|
+
const max = matcher.max;
|
|
2197
|
+
const matchingSkillCalls = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
|
|
2198
|
+
const count = matchingSkillCalls.length;
|
|
2199
|
+
const matcherLabel = matcher.pattern || matcher.name || "*";
|
|
2200
|
+
if (params.inverse) {
|
|
2201
|
+
if (hasExplicitMin || hasExplicitMax && max !== 0) throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
|
|
2202
|
+
const pass = count === 0;
|
|
2203
|
+
const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
|
|
2204
|
+
return {
|
|
2205
|
+
pass,
|
|
2206
|
+
score: pass ? 1 : 0,
|
|
2207
|
+
reason: pass ? `Forbidden skill "${matcherLabel}" was not used. Actual skills: ${actualSummary}` : `Forbidden skill "${matcherLabel}" was used ${count} time(s). Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`,
|
|
2208
|
+
assertion: params.assertion
|
|
2209
|
+
};
|
|
2210
|
+
}
|
|
2211
|
+
const pass = count >= min && (max === void 0 || count <= max);
|
|
2212
|
+
let reason = `Matched skill "${matcherLabel}" ${count} time(s)`;
|
|
2213
|
+
reason += max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
|
|
2214
|
+
if (matchingSkillCalls.length > 0) reason += `. Matches: ${matchingSkillCalls.map(formatSkillCall).join(", ")}`;
|
|
2215
|
+
return {
|
|
2216
|
+
pass,
|
|
2217
|
+
score: pass ? 1 : 0,
|
|
2218
|
+
reason,
|
|
2219
|
+
assertion: params.assertion
|
|
2220
|
+
};
|
|
2221
|
+
}
|
|
2222
|
+
function handleSkillUsed(params) {
|
|
2223
|
+
const skillCalls = getSkillCalls(params);
|
|
2224
|
+
const actualSkills = skillCalls.map(formatSkillCall);
|
|
2225
|
+
const expected = resolveSkillMatchers(params.renderedValue ?? params.assertion.value);
|
|
2226
|
+
if (expected.kind === "list") return handleListSkillAssertion(params, skillCalls, actualSkills, expected);
|
|
2227
|
+
return handleCountSkillAssertion(params, skillCalls, actualSkills, expected.matcher);
|
|
2228
|
+
}
|
|
2229
|
+
//#endregion
|
|
2103
2230
|
//#region src/assertions/sql.ts
|
|
2104
2231
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
2105
2232
|
let pass = false;
|
|
@@ -2332,23 +2459,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2332
2459
|
};
|
|
2333
2460
|
};
|
|
2334
2461
|
//#endregion
|
|
2335
|
-
//#region src/assertions/traceUtils.ts
|
|
2336
|
-
/**
|
|
2337
|
-
* Shared utilities for trace assertions
|
|
2338
|
-
*/
|
|
2339
|
-
/**
|
|
2340
|
-
* Match a span name against a glob-like pattern.
|
|
2341
|
-
* Supports * (any characters) and ? (single character) wildcards.
|
|
2342
|
-
*
|
|
2343
|
-
* @param spanName - The span name to match
|
|
2344
|
-
* @param pattern - The glob pattern to match against
|
|
2345
|
-
* @returns true if the span name matches the pattern
|
|
2346
|
-
*/
|
|
2347
|
-
function matchesPattern(spanName, pattern) {
|
|
2348
|
-
const regexPattern = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
|
|
2349
|
-
return new RegExp(`^${regexPattern}$`, "i").test(spanName);
|
|
2350
|
-
}
|
|
2351
|
-
//#endregion
|
|
2352
2462
|
//#region src/assertions/traceErrorSpans.ts
|
|
2353
2463
|
function isErrorSpan(span) {
|
|
2354
2464
|
if (span.statusCode && span.statusCode >= 400) return true;
|
|
@@ -2517,6 +2627,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2517
2627
|
};
|
|
2518
2628
|
};
|
|
2519
2629
|
//#endregion
|
|
2630
|
+
//#region src/assertions/trajectoryUtils.ts
|
|
2631
|
+
const TOOL_ATTRIBUTE_KEYS = [
|
|
2632
|
+
"tool.name",
|
|
2633
|
+
"tool_name",
|
|
2634
|
+
"tool",
|
|
2635
|
+
"function.name",
|
|
2636
|
+
"function_name",
|
|
2637
|
+
"gen_ai.tool.name",
|
|
2638
|
+
"codex.mcp.tool",
|
|
2639
|
+
"agent.tool",
|
|
2640
|
+
"agent.tool_name",
|
|
2641
|
+
"agent.toolName"
|
|
2642
|
+
];
|
|
2643
|
+
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
|
|
2644
|
+
"tool.arguments",
|
|
2645
|
+
"tool.args",
|
|
2646
|
+
"tool.input",
|
|
2647
|
+
"tool_arguments",
|
|
2648
|
+
"tool_args",
|
|
2649
|
+
"tool_input",
|
|
2650
|
+
"function.arguments",
|
|
2651
|
+
"function.args",
|
|
2652
|
+
"function.input",
|
|
2653
|
+
"function_arguments",
|
|
2654
|
+
"function_args",
|
|
2655
|
+
"gen_ai.tool.arguments",
|
|
2656
|
+
"gen_ai.tool.args",
|
|
2657
|
+
"gen_ai.tool.input",
|
|
2658
|
+
"gen_ai.tool.call.arguments",
|
|
2659
|
+
"gen_ai.tool.call.args",
|
|
2660
|
+
"agent.tool.arguments",
|
|
2661
|
+
"agent.tool.args",
|
|
2662
|
+
"agent.tool.input",
|
|
2663
|
+
"codex.mcp.arguments",
|
|
2664
|
+
"codex.mcp.args",
|
|
2665
|
+
"codex.mcp.input",
|
|
2666
|
+
"arguments",
|
|
2667
|
+
"args",
|
|
2668
|
+
"input"
|
|
2669
|
+
];
|
|
2670
|
+
const COMMAND_ATTRIBUTE_KEYS = [
|
|
2671
|
+
"codex.command",
|
|
2672
|
+
"command",
|
|
2673
|
+
"command.name",
|
|
2674
|
+
"command_name"
|
|
2675
|
+
];
|
|
2676
|
+
const SEARCH_ATTRIBUTE_KEYS = [
|
|
2677
|
+
"codex.search.query",
|
|
2678
|
+
"search.query",
|
|
2679
|
+
"search_query"
|
|
2680
|
+
];
|
|
2681
|
+
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
|
|
2682
|
+
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
|
|
2683
|
+
const MAX_JUDGE_SUMMARY_STEPS = 24;
|
|
2684
|
+
const JUDGE_SUMMARY_HEAD_STEPS = 12;
|
|
2685
|
+
const JUDGE_SUMMARY_TAIL_STEPS = 12;
|
|
2686
|
+
function getStringAttribute(attributes, keys) {
|
|
2687
|
+
for (const key of keys) {
|
|
2688
|
+
const value = attributes[key];
|
|
2689
|
+
if (typeof value === "string" && value.trim()) return value.trim();
|
|
2690
|
+
}
|
|
2691
|
+
}
|
|
2692
|
+
function normalizeStructuredAttribute(value) {
|
|
2693
|
+
if (value === void 0 || value === null) return;
|
|
2694
|
+
if (typeof value === "string") {
|
|
2695
|
+
const trimmed = value.trim();
|
|
2696
|
+
if (!trimmed) return;
|
|
2697
|
+
try {
|
|
2698
|
+
return JSON.parse(trimmed);
|
|
2699
|
+
} catch {
|
|
2700
|
+
return trimmed;
|
|
2701
|
+
}
|
|
2702
|
+
}
|
|
2703
|
+
if (typeof value === "number" || typeof value === "boolean" || typeof value === "object") return value;
|
|
2704
|
+
}
|
|
2705
|
+
function hasSameStatus(left, right) {
|
|
2706
|
+
return left?.code === right?.code && left?.message === right?.message;
|
|
2707
|
+
}
|
|
2708
|
+
function isSearchLikeSpan(span) {
|
|
2709
|
+
const attributes = span.attributes || {};
|
|
2710
|
+
if (SEARCH_SPAN_NAME_PATTERN.test(span.name) || span.name.startsWith("search ")) return true;
|
|
2711
|
+
return Object.keys(attributes).some((key) => key !== "query" && /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i.test(key));
|
|
2712
|
+
}
|
|
2713
|
+
function getTrajectoryStepStatus(step) {
|
|
2714
|
+
if (step.statusCode === void 0 || step.statusCode === 0) return;
|
|
2715
|
+
return {
|
|
2716
|
+
code: step.statusCode,
|
|
2717
|
+
...step.statusMessage ? { message: step.statusMessage } : {}
|
|
2718
|
+
};
|
|
2719
|
+
}
|
|
2720
|
+
function getCommandExecutable(command) {
|
|
2721
|
+
return command.trim().split(/\s+/)[0] || void 0;
|
|
2722
|
+
}
|
|
2723
|
+
function extractToolName(span) {
|
|
2724
|
+
const attributes = span.attributes || {};
|
|
2725
|
+
const directMatch = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
|
|
2726
|
+
if (directMatch) return directMatch;
|
|
2727
|
+
for (const [key, value] of Object.entries(attributes)) {
|
|
2728
|
+
if (typeof value !== "string" || !value.trim()) continue;
|
|
2729
|
+
if (/tool.?name|function.?name/i.test(key)) return value.trim();
|
|
2730
|
+
if (/(^|[._])tool($|[._])/i.test(key) && !/result|output/i.test(key)) return value.trim();
|
|
2731
|
+
}
|
|
2732
|
+
if (span.name.startsWith("mcp ")) {
|
|
2733
|
+
const slashIndex = span.name.lastIndexOf("/");
|
|
2734
|
+
if (slashIndex !== -1 && slashIndex < span.name.length - 1) return span.name.slice(slashIndex + 1).trim();
|
|
2735
|
+
}
|
|
2736
|
+
}
|
|
2737
|
+
function extractToolArgs(span) {
|
|
2738
|
+
const attributes = span.attributes || {};
|
|
2739
|
+
for (const key of TOOL_ARGUMENT_ATTRIBUTE_KEYS) {
|
|
2740
|
+
const value = normalizeStructuredAttribute(attributes[key]);
|
|
2741
|
+
if (value !== void 0) return value;
|
|
2742
|
+
}
|
|
2743
|
+
for (const [key, rawValue] of Object.entries(attributes)) {
|
|
2744
|
+
if (/result|output|error|status/i.test(key)) continue;
|
|
2745
|
+
if (!/(^|[._])(arguments|args|input)($|[._])/i.test(key)) continue;
|
|
2746
|
+
const value = normalizeStructuredAttribute(rawValue);
|
|
2747
|
+
if (value !== void 0) return value;
|
|
2748
|
+
}
|
|
2749
|
+
}
|
|
2750
|
+
function extractCommand(span) {
|
|
2751
|
+
const attributes = span.attributes || {};
|
|
2752
|
+
const directMatch = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
|
|
2753
|
+
if (directMatch) return directMatch;
|
|
2754
|
+
for (const [key, value] of Object.entries(attributes)) {
|
|
2755
|
+
if (typeof value !== "string" || !value.trim()) continue;
|
|
2756
|
+
if (/command/i.test(key) && !/output|result/i.test(key)) return value.trim();
|
|
2757
|
+
}
|
|
2758
|
+
if (span.name.startsWith("exec ")) return span.name.slice(5).trim();
|
|
2759
|
+
}
|
|
2760
|
+
function extractSearchQuery(span) {
|
|
2761
|
+
const attributes = span.attributes || {};
|
|
2762
|
+
const directMatch = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
|
|
2763
|
+
if (directMatch) return directMatch;
|
|
2764
|
+
const genericQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
|
|
2765
|
+
if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
|
|
2766
|
+
if (span.name.startsWith("search ")) return span.name.slice(7).replace(/^"|"$/g, "").trim();
|
|
2767
|
+
}
|
|
2768
|
+
function isReasoningSpan(span) {
|
|
2769
|
+
if ((span.attributes || {})["codex.item.type"] === "reasoning") return true;
|
|
2770
|
+
return /^reasoning([_\s]|$)/i.test(span.name) || span.name === "reasoning";
|
|
2771
|
+
}
|
|
2772
|
+
function isMessageSpan(span) {
|
|
2773
|
+
if ((span.attributes || {})["codex.item.type"] === "agent_message") return true;
|
|
2774
|
+
return span.name === "agent response" || span.name === "send input";
|
|
2775
|
+
}
|
|
2776
|
+
function extractTrajectorySteps(trace) {
|
|
2777
|
+
return [...trace.spans || []].map((span, index) => ({
|
|
2778
|
+
span,
|
|
2779
|
+
index
|
|
2780
|
+
})).sort((left, right) => {
|
|
2781
|
+
const timeDiff = left.span.startTime - right.span.startTime;
|
|
2782
|
+
if (timeDiff !== 0) return timeDiff;
|
|
2783
|
+
const endDiff = (left.span.endTime ?? left.span.startTime) - (right.span.endTime ?? right.span.startTime);
|
|
2784
|
+
if (endDiff !== 0) return endDiff;
|
|
2785
|
+
return left.index - right.index;
|
|
2786
|
+
}).map(({ span }) => {
|
|
2787
|
+
const toolName = extractToolName(span);
|
|
2788
|
+
const command = extractCommand(span);
|
|
2789
|
+
const searchQuery = extractSearchQuery(span);
|
|
2790
|
+
let type = "span";
|
|
2791
|
+
let name = span.name;
|
|
2792
|
+
const aliases = new Set([span.name]);
|
|
2793
|
+
let args;
|
|
2794
|
+
if (toolName) {
|
|
2795
|
+
type = "tool";
|
|
2796
|
+
name = toolName;
|
|
2797
|
+
aliases.add(toolName);
|
|
2798
|
+
args = extractToolArgs(span);
|
|
2799
|
+
} else if (command) {
|
|
2800
|
+
type = "command";
|
|
2801
|
+
name = command;
|
|
2802
|
+
aliases.add(command);
|
|
2803
|
+
const executable = getCommandExecutable(command);
|
|
2804
|
+
if (executable) aliases.add(executable);
|
|
2805
|
+
} else if (searchQuery) {
|
|
2806
|
+
type = "search";
|
|
2807
|
+
name = searchQuery;
|
|
2808
|
+
aliases.add(searchQuery);
|
|
2809
|
+
} else if (isReasoningSpan(span)) {
|
|
2810
|
+
type = "reasoning";
|
|
2811
|
+
name = span.name;
|
|
2812
|
+
aliases.add("reasoning");
|
|
2813
|
+
} else if (isMessageSpan(span)) {
|
|
2814
|
+
type = "message";
|
|
2815
|
+
name = span.name;
|
|
2816
|
+
aliases.add("message");
|
|
2817
|
+
}
|
|
2818
|
+
return {
|
|
2819
|
+
aliases: [...aliases],
|
|
2820
|
+
...args === void 0 ? {} : { args },
|
|
2821
|
+
attributes: span.attributes || {},
|
|
2822
|
+
endTime: span.endTime,
|
|
2823
|
+
name,
|
|
2824
|
+
spanId: span.spanId,
|
|
2825
|
+
spanName: span.name,
|
|
2826
|
+
startTime: span.startTime,
|
|
2827
|
+
statusCode: span.statusCode,
|
|
2828
|
+
statusMessage: span.statusMessage,
|
|
2829
|
+
type
|
|
2830
|
+
};
|
|
2831
|
+
});
|
|
2832
|
+
}
|
|
2833
|
+
function normalizeTrajectoryMatcher(matcher, defaultType) {
|
|
2834
|
+
if (typeof matcher === "string") return {
|
|
2835
|
+
pattern: matcher,
|
|
2836
|
+
...defaultType ? { type: defaultType } : {}
|
|
2837
|
+
};
|
|
2838
|
+
return {
|
|
2839
|
+
...matcher,
|
|
2840
|
+
...matcher.type ? {} : defaultType ? { type: defaultType } : {}
|
|
2841
|
+
};
|
|
2842
|
+
}
|
|
2843
|
+
function matchesTrajectoryStep(step, matcher, defaultType) {
|
|
2844
|
+
const { type, pattern, name } = normalizeTrajectoryMatcher(matcher, defaultType);
|
|
2845
|
+
if (type) {
|
|
2846
|
+
if (!(Array.isArray(type) ? type : [type]).includes(step.type)) return false;
|
|
2847
|
+
}
|
|
2848
|
+
const matchPattern = pattern || name;
|
|
2849
|
+
if (!matchPattern) return true;
|
|
2850
|
+
return step.aliases.some((alias) => matchesPattern(alias, matchPattern));
|
|
2851
|
+
}
|
|
2852
|
+
/**
 * Render a step as the label `<type>:<name>` used in assertion reasons.
 *
 * @param {object} step - Trajectory step with `type` and `name`.
 * @returns {string} The formatted label.
 */
function formatTrajectoryStep(step) {
	const { type, name } = step;
	return `${type}:${name}`;
}
|
|
2855
|
+
/**
 * Produce a printable form of tool arguments for assertion messages.
 *
 * @param {*} args - Captured arguments, or undefined when none were captured.
 * @returns {string} "(none)" for undefined, the JSON text when serializable,
 *   otherwise the `String()` conversion.
 */
function formatTrajectoryArgs(args) {
	if (args === undefined) return "(none)";
	let serialized;
	try {
		serialized = JSON.stringify(args);
	} catch {
		// Best effort: values JSON cannot serialize (cycles, BigInt) fall back to String().
		serialized = undefined;
	}
	return serialized === undefined ? String(args) : serialized;
}
|
|
2863
|
+
/**
 * Collapse runs of consecutive equivalent steps into a single entry.
 *
 * Two neighbouring steps are equivalent when type, name, spanName and status
 * (per `hasSameStatus`) agree. The first step of a run is kept and its
 * `collapsedCount` records the run length.
 *
 * @param {object[]} steps - Summarized steps, in order.
 * @returns {object[]} A new array; kept step objects are updated in place with `collapsedCount`.
 */
function compactJudgeTrajectorySteps(steps) {
	const result = [];
	for (const current of steps) {
		const last = result[result.length - 1];
		const repeatsLast = last && last.type === current.type && last.name === current.name && last.spanName === current.spanName && hasSameStatus(last.status, current.status);
		if (!repeatsLast) {
			result.push(current);
			continue;
		}
		// The first repeat turns the implicit count of 1 into 2.
		last.collapsedCount = (last.collapsedCount ?? 1) + 1;
	}
	return result;
}
|
|
2875
|
+
/**
 * Cap the number of steps sent to the judge, keeping the head and tail.
 *
 * @param {object[]} steps - Compacted steps.
 * @returns {object[]} `steps` itself when within MAX_JUDGE_SUMMARY_STEPS; otherwise
 *   the head slice, an `{ omittedCount }` marker, and the tail slice.
 */
function truncateJudgeTrajectorySteps(steps) {
	const total = steps.length;
	if (total <= MAX_JUDGE_SUMMARY_STEPS) return steps;
	const head = steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS);
	const tail = steps.slice(-JUDGE_SUMMARY_TAIL_STEPS);
	// NOTE(review): omittedCount is exact only if HEAD + TAIL === MAX — confirm against the constants.
	const marker = { omittedCount: total - MAX_JUDGE_SUMMARY_STEPS };
	return [...head, marker, ...tail];
}
|
|
2883
|
+
/**
 * Build the JSON summary of a trace's trajectory handed to the judge.
 *
 * Steps are extracted, reduced to index/type/name (plus spanName when it differs
 * from name, and status when present), compacted, then truncated.
 *
 * @param {object} trace - Trace with `traceId` and spans.
 * @returns {string} Pretty-printed JSON with traceId, stepCount, compactedStepCount and steps.
 */
function summarizeTrajectoryForJudge(trace) {
	const rawSteps = extractTrajectorySteps(trace).map((step, index) => {
		// Compute the status once per step instead of once for the test and again for the value.
		const status = getTrajectoryStepStatus(step);
		return {
			index: index + 1,
			type: step.type,
			name: step.name,
			...(step.spanName === step.name ? {} : { spanName: step.spanName }),
			...(status ? { status } : {})
		};
	});
	const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
	const steps = truncateJudgeTrajectorySteps(compactedSteps);
	return JSON.stringify({
		traceId: trace.traceId,
		stepCount: rawSteps.length,
		compactedStepCount: compactedSteps.length,
		steps
	}, null, 2);
}
|
|
2900
|
+
//#endregion
|
|
2901
|
+
//#region src/assertions/trajectory.ts
|
|
2902
|
+
/**
 * Fetch the trace attached to the assertion context.
 *
 * @param {object} params - Assertion params with `assertionValueContext` and `baseType`.
 * @returns {object} The trace object.
 * @throws {Error} When no trace, or a trace without spans, is available.
 */
function getTraceOrThrow(params) {
	const { trace } = params.assertionValueContext;
	if (trace?.spans) return trace;
	throw new Error(`No trace data available for ${params.baseType} assertion`);
}
|
|
2907
|
+
/**
 * Flip a pass result when the assertion is inverted (a `not-` assertion).
 *
 * @param {*} pass - Result of the non-inverted check.
 * @param {*} inverse - Truthy when the assertion is inverted.
 * @returns {*} `!pass` when inverted, otherwise `pass` unchanged.
 */
function applyInverse(pass, inverse) {
	if (inverse) return !pass;
	return pass;
}
|
|
2910
|
+
/**
 * Join step labels for display.
 *
 * @param {string[]} stepLabels - Labels to join.
 * @returns {string} Comma-separated labels, or "(none)" when there are none.
 */
function formatStepList(stepLabels) {
	if (!(stepLabels.length > 0)) return "(none)";
	return stepLabels.join(", ");
}
|
|
2913
|
+
/**
 * Ensure a matcher identifies a step by `pattern` or `name`.
 *
 * @param {object} matcher - Normalized matcher.
 * @param {string} assertionType - Assertion type used in the error message.
 * @param {number} [index] - Zero-based step index; omitted for single-object values.
 * @throws {Error} When neither `pattern` nor `name` is truthy.
 */
function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
	const identifier = matcher.pattern || matcher.name;
	if (!identifier) {
		const stepLabel = index === undefined ? "object" : `step ${index + 1}`;
		throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
	}
}
|
|
2918
|
+
/**
 * Extract the goal text of a trajectory:goal-success assertion.
 *
 * @param {*} value - A non-blank string, or an object with a non-blank string `goal`.
 * @returns {{goal: string}} The trimmed goal.
 * @throws {Error} When no non-blank goal string can be found.
 */
function resolveGoalSuccessValue(value) {
	let rawGoal;
	if (typeof value === "string") rawGoal = value;
	else if (value && typeof value === "object" && !Array.isArray(value)) rawGoal = value.goal;
	const goal = typeof rawGoal === "string" ? rawGoal.trim() : "";
	if (goal) return { goal };
	throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
}
|
|
2923
|
+
/**
 * Interpret the value of a trajectory:tool-used assertion.
 *
 * @param {*} value - A tool pattern string, an array of pattern strings, or a
 *   matcher object with optional numeric `min` / `max`.
 * @returns {object} `{ kind: "list", matchers }` for strings and string arrays, or
 *   `{ kind: "count", matcher }` for an object value.
 * @throws {Error} For any other value, including arrays with non-string items.
 */
function resolveToolMatchers(value) {
	const asToolMatcher = (item) => normalizeTrajectoryMatcher(item, "tool");
	const numberOrUndefined = (candidate) => typeof candidate === "number" ? candidate : undefined;
	if (typeof value === "string") {
		return {
			kind: "list",
			matchers: [asToolMatcher(value)]
		};
	}
	if (Array.isArray(value)) {
		if (value.every((item) => typeof item === "string")) {
			return {
				kind: "list",
				matchers: value.map((item) => asToolMatcher(item))
			};
		}
	} else if (value && typeof value === "object") {
		return {
			kind: "count",
			matcher: {
				...asToolMatcher(value),
				max: numberOrUndefined(value.max),
				min: numberOrUndefined(value.min)
			}
		};
	}
	throw new Error("trajectory:tool-used assertion must have a string, string array, or object value");
}
|
|
2942
|
+
/**
 * Handler for `trajectory:tool-used`.
 *
 * List form (string or string[]): every listed tool must appear among the trace's
 * tool steps; when inverted, none may appear.
 * Count form (object): the number of matching tool steps must lie within
 * [min, max], where min defaults to 1 and max is unbounded when omitted.
 *
 * @param {object} params - Assertion params (assertion, renderedValue, inverse, trace context).
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}} Grading result.
 * @throws {Error} On missing trace data, an empty tool list, or an object value
 *   lacking both `name` and `pattern`.
 */
const handleTrajectoryToolUsed = (params) => {
	const labelOf = (toolMatcher) => toolMatcher.pattern || toolMatcher.name || "*";
	const verdict = (pass, reason) => ({
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	});
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
	if (expected.kind === "list") {
		const { matchers } = expected;
		if (matchers.length === 0) throw new Error("trajectory:tool-used assertion requires at least one expected tool");
		// Partition the expected tools into those observed and those absent.
		const matched = [];
		const missing = [];
		for (const toolMatcher of matchers) {
			const wasUsed = toolSteps.some((step) => matchesTrajectoryStep(step, toolMatcher));
			(wasUsed ? matched : missing).push(toolMatcher);
		}
		const actualTools = formatStepList(toolSteps.map(formatTrajectoryStep));
		const expectedTools = matchers.map((toolMatcher) => labelOf(toolMatcher)).join(", ");
		if (params.inverse) {
			const pass = matched.length === 0;
			const reason = pass ? `Forbidden tool(s) were not used: ${expectedTools}` : `Forbidden tool(s) were used: ${matched.map((toolMatcher) => labelOf(toolMatcher)).join(", ")}. Actual tools: ${actualTools}`;
			return verdict(pass, reason);
		}
		const pass = missing.length === 0;
		const reason = pass ? `Observed required tool(s): ${expectedTools}. Actual tools: ${actualTools}` : `Missing required tool(s): ${missing.map((toolMatcher) => labelOf(toolMatcher)).join(", ")}. Actual tools: ${actualTools}`;
		return verdict(pass, reason);
	}
	const { matcher } = expected;
	const min = matcher.min ?? 1;
	const max = matcher.max;
	if (!(matcher.pattern || matcher.name)) throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
	const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
	const count = matchingSteps.length;
	const withinRange = count >= min && (max === undefined || count <= max);
	const matcherLabel = labelOf(matcher);
	const rangeText = max === undefined ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
	const matchesText = count > 0 ? `. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}` : "";
	if (params.inverse) {
		const reason = withinRange ? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion` : `Tool "${matcherLabel}" did not satisfy the forbidden match condition`;
		return verdict(!withinRange, reason);
	}
	return verdict(withinRange, `Matched tool "${matcherLabel}" ${count} time(s)${rangeText}${matchesText}`);
};
|
|
2984
|
+
/**
 * Interpret the value of a trajectory:tool-sequence assertion.
 *
 * @param {*} value - Either an array of steps (implies "in_order"), or an object
 *   `{ mode?: "in_order" | "exact", steps?: Array }`.
 * @returns {{mode: string, steps: Array}} The matching mode and expected steps.
 * @throws {Error} When the value is neither array nor object, when `mode` is not
 *   a supported mode, or when `steps` is present but not an array.
 */
function resolveSequenceValue(value) {
	if (Array.isArray(value)) return {
		mode: "in_order",
		steps: value
	};
	if (value && typeof value === "object") {
		const mode = value.mode || "in_order";
		// Reject unknown modes: a typo such as "Exact" would otherwise silently
		// degrade to the more lenient in-order comparison and hide failures.
		if (mode !== "in_order" && mode !== "exact") throw new Error("trajectory:tool-sequence assertion mode must be \"in_order\" or \"exact\"");
		const steps = value.steps || [];
		// Fail with a descriptive message instead of a later "map is not a function".
		if (!Array.isArray(steps)) throw new Error("trajectory:tool-sequence assertion steps must be an array");
		return {
			mode,
			steps
		};
	}
	throw new Error("trajectory:tool-sequence assertion must have an array or object value");
}
|
|
2998
|
+
/**
 * Check whether a value is a non-null, non-array object.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True for objects other than null and arrays.
 */
function isRecord(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
|
|
3001
|
+
/**
 * Partial (subset) comparison of actual tool arguments against expected ones.
 *
 * Arrays must have equal length and match element by element; for records,
 * every expected key must be an own property of `actual` and match recursively
 * (extra keys on `actual` are ignored); anything else uses deep strict equality.
 *
 * @param {*} actual - Arguments observed on the tool call.
 * @param {*} expected - Arguments the assertion expects.
 * @returns {boolean} True when `actual` satisfies `expected`.
 */
function matchesExpectedArgsPartial(actual, expected) {
	if (Array.isArray(expected)) {
		if (!Array.isArray(actual) || actual.length !== expected.length) return false;
		return expected.every((item, position) => matchesExpectedArgsPartial(actual[position], item));
	}
	if (!isRecord(expected)) return isDeepStrictEqual(actual, expected);
	if (!isRecord(actual)) return false;
	for (const [key, expectedValue] of Object.entries(expected)) {
		if (!Object.prototype.hasOwnProperty.call(actual, key)) return false;
		if (!matchesExpectedArgsPartial(actual[key], expectedValue)) return false;
	}
	return true;
}
|
|
3009
|
+
/**
 * Compare tool arguments using the requested mode.
 *
 * @param {*} actual - Arguments observed on the tool call.
 * @param {*} expected - Arguments the assertion expects.
 * @param {string} mode - "exact" for deep strict equality; anything else compares partially.
 * @returns {boolean} True when the arguments match under the given mode.
 */
function matchesToolArgs(actual, expected, mode) {
	return mode === "exact" ? isDeepStrictEqual(actual, expected) : matchesExpectedArgsPartial(actual, expected);
}
|
|
3013
|
+
/**
 * Validate the `mode` option of a trajectory:tool-args-match assertion.
 *
 * @param {*} mode - Requested mode, or undefined for the default.
 * @returns {string} "partial" (the default) or "exact".
 * @throws {Error} For any other value.
 */
function resolveToolArgsMatchMode(mode) {
	switch (mode) {
		case undefined:
			return "partial";
		case "partial":
		case "exact":
			return mode;
		default:
			throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
	}
}
|
|
3018
|
+
/**
 * Interpret the value of a trajectory:tool-args-match assertion.
 *
 * @param {*} value - Object with a `name`/`pattern`, expected `args` (or
 *   `arguments`), and an optional `mode`.
 * @returns {{matcher: object, expectedArgs: *, mode: string}} Parsed configuration.
 * @throws {Error} When the value is not an object, has no name/pattern, carries
 *   no expected arguments, or specifies an unsupported mode.
 */
function resolveToolArgsMatchValue(value) {
	const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
	if (!isObjectValue) throw new Error("trajectory:tool-args-match assertion must have an object value");
	const matcher = normalizeTrajectoryMatcher(value, "tool");
	requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");
	// An own `args` key wins even when its value is undefined; `arguments` is the alias.
	const hasArgsKey = Object.prototype.hasOwnProperty.call(value, "args");
	const expectedArgs = hasArgsKey ? value.args : value.arguments;
	if (expectedArgs === undefined) throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");
	const mode = resolveToolArgsMatchMode(value.mode);
	return {
		matcher,
		expectedArgs,
		mode
	};
}
|
|
3030
|
+
/**
 * Handler for `trajectory:tool-sequence`.
 *
 * In "exact" mode the trace's tool steps must match the expected steps one to
 * one. In any other mode the expected steps must appear in order, with other
 * tool calls allowed in between.
 *
 * @param {object} params - Assertion params (assertion, renderedValue, inverse, trace context).
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}} Grading result.
 * @throws {Error} On missing trace data, an invalid value, an unnamed step, or an empty step list.
 */
const handleTrajectoryToolSequence = (params) => {
	const labelOf = (toolMatcher) => toolMatcher?.pattern || toolMatcher?.name || "*";
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const { mode, steps: expectedSteps } = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
	const expectedMatchers = expectedSteps.map((step, index) => {
		const matcher = normalizeTrajectoryMatcher(step, "tool");
		requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", index);
		return matcher;
	});
	if (expectedMatchers.length === 0) throw new Error("trajectory:tool-sequence assertion requires at least one expected step");
	const actualTools = formatStepList(toolSteps.map(formatTrajectoryStep));
	let basePass;
	let reason;
	if (mode === "exact") {
		basePass = toolSteps.length === expectedMatchers.length && expectedMatchers.every((matcher, index) => matchesTrajectoryStep(toolSteps[index], matcher));
		reason = basePass ? `Observed exact tool sequence: ${actualTools}` : `Expected exact tool sequence of ${expectedMatchers.map((matcher) => labelOf(matcher)).join(", ")}, but actual tools were ${actualTools}`;
	} else {
		// Greedy subsequence scan: advance the cursor each time the next expected step is seen.
		const observed = [];
		let cursor = 0;
		for (const step of toolSteps) {
			if (cursor >= expectedMatchers.length) break;
			if (!matchesTrajectoryStep(step, expectedMatchers[cursor])) continue;
			observed.push(formatTrajectoryStep(step));
			cursor += 1;
		}
		basePass = cursor === expectedMatchers.length;
		reason = basePass ? `Observed tool sequence in order: ${observed.join(", ")}. Actual tools: ${actualTools}` : `Expected tool "${labelOf(expectedMatchers[cursor])}" was not observed in order. Actual tools: ${actualTools}`;
	}
	const pass = applyInverse(basePass, params.inverse);
	if (params.inverse) {
		reason = basePass ? `Forbidden tool sequence was observed. Actual tools: ${actualTools}` : `Forbidden tool sequence was not observed`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
|
|
3069
|
+
/**
 * Handler for `trajectory:tool-args-match`.
 *
 * Passes when at least one tool step matching the name/pattern captured
 * arguments that satisfy the expected arguments under the configured mode.
 *
 * @param {object} params - Assertion params (assertion, renderedValue, inverse, trace context).
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}} Grading result.
 * @throws {Error} On missing trace data or an invalid assertion value.
 */
const handleTrajectoryToolArgsMatch = (params) => {
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
	const matcherLabel = matcher.pattern || matcher.name || "*";
	const actualTools = toolSteps.map(formatTrajectoryStep);
	const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
	// Only steps that actually captured arguments can be compared.
	const stepsWithArgs = matchingSteps.filter((step) => step.args !== undefined);
	const matchedStep = stepsWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));
	const basePass = matchedStep !== undefined;
	const pass = applyInverse(basePass, params.inverse);
	const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
	const observedArgsLabel = stepsWithArgs.length > 0 ? stepsWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ") : "(none)";
	const noToolMatched = matchingSteps.length === 0;
	let reason;
	if (params.inverse) {
		if (basePass) {
			reason = `Forbidden argument match for tool "${matcherLabel}" was observed on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
		} else if (noToolMatched) {
			reason = `Forbidden argument match for tool "${matcherLabel}" was not observed because no tool call matched it`;
		} else {
			reason = `Forbidden argument match for tool "${matcherLabel}" was not observed. Observed args: ${observedArgsLabel}`;
		}
	} else if (basePass) {
		reason = `Tool "${matcherLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(matchedStep)}. Args: ${formatTrajectoryArgs(matchedStep.args)}`;
	} else if (noToolMatched) {
		reason = `No tool call matched "${matcherLabel}". Actual tools: ${formatStepList(actualTools)}`;
	} else if (stepsWithArgs.length === 0) {
		reason = `Tool "${matcherLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(actualTools)}`;
	} else {
		reason = `No call to tool "${matcherLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
|
|
3096
|
+
/**
 * Interpret the value of a trajectory:step-count assertion.
 *
 * @param {*} value - Matcher object with optional numeric `min` / `max`.
 * @returns {object} The normalized matcher with `max` and `min` set to the
 *   given numbers, or undefined when they are not numbers.
 * @throws {Error} When the value is not a non-array object.
 */
function resolveStepCountValue(value) {
	const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
	if (!isObjectValue) throw new Error("trajectory:step-count assertion must have an object value");
	const numberOrUndefined = (candidate) => typeof candidate === "number" ? candidate : undefined;
	// No default type here: step-count may target any step type.
	return {
		...normalizeTrajectoryMatcher(value),
		max: numberOrUndefined(value.max),
		min: numberOrUndefined(value.min)
	};
}
|
|
3104
|
+
/**
 * Handler for `trajectory:step-count`.
 *
 * Counts the trajectory steps matching the optional type/pattern filter and
 * checks that the count lies within the configured min/max bounds.
 *
 * @param {object} params - Assertion params (assertion, renderedValue, inverse, trace context).
 * @returns {{pass: boolean, score: number, reason: string, assertion: object}} Grading result.
 * @throws {Error} On missing trace data, a non-object value, or when neither
 *   `min` nor `max` is given.
 */
const handleTrajectoryStepCount = (params) => {
	const allSteps = extractTrajectorySteps(getTraceOrThrow(params));
	const matcher = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
	const { min, max } = matcher;
	const hasMin = min !== undefined;
	const hasMax = max !== undefined;
	if (!hasMin && !hasMax) throw new Error("trajectory:step-count assertion must include a min or max property");
	const matchingSteps = allSteps.filter((step) => matchesTrajectoryStep(step, matcher));
	const count = matchingSteps.length;
	const basePass = (!hasMin || count >= min) && (!hasMax || count <= max);
	const pass = applyInverse(basePass, params.inverse);
	// Describe the active filters so the reason shows what was counted.
	const filterParts = [];
	if (matcher.type) {
		const types = Array.isArray(matcher.type) ? matcher.type : [matcher.type];
		filterParts.push(`type=${types.join("|")}`);
	}
	const pattern = matcher.pattern || matcher.name;
	if (pattern) filterParts.push(`pattern=${pattern}`);
	const segments = [`Matched ${count} trajectory step(s)`];
	if (filterParts.length > 0) segments.push(` for ${filterParts.join(", ")}`);
	if (hasMin && hasMax) segments.push(` (expected ${min}-${max})`);
	else if (hasMin) segments.push(` (expected at least ${min})`);
	else if (hasMax) segments.push(` (expected at most ${max})`);
	if (count > 0) segments.push(`. Matches: ${matchingSteps.map(formatTrajectoryStep).join(", ")}`);
	let reason = segments.join("");
	if (params.inverse) {
		reason = basePass ? `Trajectory step count satisfied the forbidden range` : `Trajectory step count did not satisfy the forbidden range`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
|
|
3134
|
+
/**
 * Handler for `trajectory:goal-success`.
 *
 * Passes the goal, a JSON summary of the trajectory and the output to
 * `matchesTrajectoryGoalSuccess`, then flips the result when inverted.
 *
 * @param {object} params - Assertion params (assertion, renderedValue, inverse,
 *   outputString, test, assertionValueContext, providerCallContext).
 * @returns {Promise<object>} The grading result, inverted when `params.inverse` is set.
 * @throws {Error} On missing trace data or an invalid goal value.
 */
const handleTrajectoryGoalSuccess = async (params) => {
	const trace = getTraceOrThrow(params);
	const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
	const trajectorySummary = summarizeTrajectoryForJudge(trace);
	const result = await matchesTrajectoryGoalSuccess(goal, trajectorySummary, params.outputString, params.test.options, params.assertionValueContext.vars, params.assertion, params.providerCallContext);
	if (!params.inverse) return result;
	const achieved = result.pass;
	return {
		...result,
		assertion: params.assertion,
		pass: !achieved,
		score: achieved ? 0 : 1,
		reason: achieved ? `Agent unexpectedly achieved the goal: ${goal}` : `Agent did not achieve the forbidden goal: ${goal}`
	};
};
|
|
3147
|
+
//#endregion
|
|
2520
3148
|
//#region src/assertions/webhook.ts
|
|
2521
3149
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
2522
3150
|
invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
|
|
@@ -2585,18 +3213,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2585
3213
|
if (pass) reason = "Assertion passed";
|
|
2586
3214
|
else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
|
|
2587
3215
|
else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
|
|
2588
|
-
} else if (min
|
|
2589
|
-
const basePass = wordCount >= min;
|
|
2590
|
-
pass = inverse ? !basePass : basePass;
|
|
2591
|
-
if (pass) reason = "Assertion passed";
|
|
2592
|
-
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
2593
|
-
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
2594
|
-
} else {
|
|
3216
|
+
} else if (min === void 0) {
|
|
2595
3217
|
const basePass = wordCount <= max;
|
|
2596
3218
|
pass = inverse ? !basePass : basePass;
|
|
2597
3219
|
if (pass) reason = "Assertion passed";
|
|
2598
3220
|
else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
|
|
2599
3221
|
else reason = `Word count ${wordCount} is greater than maximum ${max}`;
|
|
3222
|
+
} else {
|
|
3223
|
+
const basePass = wordCount >= min;
|
|
3224
|
+
pass = inverse ? !basePass : basePass;
|
|
3225
|
+
if (pass) reason = "Assertion passed";
|
|
3226
|
+
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
3227
|
+
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
2600
3228
|
}
|
|
2601
3229
|
} else {
|
|
2602
3230
|
invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
|
|
@@ -2691,6 +3319,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2691
3319
|
//#endregion
|
|
2692
3320
|
//#region src/assertions/index.ts
|
|
2693
3321
|
const ASSERTIONS_MAX_CONCURRENCY = getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
3322
|
+
const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
|
|
3323
|
+
const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
|
|
3324
|
+
const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
|
|
3325
|
+
const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;
|
|
3326
|
+
const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5e3;
|
|
3327
|
+
const MAX_TRACE_FETCH_STABLE_POLLS = 10;
|
|
2694
3328
|
const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
2695
3329
|
"answer-relevance",
|
|
2696
3330
|
"context-faithfulness",
|
|
@@ -2700,8 +3334,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
|
2700
3334
|
"llm-rubric",
|
|
2701
3335
|
"model-graded-closedqa",
|
|
2702
3336
|
"model-graded-factuality",
|
|
2703
|
-
"search-rubric"
|
|
3337
|
+
"search-rubric",
|
|
3338
|
+
"trajectory:goal-success"
|
|
2704
3339
|
]);
|
|
3340
|
+
const TRACE_AWARE_ASSERTION_TYPES = new Set([
|
|
3341
|
+
"javascript",
|
|
3342
|
+
"python",
|
|
3343
|
+
"ruby",
|
|
3344
|
+
"trace-error-spans",
|
|
3345
|
+
"trace-span-count",
|
|
3346
|
+
"trace-span-duration",
|
|
3347
|
+
"trajectory:goal-success",
|
|
3348
|
+
"trajectory:step-count",
|
|
3349
|
+
"trajectory:tool-args-match",
|
|
3350
|
+
"trajectory:tool-sequence",
|
|
3351
|
+
"trajectory:tool-used"
|
|
3352
|
+
]);
|
|
3353
|
+
/**
 * Tell whether an assertion (or any member of an assert-set) has a trace-aware type.
 *
 * @param {object} assertion - Assertion definition.
 * @returns {boolean} True when its base type is in TRACE_AWARE_ASSERTION_TYPES.
 */
function assertionUsesTrace(assertion) {
	return assertion.type === "assert-set" ? assertion.assert.some(assertionUsesTrace) : TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
}
|
|
3357
|
+
/**
 * Tell whether an assertion might read trace data.
 *
 * True for trace-aware types, and for any assertion whose string value is a
 * `file://` reference or a package path.
 *
 * @param {object} assertion - Assertion definition.
 * @returns {boolean} True when trace context should be made available.
 */
function assertionMayNeedTraceContext(assertion) {
	if (assertionUsesTrace(assertion)) return true;
	if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
	const { value } = assertion;
	if (typeof value !== "string") return false;
	return value.startsWith("file://") || isPackagePath(value);
}
|
|
3362
|
+
/**
 * Tell whether any assertion in the list may need trace context.
 *
 * @param {object[]|null|undefined} assertions - Assertions of a test case.
 * @returns {boolean} False for a missing list; otherwise whether any entry qualifies.
 */
function hasTraceAwareAssertions(assertions) {
	if (assertions == null) return false;
	return Boolean(assertions.some(assertionMayNeedTraceContext));
}
|
|
3365
|
+
/**
 * Poll the trace store until the trace's span count stops changing.
 *
 * Attempt count, retry delay and required stable polls come from the
 * PROMPTFOO_TRACE_FETCH_* environment settings, clamped to fixed bounds.
 *
 * @param {string} traceId - Identifier of the trace to load.
 * @returns {Promise<object|null>} The trace once its non-zero span count was seen
 *   on enough consecutive polls, or whatever the final attempt returned.
 */
async function loadTraceData(traceId) {
	const clamp = (value, lower, upper) => Math.min(upper, Math.max(lower, value));
	const traceStore = getTraceStore();
	const maxAttempts = clamp(getEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS), 1, MAX_TRACE_FETCH_MAX_ATTEMPTS);
	const retryDelayMs = clamp(getEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS), 0, MAX_TRACE_FETCH_RETRY_DELAY_MS);
	const stablePolls = clamp(getEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS), 1, MAX_TRACE_FETCH_STABLE_POLLS);
	let previousSpanCount = -1;
	let stableObservations = 0;
	let latestTrace = null;
	for (let attempt = 1; attempt <= maxAttempts; attempt++) {
		latestTrace = await traceStore.getTrace(traceId);
		const spanCount = latestTrace?.spans?.length ?? 0;
		const hasSpans = spanCount > 0;
		if (!hasSpans) {
			// An empty trace never counts as stable.
			stableObservations = 0;
		} else if (spanCount === previousSpanCount) {
			stableObservations += 1;
		} else {
			stableObservations = 1;
		}
		previousSpanCount = spanCount;
		const isLastAttempt = attempt === maxAttempts;
		if (isLastAttempt || hasSpans && stableObservations >= stablePolls) return latestTrace;
		await sleep(retryDelayMs);
	}
	return latestTrace;
}
|
|
2705
3388
|
const ASSERTION_HANDLERS = {
|
|
2706
3389
|
"answer-relevance": handleAnswerRelevance,
|
|
2707
3390
|
bleu: handleBleuScore,
|
|
@@ -2764,12 +3447,18 @@ const ASSERTION_HANDLERS = {
|
|
|
2764
3447
|
ruby: handleRuby,
|
|
2765
3448
|
"rouge-n": handleRougeScore,
|
|
2766
3449
|
"search-rubric": handleSearchRubric,
|
|
3450
|
+
"skill-used": handleSkillUsed,
|
|
2767
3451
|
similar: handleSimilar,
|
|
2768
3452
|
"similar:cosine": handleSimilar,
|
|
2769
3453
|
"similar:dot": handleSimilar,
|
|
2770
3454
|
"similar:euclidean": handleSimilar,
|
|
2771
3455
|
"starts-with": handleStartsWith,
|
|
2772
3456
|
"tool-call-f1": handleToolCallF1,
|
|
3457
|
+
"trajectory:goal-success": handleTrajectoryGoalSuccess,
|
|
3458
|
+
"trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
|
|
3459
|
+
"trajectory:step-count": handleTrajectoryStepCount,
|
|
3460
|
+
"trajectory:tool-sequence": handleTrajectoryToolSequence,
|
|
3461
|
+
"trajectory:tool-used": handleTrajectoryToolUsed,
|
|
2773
3462
|
"trace-error-spans": handleTraceErrorSpans,
|
|
2774
3463
|
"trace-span-count": handleTraceSpanCount,
|
|
2775
3464
|
"trace-span-duration": handleTraceSpanDuration,
|
|
@@ -2812,7 +3501,7 @@ function isAssertionInverse(assertion) {
|
|
|
2812
3501
|
function getAssertionBaseType(assertion) {
|
|
2813
3502
|
return isAssertionInverse(assertion) ? assertion.type.slice(4) : assertion.type;
|
|
2814
3503
|
}
|
|
2815
|
-
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
|
|
3504
|
+
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
|
|
2816
3505
|
const resolvedVars = vars || test.vars || {};
|
|
2817
3506
|
const { cost, logProbs, output: originalOutput } = providerResponse;
|
|
2818
3507
|
let output = originalOutput;
|
|
@@ -2831,14 +3520,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2831
3520
|
providerResponse,
|
|
2832
3521
|
...assertion.config ? { config: structuredClone(assertion.config) } : {}
|
|
2833
3522
|
};
|
|
2834
|
-
if (traceId) try {
|
|
2835
|
-
const
|
|
2836
|
-
if (
|
|
2837
|
-
traceId:
|
|
2838
|
-
evaluationId:
|
|
2839
|
-
testCaseId:
|
|
2840
|
-
metadata:
|
|
2841
|
-
spans:
|
|
3523
|
+
if (traceId && assertionMayNeedTraceContext(assertion)) try {
|
|
3524
|
+
const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
|
|
3525
|
+
if (resolvedTraceData) context.trace = {
|
|
3526
|
+
traceId: resolvedTraceData.traceId,
|
|
3527
|
+
evaluationId: resolvedTraceData.evaluationId,
|
|
3528
|
+
testCaseId: resolvedTraceData.testCaseId,
|
|
3529
|
+
metadata: resolvedTraceData.metadata,
|
|
3530
|
+
spans: resolvedTraceData.spans || []
|
|
2842
3531
|
};
|
|
2843
3532
|
} catch (error) {
|
|
2844
3533
|
logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
@@ -2871,7 +3560,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2871
3560
|
};
|
|
2872
3561
|
}
|
|
2873
3562
|
else if (filePath.endsWith(".rb")) try {
|
|
2874
|
-
const { runRuby } = await import("./rubyUtils-
|
|
3563
|
+
const { runRuby } = await import("./rubyUtils-PgU-gHmx.js").then((n) => n.t);
|
|
2875
3564
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2876
3565
|
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2877
3566
|
} catch (error) {
|
|
@@ -2980,6 +3669,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
2980
3669
|
index: i
|
|
2981
3670
|
};
|
|
2982
3671
|
}).flat();
|
|
3672
|
+
const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
|
|
3673
|
+
let preloadedTraceData;
|
|
3674
|
+
if (shouldPreloadTrace && traceId) try {
|
|
3675
|
+
preloadedTraceData = await loadTraceData(traceId);
|
|
3676
|
+
} catch (error) {
|
|
3677
|
+
logger.debug(`Failed to preload trace data for assertions: ${error}`);
|
|
3678
|
+
preloadedTraceData = null;
|
|
3679
|
+
}
|
|
2983
3680
|
await async.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
|
|
2984
3681
|
if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
|
|
2985
3682
|
const result = await runAssertion({
|
|
@@ -2991,7 +3688,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
2991
3688
|
vars,
|
|
2992
3689
|
latencyMs,
|
|
2993
3690
|
assertIndex: index,
|
|
2994
|
-
traceId
|
|
3691
|
+
traceId,
|
|
3692
|
+
traceData: preloadedTraceData
|
|
2995
3693
|
});
|
|
2996
3694
|
assertResult.addResult({
|
|
2997
3695
|
index,
|
|
@@ -3137,7 +3835,7 @@ var CIProgressReporter = class {
|
|
|
3137
3835
|
else {
|
|
3138
3836
|
const eta = remaining / rate;
|
|
3139
3837
|
if (eta > 1440) etaDisplay = ">24 hours";
|
|
3140
|
-
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta)
|
|
3838
|
+
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
|
|
3141
3839
|
}
|
|
3142
3840
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3143
3841
|
logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
@@ -3538,12 +4236,55 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3538
4236
|
var ProgressBarManager = class {
|
|
3539
4237
|
progressBar;
|
|
3540
4238
|
isWebUI;
|
|
4239
|
+
originalLogCallback = null;
|
|
4240
|
+
installedLogCallback = null;
|
|
4241
|
+
pendingRender = null;
|
|
3541
4242
|
totalCount = 0;
|
|
3542
4243
|
completedCount = 0;
|
|
3543
4244
|
concurrency = 1;
|
|
3544
4245
|
constructor(isWebUI) {
|
|
3545
4246
|
this.isWebUI = isWebUI;
|
|
3546
4247
|
}
|
|
4248
|
+
clearProgressBarLine() {
|
|
4249
|
+
readline.cursorTo(process.stderr, 0);
|
|
4250
|
+
readline.clearLine(process.stderr, 0);
|
|
4251
|
+
}
|
|
4252
|
+
scheduleRender() {
|
|
4253
|
+
if (!this.progressBar || this.pendingRender) return;
|
|
4254
|
+
this.pendingRender = setImmediate(() => {
|
|
4255
|
+
this.pendingRender = null;
|
|
4256
|
+
this.progressBar?.render();
|
|
4257
|
+
});
|
|
4258
|
+
}
|
|
4259
|
+
handleLogMessage() {
|
|
4260
|
+
if (!this.progressBar) return;
|
|
4261
|
+
this.clearProgressBarLine();
|
|
4262
|
+
this.scheduleRender();
|
|
4263
|
+
}
|
|
4264
|
+
/**
|
|
4265
|
+
* Coordinate console logging with the progress bar to prevent visual corruption.
|
|
4266
|
+
*/
|
|
4267
|
+
installLogInterceptor() {
|
|
4268
|
+
if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
|
|
4269
|
+
this.originalLogCallback = globalLogCallback;
|
|
4270
|
+
this.installedLogCallback = (message) => {
|
|
4271
|
+
this.originalLogCallback?.(message);
|
|
4272
|
+
this.handleLogMessage();
|
|
4273
|
+
};
|
|
4274
|
+
setLogCallback(this.installedLogCallback);
|
|
4275
|
+
}
|
|
4276
|
+
/**
|
|
4277
|
+
* Remove the log interceptor and restore original logger callback behavior.
|
|
4278
|
+
*/
|
|
4279
|
+
removeLogInterceptor() {
|
|
4280
|
+
if (this.pendingRender) {
|
|
4281
|
+
clearImmediate(this.pendingRender);
|
|
4282
|
+
this.pendingRender = null;
|
|
4283
|
+
}
|
|
4284
|
+
if (this.installedLogCallback && globalLogCallback === this.installedLogCallback) setLogCallback(this.originalLogCallback);
|
|
4285
|
+
this.installedLogCallback = null;
|
|
4286
|
+
this.originalLogCallback = null;
|
|
4287
|
+
}
|
|
3547
4288
|
/**
|
|
3548
4289
|
* Initialize progress bar
|
|
3549
4290
|
*/
|
|
@@ -3563,7 +4304,8 @@ var ProgressBarManager = class {
|
|
|
3563
4304
|
return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
|
|
3564
4305
|
},
|
|
3565
4306
|
hideCursor: true,
|
|
3566
|
-
gracefulExit: true
|
|
4307
|
+
gracefulExit: true,
|
|
4308
|
+
stream: process.stderr
|
|
3567
4309
|
}, cliProgress.Presets.shades_classic);
|
|
3568
4310
|
this.progressBar.start(this.totalCount, 0, {
|
|
3569
4311
|
provider: "",
|
|
@@ -3838,6 +4580,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3838
4580
|
const parts = traceContext.traceparent.split("-");
|
|
3839
4581
|
if (parts.length >= 3) traceId = parts[1];
|
|
3840
4582
|
}
|
|
4583
|
+
if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
|
|
3841
4584
|
const checkResult = await runAssertions({
|
|
3842
4585
|
prompt: renderedPrompt,
|
|
3843
4586
|
provider,
|
|
@@ -4235,7 +4978,7 @@ var Evaluator = class {
|
|
|
4235
4978
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4236
4979
|
if (isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4237
4980
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4238
|
-
const { loadApiProvider } = await import("./providers-
|
|
4981
|
+
const { loadApiProvider } = await import("./providers-sS2WI8YD.js");
|
|
4239
4982
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4240
4983
|
} else testCase.provider = defaultProvider;
|
|
4241
4984
|
}
|
|
@@ -4319,7 +5062,7 @@ var Evaluator = class {
|
|
|
4319
5062
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4320
5063
|
}
|
|
4321
5064
|
if (state.resume && this.evalRecord.persisted) try {
|
|
4322
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
5065
|
+
const { default: EvalResult } = await import("./evalResult-BkIhRdTe.js").then((n) => n.n);
|
|
4323
5066
|
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: state.retryMode });
|
|
4324
5067
|
const originalCount = runEvalOptions.length;
|
|
4325
5068
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -4519,7 +5262,7 @@ var Evaluator = class {
|
|
|
4519
5262
|
if (isCI() && !isWebUI) {
|
|
4520
5263
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4521
5264
|
ciProgressReporter.start();
|
|
4522
|
-
} else if (this.options.showProgressBar && process.
|
|
5265
|
+
} else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
|
|
4523
5266
|
this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
|
|
4524
5267
|
if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
|
|
4525
5268
|
if (isWebUI) {
|
|
@@ -4540,7 +5283,10 @@ var Evaluator = class {
|
|
|
4540
5283
|
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4541
5284
|
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4542
5285
|
}
|
|
4543
|
-
if (this.options.showProgressBar && progressBarManager)
|
|
5286
|
+
if (this.options.showProgressBar && progressBarManager) {
|
|
5287
|
+
await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
5288
|
+
progressBarManager.installLogInterceptor();
|
|
5289
|
+
}
|
|
4544
5290
|
try {
|
|
4545
5291
|
if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
|
|
4546
5292
|
checkAbort();
|
|
@@ -4566,7 +5312,10 @@ var Evaluator = class {
|
|
|
4566
5312
|
else if (!targetUnavailable) {
|
|
4567
5313
|
logger.info("Evaluation interrupted, saving progress...");
|
|
4568
5314
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4569
|
-
if (progressBarManager)
|
|
5315
|
+
if (progressBarManager) {
|
|
5316
|
+
progressBarManager.removeLogInterceptor();
|
|
5317
|
+
progressBarManager.stop();
|
|
5318
|
+
}
|
|
4570
5319
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
4571
5320
|
this.evalRecord.setVars(Array.from(vars));
|
|
4572
5321
|
await this.evalRecord.addPrompts(prompts);
|
|
@@ -4574,6 +5323,10 @@ var Evaluator = class {
|
|
|
4574
5323
|
return this.evalRecord;
|
|
4575
5324
|
}
|
|
4576
5325
|
} else {
|
|
5326
|
+
if (progressBarManager) {
|
|
5327
|
+
progressBarManager.removeLogInterceptor();
|
|
5328
|
+
progressBarManager.stop();
|
|
5329
|
+
}
|
|
4577
5330
|
if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
|
|
4578
5331
|
throw err;
|
|
4579
5332
|
}
|
|
@@ -4716,6 +5469,7 @@ var Evaluator = class {
|
|
|
4716
5469
|
await this.evalRecord.addPrompts(prompts);
|
|
4717
5470
|
try {
|
|
4718
5471
|
if (progressBarManager) {
|
|
5472
|
+
progressBarManager.removeLogInterceptor();
|
|
4719
5473
|
progressBarManager.complete();
|
|
4720
5474
|
progressBarManager.stop();
|
|
4721
5475
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -7073,8 +7827,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7073
7827
|
logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
|
|
7074
7828
|
} else if (key.startsWith("__config:")) {
|
|
7075
7829
|
const configParts = key.slice(9).split(":");
|
|
7076
|
-
if (configParts.length
|
|
7077
|
-
else {
|
|
7830
|
+
if (configParts.length === 2) {
|
|
7078
7831
|
const [expectedKey, configKey] = configParts;
|
|
7079
7832
|
let targetIndex;
|
|
7080
7833
|
if (expectedKey === "__expected") targetIndex = 0;
|
|
@@ -7100,7 +7853,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7100
7853
|
}
|
|
7101
7854
|
}
|
|
7102
7855
|
assertionConfigs[targetIndex][configKey] = parsedValue;
|
|
7103
|
-
}
|
|
7856
|
+
} else logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
|
|
7104
7857
|
} else vars[key] = value;
|
|
7105
7858
|
}
|
|
7106
7859
|
for (let i = 0; i < asserts.length; i++) {
|
|
@@ -7229,14 +7982,14 @@ async function parseXlsxFile(filePath) {
|
|
|
7229
7982
|
const sheetName = typeof sheetOption === "number" ? sheetNames[sheetOption - 1] : sheetOption;
|
|
7230
7983
|
const rows = await readXlsxFile(actualFilePath, { sheet: sheetOption });
|
|
7231
7984
|
if (rows.length === 0) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
|
|
7232
|
-
const headers = rows[0].map((cell) => cell
|
|
7985
|
+
const headers = rows[0].map((cell) => cell == null ? "" : String(cell));
|
|
7233
7986
|
if (headers.length === 0 || headers.every((h) => h === "")) throw new Error(`Sheet "${sheetName}" has no valid column headers`);
|
|
7234
7987
|
if (rows.length === 1) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
|
|
7235
7988
|
const data = rows.slice(1).map((row) => {
|
|
7236
7989
|
const obj = {};
|
|
7237
7990
|
headers.forEach((header, index) => {
|
|
7238
7991
|
const cellValue = row[index];
|
|
7239
|
-
obj[header] = cellValue
|
|
7992
|
+
obj[header] = cellValue == null ? "" : String(cellValue);
|
|
7240
7993
|
});
|
|
7241
7994
|
return obj;
|
|
7242
7995
|
});
|
|
@@ -11183,20 +11936,19 @@ function generateEvalSummary(params) {
|
|
|
11183
11936
|
}
|
|
11184
11937
|
}
|
|
11185
11938
|
lines.push("");
|
|
11186
|
-
const
|
|
11187
|
-
|
|
11188
|
-
|
|
11189
|
-
|
|
11190
|
-
|
|
11191
|
-
|
|
11192
|
-
|
|
11193
|
-
}
|
|
11194
|
-
const passedPart = successes > 0 ? `${chalk.green("✓")} ${chalk.green.bold(successes.toLocaleString())} passed` : `${chalk.gray.bold(successes.toLocaleString())} passed`;
|
|
11195
|
-
const failedPart = failures > 0 ? `${chalk.red("✗")} ${chalk.red.bold(failures.toLocaleString())} failed` : `${chalk.gray.bold(failures.toLocaleString())} failed`;
|
|
11939
|
+
const totalTests = successes + failures + errors;
|
|
11940
|
+
const formatResultPercentage = (count) => {
|
|
11941
|
+
const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
|
|
11942
|
+
return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
|
|
11943
|
+
};
|
|
11944
|
+
const formatResultLine = (count, label, icon, iconColor) => {
|
|
11945
|
+
return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.white.bold(count.toLocaleString())} ${chalk.white(label)} ${chalk.gray(`(${formatResultPercentage(count)})`)}`;
|
|
11946
|
+
};
|
|
11196
11947
|
const errorLabel = errors === 1 ? "error" : "errors";
|
|
11197
|
-
|
|
11198
|
-
|
|
11199
|
-
|
|
11948
|
+
lines.push(chalk.bold("Results:"));
|
|
11949
|
+
lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.green));
|
|
11950
|
+
lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.red));
|
|
11951
|
+
lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.red));
|
|
11200
11952
|
const durationDisplay = formatDuration(duration);
|
|
11201
11953
|
lines.push(chalk.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
|
|
11202
11954
|
lines.push("");
|
|
@@ -11530,7 +12282,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11530
12282
|
await checkCloudPermissions(config);
|
|
11531
12283
|
const options = {
|
|
11532
12284
|
...evaluateOptions,
|
|
11533
|
-
showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar
|
|
12285
|
+
showProgressBar: getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
|
|
11534
12286
|
repeat,
|
|
11535
12287
|
delay: !Number.isNaN(delay) && delay > 0 ? delay : void 0,
|
|
11536
12288
|
maxConcurrency,
|
|
@@ -11914,7 +12666,7 @@ async function doRedteamRun(options) {
|
|
|
11914
12666
|
redteamConfig = await doGenerateRedteam({
|
|
11915
12667
|
...passThroughOptions,
|
|
11916
12668
|
...options.liveRedteamConfig?.commandLineOptions || {},
|
|
11917
|
-
...maxConcurrency
|
|
12669
|
+
...maxConcurrency === void 0 ? {} : { maxConcurrency },
|
|
11918
12670
|
config: configPath,
|
|
11919
12671
|
output: redteamPath,
|
|
11920
12672
|
force: options.force,
|