promptfoo 0.121.1 → 0.121.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/src/{accounts-xrUGFA6n.js → accounts-B2XmGjty.js} +5 -5
- package/dist/src/{accounts-Bx-x3bmW.cjs → accounts-BPyfpSeU.cjs} +5 -5
- package/dist/src/{accounts-CMqkzrVf.js → accounts-CFLK3mnD.js} +6 -6
- package/dist/src/{accounts-BgNJDBE6.js → accounts-Xatc0RYb.js} +5 -5
- package/dist/src/{agentic-utils-BKIN5PKu.js → agentic-utils-36epdqwB.js} +3 -3
- package/dist/src/{cometapi-DkXrKi5z.js → agentic-utils-D8yXo5Lm.js} +4 -61
- package/dist/src/{cometapi-vY6aDZgo.cjs → agentic-utils-DAVsChuB.cjs} +24 -62
- package/dist/src/agentic-utils-DIYAAYE7.js +153 -0
- package/dist/src/{agents-C-dDThPK.js → agents-BBVJCIYr.js} +226 -13
- package/dist/src/{agents-CErsqg5U.cjs → agents-BBWxKSM0.cjs} +7 -7
- package/dist/src/{agents-Dy2YpZpa.js → agents-Bqgfdokm.js} +227 -14
- package/dist/src/{agents-B0f4HICh.cjs → agents-CAYbM7qD.cjs} +226 -13
- package/dist/src/{agents-CVIn-Utx.js → agents-CLQ-P15P.js} +7 -7
- package/dist/src/{agents-DeH4Gu94.js → agents-CgBniSlI.js} +8 -8
- package/dist/src/{agents-CXknwsFX.js → agents-DSSTV4bv.js} +226 -13
- package/dist/src/{agents-aF4-T121.js → agents-wg3ohknq.js} +7 -7
- package/dist/src/{aimlapi-tg0Gkcvr.cjs → aimlapi-Bv8Fmc-b.cjs} +14 -14
- package/dist/src/{aimlapi-BNfTBexL.js → aimlapi-BwGC1TtS.js} +13 -13
- package/dist/src/{aimlapi-BAGZDo5G.js → aimlapi-DaC3qZ-o.js} +14 -14
- package/dist/src/{aimlapi-DHRKlBEA.js → aimlapi-MgSLdvy7.js} +13 -13
- package/dist/src/app/assets/index-B6l9CVVb.js +439 -0
- package/dist/src/app/assets/index-DyZ0Ep37.css +1 -0
- package/dist/src/app/assets/sync-CStkzc6u.js +4 -0
- package/dist/src/app/assets/vendor-charts-BnDWwBlI.js +36 -0
- package/dist/src/app/assets/vendor-markdown-Bz7N-ca6.js +29 -0
- package/dist/src/app/index.html +4 -4
- package/dist/src/{audio-tf_NBjlC.js → audio-Bn44pQxv.js} +4 -4
- package/dist/src/{audio-CHQ4r-RV.js → audio-DDA5WHdx.js} +4 -4
- package/dist/src/{audio-BWeaWovU.cjs → audio-DVFjQ67_.cjs} +4 -4
- package/dist/src/{audio-BRODU0UK.js → audio-DjU9GswO.js} +5 -5
- package/dist/src/{base-DBtwl2FR.cjs → base-BboXIF_0.cjs} +3 -3
- package/dist/src/{base-B4QJRyFS.js → base-CKjwebIH.js} +3 -3
- package/dist/src/{base-B0tcrnq_.js → base-CqzQ4K8j.js} +3 -3
- package/dist/src/{base-fEDN28WM.js → base-Cz2ZC_iA.js} +3 -3
- package/dist/src/{blobs-BAU-dXan.js → blobs-B1JriOyi.js} +3 -3
- package/dist/src/{blobs-qTYm-1PY.js → blobs-BUWmKWzo.js} +3 -3
- package/dist/src/{blobs-DvS-O6be.cjs → blobs-C6j0bvFz.cjs} +3 -3
- package/dist/src/{blobs-Bpg5rH6i.js → blobs-DXTl6J3H.js} +3 -3
- package/dist/src/{cache-COish3-W.cjs → cache-C5yFZ4gC.cjs} +75 -58
- package/dist/src/{cache-8XhNqPKW.js → cache-CaT5tPgo.js} +75 -58
- package/dist/src/cache-CyCanoMu.js +6 -0
- package/dist/src/{cache-CG0SlR1d.js → cache-DSqR6ezl.js} +75 -58
- package/dist/src/cache-Df_QFDNu.cjs +5 -0
- package/dist/src/{cache-D3eqDYGU.js → cache-HP0NP4k3.js} +75 -58
- package/dist/src/{chat-DHMH-N64.js → chat-B-52XYI1.js} +12 -12
- package/dist/src/{chat-BKm79wib.js → chat-B0iaWhoh.js} +16 -14
- package/dist/src/{chat-DxysjBvt.js → chat-BE0qTA8e.js} +13 -13
- package/dist/src/{chat-CRWNNq73.js → chat-BEwdgGEg.js} +16 -14
- package/dist/src/{chat-2K608PeQ.cjs → chat-BtIKkLKx.cjs} +13 -13
- package/dist/src/{chat-DaqekjFr.cjs → chat-CM8qWR3_.cjs} +17 -15
- package/dist/src/{chat-CM_kyI8B.js → chat-DK1U-eZ-.js} +12 -12
- package/dist/src/{chat-CznLWr_D.js → chat-pxmiVpWe.js} +16 -14
- package/dist/src/{chatkit-65VXf5SR.js → chatkit-BYGQlHlV.js} +4 -4
- package/dist/src/{chatkit-DKyPi1Gs.cjs → chatkit-Cx174XI3.cjs} +4 -4
- package/dist/src/{chatkit-BxFvW8KY.js → chatkit-_8eJqKcD.js} +4 -4
- package/dist/src/{chatkit-Be-Q-a9F.js → chatkit-a2D6mY6s.js} +4 -4
- package/dist/src/{claude-agent-sdk-CJH22shf.cjs → claude-agent-sdk-8ddRp1L2.cjs} +35 -17
- package/dist/src/{claude-agent-sdk-Dy5lT-Tx.js → claude-agent-sdk-Bq5EArsX.js} +33 -15
- package/dist/src/{claude-agent-sdk-BLTu0WBO.js → claude-agent-sdk-CMjh4LFH.js} +33 -15
- package/dist/src/{claude-agent-sdk-D6_k9FKA.js → claude-agent-sdk-HgbFioFw.js} +33 -15
- package/dist/src/cloud-DE3t1-ZI.js +4 -0
- package/dist/src/{cloud-Bc9526yV.js → cloud-z8KZpUoa.js} +3 -3
- package/dist/src/{cloudflare-ai-CWWJCRim.js → cloudflare-ai-BGyXlpXJ.js} +13 -13
- package/dist/src/{cloudflare-ai-C9r2sRhw.js → cloudflare-ai-Bbp26N0L.js} +13 -13
- package/dist/src/{cloudflare-ai-ClWSdor4.cjs → cloudflare-ai-C62x6MQG.cjs} +14 -14
- package/dist/src/{cloudflare-ai-ICsOuD-z.js → cloudflare-ai-DdKP9TKT.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6xFc5pa.js → cloudflare-gateway-BwAaUgeW.js} +14 -14
- package/dist/src/{cloudflare-gateway-D6O7AlYb.js → cloudflare-gateway-D-e9i1Sn.js} +15 -15
- package/dist/src/{cloudflare-gateway-pXGHxJ47.js → cloudflare-gateway-DXhtXDRb.js} +15 -163
- package/dist/src/{cloudflare-gateway-C2_-KG5o.cjs → cloudflare-gateway-Dx36ftqF.cjs} +15 -15
- package/dist/src/{codex-sdk-DUwKWezN.js → codex-sdk-BQEw16R_.js} +180 -11
- package/dist/src/{codex-sdk-C6UMlxwV.js → codex-sdk-C_07GuVS.js} +180 -11
- package/dist/src/{codex-sdk-GGAw0qbD.js → codex-sdk-DE5G18dx.js} +180 -11
- package/dist/src/{codex-sdk-fAO0c3yA.cjs → codex-sdk-ZLKfDjqP.cjs} +181 -12
- package/dist/src/cometapi-BDyV-NNm.js +62 -0
- package/dist/src/cometapi-C3hOlM7-.cjs +62 -0
- package/dist/src/{cometapi-Bbjp5V4x.js → cometapi-hhL4TAh3.js} +14 -14
- package/dist/src/{cometapi-BasUi7-_.js → cometapi-sp7sJpBD.js} +15 -15
- package/dist/src/{completion-C_P3ypkJ.js → completion-BCimtq-h.js} +6 -6
- package/dist/src/{completion-6Mx_iXxK.js → completion-DCjv7RZ3.js} +6 -6
- package/dist/src/{completion-CDOouNzq.cjs → completion-DlXUhj5c.cjs} +6 -6
- package/dist/src/{completion-C5rtR_9P.js → completion-DoYy49ti.js} +6 -6
- package/dist/src/{createHash-CfZSc0b4.cjs → createHash-BYwImsYv.cjs} +2 -2
- package/dist/src/{docker-BwsKwxFs.cjs → docker-Cqj2-QVi.cjs} +14 -14
- package/dist/src/{docker-CZnqU1XV.js → docker-CxCkwMzc.js} +13 -13
- package/dist/src/{docker-DzxyDPIj.js → docker-DpguQj-w.js} +14 -14
- package/dist/src/{docker-5KcG-_86.js → docker-FeBni2dw.js} +13 -13
- package/dist/src/{esm-C03C-mv3.js → esm-7UIl0pPM.js} +2 -2
- package/dist/src/{esm-Cd1AjG1D.js → esm-CKWP3u_P.js} +3 -3
- package/dist/src/{esm-CnNt7sI4.cjs → esm-CipptfDu.cjs} +2 -2
- package/dist/src/{esm-CaIwzWR5.js → esm-SUNIX1x3.js} +3 -3
- package/dist/src/eval-7aEqoMs3.js +15 -0
- package/dist/src/{eval-DmFyWU7i.js → eval-BTqTn7lb.js} +10 -10
- package/dist/src/{evalResult-CDQiuUuf.js → evalResult-BkIhRdTe.js} +7 -7
- package/dist/src/evalResult-CYNHkk5A.js +12 -0
- package/dist/src/evalResult-CuvJeNiM.js +10 -0
- package/dist/src/{evalResult-CTG2AHOS.js → evalResult-DUDShQrm.js} +7 -7
- package/dist/src/{evalResult-Dap2CekP.cjs → evalResult-DpARzUCb.cjs} +7 -7
- package/dist/src/evalResult-tGdilrWt.cjs +10 -0
- package/dist/src/evaluator-BBUqRhz1.js +36 -0
- package/dist/src/{evaluator-DPFRbFIL.js → evaluator-BcvOGaam.js} +833 -79
- package/dist/src/{extractor-YMU_Gvt8.js → extractor-C8XwivI9.js} +6 -6
- package/dist/src/{extractor-CFG6bcWJ.js → extractor-CAZ2G3Kh.js} +6 -6
- package/dist/src/{extractor-DX36oYEv.cjs → extractor-DG3sSfXE.cjs} +6 -6
- package/dist/src/{extractor-M67RUtg6.js → extractor-D_wd8jxt.js} +6 -6
- package/dist/src/{fetch-4M3YRaqL.js → fetch-BiYv2BZc.js} +3 -3
- package/dist/src/{fetch-BxUk8odA.cjs → fetch-BnR9wSnm.cjs} +3 -3
- package/dist/src/{fetch-60Gzydls.js → fetch-CVAtKnI3.js} +3 -3
- package/dist/src/{fetch-BMv0O527.js → fetch-DoVRJZhJ.js} +4 -4
- package/dist/src/fetch-UWU706qb.js +5 -0
- package/dist/src/{genaiTracer-DN4dQywX.cjs → genaiTracer-BfxrvSUb.cjs} +2 -2
- package/dist/src/{graders-DOXycdlG.cjs → graders-BElhu9ZY.cjs} +126 -55
- package/dist/src/{graders-R9rYUM0d.js → graders-BXAJ0sbS.js} +120 -55
- package/dist/src/graders-BxfEguVY.js +32 -0
- package/dist/src/graders-CzVMbEnv.js +34 -0
- package/dist/src/{graders-CpdqD9PI.js → graders-DG7mhg-b.js} +120 -55
- package/dist/src/graders-DjCXfj0l.cjs +32 -0
- package/dist/src/{graders-CHO8EPM4.js → graders-RjHF8VfG.js} +120 -55
- package/dist/src/graders-kHzIWOKu.js +32 -0
- package/dist/src/{image-DTedmQPg.cjs → image--F58eEIn.cjs} +6 -6
- package/dist/src/{image-DJEvKveK.js → image-6WQXK8m8.js} +4 -4
- package/dist/src/{image-pAX56tPG.js → image-B8b6f36E.js} +6 -6
- package/dist/src/{image-BmEZqVmk.js → image-CoxZp9PZ.js} +6 -6
- package/dist/src/{image-gvmivTEe.js → image-DO0RYnjH.js} +5 -5
- package/dist/src/{image-CBBVXWuT.js → image-PoF6DN3x.js} +6 -6
- package/dist/src/{image-CDLQOcqT.cjs → image-fza3zuKs.cjs} +4 -4
- package/dist/src/{image-tL5hIOFh.js → image-xNbw5ph2.js} +4 -4
- package/dist/src/index.cjs +863 -110
- package/dist/src/index.d.cts +833 -60
- package/dist/src/index.d.ts +833 -60
- package/dist/src/index.js +860 -108
- package/dist/src/{interactiveCheck-BgLZUIt3.js → interactiveCheck-BnMYOjMu.js} +2 -2
- package/dist/src/{knowledgeBase-CoU-UQBg.js → knowledgeBase-Bi7CmDbx.js} +7 -7
- package/dist/src/{knowledgeBase-CLJybhnF.js → knowledgeBase-Ce3ofVan.js} +8 -8
- package/dist/src/{knowledgeBase-DjWPVqSb.js → knowledgeBase-DFRXPZl_.js} +7 -7
- package/dist/src/{knowledgeBase-wkxuRFhA.cjs → knowledgeBase-DqrLX8fy.cjs} +7 -7
- package/dist/src/{litellm-B9Hysuri.js → litellm-Bo2gQXpo.js} +16 -15
- package/dist/src/{litellm-ePxtr9F1.js → litellm-CKiAxnoM.js} +15 -14
- package/dist/src/{litellm-NYpQ8RQu.cjs → litellm-CnHI69aj.cjs} +16 -15
- package/dist/src/{litellm-CTfa0hqi.js → litellm-Tc294Jhj.js} +15 -14
- package/dist/src/{logger-KkObSCzq.js → logger-BcJBzSSA.js} +10 -14
- package/dist/src/{logger-DLcq4dWf.js → logger-BnkjG2jt.js} +10 -14
- package/dist/src/{logger-Cp1GPUjj.cjs → logger-D5iKBpu_.cjs} +27 -13
- package/dist/src/{logger-CT3IKMKA.js → logger-DO8_zM18.js} +10 -14
- package/dist/src/{luma-ray-BW9IRGIc.js → luma-ray-0ehMPt5N.js} +10 -10
- package/dist/src/{luma-ray-BE2mOt6N.js → luma-ray-C9q8rdQe.js} +9 -9
- package/dist/src/{luma-ray-Cm1KZBhs.js → luma-ray-DP0QA9qn.js} +9 -9
- package/dist/src/{luma-ray-B0GGNRc1.cjs → luma-ray-m9Ku2meV.cjs} +9 -9
- package/dist/src/main.js +69 -71
- package/dist/src/{messages-1x9atZmP.js → messages-DJNo37Ko.js} +14 -9
- package/dist/src/{messages-BLbWdsyt.js → messages-Dy9QecMs.js} +14 -9
- package/dist/src/{messages-1JrJs91T.cjs → messages-HJsyEh4o.cjs} +15 -10
- package/dist/src/{messages-D8EA0oDc.js → messages-biC_ex-p.js} +14 -9
- package/dist/src/{modelslab-C1OLRmVX.js → modelslab-B5J-ZM5c.js} +9 -9
- package/dist/src/{modelslab-CqXBy3U8.js → modelslab-BI458moT.js} +10 -10
- package/dist/src/{modelslab-X5-4LroM.js → modelslab-BTOT8FUO.js} +9 -9
- package/dist/src/{modelslab-DcOSFwKh.cjs → modelslab-IQbNg-r7.cjs} +9 -9
- package/dist/src/{nova-reel-DihqLeol.js → nova-reel-BZ9y-Y5s.js} +9 -9
- package/dist/src/{nova-reel-D9xfaMBs.cjs → nova-reel-CE5etkv9.cjs} +9 -9
- package/dist/src/{nova-reel-D2ZkOSyr.js → nova-reel-DEeQlnOJ.js} +10 -10
- package/dist/src/{nova-reel-BgS1ZWuK.js → nova-reel-Xw1SXLpg.js} +9 -9
- package/dist/src/{nova-sonic-Q3BOJeig.js → nova-sonic-DWswpN1E.js} +7 -7
- package/dist/src/{nova-sonic-DezhVUYT.js → nova-sonic-DXTLpi-r.js} +6 -6
- package/dist/src/{nova-sonic-DVu3mMIy.cjs → nova-sonic-N0yCm0vb.cjs} +6 -6
- package/dist/src/{nova-sonic-P-CdUMlV.js → nova-sonic-Ogqf-csn.js} +6 -6
- package/dist/src/{openai-DhbB7eWK.js → openai-BMcwgD5C.js} +2 -2
- package/dist/src/{openai-j-sE2O7r.js → openai-BcB5KlTk.js} +2 -2
- package/dist/src/{openai-Cuif0GEt.cjs → openai-CoxGAQwn.cjs} +2 -2
- package/dist/src/{openai-DElQ-fPX.js → openai-D6wITiVn.js} +2 -2
- package/dist/src/{openclaw-Bv1DINsX.js → openclaw-0Sv7AK3O.js} +172 -109
- package/dist/src/{openclaw-DAfWQn-o.cjs → openclaw-CXxbKgDH.cjs} +174 -110
- package/dist/src/{openclaw-BiSZPL7J.js → openclaw-D1FSCps-.js} +172 -109
- package/dist/src/{openclaw-D1D_ej1z.js → openclaw-D2ENvu7a.js} +173 -110
- package/dist/src/{opencode-sdk-D95s6SnR.js → opencode-sdk-C71Z0ehR.js} +13 -13
- package/dist/src/{opencode-sdk-DxUPkLT7.js → opencode-sdk-CHCs7dEb.js} +12 -12
- package/dist/src/{opencode-sdk-C7m-wRfI.js → opencode-sdk-DDxj4QqH.js} +12 -12
- package/dist/src/{opencode-sdk-CfaLN8PY.cjs → opencode-sdk-WWJhnbKr.cjs} +16 -16
- package/dist/src/{otlpReceiver-g3ByGaXs.js → otlpReceiver-C9KlUtxh.js} +6 -6
- package/dist/src/{otlpReceiver--AIRW_S4.js → otlpReceiver-CZL48YfC.js} +6 -6
- package/dist/src/{otlpReceiver-Bn5wGB1v.js → otlpReceiver-CavGAA6k.js} +6 -6
- package/dist/src/{otlpReceiver-Diec4cln.cjs → otlpReceiver-DHKqJlsz.cjs} +6 -6
- package/dist/src/{providerRegistry-B0RUOLI_.js → providerRegistry-B9lh-_tx.js} +2 -2
- package/dist/src/{providerRegistry-Civky8Ar.cjs → providerRegistry-BTDgfV5h.cjs} +2 -2
- package/dist/src/{providerRegistry-CD8MEar9.js → providerRegistry-BkzVH5Ba.js} +2 -2
- package/dist/src/{providerRegistry-DM8rZYol.js → providerRegistry-CUWki5mQ.js} +2 -2
- package/dist/src/providers-BSLEaIQG.js +32 -0
- package/dist/src/{providers-CFu-TZl-.cjs → providers-CScd1wN6.cjs} +733 -464
- package/dist/src/{providers-CFLy1_ji.js → providers-Ch6Mr0gn.js} +795 -526
- package/dist/src/{providers-BKRJTjBz.js → providers-Cn73d5sr.js} +795 -526
- package/dist/src/providers-D-FnDg8k.cjs +31 -0
- package/dist/src/providers-DEYiFVAo.js +30 -0
- package/dist/src/{providers-B3HvufyI.js → providers-DvddrgxL.js} +795 -526
- package/dist/src/providers-sS2WI8YD.js +30 -0
- package/dist/src/{pythonUtils-D6fwaDSg.js → pythonUtils-Bzwbgpbg.js} +3 -3
- package/dist/src/{pythonUtils-D5nxkQ0P.js → pythonUtils-Cpo0Ez1p.js} +3 -3
- package/dist/src/{pythonUtils-CTU3Y3lw.cjs → pythonUtils-dAVigVK-.cjs} +3 -3
- package/dist/src/{pythonUtils-C3py6GC1.js → pythonUtils-wIqk7zAf.js} +3 -3
- package/dist/src/{quiverai-CI6gYJVI.js → quiverai-BeofbLVc.js} +4 -4
- package/dist/src/{quiverai-MHSxbmmZ.js → quiverai-CCQn73lq.js} +5 -5
- package/dist/src/{quiverai-CLkWkyZc.cjs → quiverai-CcUhPIBg.cjs} +4 -4
- package/dist/src/{quiverai-C2jVwbH1.js → quiverai-DVSEqJiq.js} +4 -4
- package/dist/src/{render-Drod8m7K.js → render-BHl6QVq9.js} +3 -3
- package/dist/src/{responses-CGw0DCzh.js → responses-BKP_WYis.js} +16 -12
- package/dist/src/{responses-BKqJmhhc.js → responses-CQb1Tj69.js} +16 -12
- package/dist/src/{responses-jxdehPkC.js → responses-CgNyTPsY.js} +16 -12
- package/dist/src/{responses-tD4Bd4dc.cjs → responses-mo0KQDbu.cjs} +16 -12
- package/dist/src/rubyUtils-B1HXG4ej.cjs +4 -0
- package/dist/src/{rubyUtils-DhCAlxZr.cjs → rubyUtils-CGeUtCfW.cjs} +3 -3
- package/dist/src/{rubyUtils-Boc4HZzX.js → rubyUtils-CiVfln3g.js} +3 -3
- package/dist/src/{rubyUtils-BcuGX77l.js → rubyUtils-DECSbsfY.js} +3 -3
- package/dist/src/{rubyUtils-BUVePouc.js → rubyUtils-PgU-gHmx.js} +3 -3
- package/dist/src/rubyUtils-Rt6pKA96.js +5 -0
- package/dist/src/{sagemaker-BK4Zb993.js → sagemaker-CVv8W7so.js} +17 -17
- package/dist/src/{sagemaker-D2Q1c-sD.js → sagemaker-CqeASYE5.js} +17 -17
- package/dist/src/{sagemaker-BfiWTmvn.js → sagemaker-MUbD5V3v.js} +18 -18
- package/dist/src/{sagemaker-CcQHM1jV.cjs → sagemaker-jiw1wQa-.cjs} +17 -17
- package/dist/src/{scanner-J8CA3LsV.js → scanner-DVDeUz1r.js} +10 -10
- package/dist/src/server/index.js +864 -112
- package/dist/src/server-B0Xh1Gx-.js +7 -0
- package/dist/src/{server-B0PPuDw-.cjs → server-BtoCXeXI.cjs} +4 -4
- package/dist/src/{server-BC7XJFgr.js → server-CP9qKM40.js} +4 -4
- package/dist/src/{server-OAs3nBRT.js → server-Cns05F1j.js} +5 -5
- package/dist/src/server-DJTKu9IR.cjs +5 -0
- package/dist/src/{server-DbFphssR.js → server-DZ9MtCn0.js} +6 -6
- package/dist/src/{signal-BOTbd53Z.js → signal-C3ZTsUgi.js} +3 -3
- package/dist/src/{slack-DXMKtA-f.js → slack-2sdpGzbt.js} +2 -2
- package/dist/src/{slack-BmVAVGaK.cjs → slack-94iG3T0s.cjs} +2 -2
- package/dist/src/{slack-DCUPTzS2.js → slack-BR0HtO3K.js} +2 -2
- package/dist/src/{slack-DOdy_kyv.js → slack-DCEV-vWP.js} +2 -2
- package/dist/src/store-C5u6MgC8.js +6 -0
- package/dist/src/{store-BSc-TF2w.cjs → store-CLyU7AtI.cjs} +17 -5
- package/dist/src/store-CNHk-De4.cjs +5 -0
- package/dist/src/{store-DQLEjuEO.js → store-Cj258DgL.js} +17 -5
- package/dist/src/{store-D1tv90v3.js → store-P8OKm19S.js} +17 -5
- package/dist/src/{store-Ub2vaGJ1.js → store-VB0GP46K.js} +17 -5
- package/dist/src/{tables-xKANLRBD.js → tables-BEIFz2tM.js} +3 -3
- package/dist/src/{tables-C7K-XKWp.cjs → tables-BdZQEpRz.cjs} +3 -3
- package/dist/src/{tables-D36WTqKX.js → tables-DmzvLbeZ.js} +3 -3
- package/dist/src/{tables-5EvT_Bwn.js → tables-kC7R5kiK.js} +3 -3
- package/dist/src/{telemetry-C2YDkUQH.js → telemetry-BnH5VJAU.js} +4 -4
- package/dist/src/{telemetry-C15ziL8u.js → telemetry-BugWqKiu.js} +4 -4
- package/dist/src/{telemetry-DMb2Mpfm.js → telemetry-DPXLd7UE.js} +4 -4
- package/dist/src/telemetry-Yig0Tino.js +7 -0
- package/dist/src/telemetry-p8Pwqm1i.cjs +5 -0
- package/dist/src/{telemetry-CbrnxHp_.cjs → telemetry-re627Lre.cjs} +4 -4
- package/dist/src/{transcription-CL78qbOU.cjs → transcription-BvtsrzRG.cjs} +13 -13
- package/dist/src/{transcription-DAtxHhAM.js → transcription-CaMivnjG.js} +13 -13
- package/dist/src/{transcription-QHh3AH6Z.js → transcription-DOMMTu01.js} +14 -14
- package/dist/src/{transcription-LNZTNUUL.js → transcription-Hb3VnC4M.js} +13 -13
- package/dist/src/{transform-DOcQeLld.cjs → transform-0BwoBsvO.cjs} +19 -5
- package/dist/src/{transform-DGxXocjk.js → transform-B2-jIv68.js} +8 -6
- package/dist/src/{transform-DECvGmzp.js → transform-BqPkNPYm.js} +4 -4
- package/dist/src/{transform-aa6tmVpZ.js → transform-BzK09Q_9.js} +4 -4
- package/dist/src/transform-ChNIpHz7.js +6 -0
- package/dist/src/{transform-Cgi24fJ7.js → transform-DrleutM3.js} +8 -6
- package/dist/src/{transform-DGLazrMm.js → transform-DyDAwEpE.js} +8 -6
- package/dist/src/transform-PtQ6rAE3.cjs +5 -0
- package/dist/src/{transform-CzK1Q0zl.cjs → transform-ZrG2dvlo.cjs} +4 -4
- package/dist/src/{transform-DilY9wbS.js → transform-ljLYHEPh.js} +4 -4
- package/dist/src/{transformersAvailability-CEVM2GNQ.js → transformersAvailability-BGkzavwb.js} +1 -1
- package/dist/src/{transformersAvailability-CwayUSlh.cjs → transformersAvailability-DKoRtQLy.cjs} +1 -1
- package/dist/src/{types-CH3Ge2sE.js → types-CIhFeUC4.js} +45 -11
- package/dist/src/{types-CN_TZ2GJ.js → types-Cd3ygw8W.js} +45 -11
- package/dist/src/{types-LJ0r3wbR.cjs → types-D8cGDZbL.cjs} +46 -12
- package/dist/src/{types-CLKiCBW3.js → types-q8GXGF65.js} +45 -11
- package/dist/src/{util-CchiqXh_.cjs → util--9u9UVCt.cjs} +3 -3
- package/dist/src/{util-5cB-L7U3.js → util-BLvy9qfE.js} +7 -11
- package/dist/src/{util-YT5HPZaS.js → util-Bm3E9jpK.js} +7 -11
- package/dist/src/{util-6-GqIvzS.js → util-BtoGs5Cb.js} +18 -4
- package/dist/src/{util-Db0a0AFH.cjs → util-CFj4YKIn.cjs} +18 -4
- package/dist/src/{util-Dlz_Wvgm.js → util-CMMkIxfU.js} +7 -11
- package/dist/src/{util-Betm42rL.js → util-CgDCK4KI.js} +18 -4
- package/dist/src/{util-Yz-1aEhW.cjs → util-CuLo2pMR.cjs} +7 -11
- package/dist/src/{util-C-PPYSMq.js → util-DM2rTn_6.js} +18 -4
- package/dist/src/{util-B7T3SiBS.js → util-DMFeUvLz.js} +3 -3
- package/dist/src/{util-ZZH-3QZz.js → util-DbVG-yZU.js} +3 -3
- package/dist/src/{util-DaWTWKBK.js → util-vNmDL5DT.js} +3 -3
- package/dist/src/{utils-XiOAgly5.js → utils-CFxO9KGo.js} +2 -2
- package/dist/src/{utils-f2-Moju7.js → utils-DEuL4VNB.js} +2 -2
- package/dist/src/{utils-Cz9qXqII.cjs → utils-DKw8mrgr.cjs} +3 -3
- package/dist/src/{utils-dLokC-eR.js → utils-DOjD4dTC.js} +2 -2
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -38
- package/dist/src/app/assets/index-BFCZg7hQ.js +0 -439
- package/dist/src/app/assets/index-NCn4eVBv.css +0 -1
- package/dist/src/app/assets/sync-9qqYcY-B.js +0 -4
- package/dist/src/app/assets/vendor-charts-CCl15Imd.js +0 -36
- package/dist/src/app/assets/vendor-markdown-0tekx3KX.js +0 -29
- package/dist/src/cache-Bbn1Nyrd.cjs +0 -5
- package/dist/src/cache-BwsMSda7.js +0 -6
- package/dist/src/cloud-DmE0EwsY.js +0 -4
- package/dist/src/eval-17JizQIv.js +0 -15
- package/dist/src/evalResult-Cqj8pldJ.js +0 -12
- package/dist/src/evalResult-DvcJAWJU.cjs +0 -10
- package/dist/src/evalResult-Hftn-S_i.js +0 -10
- package/dist/src/evaluator-B2CFNt-P.js +0 -36
- package/dist/src/fetch-KV5kNASw.js +0 -5
- package/dist/src/graders-Bu0H9nXi.js +0 -32
- package/dist/src/graders-Cfhkvx-e.js +0 -34
- package/dist/src/graders-DClJVpGP.cjs +0 -32
- package/dist/src/graders-DcnJsrMO.js +0 -32
- package/dist/src/providers-C1rOSHiR.js +0 -32
- package/dist/src/providers-CxmDwEFf.cjs +0 -31
- package/dist/src/providers-Dodakqr0.js +0 -30
- package/dist/src/providers-GIQ2TcsA.js +0 -30
- package/dist/src/rubyUtils-BUHu6PhO.js +0 -5
- package/dist/src/rubyUtils-CP42kMvq.cjs +0 -4
- package/dist/src/server-B1vi21hA.js +0 -7
- package/dist/src/server-Cm9Kai_h.cjs +0 -5
- package/dist/src/store-BNmZ1KAz.cjs +0 -5
- package/dist/src/store-BltJg2cd.js +0 -6
- package/dist/src/telemetry-5BCRNBbe.cjs +0 -5
- package/dist/src/telemetry-D4W5hboe.js +0 -7
- package/dist/src/transform-DTGDnAzW.js +0 -6
- package/dist/src/transform-m3qNw4KP.cjs +0 -5
package/dist/src/index.cjs
CHANGED
|
@@ -2,43 +2,43 @@ Object.defineProperties(exports, {
|
|
|
2
2
|
__esModule: { value: true },
|
|
3
3
|
[Symbol.toStringTag]: { value: "Module" }
|
|
4
4
|
});
|
|
5
|
-
const require_logger = require("./logger-
|
|
5
|
+
const require_logger = require("./logger-D5iKBpu_.cjs");
|
|
6
6
|
const require_invariant = require("./invariant-kfQ8Bu82.cjs");
|
|
7
|
-
const require_esm = require("./esm-
|
|
8
|
-
const require_pythonUtils = require("./pythonUtils-
|
|
7
|
+
const require_esm = require("./esm-CipptfDu.cjs");
|
|
8
|
+
const require_pythonUtils = require("./pythonUtils-dAVigVK-.cjs");
|
|
9
9
|
const require_fileExtensions = require("./fileExtensions-bYh77CN8.cjs");
|
|
10
|
-
const require_transform = require("./transform-
|
|
11
|
-
const require_graders = require("./graders-
|
|
12
|
-
const require_types = require("./types-
|
|
13
|
-
const require_util = require("./util-
|
|
14
|
-
const require_fetch = require("./fetch-
|
|
15
|
-
const require_cache = require("./cache-
|
|
16
|
-
const require_providers = require("./providers-
|
|
17
|
-
const require_utils = require("./utils-
|
|
18
|
-
const require_createHash = require("./createHash-
|
|
19
|
-
require("./genaiTracer-
|
|
20
|
-
const require_chat = require("./chat-
|
|
10
|
+
const require_transform = require("./transform-ZrG2dvlo.cjs");
|
|
11
|
+
const require_graders = require("./graders-BElhu9ZY.cjs");
|
|
12
|
+
const require_types = require("./types-D8cGDZbL.cjs");
|
|
13
|
+
const require_util = require("./util-CuLo2pMR.cjs");
|
|
14
|
+
const require_fetch = require("./fetch-BnR9wSnm.cjs");
|
|
15
|
+
const require_cache = require("./cache-C5yFZ4gC.cjs");
|
|
16
|
+
const require_providers = require("./providers-CScd1wN6.cjs");
|
|
17
|
+
const require_utils = require("./utils-DKw8mrgr.cjs");
|
|
18
|
+
const require_createHash = require("./createHash-BYwImsYv.cjs");
|
|
19
|
+
require("./genaiTracer-BfxrvSUb.cjs");
|
|
20
|
+
const require_chat = require("./chat-CM8qWR3_.cjs");
|
|
21
21
|
const require_tokenUsageUtils = require("./tokenUsageUtils-bVa1ga6f.cjs");
|
|
22
|
-
const require_transform$1 = require("./transform-
|
|
23
|
-
require("./messages-
|
|
24
|
-
require("./util
|
|
25
|
-
require("./responses-
|
|
26
|
-
require("./openai-
|
|
27
|
-
const require_util$2 = require("./util-
|
|
28
|
-
require("./completion-
|
|
29
|
-
const require_accounts = require("./accounts-
|
|
30
|
-
const require_server = require("./server-
|
|
31
|
-
const require_blobs = require("./blobs-
|
|
32
|
-
const require_tables = require("./tables-
|
|
33
|
-
const require_extractor = require("./extractor-
|
|
34
|
-
const require_telemetry = require("./telemetry-
|
|
22
|
+
const require_transform$1 = require("./transform-0BwoBsvO.cjs");
|
|
23
|
+
require("./messages-HJsyEh4o.cjs");
|
|
24
|
+
require("./util--9u9UVCt.cjs");
|
|
25
|
+
require("./responses-mo0KQDbu.cjs");
|
|
26
|
+
require("./openai-CoxGAQwn.cjs");
|
|
27
|
+
const require_util$2 = require("./util-CFj4YKIn.cjs");
|
|
28
|
+
require("./completion-DlXUhj5c.cjs");
|
|
29
|
+
const require_accounts = require("./accounts-BPyfpSeU.cjs");
|
|
30
|
+
const require_server = require("./server-BtoCXeXI.cjs");
|
|
31
|
+
const require_blobs = require("./blobs-C6j0bvFz.cjs");
|
|
32
|
+
const require_tables = require("./tables-BdZQEpRz.cjs");
|
|
33
|
+
const require_extractor = require("./extractor-DG3sSfXE.cjs");
|
|
34
|
+
const require_telemetry = require("./telemetry-re627Lre.cjs");
|
|
35
35
|
const require_text = require("./text-CW1cyrwj.cjs");
|
|
36
|
-
const require_store = require("./store-
|
|
37
|
-
require("./base-
|
|
38
|
-
require("./image
|
|
39
|
-
const require_providerRegistry = require("./providerRegistry-
|
|
40
|
-
const require_rubyUtils = require("./rubyUtils-
|
|
41
|
-
const require_evalResult = require("./evalResult-
|
|
36
|
+
const require_store = require("./store-CLyU7AtI.cjs");
|
|
37
|
+
require("./base-BboXIF_0.cjs");
|
|
38
|
+
require("./image--F58eEIn.cjs");
|
|
39
|
+
const require_providerRegistry = require("./providerRegistry-BTDgfV5h.cjs");
|
|
40
|
+
const require_rubyUtils = require("./rubyUtils-CGeUtCfW.cjs");
|
|
41
|
+
const require_evalResult = require("./evalResult-DpARzUCb.cjs");
|
|
42
42
|
let fs = require("fs");
|
|
43
43
|
fs = require_logger.__toESM(fs);
|
|
44
44
|
let path = require("path");
|
|
@@ -68,6 +68,8 @@ crypto$1 = require_logger.__toESM(crypto$1);
|
|
|
68
68
|
let _opentelemetry_api = require("@opentelemetry/api");
|
|
69
69
|
let _inquirer_input = require("@inquirer/input");
|
|
70
70
|
_inquirer_input = require_logger.__toESM(_inquirer_input);
|
|
71
|
+
let readline = require("readline");
|
|
72
|
+
readline = require_logger.__toESM(readline);
|
|
71
73
|
let drizzle_orm = require("drizzle-orm");
|
|
72
74
|
let cli_progress = require("cli-progress");
|
|
73
75
|
cli_progress = require_logger.__toESM(cli_progress);
|
|
@@ -75,6 +77,7 @@ let jsdom = require("jsdom");
|
|
|
75
77
|
let fastest_levenshtein = require("fastest-levenshtein");
|
|
76
78
|
let js_rouge = require("js-rouge");
|
|
77
79
|
js_rouge = require_logger.__toESM(js_rouge);
|
|
80
|
+
let node_util = require("node:util");
|
|
78
81
|
require("debounce");
|
|
79
82
|
let _opentelemetry_core = require("@opentelemetry/core");
|
|
80
83
|
let _opentelemetry_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http");
|
|
@@ -307,7 +310,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
307
310
|
require_telemetry.telemetry.record("feature_used", { feature: "tracing" });
|
|
308
311
|
try {
|
|
309
312
|
require_logger.logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
310
|
-
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
313
|
+
const { startOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DHKqJlsz.cjs"));
|
|
311
314
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
312
315
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
313
316
|
require_logger.logger.debug(`[EvaluatorTracing] Starting OTLP receiver on ${host}:${port}`);
|
|
@@ -330,7 +333,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
330
333
|
async function stopOtlpReceiverIfNeeded() {
|
|
331
334
|
if (otlpReceiverStarted) try {
|
|
332
335
|
require_logger.logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
333
|
-
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-
|
|
336
|
+
const { stopOTLPReceiver } = await Promise.resolve().then(() => require("./otlpReceiver-DHKqJlsz.cjs"));
|
|
334
337
|
await stopOTLPReceiver();
|
|
335
338
|
otlpReceiverStarted = false;
|
|
336
339
|
require_logger.logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -365,7 +368,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
365
368
|
}
|
|
366
369
|
if (!tracingEnabled) return null;
|
|
367
370
|
require_logger.logger.debug("[EvaluatorTracing] Importing trace store");
|
|
368
|
-
const { getTraceStore } = await Promise.resolve().then(() => require("./store-
|
|
371
|
+
const { getTraceStore } = await Promise.resolve().then(() => require("./store-CNHk-De4.cjs"));
|
|
369
372
|
const traceStore = getTraceStore();
|
|
370
373
|
const traceId = generateTraceId();
|
|
371
374
|
const spanId = generateSpanId();
|
|
@@ -1398,7 +1401,7 @@ const handleJavascript = async ({ assertion, renderedValue, valueFromScript, ass
|
|
|
1398
1401
|
pass = result !== inverse;
|
|
1399
1402
|
score = pass ? 1 : 0;
|
|
1400
1403
|
} else if (typeof result === "number") {
|
|
1401
|
-
pass = assertion.threshold
|
|
1404
|
+
pass = assertion.threshold === void 0 ? result > 0 : result >= assertion.threshold;
|
|
1402
1405
|
score = result;
|
|
1403
1406
|
} else if (typeof result === "object") return result;
|
|
1404
1407
|
else throw new Error("Custom function must return a boolean or number");
|
|
@@ -1431,7 +1434,7 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1431
1434
|
} catch {
|
|
1432
1435
|
pass = inverse;
|
|
1433
1436
|
}
|
|
1434
|
-
if (
|
|
1437
|
+
if (parsedJson !== void 0 && renderedValue) {
|
|
1435
1438
|
let validate;
|
|
1436
1439
|
if (typeof renderedValue === "string") if (renderedValue.startsWith("file://")) {
|
|
1437
1440
|
const schema = valueFromScript;
|
|
@@ -1443,11 +1446,12 @@ function handleIsJson({ outputString, renderedValue, inverse, valueFromScript, a
|
|
|
1443
1446
|
}
|
|
1444
1447
|
else if (typeof renderedValue === "object") validate = require_logger.getAjv().compile(renderedValue);
|
|
1445
1448
|
else throw new Error("is-json assertion must have a string or object value");
|
|
1446
|
-
|
|
1449
|
+
const valid = validate(parsedJson);
|
|
1450
|
+
pass = inverse ? !valid : valid;
|
|
1447
1451
|
if (!pass) return {
|
|
1448
1452
|
pass,
|
|
1449
1453
|
score: 0,
|
|
1450
|
-
reason: `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`,
|
|
1454
|
+
reason: inverse ? "Output is JSON that conforms to the provided schema" : `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`,
|
|
1451
1455
|
assertion
|
|
1452
1456
|
};
|
|
1453
1457
|
}
|
|
@@ -1474,9 +1478,12 @@ function handleContainsJson({ assertion, renderedValue, outputString, inverse, v
|
|
|
1474
1478
|
}
|
|
1475
1479
|
else if (typeof renderedValue === "object") validate = require_logger.getAjv().compile(renderedValue);
|
|
1476
1480
|
else throw new Error("contains-json assertion must have a string or object value");
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1481
|
+
const valid = validate(jsonObject);
|
|
1482
|
+
pass = inverse ? !valid : valid;
|
|
1483
|
+
if (valid) {
|
|
1484
|
+
if (inverse) errorMessage = "Output contains JSON conforming to the provided schema";
|
|
1485
|
+
break;
|
|
1486
|
+
} else errorMessage = `JSON does not conform to the provided schema. Errors: ${require_logger.getAjv().errorsText(validate.errors)}`;
|
|
1480
1487
|
}
|
|
1481
1488
|
return {
|
|
1482
1489
|
pass,
|
|
@@ -1660,7 +1667,7 @@ function handlePerplexity({ logProbs, assertion }) {
|
|
|
1660
1667
|
if (!logProbs || logProbs.length === 0) throw new Error("Perplexity assertion does not support providers that do not return logProbs");
|
|
1661
1668
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
1662
1669
|
const perplexity = Math.exp(-avgLogProb);
|
|
1663
|
-
const pass = assertion.threshold
|
|
1670
|
+
const pass = assertion.threshold === void 0 ? true : perplexity <= assertion.threshold;
|
|
1664
1671
|
return {
|
|
1665
1672
|
pass,
|
|
1666
1673
|
score: pass ? 1 : 0,
|
|
@@ -1672,7 +1679,7 @@ function handlePerplexityScore({ logProbs, assertion }) {
|
|
|
1672
1679
|
if (!logProbs || logProbs.length === 0) throw new Error("perplexity-score assertion does not support providers that do not return logProbs");
|
|
1673
1680
|
const avgLogProb = logProbs.reduce((acc, logProb) => acc + logProb, 0) / logProbs.length;
|
|
1674
1681
|
const perplexityNorm = 1 / (1 + Math.exp(-avgLogProb));
|
|
1675
|
-
const pass = assertion.threshold
|
|
1682
|
+
const pass = assertion.threshold === void 0 ? true : perplexityNorm >= assertion.threshold;
|
|
1676
1683
|
return {
|
|
1677
1684
|
pass,
|
|
1678
1685
|
score: perplexityNorm,
|
|
@@ -1787,7 +1794,7 @@ ${isMultiline ? renderedValue.split("\n").map((line) => `${indentStyle}${line}`)
|
|
|
1787
1794
|
} else {
|
|
1788
1795
|
score = Number.parseFloat(String(result));
|
|
1789
1796
|
if (Number.isNaN(score)) throw new Error(`Python assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
1790
|
-
pass = assertion.threshold
|
|
1797
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
1791
1798
|
}
|
|
1792
1799
|
} catch (err) {
|
|
1793
1800
|
return {
|
|
@@ -2048,7 +2055,7 @@ end
|
|
|
2048
2055
|
} else {
|
|
2049
2056
|
score = Number.parseFloat(String(result));
|
|
2050
2057
|
if (Number.isNaN(score)) throw new Error(`Ruby assertion must return a boolean, number, or {pass, score, reason} object. Instead got:\n${result}`);
|
|
2051
|
-
pass = assertion.threshold
|
|
2058
|
+
pass = assertion.threshold === void 0 ? score > 0 : score >= assertion.threshold;
|
|
2052
2059
|
}
|
|
2053
2060
|
} catch (err) {
|
|
2054
2061
|
return {
|
|
@@ -2119,6 +2126,127 @@ const handleSimilar = async ({ assertion, renderedValue, outputString, inverse,
|
|
|
2119
2126
|
};
|
|
2120
2127
|
};
|
|
2121
2128
|
//#endregion
|
|
2129
|
+
//#region src/assertions/traceUtils.ts
|
|
2130
|
+
/**
|
|
2131
|
+
* Shared utilities for trace assertions
|
|
2132
|
+
*/
|
|
2133
|
+
/**
 * Test a span name against a glob-style pattern, ignoring case.
 *
 * Regex metacharacters in the pattern are escaped first; afterwards `*` is
 * turned into `.*` and `?` into `.`, and the result is anchored at both ends.
 *
 * @param spanName - The span name to test
 * @param pattern - Glob pattern using `*` and `?` wildcards
 * @returns true when the whole span name matches the pattern
 */
function matchesPattern(spanName, pattern) {
	// Escape first so that literal dots, parens, etc. cannot act as regex syntax.
	const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
	const source = escaped.replace(/\*/g, ".*").replace(/\?/g, ".");
	const globRegex = new RegExp(`^${source}$`, "i");
	return globRegex.test(spanName);
}
|
|
2145
|
+
//#endregion
|
|
2146
|
+
//#region src/assertions/skill.ts
|
|
2147
|
+
/**
 * Read the skill calls recorded at `providerResponse.metadata.skillCalls`.
 *
 * @param params - Assertion params; only `providerResponse` is read here
 * @returns The entries that are objects with a string `name`; an empty array
 *   when the metadata field is missing or not an array
 */
function getSkillCalls(params) {
	const candidates = params.providerResponse?.metadata?.skillCalls;
	const skillCalls = [];
	if (!Array.isArray(candidates)) {
		return skillCalls;
	}
	for (const candidate of candidates) {
		const isNamedObject = Boolean(candidate) && typeof candidate === "object" && typeof candidate.name === "string";
		if (isNamedObject) {
			skillCalls.push(candidate);
		}
	}
	return skillCalls;
}
|
|
2152
|
+
/**
 * Decide whether a recorded skill call satisfies a matcher.
 *
 * @param skillCall - Skill call whose `name` is compared
 * @param matcher - May carry an exact `name` and/or a glob `pattern`; every
 *   criterion that is set must hold
 * @returns true when no configured criterion rejects the skill call
 */
function matchesSkill(skillCall, matcher) {
	const { name, pattern } = matcher;
	const nameAccepted = !name || skillCall.name === name;
	if (!nameAccepted) {
		return false;
	}
	// The glob is only consulted when a pattern was configured.
	return !pattern || matchesPattern(skillCall.name, pattern);
}
|
|
2157
|
+
/**
 * Render a skill call for assertion messages.
 *
 * @param skillCall - Object with `name` and optional `source` / `path`
 * @returns `name (source, path)` using whichever details are truthy, or just
 *   the name when no detail text results
 */
function formatSkillCall(skillCall) {
	const parts = [];
	for (const part of [skillCall.source, skillCall.path]) {
		if (part) {
			parts.push(part);
		}
	}
	const details = parts.join(", ");
	if (!details) {
		return skillCall.name;
	}
	return `${skillCall.name} (${details})`;
}
|
|
2161
|
+
/**
 * Normalize the value of a skill-used assertion.
 *
 * Accepted shapes:
 *  - a non-blank string            -> `{ kind: "list", matchers: [{ name }] }`
 *  - a non-empty array of non-blank strings -> `{ kind: "list", matchers }`
 *  - an object with `name` and/or `pattern`, plus optional `min` / `max`
 *                                  -> `{ kind: "count", matcher }`
 *
 * @param value - Raw assertion value
 * @returns The normalized expectation
 * @throws Error when the value has none of the shapes above, when the object
 *   has neither name nor pattern, when a present min/max is not a
 *   non-negative integer, or when max is smaller than min
 */
function resolveSkillMatchers(value) {
	const trimmedOrUndefined = (text) => (typeof text === "string" ? text.trim() : void 0);
	const isNonBlankString = (item) => typeof item === "string" && item.trim() !== "";
	const assertCount = (field, count) => {
		// Number.isInteger already rejects non-numbers, NaN and infinities.
		const isValid = Number.isInteger(count) && count >= 0;
		if (!isValid) throw new Error(`skill-used assertion object ${field} must be a finite non-negative integer`);
	};
	const unsupported = () => new Error("skill-used assertion must have a string, string array, or object value");

	if (isNonBlankString(value)) {
		return { kind: "list", matchers: [{ name: value.trim() }] };
	}
	if (Array.isArray(value)) {
		const allNamed = value.length > 0 && value.every((item) => isNonBlankString(item));
		if (!allNamed) throw unsupported();
		return { kind: "list", matchers: value.map((item) => ({ name: item.trim() })) };
	}
	if (value === null || typeof value !== "object") {
		throw unsupported();
	}

	const name = trimmedOrUndefined(value.name);
	const pattern = trimmedOrUndefined(value.pattern);
	if (!name && !pattern) throw new Error("skill-used assertion object must include a name or pattern property");
	// A bound is validated whenever its key is present, even if its value is not a number.
	for (const field of ["min", "max"]) {
		if (field in value) assertCount(field, value[field]);
	}
	const min = typeof value.min === "number" ? value.min : void 0;
	const max = typeof value.max === "number" ? value.max : void 0;
	if (min !== void 0 && max !== void 0 && max < min) {
		throw new Error("skill-used assertion object max must be greater than or equal to min");
	}
	return {
		kind: "count",
		matcher: { max, min, name, pattern }
	};
}
|
|
2195
|
+
/**
 * Grade a list-style skill assertion.
 *
 * Normal mode passes when every expected skill was observed; inverse mode
 * passes when none of them was.
 *
 * @param params - Assertion params; `inverse` and `assertion` are read
 * @param skillCalls - Recorded skill calls
 * @param actualSkills - Display labels of the recorded skill calls
 * @param expected - `{ matchers }` list expectation
 * @returns Grading result with `pass`, `score` (1 or 0), `reason`, `assertion`
 */
function handleListSkillAssertion(params, skillCalls, actualSkills, expected) {
	// Split the expected matchers in one pass, keeping their original order.
	const matched = [];
	const missing = [];
	for (const matcher of expected.matchers) {
		const wasUsed = skillCalls.some((skillCall) => matchesSkill(skillCall, matcher));
		(wasUsed ? matched : missing).push(matcher);
	}
	const joinNames = (matchers) => matchers.map((matcher) => matcher.name).join(", ");
	const expectedNames = joinNames(expected.matchers);
	const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";

	let pass;
	let reason;
	if (params.inverse) {
		pass = matched.length === 0;
		reason = pass
			? `Forbidden skill(s) were not used: ${expectedNames}`
			: `Forbidden skill(s) were used: ${joinNames(matched)}. Actual skills: ${actualSummary}`;
	} else {
		pass = missing.length === 0;
		reason = pass
			? `Observed required skill(s): ${expectedNames}. Actual skills: ${actualSummary}`
			: `Missing required skill(s): ${joinNames(missing)}. Actual skills: ${actualSummary}`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
}
|
|
2212
|
+
/**
 * Grade an object-style skill assertion by counting matching skill calls.
 *
 * Without an explicit `min`, the lower bound is 0 when `max` is given and 1
 * otherwise. Inverse mode passes only when nothing matched, and rejects any
 * count bound other than `max: 0`.
 *
 * @param params - Assertion params; `inverse` and `assertion` are read
 * @param skillCalls - Recorded skill calls
 * @param actualSkills - Display labels of the recorded skill calls
 * @param matcher - `{ name?, pattern?, min?, max? }`
 * @returns Grading result with `pass`, `score` (1 or 0), `reason`, `assertion`
 * @throws Error in inverse mode when unsupported count bounds are present
 */
function handleCountSkillAssertion(params, skillCalls, actualSkills, matcher) {
	const { max } = matcher;
	const minWasGiven = matcher.min !== void 0;
	const maxWasGiven = max !== void 0;
	const min = matcher.min ?? (maxWasGiven ? 0 : 1);
	const matches = skillCalls.filter((skillCall) => matchesSkill(skillCall, matcher));
	const count = matches.length;
	const label = matcher.pattern || matcher.name || "*";
	const describeMatches = () => matches.map(formatSkillCall).join(", ");
	const toResult = (pass, reason) => ({
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	});

	if (params.inverse) {
		const hasUnsupportedBounds = minWasGiven || (maxWasGiven && max !== 0);
		if (hasUnsupportedBounds) throw new Error("not-skill-used object assertions only support name/pattern with no count bounds, or max: 0");
		if (count > 0) {
			return toResult(false, `Forbidden skill "${label}" was used ${count} time(s). Matches: ${describeMatches()}`);
		}
		const actualSummary = actualSkills.length > 0 ? actualSkills.join(", ") : "(none)";
		return toResult(true, `Forbidden skill "${label}" was not used. Actual skills: ${actualSummary}`);
	}

	const withinBounds = count >= min && (!maxWasGiven || count <= max);
	const expectation = maxWasGiven ? ` (expected ${min}-${max})` : ` (expected at least ${min})`;
	const matchSuffix = count > 0 ? `. Matches: ${describeMatches()}` : "";
	return toResult(withinBounds, `Matched skill "${label}" ${count} time(s)${expectation}${matchSuffix}`);
}
|
|
2242
|
+
/**
 * Entry point for the skill-used assertion.
 *
 * Collects the recorded skill calls, normalizes the expected value (the
 * rendered value when present, otherwise the raw assertion value) and hands
 * off to the list or count grader.
 *
 * @param params - Assertion params
 * @returns The grading result produced by the selected grader
 */
function handleSkillUsed(params) {
	const calls = getSkillCalls(params);
	const callLabels = calls.map((call) => formatSkillCall(call));
	const expectation = resolveSkillMatchers(params.renderedValue ?? params.assertion.value);
	return expectation.kind === "list"
		? handleListSkillAssertion(params, calls, callLabels, expectation)
		: handleCountSkillAssertion(params, calls, callLabels, expectation.matcher);
}
|
|
2249
|
+
//#endregion
|
|
2122
2250
|
//#region src/assertions/sql.ts
|
|
2123
2251
|
const handleIsSql = async ({ assertion, renderedValue, outputString, inverse }) => {
|
|
2124
2252
|
let pass = false;
|
|
@@ -2351,23 +2479,6 @@ const handleToolCallF1 = ({ assertion, output, renderedValue, inverse }) => {
|
|
|
2351
2479
|
};
|
|
2352
2480
|
};
|
|
2353
2481
|
//#endregion
|
|
2354
|
-
//#region src/assertions/traceUtils.ts
|
|
2355
|
-
/**
|
|
2356
|
-
* Shared utilities for trace assertions
|
|
2357
|
-
*/
|
|
2358
|
-
/**
 * Test a span name against a glob-style pattern, ignoring case.
 *
 * Regex metacharacters in the pattern are escaped first; afterwards `*` is
 * turned into `.*` and `?` into `.`, and the result is anchored at both ends.
 *
 * @param spanName - The span name to test
 * @param pattern - Glob pattern using `*` and `?` wildcards
 * @returns true when the whole span name matches the pattern
 */
function matchesPattern(spanName, pattern) {
	// Escape first so that literal dots, parens, etc. cannot act as regex syntax.
	const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
	const source = escaped.replace(/\*/g, ".*").replace(/\?/g, ".");
	const globRegex = new RegExp(`^${source}$`, "i");
	return globRegex.test(spanName);
}
|
|
2370
|
-
//#endregion
|
|
2371
2482
|
//#region src/assertions/traceErrorSpans.ts
|
|
2372
2483
|
function isErrorSpan(span) {
|
|
2373
2484
|
if (span.statusCode && span.statusCode >= 400) return true;
|
|
@@ -2536,6 +2647,524 @@ const handleTraceSpanDuration = ({ assertion, assertionValueContext }) => {
|
|
|
2536
2647
|
};
|
|
2537
2648
|
};
|
|
2538
2649
|
//#endregion
|
|
2650
|
+
//#region src/assertions/trajectoryUtils.ts
|
|
2651
|
+
// Attribute keys checked, in this order, for a span's tool name.
const TOOL_ATTRIBUTE_KEYS = [
	"tool.name", "tool_name", "tool",
	"function.name", "function_name",
	"gen_ai.tool.name",
	"codex.mcp.tool",
	"agent.tool", "agent.tool_name", "agent.toolName"
];
// Attribute keys checked, in this order, for a tool call's arguments.
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
	"tool.arguments", "tool.args", "tool.input",
	"tool_arguments", "tool_args", "tool_input",
	"function.arguments", "function.args", "function.input",
	"function_arguments", "function_args",
	"gen_ai.tool.arguments", "gen_ai.tool.args", "gen_ai.tool.input",
	"gen_ai.tool.call.arguments", "gen_ai.tool.call.args",
	"agent.tool.arguments", "agent.tool.args", "agent.tool.input",
	"codex.mcp.arguments", "codex.mcp.args", "codex.mcp.input",
	"arguments", "args", "input"
];
// Attribute keys checked, in this order, for an executed command.
const COMMAND_ATTRIBUTE_KEYS = ["codex.command", "command", "command.name", "command_name"];
// Attribute keys that explicitly carry a search query.
const SEARCH_ATTRIBUTE_KEYS = ["codex.search.query", "search.query", "search_query"];
// Generic key that only counts as a search query on search-like spans.
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
// Span names containing search/find/lookup/retrieve/retrieval as a delimited word.
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
// Judge summaries keep this many leading and trailing steps; the cap is their sum.
const JUDGE_SUMMARY_HEAD_STEPS = 12;
const JUDGE_SUMMARY_TAIL_STEPS = 12;
const MAX_JUDGE_SUMMARY_STEPS = JUDGE_SUMMARY_HEAD_STEPS + JUDGE_SUMMARY_TAIL_STEPS;
|
|
2706
|
+
/**
 * Return the first non-blank string attribute among the given keys.
 *
 * @param attributes - Attribute bag to read from
 * @param keys - Keys to try, in priority order
 * @returns The trimmed value of the first key holding a non-blank string, or
 *   undefined when no key qualifies
 */
function getStringAttribute(attributes, keys) {
	for (const key of keys) {
		const candidate = attributes[key];
		if (typeof candidate !== "string") continue;
		const trimmed = candidate.trim();
		if (trimmed) return trimmed;
	}
	return void 0;
}
|
|
2712
|
+
/**
 * Normalize a raw attribute value that may hold structured data.
 *
 * Strings are trimmed and parsed as JSON when possible, otherwise returned
 * as trimmed text. Numbers, booleans and objects pass through unchanged.
 *
 * @param value - Raw attribute value
 * @returns The normalized value, or undefined for null/undefined, blank
 *   strings and any other value type
 */
function normalizeStructuredAttribute(value) {
	if (value === null) return void 0;
	switch (typeof value) {
		case "string": {
			const text = value.trim();
			if (text === "") return void 0;
			try {
				return JSON.parse(text);
			} catch {
				// Not JSON: keep the trimmed text as the value.
				return text;
			}
		}
		case "number":
		case "boolean":
		case "object":
			return value;
		default:
			return void 0;
	}
}
|
|
2725
|
+
/**
 * Compare two optional status objects by `code` and `message`.
 *
 * @param left - Status object, or null/undefined
 * @param right - Status object, or null/undefined
 * @returns true when both fields are strictly equal; a missing status reads
 *   as undefined fields
 */
function hasSameStatus(left, right) {
	const comparedFields = ["code", "message"];
	return comparedFields.every((field) => left?.[field] === right?.[field]);
}
|
|
2728
|
+
/**
 * Heuristically decide whether a span represents a search operation.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true when the name matches SEARCH_SPAN_NAME_PATTERN or starts with
 *   "search ", or when any attribute key other than "query" contains
 *   search/lookup/retrieve/retrieval as a dot- or underscore-delimited segment
 */
function isSearchLikeSpan(span) {
	const attributeKeys = Object.keys(span.attributes || {});
	const nameLooksLikeSearch = SEARCH_SPAN_NAME_PATTERN.test(span.name) || span.name.startsWith("search ");
	if (nameLooksLikeSearch) {
		return true;
	}
	const searchKeyPattern = /(^|[._])(search|lookup|retriev(?:e|al))($|[._])/i;
	for (const key of attributeKeys) {
		// A bare "query" key is too generic to count as evidence on its own.
		if (key !== "query" && searchKeyPattern.test(key)) return true;
	}
	return false;
}
|
|
2733
|
+
/**
 * Build the status summary for a trajectory step.
 *
 * @param step - Step with optional `statusCode` / `statusMessage`
 * @returns `{ code, message? }`, or undefined when the status code is
 *   undefined or 0; `message` is only included when truthy
 */
function getTrajectoryStepStatus(step) {
	const { statusCode, statusMessage } = step;
	if (statusCode === void 0 || statusCode === 0) {
		return void 0;
	}
	const status = { code: statusCode };
	if (statusMessage) {
		status.message = statusMessage;
	}
	return status;
}
|
|
2740
|
+
/**
 * Extract the executable (first whitespace-delimited token) of a command.
 *
 * @param command - Command line string
 * @returns The first token, or undefined for a blank command
 */
function getCommandExecutable(command) {
	const [executable] = command.trim().split(/\s+/);
	return executable || void 0;
}
|
|
2743
|
+
/**
 * Derive a tool name from a span.
 *
 * Lookup order:
 *  1. the keys in TOOL_ATTRIBUTE_KEYS;
 *  2. the first non-blank string attribute whose key looks like a tool or
 *     function name, or contains a delimited `tool` segment and does not
 *     mention result/output;
 *  3. for span names starting with "mcp ", the text after the last "/".
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns The trimmed tool name, or undefined when none is found
 */
function extractToolName(span) {
	const attributes = span.attributes || {};
	const knownKeyMatch = getStringAttribute(attributes, TOOL_ATTRIBUTE_KEYS);
	if (knownKeyMatch) {
		return knownKeyMatch;
	}

	const nameLikeKey = /tool.?name|function.?name/i;
	const toolSegmentKey = /(^|[._])tool($|[._])/i;
	const resultLikeKey = /result|output/i;
	for (const [key, value] of Object.entries(attributes)) {
		const text = typeof value === "string" ? value.trim() : "";
		if (!text) continue;
		const keyNamesTool = nameLikeKey.test(key) || (toolSegmentKey.test(key) && !resultLikeKey.test(key));
		if (keyNamesTool) return text;
	}

	if (!span.name.startsWith("mcp ")) {
		return void 0;
	}
	const slashIndex = span.name.lastIndexOf("/");
	const hasTextAfterSlash = slashIndex !== -1 && slashIndex < span.name.length - 1;
	return hasTextAfterSlash ? span.name.slice(slashIndex + 1).trim() : void 0;
}
|
|
2757
|
+
/**
 * Derive the arguments of a tool call from a span's attributes.
 *
 * The keys in TOOL_ARGUMENT_ATTRIBUTE_KEYS are tried first, in order. After
 * that, any attribute whose key has a delimited arguments/args/input segment
 * and does not mention result/output/error/status is considered.
 *
 * @param span - Span with optional `attributes`
 * @returns The first candidate that normalizes to a defined value, or
 *   undefined when there is none
 */
function extractToolArgs(span) {
	const attributes = span.attributes || {};
	const excludedKey = /result|output|error|status/i;
	const argumentLikeKey = /(^|[._])(arguments|args|input)($|[._])/i;

	const knownKeyValues = TOOL_ARGUMENT_ATTRIBUTE_KEYS.map((key) => attributes[key]);
	const heuristicValues = Object.entries(attributes)
		.filter(([key]) => !excludedKey.test(key) && argumentLikeKey.test(key))
		.map(([, rawValue]) => rawValue);

	// Known keys take priority over the heuristic scan.
	for (const rawValue of [...knownKeyValues, ...heuristicValues]) {
		const normalized = normalizeStructuredAttribute(rawValue);
		if (normalized !== void 0) return normalized;
	}
	return void 0;
}
|
|
2770
|
+
/**
 * Derive an executed command from a span.
 *
 * Lookup order: the keys in COMMAND_ATTRIBUTE_KEYS, then the first non-blank
 * string attribute whose key contains "command" but not output/result, then
 * the text after an "exec " span-name prefix.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns The trimmed command, or undefined when none is found
 */
function extractCommand(span) {
	const attributes = span.attributes || {};
	const knownKeyMatch = getStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
	if (knownKeyMatch) {
		return knownKeyMatch;
	}

	for (const [key, value] of Object.entries(attributes)) {
		const text = typeof value === "string" ? value.trim() : "";
		if (!text) continue;
		const keyNamesCommand = /command/i.test(key) && !/output|result/i.test(key);
		if (keyNamesCommand) return text;
	}

	const execPrefix = "exec ";
	return span.name.startsWith(execPrefix) ? span.name.slice(execPrefix.length).trim() : void 0;
}
|
|
2780
|
+
/**
 * Derive a search query from a span.
 *
 * Lookup order: the keys in SEARCH_ATTRIBUTE_KEYS, then the generic `query`
 * attribute (only on search-like spans), then the text after a "search "
 * span-name prefix with one leading and one trailing double quote removed.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns The query text, or undefined when none is found
 */
function extractSearchQuery(span) {
	const attributes = span.attributes || {};
	const explicitQuery = getStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
	if (explicitQuery) {
		return explicitQuery;
	}
	const genericQuery = getStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
	if (genericQuery && isSearchLikeSpan(span)) {
		return genericQuery;
	}
	const searchPrefix = "search ";
	if (!span.name.startsWith(searchPrefix)) {
		return void 0;
	}
	return span.name.slice(searchPrefix.length).replace(/^"|"$/g, "").trim();
}
|
|
2788
|
+
/**
 * Decide whether a span is a reasoning step.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true when `codex.item.type` is "reasoning", or the name is
 *   "reasoning" (any case) on its own or followed by `_` or whitespace
 */
function isReasoningSpan(span) {
	const itemType = (span.attributes || {})["codex.item.type"];
	if (itemType === "reasoning") {
		return true;
	}
	if (span.name === "reasoning") {
		return true;
	}
	return /^reasoning([_\s]|$)/i.test(span.name);
}
|
|
2792
|
+
/**
 * Decide whether a span is an agent message step.
 *
 * @param span - Span with `name` and optional `attributes`
 * @returns true when `codex.item.type` is "agent_message", or the name is
 *   exactly "agent response" or "send input"
 */
function isMessageSpan(span) {
	const itemType = (span.attributes || {})["codex.item.type"];
	if (itemType === "agent_message") {
		return true;
	}
	const messageSpanNames = ["agent response", "send input"];
	return messageSpanNames.some((messageName) => span.name === messageName);
}
|
|
2796
|
+
/**
 * Convert a trace's spans into an ordered list of trajectory steps.
 *
 * Spans are ordered by start time, then by end time (falling back to start
 * time when the end is missing), then by their original position. Each span
 * is classified by the first check that applies: tool, command, search,
 * reasoning, message, otherwise plain "span".
 *
 * @param trace - Trace whose `spans` are read; a missing list yields no steps
 * @returns One step per span with `aliases`, optional `args`, `attributes`,
 *   timing and status fields, `name`, `spanId`, `spanName` and `type`
 */
function extractTrajectorySteps(trace) {
	const compareSpans = (left, right) => {
		const orderings = [
			left.span.startTime - right.span.startTime,
			(left.span.endTime ?? left.span.startTime) - (right.span.endTime ?? right.span.startTime),
			left.index - right.index
		];
		for (const difference of orderings) {
			if (difference !== 0) return difference;
		}
		return 0;
	};

	const toStep = (span) => {
		const toolName = extractToolName(span);
		const command = extractCommand(span);
		const searchQuery = extractSearchQuery(span);
		// The span's own name is always the first alias.
		const aliases = new Set([span.name]);
		let type = "span";
		let name = span.name;
		let args;

		if (toolName) {
			type = "tool";
			name = toolName;
			aliases.add(toolName);
			args = extractToolArgs(span);
		} else if (command) {
			type = "command";
			name = command;
			aliases.add(command);
			const executable = getCommandExecutable(command);
			if (executable) aliases.add(executable);
		} else if (searchQuery) {
			type = "search";
			name = searchQuery;
			aliases.add(searchQuery);
		} else if (isReasoningSpan(span)) {
			type = "reasoning";
			aliases.add("reasoning");
		} else if (isMessageSpan(span)) {
			type = "message";
			aliases.add("message");
		}

		return {
			aliases: [...aliases],
			...(args === void 0 ? {} : { args }),
			attributes: span.attributes || {},
			endTime: span.endTime,
			name,
			spanId: span.spanId,
			spanName: span.name,
			startTime: span.startTime,
			statusCode: span.statusCode,
			statusMessage: span.statusMessage,
			type
		};
	};

	const indexedSpans = [...(trace.spans || [])].map((span, index) => ({ span, index }));
	indexedSpans.sort(compareSpans);
	return indexedSpans.map(({ span }) => toStep(span));
}
|
|
2853
|
+
/**
 * Normalize a trajectory matcher into object form.
 *
 * @param matcher - Either a pattern string or a matcher object
 * @param defaultType - Step type applied when the matcher has no truthy `type`
 * @returns A new matcher object; a string becomes `{ pattern }`, an object is
 *   shallow-copied
 */
function normalizeTrajectoryMatcher(matcher, defaultType) {
	const normalized = typeof matcher === "string" ? { pattern: matcher } : { ...matcher };
	// An explicit truthy type on the matcher always wins over the default.
	if (!matcher.type && defaultType) {
		normalized.type = defaultType;
	}
	return normalized;
}
|
|
2863
|
+
/**
 * Decide whether a trajectory step satisfies a matcher.
 *
 * @param step - Step with `type` and `aliases`
 * @param matcher - Pattern string or matcher object (`type`, `pattern`, `name`)
 * @param defaultType - Type applied when the matcher does not set one
 * @returns false when a type constraint excludes the step; true when no
 *   pattern or name is given; otherwise whether any alias matches the glob
 */
function matchesTrajectoryStep(step, matcher, defaultType) {
	const normalized = normalizeTrajectoryMatcher(matcher, defaultType);
	const { type } = normalized;
	if (type) {
		const allowedTypes = Array.isArray(type) ? type : [type];
		if (!allowedTypes.includes(step.type)) {
			return false;
		}
	}
	// `pattern` takes precedence; `name` is treated as a glob as well.
	const glob = normalized.pattern || normalized.name;
	return glob ? step.aliases.some((alias) => matchesPattern(alias, glob)) : true;
}
|
|
2872
|
+
/**
 * Render a trajectory step as `type:name` for assertion messages.
 *
 * @param step - Step with `type` and `name`
 * @returns The formatted label
 */
function formatTrajectoryStep(step) {
	const { type, name } = step;
	return `${type}:${name}`;
}
|
|
2875
|
+
/**
 * Render tool arguments for assertion messages.
 *
 * @param args - Arguments value of any type
 * @returns "(none)" for undefined, the JSON text when serialization yields
 *   one, otherwise `String(args)`
 */
function formatTrajectoryArgs(args) {
	if (args === void 0) {
		return "(none)";
	}
	let serialized;
	try {
		serialized = JSON.stringify(args);
	} catch {
		// Serialization failed; fall back to String() below.
		serialized = void 0;
	}
	return serialized === void 0 ? String(args) : serialized;
}
|
|
2883
|
+
/**
 * Collapse runs of consecutive, equivalent judge-summary steps.
 *
 * Two neighbouring steps are equivalent when `type`, `name`, `spanName` and
 * status all agree. The first step of a run is kept and gets a
 * `collapsedCount` holding the run length; later steps of the run are dropped.
 *
 * The kept steps are shallow copies, so the caller's step objects are never
 * modified and compacting the same input twice gives the same counts.
 *
 * @param steps - Summary steps in trajectory order
 * @returns A new array of compacted step copies
 */
function compactJudgeTrajectorySteps(steps) {
	const compacted = [];
	for (const step of steps) {
		const previousStep = compacted[compacted.length - 1];
		const repeatsPrevious = previousStep
			&& previousStep.type === step.type
			&& previousStep.name === step.name
			&& previousStep.spanName === step.spanName
			&& hasSameStatus(previousStep.status, step.status);
		if (repeatsPrevious) {
			previousStep.collapsedCount = (previousStep.collapsedCount ?? 1) + 1;
			continue;
		}
		// Copy before storing: collapsedCount is written onto stored entries.
		compacted.push({ ...step });
	}
	return compacted;
}
|
|
2895
|
+
/**
 * Limit a judge summary to MAX_JUDGE_SUMMARY_STEPS steps.
 *
 * @param steps - Compacted summary steps
 * @returns The same array when it is within the limit; otherwise the first
 *   JUDGE_SUMMARY_HEAD_STEPS steps, an `{ omittedCount }` marker and the last
 *   JUDGE_SUMMARY_TAIL_STEPS steps
 */
function truncateJudgeTrajectorySteps(steps) {
	const overflow = steps.length - MAX_JUDGE_SUMMARY_STEPS;
	if (overflow <= 0) {
		return steps;
	}
	const head = steps.slice(0, JUDGE_SUMMARY_HEAD_STEPS);
	const tail = steps.slice(-JUDGE_SUMMARY_TAIL_STEPS);
	return [...head, { omittedCount: overflow }, ...tail];
}
|
|
2903
|
+
/**
 * Produce the JSON trajectory summary for a trace.
 *
 * Each step is reduced to its 1-based index, type and name, plus `spanName`
 * when it differs from the name and `status` when one is available. The
 * steps are then compacted and truncated.
 *
 * @param trace - Trace with `traceId` and spans
 * @returns Pretty-printed JSON with `traceId`, `stepCount`,
 *   `compactedStepCount` and `steps`
 */
function summarizeTrajectoryForJudge(trace) {
	const rawSteps = extractTrajectorySteps(trace).map((step, position) => {
		const summary = {
			index: position + 1,
			type: step.type,
			name: step.name
		};
		if (step.spanName !== step.name) {
			summary.spanName = step.spanName;
		}
		const status = getTrajectoryStepStatus(step);
		if (status) {
			summary.status = status;
		}
		return summary;
	});
	const compactedSteps = compactJudgeTrajectorySteps(rawSteps);
	const summary = {
		traceId: trace.traceId,
		stepCount: rawSteps.length,
		compactedStepCount: compactedSteps.length,
		steps: truncateJudgeTrajectorySteps(compactedSteps)
	};
	return JSON.stringify(summary, null, 2);
}
|
|
2920
|
+
//#endregion
|
|
2921
|
+
//#region src/assertions/trajectory.ts
|
|
2922
|
+
/**
 * Fetch the trace from the assertion context.
 *
 * @param params - Assertion params; reads `assertionValueContext.trace` and
 *   `baseType`
 * @returns The trace
 * @throws Error when the trace or its spans are missing
 */
function getTraceOrThrow(params) {
	const { trace } = params.assertionValueContext;
	if (trace?.spans) {
		return trace;
	}
	throw new Error(`No trace data available for ${params.baseType} assertion`);
}
|
|
2927
|
+
/**
 * Apply the `not-` inversion to a base result.
 *
 * @param pass - Base outcome
 * @param inverse - Truthy when the assertion is inverted
 * @returns The negated outcome when inverted, otherwise `pass` unchanged
 */
function applyInverse(pass, inverse) {
	if (inverse) {
		return !pass;
	}
	return pass;
}
|
|
2930
|
+
/**
 * Join step labels for assertion messages.
 *
 * @param stepLabels - Array of label strings
 * @returns The labels joined with ", ", or "(none)" for an empty array
 */
function formatStepList(stepLabels) {
	if (stepLabels.length > 0) {
		return stepLabels.join(", ");
	}
	return "(none)";
}
|
|
2933
|
+
/**
 * Ensure a trajectory matcher identifies a step by name or pattern.
 *
 * @param matcher - Normalized matcher
 * @param assertionType - Assertion name used in the error message
 * @param index - Zero-based step position; when omitted the message refers to
 *   the assertion "object"
 * @throws Error when the matcher has neither a pattern nor a name
 */
function requireNamedTrajectoryMatcher(matcher, assertionType, index) {
	const isNamed = Boolean(matcher.pattern || matcher.name);
	if (isNamed) {
		return;
	}
	let stepLabel = "object";
	if (index !== void 0) {
		stepLabel = `step ${index + 1}`;
	}
	throw new Error(`${assertionType} assertion ${stepLabel} must include a name or pattern property`);
}
|
|
2938
|
+
/**
 * Normalize the value of a trajectory:goal-success assertion.
 *
 * @param value - A non-blank string, or a non-array object with a non-blank
 *   string `goal`
 * @returns `{ goal }` with the goal text trimmed
 * @throws Error for any other value
 */
function resolveGoalSuccessValue(value) {
	let candidate;
	if (typeof value === "string") {
		candidate = value;
	} else if (value && typeof value === "object" && !Array.isArray(value)) {
		candidate = value.goal;
	}
	const goal = typeof candidate === "string" ? candidate.trim() : "";
	if (goal) {
		return { goal };
	}
	throw new Error("trajectory:goal-success assertion must have a string value or an object with a goal property");
}
|
|
2943
|
+
/**
 * Normalize the value of a trajectory:tool-used assertion.
 *
 * @param value - A string, an array of strings, or a matcher object with
 *   optional numeric `min` / `max`
 * @returns `{ kind: "list", matchers }` for strings and string arrays, or
 *   `{ kind: "count", matcher }` for objects; matchers default to type "tool"
 * @throws Error for any other value
 */
function resolveToolMatchers(value) {
	const asToolMatcher = (item) => normalizeTrajectoryMatcher(item, "tool");
	const numberOrUndefined = (candidate) => (typeof candidate === "number" ? candidate : void 0);
	const unsupported = () => new Error("trajectory:tool-used assertion must have a string, string array, or object value");

	if (typeof value === "string") {
		return { kind: "list", matchers: [asToolMatcher(value)] };
	}
	if (Array.isArray(value)) {
		const allStrings = value.every((item) => typeof item === "string");
		if (!allStrings) throw unsupported();
		return { kind: "list", matchers: value.map((item) => asToolMatcher(item)) };
	}
	if (value === null || typeof value !== "object") {
		throw unsupported();
	}
	return {
		kind: "count",
		matcher: {
			...asToolMatcher(value),
			// Non-numeric bounds are dropped rather than rejected.
			max: numberOrUndefined(value.max),
			min: numberOrUndefined(value.min)
		}
	};
}
|
|
2962
|
+
/**
 * Handler for the trajectory:tool-used assertion.
 *
 * List values require every expected tool to appear among the trace's tool
 * steps (inverse: none may appear). Object values count matching tool steps
 * against `min` (default 1) and optional `max`; inverse flips that outcome.
 *
 * @param params - Assertion params (trace context, value, `inverse`)
 * @returns Grading result with `pass`, `score` (1 or 0), `reason`, `assertion`
 * @throws Error when trace data is missing, the value is unsupported, the
 *   list is empty, or the object matcher has neither name nor pattern
 */
const handleTrajectoryToolUsed = (params) => {
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const expected = resolveToolMatchers(params.renderedValue ?? params.assertion.value);
	const labelOf = (matcher) => matcher.pattern || matcher.name || "*";
	const joinLabels = (matchers) => matchers.map((matcher) => labelOf(matcher)).join(", ");
	const toResult = (pass, reason) => ({
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	});

	if (expected.kind === "list") {
		if (expected.matchers.length === 0) throw new Error("trajectory:tool-used assertion requires at least one expected tool");
		// Split the expected matchers in one pass, keeping their original order.
		const matched = [];
		const missing = [];
		for (const matcher of expected.matchers) {
			const wasUsed = toolSteps.some((step) => matchesTrajectoryStep(step, matcher));
			(wasUsed ? matched : missing).push(matcher);
		}
		const actualTools = formatStepList(toolSteps.map((step) => formatTrajectoryStep(step)));
		const expectedTools = joinLabels(expected.matchers);

		if (params.inverse) {
			const nothingForbiddenUsed = matched.length === 0;
			return toResult(
				nothingForbiddenUsed,
				nothingForbiddenUsed
					? `Forbidden tool(s) were not used: ${expectedTools}`
					: `Forbidden tool(s) were used: ${joinLabels(matched)}. Actual tools: ${actualTools}`
			);
		}
		const allObserved = missing.length === 0;
		return toResult(
			allObserved,
			allObserved
				? `Observed required tool(s): ${expectedTools}. Actual tools: ${actualTools}`
				: `Missing required tool(s): ${joinLabels(missing)}. Actual tools: ${actualTools}`
		);
	}

	const { matcher } = expected;
	const min = matcher.min ?? 1;
	const { max } = matcher;
	if (!matcher.pattern && !matcher.name) throw new Error("trajectory:tool-used assertion object must include a name or pattern property");
	const matchingSteps = toolSteps.filter((step) => matchesTrajectoryStep(step, matcher));
	const count = matchingSteps.length;
	const basePass = count >= min && (max === void 0 || count <= max);
	const pass = applyInverse(basePass, params.inverse);
	const matcherLabel = labelOf(matcher);

	let reason;
	if (params.inverse) {
		reason = basePass
			? `Tool "${matcherLabel}" matched ${count} time(s), which violates the inverse assertion`
			: `Tool "${matcherLabel}" did not satisfy the forbidden match condition`;
	} else {
		const expectation = max === void 0 ? ` (expected at least ${min})` : ` (expected ${min}-${max})`;
		reason = `Matched tool "${matcherLabel}" ${count} time(s)${expectation}`;
		if (count > 0) {
			reason += `. Matches: ${matchingSteps.map((step) => formatTrajectoryStep(step)).join(", ")}`;
		}
	}
	return toResult(pass, reason);
};
|
|
3004
|
+
/**
 * Normalize the value of a trajectory:tool-sequence assertion.
 *
 * @param value - An array of steps, or an object with optional `mode` and
 *   `steps`
 * @returns `{ mode, steps }`; mode defaults to "in_order" and steps to an
 *   empty array
 * @throws Error when the value is neither an array nor an object
 */
function resolveSequenceValue(value) {
	const defaultMode = "in_order";
	if (Array.isArray(value)) {
		return { mode: defaultMode, steps: value };
	}
	if (value === null || typeof value !== "object") {
		throw new Error("trajectory:tool-sequence assertion must have an array or object value");
	}
	const { mode, steps } = value;
	return {
		mode: mode || defaultMode,
		steps: steps || []
	};
}
|
|
3018
|
+
/**
 * Check for a non-null, non-array object.
 *
 * @param value - Any value
 * @returns true when `typeof value` is "object" and it is neither null nor an
 *   array
 */
function isRecord(value) {
	if (value === null || Array.isArray(value)) {
		return false;
	}
	return typeof value === "object";
}
|
|
3021
|
+
/**
 * Partial (subset) comparison of actual tool arguments against expected ones.
 *
 * Expected arrays must match in length and element by element. Expected
 * records must have each of their keys present as an own property of the
 * actual record with a matching value; extra actual keys are ignored. Any
 * other expected value is compared with deep strict equality.
 *
 * @param actual - Observed arguments
 * @param expected - Expected arguments or fragment
 * @returns true when `actual` satisfies `expected`
 */
function matchesExpectedArgsPartial(actual, expected) {
	if (Array.isArray(expected)) {
		if (!Array.isArray(actual) || actual.length !== expected.length) {
			return false;
		}
		return expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
	}
	if (!isRecord(expected)) {
		return (0, node_util.isDeepStrictEqual)(actual, expected);
	}
	if (!isRecord(actual)) {
		return false;
	}
	for (const [key, expectedValue] of Object.entries(expected)) {
		const isPresent = Object.prototype.hasOwnProperty.call(actual, key);
		if (!isPresent || !matchesExpectedArgsPartial(actual[key], expectedValue)) {
			return false;
		}
	}
	return true;
}
|
|
3029
|
+
/**
 * Compare tool arguments using the requested mode.
 *
 * @param actual - Observed arguments
 * @param expected - Expected arguments
 * @param mode - "exact" for deep strict equality; any other value selects
 *   partial matching
 * @returns true when the arguments match under that mode
 */
function matchesToolArgs(actual, expected, mode) {
	return mode === "exact"
		? (0, node_util.isDeepStrictEqual)(actual, expected)
		: matchesExpectedArgsPartial(actual, expected);
}
|
|
3033
|
+
/**
 * Validate the `mode` of a trajectory:tool-args-match assertion.
 *
 * @param mode - undefined, "partial" or "exact"
 * @returns The mode, with undefined resolving to "partial"
 * @throws Error for any other value
 */
function resolveToolArgsMatchMode(mode) {
	switch (mode) {
		case void 0:
			return "partial";
		case "partial":
		case "exact":
			return mode;
		default:
			throw new Error("trajectory:tool-args-match assertion mode must be \"partial\" or \"exact\"");
	}
}
|
|
3038
|
+
/**
 * Normalize the value of a trajectory:tool-args-match assertion.
 *
 * @param value - Object naming a tool (`name` or `pattern`) and carrying the
 *   expected arguments under `args` (preferred when present as an own
 *   property) or `arguments`, plus an optional `mode`
 * @returns `{ matcher, expectedArgs, mode }`
 * @throws Error when the value is not a plain object, names no tool, has no
 *   expected arguments, or has an invalid mode
 */
function resolveToolArgsMatchValue(value) {
	const isObjectValue = Boolean(value) && typeof value === "object" && !Array.isArray(value);
	if (!isObjectValue) {
		throw new Error("trajectory:tool-args-match assertion must have an object value");
	}
	const matcher = normalizeTrajectoryMatcher(value, "tool");
	requireNamedTrajectoryMatcher(matcher, "trajectory:tool-args-match");

	const hasArgsKey = Object.prototype.hasOwnProperty.call(value, "args");
	const expectedArgs = hasArgsKey ? value.args : value.arguments;
	if (expectedArgs === void 0) {
		throw new Error("trajectory:tool-args-match assertion must include an args or arguments property");
	}
	const mode = resolveToolArgsMatchMode(value.mode);
	return { matcher, expectedArgs, mode };
}
|
|
3050
|
+
/**
 * Handler for the trajectory:tool-sequence assertion.
 *
 * In "exact" mode the trace's tool steps must match the expected steps one
 * for one. In every other mode the expected steps must occur in order, with
 * other tool steps allowed in between. Inverse flips the outcome.
 *
 * @param params - Assertion params (trace context, value, `inverse`)
 * @returns Grading result with `pass`, `score` (1 or 0), `reason`, `assertion`
 * @throws Error when trace data is missing, the value is unsupported, a step
 *   has neither name nor pattern, or no steps are expected
 */
const handleTrajectoryToolSequence = (params) => {
	const toolSteps = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const sequence = resolveSequenceValue(params.renderedValue ?? params.assertion.value);
	const expectedMatchers = sequence.steps.map((step, index) => {
		const matcher = normalizeTrajectoryMatcher(step, "tool");
		requireNamedTrajectoryMatcher(matcher, "trajectory:tool-sequence", index);
		return matcher;
	});
	if (expectedMatchers.length === 0) throw new Error("trajectory:tool-sequence assertion requires at least one expected step");

	const labelOf = (matcher) => matcher?.pattern || matcher?.name || "*";
	const actualTools = formatStepList(toolSteps.map((step) => formatTrajectoryStep(step)));

	const evaluateExact = () => {
		const sameLength = toolSteps.length === expectedMatchers.length;
		const matchedAll = sameLength && expectedMatchers.every((matcher, index) => matchesTrajectoryStep(toolSteps[index], matcher));
		return {
			basePass: matchedAll,
			reason: matchedAll
				? `Observed exact tool sequence: ${actualTools}`
				: `Expected exact tool sequence of ${expectedMatchers.map((matcher) => labelOf(matcher)).join(", ")}, but actual tools were ${actualTools}`
		};
	};

	const evaluateInOrder = () => {
		const matchedSteps = [];
		let nextExpected = 0;
		for (const step of toolSteps) {
			if (nextExpected >= expectedMatchers.length) break;
			if (!matchesTrajectoryStep(step, expectedMatchers[nextExpected])) continue;
			matchedSteps.push(formatTrajectoryStep(step));
			nextExpected += 1;
		}
		const foundAll = nextExpected === expectedMatchers.length;
		return {
			basePass: foundAll,
			reason: foundAll
				? `Observed tool sequence in order: ${matchedSteps.join(", ")}. Actual tools: ${actualTools}`
				: `Expected tool "${labelOf(expectedMatchers[nextExpected])}" was not observed in order. Actual tools: ${actualTools}`
		};
	};

	const outcome = sequence.mode === "exact" ? evaluateExact() : evaluateInOrder();
	const { basePass } = outcome;
	const pass = applyInverse(basePass, params.inverse);
	let { reason } = outcome;
	if (params.inverse) {
		reason = basePass ? `Forbidden tool sequence was observed. Actual tools: ${actualTools}` : `Forbidden tool sequence was not observed`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
|
|
3089
|
+
/**
 * Handler for the trajectory:tool-args-match assertion.
 *
 * Passes (before inversion) when at least one tool step that matches the
 * configured matcher, and that has captured arguments, satisfies
 * matchesToolArgs for the expected arguments in the configured mode.
 *
 * @param {object} params - Assertion parameters (trace, assertion, renderedValue, inverse).
 * @returns {{ pass: boolean, score: number, reason: string, assertion: object }}
 */
const handleTrajectoryToolArgsMatch = (params) => {
	const toolCalls = extractTrajectorySteps(getTraceOrThrow(params)).filter((step) => step.type === "tool");
	const { matcher, expectedArgs, mode } = resolveToolArgsMatchValue(params.renderedValue ?? params.assertion.value);
	const toolLabel = matcher.pattern || matcher.name || "*";
	const observedTools = toolCalls.map(formatTrajectoryStep);
	const candidates = toolCalls.filter((step) => matchesTrajectoryStep(step, matcher));
	const candidatesWithArgs = candidates.filter((step) => step.args !== void 0);
	const hit = candidatesWithArgs.find((step) => matchesToolArgs(step.args, expectedArgs, mode));
	const basePass = hit !== void 0;
	const pass = applyInverse(basePass, params.inverse);
	const expectedArgsLabel = formatTrajectoryArgs(expectedArgs);
	const observedArgsLabel = candidatesWithArgs.length > 0 ? candidatesWithArgs.map((step) => formatTrajectoryArgs(step.args)).join(", ") : "(none)";
	// Pick the message that explains the outcome, most specific condition first.
	const explain = () => {
		if (params.inverse) {
			if (basePass) return `Forbidden argument match for tool "${toolLabel}" was observed on ${formatTrajectoryStep(hit)}. Args: ${formatTrajectoryArgs(hit.args)}`;
			if (candidates.length === 0) return `Forbidden argument match for tool "${toolLabel}" was not observed because no tool call matched it`;
			return `Forbidden argument match for tool "${toolLabel}" was not observed. Observed args: ${observedArgsLabel}`;
		}
		if (basePass) return `Tool "${toolLabel}" matched expected arguments (${mode}) on ${formatTrajectoryStep(hit)}. Args: ${formatTrajectoryArgs(hit.args)}`;
		if (candidates.length === 0) return `No tool call matched "${toolLabel}". Actual tools: ${formatStepList(observedTools)}`;
		if (candidatesWithArgs.length === 0) return `Tool "${toolLabel}" was observed but no arguments were captured. Actual tools: ${formatStepList(observedTools)}`;
		return `No call to tool "${toolLabel}" matched expected arguments (${mode}): ${expectedArgsLabel}. Observed args: ${observedArgsLabel}`;
	};
	const reason = explain();
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
|
|
3116
|
+
/**
 * Validate and unpack the value of a trajectory:step-count assertion.
 *
 * `min` and `max` are accepted as numbers or as numeric strings (for example
 * "3"), in line with how the word-count assertion accepts numeric strings.
 * NaN and non-numeric values are treated as "not provided" so that a bound
 * can never silently turn into a comparison that always fails.
 *
 * @param {unknown} value - Assertion value; must be a non-array object.
 * @returns {object} The normalized trajectory matcher plus `min` and `max`
 *   (each a number, or undefined when not provided or not numeric).
 * @throws {Error} When `value` is not a non-array object.
 */
function resolveStepCountValue(value) {
	if (!value || typeof value !== "object" || Array.isArray(value)) throw new Error("trajectory:step-count assertion must have an object value");
	const toBound = (raw) => {
		if (typeof raw === "number") return Number.isNaN(raw) ? void 0 : raw;
		// Blank strings are rejected explicitly because Number("") is 0.
		if (typeof raw === "string" && raw.trim() !== "") {
			const parsed = Number(raw);
			return Number.isNaN(parsed) ? void 0 : parsed;
		}
		return void 0;
	};
	return {
		...normalizeTrajectoryMatcher(value),
		max: toBound(value.max),
		min: toBound(value.min)
	};
}
|
|
3124
|
+
/**
 * Handler for the trajectory:step-count assertion.
 *
 * Counts the trajectory steps that match the configured matcher and checks
 * the count against the optional `min` and `max` bounds (both inclusive).
 *
 * @param {object} params - Assertion parameters (trace, assertion, renderedValue, inverse).
 * @returns {{ pass: boolean, score: number, reason: string, assertion: object }}
 * @throws {Error} When neither `min` nor `max` is provided.
 */
const handleTrajectoryStepCount = (params) => {
	const allSteps = extractTrajectorySteps(getTraceOrThrow(params));
	const criteria = resolveStepCountValue(params.renderedValue ?? params.assertion.value);
	const { min, max } = criteria;
	const hasMin = min !== void 0;
	const hasMax = max !== void 0;
	if (!hasMin && !hasMax) throw new Error("trajectory:step-count assertion must include a min or max property");
	const hits = allSteps.filter((step) => matchesTrajectoryStep(step, criteria));
	const count = hits.length;
	const meetsMin = !hasMin || count >= min;
	const meetsMax = !hasMax || count <= max;
	const basePass = meetsMin && meetsMax;
	const pass = applyInverse(basePass, params.inverse);
	// Describe the active filters so the reason shows what was counted.
	const filters = [];
	if (criteria.type) {
		const typeList = Array.isArray(criteria.type) ? criteria.type : [criteria.type];
		filters.push(`type=${typeList.join("|")}`);
	}
	const namePattern = criteria.pattern || criteria.name;
	if (namePattern) filters.push(`pattern=${namePattern}`);
	const segments = [`Matched ${count} trajectory step(s)`];
	if (filters.length > 0) segments.push(` for ${filters.join(", ")}`);
	if (hasMin && hasMax) segments.push(` (expected ${min}-${max})`);
	else if (hasMin) segments.push(` (expected at least ${min})`);
	else if (hasMax) segments.push(` (expected at most ${max})`);
	if (hits.length > 0) segments.push(`. Matches: ${hits.map(formatTrajectoryStep).join(", ")}`);
	let reason = segments.join("");
	// Inverted assertions replace the detailed reason with a forbidden-range message.
	if (params.inverse) {
		reason = basePass ? `Trajectory step count satisfied the forbidden range` : `Trajectory step count did not satisfy the forbidden range`;
	}
	return {
		pass,
		score: pass ? 1 : 0,
		reason,
		assertion: params.assertion
	};
};
|
|
3154
|
+
/**
 * Handler for the trajectory:goal-success assertion.
 *
 * Delegates grading to require_graders.matchesTrajectoryGoalSuccess, passing
 * the goal and a summary of the trace. For an inverted assertion the grader's
 * result is flipped and given a reason phrased around a forbidden goal.
 *
 * @param {object} params - Assertion parameters (trace, assertion, renderedValue, inverse, ...).
 * @returns {Promise<object>} The grader result, or its inverted form.
 */
const handleTrajectoryGoalSuccess = async (params) => {
	const trace = getTraceOrThrow(params);
	const { goal } = resolveGoalSuccessValue(params.renderedValue ?? params.assertion.value);
	const verdict = await require_graders.matchesTrajectoryGoalSuccess(
		goal,
		summarizeTrajectoryForJudge(trace),
		params.outputString,
		params.test.options,
		params.assertionValueContext.vars,
		params.assertion,
		params.providerCallContext
	);
	if (params.inverse) {
		const achieved = verdict.pass;
		return {
			...verdict,
			assertion: params.assertion,
			pass: !achieved,
			score: achieved ? 0 : 1,
			reason: achieved ? `Agent unexpectedly achieved the goal: ${goal}` : `Agent did not achieve the forbidden goal: ${goal}`
		};
	}
	return verdict;
};
|
|
3167
|
+
//#endregion
|
|
2539
3168
|
//#region src/assertions/webhook.ts
|
|
2540
3169
|
async function handleWebhook({ assertion, renderedValue, test, prompt, output, inverse }) {
|
|
2541
3170
|
require_invariant.invariant(renderedValue, "\"webhook\" assertion type must have a URL value");
|
|
@@ -2604,18 +3233,18 @@ const handleWordCount = ({ assertion, renderedValue, valueFromScript, outputStri
|
|
|
2604
3233
|
if (pass) reason = "Assertion passed";
|
|
2605
3234
|
else if (inverse) reason = `Expected word count to not be between ${min} and ${max}, but got ${wordCount}`;
|
|
2606
3235
|
else reason = `Word count ${wordCount} is not between ${min} and ${max}`;
|
|
2607
|
-
} else if (min
|
|
2608
|
-
const basePass = wordCount >= min;
|
|
2609
|
-
pass = inverse ? !basePass : basePass;
|
|
2610
|
-
if (pass) reason = "Assertion passed";
|
|
2611
|
-
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
2612
|
-
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
2613
|
-
} else {
|
|
3236
|
+
} else if (min === void 0) {
|
|
2614
3237
|
const basePass = wordCount <= max;
|
|
2615
3238
|
pass = inverse ? !basePass : basePass;
|
|
2616
3239
|
if (pass) reason = "Assertion passed";
|
|
2617
3240
|
else if (inverse) reason = `Expected word count to be greater than ${max}, but got ${wordCount}`;
|
|
2618
3241
|
else reason = `Word count ${wordCount} is greater than maximum ${max}`;
|
|
3242
|
+
} else {
|
|
3243
|
+
const basePass = wordCount >= min;
|
|
3244
|
+
pass = inverse ? !basePass : basePass;
|
|
3245
|
+
if (pass) reason = "Assertion passed";
|
|
3246
|
+
else if (inverse) reason = `Expected word count to be less than ${min}, but got ${wordCount}`;
|
|
3247
|
+
else reason = `Word count ${wordCount} is less than minimum ${min}`;
|
|
2619
3248
|
}
|
|
2620
3249
|
} else {
|
|
2621
3250
|
require_invariant.invariant(typeof value === "number" || typeof value === "string" && !Number.isNaN(Number(value)), "\"word-count\" assertion value must be a number or an object with min/max properties");
|
|
@@ -2710,6 +3339,12 @@ const handleIsXml = ({ assertion, renderedValue, outputString, inverse, baseType
|
|
|
2710
3339
|
//#endregion
|
|
2711
3340
|
//#region src/assertions/index.ts
|
|
2712
3341
|
const ASSERTIONS_MAX_CONCURRENCY = require_logger.getEnvInt("PROMPTFOO_ASSERTIONS_MAX_CONCURRENCY", 3);
|
|
3342
|
+
// Defaults used by loadTraceData when the matching PROMPTFOO_TRACE_FETCH_* environment variable is not set.
const DEFAULT_TRACE_FETCH_MAX_ATTEMPTS = 6;
const DEFAULT_TRACE_FETCH_RETRY_DELAY_MS = 250;
const DEFAULT_TRACE_FETCH_STABLE_POLLS = 2;
// Upper limits that loadTraceData clamps the configured values to.
const MAX_TRACE_FETCH_MAX_ATTEMPTS = 30;
const MAX_TRACE_FETCH_RETRY_DELAY_MS = 5e3;
const MAX_TRACE_FETCH_STABLE_POLLS = 10;
|
|
2713
3348
|
const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
2714
3349
|
"answer-relevance",
|
|
2715
3350
|
"context-faithfulness",
|
|
@@ -2719,8 +3354,57 @@ const MODEL_GRADED_ASSERTION_TYPES = new Set([
|
|
|
2719
3354
|
"llm-rubric",
|
|
2720
3355
|
"model-graded-closedqa",
|
|
2721
3356
|
"model-graded-factuality",
|
|
2722
|
-
"search-rubric"
|
|
3357
|
+
"search-rubric",
|
|
3358
|
+
"trajectory:goal-success"
|
|
3359
|
+
]);
|
|
3360
|
+
// Base assertion types (without any inverse prefix) that assertionUsesTrace treats as reading trace data.
const TRACE_AWARE_ASSERTION_TYPES = new Set([
	// Script-based assertions
	"javascript",
	"python",
	"ruby",
	// Span-level trace assertions
	"trace-error-spans",
	"trace-span-count",
	"trace-span-duration",
	// Trajectory assertions
	"trajectory:goal-success",
	"trajectory:step-count",
	"trajectory:tool-args-match",
	"trajectory:tool-sequence",
	"trajectory:tool-used"
]);
|
|
3373
|
+
/**
 * Report whether an assertion's base type is listed in TRACE_AWARE_ASSERTION_TYPES.
 * For an "assert-set" the check recurses into its nested assertions.
 *
 * @param {object} assertion - Assertion with a `type` (and `assert` for assert-sets).
 * @returns {boolean} True when the assertion, or any nested one, is trace-aware.
 */
function assertionUsesTrace(assertion) {
	const isAssertSet = assertion.type === "assert-set";
	return isAssertSet
		? assertion.assert.some(assertionUsesTrace)
		: TRACE_AWARE_ASSERTION_TYPES.has(getAssertionBaseType(assertion));
}
|
|
3377
|
+
/**
 * Report whether an assertion might need trace context.
 *
 * True when the assertion is trace-aware (see assertionUsesTrace), when any
 * nested assertion of an "assert-set" might need it, or when its string value
 * starts with "file://" or is recognized by require_providers.isPackagePath.
 *
 * @param {object} assertion - Assertion with `type`, optional `value` and `assert`.
 * @returns {boolean} Whether trace context may be needed.
 */
function assertionMayNeedTraceContext(assertion) {
	if (assertionUsesTrace(assertion)) return true;
	if (assertion.type === "assert-set") return assertion.assert.some(assertionMayNeedTraceContext);
	const candidate = assertion.value;
	if (typeof candidate !== "string") return false;
	return candidate.startsWith("file://") || require_providers.isPackagePath(candidate);
}
|
|
3382
|
+
/**
 * Report whether any assertion in the list may need trace context.
 *
 * @param {object[] | null | undefined} assertions - Assertions to inspect; may be absent.
 * @returns {boolean} False for a missing list, otherwise whether any entry qualifies.
 */
function hasTraceAwareAssertions(assertions) {
	if (assertions === null || assertions === void 0) return false;
	return Boolean(assertions.some(assertionMayNeedTraceContext));
}
|
|
3385
|
+
/**
 * Fetch a trace from the trace store, polling until its span count looks stable.
 *
 * The trace is re-fetched up to `maxAttempts` times with `retryDelayMs` between
 * fetches. It is returned as soon as a non-zero span count has been observed
 * unchanged on `stablePolls` consecutive fetches, or on the final attempt.
 * All three settings come from PROMPTFOO_TRACE_FETCH_* environment variables
 * and are clamped to fixed lower and upper limits.
 *
 * @param {string} traceId - Identifier passed to the trace store.
 * @returns {Promise<object | null>} The most recently fetched trace (possibly without spans).
 */
async function loadTraceData(traceId) {
	const clampedEnvInt = (name, fallback, floor, ceiling) => Math.min(ceiling, Math.max(floor, require_logger.getEnvInt(name, fallback)));
	const store = require_store.getTraceStore();
	const attemptLimit = clampedEnvInt("PROMPTFOO_TRACE_FETCH_MAX_ATTEMPTS", DEFAULT_TRACE_FETCH_MAX_ATTEMPTS, 1, MAX_TRACE_FETCH_MAX_ATTEMPTS);
	const delayMs = clampedEnvInt("PROMPTFOO_TRACE_FETCH_RETRY_DELAY_MS", DEFAULT_TRACE_FETCH_RETRY_DELAY_MS, 0, MAX_TRACE_FETCH_RETRY_DELAY_MS);
	const requiredStreak = clampedEnvInt("PROMPTFOO_TRACE_FETCH_STABLE_POLLS", DEFAULT_TRACE_FETCH_STABLE_POLLS, 1, MAX_TRACE_FETCH_STABLE_POLLS);
	let previousCount = -1;
	let streak = 0;
	let trace = null;
	for (let attempt = 1; attempt <= attemptLimit; attempt++) {
		trace = await store.getTrace(traceId);
		const count = trace?.spans?.length ?? 0;
		const isFinalAttempt = attempt === attemptLimit;
		// An empty result resets the streak; a changed count starts a new streak at 1.
		if (count === 0) streak = 0;
		else streak = count === previousCount ? streak + 1 : 1;
		previousCount = count;
		if (count > 0 && (streak >= requiredStreak || isFinalAttempt)) return trace;
		if (!isFinalAttempt) await require_fetch.sleep(delayMs);
	}
	return trace;
}
|
|
2724
3408
|
const ASSERTION_HANDLERS = {
|
|
2725
3409
|
"answer-relevance": handleAnswerRelevance,
|
|
2726
3410
|
bleu: handleBleuScore,
|
|
@@ -2783,12 +3467,18 @@ const ASSERTION_HANDLERS = {
|
|
|
2783
3467
|
ruby: handleRuby,
|
|
2784
3468
|
"rouge-n": handleRougeScore,
|
|
2785
3469
|
"search-rubric": handleSearchRubric,
|
|
3470
|
+
"skill-used": handleSkillUsed,
|
|
2786
3471
|
similar: handleSimilar,
|
|
2787
3472
|
"similar:cosine": handleSimilar,
|
|
2788
3473
|
"similar:dot": handleSimilar,
|
|
2789
3474
|
"similar:euclidean": handleSimilar,
|
|
2790
3475
|
"starts-with": handleStartsWith,
|
|
2791
3476
|
"tool-call-f1": handleToolCallF1,
|
|
3477
|
+
"trajectory:goal-success": handleTrajectoryGoalSuccess,
|
|
3478
|
+
"trajectory:tool-args-match": handleTrajectoryToolArgsMatch,
|
|
3479
|
+
"trajectory:step-count": handleTrajectoryStepCount,
|
|
3480
|
+
"trajectory:tool-sequence": handleTrajectoryToolSequence,
|
|
3481
|
+
"trajectory:tool-used": handleTrajectoryToolUsed,
|
|
2792
3482
|
"trace-error-spans": handleTraceErrorSpans,
|
|
2793
3483
|
"trace-span-count": handleTraceSpanCount,
|
|
2794
3484
|
"trace-span-duration": handleTraceSpanDuration,
|
|
@@ -2831,7 +3521,7 @@ function isAssertionInverse(assertion) {
|
|
|
2831
3521
|
/**
 * Return the assertion's type without its inverse prefix.
 *
 * @param {object} assertion - Assertion with a string `type`.
 * @returns {string} `type` unchanged for a non-inverse assertion; otherwise
 *   `type` with its first four characters removed (presumably the "not-" prefix
 *   detected by isAssertionInverse; confirm against that helper).
 */
function getAssertionBaseType(assertion) {
	if (!isAssertionInverse(assertion)) return assertion.type;
	return assertion.type.slice(4);
}
|
|
2834
|
-
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId }) {
|
|
3524
|
+
async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs, providerResponse, traceId, traceData }) {
|
|
2835
3525
|
const resolvedVars = vars || test.vars || {};
|
|
2836
3526
|
const { cost, logProbs, output: originalOutput } = providerResponse;
|
|
2837
3527
|
let output = originalOutput;
|
|
@@ -2850,14 +3540,14 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2850
3540
|
providerResponse,
|
|
2851
3541
|
...assertion.config ? { config: structuredClone(assertion.config) } : {}
|
|
2852
3542
|
};
|
|
2853
|
-
if (traceId) try {
|
|
2854
|
-
const
|
|
2855
|
-
if (
|
|
2856
|
-
traceId:
|
|
2857
|
-
evaluationId:
|
|
2858
|
-
testCaseId:
|
|
2859
|
-
metadata:
|
|
2860
|
-
spans:
|
|
3543
|
+
if (traceId && assertionMayNeedTraceContext(assertion)) try {
|
|
3544
|
+
const resolvedTraceData = traceData === void 0 ? await loadTraceData(traceId) : traceData;
|
|
3545
|
+
if (resolvedTraceData) context.trace = {
|
|
3546
|
+
traceId: resolvedTraceData.traceId,
|
|
3547
|
+
evaluationId: resolvedTraceData.evaluationId,
|
|
3548
|
+
testCaseId: resolvedTraceData.testCaseId,
|
|
3549
|
+
metadata: resolvedTraceData.metadata,
|
|
3550
|
+
spans: resolvedTraceData.spans || []
|
|
2861
3551
|
};
|
|
2862
3552
|
} catch (error) {
|
|
2863
3553
|
require_logger.logger.debug(`Failed to fetch trace data for assertion: ${error}`);
|
|
@@ -2890,7 +3580,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
2890
3580
|
};
|
|
2891
3581
|
}
|
|
2892
3582
|
else if (filePath.endsWith(".rb")) try {
|
|
2893
|
-
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-
|
|
3583
|
+
const { runRuby } = await Promise.resolve().then(() => require("./rubyUtils-B1HXG4ej.cjs"));
|
|
2894
3584
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
2895
3585
|
require_logger.logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
2896
3586
|
} catch (error) {
|
|
@@ -2999,6 +3689,14 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
2999
3689
|
index: i
|
|
3000
3690
|
};
|
|
3001
3691
|
}).flat();
|
|
3692
|
+
const shouldPreloadTrace = !!traceId && hasTraceAwareAssertions(asserts.map(({ assertion }) => assertion));
|
|
3693
|
+
let preloadedTraceData;
|
|
3694
|
+
if (shouldPreloadTrace && traceId) try {
|
|
3695
|
+
preloadedTraceData = await loadTraceData(traceId);
|
|
3696
|
+
} catch (error) {
|
|
3697
|
+
require_logger.logger.debug(`Failed to preload trace data for assertions: ${error}`);
|
|
3698
|
+
preloadedTraceData = null;
|
|
3699
|
+
}
|
|
3002
3700
|
await async.default.forEachOfLimit(asserts, ASSERTIONS_MAX_CONCURRENCY, async ({ assertion, assertResult, index }) => {
|
|
3003
3701
|
if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
|
|
3004
3702
|
const result = await runAssertion({
|
|
@@ -3010,7 +3708,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
3010
3708
|
vars,
|
|
3011
3709
|
latencyMs,
|
|
3012
3710
|
assertIndex: index,
|
|
3013
|
-
traceId
|
|
3711
|
+
traceId,
|
|
3712
|
+
traceData: preloadedTraceData
|
|
3014
3713
|
});
|
|
3015
3714
|
assertResult.addResult({
|
|
3016
3715
|
index,
|
|
@@ -3156,7 +3855,7 @@ var CIProgressReporter = class {
|
|
|
3156
3855
|
else {
|
|
3157
3856
|
const eta = remaining / rate;
|
|
3158
3857
|
if (eta > 1440) etaDisplay = ">24 hours";
|
|
3159
|
-
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta)
|
|
3858
|
+
else etaDisplay = `${Math.round(eta)} minute${Math.round(eta) === 1 ? "" : "s"}`;
|
|
3160
3859
|
}
|
|
3161
3860
|
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
3162
3861
|
require_logger.logger.info(`[CI Progress] Evaluation running for ${this.formatElapsedTime(elapsed)} - Completed ${this.completedTests}/${this.totalTests} tests (${percentage}%)`);
|
|
@@ -3557,12 +4256,55 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
3557
4256
|
var ProgressBarManager = class {
|
|
3558
4257
|
progressBar;
|
|
3559
4258
|
isWebUI;
|
|
4259
|
+
originalLogCallback = null;
|
|
4260
|
+
installedLogCallback = null;
|
|
4261
|
+
pendingRender = null;
|
|
3560
4262
|
totalCount = 0;
|
|
3561
4263
|
completedCount = 0;
|
|
3562
4264
|
concurrency = 1;
|
|
3563
4265
|
constructor(isWebUI) {
|
|
3564
4266
|
this.isWebUI = isWebUI;
|
|
3565
4267
|
}
|
|
4268
|
+
clearProgressBarLine() {
|
|
4269
|
+
readline.default.cursorTo(process.stderr, 0);
|
|
4270
|
+
readline.default.clearLine(process.stderr, 0);
|
|
4271
|
+
}
|
|
4272
|
+
scheduleRender() {
|
|
4273
|
+
if (!this.progressBar || this.pendingRender) return;
|
|
4274
|
+
this.pendingRender = setImmediate(() => {
|
|
4275
|
+
this.pendingRender = null;
|
|
4276
|
+
this.progressBar?.render();
|
|
4277
|
+
});
|
|
4278
|
+
}
|
|
4279
|
+
handleLogMessage() {
|
|
4280
|
+
if (!this.progressBar) return;
|
|
4281
|
+
this.clearProgressBarLine();
|
|
4282
|
+
this.scheduleRender();
|
|
4283
|
+
}
|
|
4284
|
+
/**
|
|
4285
|
+
* Coordinate console logging with the progress bar to prevent visual corruption.
|
|
4286
|
+
*/
|
|
4287
|
+
installLogInterceptor() {
|
|
4288
|
+
if (!this.progressBar || this.isWebUI || this.installedLogCallback) return;
|
|
4289
|
+
this.originalLogCallback = require_logger.globalLogCallback;
|
|
4290
|
+
this.installedLogCallback = (message) => {
|
|
4291
|
+
this.originalLogCallback?.(message);
|
|
4292
|
+
this.handleLogMessage();
|
|
4293
|
+
};
|
|
4294
|
+
require_logger.setLogCallback(this.installedLogCallback);
|
|
4295
|
+
}
|
|
4296
|
+
/**
|
|
4297
|
+
* Remove the log interceptor and restore original logger callback behavior.
|
|
4298
|
+
*/
|
|
4299
|
+
removeLogInterceptor() {
|
|
4300
|
+
if (this.pendingRender) {
|
|
4301
|
+
clearImmediate(this.pendingRender);
|
|
4302
|
+
this.pendingRender = null;
|
|
4303
|
+
}
|
|
4304
|
+
if (this.installedLogCallback && require_logger.globalLogCallback === this.installedLogCallback) require_logger.setLogCallback(this.originalLogCallback);
|
|
4305
|
+
this.installedLogCallback = null;
|
|
4306
|
+
this.originalLogCallback = null;
|
|
4307
|
+
}
|
|
3566
4308
|
/**
|
|
3567
4309
|
* Initialize progress bar
|
|
3568
4310
|
*/
|
|
@@ -3582,7 +4324,8 @@ var ProgressBarManager = class {
|
|
|
3582
4324
|
return `Evaluating [${bar}${spaces}] ${percentage}% | ${params.value}/${params.total}${errorsText} | ${payload.provider} ${payload.prompt} ${payload.vars}`;
|
|
3583
4325
|
},
|
|
3584
4326
|
hideCursor: true,
|
|
3585
|
-
gracefulExit: true
|
|
4327
|
+
gracefulExit: true,
|
|
4328
|
+
stream: process.stderr
|
|
3586
4329
|
}, cli_progress.default.Presets.shades_classic);
|
|
3587
4330
|
this.progressBar.start(this.totalCount, 0, {
|
|
3588
4331
|
provider: "",
|
|
@@ -3857,6 +4600,7 @@ async function runEval({ provider, prompt, test, testSuite, delay, nunjucksFilte
|
|
|
3857
4600
|
const parts = traceContext.traceparent.split("-");
|
|
3858
4601
|
if (parts.length >= 3) traceId = parts[1];
|
|
3859
4602
|
}
|
|
4603
|
+
if (traceId && hasTraceAwareAssertions(test.assert)) await flushOtel();
|
|
3860
4604
|
const checkResult = await runAssertions({
|
|
3861
4605
|
prompt: renderedPrompt,
|
|
3862
4606
|
provider,
|
|
@@ -4254,7 +4998,7 @@ var Evaluator = class {
|
|
|
4254
4998
|
const defaultProvider = testSuite.defaultTest.provider;
|
|
4255
4999
|
if (require_types.isApiProvider(defaultProvider)) testCase.provider = defaultProvider;
|
|
4256
5000
|
else if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
4257
|
-
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-
|
|
5001
|
+
const { loadApiProvider } = await Promise.resolve().then(() => require("./providers-D-FnDg8k.cjs"));
|
|
4258
5002
|
testCase.provider = await loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
4259
5003
|
} else testCase.provider = defaultProvider;
|
|
4260
5004
|
}
|
|
@@ -4338,7 +5082,7 @@ var Evaluator = class {
|
|
|
4338
5082
|
if (evalOption.test.assert?.some((a) => a.type === "max-score")) rowsWithMaxScoreAssertion.add(evalOption.testIdx);
|
|
4339
5083
|
}
|
|
4340
5084
|
if (require_logger.state.resume && this.evalRecord.persisted) try {
|
|
4341
|
-
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-
|
|
5085
|
+
const { default: EvalResult } = await Promise.resolve().then(() => require("./evalResult-tGdilrWt.cjs"));
|
|
4342
5086
|
const completedPairs = await EvalResult.getCompletedIndexPairs(this.evalRecord.id, { excludeErrors: require_logger.state.retryMode });
|
|
4343
5087
|
const originalCount = runEvalOptions.length;
|
|
4344
5088
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -4538,7 +5282,7 @@ var Evaluator = class {
|
|
|
4538
5282
|
if (require_logger.isCI() && !isWebUI) {
|
|
4539
5283
|
ciProgressReporter = new CIProgressReporter(runEvalOptions.length);
|
|
4540
5284
|
ciProgressReporter.start();
|
|
4541
|
-
} else if (this.options.showProgressBar && process.
|
|
5285
|
+
} else if (this.options.showProgressBar && process.stderr.isTTY) progressBarManager = new ProgressBarManager(isWebUI);
|
|
4542
5286
|
this.options.progressCallback = (completed, total, index, evalStep, metrics) => {
|
|
4543
5287
|
if (originalProgressCallback) originalProgressCallback(completed, total, index, evalStep, metrics);
|
|
4544
5288
|
if (isWebUI) {
|
|
@@ -4559,7 +5303,10 @@ var Evaluator = class {
|
|
|
4559
5303
|
if (serialRunEvalOptions.length > 0) require_logger.logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
4560
5304
|
if (concurrentRunEvalOptions.length > 0) require_logger.logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
4561
5305
|
}
|
|
4562
|
-
if (this.options.showProgressBar && progressBarManager)
|
|
5306
|
+
if (this.options.showProgressBar && progressBarManager) {
|
|
5307
|
+
await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
5308
|
+
progressBarManager.installLogInterceptor();
|
|
5309
|
+
}
|
|
4563
5310
|
try {
|
|
4564
5311
|
if (serialRunEvalOptions.length > 0) for (const evalStep of serialRunEvalOptions) {
|
|
4565
5312
|
checkAbort();
|
|
@@ -4585,7 +5332,10 @@ var Evaluator = class {
|
|
|
4585
5332
|
else if (!targetUnavailable) {
|
|
4586
5333
|
require_logger.logger.info("Evaluation interrupted, saving progress...");
|
|
4587
5334
|
if (globalTimeout) clearTimeout(globalTimeout);
|
|
4588
|
-
if (progressBarManager)
|
|
5335
|
+
if (progressBarManager) {
|
|
5336
|
+
progressBarManager.removeLogInterceptor();
|
|
5337
|
+
progressBarManager.stop();
|
|
5338
|
+
}
|
|
4589
5339
|
if (ciProgressReporter) ciProgressReporter.finish();
|
|
4590
5340
|
this.evalRecord.setVars(Array.from(vars));
|
|
4591
5341
|
await this.evalRecord.addPrompts(prompts);
|
|
@@ -4593,6 +5343,10 @@ var Evaluator = class {
|
|
|
4593
5343
|
return this.evalRecord;
|
|
4594
5344
|
}
|
|
4595
5345
|
} else {
|
|
5346
|
+
if (progressBarManager) {
|
|
5347
|
+
progressBarManager.removeLogInterceptor();
|
|
5348
|
+
progressBarManager.stop();
|
|
5349
|
+
}
|
|
4596
5350
|
if (ciProgressReporter) ciProgressReporter.error(`Evaluation failed: ${String(err)}`);
|
|
4597
5351
|
throw err;
|
|
4598
5352
|
}
|
|
@@ -4735,6 +5489,7 @@ var Evaluator = class {
|
|
|
4735
5489
|
await this.evalRecord.addPrompts(prompts);
|
|
4736
5490
|
try {
|
|
4737
5491
|
if (progressBarManager) {
|
|
5492
|
+
progressBarManager.removeLogInterceptor();
|
|
4738
5493
|
progressBarManager.complete();
|
|
4739
5494
|
progressBarManager.stop();
|
|
4740
5495
|
} else if (ciProgressReporter) ciProgressReporter.finish();
|
|
@@ -7088,8 +7843,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7088
7843
|
require_logger.logger.warn("The \"__metadata\" column requires a key, e.g. \"__metadata:category\". This column will be ignored.");
|
|
7089
7844
|
} else if (key.startsWith("__config:")) {
|
|
7090
7845
|
const configParts = key.slice(9).split(":");
|
|
7091
|
-
if (configParts.length
|
|
7092
|
-
else {
|
|
7846
|
+
if (configParts.length === 2) {
|
|
7093
7847
|
const [expectedKey, configKey] = configParts;
|
|
7094
7848
|
let targetIndex;
|
|
7095
7849
|
if (expectedKey === "__expected") targetIndex = 0;
|
|
@@ -7115,7 +7869,7 @@ function testCaseFromCsvRow(row) {
|
|
|
7115
7869
|
}
|
|
7116
7870
|
}
|
|
7117
7871
|
assertionConfigs[targetIndex][configKey] = parsedValue;
|
|
7118
|
-
}
|
|
7872
|
+
} else require_logger.logger.warn(`Invalid __config column format: "${key}". Expected format: __config:__expected:threshold or __config:__expected<N>:threshold`);
|
|
7119
7873
|
} else vars[key] = value;
|
|
7120
7874
|
}
|
|
7121
7875
|
for (let i = 0; i < asserts.length; i++) {
|
|
@@ -7244,14 +7998,14 @@ async function parseXlsxFile(filePath) {
|
|
|
7244
7998
|
const sheetName = typeof sheetOption === "number" ? sheetNames[sheetOption - 1] : sheetOption;
|
|
7245
7999
|
const rows = await readXlsxFile(actualFilePath, { sheet: sheetOption });
|
|
7246
8000
|
if (rows.length === 0) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
|
|
7247
|
-
const headers = rows[0].map((cell) => cell
|
|
8001
|
+
const headers = rows[0].map((cell) => cell == null ? "" : String(cell));
|
|
7248
8002
|
if (headers.length === 0 || headers.every((h) => h === "")) throw new Error(`Sheet "${sheetName}" has no valid column headers`);
|
|
7249
8003
|
if (rows.length === 1) throw new Error(`Sheet "${sheetName}" is empty or contains no valid data rows`);
|
|
7250
8004
|
const data = rows.slice(1).map((row) => {
|
|
7251
8005
|
const obj = {};
|
|
7252
8006
|
headers.forEach((header, index) => {
|
|
7253
8007
|
const cellValue = row[index];
|
|
7254
|
-
obj[header] = cellValue
|
|
8008
|
+
obj[header] = cellValue == null ? "" : String(cellValue);
|
|
7255
8009
|
});
|
|
7256
8010
|
return obj;
|
|
7257
8011
|
});
|
|
@@ -11198,20 +11952,19 @@ function generateEvalSummary(params) {
|
|
|
11198
11952
|
}
|
|
11199
11953
|
}
|
|
11200
11954
|
lines.push("");
|
|
11201
|
-
const
|
|
11202
|
-
|
|
11203
|
-
|
|
11204
|
-
|
|
11205
|
-
|
|
11206
|
-
|
|
11207
|
-
|
|
11208
|
-
}
|
|
11209
|
-
const passedPart = successes > 0 ? `${chalk.default.green("✓")} ${chalk.default.green.bold(successes.toLocaleString())} passed` : `${chalk.default.gray.bold(successes.toLocaleString())} passed`;
|
|
11210
|
-
const failedPart = failures > 0 ? `${chalk.default.red("✗")} ${chalk.default.red.bold(failures.toLocaleString())} failed` : `${chalk.default.gray.bold(failures.toLocaleString())} failed`;
|
|
11955
|
+
const totalTests = successes + failures + errors;
|
|
11956
|
+
const formatResultPercentage = (count) => {
|
|
11957
|
+
const percentage = totalTests === 0 ? 0 : count / totalTests * 100;
|
|
11958
|
+
return percentage === 0 || percentage === 100 ? `${percentage.toFixed(0)}%` : `${percentage.toFixed(2)}%`;
|
|
11959
|
+
};
|
|
11960
|
+
const formatResultLine = (count, label, icon, iconColor) => {
|
|
11961
|
+
return ` ${icon ? `${iconColor(icon)} ` : ""}${chalk.default.white.bold(count.toLocaleString())} ${chalk.default.white(label)} ${chalk.default.gray(`(${formatResultPercentage(count)})`)}`;
|
|
11962
|
+
};
|
|
11211
11963
|
const errorLabel = errors === 1 ? "error" : "errors";
|
|
11212
|
-
|
|
11213
|
-
|
|
11214
|
-
|
|
11964
|
+
lines.push(chalk.default.bold("Results:"));
|
|
11965
|
+
lines.push(formatResultLine(successes, "passed", successes > 0 ? "✓" : void 0, chalk.default.green));
|
|
11966
|
+
lines.push(formatResultLine(failures, "failed", failures > 0 ? "✗" : void 0, chalk.default.red));
|
|
11967
|
+
lines.push(formatResultLine(errors, errorLabel, errors > 0 ? "✗" : void 0, chalk.default.red));
|
|
11215
11968
|
const durationDisplay = formatDuration(duration);
|
|
11216
11969
|
lines.push(chalk.default.gray(`Duration: ${durationDisplay} (concurrency: ${maxConcurrency})`));
|
|
11217
11970
|
lines.push("");
|
|
@@ -11545,7 +12298,7 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
11545
12298
|
await require_providers.checkCloudPermissions(config);
|
|
11546
12299
|
const options = {
|
|
11547
12300
|
...evaluateOptions,
|
|
11548
|
-
showProgressBar: require_logger.getLogLevel() === "debug" ? false : cmdObj.progressBar
|
|
12301
|
+
showProgressBar: require_logger.getLogLevel() === "debug" ? false : cmdObj.progressBar === void 0 ? evaluateOptions.showProgressBar === void 0 ? true : evaluateOptions.showProgressBar : cmdObj.progressBar !== false,
|
|
11549
12302
|
repeat,
|
|
11550
12303
|
delay: !Number.isNaN(delay) && delay > 0 ? delay : void 0,
|
|
11551
12304
|
maxConcurrency,
|
|
@@ -11929,7 +12682,7 @@ async function doRedteamRun(options) {
|
|
|
11929
12682
|
redteamConfig = await doGenerateRedteam({
|
|
11930
12683
|
...passThroughOptions,
|
|
11931
12684
|
...options.liveRedteamConfig?.commandLineOptions || {},
|
|
11932
|
-
...maxConcurrency
|
|
12685
|
+
...maxConcurrency === void 0 ? {} : { maxConcurrency },
|
|
11933
12686
|
config: configPath,
|
|
11934
12687
|
output: redteamPath,
|
|
11935
12688
|
force: options.force,
|