promptfoo 0.121.4 → 0.121.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/{ListApp-DQkFNqE9.js → ListApp-DLmM02JS.js} +1 -1
- package/dist/src/{accounts-DhMYUUbu.js → accounts-Ca7WIoPY.js} +12 -7
- package/dist/src/{accounts-F9d_5sMC.js → accounts-CjFnOPmb.js} +14 -9
- package/dist/src/{accounts-Dy17bs4D.cjs → accounts-CmWzeD2d.cjs} +16 -10
- package/dist/src/{accounts-DdJ2pHMI.js → accounts-DanM1wq_.js} +13 -8
- package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-CJ0j3fBi.js} +3 -3
- package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-DDEGRV9v.js} +3 -3
- package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-DvPWSUpb.cjs} +8 -7
- package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-TxUEMPYS.js} +2 -2
- package/dist/src/{agents-BahDpe5G.cjs → agents-B4sRuXg3.cjs} +7 -6
- package/dist/src/{agents-pQeBEXMm.js → agents-B8q7h_ek.js} +5 -5
- package/dist/src/{agents-CgaMXvLM.js → agents-CBgJvRkB.js} +21 -10
- package/dist/src/{agents-C-R_jfzI.js → agents-CYn2n3QP.js} +4 -4
- package/dist/src/{agents-8FDnTriG.js → agents-D-vDNFx4.js} +21 -10
- package/dist/src/{agents-aYPQLf8W.js → agents-LrHuQqr1.js} +20 -9
- package/dist/src/{agents-DJ35I3Nt.js → agents-QGg76OF-.js} +5 -5
- package/dist/src/{agents-D7-HGxUj.cjs → agents-eHZ9nlgA.cjs} +21 -10
- package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-CJEbQ0o6.js} +7 -7
- package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-D5HXzZ0s.js} +6 -6
- package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-T6HGNxNe.cjs} +7 -7
- package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-eYv3a_DK.js} +7 -7
- package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/app/assets/Report-BNHJKN35.js +1 -0
- package/dist/src/app/assets/index-BnT6P6sF.js +388 -0
- package/dist/src/app/assets/index-yhM8y1PP.css +1 -0
- package/dist/src/app/assets/{scroll-timeline-D9IT_e8Z.js → scroll-timeline-RpeTwOvs.js} +1 -1
- package/dist/src/app/assets/sync-5gq6fmG4.js +4 -0
- package/dist/src/app/assets/vendor-charts-BL9OMNU7.js +36 -0
- package/dist/src/app/assets/{vendor-markdown-Ch00wnNI.js → vendor-markdown-BYsQqn7Z.js} +10 -10
- package/dist/src/app/assets/{vendor-react-CVvmk1UB.js → vendor-react-CqWgVW6T.js} +2 -2
- package/dist/src/app/assets/{vendor-utils-BnEYbx2Q.js → vendor-utils-BHPO71pu.js} +1 -1
- package/dist/src/app/index.html +31 -6
- package/dist/src/{audio-COrn8rM6.js → audio-BqnRvcWG.js} +3 -3
- package/dist/src/{audio-DcVKoInv.js → audio-CPMtV1yR.js} +4 -4
- package/dist/src/{audio-B7izf48x.js → audio-DyiebVB3.js} +4 -4
- package/dist/src/{audio-BQtNuYBj.cjs → audio-FnxbEnSE.cjs} +4 -4
- package/dist/src/authoritativeMarkupInjection-BZIywVjG.js +74 -0
- package/dist/src/authoritativeMarkupInjection-DyAXAsSr.js +75 -0
- package/dist/src/authoritativeMarkupInjection-F2gBw0lN.cjs +74 -0
- package/dist/src/authoritativeMarkupInjection-QEQmFS83.js +74 -0
- package/dist/src/{base-PYJvBE1i.js → base-CKLo890h.js} +4 -3
- package/dist/src/{base-fZ9wgg50.js → base-Co80MMCi.js} +5 -4
- package/dist/src/{base-D-670DX8.cjs → base-DGJW48uz.cjs} +5 -4
- package/dist/src/{base-yrI1Yal4.js → base-E9I8zXjz.js} +5 -4
- package/dist/src/bestOfN-B3wNzjSB.js +137 -0
- package/dist/src/bestOfN-BBsO41z4.js +136 -0
- package/dist/src/bestOfN-CAwmg5UL.cjs +140 -0
- package/dist/src/bestOfN-_kTi8Bxe.js +136 -0
- package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B0977K1O.cjs} +7 -6
- package/dist/src/{blobs-BCZavS8s.js → blobs-CeFdPn_T.js} +3 -3
- package/dist/src/{blobs-BQWqnnvL.js → blobs-DODuTK-a.js} +3 -3
- package/dist/src/{blobs-C-F78Kfn.js → blobs-Dwef1Ao1.js} +2 -2
- package/dist/src/{cache-BIyPcp5v.cjs → cache-CPGUA4Yl.cjs} +135 -25
- package/dist/src/cache-Cf7b4pWE.js +3 -0
- package/dist/src/{cache-D5NZmMiT.js → cache-DIXbtkNO.js} +125 -10
- package/dist/src/{cache-mb7c8hbp.js → cache-DpPWrkTE.js} +128 -12
- package/dist/src/{cache-C4Xb-hNb.js → cache-roFAE0cI.js} +126 -11
- package/dist/src/{chat-I9izLm49.js → chat-CUCorGiL.js} +12 -12
- package/dist/src/{chat-BPXSW8Bv.cjs → chat-DG1wG4w0.cjs} +6 -6
- package/dist/src/{chat-BfPaS15_.js → chat-Dabu84Br.js} +12 -12
- package/dist/src/{chat-Dr3DUQ0D.js → chat-DqUFcWI0.js} +12 -12
- package/dist/src/{chat-CclRbxGf.cjs → chat-DxTDQ83C.cjs} +14 -13
- package/dist/src/{chat-MKxMnZJZ.js → chat-GmlolEwo.js} +4 -4
- package/dist/src/{chat-0bwXjVP0.js → chat-TP8Qifkh.js} +6 -6
- package/dist/src/{chat-mW0ORo8G.js → chat-iwaM5UTQ.js} +6 -6
- package/dist/src/{chatkit-zUIVoDos.js → chatkit-B6DWi70Q.js} +4 -4
- package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-BYveR48_.cjs} +6 -5
- package/dist/src/{chatkit-Cv6AhukM.js → chatkit-fARZwEfV.js} +3 -3
- package/dist/src/{chatkit-CJnHRRMM.js → chatkit-lb6FK02w.js} +4 -4
- package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BQNp_y-F.js} +212 -67
- package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-D5Jl0SDh.js} +212 -67
- package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-DH416NBD.cjs} +218 -72
- package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-x1XJ1-pU.js} +212 -67
- package/dist/src/{cloud-DQZ5sVjW.js → cloud-D3DiFqH6.js} +3 -3
- package/dist/src/cloud-p96PA4MH.js +3 -0
- package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-B6NVI3ax.js} +4 -4
- package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-CEAW-xQa.cjs} +6 -6
- package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-RFSojyXG.js} +6 -6
- package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-r4tbYmWU.js} +6 -6
- package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-BCkLouto.js} +5 -5
- package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-BaZ4insB.js} +3 -3
- package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-CF-Vb-2Z.js} +5 -5
- package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-TJMLBj6I.cjs} +5 -5
- package/dist/src/codex-app-server-B8KHEiF4.js +1915 -0
- package/dist/src/codex-app-server-CnrLBCeA.cjs +1921 -0
- package/dist/src/codex-app-server-DIXZ230V.js +1915 -0
- package/dist/src/codex-app-server-Dd22dC_N.js +1916 -0
- package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-B6Wah8Pa.js} +6 -6
- package/dist/src/codex-sdk-BGjVAk23.js +3 -0
- package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-CFF6gUyi.cjs} +18 -10
- package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-CmQABzV3.js} +6 -6
- package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-D2d54RL8.js} +5 -5
- package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-Bu9B8NUY.js} +8 -8
- package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CtzNCHKu.js} +7 -7
- package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-DHCDlQUI.cjs} +8 -8
- package/dist/src/{cometapi-BUlt_ELa.js → cometapi-OBILPLlu.js} +8 -8
- package/dist/src/{completion-HUe8wDhZ.js → completion-CO2e1_62.js} +6 -6
- package/dist/src/{completion-BozdoXba.cjs → completion-CSYfl2cd.cjs} +6 -6
- package/dist/src/{completion-x0a_c2y1.js → completion-DZNxcyfG.js} +6 -6
- package/dist/src/{completion-Dnxn7E-j.js → completion-sNvCLTAP.js} +5 -5
- package/dist/src/constants-BjJV0cRr.js +6 -0
- package/dist/src/constants-DH5XYLKZ.js +7 -0
- package/dist/src/constants-DZGEFLsu.js +6 -0
- package/dist/src/constants-a2kYssQk.cjs +11 -0
- package/dist/src/{createHash-4gFQpDDv.js → createHash-BtbSX3mj.js} +1 -1
- package/dist/src/{createHash-CwDVU5xr.js → createHash-CGVzWdjj.js} +1 -1
- package/dist/src/{createHash-B7KvgoOD.cjs → createHash-CSiqnK5P.cjs} +2 -2
- package/dist/src/{createHash-ChI45QR1.js → createHash-CgRvs4Fn.js} +1 -1
- package/dist/src/crescendo-BXEJK_bi.cjs +704 -0
- package/dist/src/crescendo-CU_Y2i-m.js +702 -0
- package/dist/src/crescendo-J1Xx4_zb.js +703 -0
- package/dist/src/crescendo-QiaSLW0d.js +701 -0
- package/dist/src/custom-BJfP00Bh.js +619 -0
- package/dist/src/custom-CZVn-1-r.js +620 -0
- package/dist/src/custom-Cqia7M0D.cjs +621 -0
- package/dist/src/custom-notggYVl.js +618 -0
- package/dist/src/{docker-DCgsveLD.js → docker-4D1eL6Gq.js} +6 -6
- package/dist/src/{docker-ClnmCf1Z.js → docker-BBv1WUDu.js} +5 -5
- package/dist/src/{docker-DS4_Osau.cjs → docker-D06JUoe2.cjs} +6 -6
- package/dist/src/{docker-CQmlA2NU.js → docker-DdJQBxK9.js} +6 -6
- package/dist/src/{embedding-D3xTseo7.js → embedding--UZVe4_7.js} +6 -6
- package/dist/src/{embedding-I45KG3o7.cjs → embedding-BbrwopfX.cjs} +6 -6
- package/dist/src/{embedding-nFbumxcv.js → embedding-Bi3rxrZF.js} +5 -5
- package/dist/src/{embedding-DD9wa3ae.js → embedding-C251p1-8.js} +6 -6
- package/dist/src/{errors-Cw810C93.js → errors-9PcUL8BC.js} +1 -1
- package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
- package/dist/src/{esm-CtEPLdAj.cjs → esm-BIKakvNa.cjs} +8 -7
- package/dist/src/{esm-C7PnfdF8.js → esm-BTK1W7lG.js} +1 -1
- package/dist/src/{esm-tVgYPY-f.js → esm-Bexx2PFc.js} +2 -2
- package/dist/src/{eval-u4UVafl6.js → eval-0VRANImH.js} +21 -21
- package/dist/src/{eval-CzJFfFO9.js → eval-DscR5iOM.js} +1 -1
- package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-2RRJvFyB.js} +41 -16
- package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-CvtS8h8u.cjs} +51 -15
- package/dist/src/evalResult-DqzsS6_W.js +3 -0
- package/dist/src/{evalResult-D3hVYFis.js → evalResult-eUkJv9Ko.js} +40 -15
- package/dist/src/evaluator-DNdJF1Gv.js +3 -0
- package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-DRoiYB2q.js} +1060 -187
- package/dist/src/evaluatorHelpers-BsYP_muT.js +511 -0
- package/dist/src/evaluatorHelpers-CRqTvSux.cjs +537 -0
- package/dist/src/evaluatorHelpers-DuqFFfq7.js +510 -0
- package/dist/src/{extractor-CAfTSraf.js → extractor-BR7XAzAL.js} +6 -6
- package/dist/src/{extractor-WVPOrH43.cjs → extractor-BdxEtt3J.cjs} +6 -6
- package/dist/src/{extractor-DNSeBVOJ.js → extractor-CIW3iN-b.js} +6 -6
- package/dist/src/{extractor-Dk6bRWkv.js → extractor-CxRtnaHl.js} +5 -5
- package/dist/src/{fetch-B0Z3Oe4k.js → fetch-BufrQtvR.js} +93 -40
- package/dist/src/{fetch-BEWnXrrG.js → fetch-DXUnXkVU.js} +89 -40
- package/dist/src/{fetch-CJU5ELPa.cjs → fetch-Dw4XZHjj.cjs} +330 -270
- package/dist/src/{fetch-Di00EQrc.js → fetch-It34O8Ur.js} +305 -252
- package/dist/src/fetch-_YgGd2qv.js +3 -0
- package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-BhdwzYaD.cjs} +24 -1
- package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-CXRfY3Ss.js} +12 -2
- package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-D4GCJ67J.js} +12 -2
- package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-CMVNrYvE.js} +1 -1
- package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-14nugQQx.cjs} +14 -2
- package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-BPVvltoW.js} +2 -2
- package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-D18lYzhB.js} +2 -2
- package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-jJKYsnjc.js} +2 -2
- package/dist/src/goat-Ckd3q3AY.js +467 -0
- package/dist/src/goat-Qgurm-NP.js +466 -0
- package/dist/src/goat-ghadEDdy.js +465 -0
- package/dist/src/goat-una6pZGP.cjs +469 -0
- package/dist/src/graders-BDT7dif6.js +3 -0
- package/dist/src/{graders-eIHhRqoC.js → graders-BGP99PdK.js} +2416 -2224
- package/dist/src/{graders-Zy3x0zqX.js → graders-BX0f2tvS.js} +2423 -2226
- package/dist/src/{graders-pvbReLLn.js → graders-C0nXU_ZP.js} +1806 -1609
- package/dist/src/{graders--zknU_uk.cjs → graders-ClrU2fnd.cjs} +2219 -1949
- package/dist/src/hydra-BSNZZm2M.js +543 -0
- package/dist/src/hydra-BxdG4nkg.js +541 -0
- package/dist/src/hydra-DE4xWwyc.js +542 -0
- package/dist/src/hydra-DrJttnvw.cjs +542 -0
- package/dist/src/image-B4oBtu6J.js +443 -0
- package/dist/src/{image-dnoUgPrC.js → image-BN-hjLL9.js} +4 -4
- package/dist/src/{image-9302QVqR.js → image-B_fPIwdg.js} +3 -3
- package/dist/src/image-BvUAW344.js +442 -0
- package/dist/src/image-Cvjwx1uY.js +442 -0
- package/dist/src/{image-De2FBmYV.cjs → image-DfVCGPbI.cjs} +4 -4
- package/dist/src/{image-u7-rKnYU.js → image-QzmydkiG.js} +4 -4
- package/dist/src/image-X0oY4350.cjs +465 -0
- package/dist/src/index.cjs +1689 -558
- package/dist/src/index.d.cts +3270 -1624
- package/dist/src/index.d.ts +3270 -1624
- package/dist/src/index.js +1553 -438
- package/dist/src/indirectWebPwn-02ZIghCS.js +259 -0
- package/dist/src/indirectWebPwn-BJ22AbQa.cjs +397 -0
- package/dist/src/indirectWebPwn-CbjUG0rh.js +385 -0
- package/dist/src/indirectWebPwn-CfQJt3gk.cjs +260 -0
- package/dist/src/indirectWebPwn-DBQhOjoD.js +260 -0
- package/dist/src/indirectWebPwn-OsXnKejv.js +259 -0
- package/dist/src/indirectWebPwn-tNx9OZ35.js +385 -0
- package/dist/src/indirectWebPwn-uyWdHx04.js +386 -0
- package/dist/src/inputVariables-B0qUChbV.js +467 -0
- package/dist/src/inputVariables-DUGMb9Ka.js +464 -0
- package/dist/src/inputVariables-DXFdi7AI.js +468 -0
- package/dist/src/inputVariables-Dq9W-Z3a.cjs +475 -0
- package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-C4QlIuoR.js} +2 -2
- package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
- package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
- package/dist/src/{invariant-kfQ8Bu82.cjs → invariant-QtnLD03y.cjs} +1 -1
- package/dist/src/iterative-CpU6i2As.js +490 -0
- package/dist/src/iterative-DJQEQpG3.js +491 -0
- package/dist/src/iterative-DQBuWM-j.cjs +493 -0
- package/dist/src/iterative-FTS4Bz67.js +492 -0
- package/dist/src/iterativeImage-BUABMVOA.js +413 -0
- package/dist/src/iterativeImage-ByFWkxax.cjs +415 -0
- package/dist/src/iterativeImage-BzUapOUi.js +414 -0
- package/dist/src/iterativeImage-Doz8mgxF.js +413 -0
- package/dist/src/iterativeMeta-B3YiAOc8.js +386 -0
- package/dist/src/iterativeMeta-C7APE_P1.js +385 -0
- package/dist/src/iterativeMeta-CSS8M6Ds.cjs +385 -0
- package/dist/src/iterativeMeta-DgoQ7bLh.js +384 -0
- package/dist/src/iterativeTree-B5zxBBSW.js +769 -0
- package/dist/src/iterativeTree-CNyIk0Yn.js +768 -0
- package/dist/src/iterativeTree-CPMF10ve.cjs +771 -0
- package/dist/src/iterativeTree-DvZ7GBwt.js +770 -0
- package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-BadkINlJ.js} +24 -10
- package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-Bi_8sV-H.js} +25 -11
- package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CkMljjdg.js} +25 -11
- package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-DUh34xba.cjs} +25 -11
- package/dist/src/{litellm-DRjpcSa7.js → litellm-BKBo0jpC.js} +5 -5
- package/dist/src/{litellm-C2kqjxqp.js → litellm-BXyn5kZK.js} +5 -5
- package/dist/src/{litellm-p37R1dzQ.js → litellm-CNcfbCfa.js} +4 -4
- package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CtAr7bKG.cjs} +5 -5
- package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
- package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
- package/dist/src/{logger-COuQb2xB.cjs → logger-cfNpzI4o.cjs} +13 -55
- package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-BMX1iEB6.js} +5 -5
- package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CR5TSpp4.js} +5 -5
- package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-D3FUc2K3.cjs} +9 -8
- package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-OEMmS1RB.js} +6 -6
- package/dist/src/main.js +909 -369
- package/dist/src/memoryPoisoning-CM83NWYl.js +107 -0
- package/dist/src/memoryPoisoning-D8h9gXJF.js +106 -0
- package/dist/src/memoryPoisoning-Dp-btinn.cjs +106 -0
- package/dist/src/memoryPoisoning-cLuCoTuJ.js +106 -0
- package/dist/src/{messages-BTQz42fn.js → messages-BabO-cX8.js} +273 -17
- package/dist/src/{messages-811uVVW5.cjs → messages-DBPir0TQ.cjs} +278 -18
- package/dist/src/{messages-zWbkLLHz.js → messages-DGUlSNU7.js} +273 -17
- package/dist/src/{messages-MYTQ2TWp.js → messages-vsE_-Lv0.js} +273 -17
- package/dist/src/{meteor-DHdzY1Ss.js → meteor--TZYICTI.js} +2 -2
- package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-CR226f7Z.cjs} +2 -2
- package/dist/src/{meteor-CU5UAE-H.js → meteor-Cl_yd7rJ.js} +2 -2
- package/dist/src/{meteor-DuAFv6gF.js → meteor-Dce-_zGQ.js} +1 -1
- package/dist/src/mischievousUser-0l8GD7Dp.js +46 -0
- package/dist/src/mischievousUser-BUOP9W5r.js +46 -0
- package/dist/src/mischievousUser-frFYKxu6.js +47 -0
- package/dist/src/mischievousUser-olGgHIVR.cjs +46 -0
- package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-CNV5bMSk.cjs} +7 -7
- package/dist/src/{modelslab-D0erNWKe.js → modelslab-Cogmu4mG.js} +6 -6
- package/dist/src/{modelslab-DIq-6y7x.js → modelslab-Dzst7VTU.js} +6 -6
- package/dist/src/{modelslab-wu9yi5GE.js → modelslab-EyDczZ5A.js} +7 -7
- package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-BGPNBOMS.js} +6 -6
- package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-B_5NKFu1.js} +5 -5
- package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-C4eUJGse.js} +5 -5
- package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-CjJRxI1X.cjs} +9 -8
- package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BNGmgfFz.js} +4 -4
- package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-ChPlh5na.js} +4 -4
- package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CrV0iaY_.js} +3 -3
- package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-DuOG9Aun.cjs} +5 -4
- package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
- package/dist/src/{openai-URNyItar.cjs → openai-C3uXv8wS.cjs} +2 -2
- package/dist/src/{openai-Qsvz25mV.js → openai-CJrsh9n4.js} +2 -2
- package/dist/src/{openai-iYtrXzOX.js → openai-zgwBb4Ff.js} +1 -1
- package/dist/src/{openclaw-CnQ363Wi.js → openclaw-BIHlu_36.js} +10 -8
- package/dist/src/{openclaw-CwzlQSQX.js → openclaw-CF7fMido.js} +9 -7
- package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-Dphc01BY.cjs} +18 -15
- package/dist/src/{openclaw-CLWrW03k.js → openclaw-zIJAsz3P.js} +10 -8
- package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-B3vlPLsp.js} +40 -5
- package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-D05JSgMQ.js} +40 -5
- package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-DoY6GbWw.cjs} +46 -10
- package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-sRKYHGoI.js} +39 -4
- package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver--gTpSagc.js} +120 -4
- package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-B2eaKC8C.js} +120 -4
- package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-BXjcRqAM.js} +119 -3
- package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-CvJdBGSc.cjs} +125 -7
- package/dist/src/packageParser--MWTSrPW.js +36 -0
- package/dist/src/packageParser-CgE-ziRo.js +35 -0
- package/dist/src/packageParser-QoCS1FMl.cjs +54 -0
- package/dist/src/packageParser-hwwSGnAZ.js +35 -0
- package/dist/src/processShim-BBxt7LKO.js +95 -0
- package/dist/src/processShim-BcGzU8fY.js +94 -0
- package/dist/src/processShim-C_z3aRvF.js +94 -0
- package/dist/src/processShim-DSY9BV2T.cjs +98 -0
- package/dist/src/promptLength-0qIHyhA5.js +71 -0
- package/dist/src/promptLength-4X-Wd8PG.js +72 -0
- package/dist/src/promptLength-B9nZEfO6.js +71 -0
- package/dist/src/promptLength-BbBbDHNj.cjs +94 -0
- package/dist/src/promptfoo-BDrfT30-.js +180 -0
- package/dist/src/promptfoo-Cm4hiy1Y.js +180 -0
- package/dist/src/promptfoo-Rjp-MeBb.js +181 -0
- package/dist/src/promptfoo-b-baRMj-.cjs +205 -0
- package/dist/src/prompts-BYMtqPCw.js +259 -0
- package/dist/src/prompts-C-bqE1Yp.js +260 -0
- package/dist/src/prompts-Cp_Qx5Ml.js +270 -0
- package/dist/src/prompts-DHhQsANy.js +259 -0
- package/dist/src/prompts-D_QpZ2Dm.js +271 -0
- package/dist/src/prompts-hNvWBD3z.cjs +284 -0
- package/dist/src/prompts-huDVH2CI.js +270 -0
- package/dist/src/prompts-p78Hul5i.cjs +289 -0
- package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
- package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-CZO_w7ue.cjs} +2 -2
- package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DHcFiVWX.js} +1 -1
- package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-ReCd0sFa.js} +2 -2
- package/dist/src/{providers-BV_KMZje.js → providers-B9KzWxAX.js} +10558 -21587
- package/dist/src/{providers-DruaQfwu.js → providers-BCCz6_IX.js} +1228 -12196
- package/dist/src/{providers-1eKkXBKp.cjs → providers-BDVVIQM6.cjs} +10649 -21843
- package/dist/src/{providers-iUt5fbAN.js → providers-BYAn82cf.js} +1 -1
- package/dist/src/{providers-Domz_llv.js → providers-DVYRZP4E.js} +10589 -21570
- package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-CLCgQ9tt.js} +3 -3
- package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-CgYxeSmO.js} +3 -3
- package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-Cokhluq3.cjs} +8 -7
- package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-D0BYebvX.js} +3 -3
- package/dist/src/{quiverai-DFotyafY.cjs → quiverai-BAp6iTZD.cjs} +4 -4
- package/dist/src/{quiverai-aPPvXOgn.js → quiverai-BvIhI_0l.js} +4 -4
- package/dist/src/{quiverai-DR0SnIQV.js → quiverai-CdTWPe-A.js} +3 -3
- package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Cv7rJKDz.js} +4 -4
- package/dist/src/registry-BUJrgjwv.js +124 -0
- package/dist/src/registry-DXm1t_x0.js +125 -0
- package/dist/src/registry-Dp5EqoXc.js +124 -0
- package/dist/src/registry-KCVF1CFC.cjs +124 -0
- package/dist/src/{server-D6Il2Sob.js → remoteGeneration-B1_XsKXU.js} +16 -108
- package/dist/src/{server-BSB45Nt9.js → remoteGeneration-COpWcmWd.js} +15 -146
- package/dist/src/{server-Dx2TyCH2.cjs → remoteGeneration-DS9N3pgB.cjs} +30 -119
- package/dist/src/remoteGeneration-DsaSwmG2.js +217 -0
- package/dist/src/render-BNTrbmBw.cjs +384 -0
- package/dist/src/render-CSP99NLm.js +348 -0
- package/dist/src/render-DFfDeYUK.js +347 -0
- package/dist/src/{render-CgVDrJmM.js → render-DznWrxGO.js} +2 -2
- package/dist/src/render-_6ur1fhE.js +347 -0
- package/dist/src/resourceAttributes-D1jP3kL5.js +17 -0
- package/dist/src/resourceAttributes-DQbBB--2.js +16 -0
- package/dist/src/resourceAttributes-ephgOvdR.cjs +27 -0
- package/dist/src/resourceAttributes-v6-I67fn.js +16 -0
- package/dist/src/{responses-Bi9vBuW_.cjs → responses-1UFFF9N_.cjs} +51 -16
- package/dist/src/{responses-DL9m8CyY.js → responses-B3W2JvOQ.js} +49 -15
- package/dist/src/{responses--OsX2aYW.js → responses-B6ktc3Ra.js} +49 -15
- package/dist/src/{responses-C-flexAY.js → responses-URRzV8qE.js} +49 -15
- package/dist/src/rolldown-runtime-D_mwlA32.cjs +43 -0
- package/dist/src/rubyUtils-BYVlQ94c.js +3 -0
- package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CXlFM2rR.js} +3 -3
- package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-CnlW8AYb.js} +3 -3
- package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-CqUWBZAt.cjs} +18 -27
- package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DdGojpfv.js} +3 -3
- package/dist/src/runtimeTransform-BJOpL9Yc.js +142 -0
- package/dist/src/runtimeTransform-Dgh_D7DU.js +143 -0
- package/dist/src/runtimeTransform-DigbjU1r.js +142 -0
- package/dist/src/runtimeTransform-ON3YYILw.cjs +147 -0
- package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-CujrzP1a.js} +62 -51
- package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-DzffAqo_.cjs} +65 -53
- package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-vhtSV7JI.js} +62 -51
- package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-yr1QKeBs.js} +61 -50
- package/dist/src/{scanner-1DqWi1Ej.js → scanner-DS0109SS.js} +7 -7
- package/dist/src/server/index.js +5105 -605
- package/dist/src/server-B8rqV126.cjs +126 -0
- package/dist/src/server-BaLytskk.js +3 -0
- package/dist/src/server-CMJD10J4.js +107 -0
- package/dist/src/server-Ddp8GNMp.js +146 -0
- package/dist/src/server-DhMHosWj.js +182 -0
- package/dist/src/shared-7pmVZLNO.js +1334 -0
- package/dist/src/shared-9WHQ1oNE.js +1335 -0
- package/dist/src/{fileExtensions-BArZuxsI.js → shared-BoG7qLMv.js} +12 -2
- package/dist/src/shared-D6IjElRI.js +1334 -0
- package/dist/src/shared-WkgnDkcg.cjs +1436 -0
- package/dist/src/{signal-CE5G3a7x.js → signal-CSurUUyV.js} +3 -3
- package/dist/src/simulatedUser-C9aQObBI.js +222 -0
- package/dist/src/simulatedUser-Cu601Dd4.cjs +227 -0
- package/dist/src/simulatedUser-U_qAHnuB.js +222 -0
- package/dist/src/simulatedUser-p3tACcmw.js +223 -0
- package/dist/src/{slack-DDUe-5MC.js → slack-Bapo-7_8.js} +2 -2
- package/dist/src/{slack-1Rhq0EoV.cjs → slack-DMC1QVEg.cjs} +3 -2
- package/dist/src/{slack-D5Wpy8LM.js → slack-DTEFhrMn.js} +2 -2
- package/dist/src/{slack-acRb0IqQ.js → slack-k-_CP84Q.js} +1 -1
- package/dist/src/storage-BU4qcnOb.js +875 -0
- package/dist/src/storage-CA-v9V2v.cjs +911 -0
- package/dist/src/storage-CD-GWAdx.js +822 -0
- package/dist/src/storage-QdU-SmvD.js +834 -0
- package/dist/src/{store-DAAyxcy6.cjs → store-B2NDDooM.cjs} +60 -24
- package/dist/src/{store-CYEy5J2D.js → store-DKd5592Q.js} +51 -20
- package/dist/src/{store-M0b1WfYb.js → store-HpopRVzl.js} +50 -19
- package/dist/src/store-IbiRIF3k.js +3 -0
- package/dist/src/strategies-7CS3Alao.cjs +2360 -0
- package/dist/src/strategies-CiSeroPH.js +2331 -0
- package/dist/src/strategies-DRJjGTIY.js +2333 -0
- package/dist/src/{tables-DQ4WU5tX.js → tables-CRSXQ2Ke.js} +2 -2
- package/dist/src/{tables-CsWou1Bx.js → tables-CxjU7bBd.js} +3 -3
- package/dist/src/{tables-DUfh1F7Z.cjs → tables-DBIJU0WE.cjs} +6 -5
- package/dist/src/{tables-C4CH3zRr.js → tables-DafUHOeh.js} +3 -3
- package/dist/src/{telemetry-CQPez_Jp.js → telemetry-00ezXr_t.js} +5 -4
- package/dist/src/telemetry-ByPqDcKC.js +3 -0
- package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CJ7FnCsc.cjs} +18 -11
- package/dist/src/{telemetry-dbaJ0E98.js → telemetry-DmXYcJNV.js} +5 -4
- package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-DwX9XUN5.js} +4 -3
- package/dist/src/{text-KvuD2Iko.js → text-Db-Wt2u2.js} +1 -1
- package/dist/src/{text-DHxdyQqT.js → text-DwYK5EBn.js} +1 -1
- package/dist/src/{text-BVi-cLPJ.cjs → text-nywWsRBM.cjs} +1 -1
- package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-BjVkdk18.js} +1 -1
- package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-CDet74yk.js} +1 -1
- package/dist/src/tokenUsageUtils-CmnQ0G2m.js +142 -0
- package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-_B-P8IAi.cjs} +1 -1
- package/dist/src/toolAttributes-BAjwcBf0.cjs +103 -0
- package/dist/src/toolAttributes-COVgDrBG.js +87 -0
- package/dist/src/toolAttributes-DJ9ZEKXD.js +86 -0
- package/dist/src/tracingOptions-BnwKCkSB.js +221 -0
- package/dist/src/tracingOptions-Chi74lOD.js +219 -0
- package/dist/src/tracingOptions-DrbSFaKy.cjs +249 -0
- package/dist/src/tracingOptions-ji2OuXbT.js +220 -0
- package/dist/src/{transcription-DuWDupG7.js → transcription-B8uIgCYX.js} +5 -5
- package/dist/src/{transcription-CJspiD2c.js → transcription-CfU5loSq.js} +6 -6
- package/dist/src/{transcription-V2HaAmy2.js → transcription-Dkd22_4K.js} +6 -6
- package/dist/src/{transcription-BvjmiYB1.cjs → transcription-mzuf18Mq.cjs} +9 -8
- package/dist/src/{transform-lQrDE1BQ.js → transform-BIMynQsA.js} +9 -9
- package/dist/src/transform-BnSTnFlp.js +187 -0
- package/dist/src/transform-BnSXWmU_2.cjs +221 -0
- package/dist/src/transform-CGt7Kt3y2.js +186 -0
- package/dist/src/transform-CrPGTsij.js +186 -0
- package/dist/src/{transform-CTeuTR3S.cjs → transform-DhNkAUs8.cjs} +13 -12
- package/dist/src/{transform-CG0ehZNG.js → transform-DmvYBRll.js} +9 -9
- package/dist/src/{transform-zDhMmzwX.js → transform-EtD4jAWi.js} +9 -9
- package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-0ThtPved.js} +1 -1
- package/dist/src/transformersAvailability-BYydDE5U.js +35 -0
- package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-BvyU9vDD.js} +1 -1
- package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-BytPvKUW.cjs} +1 -1
- package/dist/src/{types-Dm9JM6Vb.js → types-BFevViUY.js} +115 -19
- package/dist/src/{types-Bgh5SOn6.js → types-BJQBBPTP.js} +115 -19
- package/dist/src/{types-CeaeaZdP.cjs → types-CxJvaY2S.cjs} +357 -172
- package/dist/src/{types-BGQDAP8i.js → types-D6glLbdF.js} +271 -170
- package/dist/src/{util-BYvQUPp7.js → util--WMgw7wM.js} +28 -8
- package/dist/src/{util-C9J8ahRn.js → util-5WnCSb0h.js} +72 -48
- package/dist/src/{util-CN3SrLT4.cjs → util-BSIuSLVK.cjs} +74 -49
- package/dist/src/{util-C8e5uydV.js → util-Bx677_k2.js} +154 -147
- package/dist/src/util-CN8om2rz.cjs +386 -0
- package/dist/src/{util-DDs-7g6-.js → util-CoQWM76y.js} +28 -8
- package/dist/src/util-DNl96nNs.js +327 -0
- package/dist/src/{util-DxWpWjhc.js → util-DURocbYR.js} +667 -507
- package/dist/src/util-Df8YMvS1.js +327 -0
- package/dist/src/{util-DvU2Pw8c.js → util-DiQ3QvBB.js} +28 -8
- package/dist/src/{util-oGMLA7vc.js → util-I-Rf-KaD.js} +862 -577
- package/dist/src/{util-olYL5C6N.cjs → util-IYzs5Y04.cjs} +33 -7
- package/dist/src/{util-D9TisOyk.js → util-LKTmNsMQ.js} +71 -47
- package/dist/src/{util-Bxn8emtE.cjs → util-SPsvFONY.cjs} +738 -582
- package/dist/src/{util-D3q0WQ-0.js → util-efByNxcr.js} +72 -48
- package/dist/src/util-kDURhgJW.js +328 -0
- package/dist/src/{utils-DJfvjyMj.js → utils-B0lzitHZ.js} +3 -3
- package/dist/src/{utils-BLJKfv0y.js → utils-BFOh20Gb.js} +3 -3
- package/dist/src/{utils-hXtCYanr.js → utils-BGY69tk_.js} +2 -2
- package/dist/src/{utils-B05gLxER.cjs → utils-Ve6kuJsa.cjs} +3 -3
- package/dist/src/version-BK20a4sw.js +16 -0
- package/dist/src/version-BWCSaByA.cjs +27 -0
- package/dist/src/version-eRkNuGv8.js +17 -0
- package/dist/src/version-lpHV_53E.js +16 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +56 -28
- package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
- package/dist/src/app/assets/index-BXGkeMwh.css +0 -1
- package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
- package/dist/src/app/assets/sync-IjzpWrOE.js +0 -4
- package/dist/src/app/assets/vendor-charts-BNdH8TCw.js +0 -36
- package/dist/src/cache-Cr9oLMUa.js +0 -3
- package/dist/src/cache-DbLsVWB2.cjs +0 -3
- package/dist/src/cloud-Hphvo8kr.js +0 -3
- package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
- package/dist/src/codex-sdk-CWEnH70W.cjs +0 -2
- package/dist/src/evalResult-D8MT9p0s.js +0 -3
- package/dist/src/evalResult-DElBuddX.js +0 -2
- package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
- package/dist/src/evaluator-CVessDWe.js +0 -3
- package/dist/src/fetch-C7bGKDlQ.js +0 -3
- package/dist/src/graders-BOAzQEUe.cjs +0 -2
- package/dist/src/graders-D4BTsZdG2.js +0 -3
- package/dist/src/graders-DOJK1XpV.js +0 -2
- package/dist/src/graders-NAv9LcBn.js +0 -2
- package/dist/src/image-B5Mv-Z3h.js +0 -257
- package/dist/src/image-DVz2RiMF.js +0 -258
- package/dist/src/image-qUpPvmNZ.js +0 -257
- package/dist/src/image-x6KqLQl4.cjs +0 -280
- package/dist/src/providers-Bp4S-FvO.js +0 -2
- package/dist/src/providers-DV3ax9e_.cjs +0 -3
- package/dist/src/providers-u9Enmfok.js +0 -2
- package/dist/src/render-CH-62LbA.js +0 -135
- package/dist/src/render-CMEpfLaO.js +0 -136
- package/dist/src/render-DHIZ6_k8.js +0 -135
- package/dist/src/render-DfQSFxGE.cjs +0 -165
- package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
- package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
- package/dist/src/server-BNYztJkh.js +0 -385
- package/dist/src/server-DCtHUqlp.js +0 -3
- package/dist/src/server-DaA2eR26.cjs +0 -2
- package/dist/src/store-CWOSz6D_.cjs +0 -2
- package/dist/src/store-DCDBhv7B.js +0 -3
- package/dist/src/store-Dn9HUkdW.js +0 -240
- package/dist/src/telemetry-C1IqxcdW.js +0 -3
- package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
- package/dist/src/transform-Bbg6A8Jk.js +0 -216
- package/dist/src/transform-CUnzlsbn.cjs +0 -228
- package/dist/src/transform-DYX1_Xnh.js +0 -216
- package/dist/src/transform-DgKlRr73.cjs +0 -2
- package/dist/src/transform-M6ITAESf.js +0 -3
- package/dist/src/transform-UN5UGu8U.js +0 -213
|
@@ -1,29 +1,39 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { A as getMaxEvalTimeMs, D as getEnvInt, N as state, O as getEnvString, S as summarizeEvaluateResultForLogging, T as getEnvBool, _ as extractJsonObjects, c as setLogCallback, j as isCI, k as getEvalTimeoutMs, r as globalLogCallback, s as logger, v as getAjv, x as safeJsonStringify } from "./logger-
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
11
|
-
import {
|
|
12
|
-
import { c as
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import { r as runPython } from "./pythonUtils-
|
|
16
|
-
import {
|
|
17
|
-
import {
|
|
18
|
-
import {
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import {
|
|
22
|
-
import {
|
|
23
|
-
import { t as
|
|
24
|
-
import { n as
|
|
25
|
-
import {
|
|
26
|
-
import {
|
|
2
|
+
import { A as getMaxEvalTimeMs, D as getEnvInt, N as state, O as getEnvString, S as summarizeEvaluateResultForLogging, T as getEnvBool, _ as extractJsonObjects, c as setLogCallback, g as extractFirstJsonObject, j as isCI, k as getEvalTimeoutMs, r as globalLogCallback, s as logger, v as getAjv, x as safeJsonStringify } from "./logger-BbY6ypFL.js";
|
|
3
|
+
import { L as FILE_METADATA_KEY, f as sleep, r as fetchWithRetries, w as parseChatPrompt, x as isPromptfooSampleTarget } from "./fetch-DXUnXkVU.js";
|
|
4
|
+
import { n as VERSION } from "./version-eRkNuGv8.js";
|
|
5
|
+
import { t as invariant } from "./invariant-B2Rf6avk.js";
|
|
6
|
+
import { r as telemetry } from "./telemetry-00ezXr_t.js";
|
|
7
|
+
import { at as MULTI_INPUT_VAR, d as isGradingResult, nt as LLAMA_GUARD_REPLICATE_PROVIDER, p as isApiProvider, s as ResultFailureReason } from "./types-BFevViUY.js";
|
|
8
|
+
import { i as isJavascriptFile } from "./fileExtensions-D4GCJ67J.js";
|
|
9
|
+
import { i as extractVariablesFromTemplate, o as getNunjucksEngine, r as analyzeTemplateReference } from "./render-CSP99NLm.js";
|
|
10
|
+
import { t as providerRegistry } from "./providerRegistry-ReCd0sFa.js";
|
|
11
|
+
import { l as shouldGenerateRemote } from "./remoteGeneration-B1_XsKXU.js";
|
|
12
|
+
import { c as promptYesNo } from "./server-DhMHosWj.js";
|
|
13
|
+
import { n as isNonTransientHttpStatus } from "./errors-9PcUL8BC.js";
|
|
14
|
+
import { l as withCacheNamespace, o as getCache } from "./cache-DpPWrkTE.js";
|
|
15
|
+
import { r as runPython } from "./pythonUtils-CgYxeSmO.js";
|
|
16
|
+
import { B as parseFileUrl, C as isOpenAiProvider, F as maybeLoadToolsFromExternalFile, S as isGoogleProvider, w as isProviderAllowed, x as isAnthropicProvider, z as loadFunction } from "./util-DURocbYR.js";
|
|
17
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-DqUFcWI0.js";
|
|
18
|
+
import { h as validateFunctionCall } from "./transform-EtD4jAWi.js";
|
|
19
|
+
import { l as validateFunctionCall$1 } from "./util-5WnCSb0h.js";
|
|
20
|
+
import { _ as VertexChatProvider, n as loadApiProvider, v as GoogleLiveProvider, y as AIStudioChatProvider } from "./providers-B9KzWxAX.js";
|
|
21
|
+
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-CDet74yk.js";
|
|
22
|
+
import { t as getProcessShim } from "./processShim-BBxt7LKO.js";
|
|
23
|
+
import { n as loadFromPackage, t as isPackagePath } from "./packageParser--MWTSrPW.js";
|
|
24
|
+
import { n as runRuby } from "./rubyUtils-CXlFM2rR.js";
|
|
25
|
+
import { c as isBasicRefusal, o as getSessionId } from "./util-kDURhgJW.js";
|
|
26
|
+
import { $ as DEFAULT_WEB_SEARCH_PROMPT, Ct as getDefaultProviders, Dt as getGradingProvider, Et as getAndCheckProvider, G as matchesGEval, H as isGraderFailure, J as matchesTrajectoryGoalSuccess, K as matchesLlmRubric, Ot as getProviderCallExecutionContext, St as processFileReference, Tt as callProviderWithContext, U as matchesClosedQa, W as matchesFactuality, Y as doRemoteGrading, _t as splitIntoSentences, at as CONTEXT_RECALL, bt as getFinalTest, ct as CONTEXT_RELEVANCE, dt as renderLlmRubricPrompt, et as SELECT_BEST_PROMPT, ft as cosineSimilarity, gt as normalizeMatcherTokenUsage, ht as fail, it as CONTEXT_FAITHFULNESS_NLI_STATEMENTS, kt as withProviderCallExecutionContext, lt as CONTEXT_RELEVANCE_BAD, mt as euclideanDistance, n as getGraderById, nt as ANSWER_RELEVANCY_GENERATE, ot as CONTEXT_RECALL_ATTRIBUTED_TOKEN, pt as dotProduct, q as matchesPiScore, rt as CONTEXT_FAITHFULNESS_LONGFORM, st as CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN, tt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, ut as loadRubricPrompt, vt as tryParse, wt as DEFAULT_ANTHROPIC_MODEL, xt as loadFromJavaScriptFile, yt as coerceString } from "./graders-BGP99PdK.js";
|
|
27
|
+
import { f as redteamProviderManager, g as createProviderRateLimitOptions, h as createRateLimitRegistry, m as TokenUsageTracker } from "./shared-9WHQ1oNE.js";
|
|
28
|
+
import { i as generateIdFromPrompt } from "./utils-BFOh20Gb.js";
|
|
29
|
+
import { a as getTransformLabel, i as getTransformErrorMessage, o as transform, r as TransformInputType } from "./transform-BnSTnFlp.js";
|
|
30
|
+
import { n as getTraceStore } from "./store-DKd5592Q.js";
|
|
31
|
+
import { a as getActualPromptWithFallback, r as updateSignalFile } from "./signal-CSurUUyV.js";
|
|
32
|
+
import { t as extractAndStoreBinaryData } from "./extractor-CIW3iN-b.js";
|
|
33
|
+
import { i as throwIfTargetPromptExceedsMaxChars } from "./promptLength-4X-Wd8PG.js";
|
|
34
|
+
import { n as checkExfilTracking } from "./indirectWebPwn-uyWdHx04.js";
|
|
35
|
+
import { n as getFirstStringAttribute, r as getToolNameFromAttributes, t as TOOL_ARGUMENT_ATTRIBUTE_KEYS } from "./toolAttributes-COVgDrBG.js";
|
|
36
|
+
import { i as filterFiniteScores, n as renderPrompt, r as runExtensionHook, t as collectFileMetadata } from "./evaluatorHelpers-BsYP_muT.js";
|
|
27
37
|
import { AsyncResource } from "node:async_hooks";
|
|
28
38
|
import chalk from "chalk";
|
|
29
39
|
import fs, { createWriteStream } from "fs";
|
|
@@ -37,11 +47,12 @@ import readline from "readline";
|
|
|
37
47
|
import { globSync } from "glob";
|
|
38
48
|
import { XMLParser } from "fast-xml-parser";
|
|
39
49
|
import async from "async";
|
|
40
|
-
import
|
|
41
|
-
import { JSDOM } from "jsdom";
|
|
50
|
+
import { parse as parse$1 } from "parse5";
|
|
42
51
|
import { distance } from "fastest-levenshtein";
|
|
52
|
+
import cliProgress from "cli-progress";
|
|
43
53
|
import * as rouge from "js-rouge";
|
|
44
54
|
import { isDeepStrictEqual } from "node:util";
|
|
55
|
+
import { LRUCache } from "lru-cache";
|
|
45
56
|
import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
|
|
46
57
|
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
47
58
|
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
@@ -218,6 +229,505 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
218
229
|
};
|
|
219
230
|
};
|
|
220
231
|
//#endregion
|
|
232
|
+
//#region src/matchers/classification.ts
|
|
233
|
+
/**
 * Grades an output with a classification provider.
 *
 * @param expected Class label to look for. When undefined, the highest score across all returned classes is used.
 * @param output Text sent to the provider's `callClassificationApi`.
 * @param threshold Minimum score (compared with a `Number.EPSILON` tolerance) required to pass.
 * @param grading Optional grading config; only `grading.provider` is read here.
 * @returns A grading result with `pass`, `score` and a human-readable `reason`.
 */
async function matchesClassification(expected, output, threshold, grading) {
	const provider = await getAndCheckProvider("classification", grading?.provider, null, "classification check");
	const resp = await provider.callClassificationApi(output);
	if (!resp.classification) {
		return fail(resp.error || "Unknown error fetching classification");
	}

	const matchAnyClass = expected === void 0;
	let score;
	if (matchAnyClass) {
		const allScores = Object.values(resp.classification);
		if (allScores.length === 0) {
			return {
				pass: false,
				score: 0,
				reason: "No classification scores returned"
			};
		}
		score = Math.max(...allScores);
	} else {
		// A label missing from the response counts as a score of 0.
		score = resp.classification[expected] || 0;
	}

	const pass = score >= threshold - Number.EPSILON;
	const subject = matchAnyClass ? "Maximum classification score" : `Classification ${expected} has score`;
	const comparator = pass ? ">=" : "<";
	return {
		pass,
		score,
		reason: `${subject} ${score.toFixed(2)} ${comparator} ${threshold}`
	};
}
|
|
268
|
+
//#endregion
|
|
269
|
+
//#region src/matchers/comparison.ts
|
|
270
|
+
/**
 * Asks a text grading provider which of several outputs best satisfies `criteria`.
 *
 * The first run of digits in the provider's reply is read as the zero-based
 * index of the winning output.
 *
 * @param criteria Selection criteria rendered into the rubric prompt.
 * @param outputs Candidate outputs (asserted via `invariant` to contain at least two).
 * @param grading Optional grading config; `provider` and `rubricPrompt` are read.
 * @param vars Optional extra variables merged into the prompt variables.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns One grading result per output, in order; only the selected index passes.
 */
async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
	invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");

	// Built fresh for each consumer so the rendered prompt and the provider call never share an object.
	const buildPromptVars = () => ({
		criteria,
		outputs: outputs.map((o) => tryParse(o)),
		...vars || {}
	});

	const defaultProviders = await getDefaultProviders();
	const provider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "select-best check");
	const rubric = await loadRubricPrompt(grading?.rubricPrompt, SELECT_BEST_PROMPT);
	const renderedPrompt = await renderLlmRubricPrompt(rubric, buildPromptVars());
	const resp = await callProviderWithContext(provider, renderedPrompt, "select-best", buildPromptVars(), providerCallContext);

	// Every output receives the same failure when no verdict can be trusted.
	const failEveryOutput = (message) => Array.from({ length: outputs.length }, () => fail(message, resp.tokenUsage));

	if (resp.error || !resp.output) {
		return failEveryOutput(resp.error || "No output");
	}
	invariant(typeof resp.output === "string", "select-best produced malformed response");

	const digits = resp.output.trim().match(/\d+/);
	const verdict = digits ? Number.parseInt(digits[0], 10) : NaN;
	const isUsableIndex = !Number.isNaN(verdict) && verdict >= 0 && verdict < outputs.length;
	if (!isUsableIndex) {
		return failEveryOutput(`Invalid select-best verdict: ${verdict}`);
	}

	const tokensUsed = normalizeMatcherTokenUsage(resp.tokenUsage);
	return outputs.map((_output, position) => {
		const chosen = position === verdict;
		return {
			pass: chosen,
			score: chosen ? 1 : 0,
			reason: chosen ? `Output selected as the best: ${criteria}` : `Output not selected: ${criteria}`,
			tokensUsed
		};
	});
}
|
|
302
|
+
/**
 * Picks a single winning output by aggregating the scores that the other
 * assertions already produced for each output.
 *
 * Options are read from `assertion.value` when it is an object:
 * - `method`: `"sum"` adds the weighted scores; any other value divides the
 *   weighted sum by the total weight (default `"average"`).
 * - `weights`: map of assertion type to weight; types without an entry weigh 1.
 * - `threshold`: when defined, the highest score must reach it for any output to pass.
 *
 * @param outputs Candidate outputs (asserted via `invariant` to contain at least two).
 * @param resultsWithGradingResults One entry per output; scores are read from `gradingResult.componentResults`.
 * @param assertion The max-score assertion whose `value` carries the options.
 * @returns One grading result per entry of `resultsWithGradingResults`; the earliest entry with the highest score wins.
 * @throws Error when an entry has no component results other than max-score / select-best.
 */
async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
	invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
	const rawValue = assertion.value || {};
	const config = typeof rawValue === "object" ? rawValue : {};
	const options = {
		method: "method" in config ? config.method : "average",
		// A `weights` key that is present but null/undefined (e.g. an empty YAML key)
		// previously crashed on the per-type lookup below; treat it as "no weights".
		weights: config.weights ?? {},
		threshold: "threshold" in config ? config.threshold : void 0
	};
	const scores = resultsWithGradingResults.map((result, index) => {
		// Comparison assertions themselves must not feed back into the aggregate.
		const relevantResults = (result.gradingResult?.componentResults || []).filter((r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best");
		if (relevantResults.length === 0) throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
		let totalWeightedScore = 0;
		let totalWeight = 0;
		for (const componentResult of relevantResults) {
			const assertionType = componentResult.assertion?.type || "unknown";
			const weight = options.weights[assertionType] === void 0 ? 1 : options.weights[assertionType];
			const score = componentResult.score || 0;
			totalWeightedScore += score * weight;
			totalWeight += weight;
		}
		let aggregateScore;
		if (options.method === "sum") aggregateScore = totalWeightedScore;
		else aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
		return {
			index,
			score: aggregateScore,
			componentCount: relevantResults.length,
			totalWeight
		};
	});
	// Strict ">" keeps the earliest entry on ties.
	let maxScore = -Infinity;
	let winnerIndex = 0;
	for (let i = 0; i < scores.length; i++) {
		if (scores[i].score > maxScore) {
			maxScore = scores[i].score;
			winnerIndex = i;
		}
	}
	const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;
	return scores.map(({ index, score, componentCount, totalWeight }) => {
		const isWinner = index === winnerIndex && meetsThreshold;
		return {
			pass: isWinner,
			score: isWinner ? 1 : 0,
			reason: isWinner ? `Selected as highest scoring output (score: ${score.toFixed(3)})` : score === maxScore && !meetsThreshold ? `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}` : `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`,
			namedScores: {
				maxScore: score,
				assertionCount: componentCount,
				totalWeight
			}
		};
	});
}
|
|
353
|
+
//#endregion
|
|
354
|
+
//#region src/matchers/moderation.ts
|
|
355
|
+
/**
 * Runs a moderation provider over an assistant response.
 *
 * @param params.userPrompt Prompt passed to the provider's `callModerationApi`.
 * @param params.assistantResponse Response to moderate; an empty value passes immediately.
 * @param params.categories Flag codes to consider; an empty list means every flag counts.
 * @param grading Optional grading config; only `grading.provider` is read here.
 * @returns A grading result that fails when the provider errors or reports a relevant flag.
 */
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
	const verdict = (pass, reason) => ({
		pass,
		score: pass ? 1 : 0,
		reason
	});

	if (!assistantResponse) {
		return verdict(true, "No output to moderate");
	}

	const defaultProviders = await getDefaultProviders();
	// Without an OpenAI key but with Replicate credentials, the Replicate-hosted
	// provider is loaded instead of the configured default.
	let fallbackProvider;
	if (!getEnvString("OPENAI_API_KEY") && (getEnvString("REPLICATE_API_KEY") || getEnvString("REPLICATE_API_TOKEN"))) {
		fallbackProvider = await loadApiProvider(LLAMA_GUARD_REPLICATE_PROVIDER);
	} else {
		fallbackProvider = defaultProviders.moderationProvider;
	}

	const moderationProvider = await getAndCheckProvider("moderation", grading?.provider, fallbackProvider, "moderation check");
	invariant(moderationProvider, "Moderation provider must be defined");

	const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
	if (resp.error) {
		return verdict(false, `Moderation API error: ${resp.error}`);
	}

	const reportedFlags = resp.flags;
	if (!reportedFlags || reportedFlags.length === 0) {
		return verdict(true, "No moderation flags detected");
	}

	const relevantFlags = categories.length === 0 ? reportedFlags : reportedFlags.filter((flag) => categories.includes(flag.code));
	if (relevantFlags.length === 0) {
		return verdict(true, "No relevant moderation flags detected");
	}
	return verdict(false, `Moderation flags detected: ${relevantFlags.map((flag) => flag.description).join(", ")}`);
}
|
|
389
|
+
//#endregion
|
|
390
|
+
//#region src/assertions/contextUtils.ts
|
|
391
|
+
/**
 * Works out the context used by context-based assertions.
 *
 * Resolution order: a truthy `test.vars.context` (string or array of strings),
 * otherwise `fallbackContext`. When `assertion.contextTransform` is set, its
 * result replaces whatever was picked before.
 *
 * @param assertion - Assertion config; `contextTransform` is read.
 * @param test - Test case; `vars` is read and forwarded to the transform.
 * @param output - Provider output, used as transform input when the response carries no `providerTransformedOutput`.
 * @param prompt - Prompt text, handed to the transform as `prompt.label`.
 * @param fallbackContext - Optional context used when `test.vars.context` is falsy.
 * @param providerResponse - Optional provider response; `providerTransformedOutput` and `metadata` are read.
 * @returns A non-empty string, or a non-empty array of non-empty strings.
 * @throws Error when the transform fails or returns something other than a string / string array;
 *   the remaining checks are enforced through `invariant`.
 */
async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
	let contextValue;
	const declaredContext = test.vars?.context;
	if (declaredContext) {
		if (typeof declaredContext === "string") {
			contextValue = declaredContext;
		} else if (Array.isArray(declaredContext)) {
			const badIndex = declaredContext.findIndex((v) => typeof v !== "string");
			if (badIndex !== -1) {
				invariant(false, `Invalid context: expected an array of strings, but found ${typeof declaredContext[badIndex]} at index ${badIndex}`);
			}
			contextValue = declaredContext;
		}
	} else if (fallbackContext) {
		contextValue = fallbackContext;
	}

	if (assertion.contextTransform) {
		const describeTransform = () => getTransformLabel(assertion.contextTransform);
		try {
			const transformInput = providerResponse?.providerTransformedOutput ?? output;
			const transformContext = {
				vars: test.vars,
				prompt: { label: prompt }
			};
			// Metadata is only forwarded when the provider actually returned some.
			if (providerResponse && providerResponse.metadata) {
				transformContext.metadata = providerResponse.metadata;
			}
			const transformed = await transform(assertion.contextTransform, transformInput, transformContext);
			const isText = typeof transformed === "string";
			const isTextList = Array.isArray(transformed) && transformed.every((item) => typeof item === "string");
			invariant(isText || isTextList, () => `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${describeTransform()}`);
			contextValue = transformed;
		} catch (error) {
			// The type check above sits inside this try, so its failure is re-wrapped here too.
			throw new Error(`Failed to transform context using expression '${describeTransform()}': ${getTransformErrorMessage(error)}`);
		}
	}

	const isNonEmptyText = typeof contextValue === "string" && contextValue.length > 0;
	const isNonEmptyTextList = Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every((item) => typeof item === "string" && item.length > 0);
	invariant(isNonEmptyText || isNonEmptyTextList, "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
	return contextValue;
}
|
|
436
|
+
/**
 * Flattens a context into one prompt-ready string.
 * Arrays are joined with a blank line between chunks; any other value is returned unchanged.
 */
function serializeContext(context) {
	if (!Array.isArray(context)) {
		return context;
	}
	return context.join("\n\n");
}
|
|
443
|
+
//#endregion
|
|
444
|
+
//#region src/matchers/rag.ts
|
|
445
|
+
/**
 * Scores how relevant an answer is to the original input.
 *
 * The text provider is called three times with the rendered rubric prompt, and
 * each reply is kept as a candidate question. Every candidate is then embedded
 * and compared with the embedding of `input` via `cosineSimilarity`; the score
 * is the mean similarity.
 *
 * @param input Original question / input text.
 * @param output Answer being graded.
 * @param threshold Minimum mean similarity (with a `Number.EPSILON` tolerance) to pass.
 * @param grading Optional grading config; `provider` and `rubricPrompt` are read.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A grading result carrying accumulated token usage and, on completion, per-question metadata.
 */
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
	const providerDefaults = await getDefaultProviders();
	const embeddingProvider = await getAndCheckProvider("embedding", grading?.provider, providerDefaults.embeddingProvider, "answer relevancy check");
	const textProvider = await getAndCheckProvider("text", grading?.provider, providerDefaults.gradingProvider, "answer relevancy check");
	const tokensUsed = normalizeMatcherTokenUsage(void 0);
	const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, ANSWER_RELEVANCY_GENERATE);
	const parsedOutput = tryParse(output);
	const promptText = await renderLlmRubricPrompt(rubricPrompt, { answer: parsedOutput });

	// Step 1: collect three candidate questions from the text provider.
	const candidateQuestions = [];
	for (let round = 0; round < 3; round += 1) {
		const generation = await callProviderWithContext(textProvider, promptText, "answer-relevance", { answer: parsedOutput }, providerCallContext);
		accumulateTokenUsage(tokensUsed, generation.tokenUsage);
		if (generation.error || !generation.output) {
			return fail(generation.error || "No output", tokensUsed);
		}
		invariant(typeof generation.output === "string", "answer relevancy check produced malformed response");
		candidateQuestions.push(generation.output);
	}

	// Step 2: embed the input once, then each candidate question.
	invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id()} must implement callEmbeddingApi for similarity check`);
	const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
	accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
	if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) {
		return fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
	}

	const questionsWithScores = [];
	for (const question of candidateQuestions) {
		const questionEmbeddingResp = await embeddingProvider.callEmbeddingApi(question);
		accumulateTokenUsage(tokensUsed, questionEmbeddingResp.tokenUsage);
		if (questionEmbeddingResp.error || !questionEmbeddingResp.embedding) {
			return fail(questionEmbeddingResp.error || "No embedding", tokensUsed);
		}
		questionsWithScores.push({
			question,
			similarity: cosineSimilarity(inputEmbeddingResp.embedding, questionEmbeddingResp.embedding)
		});
	}

	// Step 3: average in collection order and compare with the threshold.
	const similarity = questionsWithScores.reduce((sum, entry) => sum + entry.similarity, 0) / questionsWithScores.length;
	const pass = similarity >= threshold - Number.EPSILON;
	return {
		pass,
		score: similarity,
		reason: pass ? `Relevance ${similarity.toFixed(2)} is greater than threshold ${threshold}` : `Relevance ${similarity.toFixed(2)} is less than threshold ${threshold}`,
		tokensUsed,
		metadata: {
			generatedQuestions: questionsWithScores,
			averageSimilarity: similarity,
			threshold
		}
	};
}
|
|
503
|
+
/**
 * Measures how much of the ground truth the grader attributes to the context.
 *
 * The grader's reply is split into sentences; only sentences containing one of
 * the two attribution tokens are counted. The score is the share of those
 * sentences that carry the "attributed" token and not the "not attributed" one.
 *
 * @param context Context string or array of chunks (serialized before prompting).
 * @param groundTruth Expected answer the context should support.
 * @param threshold Minimum recall (with a `Number.EPSILON` tolerance) to pass.
 * @param grading Optional grading config; `provider` and `rubricPrompt` are read.
 * @param vars Optional extra variables merged into the prompt variables.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A grading result with per-sentence attribution metadata.
 */
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
	const defaultProviders = await getDefaultProviders();
	const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context recall check");
	const contextString = serializeContext(context);

	// Built fresh for each consumer so the rendered prompt and the provider call never share an object.
	const buildPromptVars = () => ({
		context: contextString,
		groundTruth,
		...vars || {}
	});

	const rubric = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RECALL);
	const renderedPrompt = await renderLlmRubricPrompt(rubric, buildPromptVars());
	const resp = await callProviderWithContext(textProvider, renderedPrompt, "context-recall", buildPromptVars(), providerCallContext);
	if (resp.error || !resp.output) {
		return fail(resp.error || "No output", resp.tokenUsage);
	}
	invariant(typeof resp.output === "string", "context-recall produced malformed response");

	const attributedTokenLower = CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
	const notAttributedTokenLower = CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
	const mentions = (text, token) => text.toLowerCase().includes(token);

	const sentences = splitIntoSentences(resp.output).filter((line) => mentions(line, attributedTokenLower) || mentions(line, notAttributedTokenLower));

	const sentenceAttributions = sentences.map((sentence) => {
		// The negative token is checked as well, so a sentence carrying both tokens is not counted as attributed.
		const attributed = mentions(sentence, attributedTokenLower) && !mentions(sentence, notAttributedTokenLower);
		// Prefer the text after a leading "N." list marker; otherwise keep everything before the first period.
		const numbered = sentence.match(/^\d+\.\s*([^\.]+\.)/);
		const text = numbered ? numbered[1].trim() : sentence.split(".")[0].trim();
		return {
			sentence: text,
			attributed
		};
	});

	const attributedCount = sentenceAttributions.filter((entry) => entry.attributed).length;
	const score = sentences.length > 0 ? attributedCount / sentences.length : 0;
	const pass = score >= threshold - Number.EPSILON;
	const comparator = pass ? ">=" : "<";
	return {
		pass,
		score,
		reason: `Recall ${score.toFixed(2)} is ${comparator} ${threshold}`,
		tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
		metadata: {
			sentenceAttributions,
			totalSentences: sentences.length,
			attributedSentences: attributedCount,
			score
		}
	};
}
|
|
552
|
+
/**
 * Measures what share of the context the grader considers relevant to the question.
 *
 * Context units are the non-blank chunks of an array context, or the sentences
 * of a string context. The numerator is the number of distinct sentences in the
 * grader's reply, capped at the number of context units, and forced to 0 when
 * the reply contains `CONTEXT_RELEVANCE_BAD`.
 *
 * @param question Query the context should help answer.
 * @param context Context string or array of chunks.
 * @param threshold Minimum relevance (with a `Number.EPSILON` tolerance) to pass.
 * @param grading Optional grading config; `provider` and `rubricPrompt` are read.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A grading result with the extracted sentences and unit counts as metadata.
 */
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
	const defaultProviders = await getDefaultProviders();
	const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context relevance check");
	const contextString = serializeContext(context);

	// Built fresh for each consumer so the rendered prompt and the provider call never share an object.
	const buildPromptVars = () => ({
		context: contextString,
		query: question
	});

	const rubric = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RELEVANCE);
	const renderedPrompt = await renderLlmRubricPrompt(rubric, buildPromptVars());
	const resp = await callProviderWithContext(textProvider, renderedPrompt, "context-relevance", buildPromptVars(), providerCallContext);
	if (resp.error || !resp.output) {
		return fail(resp.error || "No output", resp.tokenUsage);
	}
	invariant(typeof resp.output === "string", "context-relevance produced malformed response");

	let contextUnits;
	if (Array.isArray(context)) {
		contextUnits = context.filter((chunk) => chunk.trim().length > 0);
	} else {
		contextUnits = splitIntoSentences(context);
	}
	const totalContextUnits = contextUnits.length;

	const extractedSentences = splitIntoSentences(resp.output);
	const insufficientInformation = resp.output.includes(CONTEXT_RELEVANCE_BAD);
	const relevantSentences = insufficientInformation ? [] : [...new Set(extractedSentences)];
	const numerator = insufficientInformation ? 0 : Math.min(relevantSentences.length, totalContextUnits);

	const score = totalContextUnits > 0 ? numerator / totalContextUnits : 0;
	const pass = score >= threshold - Number.EPSILON;
	const comparator = pass ? ">=" : "<";
	return {
		pass,
		score,
		reason: `Context relevance ${score.toFixed(2)} is ${comparator} ${threshold}`,
		tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
		metadata: {
			extractedSentences: relevantSentences,
			totalContextUnits,
			// The same count is reported under both keys.
			totalContextSentences: totalContextUnits,
			contextUnits,
			relevantSentenceCount: numerator,
			insufficientInformation,
			score
		}
	};
}
|
|
595
|
+
/**
 * Scores how faithful an answer is to the supplied context, in two provider calls.
 *
 * 1. The "longform" prompt turns the answer into statements (the reply is split into sentences).
 * 2. The "NLI" prompt asks for a verdict on each statement against the context.
 *
 * The score is `1 - unsupported / statements`, clamped to [0, 1]. Verdicts are
 * read from the text after the "final verdict for each statement in order:"
 * marker when present, otherwise by counting "verdict: no" / "verdict: yes".
 *
 * @param query Original question.
 * @param output Answer being graded.
 * @param context Context string or array of chunks.
 * @param threshold Minimum faithfulness (with a `Number.EPSILON` tolerance) to pass.
 * @param grading Optional grading config; `provider` is read, and `rubricPrompt` (when set) must be an
 *   array whose first two entries override the longform and NLI prompts.
 * @param vars Optional extra variables merged into both prompts' variables.
 * @param providerCallContext Passed through to `callProviderWithContext`.
 * @returns A grading result carrying token usage accumulated over both calls.
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
	const defaultProviders = await getDefaultProviders();
	const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "faithfulness check");
	const tokensUsed = normalizeMatcherTokenUsage(void 0);

	if (grading?.rubricPrompt) {
		invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");
	}
	// An override entry is either a plain string or an object exposing `content`.
	const rawPromptAt = (position) => {
		const entry = grading?.rubricPrompt?.[position];
		return typeof entry === "string" ? entry : entry?.content;
	};
	const rawLongformPrompt = rawPromptAt(0);
	const rawNliPrompt = rawPromptAt(1);
	const longformPrompt = await loadRubricPrompt(rawLongformPrompt, CONTEXT_FAITHFULNESS_LONGFORM);
	const nliPrompt = await loadRubricPrompt(rawNliPrompt, CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

	// Pass 1: break the answer down into statements.
	const buildLongformVars = () => ({
		question: query,
		answer: tryParse(output),
		...vars || {}
	});
	const longformText = await renderLlmRubricPrompt(longformPrompt, buildLongformVars());
	const longformResp = await callProviderWithContext(textProvider, longformText, "context-faithfulness-longform", buildLongformVars(), providerCallContext);
	accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
	if (longformResp.error || !longformResp.output) {
		return fail(longformResp.error || "No output", tokensUsed);
	}
	invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

	// Pass 2: ask for a verdict on each statement against the context.
	const contextString = serializeContext(context);
	const statements = splitIntoSentences(longformResp.output);
	const buildNliVars = () => ({
		context: contextString,
		statements,
		...vars || {}
	});
	const nliText = await renderLlmRubricPrompt(nliPrompt, buildNliVars());
	const nliResp = await callProviderWithContext(textProvider, nliText, "context-faithfulness-nli", buildNliVars(), providerCallContext);
	accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
	if (nliResp.error || !nliResp.output) {
		return fail(nliResp.error || "No output", tokensUsed);
	}
	invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

	const verdictMarker = "Final verdict for each statement in order:".toLowerCase();
	const verdictText = nliResp.output.toLowerCase().trim();
	let score = 0;
	if (statements.length > 0) {
		const markerAt = verdictText.indexOf(verdictMarker);
		if (markerAt !== -1) {
			// Summary form: period-separated verdicts after the marker; anything lacking "yes" is unsupported.
			const parsedVerdicts = verdictText.slice(markerAt + verdictMarker.length).split(".").filter((answer) => answer.trim() !== "");
			if (parsedVerdicts.length > 0) {
				const unsupported = parsedVerdicts.filter((answer) => !answer.includes("yes")).length;
				score = 1 - unsupported / statements.length;
			}
		} else {
			// Fallback: count explicit per-statement verdict phrases.
			const occurrencesOf = (needle) => verdictText.split(needle).length - 1;
			const noVerdictCount = occurrencesOf("verdict: no");
			const yesVerdictCount = occurrencesOf("verdict: yes");
			if (noVerdictCount + yesVerdictCount > 0) {
				score = 1 - noVerdictCount / statements.length;
			}
		}
	}
	score = Math.min(1, Math.max(0, score));

	const pass = score >= threshold - Number.EPSILON;
	const comparator = pass ? ">=" : "<";
	return {
		pass,
		score,
		reason: `Faithfulness ${score.toFixed(2)} is ${comparator} ${threshold}`,
		tokensUsed
	};
}
|
|
652
|
+
//#endregion
|
|
653
|
+
//#region src/matchers/similarity.ts
|
|
654
|
+
/**
 * Applies the requested metric to two embeddings.
 *
 * @param expectedEmbedding Embedding of the expected text.
 * @param outputEmbedding Embedding of the actual output.
 * @param metric One of "cosine", "dot_product" or "euclidean".
 * @param tokensUsed Token usage attached to the failure result for an unsupported metric.
 * @returns The metric helper's result, or the value of `fail(...)` for an unknown metric.
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
	if (metric === "cosine") {
		return cosineSimilarity(expectedEmbedding, outputEmbedding);
	}
	if (metric === "dot_product") {
		return dotProduct(expectedEmbedding, outputEmbedding);
	}
	if (metric === "euclidean") {
		return euclideanDistance(expectedEmbedding, outputEmbedding);
	}
	return fail(`Unsupported metric: ${metric}`, tokensUsed);
}
|
|
662
|
+
/**
 * Convert a raw similarity (or distance) value into a grading result.
 *
 * - "euclidean": the value is a distance, so smaller is better. The test
 *   passes when the distance is <= threshold (>= when `inverse`), and the
 *   score is normalised to 1 / (1 + distance).
 * - any other metric: the value is a similarity, so larger is better. The
 *   test passes when it is >= threshold (<= when `inverse`), and the score
 *   is the value itself.
 *
 * With `inverse` the score is flipped to `1 - score`. Threshold comparisons
 * allow a Number.EPSILON tolerance. The reason text always describes where
 * the value actually sits relative to the threshold.
 *
 * @param similarity Raw similarity or distance (a number).
 * @param threshold Threshold to compare against.
 * @param inverse Whether the assertion is negated.
 * @param metric Metric that produced `similarity`.
 * @param tokensUsed Token usage passed through unchanged.
 * @returns `{ pass, score, reason, tokensUsed }`
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
	if (metric !== "euclidean") {
		let pass;
		let score;
		if (inverse) {
			pass = similarity <= threshold + Number.EPSILON;
			score = 1 - similarity;
		} else {
			pass = similarity >= threshold - Number.EPSILON;
			score = similarity;
		}
		const atOrAbove = `Similarity ${similarity.toFixed(2)} is greater than or equal to threshold ${threshold}`;
		const under = `Similarity ${similarity.toFixed(2)} is less than threshold ${threshold}`;
		let reason;
		if (pass) reason = inverse ? under : atOrAbove;
		else reason = inverse ? atOrAbove : under;
		return {
			pass,
			score,
			reason,
			tokensUsed
		};
	}
	// Euclidean: `similarity` is really a distance, so the comparison flips.
	const distance = similarity;
	const closeness = 1 / (1 + distance);
	let pass;
	let score;
	if (inverse) {
		pass = distance >= threshold - Number.EPSILON;
		score = 1 - closeness;
	} else {
		pass = distance <= threshold + Number.EPSILON;
		score = closeness;
	}
	const withinThreshold = `Distance ${distance.toFixed(2)} is less than or equal to threshold ${threshold}`;
	const beyondThreshold = `Distance ${distance.toFixed(2)} is greater than threshold ${threshold}`;
	let reason;
	if (pass) reason = inverse ? beyondThreshold : withinThreshold;
	else reason = inverse ? withinThreshold : beyondThreshold;
	return {
		pass,
		score,
		reason,
		tokensUsed
	};
}
|
|
688
|
+
/**
 * Obtain a similarity value for `expected` vs `output` from a provider.
 *
 * Strategy:
 * 1. For the "cosine" metric, a provider exposing `callSimilarityApi` is asked
 *    directly, and its `similarity` is returned after validation.
 * 2. Otherwise both texts are embedded in parallel through `callEmbeddingApi`
 *    and the vectors are handed to `calculateSimilarityScore`.
 *
 * Token usage reported by the provider is accumulated into `tokensUsed`.
 * Recoverable problems are returned as the result of `fail(...)` rather than
 * thrown, so callers must check whether the return value is a number.
 *
 * @throws {Error} When the provider implements neither API.
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
	if (metric === "cosine" && "callSimilarityApi" in finalProvider) {
		const response = await finalProvider.callSimilarityApi(expected, output);
		accumulateTokenUsage(tokensUsed, response.tokenUsage);
		if (response.error) return fail(response.error, tokensUsed);
		if (response.similarity == null) return fail("Unknown error fetching similarity", tokensUsed);
		if (!Number.isFinite(response.similarity)) return fail(`Invalid similarity score: ${response.similarity}`, tokensUsed);
		return response.similarity;
	}
	const callEmbeddingApi = "callEmbeddingApi" in finalProvider ? finalProvider.callEmbeddingApi : void 0;
	if (typeof callEmbeddingApi !== "function") {
		// A similarity-only provider cannot serve non-cosine metrics.
		if ("callSimilarityApi" in finalProvider) return fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
		throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
	}
	// Invoke with the provider as `this`, since the method was detached above.
	const embed = (text) => callEmbeddingApi.call(finalProvider, text);
	const [expectedEmbedding, outputEmbedding] = await Promise.all([embed(expected), embed(output)]);
	const embeddingUsage = normalizeMatcherTokenUsage(void 0);
	for (const result of [expectedEmbedding, outputEmbedding]) accumulateTokenUsage(embeddingUsage, result.tokenUsage);
	accumulateTokenUsage(tokensUsed, embeddingUsage);
	const embeddingError = expectedEmbedding.error || outputEmbedding.error;
	if (embeddingError) return fail(embeddingError || "Unknown error fetching embeddings", tokensUsed);
	const bothPresent = expectedEmbedding.embedding && outputEmbedding.embedding;
	if (!bothPresent) return fail("Embedding not found", tokensUsed);
	return calculateSimilarityScore(expectedEmbedding.embedding, outputEmbedding.embedding, metric, tokensUsed);
}
|
|
711
|
+
/**
 * Grade `output` against `expected` by embedding similarity.
 *
 * When the metric is "cosine", `state.config?.redteam` is truthy and
 * `shouldGenerateRemote({ requireEmbeddingProvider: true })` is truthy, the
 * work is delegated to `doRemoteGrading`; a failure there is reported as a
 * `fail(...)` result rather than thrown. Otherwise an embedding provider is
 * resolved locally and the value is computed via `calculateProviderSimilarity`.
 *
 * @param expected Reference text.
 * @param output Text being graded.
 * @param threshold Pass threshold.
 * @param inverse Negate the assertion (default false).
 * @param grading Optional grading config; `grading.provider` overrides the default.
 * @param metric Similarity metric (default "cosine").
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
	const gradeRemotely = metric === "cosine" && state.config?.redteam && shouldGenerateRemote({ requireEmbeddingProvider: true });
	if (gradeRemotely) {
		try {
			const remoteResult = await doRemoteGrading({
				task: "similar",
				expected,
				output,
				threshold,
				inverse
			});
			return remoteResult;
		} catch (error) {
			return fail(`Could not perform remote grading: ${error}`);
		}
	}
	const providerDefaults = await getDefaultProviders();
	const finalProvider = await getAndCheckProvider("embedding", grading?.provider, providerDefaults.embeddingProvider, "similarity check");
	const tokensUsed = normalizeMatcherTokenUsage(void 0);
	const similarity = await calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed);
	// A non-number is an already-built failure result; pass it through untouched.
	if (typeof similarity === "number") return buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed);
	return similarity;
}
|
|
730
|
+
//#endregion
|
|
221
731
|
//#region src/tracing/evaluatorTracing.ts
|
|
222
732
|
let otlpReceiverStarted = false;
|
|
223
733
|
const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
|
|
@@ -261,7 +771,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
261
771
|
telemetry.record("feature_used", { feature: "tracing" });
|
|
262
772
|
try {
|
|
263
773
|
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
264
|
-
const { startOTLPReceiver } = await import("./otlpReceiver
|
|
774
|
+
const { startOTLPReceiver } = await import("./otlpReceiver--gTpSagc.js");
|
|
265
775
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
266
776
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
267
777
|
const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
|
|
@@ -285,7 +795,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
285
795
|
async function stopOtlpReceiverIfNeeded() {
|
|
286
796
|
if (otlpReceiverStarted) try {
|
|
287
797
|
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
288
|
-
const { stopOTLPReceiver } = await import("./otlpReceiver
|
|
798
|
+
const { stopOTLPReceiver } = await import("./otlpReceiver--gTpSagc.js");
|
|
289
799
|
await stopOTLPReceiver();
|
|
290
800
|
otlpReceiverStarted = false;
|
|
291
801
|
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -320,7 +830,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
320
830
|
}
|
|
321
831
|
if (!tracingEnabled) return null;
|
|
322
832
|
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
323
|
-
const { getTraceStore } = await import("./store-
|
|
833
|
+
const { getTraceStore } = await import("./store-IbiRIF3k.js");
|
|
324
834
|
const traceStore = getTraceStore();
|
|
325
835
|
const traceId = generateTraceId();
|
|
326
836
|
const spanId = generateSpanId();
|
|
@@ -630,38 +1140,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
630
1140
|
}
|
|
631
1141
|
//#endregion
|
|
632
1142
|
//#region src/assertions/contains.ts
|
|
1143
|
+
/**
 * Move past any run of whitespace and commas that separates two fields.
 *
 * Consecutive commas are all consumed, so repeated delimiters never produce
 * empty fields.
 *
 * @param value String being parsed.
 * @param startIndex Position to start scanning from.
 * @returns Index of the first character that is neither whitespace nor a
 *   comma, or an index at/after the end of `value`.
 */
function skipWhitespaceAndCommas(value, startIndex) {
	let cursor = startIndex;
	for (; cursor < value.length; cursor++) {
		cursor = skipWhitespace(value, cursor);
		// Anything other than a comma ends the separator run.
		if (value[cursor] !== ",") return cursor;
	}
	return cursor;
}
|
|
1158
|
+
/**
 * Move past consecutive whitespace characters only; commas are left in place
 * for the caller to handle.
 *
 * @param value String being parsed.
 * @param startIndex Position to start scanning from.
 * @returns Index of the first non-whitespace character at or after
 *   `startIndex`, or `startIndex` itself when it is already past the end.
 */
function skipWhitespace(value, startIndex) {
	const whitespace = /\s/;
	let cursor = startIndex;
	for (;;) {
		if (!(cursor < value.length)) return cursor;
		if (!whitespace.test(value[cursor])) return cursor;
		cursor++;
	}
}
|
|
1166
|
+
/**
 * Read one double-quoted field, starting at its opening quote.
 *
 * Escapes understood inside the quotes:
 * - `\"` and `\\` produce the escaped character;
 * - `""` produces a single `"`.
 * A lone backslash followed by anything else is kept literally.
 *
 * @param value String being parsed (assumed to hold `"` at `startIndex`).
 * @param startIndex Index of the opening quote.
 * @returns `{ field, nextIndex }` where `nextIndex` is just past the closing quote.
 * @throws Via `invariant` when the closing quote is missing.
 */
function parseQuotedField(value, startIndex) {
	const pieces = [];
	let cursor = startIndex + 1;
	let closed = false;
	while (!closed && cursor < value.length) {
		const current = value[cursor];
		const following = cursor + 1 < value.length ? value[cursor + 1] : void 0;
		if (current === "\\" && (following === "\"" || following === "\\")) {
			pieces.push(following);
			cursor += 2;
			continue;
		}
		if (current === "\"" && following === "\"") {
			// Doubled quote is an escaped quote, not the terminator.
			pieces.push("\"");
			cursor += 2;
			continue;
		}
		if (current === "\"") {
			cursor += 1;
			closed = true;
			continue;
		}
		pieces.push(current);
		cursor += 1;
	}
	invariant(closed, "Unterminated quoted field in contains assertion value");
	return {
		field: pieces.join(""),
		nextIndex: cursor
	};
}
|
|
1196
|
+
/**
 * Read one unquoted field: everything from `startIndex` up to (not including)
 * the next comma or the end of the string, with surrounding whitespace removed.
 *
 * @param value String being parsed.
 * @param startIndex Index of the field's first character.
 * @returns `{ field, nextIndex }` where `nextIndex` points at the comma or the end.
 */
function parseUnquotedField(value, startIndex) {
	let end = startIndex;
	for (; end < value.length; end++) {
		if (value[end] === ",") break;
	}
	const raw = value.substring(startIndex, end);
	return {
		field: raw.trim(),
		nextIndex: end
	};
}
|
|
1207
|
+
/**
|
|
1208
|
+
* Split a contains-any string into fields while preserving quoted commas.
|
|
1209
|
+
*/
|
|
633
1210
|
function parseCommaSeparatedValues(value) {
|
|
634
1211
|
const results = [];
|
|
635
1212
|
let i = 0;
|
|
636
1213
|
while (i < value.length) {
|
|
637
|
-
|
|
1214
|
+
i = skipWhitespaceAndCommas(value, i);
|
|
638
1215
|
if (i >= value.length) break;
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
i++;
|
|
645
|
-
let field = "";
|
|
646
|
-
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
647
|
-
field += value[i + 1];
|
|
648
|
-
i += 2;
|
|
649
|
-
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
650
|
-
field += "\"";
|
|
651
|
-
i += 2;
|
|
652
|
-
} else if (value[i] === "\"") {
|
|
653
|
-
i++;
|
|
654
|
-
break;
|
|
655
|
-
} else {
|
|
656
|
-
field += value[i];
|
|
657
|
-
i++;
|
|
658
|
-
}
|
|
659
|
-
results.push(field);
|
|
660
|
-
} else {
|
|
661
|
-
const start = i;
|
|
662
|
-
while (i < value.length && value[i] !== ",") i++;
|
|
663
|
-
results.push(value.substring(start, i).trim());
|
|
664
|
-
}
|
|
1216
|
+
const isQuotedField = value[i] === "\"";
|
|
1217
|
+
const parsed = isQuotedField ? parseQuotedField(value, i) : parseUnquotedField(value, i);
|
|
1218
|
+
results.push(parsed.field);
|
|
1219
|
+
i = isQuotedField ? skipWhitespace(value, parsed.nextIndex) : parsed.nextIndex;
|
|
1220
|
+
invariant(!isQuotedField || i >= value.length || value[i] === ",", "Expected comma after quoted field in contains assertion value");
|
|
665
1221
|
}
|
|
666
1222
|
return results;
|
|
667
1223
|
}
|
|
@@ -906,27 +1462,67 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
906
1462
|
};
|
|
907
1463
|
//#endregion
|
|
908
1464
|
//#region src/assertions/geval.ts
|
|
909
|
-
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
910
|
-
invariant(typeof renderedValue === "string" || Array.isArray(renderedValue), "G-Eval assertion type must have a string or array of strings value");
|
|
1465
|
+
const handleGEval = async ({ assertion, inverse, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
1466
|
+
invariant(typeof renderedValue === "string" || Array.isArray(renderedValue) && renderedValue.every((value) => typeof value === "string"), "G-Eval assertion type must have a string or array of strings value");
|
|
911
1467
|
const threshold = assertion.threshold ?? .7;
|
|
912
1468
|
if (Array.isArray(renderedValue)) {
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
1469
|
+
if (renderedValue.length === 0) return {
|
|
1470
|
+
assertion,
|
|
1471
|
+
pass: false,
|
|
1472
|
+
score: 0,
|
|
1473
|
+
reason: "G-Eval assertion requires at least one criterion string in the value array."
|
|
1474
|
+
};
|
|
1475
|
+
const responses = [];
|
|
1476
|
+
let failure;
|
|
1477
|
+
for (const [index, value] of renderedValue.entries()) {
|
|
916
1478
|
const resp = await matchesGEval(value, prompt || "", outputString, threshold, test.options, providerCallContext);
|
|
917
|
-
|
|
918
|
-
|
|
1479
|
+
responses.push(resp);
|
|
1480
|
+
if (isGraderFailure(resp)) {
|
|
1481
|
+
failure = {
|
|
1482
|
+
index,
|
|
1483
|
+
resp
|
|
1484
|
+
};
|
|
1485
|
+
break;
|
|
1486
|
+
}
|
|
1487
|
+
}
|
|
1488
|
+
const tokensUsed = createEmptyTokenUsage();
|
|
1489
|
+
for (const r of responses) accumulateTokenUsage(tokensUsed, r.tokensUsed);
|
|
1490
|
+
if (failure) {
|
|
1491
|
+
const criterion = renderedValue[failure.index];
|
|
1492
|
+
return {
|
|
1493
|
+
assertion,
|
|
1494
|
+
pass: false,
|
|
1495
|
+
score: 0,
|
|
1496
|
+
reason: `G-Eval criterion ${failure.index + 1}/${renderedValue.length} (${JSON.stringify(criterion)}) failed: ${failure.resp.reason}`,
|
|
1497
|
+
tokensUsed,
|
|
1498
|
+
metadata: failure.resp.metadata
|
|
1499
|
+
};
|
|
919
1500
|
}
|
|
920
|
-
const
|
|
1501
|
+
const averageScore = responses.reduce((acc, r) => acc + r.score, 0) / responses.length;
|
|
1502
|
+
const combinedReason = responses.map((r) => r.reason).join("\n\n");
|
|
921
1503
|
return {
|
|
922
1504
|
assertion,
|
|
923
|
-
pass:
|
|
924
|
-
score:
|
|
925
|
-
reason:
|
|
1505
|
+
pass: averageScore >= threshold !== inverse,
|
|
1506
|
+
score: inverse ? 1 - averageScore : averageScore,
|
|
1507
|
+
reason: combinedReason,
|
|
1508
|
+
tokensUsed
|
|
926
1509
|
};
|
|
927
|
-
}
|
|
1510
|
+
}
|
|
1511
|
+
const resp = await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext);
|
|
1512
|
+
if (isGraderFailure(resp)) return {
|
|
928
1513
|
assertion,
|
|
929
|
-
|
|
1514
|
+
pass: false,
|
|
1515
|
+
score: 0,
|
|
1516
|
+
reason: resp.reason,
|
|
1517
|
+
tokensUsed: resp.tokensUsed,
|
|
1518
|
+
metadata: resp.metadata
|
|
1519
|
+
};
|
|
1520
|
+
const passed = resp.score >= threshold !== inverse;
|
|
1521
|
+
return {
|
|
1522
|
+
assertion,
|
|
1523
|
+
...resp,
|
|
1524
|
+
pass: passed,
|
|
1525
|
+
score: inverse ? 1 - resp.score : resp.score
|
|
930
1526
|
};
|
|
931
1527
|
};
|
|
932
1528
|
//#endregion
|
|
@@ -1066,6 +1662,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1066
1662
|
};
|
|
1067
1663
|
//#endregion
|
|
1068
1664
|
//#region src/assertions/html.ts
|
|
1665
|
+
/**
 * Patterns that detect a literal `<html`, `<head` or `<body` opening tag in
 * the input. The lookahead requires whitespace, `>` or `/` right after the
 * name so longer names such as `<header` do not match. The patterns are
 * case-sensitive; `isUserProvidedElement` tests them against lower-cased input.
 */
const LITERAL_WRAPPER_PATTERNS = {
	html: /<html(?=[\s>/])/,
	head: /<head(?=[\s>/])/,
	body: /<body(?=[\s>/])/
};
|
|
1670
|
+
/**
 * Whether `tagName` is exactly one of the document wrapper tags
 * "html", "head" or "body" (case-sensitive).
 */
function isWrapperTagName(tagName) {
	switch (tagName) {
		case "html":
		case "head":
		case "body":
			return true;
		default:
			return false;
	}
}
|
|
1673
|
+
/** Whether `node` is a text node, i.e. its `nodeName` is "#text". */
function isTextNode(node) {
	const { nodeName } = node;
	return nodeName === "#text";
}
|
|
1676
|
+
/**
 * Whether `node` is an element, judged by the presence of a `tagName`
 * property (own or inherited).
 */
function isElementNode(node) {
	return Reflect.has(node, "tagName");
}
|
|
1679
|
+
/**
 * Whether `element` carries a `sourceCodeLocation` property whose value is
 * neither `null` nor `undefined`.
 */
function hasSourceCodeLocation(element) {
	if (!("sourceCodeLocation" in element)) return false;
	return element.sourceCodeLocation != null;
}
|
|
1682
|
+
/**
 * Return `node.childNodes` when the node has such a property, otherwise a
 * fresh empty array, so leaf nodes can be iterated uniformly.
 */
function getChildNodes(node) {
	if ("childNodes" in node) return node.childNodes;
	return [];
}
|
|
1685
|
+
/**
 * Depth-first, document-order search for the first element node (root
 * included) that satisfies `predicate`.
 *
 * Uses an explicit stack instead of recursion. Children are pushed in reverse
 * so the first child is visited first.
 *
 * @param root Node to start from.
 * @param predicate Called with each element node; non-element nodes are skipped.
 * @returns The first matching element, or `undefined` when none matches.
 */
function findFirstElement(root, predicate) {
	const pending = [root];
	do {
		const node = pending.pop();
		if (isElementNode(node) && predicate(node)) return node;
		const kids = getChildNodes(node);
		let index = kids.length;
		while (index-- > 0) pending.push(kids[index]);
	} while (pending.length > 0);
}
|
|
1694
|
+
/**
 * Whether any direct child of `parentNode` is a text node containing
 * something other than whitespace. Nested descendants are not inspected.
 */
function hasTopLevelText(parentNode) {
	const isNonBlankText = (node) => {
		if (!isTextNode(node)) return false;
		return node.value.trim() !== "";
	};
	return parentNode.childNodes.some(isNonBlankText);
}
|
|
1697
|
+
/**
 * Decide whether `element` counts as a recognised element that the input
 * itself supplied.
 *
 * - Wrapper tags (html/head/body) count only when the lower-cased input
 *   literally contains the opening tag AND the element has source location
 *   info.
 * - Any other tag counts when it is in `VALID_HTML_ELEMENTS` or contains a
 *   hyphen.
 *
 * @param element Element node with a `tagName`.
 * @param inputLowercase The original input, lower-cased.
 */
function isUserProvidedElement(element, inputLowercase) {
	const normalizedTag = element.tagName.toLowerCase();
	if (!isWrapperTagName(normalizedTag)) {
		return VALID_HTML_ELEMENTS.has(normalizedTag) || normalizedTag.includes("-");
	}
	const literalPattern = LITERAL_WRAPPER_PATTERNS[normalizedTag];
	if (!literalPattern.test(inputLowercase)) return false;
	return hasSourceCodeLocation(element);
}
|
|
1069
1702
|
const HTML_PATTERNS = {
|
|
1070
1703
|
openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
|
|
1071
1704
|
closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
|
|
@@ -1221,37 +1854,21 @@ function validateHtml(htmlString) {
|
|
|
1221
1854
|
isValid: false,
|
|
1222
1855
|
reason: "Output appears to be XML, not HTML"
|
|
1223
1856
|
};
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
|
|
1240
|
-
return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
|
|
1241
|
-
})) return {
|
|
1242
|
-
isValid: false,
|
|
1243
|
-
reason: "Output does not contain recognized HTML elements"
|
|
1244
|
-
};
|
|
1245
|
-
return {
|
|
1246
|
-
isValid: true,
|
|
1247
|
-
reason: "Output is valid HTML"
|
|
1248
|
-
};
|
|
1249
|
-
} catch (error) {
|
|
1250
|
-
return {
|
|
1251
|
-
isValid: false,
|
|
1252
|
-
reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
1253
|
-
};
|
|
1254
|
-
}
|
|
1857
|
+
const document = parse$1(trimmed, { sourceCodeLocationInfo: true });
|
|
1858
|
+
const inputLowercase = trimmed.toLowerCase();
|
|
1859
|
+
const body = findFirstElement(document, (element) => element.tagName === "body");
|
|
1860
|
+
if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
|
|
1861
|
+
isValid: false,
|
|
1862
|
+
reason: "Output must be wrapped in HTML tags"
|
|
1863
|
+
};
|
|
1864
|
+
if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
|
|
1865
|
+
isValid: false,
|
|
1866
|
+
reason: "Output does not contain recognized HTML elements"
|
|
1867
|
+
};
|
|
1868
|
+
return {
|
|
1869
|
+
isValid: true,
|
|
1870
|
+
reason: "Output is valid HTML"
|
|
1871
|
+
};
|
|
1255
1872
|
}
|
|
1256
1873
|
const handleContainsHtml = ({ assertion, outputString, inverse }) => {
|
|
1257
1874
|
const pass = containsHtml(outputString) !== inverse;
|
|
@@ -1866,45 +2483,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
1866
2483
|
}
|
|
1867
2484
|
//#endregion
|
|
1868
2485
|
//#region src/assertions/trajectoryUtils.ts
|
|
1869
|
-
const TOOL_ATTRIBUTE_KEYS = [
|
|
1870
|
-
"tool.name",
|
|
1871
|
-
"tool_name",
|
|
1872
|
-
"tool",
|
|
1873
|
-
"function.name",
|
|
1874
|
-
"function_name",
|
|
1875
|
-
"gen_ai.tool.name",
|
|
1876
|
-
"codex.mcp.tool",
|
|
1877
|
-
"agent.tool",
|
|
1878
|
-
"agent.tool_name",
|
|
1879
|
-
"agent.toolName"
|
|
1880
|
-
];
|
|
1881
|
-
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
|
|
1882
|
-
"tool.arguments",
|
|
1883
|
-
"tool.args",
|
|
1884
|
-
"tool.input",
|
|
1885
|
-
"tool_arguments",
|
|
1886
|
-
"tool_args",
|
|
1887
|
-
"tool_input",
|
|
1888
|
-
"function.arguments",
|
|
1889
|
-
"function.args",
|
|
1890
|
-
"function.input",
|
|
1891
|
-
"function_arguments",
|
|
1892
|
-
"function_args",
|
|
1893
|
-
"gen_ai.tool.arguments",
|
|
1894
|
-
"gen_ai.tool.args",
|
|
1895
|
-
"gen_ai.tool.input",
|
|
1896
|
-
"gen_ai.tool.call.arguments",
|
|
1897
|
-
"gen_ai.tool.call.args",
|
|
1898
|
-
"agent.tool.arguments",
|
|
1899
|
-
"agent.tool.args",
|
|
1900
|
-
"agent.tool.input",
|
|
1901
|
-
"codex.mcp.arguments",
|
|
1902
|
-
"codex.mcp.args",
|
|
1903
|
-
"codex.mcp.input",
|
|
1904
|
-
"arguments",
|
|
1905
|
-
"args",
|
|
1906
|
-
"input"
|
|
1907
|
-
];
|
|
1908
2486
|
const COMMAND_ATTRIBUTE_KEYS = [
|
|
1909
2487
|
"codex.command",
|
|
1910
2488
|
"command",
|
|
@@ -1917,16 +2495,15 @@ const SEARCH_ATTRIBUTE_KEYS = [
|
|
|
1917
2495
|
"search_query"
|
|
1918
2496
|
];
|
|
1919
2497
|
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
|
|
2498
|
+
/**
 * Lower-case tool names that `isCommandToolName` treats as command tools.
 */
const COMMAND_TOOL_NAMES = new Set([
	"exec_command",
	"local_shell",
	"shell"
]);
|
|
1920
2503
|
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
|
|
1921
2504
|
const MAX_JUDGE_SUMMARY_STEPS = 24;
|
|
1922
2505
|
const JUDGE_SUMMARY_HEAD_STEPS = 12;
|
|
1923
2506
|
const JUDGE_SUMMARY_TAIL_STEPS = 12;
|
|
1924
|
-
function getStringAttribute(attributes, keys) {
|
|
1925
|
-
for (const key of keys) {
|
|
1926
|
-
const value = attributes[key];
|
|
1927
|
-
if (typeof value === "string" && value.trim()) return value.trim();
|
|
1928
|
-
}
|
|
1929
|
-
}
|
|
1930
2507
|
function normalizeStructuredAttribute(value) {
|
|
1931
2508
|
if (value === void 0 || value === null) return;
|
|
1932
2509
|
if (typeof value === "string") {
|
|
@@ -1958,9 +2535,12 @@ function getTrajectoryStepStatus(step) {
|
|
|
1958
2535
|
function getCommandExecutable(command) {
|
|
1959
2536
|
return command.trim().split(/\s+/)[0] || void 0;
|
|
1960
2537
|
}
|
|
2538
|
+
/**
 * Whether `toolName` is one of the names in `COMMAND_TOOL_NAMES`. The name is
 * trimmed and lower-cased before the lookup; falsy input is never a match.
 */
function isCommandToolName(toolName) {
	if (!toolName) return false;
	const normalized = toolName.trim().toLowerCase();
	return COMMAND_TOOL_NAMES.has(normalized);
}
|
|
1961
2541
|
function extractToolName(span) {
|
|
1962
2542
|
const attributes = span.attributes || {};
|
|
1963
|
-
const directMatch =
|
|
2543
|
+
const directMatch = getToolNameFromAttributes(attributes);
|
|
1964
2544
|
if (directMatch) return directMatch;
|
|
1965
2545
|
for (const [key, value] of Object.entries(attributes)) {
|
|
1966
2546
|
if (typeof value !== "string" || !value.trim()) continue;
|
|
@@ -1985,21 +2565,31 @@ function extractToolArgs(span) {
|
|
|
1985
2565
|
if (value !== void 0) return value;
|
|
1986
2566
|
}
|
|
1987
2567
|
}
|
|
1988
|
-
function extractCommand(span) {
|
|
2568
|
+
function extractCommand(span, toolName = extractToolName(span), getToolArgs = () => extractToolArgs(span)) {
|
|
1989
2569
|
const attributes = span.attributes || {};
|
|
1990
|
-
const directMatch =
|
|
2570
|
+
const directMatch = getFirstStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
|
|
1991
2571
|
if (directMatch) return directMatch;
|
|
1992
2572
|
for (const [key, value] of Object.entries(attributes)) {
|
|
1993
2573
|
if (typeof value !== "string" || !value.trim()) continue;
|
|
1994
2574
|
if (/command/i.test(key) && !/output|result/i.test(key)) return value.trim();
|
|
1995
2575
|
}
|
|
2576
|
+
const toolArgs = getToolArgs();
|
|
2577
|
+
if (isCommandToolName(toolName) && toolArgs && typeof toolArgs === "object") {
|
|
2578
|
+
const args = toolArgs;
|
|
2579
|
+
const command = args.cmd ?? args.command;
|
|
2580
|
+
if (typeof command === "string" && command.trim()) return command.trim();
|
|
2581
|
+
if (Array.isArray(command)) {
|
|
2582
|
+
const joined = command.map((part) => String(part).trim()).filter(Boolean).join(" ");
|
|
2583
|
+
if (joined) return joined;
|
|
2584
|
+
}
|
|
2585
|
+
}
|
|
1996
2586
|
if (span.name.startsWith("exec ")) return span.name.slice(5).trim();
|
|
1997
2587
|
}
|
|
1998
2588
|
function extractSearchQuery(span) {
|
|
1999
2589
|
const attributes = span.attributes || {};
|
|
2000
|
-
const directMatch =
|
|
2590
|
+
const directMatch = getFirstStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
|
|
2001
2591
|
if (directMatch) return directMatch;
|
|
2002
|
-
const genericQuery =
|
|
2592
|
+
const genericQuery = getFirstStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
|
|
2003
2593
|
if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
|
|
2004
2594
|
if (span.name.startsWith("search ")) return span.name.slice(7).replace(/^"|"$/g, "").trim();
|
|
2005
2595
|
}
|
|
@@ -2023,17 +2613,34 @@ function extractTrajectorySteps(trace) {
|
|
|
2023
2613
|
return left.index - right.index;
|
|
2024
2614
|
}).map(({ span }) => {
|
|
2025
2615
|
const toolName = extractToolName(span);
|
|
2026
|
-
|
|
2616
|
+
let toolArgs;
|
|
2617
|
+
let hasExtractedToolArgs = false;
|
|
2618
|
+
const getToolArgs = () => {
|
|
2619
|
+
if (!hasExtractedToolArgs) {
|
|
2620
|
+
toolArgs = extractToolArgs(span);
|
|
2621
|
+
hasExtractedToolArgs = true;
|
|
2622
|
+
}
|
|
2623
|
+
return toolArgs;
|
|
2624
|
+
};
|
|
2625
|
+
const command = extractCommand(span, toolName, getToolArgs);
|
|
2027
2626
|
const searchQuery = extractSearchQuery(span);
|
|
2028
2627
|
let type = "span";
|
|
2029
2628
|
let name = span.name;
|
|
2030
2629
|
const aliases = new Set([span.name]);
|
|
2031
2630
|
let args;
|
|
2032
|
-
if (toolName) {
|
|
2631
|
+
if (command && isCommandToolName(toolName)) {
|
|
2632
|
+
type = "command";
|
|
2633
|
+
name = command;
|
|
2634
|
+
aliases.add(command);
|
|
2635
|
+
args = getToolArgs();
|
|
2636
|
+
if (toolName) aliases.add(toolName);
|
|
2637
|
+
const executable = getCommandExecutable(command);
|
|
2638
|
+
if (executable) aliases.add(executable);
|
|
2639
|
+
} else if (toolName) {
|
|
2033
2640
|
type = "tool";
|
|
2034
2641
|
name = toolName;
|
|
2035
2642
|
aliases.add(toolName);
|
|
2036
|
-
args =
|
|
2643
|
+
args = getToolArgs();
|
|
2037
2644
|
} else if (command) {
|
|
2038
2645
|
type = "command";
|
|
2039
2646
|
name = command;
|
|
@@ -2314,11 +2921,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
2314
2921
|
const rougeMethod = rouge[baseType[baseType.length - 1]];
|
|
2315
2922
|
const score = rougeMethod(outputString, renderedValue, {});
|
|
2316
2923
|
const threshold = assertion.threshold ?? .75;
|
|
2317
|
-
const pass = score >= threshold != inverse;
|
|
2318
2924
|
return {
|
|
2319
|
-
pass,
|
|
2925
|
+
pass: score >= threshold !== inverse,
|
|
2320
2926
|
score: inverse ? 1 - score : score,
|
|
2321
|
-
reason:
|
|
2927
|
+
reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
|
|
2322
2928
|
assertion
|
|
2323
2929
|
};
|
|
2324
2930
|
}
|
|
@@ -2380,6 +2986,192 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
|
|
|
2380
2986
|
}
|
|
2381
2987
|
};
|
|
2382
2988
|
//#endregion
|
|
2989
|
+
//#region src/providers/webSearchUtils.ts
|
|
2990
|
+
/**
 * Whether `provider.config.tools` is an array containing at least one entry
 * accepted by `predicate`. A missing config or a non-array `tools` is false.
 */
function hasTool(provider, predicate) {
	const tools = provider.config?.tools;
	if (!Array.isArray(tools)) return false;
	return tools.some(predicate);
}
|
|
2993
|
+
/**
 * Safely read a provider's id.
 *
 * @returns The value of `provider.id()`, or `null` when `id` is not a
 *   function or calling it throws (the error is logged at debug level).
 */
function getProviderId(provider) {
	const canReadId = typeof provider.id === "function";
	if (!canReadId) return null;
	let resolvedId;
	try {
		resolvedId = provider.id();
	} catch (err) {
		logger.debug(`Failed to read provider id: ${err}`);
		return null;
	}
	return resolvedId;
}
|
|
3002
|
+
/**
 * Whether the provider is an OpenAI Responses provider, recognised either by
 * an id containing "openai:responses" or by its constructor being named
 * "OpenAiResponsesProvider".
 */
function isOpenAiResponsesProvider(provider, id) {
	if (id.includes("openai:responses")) return true;
	const constructorName = provider.constructor?.name;
	return constructorName === "OpenAiResponsesProvider";
}
|
|
3005
|
+
/**
 * Check whether a provider is configured for web search.
 *
 * The decision is based on the provider id plus its config:
 * - any id containing "perplexity";
 * - google/gemini/vertex ids with a `googleSearch` tool;
 * - xai ids with `search_parameters.mode === "on"`;
 * - OpenAI Responses providers with a "web_search_preview" tool;
 * - ids starting with "openai:codex" with `web_search_mode` of "live" or
 *   "cached", or `web_search_enabled === true`;
 * - anthropic ids with a "web_search_20250305" tool.
 *
 * @param provider The provider to check (may be null/undefined).
 * @returns true if one of the rules above matches, false otherwise
 *   (including when the provider or its id is missing).
 */
function hasWebSearchCapability(provider) {
	if (!provider) return false;
	const id = getProviderId(provider);
	if (!id) return false;
	// Rules are thunks so each one is evaluated only if the previous ones failed.
	const rules = [
		() => id.includes("perplexity"),
		() => (id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0),
		() => id.includes("xai") && provider.config?.search_parameters?.mode === "on",
		() => isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview"),
		() => id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true),
		() => id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")
	];
	return rules.some((rule) => rule());
}
|
|
3022
|
+
/**
 * Load a provider with web search capabilities.
 *
 * Candidates are tried one at a time, in order, and the first one that loads
 * and passes `hasWebSearchCapability` is returned. A candidate that fails to
 * load is logged at debug level and skipped. Model identifiers are hard-coded
 * below.
 *
 * Order: Anthropic and OpenAI first (which of the two leads depends on
 * `preferAnthropic`), then Perplexity, Google, Vertex and xAI.
 *
 * @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
 * @returns A provider with web search capabilities or null
 */
async function loadWebSearchProvider(preferAnthropic = false) {
	// Wrap a loader so that any failure is logged and reported as null.
	const guarded = (load, describeFailure) => async () => {
		try {
			return await load();
		} catch (err) {
			logger.debug(describeFailure(err));
			return null;
		}
	};
	const loadAnthropicWebSearch = guarded(
		() => loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
			type: "web_search_20250305",
			name: "web_search",
			max_uses: 5
		}] } } }),
		(err) => `Failed to load Anthropic web search provider: ${err}`
	);
	const loadOpenAIWebSearch = guarded(
		() => loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } }),
		(err) => `Failed to load OpenAI web search provider: ${err}`
	);
	const loadPerplexity = guarded(
		() => loadApiProvider("perplexity:sonar-pro"),
		(err) => `Failed to load Perplexity provider: ${err}`
	);
	const loadGoogleWebSearch = guarded(
		() => loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } }),
		(err) => `Failed to load Google web search provider: ${err}`
	);
	const loadVertexWebSearch = guarded(
		() => loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } }),
		(err) => `Failed to load Vertex web search provider: ${err}`
	);
	const loadXaiWebSearch = guarded(
		() => loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } }),
		(err) => `Failed to load xAI web search provider: ${err}`
	);
	const leading = preferAnthropic ? [loadAnthropicWebSearch, loadOpenAIWebSearch] : [loadOpenAIWebSearch, loadAnthropicWebSearch];
	const candidates = [
		...leading,
		loadPerplexity,
		loadGoogleWebSearch,
		loadVertexWebSearch,
		loadXaiWebSearch
	];
	for (const loadCandidate of candidates) {
		const candidate = await loadCandidate();
		if (!candidate) continue;
		if (!hasWebSearchCapability(candidate)) {
			logger.debug(`Loaded provider ${getProviderId(candidate) ?? "unknown"} does not support web search`);
			continue;
		}
		logger.info(`Using ${getProviderId(candidate) ?? "loaded provider"} as web search provider`);
		return candidate;
	}
	return null;
}
|
|
3108
|
+
//#endregion
|
|
3109
|
+
//#region src/matchers/search.ts
|
|
3110
|
+
/**
 * Grades `llmOutput` against `rubric` using a provider that has web search
 * capability.
 *
 * Provider resolution order:
 *   1. `grading.provider` (when set), otherwise the first defined default among
 *      webSearchProvider, llmRubricProvider and gradingProvider;
 *   2. if that provider lacks web search, the first default that has it;
 *   3. if still lacking, whatever `loadWebSearchProvider(true)` returns.
 *
 * @param rubric - Rubric text exposed to the prompt as `rubric`.
 * @param llmOutput - Output under test; passed through `tryParse` and exposed as `output`.
 * @param grading - Grading config; required. `provider` and `rubricPrompt` are read.
 * @param vars - Extra template variables merged into the prompt variables (optional).
 * @param assertion - Assertion echoed in the result; `threshold` (if defined) gates `pass`.
 * @param _provider - Unused.
 * @param providerCallContext - Forwarded unchanged to `callProviderWithContext`.
 * @returns A grading result `{ pass, score, reason, tokensUsed, assertion, metadata? }`.
 * @throws Error when `grading` is missing or no web-search-capable provider can be resolved.
 */
async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
  if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");

  const defaultProviders = await getDefaultProviders();
  const defaultSearchProviders = [
    defaultProviders.webSearchProvider,
    defaultProviders.llmRubricProvider,
    defaultProviders.gradingProvider
  ];

  let searchProvider = (grading.provider ? await getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
  if (!hasWebSearchCapability(searchProvider)) {
    const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
    if (webSearchDefault) searchProvider = webSearchDefault;
  }
  if (!hasWebSearchCapability(searchProvider)) {
    const webSearchProvider = await loadWebSearchProvider(true);
    if (webSearchProvider) searchProvider = webSearchProvider;
  }
  if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);

  // Built fresh for each consumer so the prompt renderer and the provider
  // call never share (and cannot cross-mutate) the same object.
  const buildPromptVars = () => ({
    output: tryParse(llmOutput),
    rubric,
    ...vars || {}
  });

  const rubricPrompt = await loadRubricPrompt(grading.rubricPrompt, DEFAULT_WEB_SEARCH_PROMPT);
  const prompt = await renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
  const resp = await callProviderWithContext(searchProvider, prompt, "search-rubric", buildPromptVars(), providerCallContext);

  if (resp.error || !resp.output) return {
    pass: false,
    score: 0,
    reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
    tokensUsed: resp.tokenUsage,
    assertion
  };

  // Providers may return non-string output; coerce once and reuse everywhere
  // so parsing, substring matching and the reported reason all agree.
  const outputText = String(resp.output);

  try {
    const result = extractFirstJsonObject(outputText);
    let pass = result.pass ?? false;
    // Prefer an explicit numeric score; otherwise derive 1/0 from the verdict.
    const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
    if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
    return {
      pass,
      score,
      reason: result.reason || "No reason provided",
      tokensUsed: resp.tokenUsage,
      assertion,
      metadata: {
        searchResults: result.searchResults || [],
        searchProvider: searchProvider.id()
      }
    };
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances.
    const detail = err instanceof Error ? err.message : String(err);
    logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${detail}`);
    const outputLower = outputText.toLowerCase();
    const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
    return {
      pass,
      score: pass ? 1 : 0,
      // Always a string, even when the provider returned a non-string output.
      reason: outputText,
      tokensUsed: resp.tokenUsage,
      assertion
    };
  }
}
|
|
3174
|
+
//#endregion
|
|
2383
3175
|
//#region src/assertions/searchRubric.ts
|
|
2384
3176
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
2385
3177
|
if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
|
|
@@ -3412,7 +4204,7 @@ async function loadTraceData(traceId) {
|
|
|
3412
4204
|
let stableObservations = 0;
|
|
3413
4205
|
let latestTrace = null;
|
|
3414
4206
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
3415
|
-
latestTrace = await traceStore.getTrace(traceId);
|
|
4207
|
+
latestTrace = await traceStore.getTrace(traceId, { sanitizeAttributes: false });
|
|
3416
4208
|
const spanCount = latestTrace?.spans?.length ?? 0;
|
|
3417
4209
|
if (spanCount > 0) {
|
|
3418
4210
|
stableObservations = spanCount === lastSpanCount ? stableObservations + 1 : 1;
|
|
@@ -3465,7 +4257,7 @@ const ASSERTION_HANDLERS = {
|
|
|
3465
4257
|
"llm-rubric": handleLlmRubric,
|
|
3466
4258
|
meteor: async (params) => {
|
|
3467
4259
|
try {
|
|
3468
|
-
const { handleMeteorAssertion } = await import("./meteor
|
|
4260
|
+
const { handleMeteorAssertion } = await import("./meteor--TZYICTI.js");
|
|
3469
4261
|
return handleMeteorAssertion(params);
|
|
3470
4262
|
} catch (error) {
|
|
3471
4263
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -3601,7 +4393,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
3601
4393
|
};
|
|
3602
4394
|
}
|
|
3603
4395
|
else if (filePath.endsWith(".rb")) try {
|
|
3604
|
-
const { runRuby } = await import("./rubyUtils-
|
|
4396
|
+
const { runRuby } = await import("./rubyUtils-BYVlQ94c.js");
|
|
3605
4397
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
3606
4398
|
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
3607
4399
|
} catch (error) {
|
|
@@ -3718,7 +4510,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
3718
4510
|
logger.debug(`Failed to preload trace data for assertions: ${error}`);
|
|
3719
4511
|
preloadedTraceData = null;
|
|
3720
4512
|
}
|
|
3721
|
-
|
|
4513
|
+
const concurrency = getProviderCallExecutionContext()?.providerCallQueue ? 1 : ASSERTIONS_MAX_CONCURRENCY;
|
|
4514
|
+
await async.forEachOfLimit(asserts, concurrency, async ({ assertion, assertResult, index }) => {
|
|
3722
4515
|
if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
|
|
3723
4516
|
const result = await runAssertion({
|
|
3724
4517
|
prompt,
|
|
@@ -3865,7 +4658,8 @@ var CIProgressReporter = class {
|
|
|
3865
4658
|
}
|
|
3866
4659
|
updateTotalTests(newTotal) {
|
|
3867
4660
|
this.totalTests = Math.max(newTotal, 1);
|
|
3868
|
-
|
|
4661
|
+
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
4662
|
+
this.highestPercentageSeen = percentage;
|
|
3869
4663
|
}
|
|
3870
4664
|
finish() {
|
|
3871
4665
|
if (this.intervalId) {
|
|
@@ -4038,6 +4832,10 @@ function getDefaultOtelConfig() {
|
|
|
4038
4832
|
}
|
|
4039
4833
|
//#endregion
|
|
4040
4834
|
//#region src/tracing/localSpanExporter.ts
|
|
4835
|
+
// Wait (in milliseconds) applied before the single retry in addSpansWithTraceRetry.
const MISSING_TRACE_RETRY_DELAY_MS = 50;
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
4041
4839
|
/**
|
|
4042
4840
|
* A span exporter that writes spans to the local TraceStore (SQLite).
|
|
4043
4841
|
* This allows OTEL spans to be stored locally for analysis in the promptfoo UI.
|
|
@@ -4079,7 +4877,7 @@ var LocalSpanExporter = class {
|
|
|
4079
4877
|
}
|
|
4080
4878
|
let firstError;
|
|
4081
4879
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
4082
|
-
const result = await
|
|
4880
|
+
const result = await this.addSpansWithTraceRetry(traceStore, traceId, spanDataList);
|
|
4083
4881
|
if (result.stored) logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
4084
4882
|
else logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
4085
4883
|
} catch (error) {
|
|
@@ -4091,6 +4889,16 @@ var LocalSpanExporter = class {
|
|
|
4091
4889
|
}
|
|
4092
4890
|
return firstError;
|
|
4093
4891
|
}
|
|
4892
|
+
async addSpansWithTraceRetry(traceStore, traceId, spans) {
|
|
4893
|
+
const options = {
|
|
4894
|
+
skipTraceCheck: false,
|
|
4895
|
+
warnIfMissingTrace: false
|
|
4896
|
+
};
|
|
4897
|
+
const result = await traceStore.addSpans(traceId, spans, options);
|
|
4898
|
+
if (result.stored) return result;
|
|
4899
|
+
await delay(MISSING_TRACE_RETRY_DELAY_MS);
|
|
4900
|
+
return traceStore.addSpans(traceId, spans, options);
|
|
4901
|
+
}
|
|
4094
4902
|
/**
|
|
4095
4903
|
* Convert an OTEL ReadableSpan to our SpanData format.
|
|
4096
4904
|
*/
|
|
@@ -4336,6 +5144,15 @@ function backfillNamedScoreWeights(accumulator) {
|
|
|
4336
5144
|
}
|
|
4337
5145
|
//#endregion
|
|
4338
5146
|
//#region src/evaluator.ts
|
|
5147
|
+
const CONVERSATION_VAR_NAME = "_conversation";
// Memoizes analysis results keyed by raw prompt text (at most 1024 entries).
const promptUsesConversationVariableCache = new LRUCache({ max: 1024 });
/**
 * Reports whether the prompt's raw template references the `_conversation`
 * variable, as determined by `analyzeTemplateReference`.
 * The answer is cached only when the analysis reports `parsed`; unparsed
 * templates are re-analyzed on every call.
 */
function promptUsesConversationVariable(prompt) {
  const template = prompt.raw;
  const memoized = promptUsesConversationVariableCache.get(template);
  if (memoized !== void 0) return memoized;
  const analysis = analyzeTemplateReference(template, CONVERSATION_VAR_NAME);
  if (analysis.parsed) promptUsesConversationVariableCache.set(template, analysis.referenced);
  return analysis.referenced;
}
|
|
4339
5156
|
/**
|
|
4340
5157
|
* Manages a single progress bar for the evaluation
|
|
4341
5158
|
*/
|
|
@@ -4535,6 +5352,18 @@ function hasProviderGroupedAssertion(assertion) {
|
|
|
4535
5352
|
function shouldDeferGradingForTest(test) {
|
|
4536
5353
|
return Boolean(test.assert?.some(hasProviderGroupedAssertion));
|
|
4537
5354
|
}
|
|
5355
|
+
/**
 * Emits an info log describing whether model-graded assertions are grouped
 * by provider.
 *
 * - Logs nothing when no test in `runEvalOptions` defers grading.
 * - When grouping is active, logs that it is.
 * - Otherwise, only when `concurrency` is exactly 1 and at least one known
 *   reason applies, logs why grouping is disabled.
 */
function logGroupedGradingStatus({ concurrency, hasEvalStepTimeout, runEvalOptions, shouldGroupGradingByProvider, usesConversationVar }) {
  const anyDeferredGrading = runEvalOptions.some((step) => shouldDeferGradingForTest(step.test));
  if (!anyDeferredGrading) return;
  if (shouldGroupGradingByProvider) {
    logger.info("Grouping model-graded assertions by provider to minimize local-model reload overhead.");
    return;
  }
  if (concurrency !== 1) return;
  const blockers = [
    [hasEvalStepTimeout, "per-eval-step timeout is configured"],
    [usesConversationVar, "conversation variables require per-row ordering"]
  ].filter(([applies]) => applies).map(([, explanation]) => explanation);
  if (blockers.length === 0) return;
  logger.info(`Serial grading grouping disabled because ${blockers.join(" and ")}; model-graded judges may reload between rows.`);
}
|
|
4538
5367
|
function applyGradingResult(row, checkResult) {
|
|
4539
5368
|
if (!checkResult.pass) {
|
|
4540
5369
|
row.error = checkResult.reason;
|
|
@@ -4549,14 +5378,29 @@ function applyGradingResult(row, checkResult) {
|
|
|
4549
5378
|
if (checkResult.tokensUsed) accumulateAssertionTokenUsage(row.tokenUsage.assertions, checkResult.tokensUsed);
|
|
4550
5379
|
row.gradingResult = checkResult;
|
|
4551
5380
|
}
|
|
4552
|
-
|
|
4553
|
-
|
|
4554
|
-
|
|
4555
|
-
|
|
4556
|
-
|
|
4557
|
-
|
|
4558
|
-
|
|
4559
|
-
|
|
5381
|
+
// Prefix placed in front of the row error message when grading was aborted.
const ABORTED_GRADING_PREFIX = "Aborted: ";
/**
 * Returns true only for Error instances whose `name` is "AbortError" or
 * "AbortException"; any other value (including non-Error objects carrying
 * such a name) yields false.
 */
function isAbortShapedError(error) {
  if (!(error instanceof Error)) return false;
  const { name } = error;
  return name === "AbortError" || name === "AbortException";
}
|
|
5385
|
+
function applyGradingError(row, error, abortSignal) {
|
|
5386
|
+
const errorAsError = error instanceof Error ? error : void 0;
|
|
5387
|
+
if (Boolean(abortSignal?.aborted) && isAbortShapedError(error)) {
|
|
5388
|
+
const shortMessage = errorAsError?.message ?? String(error);
|
|
5389
|
+
logger.debug("Assertion grading aborted", {
|
|
5390
|
+
error: shortMessage,
|
|
5391
|
+
promptIdx: row.promptIdx,
|
|
5392
|
+
testIdx: row.testIdx
|
|
5393
|
+
});
|
|
5394
|
+
row.error = `${ABORTED_GRADING_PREFIX}${shortMessage}`;
|
|
5395
|
+
} else {
|
|
5396
|
+
const fullMessage = errorAsError ? errorAsError.stack ?? errorAsError.message : String(error);
|
|
5397
|
+
logger.error("Assertion grading failed during eval", {
|
|
5398
|
+
error: fullMessage,
|
|
5399
|
+
promptIdx: row.promptIdx,
|
|
5400
|
+
testIdx: row.testIdx
|
|
5401
|
+
});
|
|
5402
|
+
row.error = fullMessage;
|
|
5403
|
+
}
|
|
4560
5404
|
row.failureReason = ResultFailureReason.ERROR;
|
|
4561
5405
|
row.success = false;
|
|
4562
5406
|
row.score = 0;
|
|
@@ -4588,7 +5432,7 @@ function createRunEvalState({ provider, prompt, test }) {
|
|
|
4588
5432
|
};
|
|
4589
5433
|
}
|
|
4590
5434
|
function attachConversationVar({ conversations, conversationKey, prompt, test, vars }) {
|
|
4591
|
-
const usesConversation = prompt
|
|
5435
|
+
const usesConversation = promptUsesConversationVariable(prompt);
|
|
4592
5436
|
if (!getEnvBool("PROMPTFOO_DISABLE_CONVERSATION_VAR") && !test.options?.disableConversationVar && usesConversation) vars._conversation = conversations?.[conversationKey] || [];
|
|
4593
5437
|
}
|
|
4594
5438
|
function createRunEvalSetup({ provider, prompt, promptConfig, vars }) {
|
|
@@ -4835,7 +5679,7 @@ async function gradeRunEvalResponse({ abortSignal, deferGrading, evalId, latency
|
|
|
4835
5679
|
assertScoringFunction: test.assertScoringFunction,
|
|
4836
5680
|
traceId
|
|
4837
5681
|
}).then((checkResult) => applyGradingResult(ret, checkResult))).catch((error) => {
|
|
4838
|
-
applyGradingError(ret, error);
|
|
5682
|
+
applyGradingError(ret, error, abortSignal);
|
|
4839
5683
|
});
|
|
4840
5684
|
deferredGradingPromises.set(ret, gradingPromise);
|
|
4841
5685
|
return;
|
|
@@ -5382,7 +6226,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
|
|
|
5382
6226
|
const defaultProvider = defaultTest.provider;
|
|
5383
6227
|
if (isApiProvider(defaultProvider)) return defaultProvider;
|
|
5384
6228
|
if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
5385
|
-
const { loadApiProvider } = await import("./providers-
|
|
6229
|
+
const { loadApiProvider } = await import("./providers-BYAn82cf.js");
|
|
5386
6230
|
return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
5387
6231
|
}
|
|
5388
6232
|
return defaultProvider;
|
|
@@ -5542,7 +6386,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
|
|
|
5542
6386
|
async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
|
|
5543
6387
|
if (!state.resume || !evalRecord.persisted) return;
|
|
5544
6388
|
try {
|
|
5545
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
6389
|
+
const { default: EvalResult } = await import("./evalResult-DqzsS6_W.js");
|
|
5546
6390
|
const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: state.retryMode });
|
|
5547
6391
|
const originalCount = runEvalOptions.length;
|
|
5548
6392
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -5556,14 +6400,14 @@ async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
|
|
|
5556
6400
|
}
|
|
5557
6401
|
}
|
|
5558
6402
|
function adjustConcurrencyForSerialFeatures({ concurrency, prompts, tests }) {
|
|
5559
|
-
const usesConversationVar = prompts.some(
|
|
6403
|
+
const usesConversationVar = prompts.some(promptUsesConversationVariable);
|
|
5560
6404
|
if (concurrency <= 1) return {
|
|
5561
6405
|
concurrency,
|
|
5562
6406
|
usesConversationVar
|
|
5563
6407
|
};
|
|
5564
6408
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
5565
6409
|
if (usesConversationVar) {
|
|
5566
|
-
logger.info(`Setting concurrency to 1 because the ${chalk.cyan(
|
|
6410
|
+
logger.info(`Setting concurrency to 1 because the ${chalk.cyan(CONVERSATION_VAR_NAME)} variable is used.`);
|
|
5567
6411
|
return {
|
|
5568
6412
|
concurrency: 1,
|
|
5569
6413
|
usesConversationVar
|
|
@@ -5793,7 +6637,8 @@ var Evaluator = class {
|
|
|
5793
6637
|
};
|
|
5794
6638
|
this.conversations = {};
|
|
5795
6639
|
this.registers = {};
|
|
5796
|
-
|
|
6640
|
+
const jsonlFiles = Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : [];
|
|
6641
|
+
this.fileWriters = jsonlFiles.map((p) => new JsonlFileWriter(p));
|
|
5797
6642
|
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
5798
6643
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
5799
6644
|
logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
@@ -5913,6 +6758,25 @@ var Evaluator = class {
|
|
|
5913
6758
|
this.trackCompletedRow(evalStep, row, context);
|
|
5914
6759
|
context.numComplete++;
|
|
5915
6760
|
const promptEvalCount = reservePromptEvalCount(context, row.promptIdx);
|
|
6761
|
+
if (context.testSuite.extensions?.length) try {
|
|
6762
|
+
const afterEachOut = await runExtensionHook(context.testSuite.extensions, "afterEach", {
|
|
6763
|
+
test: evalStep.test,
|
|
6764
|
+
result: {
|
|
6765
|
+
...row,
|
|
6766
|
+
namedScores: { ...row.namedScores },
|
|
6767
|
+
metadata: { ...row.metadata },
|
|
6768
|
+
response: row.response ? {
|
|
6769
|
+
...row.response,
|
|
6770
|
+
metadata: { ...row.response.metadata }
|
|
6771
|
+
} : row.response
|
|
6772
|
+
}
|
|
6773
|
+
});
|
|
6774
|
+
row.namedScores = filterFiniteScores(afterEachOut.result.namedScores);
|
|
6775
|
+
row.metadata = afterEachOut.result.metadata;
|
|
6776
|
+
if (row.response && afterEachOut.result.response) row.response.metadata = afterEachOut.result.response.metadata;
|
|
6777
|
+
} catch (error) {
|
|
6778
|
+
logger.error(`afterEach extension hook failed, persisting row without hook modifications`, { error });
|
|
6779
|
+
}
|
|
5916
6780
|
await this.persistEvalRow(row);
|
|
5917
6781
|
if (this.abortIfTargetUnavailable(row, context)) break;
|
|
5918
6782
|
const metrics = context.prompts[row.promptIdx].metrics;
|
|
@@ -5924,10 +6788,6 @@ var Evaluator = class {
|
|
|
5924
6788
|
promptEvalCount,
|
|
5925
6789
|
row
|
|
5926
6790
|
});
|
|
5927
|
-
await runExtensionHook(context.testSuite.extensions, "afterEach", {
|
|
5928
|
-
test: evalStep.test,
|
|
5929
|
-
result: row
|
|
5930
|
-
});
|
|
5931
6791
|
context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics);
|
|
5932
6792
|
}
|
|
5933
6793
|
}
|
|
@@ -6001,9 +6861,8 @@ var Evaluator = class {
|
|
|
6001
6861
|
context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
|
|
6002
6862
|
}
|
|
6003
6863
|
async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
|
|
6004
|
-
let flushGroupedRows;
|
|
6005
6864
|
try {
|
|
6006
|
-
if (shouldGroupGradingByProvider)
|
|
6865
|
+
if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
|
|
6007
6866
|
checkAbort,
|
|
6008
6867
|
evalStepIndexMap,
|
|
6009
6868
|
groupedRunEvalOptions,
|
|
@@ -6035,7 +6894,6 @@ var Evaluator = class {
|
|
|
6035
6894
|
cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
|
|
6036
6895
|
throw err;
|
|
6037
6896
|
}
|
|
6038
|
-
await flushGroupedRows?.();
|
|
6039
6897
|
if (isEvalTimedOut()) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
6040
6898
|
else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
|
|
6041
6899
|
ciProgressReporter,
|
|
@@ -6084,7 +6942,15 @@ var Evaluator = class {
|
|
|
6084
6942
|
})) break;
|
|
6085
6943
|
}
|
|
6086
6944
|
} catch (error) {
|
|
6087
|
-
|
|
6945
|
+
const pendingRowCount = groupedRows.reduce((sum, entry) => sum + entry.rows.length, 0);
|
|
6946
|
+
try {
|
|
6947
|
+
await flushGroupedRows();
|
|
6948
|
+
} catch (flushError) {
|
|
6949
|
+
logger.warn("Failed to flush grouped rows after error; target outputs may be lost", {
|
|
6950
|
+
error: flushError instanceof Error ? flushError.message : String(flushError),
|
|
6951
|
+
pendingRowCount
|
|
6952
|
+
});
|
|
6953
|
+
}
|
|
6088
6954
|
throw error;
|
|
6089
6955
|
}
|
|
6090
6956
|
await flushGroupedRows();
|
|
@@ -6520,6 +7386,13 @@ var Evaluator = class {
|
|
|
6520
7386
|
if (!this.options.silent) {
|
|
6521
7387
|
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
6522
7388
|
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
7389
|
+
logGroupedGradingStatus({
|
|
7390
|
+
concurrency,
|
|
7391
|
+
hasEvalStepTimeout,
|
|
7392
|
+
runEvalOptions,
|
|
7393
|
+
shouldGroupGradingByProvider,
|
|
7394
|
+
usesConversationVar
|
|
7395
|
+
});
|
|
6523
7396
|
}
|
|
6524
7397
|
if (this.options.showProgressBar && progressBarManager) {
|
|
6525
7398
|
await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
@@ -6622,4 +7495,4 @@ function evaluate(testSuite, evalRecord, options) {
|
|
|
6622
7495
|
//#endregion
|
|
6623
7496
|
export { isAllowedPrompt as a, assertions_default as c, generateVarCombinations as i, readAssertions as l, evaluate as n, accumulateNamedMetric as o, formatVarsForDisplay as r, doesPromptRefMatch as s, ProgressBarManager as t, runAssertions as u };
|
|
6624
7497
|
|
|
6625
|
-
//# sourceMappingURL=evaluator-
|
|
7498
|
+
//# sourceMappingURL=evaluator-DRoiYB2q.js.map
|