promptfoo 0.121.4 → 0.121.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/{ListApp-DQkFNqE9.js → ListApp-DLmM02JS.js} +1 -1
- package/dist/src/{accounts-DhMYUUbu.js → accounts-Ca7WIoPY.js} +12 -7
- package/dist/src/{accounts-F9d_5sMC.js → accounts-CjFnOPmb.js} +14 -9
- package/dist/src/{accounts-Dy17bs4D.cjs → accounts-CmWzeD2d.cjs} +16 -10
- package/dist/src/{accounts-DdJ2pHMI.js → accounts-DanM1wq_.js} +13 -8
- package/dist/src/{agentic-utils-qFlm6zes.js → agentic-utils-CJ0j3fBi.js} +3 -3
- package/dist/src/{agentic-utils-w68v6_Dz.js → agentic-utils-DDEGRV9v.js} +3 -3
- package/dist/src/{agentic-utils-BpX5b23w.cjs → agentic-utils-DvPWSUpb.cjs} +8 -7
- package/dist/src/{agentic-utils-P172hM8B.js → agentic-utils-TxUEMPYS.js} +2 -2
- package/dist/src/{agents-BahDpe5G.cjs → agents-B4sRuXg3.cjs} +7 -6
- package/dist/src/{agents-pQeBEXMm.js → agents-B8q7h_ek.js} +5 -5
- package/dist/src/{agents-CgaMXvLM.js → agents-CBgJvRkB.js} +21 -10
- package/dist/src/{agents-C-R_jfzI.js → agents-CYn2n3QP.js} +4 -4
- package/dist/src/{agents-8FDnTriG.js → agents-D-vDNFx4.js} +21 -10
- package/dist/src/{agents-aYPQLf8W.js → agents-LrHuQqr1.js} +20 -9
- package/dist/src/{agents-DJ35I3Nt.js → agents-QGg76OF-.js} +5 -5
- package/dist/src/{agents-D7-HGxUj.cjs → agents-eHZ9nlgA.cjs} +21 -10
- package/dist/src/{aimlapi-sgYnkE54.js → aimlapi-CJEbQ0o6.js} +7 -7
- package/dist/src/{aimlapi-BD6J9oKt.js → aimlapi-D5HXzZ0s.js} +6 -6
- package/dist/src/{aimlapi-qcK4OT55.cjs → aimlapi-T6HGNxNe.cjs} +7 -7
- package/dist/src/{aimlapi-BCq3MHeL.js → aimlapi-eYv3a_DK.js} +7 -7
- package/dist/src/app/app/tsconfig.app.tsbuildinfo +1 -1
- package/dist/src/app/assets/Report-BNHJKN35.js +1 -0
- package/dist/src/app/assets/index-BnT6P6sF.js +388 -0
- package/dist/src/app/assets/index-yhM8y1PP.css +1 -0
- package/dist/src/app/assets/{scroll-timeline-D9IT_e8Z.js → scroll-timeline-RpeTwOvs.js} +1 -1
- package/dist/src/app/assets/sync-5gq6fmG4.js +4 -0
- package/dist/src/app/assets/vendor-charts-BL9OMNU7.js +36 -0
- package/dist/src/app/assets/{vendor-markdown-Ch00wnNI.js → vendor-markdown-BYsQqn7Z.js} +10 -10
- package/dist/src/app/assets/{vendor-react-CVvmk1UB.js → vendor-react-CqWgVW6T.js} +2 -2
- package/dist/src/app/assets/{vendor-utils-BnEYbx2Q.js → vendor-utils-BHPO71pu.js} +1 -1
- package/dist/src/app/index.html +31 -6
- package/dist/src/{audio-COrn8rM6.js → audio-BqnRvcWG.js} +3 -3
- package/dist/src/{audio-DcVKoInv.js → audio-CPMtV1yR.js} +4 -4
- package/dist/src/{audio-B7izf48x.js → audio-DyiebVB3.js} +4 -4
- package/dist/src/{audio-BQtNuYBj.cjs → audio-FnxbEnSE.cjs} +4 -4
- package/dist/src/authoritativeMarkupInjection-BZIywVjG.js +74 -0
- package/dist/src/authoritativeMarkupInjection-DyAXAsSr.js +75 -0
- package/dist/src/authoritativeMarkupInjection-F2gBw0lN.cjs +74 -0
- package/dist/src/authoritativeMarkupInjection-QEQmFS83.js +74 -0
- package/dist/src/{base-PYJvBE1i.js → base-CKLo890h.js} +4 -3
- package/dist/src/{base-fZ9wgg50.js → base-Co80MMCi.js} +5 -4
- package/dist/src/{base-D-670DX8.cjs → base-DGJW48uz.cjs} +5 -4
- package/dist/src/{base-yrI1Yal4.js → base-E9I8zXjz.js} +5 -4
- package/dist/src/bestOfN-B3wNzjSB.js +137 -0
- package/dist/src/bestOfN-BBsO41z4.js +136 -0
- package/dist/src/bestOfN-CAwmg5UL.cjs +140 -0
- package/dist/src/bestOfN-_kTi8Bxe.js +136 -0
- package/dist/src/{blobs-D2FAd1Q5.cjs → blobs-B0977K1O.cjs} +7 -6
- package/dist/src/{blobs-BCZavS8s.js → blobs-CeFdPn_T.js} +3 -3
- package/dist/src/{blobs-BQWqnnvL.js → blobs-DODuTK-a.js} +3 -3
- package/dist/src/{blobs-C-F78Kfn.js → blobs-Dwef1Ao1.js} +2 -2
- package/dist/src/{cache-BIyPcp5v.cjs → cache-CPGUA4Yl.cjs} +135 -25
- package/dist/src/cache-Cf7b4pWE.js +3 -0
- package/dist/src/{cache-D5NZmMiT.js → cache-DIXbtkNO.js} +125 -10
- package/dist/src/{cache-mb7c8hbp.js → cache-DpPWrkTE.js} +128 -12
- package/dist/src/{cache-C4Xb-hNb.js → cache-roFAE0cI.js} +126 -11
- package/dist/src/{chat-I9izLm49.js → chat-CUCorGiL.js} +12 -12
- package/dist/src/{chat-BPXSW8Bv.cjs → chat-DG1wG4w0.cjs} +6 -6
- package/dist/src/{chat-BfPaS15_.js → chat-Dabu84Br.js} +12 -12
- package/dist/src/{chat-Dr3DUQ0D.js → chat-DqUFcWI0.js} +12 -12
- package/dist/src/{chat-CclRbxGf.cjs → chat-DxTDQ83C.cjs} +14 -13
- package/dist/src/{chat-MKxMnZJZ.js → chat-GmlolEwo.js} +4 -4
- package/dist/src/{chat-0bwXjVP0.js → chat-TP8Qifkh.js} +6 -6
- package/dist/src/{chat-mW0ORo8G.js → chat-iwaM5UTQ.js} +6 -6
- package/dist/src/{chatkit-zUIVoDos.js → chatkit-B6DWi70Q.js} +4 -4
- package/dist/src/{chatkit-BoWoSgXl.cjs → chatkit-BYveR48_.cjs} +6 -5
- package/dist/src/{chatkit-Cv6AhukM.js → chatkit-fARZwEfV.js} +3 -3
- package/dist/src/{chatkit-CJnHRRMM.js → chatkit-lb6FK02w.js} +4 -4
- package/dist/src/{claude-agent-sdk-Dtq_L-Sc.js → claude-agent-sdk-BQNp_y-F.js} +212 -67
- package/dist/src/{claude-agent-sdk-BQNuLaAK.js → claude-agent-sdk-D5Jl0SDh.js} +212 -67
- package/dist/src/{claude-agent-sdk-CPJo3dBQ.cjs → claude-agent-sdk-DH416NBD.cjs} +218 -72
- package/dist/src/{claude-agent-sdk-nfAIcxNf.js → claude-agent-sdk-x1XJ1-pU.js} +212 -67
- package/dist/src/{cloud-DQZ5sVjW.js → cloud-D3DiFqH6.js} +3 -3
- package/dist/src/cloud-p96PA4MH.js +3 -0
- package/dist/src/{cloudflare-ai-BIB567w6.js → cloudflare-ai-B6NVI3ax.js} +4 -4
- package/dist/src/{cloudflare-ai-Dl3N9OVD.cjs → cloudflare-ai-CEAW-xQa.cjs} +6 -6
- package/dist/src/{cloudflare-ai-DlKr0rY7.js → cloudflare-ai-RFSojyXG.js} +6 -6
- package/dist/src/{cloudflare-ai-DGLte7Py.js → cloudflare-ai-r4tbYmWU.js} +6 -6
- package/dist/src/{cloudflare-gateway-CiIZHU0Q.js → cloudflare-gateway-BCkLouto.js} +5 -5
- package/dist/src/{cloudflare-gateway-DI1HNP5F.js → cloudflare-gateway-BaZ4insB.js} +3 -3
- package/dist/src/{cloudflare-gateway-BDZrYydE.js → cloudflare-gateway-CF-Vb-2Z.js} +5 -5
- package/dist/src/{cloudflare-gateway-BYDp495F.cjs → cloudflare-gateway-TJMLBj6I.cjs} +5 -5
- package/dist/src/codex-app-server-B8KHEiF4.js +1915 -0
- package/dist/src/codex-app-server-CnrLBCeA.cjs +1921 -0
- package/dist/src/codex-app-server-DIXZ230V.js +1915 -0
- package/dist/src/codex-app-server-Dd22dC_N.js +1916 -0
- package/dist/src/{codex-sdk-CpqiOqDO.js → codex-sdk-B6Wah8Pa.js} +6 -6
- package/dist/src/codex-sdk-BGjVAk23.js +3 -0
- package/dist/src/{codex-sdk-C2_M2pl_.cjs → codex-sdk-CFF6gUyi.cjs} +18 -10
- package/dist/src/{codex-sdk-Rtky3M4I.js → codex-sdk-CmQABzV3.js} +6 -6
- package/dist/src/{codex-sdk-CErXn7qh.js → codex-sdk-D2d54RL8.js} +5 -5
- package/dist/src/{cometapi-CtJ-mS8R.js → cometapi-Bu9B8NUY.js} +8 -8
- package/dist/src/{cometapi-DT-jlVCB.js → cometapi-CtzNCHKu.js} +7 -7
- package/dist/src/{cometapi-UVOryo4W.cjs → cometapi-DHCDlQUI.cjs} +8 -8
- package/dist/src/{cometapi-BUlt_ELa.js → cometapi-OBILPLlu.js} +8 -8
- package/dist/src/{completion-HUe8wDhZ.js → completion-CO2e1_62.js} +6 -6
- package/dist/src/{completion-BozdoXba.cjs → completion-CSYfl2cd.cjs} +6 -6
- package/dist/src/{completion-x0a_c2y1.js → completion-DZNxcyfG.js} +6 -6
- package/dist/src/{completion-Dnxn7E-j.js → completion-sNvCLTAP.js} +5 -5
- package/dist/src/constants-BjJV0cRr.js +6 -0
- package/dist/src/constants-DH5XYLKZ.js +7 -0
- package/dist/src/constants-DZGEFLsu.js +6 -0
- package/dist/src/constants-a2kYssQk.cjs +11 -0
- package/dist/src/{createHash-4gFQpDDv.js → createHash-BtbSX3mj.js} +1 -1
- package/dist/src/{createHash-CwDVU5xr.js → createHash-CGVzWdjj.js} +1 -1
- package/dist/src/{createHash-B7KvgoOD.cjs → createHash-CSiqnK5P.cjs} +2 -2
- package/dist/src/{createHash-ChI45QR1.js → createHash-CgRvs4Fn.js} +1 -1
- package/dist/src/crescendo-BXEJK_bi.cjs +704 -0
- package/dist/src/crescendo-CU_Y2i-m.js +702 -0
- package/dist/src/crescendo-J1Xx4_zb.js +703 -0
- package/dist/src/crescendo-QiaSLW0d.js +701 -0
- package/dist/src/custom-BJfP00Bh.js +619 -0
- package/dist/src/custom-CZVn-1-r.js +620 -0
- package/dist/src/custom-Cqia7M0D.cjs +621 -0
- package/dist/src/custom-notggYVl.js +618 -0
- package/dist/src/{docker-DCgsveLD.js → docker-4D1eL6Gq.js} +6 -6
- package/dist/src/{docker-ClnmCf1Z.js → docker-BBv1WUDu.js} +5 -5
- package/dist/src/{docker-DS4_Osau.cjs → docker-D06JUoe2.cjs} +6 -6
- package/dist/src/{docker-CQmlA2NU.js → docker-DdJQBxK9.js} +6 -6
- package/dist/src/{embedding-D3xTseo7.js → embedding--UZVe4_7.js} +6 -6
- package/dist/src/{embedding-I45KG3o7.cjs → embedding-BbrwopfX.cjs} +6 -6
- package/dist/src/{embedding-nFbumxcv.js → embedding-Bi3rxrZF.js} +5 -5
- package/dist/src/{embedding-DD9wa3ae.js → embedding-C251p1-8.js} +6 -6
- package/dist/src/{errors-Cw810C93.js → errors-9PcUL8BC.js} +1 -1
- package/dist/src/{esm-Dh4dOLlt.js → esm-B6whoAcf.js} +2 -2
- package/dist/src/{esm-CtEPLdAj.cjs → esm-BIKakvNa.cjs} +8 -7
- package/dist/src/{esm-C7PnfdF8.js → esm-BTK1W7lG.js} +1 -1
- package/dist/src/{esm-tVgYPY-f.js → esm-Bexx2PFc.js} +2 -2
- package/dist/src/{eval-u4UVafl6.js → eval-0VRANImH.js} +21 -21
- package/dist/src/{eval-CzJFfFO9.js → eval-DscR5iOM.js} +1 -1
- package/dist/src/{evalResult-Bgm9ZH31.js → evalResult-2RRJvFyB.js} +41 -16
- package/dist/src/{evalResult-KZqXl4XP.cjs → evalResult-CvtS8h8u.cjs} +51 -15
- package/dist/src/evalResult-DqzsS6_W.js +3 -0
- package/dist/src/{evalResult-D3hVYFis.js → evalResult-eUkJv9Ko.js} +40 -15
- package/dist/src/evaluator-DNdJF1Gv.js +3 -0
- package/dist/src/{evaluator-IvuDYSvQ.js → evaluator-DRoiYB2q.js} +1060 -187
- package/dist/src/evaluatorHelpers-BsYP_muT.js +511 -0
- package/dist/src/evaluatorHelpers-CRqTvSux.cjs +537 -0
- package/dist/src/evaluatorHelpers-DuqFFfq7.js +510 -0
- package/dist/src/{extractor-CAfTSraf.js → extractor-BR7XAzAL.js} +6 -6
- package/dist/src/{extractor-WVPOrH43.cjs → extractor-BdxEtt3J.cjs} +6 -6
- package/dist/src/{extractor-DNSeBVOJ.js → extractor-CIW3iN-b.js} +6 -6
- package/dist/src/{extractor-Dk6bRWkv.js → extractor-CxRtnaHl.js} +5 -5
- package/dist/src/{fetch-B0Z3Oe4k.js → fetch-BufrQtvR.js} +93 -40
- package/dist/src/{fetch-BEWnXrrG.js → fetch-DXUnXkVU.js} +89 -40
- package/dist/src/{fetch-CJU5ELPa.cjs → fetch-Dw4XZHjj.cjs} +330 -270
- package/dist/src/{fetch-Di00EQrc.js → fetch-It34O8Ur.js} +305 -252
- package/dist/src/fetch-_YgGd2qv.js +3 -0
- package/dist/src/{fileExtensions-bYh77CN8.cjs → fileExtensions-BhdwzYaD.cjs} +24 -1
- package/dist/src/{fileExtensions-DnqA1y9x.js → fileExtensions-CXRfY3Ss.js} +12 -2
- package/dist/src/{fileExtensions-AWa2ZML4.js → fileExtensions-D4GCJ67J.js} +12 -2
- package/dist/src/{formatDuration-DZzPsexs.js → formatDuration-CMVNrYvE.js} +1 -1
- package/dist/src/{genaiTracer-yRuxj9-L.cjs → genaiTracer-14nugQQx.cjs} +14 -2
- package/dist/src/{genaiTracer-DWdZ28hY.js → genaiTracer-BPVvltoW.js} +2 -2
- package/dist/src/{genaiTracer-XnrcgDCe.js → genaiTracer-D18lYzhB.js} +2 -2
- package/dist/src/{genaiTracer-COYDi-tC.js → genaiTracer-jJKYsnjc.js} +2 -2
- package/dist/src/goat-Ckd3q3AY.js +467 -0
- package/dist/src/goat-Qgurm-NP.js +466 -0
- package/dist/src/goat-ghadEDdy.js +465 -0
- package/dist/src/goat-una6pZGP.cjs +469 -0
- package/dist/src/graders-BDT7dif6.js +3 -0
- package/dist/src/{graders-eIHhRqoC.js → graders-BGP99PdK.js} +2416 -2224
- package/dist/src/{graders-Zy3x0zqX.js → graders-BX0f2tvS.js} +2423 -2226
- package/dist/src/{graders-pvbReLLn.js → graders-C0nXU_ZP.js} +1806 -1609
- package/dist/src/{graders--zknU_uk.cjs → graders-ClrU2fnd.cjs} +2219 -1949
- package/dist/src/hydra-BSNZZm2M.js +543 -0
- package/dist/src/hydra-BxdG4nkg.js +541 -0
- package/dist/src/hydra-DE4xWwyc.js +542 -0
- package/dist/src/hydra-DrJttnvw.cjs +542 -0
- package/dist/src/image-B4oBtu6J.js +443 -0
- package/dist/src/{image-dnoUgPrC.js → image-BN-hjLL9.js} +4 -4
- package/dist/src/{image-9302QVqR.js → image-B_fPIwdg.js} +3 -3
- package/dist/src/image-BvUAW344.js +442 -0
- package/dist/src/image-Cvjwx1uY.js +442 -0
- package/dist/src/{image-De2FBmYV.cjs → image-DfVCGPbI.cjs} +4 -4
- package/dist/src/{image-u7-rKnYU.js → image-QzmydkiG.js} +4 -4
- package/dist/src/image-X0oY4350.cjs +465 -0
- package/dist/src/index.cjs +1689 -558
- package/dist/src/index.d.cts +3270 -1624
- package/dist/src/index.d.ts +3270 -1624
- package/dist/src/index.js +1553 -438
- package/dist/src/indirectWebPwn-02ZIghCS.js +259 -0
- package/dist/src/indirectWebPwn-BJ22AbQa.cjs +397 -0
- package/dist/src/indirectWebPwn-CbjUG0rh.js +385 -0
- package/dist/src/indirectWebPwn-CfQJt3gk.cjs +260 -0
- package/dist/src/indirectWebPwn-DBQhOjoD.js +260 -0
- package/dist/src/indirectWebPwn-OsXnKejv.js +259 -0
- package/dist/src/indirectWebPwn-tNx9OZ35.js +385 -0
- package/dist/src/indirectWebPwn-uyWdHx04.js +386 -0
- package/dist/src/inputVariables-B0qUChbV.js +467 -0
- package/dist/src/inputVariables-DUGMb9Ka.js +464 -0
- package/dist/src/inputVariables-DXFdi7AI.js +468 -0
- package/dist/src/inputVariables-Dq9W-Z3a.cjs +475 -0
- package/dist/src/{interactiveCheck-CLERUB0c.js → interactiveCheck-C4QlIuoR.js} +2 -2
- package/dist/src/{invariant-BtWWVVhl.js → invariant-B2Rf6avk.js} +1 -1
- package/dist/src/{invariant-vgHWClmd.js → invariant-DIYf9sP1.js} +1 -1
- package/dist/src/{invariant-kfQ8Bu82.cjs → invariant-QtnLD03y.cjs} +1 -1
- package/dist/src/iterative-CpU6i2As.js +490 -0
- package/dist/src/iterative-DJQEQpG3.js +491 -0
- package/dist/src/iterative-DQBuWM-j.cjs +493 -0
- package/dist/src/iterative-FTS4Bz67.js +492 -0
- package/dist/src/iterativeImage-BUABMVOA.js +413 -0
- package/dist/src/iterativeImage-ByFWkxax.cjs +415 -0
- package/dist/src/iterativeImage-BzUapOUi.js +414 -0
- package/dist/src/iterativeImage-Doz8mgxF.js +413 -0
- package/dist/src/iterativeMeta-B3YiAOc8.js +386 -0
- package/dist/src/iterativeMeta-C7APE_P1.js +385 -0
- package/dist/src/iterativeMeta-CSS8M6Ds.cjs +385 -0
- package/dist/src/iterativeMeta-DgoQ7bLh.js +384 -0
- package/dist/src/iterativeTree-B5zxBBSW.js +769 -0
- package/dist/src/iterativeTree-CNyIk0Yn.js +768 -0
- package/dist/src/iterativeTree-CPMF10ve.cjs +771 -0
- package/dist/src/iterativeTree-DvZ7GBwt.js +770 -0
- package/dist/src/{knowledgeBase-Dgc7CBWF.js → knowledgeBase-BadkINlJ.js} +24 -10
- package/dist/src/{knowledgeBase-RhFPGWDc.js → knowledgeBase-Bi_8sV-H.js} +25 -11
- package/dist/src/{knowledgeBase-lm9RXSAm.js → knowledgeBase-CkMljjdg.js} +25 -11
- package/dist/src/{knowledgeBase-Bpoe_nLu.cjs → knowledgeBase-DUh34xba.cjs} +25 -11
- package/dist/src/{litellm-DRjpcSa7.js → litellm-BKBo0jpC.js} +5 -5
- package/dist/src/{litellm-C2kqjxqp.js → litellm-BXyn5kZK.js} +5 -5
- package/dist/src/{litellm-p37R1dzQ.js → litellm-CNcfbCfa.js} +4 -4
- package/dist/src/{litellm-CoyI4IAl.cjs → litellm-CtAr7bKG.cjs} +5 -5
- package/dist/src/{logger-DksKw1Qc.js → logger-BbY6ypFL.js} +2 -2
- package/dist/src/{logger-B88EkIn6.js → logger-KD8JjCRJ.js} +2 -2
- package/dist/src/{logger-COuQb2xB.cjs → logger-cfNpzI4o.cjs} +13 -55
- package/dist/src/{luma-ray-KgTCXrZC.js → luma-ray-BMX1iEB6.js} +5 -5
- package/dist/src/{luma-ray-B863CmuZ.js → luma-ray-CR5TSpp4.js} +5 -5
- package/dist/src/{luma-ray-BxVKaW2a.cjs → luma-ray-D3FUc2K3.cjs} +9 -8
- package/dist/src/{luma-ray-BTTLtqQ8.js → luma-ray-OEMmS1RB.js} +6 -6
- package/dist/src/main.js +909 -369
- package/dist/src/memoryPoisoning-CM83NWYl.js +107 -0
- package/dist/src/memoryPoisoning-D8h9gXJF.js +106 -0
- package/dist/src/memoryPoisoning-Dp-btinn.cjs +106 -0
- package/dist/src/memoryPoisoning-cLuCoTuJ.js +106 -0
- package/dist/src/{messages-BTQz42fn.js → messages-BabO-cX8.js} +273 -17
- package/dist/src/{messages-811uVVW5.cjs → messages-DBPir0TQ.cjs} +278 -18
- package/dist/src/{messages-zWbkLLHz.js → messages-DGUlSNU7.js} +273 -17
- package/dist/src/{messages-MYTQ2TWp.js → messages-vsE_-Lv0.js} +273 -17
- package/dist/src/{meteor-DHdzY1Ss.js → meteor--TZYICTI.js} +2 -2
- package/dist/src/{meteor-Co1VQ1u5.cjs → meteor-CR226f7Z.cjs} +2 -2
- package/dist/src/{meteor-CU5UAE-H.js → meteor-Cl_yd7rJ.js} +2 -2
- package/dist/src/{meteor-DuAFv6gF.js → meteor-Dce-_zGQ.js} +1 -1
- package/dist/src/mischievousUser-0l8GD7Dp.js +46 -0
- package/dist/src/mischievousUser-BUOP9W5r.js +46 -0
- package/dist/src/mischievousUser-frFYKxu6.js +47 -0
- package/dist/src/mischievousUser-olGgHIVR.cjs +46 -0
- package/dist/src/{modelslab-Dk1JAtVo.cjs → modelslab-CNV5bMSk.cjs} +7 -7
- package/dist/src/{modelslab-D0erNWKe.js → modelslab-Cogmu4mG.js} +6 -6
- package/dist/src/{modelslab-DIq-6y7x.js → modelslab-Dzst7VTU.js} +6 -6
- package/dist/src/{modelslab-wu9yi5GE.js → modelslab-EyDczZ5A.js} +7 -7
- package/dist/src/{nova-reel-CCFRfeRb.js → nova-reel-BGPNBOMS.js} +6 -6
- package/dist/src/{nova-reel-DQrm74ng.js → nova-reel-B_5NKFu1.js} +5 -5
- package/dist/src/{nova-reel-gr11WG7f.js → nova-reel-C4eUJGse.js} +5 -5
- package/dist/src/{nova-reel-CrLXVKQf.cjs → nova-reel-CjJRxI1X.cjs} +9 -8
- package/dist/src/{nova-sonic-BYdp-QLs.js → nova-sonic-BNGmgfFz.js} +4 -4
- package/dist/src/{nova-sonic-TDgrlTk7.js → nova-sonic-ChPlh5na.js} +4 -4
- package/dist/src/{nova-sonic-B_ZXcUJB.js → nova-sonic-CrV0iaY_.js} +3 -3
- package/dist/src/{nova-sonic-i5tUvXKn.cjs → nova-sonic-DuOG9Aun.cjs} +5 -4
- package/dist/src/{openai-DhVEmgeZ.js → openai-BMHD2Huo.js} +2 -2
- package/dist/src/{openai-URNyItar.cjs → openai-C3uXv8wS.cjs} +2 -2
- package/dist/src/{openai-Qsvz25mV.js → openai-CJrsh9n4.js} +2 -2
- package/dist/src/{openai-iYtrXzOX.js → openai-zgwBb4Ff.js} +1 -1
- package/dist/src/{openclaw-CnQ363Wi.js → openclaw-BIHlu_36.js} +10 -8
- package/dist/src/{openclaw-CwzlQSQX.js → openclaw-CF7fMido.js} +9 -7
- package/dist/src/{openclaw-wX9rtfke.cjs → openclaw-Dphc01BY.cjs} +18 -15
- package/dist/src/{openclaw-CLWrW03k.js → openclaw-zIJAsz3P.js} +10 -8
- package/dist/src/{opencode-sdk-BUu5Nevv.js → opencode-sdk-B3vlPLsp.js} +40 -5
- package/dist/src/{opencode-sdk-BxD8vXp_.js → opencode-sdk-D05JSgMQ.js} +40 -5
- package/dist/src/{opencode-sdk-BZ2idgYA.cjs → opencode-sdk-DoY6GbWw.cjs} +46 -10
- package/dist/src/{opencode-sdk-GI2KaAXq.js → opencode-sdk-sRKYHGoI.js} +39 -4
- package/dist/src/{otlpReceiver-BntK801g.js → otlpReceiver--gTpSagc.js} +120 -4
- package/dist/src/{otlpReceiver-DmVulbhC.js → otlpReceiver-B2eaKC8C.js} +120 -4
- package/dist/src/{otlpReceiver-B2z58l4e.js → otlpReceiver-BXjcRqAM.js} +119 -3
- package/dist/src/{otlpReceiver-BfcVq2Nq.cjs → otlpReceiver-CvJdBGSc.cjs} +125 -7
- package/dist/src/packageParser--MWTSrPW.js +36 -0
- package/dist/src/packageParser-CgE-ziRo.js +35 -0
- package/dist/src/packageParser-QoCS1FMl.cjs +54 -0
- package/dist/src/packageParser-hwwSGnAZ.js +35 -0
- package/dist/src/processShim-BBxt7LKO.js +95 -0
- package/dist/src/processShim-BcGzU8fY.js +94 -0
- package/dist/src/processShim-C_z3aRvF.js +94 -0
- package/dist/src/processShim-DSY9BV2T.cjs +98 -0
- package/dist/src/promptLength-0qIHyhA5.js +71 -0
- package/dist/src/promptLength-4X-Wd8PG.js +72 -0
- package/dist/src/promptLength-B9nZEfO6.js +71 -0
- package/dist/src/promptLength-BbBbDHNj.cjs +94 -0
- package/dist/src/promptfoo-BDrfT30-.js +180 -0
- package/dist/src/promptfoo-Cm4hiy1Y.js +180 -0
- package/dist/src/promptfoo-Rjp-MeBb.js +181 -0
- package/dist/src/promptfoo-b-baRMj-.cjs +205 -0
- package/dist/src/prompts-BYMtqPCw.js +259 -0
- package/dist/src/prompts-C-bqE1Yp.js +260 -0
- package/dist/src/prompts-Cp_Qx5Ml.js +270 -0
- package/dist/src/prompts-DHhQsANy.js +259 -0
- package/dist/src/prompts-D_QpZ2Dm.js +271 -0
- package/dist/src/prompts-hNvWBD3z.cjs +284 -0
- package/dist/src/prompts-huDVH2CI.js +270 -0
- package/dist/src/prompts-p78Hul5i.cjs +289 -0
- package/dist/src/{providerRegistry-CPQ_CmVO.js → providerRegistry-1gB5vtzQ.js} +2 -2
- package/dist/src/{providerRegistry-CQMdTmHP.cjs → providerRegistry-CZO_w7ue.cjs} +2 -2
- package/dist/src/{providerRegistry-Bvh8mv85.js → providerRegistry-DHcFiVWX.js} +1 -1
- package/dist/src/{providerRegistry-CWoPjKFZ.js → providerRegistry-ReCd0sFa.js} +2 -2
- package/dist/src/{providers-BV_KMZje.js → providers-B9KzWxAX.js} +10558 -21587
- package/dist/src/{providers-DruaQfwu.js → providers-BCCz6_IX.js} +1228 -12196
- package/dist/src/{providers-1eKkXBKp.cjs → providers-BDVVIQM6.cjs} +10649 -21843
- package/dist/src/{providers-iUt5fbAN.js → providers-BYAn82cf.js} +1 -1
- package/dist/src/{providers-Domz_llv.js → providers-DVYRZP4E.js} +10589 -21570
- package/dist/src/{pythonUtils-Cldx7huE.js → pythonUtils-CLCgQ9tt.js} +3 -3
- package/dist/src/{pythonUtils-CnndUbW-.js → pythonUtils-CgYxeSmO.js} +3 -3
- package/dist/src/{pythonUtils-tAJvvpS-.cjs → pythonUtils-Cokhluq3.cjs} +8 -7
- package/dist/src/{pythonUtils-C2UQ30Rz.js → pythonUtils-D0BYebvX.js} +3 -3
- package/dist/src/{quiverai-DFotyafY.cjs → quiverai-BAp6iTZD.cjs} +4 -4
- package/dist/src/{quiverai-aPPvXOgn.js → quiverai-BvIhI_0l.js} +4 -4
- package/dist/src/{quiverai-DR0SnIQV.js → quiverai-CdTWPe-A.js} +3 -3
- package/dist/src/{quiverai-CtWi6x_g.js → quiverai-Cv7rJKDz.js} +4 -4
- package/dist/src/registry-BUJrgjwv.js +124 -0
- package/dist/src/registry-DXm1t_x0.js +125 -0
- package/dist/src/registry-Dp5EqoXc.js +124 -0
- package/dist/src/registry-KCVF1CFC.cjs +124 -0
- package/dist/src/{server-D6Il2Sob.js → remoteGeneration-B1_XsKXU.js} +16 -108
- package/dist/src/{server-BSB45Nt9.js → remoteGeneration-COpWcmWd.js} +15 -146
- package/dist/src/{server-Dx2TyCH2.cjs → remoteGeneration-DS9N3pgB.cjs} +30 -119
- package/dist/src/remoteGeneration-DsaSwmG2.js +217 -0
- package/dist/src/render-BNTrbmBw.cjs +384 -0
- package/dist/src/render-CSP99NLm.js +348 -0
- package/dist/src/render-DFfDeYUK.js +347 -0
- package/dist/src/{render-CgVDrJmM.js → render-DznWrxGO.js} +2 -2
- package/dist/src/render-_6ur1fhE.js +347 -0
- package/dist/src/resourceAttributes-D1jP3kL5.js +17 -0
- package/dist/src/resourceAttributes-DQbBB--2.js +16 -0
- package/dist/src/resourceAttributes-ephgOvdR.cjs +27 -0
- package/dist/src/resourceAttributes-v6-I67fn.js +16 -0
- package/dist/src/{responses-Bi9vBuW_.cjs → responses-1UFFF9N_.cjs} +51 -16
- package/dist/src/{responses-DL9m8CyY.js → responses-B3W2JvOQ.js} +49 -15
- package/dist/src/{responses--OsX2aYW.js → responses-B6ktc3Ra.js} +49 -15
- package/dist/src/{responses-C-flexAY.js → responses-URRzV8qE.js} +49 -15
- package/dist/src/rolldown-runtime-D_mwlA32.cjs +43 -0
- package/dist/src/rubyUtils-BYVlQ94c.js +3 -0
- package/dist/src/{rubyUtils-DsGrTx8R.js → rubyUtils-CXlFM2rR.js} +3 -3
- package/dist/src/{rubyUtils-DVLeA2jg.js → rubyUtils-CnlW8AYb.js} +3 -3
- package/dist/src/{rubyUtils-B6eljPuh.cjs → rubyUtils-CqUWBZAt.cjs} +18 -27
- package/dist/src/{rubyUtils-CYSQEG4a.js → rubyUtils-DdGojpfv.js} +3 -3
- package/dist/src/runtimeTransform-BJOpL9Yc.js +142 -0
- package/dist/src/runtimeTransform-Dgh_D7DU.js +143 -0
- package/dist/src/runtimeTransform-DigbjU1r.js +142 -0
- package/dist/src/runtimeTransform-ON3YYILw.cjs +147 -0
- package/dist/src/{sagemaker-BVkaG2-l.js → sagemaker-CujrzP1a.js} +62 -51
- package/dist/src/{sagemaker-XnfhheQv.cjs → sagemaker-DzffAqo_.cjs} +65 -53
- package/dist/src/{sagemaker-D67yzMzs.js → sagemaker-vhtSV7JI.js} +62 -51
- package/dist/src/{sagemaker-BveBvuxm.js → sagemaker-yr1QKeBs.js} +61 -50
- package/dist/src/{scanner-1DqWi1Ej.js → scanner-DS0109SS.js} +7 -7
- package/dist/src/server/index.js +5105 -605
- package/dist/src/server-B8rqV126.cjs +126 -0
- package/dist/src/server-BaLytskk.js +3 -0
- package/dist/src/server-CMJD10J4.js +107 -0
- package/dist/src/server-Ddp8GNMp.js +146 -0
- package/dist/src/server-DhMHosWj.js +182 -0
- package/dist/src/shared-7pmVZLNO.js +1334 -0
- package/dist/src/shared-9WHQ1oNE.js +1335 -0
- package/dist/src/{fileExtensions-BArZuxsI.js → shared-BoG7qLMv.js} +12 -2
- package/dist/src/shared-D6IjElRI.js +1334 -0
- package/dist/src/shared-WkgnDkcg.cjs +1436 -0
- package/dist/src/{signal-CE5G3a7x.js → signal-CSurUUyV.js} +3 -3
- package/dist/src/simulatedUser-C9aQObBI.js +222 -0
- package/dist/src/simulatedUser-Cu601Dd4.cjs +227 -0
- package/dist/src/simulatedUser-U_qAHnuB.js +222 -0
- package/dist/src/simulatedUser-p3tACcmw.js +223 -0
- package/dist/src/{slack-DDUe-5MC.js → slack-Bapo-7_8.js} +2 -2
- package/dist/src/{slack-1Rhq0EoV.cjs → slack-DMC1QVEg.cjs} +3 -2
- package/dist/src/{slack-D5Wpy8LM.js → slack-DTEFhrMn.js} +2 -2
- package/dist/src/{slack-acRb0IqQ.js → slack-k-_CP84Q.js} +1 -1
- package/dist/src/storage-BU4qcnOb.js +875 -0
- package/dist/src/storage-CA-v9V2v.cjs +911 -0
- package/dist/src/storage-CD-GWAdx.js +822 -0
- package/dist/src/storage-QdU-SmvD.js +834 -0
- package/dist/src/{store-DAAyxcy6.cjs → store-B2NDDooM.cjs} +60 -24
- package/dist/src/{store-CYEy5J2D.js → store-DKd5592Q.js} +51 -20
- package/dist/src/{store-M0b1WfYb.js → store-HpopRVzl.js} +50 -19
- package/dist/src/store-IbiRIF3k.js +3 -0
- package/dist/src/strategies-7CS3Alao.cjs +2360 -0
- package/dist/src/strategies-CiSeroPH.js +2331 -0
- package/dist/src/strategies-DRJjGTIY.js +2333 -0
- package/dist/src/{tables-DQ4WU5tX.js → tables-CRSXQ2Ke.js} +2 -2
- package/dist/src/{tables-CsWou1Bx.js → tables-CxjU7bBd.js} +3 -3
- package/dist/src/{tables-DUfh1F7Z.cjs → tables-DBIJU0WE.cjs} +6 -5
- package/dist/src/{tables-C4CH3zRr.js → tables-DafUHOeh.js} +3 -3
- package/dist/src/{telemetry-CQPez_Jp.js → telemetry-00ezXr_t.js} +5 -4
- package/dist/src/telemetry-ByPqDcKC.js +3 -0
- package/dist/src/{telemetry-Dsw_faFj.cjs → telemetry-CJ7FnCsc.cjs} +18 -11
- package/dist/src/{telemetry-dbaJ0E98.js → telemetry-DmXYcJNV.js} +5 -4
- package/dist/src/{telemetry-Dvqxv3YC.js → telemetry-DwX9XUN5.js} +4 -3
- package/dist/src/{text-KvuD2Iko.js → text-Db-Wt2u2.js} +1 -1
- package/dist/src/{text-DHxdyQqT.js → text-DwYK5EBn.js} +1 -1
- package/dist/src/{text-BVi-cLPJ.cjs → text-nywWsRBM.cjs} +1 -1
- package/dist/src/{tokenUsageUtils-C-bmyHoE.js → tokenUsageUtils-BjVkdk18.js} +1 -1
- package/dist/src/{tokenUsageUtils-CXrvO-wA.js → tokenUsageUtils-CDet74yk.js} +1 -1
- package/dist/src/tokenUsageUtils-CmnQ0G2m.js +142 -0
- package/dist/src/{tokenUsageUtils-Bb7DkZPz.cjs → tokenUsageUtils-_B-P8IAi.cjs} +1 -1
- package/dist/src/toolAttributes-BAjwcBf0.cjs +103 -0
- package/dist/src/toolAttributes-COVgDrBG.js +87 -0
- package/dist/src/toolAttributes-DJ9ZEKXD.js +86 -0
- package/dist/src/tracingOptions-BnwKCkSB.js +221 -0
- package/dist/src/tracingOptions-Chi74lOD.js +219 -0
- package/dist/src/tracingOptions-DrbSFaKy.cjs +249 -0
- package/dist/src/tracingOptions-ji2OuXbT.js +220 -0
- package/dist/src/{transcription-DuWDupG7.js → transcription-B8uIgCYX.js} +5 -5
- package/dist/src/{transcription-CJspiD2c.js → transcription-CfU5loSq.js} +6 -6
- package/dist/src/{transcription-V2HaAmy2.js → transcription-Dkd22_4K.js} +6 -6
- package/dist/src/{transcription-BvjmiYB1.cjs → transcription-mzuf18Mq.cjs} +9 -8
- package/dist/src/{transform-lQrDE1BQ.js → transform-BIMynQsA.js} +9 -9
- package/dist/src/transform-BnSTnFlp.js +187 -0
- package/dist/src/transform-BnSXWmU_2.cjs +221 -0
- package/dist/src/transform-CGt7Kt3y2.js +186 -0
- package/dist/src/transform-CrPGTsij.js +186 -0
- package/dist/src/{transform-CTeuTR3S.cjs → transform-DhNkAUs8.cjs} +13 -12
- package/dist/src/{transform-CG0ehZNG.js → transform-DmvYBRll.js} +9 -9
- package/dist/src/{transform-zDhMmzwX.js → transform-EtD4jAWi.js} +9 -9
- package/dist/src/{transformersAvailability-CcHusyhw.js → transformersAvailability-0ThtPved.js} +1 -1
- package/dist/src/transformersAvailability-BYydDE5U.js +35 -0
- package/dist/src/{transformersAvailability-DLlROWhg.js → transformersAvailability-BvyU9vDD.js} +1 -1
- package/dist/src/{transformersAvailability-Cju9mHgR.cjs → transformersAvailability-BytPvKUW.cjs} +1 -1
- package/dist/src/{types-Dm9JM6Vb.js → types-BFevViUY.js} +115 -19
- package/dist/src/{types-Bgh5SOn6.js → types-BJQBBPTP.js} +115 -19
- package/dist/src/{types-CeaeaZdP.cjs → types-CxJvaY2S.cjs} +357 -172
- package/dist/src/{types-BGQDAP8i.js → types-D6glLbdF.js} +271 -170
- package/dist/src/{util-BYvQUPp7.js → util--WMgw7wM.js} +28 -8
- package/dist/src/{util-C9J8ahRn.js → util-5WnCSb0h.js} +72 -48
- package/dist/src/{util-CN3SrLT4.cjs → util-BSIuSLVK.cjs} +74 -49
- package/dist/src/{util-C8e5uydV.js → util-Bx677_k2.js} +154 -147
- package/dist/src/util-CN8om2rz.cjs +386 -0
- package/dist/src/{util-DDs-7g6-.js → util-CoQWM76y.js} +28 -8
- package/dist/src/util-DNl96nNs.js +327 -0
- package/dist/src/{util-DxWpWjhc.js → util-DURocbYR.js} +667 -507
- package/dist/src/util-Df8YMvS1.js +327 -0
- package/dist/src/{util-DvU2Pw8c.js → util-DiQ3QvBB.js} +28 -8
- package/dist/src/{util-oGMLA7vc.js → util-I-Rf-KaD.js} +862 -577
- package/dist/src/{util-olYL5C6N.cjs → util-IYzs5Y04.cjs} +33 -7
- package/dist/src/{util-D9TisOyk.js → util-LKTmNsMQ.js} +71 -47
- package/dist/src/{util-Bxn8emtE.cjs → util-SPsvFONY.cjs} +738 -582
- package/dist/src/{util-D3q0WQ-0.js → util-efByNxcr.js} +72 -48
- package/dist/src/util-kDURhgJW.js +328 -0
- package/dist/src/{utils-DJfvjyMj.js → utils-B0lzitHZ.js} +3 -3
- package/dist/src/{utils-BLJKfv0y.js → utils-BFOh20Gb.js} +3 -3
- package/dist/src/{utils-hXtCYanr.js → utils-BGY69tk_.js} +2 -2
- package/dist/src/{utils-B05gLxER.cjs → utils-Ve6kuJsa.cjs} +3 -3
- package/dist/src/version-BK20a4sw.js +16 -0
- package/dist/src/version-BWCSaByA.cjs +27 -0
- package/dist/src/version-eRkNuGv8.js +17 -0
- package/dist/src/version-lpHV_53E.js +16 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +56 -28
- package/dist/src/app/assets/Report-CQYFezYu.js +0 -1
- package/dist/src/app/assets/index-BXGkeMwh.css +0 -1
- package/dist/src/app/assets/index-BzJt18Jz.js +0 -385
- package/dist/src/app/assets/sync-IjzpWrOE.js +0 -4
- package/dist/src/app/assets/vendor-charts-BNdH8TCw.js +0 -36
- package/dist/src/cache-Cr9oLMUa.js +0 -3
- package/dist/src/cache-DbLsVWB2.cjs +0 -3
- package/dist/src/cloud-Hphvo8kr.js +0 -3
- package/dist/src/codex-sdk-BAmYE7qy.js +0 -3
- package/dist/src/codex-sdk-CWEnH70W.cjs +0 -2
- package/dist/src/evalResult-D8MT9p0s.js +0 -3
- package/dist/src/evalResult-DElBuddX.js +0 -2
- package/dist/src/evalResult-Dvc-iucu.cjs +0 -2
- package/dist/src/evaluator-CVessDWe.js +0 -3
- package/dist/src/fetch-C7bGKDlQ.js +0 -3
- package/dist/src/graders-BOAzQEUe.cjs +0 -2
- package/dist/src/graders-D4BTsZdG2.js +0 -3
- package/dist/src/graders-DOJK1XpV.js +0 -2
- package/dist/src/graders-NAv9LcBn.js +0 -2
- package/dist/src/image-B5Mv-Z3h.js +0 -257
- package/dist/src/image-DVz2RiMF.js +0 -258
- package/dist/src/image-qUpPvmNZ.js +0 -257
- package/dist/src/image-x6KqLQl4.cjs +0 -280
- package/dist/src/providers-Bp4S-FvO.js +0 -2
- package/dist/src/providers-DV3ax9e_.cjs +0 -3
- package/dist/src/providers-u9Enmfok.js +0 -2
- package/dist/src/render-CH-62LbA.js +0 -135
- package/dist/src/render-CMEpfLaO.js +0 -136
- package/dist/src/render-DHIZ6_k8.js +0 -135
- package/dist/src/render-DfQSFxGE.cjs +0 -165
- package/dist/src/rubyUtils-D1L2d3jb.js +0 -3
- package/dist/src/rubyUtils-DUbq4tff.cjs +0 -2
- package/dist/src/server-BNYztJkh.js +0 -385
- package/dist/src/server-DCtHUqlp.js +0 -3
- package/dist/src/server-DaA2eR26.cjs +0 -2
- package/dist/src/store-CWOSz6D_.cjs +0 -2
- package/dist/src/store-DCDBhv7B.js +0 -3
- package/dist/src/store-Dn9HUkdW.js +0 -240
- package/dist/src/telemetry-C1IqxcdW.js +0 -3
- package/dist/src/telemetry-C4ZEa_es.cjs +0 -2
- package/dist/src/transform-Bbg6A8Jk.js +0 -216
- package/dist/src/transform-CUnzlsbn.cjs +0 -228
- package/dist/src/transform-DYX1_Xnh.js +0 -216
- package/dist/src/transform-DgKlRr73.cjs +0 -2
- package/dist/src/transform-M6ITAESf.js +0 -3
- package/dist/src/transform-UN5UGu8U.js +0 -213
package/dist/src/index.js
CHANGED
|
@@ -1,33 +1,47 @@
|
|
|
1
|
-
import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, g as getAjv, h as extractJsonObjects, k as state, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "./logger-Ct2S6Yx-.js";
|
|
1
|
+
import { C as getEnvFloat, D as getMaxEvalTimeMs, E as getEvalTimeoutMs, O as isCI, S as getEnvBool, T as getEnvString, a as logger, b as summarizeEvaluateResultForLogging, g as getAjv, h as extractJsonObjects, k as state, m as extractFirstJsonObject, n as globalLogCallback, o as setLogCallback, r as isDebugEnabled, s as setLogLevel, t as getLogLevel, v as orderKeys, w as getEnvInt, y as safeJsonStringify } from "./logger-Ct2S6Yx-.js";
|
|
2
2
|
import { t as invariant } from "./invariant-Ddh24eXh.js";
|
|
3
|
-
import { r as
|
|
4
|
-
import { r as
|
|
5
|
-
import {
|
|
6
|
-
import { i as
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import { n as
|
|
16
|
-
import { t as
|
|
17
|
-
import {
|
|
18
|
-
import {
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import {
|
|
22
|
-
import {
|
|
23
|
-
import { t as
|
|
24
|
-
import {
|
|
25
|
-
import {
|
|
26
|
-
import { t as
|
|
27
|
-
import { t as
|
|
28
|
-
import { t as
|
|
29
|
-
import { n as runRuby } from "./rubyUtils-
|
|
30
|
-
import { t as
|
|
3
|
+
import { $ as riskCategorySeverityMap, A as RedteamConfigSchema, At as DocumentMediaInjectionPlacementValues, B as isUuid, Bt as getInputDescription, C as TestGeneratorConfigSchema, Ct as CODING_AGENT_CORE_PLUGINS, D as VarsSchema, Dt as BaseTokenUsageSchema, E as UnifiedConfigSchema, F as PartialGenerationError, Ft as InputDefinitionSchema, G as STRATEGY_COLLECTIONS, Gt as isProviderOptions, Ht as normalizeInputDefinition, I as PluginConfigSchema, It as InputTypeSchema, K as STRATEGY_COLLECTION_MAPPINGS, L as PolicyObjectSchema, Lt as InputTypeValues, Mt as DocxInjectionPlacementValues, N as ProvidersSchema, Nt as InputConfigSchema, O as isGradingResult, Ot as CompletionTokenDetailsSchema, P as ConversationMessageSchema, Pt as InputDefinitionObjectSchema, R as StrategyConfigSchema, Rt as InputsSchema, S as TestCasesWithMetadataSchema, St as UNALIGNED_PROVIDER_HARM_PLUGINS, T as TestSuiteSchema, U as DEFAULT_STRATEGIES, Ut as normalizeInputs, Vt as getInputType, Wt as isApiProvider, X as Severity, Y as isFanoutStrategy, Z as categoryAliases, _ as ScenarioSchema, _t as REDTEAM_PROVIDER_HARM_PLUGINS, a as AtomicTestCaseSchema, at as FINANCIAL_PLUGINS, b as TestCaseWithVarsFileSchema, bt as TEEN_SAFETY_PLUGINS, c as CompletedPromptSchema, ct as INSURANCE_PLUGINS, d as EvaluateOptionsSchema, dt as MEDICAL_PLUGINS, et as ALIASED_PLUGIN_MAPPINGS, f as GradingConfigSchema, ft as MULTI_INPUT_EXCLUDED_PLUGINS, g as ResultFailureReason, gt as PLUGIN_CATEGORIES, h as OutputFileExtension, ht as PII_PLUGINS, i as AssertionTypeSchema, it as DEFAULT_PLUGINS, j as PromptSchema, jt as DocxInjectionPlacementSchema, k as isResultFailureReason, kt as DocumentMediaInjectionPlacementSchema, l as DerivedMetricSchema, lt as LLAMA_GUARD_ENABLED_CATEGORIES, m as OutputConfigSchema, mt as PHARMACY_PLUGINS, n as AssertionSchema, nt as CANARY_BREAKING_STRATEGY_IDS, o as BaseAssertionTypesSchema, ot as FOUNDATION_PLUGINS, p as NotPrefixedAssertionTypesSchema, pt as MULTI_INPUT_VAR, q as 
getDefaultNFanout, r as AssertionSetSchema, rt as DATASET_EXEMPT_PLUGINS, s as CommandLineOptionsSchema, st as HARM_PLUGINS, t as AssertionOrSetSchema, tt as BIAS_PLUGINS, u as EvalResultsFilterMode, ut as LLAMA_GUARD_REPLICATE_PROVIDER, v as SpecialAssertionTypesSchema, vt as REMOTE_ONLY_PLUGIN_IDS, w as TestSuiteConfigSchema, wt as CODING_AGENT_PLUGINS, x as TestCasesWithMetadataPromptSchema, xt as TELECOM_PLUGINS, y as TestCaseSchema, zt as buildInputPromptDescription } from "./types-D6glLbdF.js";
|
|
4
|
+
import { F as getShareApiBaseUrl, I as getShareViewBaseUrl, L as FILE_METADATA_KEY, N as TERMINAL_MAX_WIDTH, P as getDefaultShareViewBaseUrl, R as HUMAN_ASSERTION_TYPE, T as cloudConfig, _ as isPromptfooSampleTarget, b as parseChatPrompt, c as getCurrentTimestamp, l as sleep, n as fetchWithRetries, p as REQUEST_TIMEOUT_MS, r as fetchWithTimeout, t as fetchWithProxy, w as CloudConfig } from "./fetch-It34O8Ur.js";
|
|
5
|
+
import { n as VERSION } from "./version-lpHV_53E.js";
|
|
6
|
+
import { i as isJavascriptFile } from "./fileExtensions-CXRfY3Ss.js";
|
|
7
|
+
import { c as setUserEmail, i as getUserEmail, o as isLoggedIntoCloud, r as getAuthor, s as promptForEmailUnverified, t as checkEmailStatusAndMaybeExit } from "./accounts-Ca7WIoPY.js";
|
|
8
|
+
import { r as importModule, t as getDirectory } from "./esm-BTK1W7lG.js";
|
|
9
|
+
import { a as extractVariablesFromTemplates, i as extractVariablesFromTemplate, o as getNunjucksEngine, r as analyzeTemplateReference, t as renderEnvOnlyInObject } from "./render-DFfDeYUK.js";
|
|
10
|
+
import { t as providerRegistry } from "./providerRegistry-DHcFiVWX.js";
|
|
11
|
+
import { a as getRemoteHealthUrl, l as shouldGenerateRemote, n as getRemoteGenerationExplicitlyDisabledError, r as getRemoteGenerationUrl, s as neverGenerateRemote } from "./remoteGeneration-DsaSwmG2.js";
|
|
12
|
+
import { r as promptYesNo } from "./server-CMJD10J4.js";
|
|
13
|
+
import { a as getCloudDatabaseId, c as getPluginSeverityOverridesFromCloud, d as isCloudProvider, i as checkCloudPermissions, o as getEvalConfigFromCloud, p as resolveTeamId, s as getOrgContext } from "./storage-CD-GWAdx.js";
|
|
14
|
+
import { r as runPython } from "./pythonUtils-D0BYebvX.js";
|
|
15
|
+
import { A as readFilters, M as loadFunction, N as parseFileUrl, O as maybeLoadToolsFromExternalFile, T as maybeLoadFromExternalFile, _ as isProviderAllowed, a as setupEnv, b as normalizeProviderRef, c as filterRuntimeVars, d as checkProviderApiKeys, f as doesProviderRefMatch, g as isOpenAiProvider, h as isGoogleProvider, i as fetchCsvFromGoogleSheet, j as readOutput, l as getTestCaseDeduplicationKey, m as isAnthropicProvider, n as writeMultipleOutputs, o as deduplicateTestCases, p as getProviderDescription, r as writeOutput, s as extractRuntimeVars, t as printBorder, u as resultIsForTestCase, w as maybeLoadConfigFromExternalFile } from "./util-Bx677_k2.js";
|
|
16
|
+
import { n as sha256, t as randomSequence } from "./createHash-BtbSX3mj.js";
|
|
17
|
+
import { c as NON_TRANSIENT_HTTP_STATUSES, i as getCache, l as isNonTransientHttpStatus, n as disableCache, r as fetchWithCache, s as withCacheNamespace, t as cache_exports } from "./cache-DIXbtkNO.js";
|
|
18
|
+
import { t as OpenAiChatCompletionProvider } from "./chat-Dabu84Br.js";
|
|
19
|
+
import { h as validateFunctionCall } from "./transform-DmvYBRll.js";
|
|
20
|
+
import { l as validateFunctionCall$1 } from "./util-LKTmNsMQ.js";
|
|
21
|
+
import { _ as AIStudioChatProvider, a as resolveProvider, f as MCPProvider, g as GoogleLiveProvider, h as VertexChatProvider, n as loadApiProvider, o as resolveProviderConfigs, r as loadApiProviders, t as getProviderIds } from "./providers-DVYRZP4E.js";
|
|
22
|
+
import { a as createEmptyTokenUsage, i as createEmptyAssertions, n as accumulateResponseTokenUsage, o as normalizeTokenUsage, r as accumulateTokenUsage, t as accumulateAssertionTokenUsage } from "./tokenUsageUtils-CmnQ0G2m.js";
|
|
23
|
+
import { t as ellipsize } from "./text-DwYK5EBn.js";
|
|
24
|
+
import { t as telemetry } from "./telemetry-DwX9XUN5.js";
|
|
25
|
+
import { a as evalsTable, c as evalsToTagsTable, d as tagsTable, i as evalResultsTable, l as promptsTable, m as getDbSignalPath, o as evalsToDatasetsTable, p as getDb, r as datasetsTable, s as evalsToPromptsTable } from "./tables-CRSXQ2Ke.js";
|
|
26
|
+
import { t as getBlobByHash } from "./blobs-Dwef1Ao1.js";
|
|
27
|
+
import { t as getProcessShim } from "./processShim-BcGzU8fY.js";
|
|
28
|
+
import { n as loadFromPackage, t as isPackagePath } from "./packageParser-CgE-ziRo.js";
|
|
29
|
+
import { n as runRuby } from "./rubyUtils-CnlW8AYb.js";
|
|
30
|
+
import { n as materializeInputVariablesWithMetadata, t as buildPromptInputDescriptions } from "./inputVariables-DUGMb9Ka.js";
|
|
31
|
+
import { a as extractPromptFromTags, c as isBasicRefusal, i as extractMaterializedVariablesFromJsonWithMetadata, n as extractGoalFromPrompt, o as getSessionId, r as extractInputVarsFromPrompt, s as getShortPluginId } from "./util-DNl96nNs.js";
|
|
32
|
+
import { n as PromptfooHarmfulCompletionProvider } from "./promptfoo-Cm4hiy1Y.js";
|
|
33
|
+
import { $ as readProviderPromptMap, A as ExcessiveAgencyPlugin, At as withProviderCallExecutionContext, B as retryWithDeduplication, C as PlinyPlugin, Ct as processFileReference, D as ImitationPlugin, Dt as getAndCheckProvider, E as IntentPlugin, Et as callProviderWithContext, F as BeavertailsPlugin, G as matchesFactuality, H as fetchHuggingFaceDataset, I as AegisPlugin, J as matchesPiScore, K as matchesGEval, L as RedteamGraderBase, M as DebugAccessPlugin, N as CrossSessionLeakPlugin, O as HarmbenchPlugin, Ot as getGradingProvider, P as ContractPlugin, Q as readPrompts, R as RedteamPluginBase, S as makeInlinePolicyIdSync, St as loadFromJavaScriptFile, T as OverreliancePlugin, Tt as DEFAULT_ANTHROPIC_MODEL, U as isGraderFailure, V as sampleArray, W as matchesClosedQa, X as doRemoteGrading, Y as matchesTrajectoryGoalSuccess, Z as processPrompts, _ as PromptExtractionPlugin, _t as normalizeMatcherTokenUsage, a as VLGuardPlugin, at as CONTEXT_FAITHFULNESS_NLI_STATEMENTS, b as determinePolicyTypeFromId, bt as coerceString, c as ToxicChatPlugin, ct as CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN, d as TeenSafetyDangerousRoleplayPlugin, dt as loadRubricPrompt, et as DEFAULT_WEB_SEARCH_PROMPT, f as TeenSafetyDangerousContentPlugin, ft as renderLlmRubricPrompt, g as RbacPlugin, gt as fail, h as ShellInjectionPlugin, ht as euclideanDistance, i as VLSUPlugin, it as CONTEXT_FAITHFULNESS_LONGFORM, j as DivergentRepetitionPlugin, k as HallucinationPlugin, kt as getProviderCallExecutionContext, l as ToolDiscoveryPlugin, lt as CONTEXT_RELEVANCE, m as SqlInjectionPlugin, mt as dotProduct, n as getGraderById, nt as SUGGEST_PROMPTS_SYSTEM_MESSAGE, o as UnverifiableClaimsPlugin, ot as CONTEXT_RECALL, p as TeenSafetyAgeRestrictedGoodsAndServicesPlugin, pt as cosineSimilarity, q as matchesLlmRubric, rt as ANSWER_RELEVANCY_GENERATE, s as UnsafeBenchPlugin, st as CONTEXT_RECALL_ATTRIBUTED_TOKEN, t as GRADERS, tt as SELECT_BEST_PROMPT, u as TeenSafetyHarmfulBodyIdealsPlugin, ut as 
CONTEXT_RELEVANCE_BAD, v as PoliticsPlugin, vt as splitIntoSentences, w as getPiiLeakTestsForCategory, wt as getDefaultProviders, x as isValidPolicyObject, xt as getFinalTest, y as PolicyPlugin, yt as tryParse, z as getCustomPolicies } from "./graders-BX0f2tvS.js";
|
|
34
|
+
import { f as redteamProviderManager, g as createProviderRateLimitOptions, h as createRateLimitRegistry, m as TokenUsageTracker } from "./shared-D6IjElRI.js";
|
|
35
|
+
import { i as generateIdFromPrompt, t as hashPrompt } from "./utils-BGY69tk_.js";
|
|
36
|
+
import { a as getTransformLabel, i as getTransformErrorMessage, n as TRANSFORM_KEYS, o as transform, r as TransformInputType, t as INLINE_FUNCTION_LABEL } from "./transform-CGt7Kt3y2.js";
|
|
37
|
+
import { t as getTraceStore } from "./store-HpopRVzl.js";
|
|
38
|
+
import { n as isBlobStorageEnabled, t as extractAndStoreBinaryData } from "./extractor-CxRtnaHl.js";
|
|
39
|
+
import { i as throwIfTargetPromptExceedsMaxChars, n as getGeneratedPromptOverLimit, r as getMaxCharsPerMessageModifierValue, t as MAX_CHARS_PER_MESSAGE_MODIFIER_KEY } from "./promptLength-B9nZEfO6.js";
|
|
40
|
+
import { n as checkExfilTracking } from "./indirectWebPwn-CbjUG0rh.js";
|
|
41
|
+
import { n as getFirstStringAttribute, r as getToolNameFromAttributes, t as TOOL_ARGUMENT_ATTRIBUTE_KEYS } from "./toolAttributes-DJ9ZEKXD.js";
|
|
42
|
+
import { i as filterFiniteScores, n as renderPrompt, r as runExtensionHook, t as collectFileMetadata } from "./evaluatorHelpers-DuqFFfq7.js";
|
|
43
|
+
import { r as sanitizeProvider, t as EvalResult } from "./evalResult-2RRJvFyB.js";
|
|
44
|
+
import { i as pluginMatchesStrategyTargets, n as loadStrategy, r as validateStrategies, t as Strategies } from "./strategies-CiSeroPH.js";
|
|
31
45
|
import * as fs$2 from "fs";
|
|
32
46
|
import fs, { createWriteStream } from "fs";
|
|
33
47
|
import * as path$2 from "path";
|
|
@@ -35,29 +49,30 @@ import path, { parse } from "path";
|
|
|
35
49
|
import async from "async";
|
|
36
50
|
import yaml from "js-yaml";
|
|
37
51
|
import { AsyncResource } from "node:async_hooks";
|
|
38
|
-
import { resolve } from "node:path";
|
|
39
|
-
import { fileURLToPath } from "node:url";
|
|
40
52
|
import chalk from "chalk";
|
|
41
53
|
import * as os$1 from "os";
|
|
42
54
|
import os from "os";
|
|
43
|
-
import util from "util";
|
|
44
55
|
import dedent from "dedent";
|
|
45
|
-
import * as fsPromises from "fs/promises";
|
|
46
|
-
import { globSync } from "glob";
|
|
47
56
|
import { z } from "zod";
|
|
48
|
-
import
|
|
49
|
-
import
|
|
57
|
+
import * as fsPromises from "fs/promises";
|
|
58
|
+
import util from "util";
|
|
59
|
+
import input from "@inquirer/input";
|
|
60
|
+
import { resolve } from "node:path";
|
|
61
|
+
import { fileURLToPath } from "node:url";
|
|
50
62
|
import crypto$1, { createHash, randomBytes } from "crypto";
|
|
51
63
|
import { DiagConsoleLogger, DiagLogLevel, diag, propagation } from "@opentelemetry/api";
|
|
52
|
-
import input from "@inquirer/input";
|
|
53
64
|
import readline from "readline";
|
|
65
|
+
import { parse as parse$1 } from "csv-parse/sync";
|
|
66
|
+
import { globSync } from "glob";
|
|
67
|
+
import { XMLParser } from "fast-xml-parser";
|
|
54
68
|
import { and, desc, eq, inArray, sql } from "drizzle-orm";
|
|
55
|
-
import cliProgress from "cli-progress";
|
|
56
69
|
import { URL } from "url";
|
|
57
|
-
import {
|
|
70
|
+
import { parse as parse$2 } from "parse5";
|
|
58
71
|
import { distance } from "fastest-levenshtein";
|
|
72
|
+
import cliProgress from "cli-progress";
|
|
59
73
|
import * as rouge from "js-rouge";
|
|
60
74
|
import { isDeepStrictEqual } from "node:util";
|
|
75
|
+
import { LRUCache } from "lru-cache";
|
|
61
76
|
import "debounce";
|
|
62
77
|
import { ExportResultCode, W3CTraceContextPropagator } from "@opentelemetry/core";
|
|
63
78
|
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
@@ -242,6 +257,505 @@ const handleConversationRelevance = async ({ assertion, outputString, prompt, pr
|
|
|
242
257
|
};
|
|
243
258
|
};
|
|
244
259
|
//#endregion
|
|
260
|
+
//#region src/matchers/classification.ts
|
|
261
|
+
/**
 * Grades `output` with a classification provider.
 *
 * @param expected Class label to look up. When undefined, the highest score
 *   across all returned classes is used instead.
 * @param output Text to classify.
 * @param threshold Minimum score required to pass (compared with a
 *   `Number.EPSILON` tolerance).
 * @param grading Optional grading config; `grading.provider` overrides the
 *   provider used for the classification call.
 * @returns A grading result whose `score` is the selected classification
 *   score, or a failure result when the provider returns no classification.
 */
async function matchesClassification(expected, output, threshold, grading) {
  const classifier = await getAndCheckProvider("classification", grading?.provider, null, "classification check");
  const response = await classifier.callClassificationApi(output);
  if (!response.classification) {
    return fail(response.error || "Unknown error fetching classification");
  }

  const matchAnyClass = expected === undefined;
  let score;
  if (matchAnyClass) {
    const allScores = Object.values(response.classification);
    if (allScores.length === 0) {
      return {
        pass: false,
        score: 0,
        reason: "No classification scores returned"
      };
    }
    score = Math.max(...allScores);
  } else {
    // A class missing from the response counts as a score of 0.
    score = response.classification[expected] || 0;
  }

  const pass = score >= threshold - Number.EPSILON;
  const comparator = pass ? ">=" : "<";
  const subject = matchAnyClass ? "Maximum classification score" : `Classification ${expected} has score`;
  return {
    pass,
    score,
    reason: `${subject} ${score.toFixed(2)} ${comparator} ${threshold}`
  };
}
|
|
296
|
+
//#endregion
|
|
297
|
+
//#region src/matchers/comparison.ts
|
|
298
|
+
/**
 * Asks a grading provider to choose the best of several outputs for the
 * given criteria.
 *
 * The first integer found in the provider's reply is taken as the index of
 * the winning output. One grading result is returned per output, in order.
 *
 * @param criteria Selection criteria rendered into the rubric prompt.
 * @param outputs Candidate outputs (at least two).
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param vars Optional extra variables merged into the prompt variables.
 * @param providerCallContext Passed through to the provider call.
 * @returns An array of grading results, one per output.
 */
async function matchesSelectBest(criteria, outputs, grading, vars, providerCallContext) {
  invariant(outputs.length >= 2, "select-best assertion must have at least two outputs to compare between");

  // Built fresh on every call so the renderer and the provider call each
  // receive their own object, as in the original expression.
  const buildPromptVars = () => ({
    criteria,
    outputs: outputs.map((candidate) => tryParse(candidate)),
    ...vars || {}
  });
  // Every output receives the same failure when no winner can be determined.
  const failEveryOutput = (message, tokenUsage) => Array.from({ length: outputs.length }, () => fail(message, tokenUsage));

  const defaultProviders = await getDefaultProviders();
  const gradingProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "select-best check");
  const rubricTemplate = await loadRubricPrompt(grading?.rubricPrompt, SELECT_BEST_PROMPT);
  const renderedPrompt = await renderLlmRubricPrompt(rubricTemplate, buildPromptVars());
  const resp = await callProviderWithContext(gradingProvider, renderedPrompt, "select-best", buildPromptVars(), providerCallContext);

  if (resp.error || !resp.output) {
    return failEveryOutput(resp.error || "No output", resp.tokenUsage);
  }
  invariant(typeof resp.output === "string", "select-best produced malformed response");

  const digits = resp.output.trim().match(/\d+/);
  const verdict = digits ? Number.parseInt(digits[0], 10) : NaN;
  const verdictInRange = !Number.isNaN(verdict) && verdict >= 0 && verdict < outputs.length;
  if (!verdictInRange) {
    return failEveryOutput(`Invalid select-best verdict: ${verdict}`, resp.tokenUsage);
  }

  const tokensUsed = normalizeMatcherTokenUsage(resp.tokenUsage);
  return outputs.map((_output, index) => {
    const selected = index === verdict;
    return {
      pass: selected,
      score: selected ? 1 : 0,
      reason: selected ? `Output selected as the best: ${criteria}` : `Output not selected: ${criteria}`,
      tokensUsed
    };
  });
}
|
|
330
|
+
/**
 * Resolves a `max-score` assertion: aggregates the scores of every other
 * assertion on each output and marks the highest-scoring output as the
 * winner.
 *
 * `assertion.value` may be an object with:
 *  - `method`: `"sum"` for a weighted sum; anything else yields a weighted
 *    average (the default).
 *  - `weights`: map from assertion type to weight (a missing entry means 1).
 *  - `threshold`: minimum aggregate score the winner must reach.
 *
 * @param outputs Candidate outputs (at least two).
 * @param resultsWithGradingResults Per-output results; each one's
 *   `gradingResult.componentResults` supplies the scores to aggregate.
 * @param assertion The `max-score` assertion being resolved.
 * @returns One grading result per output; only the winner passes, and only
 *   if it meets the threshold. Ties go to the earliest output.
 * @throws Error if an output has no assertion other than `max-score` /
 *   `select-best` to aggregate from.
 */
async function selectMaxScore(outputs, resultsWithGradingResults, assertion) {
  invariant(outputs.length >= 2, "max-score assertion must have at least two outputs to compare between");
  const value = assertion.value || {};
  const isObjectValue = typeof value === "object";
  const options = {
    method: isObjectValue && "method" in value ? value.method : "average",
    // A `weights` key that is present but null/undefined (e.g. an empty
    // `weights:` entry in YAML) would otherwise crash on the per-assertion
    // lookup below; treat it the same as "no weights configured".
    weights: (isObjectValue && "weights" in value ? value.weights : void 0) ?? {},
    threshold: isObjectValue && "threshold" in value ? value.threshold : void 0
  };

  const scores = resultsWithGradingResults.map((result, index) => {
    const componentResults = result.gradingResult?.componentResults || [];
    // The comparison assertions themselves never contribute to the aggregate.
    const relevantResults = componentResults.filter((r) => r.assertion && r.assertion.type !== "max-score" && r.assertion.type !== "select-best");
    if (relevantResults.length === 0) {
      throw new Error("max-score requires at least one other assertion (besides max-score or select-best) to aggregate scores from");
    }

    let totalWeightedScore = 0;
    let totalWeight = 0;
    for (const componentResult of relevantResults) {
      const assertionType = componentResult.assertion?.type || "unknown";
      const configuredWeight = options.weights[assertionType];
      const weight = configuredWeight === void 0 ? 1 : configuredWeight;
      const score = componentResult.score || 0;
      totalWeightedScore += score * weight;
      totalWeight += weight;
    }

    let aggregateScore;
    if (options.method === "sum") {
      aggregateScore = totalWeightedScore;
    } else {
      aggregateScore = totalWeight > 0 ? totalWeightedScore / totalWeight : 0;
    }
    return {
      index,
      score: aggregateScore,
      componentCount: relevantResults.length,
      totalWeight
    };
  });

  // Strict ">" keeps the earliest output as the winner on ties.
  let maxScore = -Infinity;
  let winnerIndex = 0;
  for (let i = 0; i < scores.length; i++) {
    if (scores[i].score > maxScore) {
      maxScore = scores[i].score;
      winnerIndex = i;
    }
  }
  const meetsThreshold = options.threshold === void 0 || maxScore >= options.threshold;

  return scores.map(({ index, score, componentCount, totalWeight }) => {
    const isWinner = index === winnerIndex && meetsThreshold;
    let reason;
    if (isWinner) {
      reason = `Selected as highest scoring output (score: ${score.toFixed(3)})`;
    } else if (score === maxScore && !meetsThreshold) {
      reason = `Not selected - score ${score.toFixed(3)} below threshold ${options.threshold}`;
    } else {
      reason = `Not selected (score: ${score.toFixed(3)}, max: ${maxScore.toFixed(3)})`;
    }
    return {
      pass: isWinner,
      score: isWinner ? 1 : 0,
      reason,
      namedScores: {
        maxScore: score,
        assertionCount: componentCount,
        totalWeight
      }
    };
  });
}
|
|
381
|
+
//#endregion
|
|
382
|
+
//#region src/matchers/moderation.ts
|
|
383
|
+
/**
 * Runs a moderation check on an assistant response.
 *
 * @param params.userPrompt Prompt sent alongside the response to the
 *   moderation API.
 * @param params.assistantResponse Response to moderate; an empty response
 *   passes without calling any provider.
 * @param params.categories Flag codes to consider. When empty, any flag
 *   returned by the provider fails the check.
 * @param grading Optional grading config; `grading.provider` overrides the
 *   moderation provider.
 * @returns A grading result: score 1 when nothing relevant is flagged,
 *   score 0 on API error or when a relevant flag is present.
 */
async function matchesModeration({ userPrompt, assistantResponse, categories = [] }, grading) {
  const passWith = (reason) => ({
    pass: true,
    score: 1,
    reason
  });
  const failWith = (reason) => ({
    pass: false,
    score: 0,
    reason
  });

  if (!assistantResponse) {
    return passWith("No output to moderate");
  }

  const defaultProviders = await getDefaultProviders();
  // Without an OpenAI key but with Replicate credentials, fall back to the
  // LlamaGuard provider on Replicate instead of the configured default.
  let defaultModerationProvider;
  if (!getEnvString("OPENAI_API_KEY") && (getEnvString("REPLICATE_API_KEY") || getEnvString("REPLICATE_API_TOKEN"))) {
    defaultModerationProvider = await loadApiProvider(LLAMA_GUARD_REPLICATE_PROVIDER);
  } else {
    defaultModerationProvider = defaultProviders.moderationProvider;
  }

  const moderationProvider = await getAndCheckProvider("moderation", grading?.provider, defaultModerationProvider, "moderation check");
  invariant(moderationProvider, "Moderation provider must be defined");

  const resp = await moderationProvider.callModerationApi(userPrompt, assistantResponse);
  if (resp.error) {
    return failWith(`Moderation API error: ${resp.error}`);
  }

  const { flags } = resp;
  if (!flags || flags.length === 0) {
    return passWith("No moderation flags detected");
  }

  const relevantFlags = categories.length === 0 ? flags : flags.filter((flag) => categories.includes(flag.code));
  if (relevantFlags.length === 0) {
    return passWith("No relevant moderation flags detected");
  }
  const descriptions = relevantFlags.map((flag) => flag.description).join(", ");
  return failWith(`Moderation flags detected: ${descriptions}`);
}
|
|
417
|
+
//#endregion
|
|
418
|
+
//#region src/assertions/contextUtils.ts
|
|
419
|
+
/**
 * Resolves the context value for context-based assertions.
 *
 * Resolution order:
 *  1. `test.vars.context`, when it is a string or an array of strings.
 *  2. Otherwise `fallbackContext`, but only when `test.vars.context` is
 *     absent/falsy.
 *  3. If `assertion.contextTransform` is set, its result replaces whatever
 *     was resolved above.
 *
 * @param assertion - The assertion configuration
 * @param test - The test case
 * @param output - The provider output (after provider transform, before test transform)
 * @param prompt - The prompt text
 * @param fallbackContext - Optional fallback context (e.g., prompt for context-recall)
 * @param providerResponse - Optional full provider response for contextTransform
 * @returns The resolved context: a non-empty string or a non-empty array of
 *   non-empty strings
 * @throws Error if the context cannot be resolved, is empty, or the transform
 *   fails. A transform failure carries the underlying error as `cause`.
 */
async function resolveContext(assertion, test, output, prompt, fallbackContext, providerResponse) {
  let contextValue;
  if (test.vars?.context) {
    if (typeof test.vars.context === "string") {
      contextValue = test.vars.context;
    } else if (Array.isArray(test.vars.context)) {
      const invalidEntry = [...test.vars.context.entries()].find(([, v]) => typeof v !== "string");
      if (invalidEntry) {
        const [idx, val] = invalidEntry;
        invariant(false, `Invalid context: expected an array of strings, but found ${typeof val} at index ${idx}`);
      }
      contextValue = test.vars.context;
    }
  } else if (fallbackContext) {
    contextValue = fallbackContext;
  }

  if (assertion.contextTransform) {
    const getLabel = () => getTransformLabel(assertion.contextTransform);
    try {
      // Prefer the provider-transformed output when the response carries one.
      const outputForTransform = providerResponse?.providerTransformedOutput ?? output;
      const transformed = await transform(assertion.contextTransform, outputForTransform, {
        vars: test.vars,
        prompt: { label: prompt },
        ...providerResponse && providerResponse.metadata && { metadata: providerResponse.metadata }
      });
      invariant(typeof transformed === "string" || Array.isArray(transformed) && transformed.every((item) => typeof item === "string"), () => `contextTransform must return a string or array of strings. Got ${typeof transformed}. Check your transform expression: ${getLabel()}`);
      contextValue = transformed;
    } catch (error) {
      // Keep the original error (and its stack) reachable via `cause`; the
      // message itself is unchanged.
      throw new Error(`Failed to transform context using expression '${getLabel()}': ${getTransformErrorMessage(error)}`, { cause: error });
    }
  }

  const isNonEmptyString = typeof contextValue === "string" && contextValue.length > 0;
  const isNonEmptyStringArray = Array.isArray(contextValue) && contextValue.length > 0 && contextValue.every((item) => typeof item === "string" && item.length > 0);
  invariant(isNonEmptyString || isNonEmptyStringArray, "Context is required for context-based assertions. Provide either a \"context\" variable (string or array of strings) in your test case or use \"contextTransform\" to extract context from the provider response.");
  return contextValue;
}
|
|
464
|
+
/**
 * Flattens a context value into the single string used in prompts.
 *
 * An array of chunks is joined with a blank line ("\n\n") between chunks so
 * they stay visually separated; any non-array value is returned unchanged.
 */
function serializeContext(context) {
  if (!Array.isArray(context)) {
    return context;
  }
  return context.join("\n\n");
}
|
|
471
|
+
//#endregion
|
|
472
|
+
//#region src/matchers/rag.ts
|
|
473
|
+
/**
 * Scores how relevant `output` is to the original `input`.
 *
 * The text provider is asked three times to generate a question from the
 * answer. Each generated question is embedded and compared (cosine
 * similarity) against the embedding of `input`; the mean similarity is the
 * score.
 *
 * @param input Original query whose embedding is the comparison baseline.
 * @param output Answer to evaluate.
 * @param threshold Minimum mean similarity required to pass (compared with a
 *   `Number.EPSILON` tolerance).
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param providerCallContext Passed through to the text provider calls.
 * @returns A grading result with accumulated token usage and, on completion,
 *   metadata listing each generated question and its similarity.
 */
async function matchesAnswerRelevance(input, output, threshold, grading, providerCallContext) {
  const QUESTION_COUNT = 3;
  const defaults = await getDefaultProviders();
  const embeddingProvider = await getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "answer relevancy check");
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaults.gradingProvider, "answer relevancy check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, ANSWER_RELEVANCY_GENERATE);
  const parsedOutput = tryParse(output);
  const promptText = await renderLlmRubricPrompt(rubricPrompt, { answer: parsedOutput });

  // Step 1: generate candidate questions, one provider call at a time.
  const candidateQuestions = [];
  while (candidateQuestions.length < QUESTION_COUNT) {
    const generation = await callProviderWithContext(textProvider, promptText, "answer-relevance", { answer: parsedOutput }, providerCallContext);
    accumulateTokenUsage(tokensUsed, generation.tokenUsage);
    if (generation.error || !generation.output) {
      return fail(generation.error || "No output", tokensUsed);
    }
    invariant(typeof generation.output === "string", "answer relevancy check produced malformed response");
    candidateQuestions.push(generation.output);
  }

  // Step 2: embed the original input.
  invariant(typeof embeddingProvider.callEmbeddingApi === "function", `Provider ${embeddingProvider.id()} must implement callEmbeddingApi for similarity check`);
  const inputEmbeddingResp = await embeddingProvider.callEmbeddingApi(input);
  accumulateTokenUsage(tokensUsed, inputEmbeddingResp.tokenUsage);
  if (inputEmbeddingResp.error || !inputEmbeddingResp.embedding) {
    return fail(inputEmbeddingResp.error || "No embedding", tokensUsed);
  }
  const inputEmbedding = inputEmbeddingResp.embedding;

  // Step 3: embed each generated question and compare it with the input.
  const questionsWithScores = [];
  for (const question of candidateQuestions) {
    const questionEmbeddingResp = await embeddingProvider.callEmbeddingApi(question);
    accumulateTokenUsage(tokensUsed, questionEmbeddingResp.tokenUsage);
    if (questionEmbeddingResp.error || !questionEmbeddingResp.embedding) {
      return fail(questionEmbeddingResp.error || "No embedding", tokensUsed);
    }
    questionsWithScores.push({
      question,
      similarity: cosineSimilarity(inputEmbedding, questionEmbeddingResp.embedding)
    });
  }

  const similaritySum = questionsWithScores.reduce((sum, entry) => sum + entry.similarity, 0);
  const averageSimilarity = similaritySum / questionsWithScores.length;
  const pass = averageSimilarity >= threshold - Number.EPSILON;
  const reason = pass ? `Relevance ${averageSimilarity.toFixed(2)} is greater than threshold ${threshold}` : `Relevance ${averageSimilarity.toFixed(2)} is less than threshold ${threshold}`;
  return {
    pass,
    score: averageSimilarity,
    reason,
    tokensUsed,
    metadata: {
      generatedQuestions: questionsWithScores,
      averageSimilarity,
      threshold
    }
  };
}
|
|
531
|
+
/**
 * Measures how much of the ground truth is attributable to the context.
 *
 * The grading provider's reply is split into sentences; only sentences that
 * contain the attributed or not-attributed token are counted. The score is
 * the fraction of those sentences that carry the attributed token and not
 * the not-attributed token (0 when none are counted).
 *
 * @param context Context string or array of chunks (serialized to one string).
 * @param groundTruth Expected answer to attribute against the context.
 * @param threshold Minimum recall required to pass (compared with a
 *   `Number.EPSILON` tolerance).
 * @param grading Optional grading config (`provider`, `rubricPrompt`).
 * @param vars Optional extra variables merged into the prompt variables.
 * @param providerCallContext Passed through to the provider call.
 * @returns A grading result with per-sentence attribution metadata.
 */
async function matchesContextRecall(context, groundTruth, threshold, grading, vars, providerCallContext) {
  const defaultProviders = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context recall check");
  const contextString = serializeContext(context);
  const rubricTemplate = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RECALL);
  const renderedPrompt = await renderLlmRubricPrompt(rubricTemplate, {
    context: contextString,
    groundTruth,
    ...vars || {}
  });
  const resp = await callProviderWithContext(textProvider, renderedPrompt, "context-recall", {
    context: contextString,
    groundTruth,
    ...vars || {}
  }, providerCallContext);
  if (resp.error || !resp.output) {
    return fail(resp.error || "No output", resp.tokenUsage);
  }
  invariant(typeof resp.output === "string", "context-recall produced malformed response");

  // Token matching is case-insensitive.
  const attributedToken = CONTEXT_RECALL_ATTRIBUTED_TOKEN.toLowerCase();
  const notAttributedToken = CONTEXT_RECALL_NOT_ATTRIBUTED_TOKEN.toLowerCase();
  const sentences = splitIntoSentences(resp.output).filter((line) => {
    const lowered = line.toLowerCase();
    return lowered.includes(attributedToken) || lowered.includes(notAttributedToken);
  });

  const sentenceAttributions = sentences.map((sentence) => {
    const lowered = sentence.toLowerCase();
    // The not-attributed token is checked first and always wins.
    const attributed = !lowered.includes(notAttributedToken) && lowered.includes(attributedToken);
    // Strip a leading "N." list marker when present; otherwise keep the text
    // up to the first period.
    const numbered = sentence.match(/^\d+\.\s*([^\.]+\.)/);
    const text = numbered ? numbered[1].trim() : sentence.split(".")[0].trim();
    return {
      sentence: text,
      attributed
    };
  });
  const attributedCount = sentenceAttributions.filter((entry) => entry.attributed).length;

  const score = sentences.length > 0 ? attributedCount / sentences.length : 0;
  const pass = score >= threshold - Number.EPSILON;
  const comparator = pass ? ">=" : "<";
  return {
    pass,
    score,
    reason: `Recall ${score.toFixed(2)} is ${comparator} ${threshold}`,
    tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata: {
      sentenceAttributions,
      totalSentences: sentences.length,
      attributedSentences: attributedCount,
      score
    }
  };
}
|
|
580
|
+
/**
 * Grade how much of the supplied context is relevant to the question.
 *
 * The grading provider is asked to extract relevant sentences; the score is
 * the number of unique extracted sentences (capped at the number of context
 * units) divided by the number of context units. If the response contains
 * `CONTEXT_RELEVANCE_BAD`, nothing is counted as relevant.
 *
 * @param question - Query exposed to the rubric as `query`.
 * @param context - Either an array of chunks (blank chunks are ignored) or a single
 *   value that is split into sentences with `splitIntoSentences`.
 * @param threshold - Minimum score needed to pass (compared with a `Number.EPSILON` tolerance).
 * @param grading - Optional grading config; `provider` and `rubricPrompt` are read.
 * @param providerCallContext - Forwarded unchanged to `callProviderWithContext`.
 * @returns An object with `pass`, `score`, `reason`, `tokensUsed` and `metadata`,
 *   or the result of `fail(...)` when the provider reports an error or no output.
 */
async function matchesContextRelevance(question, context, threshold, grading, providerCallContext) {
  const defaultProviders = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "context relevance check");
  const contextString = serializeContext(context);

  // A fresh variables object is built for each consumer, as the original did.
  const buildPromptVars = () => ({
    context: contextString,
    query: question
  });

  const rubricPrompt = await loadRubricPrompt(grading?.rubricPrompt, CONTEXT_RELEVANCE);
  const renderedPrompt = await renderLlmRubricPrompt(rubricPrompt, buildPromptVars());
  const resp = await callProviderWithContext(textProvider, renderedPrompt, "context-relevance", buildPromptVars(), providerCallContext);
  if (resp.error || !resp.output) {
    return fail(resp.error || "No output", resp.tokenUsage);
  }
  invariant(typeof resp.output === "string", "context-relevance produced malformed response");

  let contextUnits;
  if (Array.isArray(context)) {
    contextUnits = context.filter((chunk) => chunk.trim().length > 0);
  } else {
    contextUnits = splitIntoSentences(context);
  }
  const totalContextUnits = contextUnits.length;

  const extractedSentences = splitIntoSentences(resp.output);
  const insufficientInformation = resp.output.includes(CONTEXT_RELEVANCE_BAD);

  // Duplicates are collapsed, and the count can never exceed the number of context units.
  const relevantSentences = insufficientInformation ? [] : [...new Set(extractedSentences)];
  const numerator = insufficientInformation ? 0 : Math.min(relevantSentences.length, totalContextUnits);

  const score = totalContextUnits > 0 ? numerator / totalContextUnits : 0;
  const pass = score >= threshold - Number.EPSILON;
  const metadata = {
    extractedSentences: relevantSentences,
    totalContextUnits,
    totalContextSentences: totalContextUnits,
    contextUnits,
    relevantSentenceCount: numerator,
    insufficientInformation,
    score
  };
  const reason = pass ? `Context relevance ${score.toFixed(2)} is >= ${threshold}` : `Context relevance ${score.toFixed(2)} is < ${threshold}`;
  return {
    pass,
    score,
    reason,
    tokensUsed: normalizeMatcherTokenUsage(resp.tokenUsage),
    metadata
  };
}
|
|
623
|
+
/**
 * Grade whether an answer is faithful to the supplied context, in two provider calls.
 *
 * 1. "longform" call: the provider output is split into statements with
 *    `splitIntoSentences`.
 * 2. "nli" call: the provider is given the context and those statements, and
 *    its response is parsed for per-statement verdicts.
 *
 * The score is `1 - (unsupported verdicts / statements)`, clamped to [0, 1].
 * Token usage from both calls is accumulated into a single object.
 *
 * @param query - Question exposed to the longform rubric as `question`.
 * @param output - Answer under test; passed through `tryParse` before prompting.
 * @param context - Context, passed through `serializeContext` for the NLI step.
 * @param threshold - Minimum score needed to pass (compared with a `Number.EPSILON` tolerance).
 * @param grading - Optional grading config; when `rubricPrompt` is set it must be an
 *   array whose first two entries override the longform and NLI prompts.
 * @param vars - Optional extra template variables merged over the built-in ones.
 * @param providerCallContext - Forwarded unchanged to `callProviderWithContext`.
 * @returns An object with `pass`, `score`, `reason` and `tokensUsed`, or the result
 *   of `fail(...)` when either provider call reports an error or no output.
 */
async function matchesContextFaithfulness(query, output, context, threshold, grading, vars, providerCallContext) {
  const defaultProviders = await getDefaultProviders();
  const textProvider = await getAndCheckProvider("text", grading?.provider, defaultProviders.gradingProvider, "faithfulness check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  if (grading?.rubricPrompt) {
    invariant(Array.isArray(grading.rubricPrompt), "rubricPrompt must be an array");
  }

  // A rubric entry may be a plain string or an object carrying the text in `content`.
  const rawPromptAt = (index) => {
    const entry = grading?.rubricPrompt?.[index];
    return typeof entry === "string" ? entry : entry?.content;
  };
  const longformPrompt = await loadRubricPrompt(rawPromptAt(0), CONTEXT_FAITHFULNESS_LONGFORM);
  const nliPrompt = await loadRubricPrompt(rawPromptAt(1), CONTEXT_FAITHFULNESS_NLI_STATEMENTS);

  // Step 1: obtain statements from the longform prompt.
  const buildLongformVars = () => ({
    question: query,
    answer: tryParse(output),
    ...vars || {}
  });
  const longformText = await renderLlmRubricPrompt(longformPrompt, buildLongformVars());
  const longformResp = await callProviderWithContext(textProvider, longformText, "context-faithfulness-longform", buildLongformVars(), providerCallContext);
  accumulateTokenUsage(tokensUsed, longformResp.tokenUsage);
  if (longformResp.error || !longformResp.output) {
    return fail(longformResp.error || "No output", tokensUsed);
  }
  invariant(typeof longformResp.output === "string", "context-faithfulness produced malformed response");

  // Step 2: ask for a verdict on each statement against the context.
  const contextString = serializeContext(context);
  const statements = splitIntoSentences(longformResp.output);
  const buildNliVars = () => ({
    context: contextString,
    statements,
    ...vars || {}
  });
  const nliText = await renderLlmRubricPrompt(nliPrompt, buildNliVars());
  const nliResp = await callProviderWithContext(textProvider, nliText, "context-faithfulness-nli", buildNliVars(), providerCallContext);
  accumulateTokenUsage(tokensUsed, nliResp.tokenUsage);
  if (nliResp.error || !nliResp.output) {
    return fail(nliResp.error || "No output", tokensUsed);
  }
  invariant(typeof nliResp.output === "string", "context-faithfulness produced malformed response");

  const finalAnswer = "Final verdict for each statement in order:".toLowerCase();
  const verdicts = nliResp.output.toLowerCase().trim();
  let score = 0;
  if (statements.length > 0) {
    const markerIndex = verdicts.indexOf(finalAnswer);
    if (markerIndex !== -1) {
      // Summary form: period-separated verdicts follow the marker line.
      const parsedVerdicts = verdicts
        .slice(markerIndex + finalAnswer.length)
        .split(".")
        .filter((answer) => answer.trim() !== "");
      if (parsedVerdicts.length > 0) {
        const unsupported = parsedVerdicts.filter((answer) => !answer.includes("yes")).length;
        score = 1 - unsupported / statements.length;
      }
    } else {
      // Inline form: count the "verdict: yes" / "verdict: no" occurrences.
      const noVerdictCount = verdicts.split("verdict: no").length - 1;
      const yesVerdictCount = verdicts.split("verdict: yes").length - 1;
      if (noVerdictCount + yesVerdictCount > 0) {
        score = 1 - noVerdictCount / statements.length;
      }
    }
  }
  // More verdicts than statements could push the raw score below zero.
  score = Math.min(1, Math.max(0, score));

  const pass = score >= threshold - Number.EPSILON;
  const reason = pass ? `Faithfulness ${score.toFixed(2)} is >= ${threshold}` : `Faithfulness ${score.toFixed(2)} is < ${threshold}`;
  return {
    pass,
    score,
    reason,
    tokensUsed
  };
}
|
|
680
|
+
//#endregion
|
|
681
|
+
//#region src/matchers/similarity.ts
|
|
682
|
+
/**
 * Compare two embeddings using the requested metric.
 *
 * @param expectedEmbedding - Embedding of the expected text.
 * @param outputEmbedding - Embedding of the produced text.
 * @param metric - One of "cosine", "dot_product" or "euclidean".
 * @param tokensUsed - Token usage attached to the failure result for unsupported metrics.
 * @returns The value computed by the matching metric helper, or the result of
 *   `fail(...)` when the metric is not supported.
 */
function calculateSimilarityScore(expectedEmbedding, outputEmbedding, metric, tokensUsed) {
  if (metric === "cosine") {
    return cosineSimilarity(expectedEmbedding, outputEmbedding);
  }
  if (metric === "dot_product") {
    return dotProduct(expectedEmbedding, outputEmbedding);
  }
  if (metric === "euclidean") {
    return euclideanDistance(expectedEmbedding, outputEmbedding);
  }
  return fail(`Unsupported metric: ${metric}`, tokensUsed);
}
|
|
690
|
+
/**
 * Turn a raw similarity (or distance) value into a grading result.
 *
 * For the "euclidean" metric the value is treated as a distance: lower is
 * better, and the reported score is `1 / (1 + distance)`. For every other
 * metric the value is treated as a similarity: higher is better, and the
 * value itself is the score. When `inverse` is truthy the pass condition is
 * flipped and the score becomes `1 - score`.
 *
 * @param similarity - Similarity value, or distance for the euclidean metric.
 * @param threshold - Threshold compared with a `Number.EPSILON` tolerance.
 * @param inverse - Whether the assertion is negated.
 * @param metric - Metric name; only "euclidean" is handled specially.
 * @param tokensUsed - Token usage passed through to the result.
 * @returns An object with `pass`, `score`, `reason` and `tokensUsed`.
 */
function buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed) {
  if (metric === "euclidean") {
    const distance = similarity;
    let pass;
    if (inverse) {
      pass = distance >= threshold - Number.EPSILON;
    } else {
      pass = distance <= threshold + Number.EPSILON;
    }
    const normalizedScore = 1 / (1 + distance);
    const belowThresholdReason = `Distance ${distance.toFixed(2)} is less than or equal to threshold ${threshold}`;
    const aboveThresholdReason = `Distance ${distance.toFixed(2)} is greater than threshold ${threshold}`;
    // The reason always describes the actual comparison, whichever way the pass condition points.
    let reason;
    if (pass) {
      reason = inverse ? aboveThresholdReason : belowThresholdReason;
    } else {
      reason = inverse ? belowThresholdReason : aboveThresholdReason;
    }
    return {
      pass,
      score: inverse ? 1 - normalizedScore : normalizedScore,
      reason,
      tokensUsed
    };
  }

  let pass;
  if (inverse) {
    pass = similarity <= threshold + Number.EPSILON;
  } else {
    pass = similarity >= threshold - Number.EPSILON;
  }
  const greaterThanReason = `Similarity ${similarity.toFixed(2)} is greater than or equal to threshold ${threshold}`;
  const lessThanReason = `Similarity ${similarity.toFixed(2)} is less than threshold ${threshold}`;
  let reason;
  if (pass) {
    reason = inverse ? lessThanReason : greaterThanReason;
  } else {
    reason = inverse ? greaterThanReason : lessThanReason;
  }
  return {
    pass,
    score: inverse ? 1 - similarity : similarity,
    reason,
    tokensUsed
  };
}
|
|
716
|
+
/**
 * Obtain a similarity value for two texts from an embedding-capable provider.
 *
 * For the cosine metric, a provider exposing `callSimilarityApi` is asked for
 * the similarity directly. Otherwise both texts are embedded concurrently via
 * `callEmbeddingApi` and compared with `calculateSimilarityScore`.
 *
 * @param finalProvider - Provider object; inspected for `callSimilarityApi` / `callEmbeddingApi`.
 * @param expected - Expected text.
 * @param output - Produced text.
 * @param metric - Metric name forwarded to `calculateSimilarityScore`.
 * @param tokensUsed - Accumulator that is mutated with the token usage of every call made here.
 * @returns The similarity value on success, or the result of `fail(...)` on a
 *   provider error, a missing/non-finite similarity, or missing embeddings.
 * @throws {Error} When the provider implements neither API.
 */
async function calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed) {
  if (metric === "cosine" && "callSimilarityApi" in finalProvider) {
    const similarityResp = await finalProvider.callSimilarityApi(expected, output);
    accumulateTokenUsage(tokensUsed, similarityResp.tokenUsage);
    const { error, similarity } = similarityResp;
    if (error) {
      return fail(error, tokensUsed);
    }
    if (similarity == null) {
      return fail("Unknown error fetching similarity", tokensUsed);
    }
    if (!Number.isFinite(similarity)) {
      return fail(`Invalid similarity score: ${similarity}`, tokensUsed);
    }
    return similarity;
  }

  const callEmbeddingApi = "callEmbeddingApi" in finalProvider ? finalProvider.callEmbeddingApi : void 0;
  if (typeof callEmbeddingApi !== "function") {
    // A similarity-only provider cannot serve non-cosine metrics.
    if ("callSimilarityApi" in finalProvider) {
      return fail(`Provider ${finalProvider.id()} only supports cosine similarity via callSimilarityApi`, tokensUsed);
    }
    throw new Error("Provider must implement callSimilarityApi or callEmbeddingApi");
  }

  // Both embedding requests are started before either is awaited.
  const embed = (text) => callEmbeddingApi.call(finalProvider, text);
  const [expectedEmbedding, outputEmbedding] = await Promise.all([embed(expected), embed(output)]);

  const mergedUsage = normalizeMatcherTokenUsage(void 0);
  accumulateTokenUsage(mergedUsage, expectedEmbedding.tokenUsage);
  accumulateTokenUsage(mergedUsage, outputEmbedding.tokenUsage);
  accumulateTokenUsage(tokensUsed, mergedUsage);

  if (expectedEmbedding.error || outputEmbedding.error) {
    return fail(expectedEmbedding.error || outputEmbedding.error || "Unknown error fetching embeddings", tokensUsed);
  }
  if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
    return fail("Embedding not found", tokensUsed);
  }
  return calculateSimilarityScore(expectedEmbedding.embedding, outputEmbedding.embedding, metric, tokensUsed);
}
|
|
739
|
+
/**
 * Grade the similarity between an expected text and an output text.
 *
 * When the metric is cosine, `state.config.redteam` is set and
 * `shouldGenerateRemote` agrees, grading is delegated to `doRemoteGrading`;
 * any error thrown there is converted into a `fail(...)` result. Otherwise an
 * embedding provider is resolved and the similarity is computed locally.
 *
 * @param expected - Expected text.
 * @param output - Produced text.
 * @param threshold - Threshold forwarded to the result builder / remote grader.
 * @param inverse - Whether the assertion is negated (default `false`).
 * @param grading - Optional grading config; `provider` is read.
 * @param metric - Similarity metric (default "cosine").
 * @returns A grading result, or the failure result produced along the way.
 */
async function matchesSimilarity(expected, output, threshold, inverse = false, grading, metric = "cosine") {
  const useRemoteGrading = metric === "cosine" && state.config?.redteam && shouldGenerateRemote({ requireEmbeddingProvider: true });
  if (useRemoteGrading) {
    try {
      return await doRemoteGrading({
        task: "similar",
        expected,
        output,
        threshold,
        inverse
      });
    } catch (error) {
      return fail(`Could not perform remote grading: ${error}`);
    }
  }

  const defaults = await getDefaultProviders();
  const finalProvider = await getAndCheckProvider("embedding", grading?.provider, defaults.embeddingProvider, "similarity check");
  const tokensUsed = normalizeMatcherTokenUsage(void 0);
  const similarity = await calculateProviderSimilarity(finalProvider, expected, output, metric, tokensUsed);
  // Anything other than a number is a failure result, and is returned to the caller unchanged.
  if (typeof similarity !== "number") {
    return similarity;
  }
  return buildSimilarityResult(similarity, threshold, inverse, metric, tokensUsed);
}
|
|
758
|
+
//#endregion
|
|
245
759
|
//#region src/tracing/evaluatorTracing.ts
|
|
246
760
|
let otlpReceiverStarted = false;
|
|
247
761
|
const DEFAULT_OTLP_ACCEPT_FORMATS = ["json", "protobuf"];
|
|
@@ -285,7 +799,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
285
799
|
telemetry.record("feature_used", { feature: "tracing" });
|
|
286
800
|
try {
|
|
287
801
|
logger.debug("[EvaluatorTracing] Tracing configuration detected, starting OTLP receiver");
|
|
288
|
-
const { startOTLPReceiver } = await import("./otlpReceiver-
|
|
802
|
+
const { startOTLPReceiver } = await import("./otlpReceiver-BXjcRqAM.js");
|
|
289
803
|
const port = testSuite.tracing.otlp.http.port || 4318;
|
|
290
804
|
const host = testSuite.tracing.otlp.http.host || "127.0.0.1";
|
|
291
805
|
const acceptFormats = normalizeOtlpAcceptFormats(testSuite.tracing.otlp.http.acceptFormats);
|
|
@@ -309,7 +823,7 @@ async function startOtlpReceiverIfNeeded(testSuite) {
|
|
|
309
823
|
async function stopOtlpReceiverIfNeeded() {
|
|
310
824
|
if (otlpReceiverStarted) try {
|
|
311
825
|
logger.debug("[EvaluatorTracing] Stopping OTLP receiver");
|
|
312
|
-
const { stopOTLPReceiver } = await import("./otlpReceiver-
|
|
826
|
+
const { stopOTLPReceiver } = await import("./otlpReceiver-BXjcRqAM.js");
|
|
313
827
|
await stopOTLPReceiver();
|
|
314
828
|
otlpReceiverStarted = false;
|
|
315
829
|
logger.info("[EvaluatorTracing] OTLP receiver stopped successfully");
|
|
@@ -344,7 +858,7 @@ async function generateTraceContextIfNeeded(test, evaluateOptions, testIdx, prom
|
|
|
344
858
|
}
|
|
345
859
|
if (!tracingEnabled) return null;
|
|
346
860
|
logger.debug("[EvaluatorTracing] Importing trace store");
|
|
347
|
-
const { getTraceStore } = await import("./store-
|
|
861
|
+
const { getTraceStore } = await import("./store-HpopRVzl.js").then((n) => n.n);
|
|
348
862
|
const traceStore = getTraceStore();
|
|
349
863
|
const traceId = generateTraceId();
|
|
350
864
|
const spanId = generateSpanId();
|
|
@@ -654,38 +1168,84 @@ async function handleClassifier({ assertion, renderedValue, outputString, test,
|
|
|
654
1168
|
}
|
|
655
1169
|
//#endregion
|
|
656
1170
|
//#region src/assertions/contains.ts
|
|
1171
|
+
/**
 * Move past any run of whitespace and commas that separates two fields.
 *
 * Repeated commas are consumed together, so they never yield empty fields.
 * Whitespace before each comma is consumed, as is whitespace in front of the
 * next field (while the index is still inside the string).
 *
 * @param value - The string being parsed.
 * @param startIndex - Index to start scanning from.
 * @returns Index of the first character that is neither whitespace nor a
 *   separating comma, or a position at/after the end of the string.
 */
function skipWhitespaceAndCommas(value, startIndex) {
  let cursor = startIndex;
  while (cursor < value.length) {
    cursor = skipWhitespace(value, cursor);
    if (value[cursor] === ",") {
      cursor += 1;
      continue;
    }
    break;
  }
  return cursor;
}
|
|
1186
|
+
/**
 * Move past consecutive whitespace characters, leaving commas for the caller.
 *
 * @param value - The string being parsed.
 * @param startIndex - Index to start scanning from.
 * @returns Index of the first non-whitespace character at or after
 *   `startIndex`, or `startIndex` itself when it is already past the end.
 */
function skipWhitespace(value, startIndex) {
  let cursor = startIndex;
  for (; cursor < value.length; cursor += 1) {
    if (!/\s/.test(value[cursor])) {
      break;
    }
  }
  return cursor;
}
|
|
1194
|
+
/**
 * Read one double-quoted field, applying the parser's CSV-like escapes.
 *
 * Inside the quotes, `\"` and `\\` produce the escaped character, and a
 * doubled quote (`""`) produces a single quote. The field ends at the first
 * unescaped quote; reaching the end of the string without one is rejected via
 * `invariant` so malformed values are not silently accepted.
 *
 * @param value - The string being parsed.
 * @param startIndex - Index of the opening quote.
 * @returns `{ field, nextIndex }`, where `nextIndex` is the position just
 *   after the closing quote.
 */
function parseQuotedField(value, startIndex) {
  const QUOTE = "\"";
  const BACKSLASH = "\\";
  let cursor = startIndex + 1; // step over the opening quote
  let field = "";
  let terminated = false;

  while (cursor < value.length && !terminated) {
    const current = value[cursor];
    const hasNext = cursor + 1 < value.length;
    const next = value[cursor + 1];

    if (current === BACKSLASH && hasNext && (next === QUOTE || next === BACKSLASH)) {
      // Backslash escape: keep only the escaped character.
      field += next;
      cursor += 2;
    } else if (current === QUOTE && hasNext && next === QUOTE) {
      // Doubled quote: a literal quote inside the field.
      field += QUOTE;
      cursor += 2;
    } else if (current === QUOTE) {
      // Closing quote.
      cursor += 1;
      terminated = true;
    } else {
      field += current;
      cursor += 1;
    }
  }

  invariant(terminated, "Unterminated quoted field in contains assertion value");
  return {
    field,
    nextIndex: cursor
  };
}
|
|
1224
|
+
/**
 * Read one unquoted field: everything up to the next comma (or the end of
 * the string), with surrounding whitespace trimmed.
 *
 * @param value - The string being parsed.
 * @param startIndex - Index of the first character of the field.
 * @returns `{ field, nextIndex }`, where `nextIndex` is the index of the
 *   terminating comma, or the end position when no comma follows.
 */
function parseUnquotedField(value, startIndex) {
  let end = startIndex;
  for (; end < value.length; end += 1) {
    if (value[end] === ",") {
      break;
    }
  }
  const field = value.substring(startIndex, end).trim();
  return {
    field,
    nextIndex: end
  };
}
|
|
1235
|
+
/**
|
|
1236
|
+
* Split a contains-any string into fields while preserving quoted commas.
|
|
1237
|
+
*/
|
|
657
1238
|
function parseCommaSeparatedValues(value) {
|
|
658
1239
|
const results = [];
|
|
659
1240
|
let i = 0;
|
|
660
1241
|
while (i < value.length) {
|
|
661
|
-
|
|
1242
|
+
i = skipWhitespaceAndCommas(value, i);
|
|
662
1243
|
if (i >= value.length) break;
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
i++;
|
|
669
|
-
let field = "";
|
|
670
|
-
while (i < value.length) if (value[i] === "\\" && i + 1 < value.length && (value[i + 1] === "\"" || value[i + 1] === "\\")) {
|
|
671
|
-
field += value[i + 1];
|
|
672
|
-
i += 2;
|
|
673
|
-
} else if (value[i] === "\"" && i + 1 < value.length && value[i + 1] === "\"") {
|
|
674
|
-
field += "\"";
|
|
675
|
-
i += 2;
|
|
676
|
-
} else if (value[i] === "\"") {
|
|
677
|
-
i++;
|
|
678
|
-
break;
|
|
679
|
-
} else {
|
|
680
|
-
field += value[i];
|
|
681
|
-
i++;
|
|
682
|
-
}
|
|
683
|
-
results.push(field);
|
|
684
|
-
} else {
|
|
685
|
-
const start = i;
|
|
686
|
-
while (i < value.length && value[i] !== ",") i++;
|
|
687
|
-
results.push(value.substring(start, i).trim());
|
|
688
|
-
}
|
|
1244
|
+
const isQuotedField = value[i] === "\"";
|
|
1245
|
+
const parsed = isQuotedField ? parseQuotedField(value, i) : parseUnquotedField(value, i);
|
|
1246
|
+
results.push(parsed.field);
|
|
1247
|
+
i = isQuotedField ? skipWhitespace(value, parsed.nextIndex) : parsed.nextIndex;
|
|
1248
|
+
invariant(!isQuotedField || i >= value.length || value[i] === ",", "Expected comma after quoted field in contains assertion value");
|
|
689
1249
|
}
|
|
690
1250
|
return results;
|
|
691
1251
|
}
|
|
@@ -930,27 +1490,67 @@ const handleIsValidFunctionCall = ({ assertion, output, provider, test }) => {
|
|
|
930
1490
|
};
|
|
931
1491
|
//#endregion
|
|
932
1492
|
//#region src/assertions/geval.ts
|
|
933
|
-
const handleGEval = async ({ assertion, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
934
|
-
invariant(typeof renderedValue === "string" || Array.isArray(renderedValue), "G-Eval assertion type must have a string or array of strings value");
|
|
1493
|
+
const handleGEval = async ({ assertion, inverse, renderedValue, prompt, outputString, test, providerCallContext }) => {
|
|
1494
|
+
invariant(typeof renderedValue === "string" || Array.isArray(renderedValue) && renderedValue.every((value) => typeof value === "string"), "G-Eval assertion type must have a string or array of strings value");
|
|
935
1495
|
const threshold = assertion.threshold ?? .7;
|
|
936
1496
|
if (Array.isArray(renderedValue)) {
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
1497
|
+
if (renderedValue.length === 0) return {
|
|
1498
|
+
assertion,
|
|
1499
|
+
pass: false,
|
|
1500
|
+
score: 0,
|
|
1501
|
+
reason: "G-Eval assertion requires at least one criterion string in the value array."
|
|
1502
|
+
};
|
|
1503
|
+
const responses = [];
|
|
1504
|
+
let failure;
|
|
1505
|
+
for (const [index, value] of renderedValue.entries()) {
|
|
940
1506
|
const resp = await matchesGEval(value, prompt || "", outputString, threshold, test.options, providerCallContext);
|
|
941
|
-
|
|
942
|
-
|
|
1507
|
+
responses.push(resp);
|
|
1508
|
+
if (isGraderFailure(resp)) {
|
|
1509
|
+
failure = {
|
|
1510
|
+
index,
|
|
1511
|
+
resp
|
|
1512
|
+
};
|
|
1513
|
+
break;
|
|
1514
|
+
}
|
|
1515
|
+
}
|
|
1516
|
+
const tokensUsed = createEmptyTokenUsage();
|
|
1517
|
+
for (const r of responses) accumulateTokenUsage(tokensUsed, r.tokensUsed);
|
|
1518
|
+
if (failure) {
|
|
1519
|
+
const criterion = renderedValue[failure.index];
|
|
1520
|
+
return {
|
|
1521
|
+
assertion,
|
|
1522
|
+
pass: false,
|
|
1523
|
+
score: 0,
|
|
1524
|
+
reason: `G-Eval criterion ${failure.index + 1}/${renderedValue.length} (${JSON.stringify(criterion)}) failed: ${failure.resp.reason}`,
|
|
1525
|
+
tokensUsed,
|
|
1526
|
+
metadata: failure.resp.metadata
|
|
1527
|
+
};
|
|
943
1528
|
}
|
|
944
|
-
const
|
|
1529
|
+
const averageScore = responses.reduce((acc, r) => acc + r.score, 0) / responses.length;
|
|
1530
|
+
const combinedReason = responses.map((r) => r.reason).join("\n\n");
|
|
945
1531
|
return {
|
|
946
1532
|
assertion,
|
|
947
|
-
pass:
|
|
948
|
-
score:
|
|
949
|
-
reason:
|
|
1533
|
+
pass: averageScore >= threshold !== inverse,
|
|
1534
|
+
score: inverse ? 1 - averageScore : averageScore,
|
|
1535
|
+
reason: combinedReason,
|
|
1536
|
+
tokensUsed
|
|
950
1537
|
};
|
|
951
|
-
}
|
|
1538
|
+
}
|
|
1539
|
+
const resp = await matchesGEval(renderedValue, prompt || "", outputString, threshold, test.options, providerCallContext);
|
|
1540
|
+
if (isGraderFailure(resp)) return {
|
|
1541
|
+
assertion,
|
|
1542
|
+
pass: false,
|
|
1543
|
+
score: 0,
|
|
1544
|
+
reason: resp.reason,
|
|
1545
|
+
tokensUsed: resp.tokensUsed,
|
|
1546
|
+
metadata: resp.metadata
|
|
1547
|
+
};
|
|
1548
|
+
const passed = resp.score >= threshold !== inverse;
|
|
1549
|
+
return {
|
|
952
1550
|
assertion,
|
|
953
|
-
...
|
|
1551
|
+
...resp,
|
|
1552
|
+
pass: passed,
|
|
1553
|
+
score: inverse ? 1 - resp.score : resp.score
|
|
954
1554
|
};
|
|
955
1555
|
};
|
|
956
1556
|
//#endregion
|
|
@@ -1090,6 +1690,43 @@ const handleGuardrails = async ({ assertion, inverse, providerResponse }) => {
|
|
|
1090
1690
|
};
|
|
1091
1691
|
//#endregion
|
|
1092
1692
|
//#region src/assertions/html.ts
|
|
1693
|
+
/**
 * Patterns that detect a literal opening `<html`, `<head` or `<body` tag in
 * lower-cased input. The lookahead requires whitespace, `>` or `/` right
 * after the tag name, so longer names such as `<header` do not match.
 */
const LITERAL_WRAPPER_PATTERNS = {
  html: /<html(?=[\s>/])/,
  head: /<head(?=[\s>/])/,
  body: /<body(?=[\s>/])/
};
|
|
1698
|
+
/**
 * Tell whether a tag name is one of the document wrapper tags.
 *
 * @param tagName - Tag name to check; the comparison is case-sensitive.
 * @returns `true` for exactly "html", "head" or "body".
 */
function isWrapperTagName(tagName) {
  return ["html", "head", "body"].includes(tagName);
}
|
|
1701
|
+
/**
 * Tell whether a parsed node is a text node.
 *
 * @param node - Node object exposing `nodeName`.
 * @returns `true` when `node.nodeName` is "#text".
 */
function isTextNode(node) {
  const { nodeName } = node;
  return nodeName === "#text";
}
|
|
1704
|
+
/**
 * Tell whether a parsed node is an element, judged by the presence of a
 * `tagName` property (own or inherited).
 *
 * @param node - Node object to inspect.
 * @returns `true` when the node has a `tagName` property.
 */
function isElementNode(node) {
  return Reflect.has(node, "tagName");
}
|
|
1707
|
+
/**
 * Tell whether an element carries source location information.
 *
 * @param element - Element object to inspect.
 * @returns `true` when `sourceCodeLocation` is present and is neither `null`
 *   nor `undefined`.
 */
function hasSourceCodeLocation(element) {
  if (!("sourceCodeLocation" in element)) {
    return false;
  }
  // `!= null` rejects both null and undefined.
  return element.sourceCodeLocation != null;
}
|
|
1710
|
+
/**
 * Return a node's children, tolerating nodes without a `childNodes` property.
 *
 * @param node - Node object to inspect.
 * @returns `node.childNodes` when the property exists, otherwise a new empty array.
 */
function getChildNodes(node) {
  if ("childNodes" in node) {
    return node.childNodes;
  }
  return [];
}
|
|
1713
|
+
/**
 * Find the first element, in document (pre-order, depth-first) order, that
 * satisfies a predicate. The walk uses an explicit stack instead of recursion.
 *
 * @param root - Node to start from; it is itself a candidate.
 * @param predicate - Called with each element node; a truthy result ends the search.
 * @returns The first matching element, or `undefined` when none matches.
 */
function findFirstElement(root, predicate) {
  const pending = [root];
  while (pending.length > 0) {
    const node = pending.pop();
    if (isElementNode(node) && predicate(node)) {
      return node;
    }
    const children = getChildNodes(node);
    // Push in reverse so the first child is popped, and therefore visited, first.
    for (let index = children.length; index-- > 0;) {
      pending.push(children[index]);
    }
  }
  return undefined;
}
|
|
1722
|
+
/**
 * Tell whether a node has a direct child text node with non-blank content.
 *
 * @param parentNode - Node whose `childNodes` are inspected (not its descendants).
 * @returns `true` when at least one direct text child has non-whitespace content.
 */
function hasTopLevelText(parentNode) {
  return parentNode.childNodes.some((child) => {
    if (!isTextNode(child)) {
      return false;
    }
    return child.value.trim() !== "";
  });
}
|
|
1725
|
+
/**
 * Decide whether a parsed element counts as HTML supplied in the input.
 *
 * Wrapper tags (html/head/body) count only when the lower-cased input
 * contains the literal opening tag and the element has source location
 * information. Any other element counts when its tag is in
 * `VALID_HTML_ELEMENTS` or its name contains a hyphen.
 *
 * @param element - Element with a `tagName`.
 * @param inputLowercase - The lower-cased input text.
 * @returns Whether the element should be treated as provided by the input.
 */
function isUserProvidedElement(element, inputLowercase) {
  const tagName = element.tagName.toLowerCase();
  if (!isWrapperTagName(tagName)) {
    return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
  }
  const literalTagPresent = LITERAL_WRAPPER_PATTERNS[tagName].test(inputLowercase);
  return literalTagPresent && hasSourceCodeLocation(element);
}
|
|
1093
1730
|
const HTML_PATTERNS = {
|
|
1094
1731
|
openingTag: /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/,
|
|
1095
1732
|
closingTag: /<\/[a-zA-Z][a-zA-Z0-9-]*\s*>/,
|
|
@@ -1245,37 +1882,21 @@ function validateHtml(htmlString) {
|
|
|
1245
1882
|
isValid: false,
|
|
1246
1883
|
reason: "Output appears to be XML, not HTML"
|
|
1247
1884
|
};
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
].includes(tagName) && !trimmed.toLowerCase().includes(`<${tagName}`)) return false;
|
|
1264
|
-
return VALID_HTML_ELEMENTS.has(tagName) || tagName.includes("-");
|
|
1265
|
-
})) return {
|
|
1266
|
-
isValid: false,
|
|
1267
|
-
reason: "Output does not contain recognized HTML elements"
|
|
1268
|
-
};
|
|
1269
|
-
return {
|
|
1270
|
-
isValid: true,
|
|
1271
|
-
reason: "Output is valid HTML"
|
|
1272
|
-
};
|
|
1273
|
-
} catch (error) {
|
|
1274
|
-
return {
|
|
1275
|
-
isValid: false,
|
|
1276
|
-
reason: `HTML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
1277
|
-
};
|
|
1278
|
-
}
|
|
1885
|
+
const document = parse$2(trimmed, { sourceCodeLocationInfo: true });
|
|
1886
|
+
const inputLowercase = trimmed.toLowerCase();
|
|
1887
|
+
const body = findFirstElement(document, (element) => element.tagName === "body");
|
|
1888
|
+
if (!(body !== void 0 && LITERAL_WRAPPER_PATTERNS.body.test(inputLowercase) && hasSourceCodeLocation(body)) && body && hasTopLevelText(body)) return {
|
|
1889
|
+
isValid: false,
|
|
1890
|
+
reason: "Output must be wrapped in HTML tags"
|
|
1891
|
+
};
|
|
1892
|
+
if (!findFirstElement(document, (element) => isUserProvidedElement(element, inputLowercase))) return {
|
|
1893
|
+
isValid: false,
|
|
1894
|
+
reason: "Output does not contain recognized HTML elements"
|
|
1895
|
+
};
|
|
1896
|
+
return {
|
|
1897
|
+
isValid: true,
|
|
1898
|
+
reason: "Output is valid HTML"
|
|
1899
|
+
};
|
|
1279
1900
|
}
|
|
1280
1901
|
const handleContainsHtml = ({ assertion, outputString, inverse }) => {
|
|
1281
1902
|
const pass = containsHtml(outputString) !== inverse;
|
|
@@ -1932,45 +2553,6 @@ function matchesPattern(spanName, pattern) {
|
|
|
1932
2553
|
}
|
|
1933
2554
|
//#endregion
|
|
1934
2555
|
//#region src/assertions/trajectoryUtils.ts
|
|
1935
|
-
const TOOL_ATTRIBUTE_KEYS = [
|
|
1936
|
-
"tool.name",
|
|
1937
|
-
"tool_name",
|
|
1938
|
-
"tool",
|
|
1939
|
-
"function.name",
|
|
1940
|
-
"function_name",
|
|
1941
|
-
"gen_ai.tool.name",
|
|
1942
|
-
"codex.mcp.tool",
|
|
1943
|
-
"agent.tool",
|
|
1944
|
-
"agent.tool_name",
|
|
1945
|
-
"agent.toolName"
|
|
1946
|
-
];
|
|
1947
|
-
const TOOL_ARGUMENT_ATTRIBUTE_KEYS = [
|
|
1948
|
-
"tool.arguments",
|
|
1949
|
-
"tool.args",
|
|
1950
|
-
"tool.input",
|
|
1951
|
-
"tool_arguments",
|
|
1952
|
-
"tool_args",
|
|
1953
|
-
"tool_input",
|
|
1954
|
-
"function.arguments",
|
|
1955
|
-
"function.args",
|
|
1956
|
-
"function.input",
|
|
1957
|
-
"function_arguments",
|
|
1958
|
-
"function_args",
|
|
1959
|
-
"gen_ai.tool.arguments",
|
|
1960
|
-
"gen_ai.tool.args",
|
|
1961
|
-
"gen_ai.tool.input",
|
|
1962
|
-
"gen_ai.tool.call.arguments",
|
|
1963
|
-
"gen_ai.tool.call.args",
|
|
1964
|
-
"agent.tool.arguments",
|
|
1965
|
-
"agent.tool.args",
|
|
1966
|
-
"agent.tool.input",
|
|
1967
|
-
"codex.mcp.arguments",
|
|
1968
|
-
"codex.mcp.args",
|
|
1969
|
-
"codex.mcp.input",
|
|
1970
|
-
"arguments",
|
|
1971
|
-
"args",
|
|
1972
|
-
"input"
|
|
1973
|
-
];
|
|
1974
2556
|
const COMMAND_ATTRIBUTE_KEYS = [
|
|
1975
2557
|
"codex.command",
|
|
1976
2558
|
"command",
|
|
@@ -1983,16 +2565,15 @@ const SEARCH_ATTRIBUTE_KEYS = [
|
|
|
1983
2565
|
"search_query"
|
|
1984
2566
|
];
|
|
1985
2567
|
const GENERIC_QUERY_ATTRIBUTE_KEYS = ["query"];
|
|
2568
|
+
/** Lower-case tool names that are treated as command-execution tools. */
const COMMAND_TOOL_NAMES = new Set(["exec_command", "local_shell", "shell"]);
|
|
1986
2573
|
const SEARCH_SPAN_NAME_PATTERN = /(^|[\s._:/-])(search|find|lookup|retriev(?:e|al))($|[\s._:/-])/i;
|
|
1987
2574
|
const MAX_JUDGE_SUMMARY_STEPS = 24;
|
|
1988
2575
|
const JUDGE_SUMMARY_HEAD_STEPS = 12;
|
|
1989
2576
|
const JUDGE_SUMMARY_TAIL_STEPS = 12;
|
|
1990
|
-
function getStringAttribute(attributes, keys) {
|
|
1991
|
-
for (const key of keys) {
|
|
1992
|
-
const value = attributes[key];
|
|
1993
|
-
if (typeof value === "string" && value.trim()) return value.trim();
|
|
1994
|
-
}
|
|
1995
|
-
}
|
|
1996
2577
|
function normalizeStructuredAttribute(value) {
|
|
1997
2578
|
if (value === void 0 || value === null) return;
|
|
1998
2579
|
if (typeof value === "string") {
|
|
@@ -2024,9 +2605,12 @@ function getTrajectoryStepStatus(step) {
|
|
|
2024
2605
|
function getCommandExecutable(command) {
|
|
2025
2606
|
return command.trim().split(/\s+/)[0] || void 0;
|
|
2026
2607
|
}
|
|
2608
|
+
/**
 * Tell whether a tool name is one of the known command-execution tools.
 * The name is trimmed and lower-cased before the lookup.
 *
 * @param toolName - Tool name; may be empty or undefined.
 * @returns `true` when the normalized name is in `COMMAND_TOOL_NAMES`;
 *   `false` for a missing or empty name.
 */
function isCommandToolName(toolName) {
  if (!toolName) {
    return false;
  }
  const normalized = toolName.trim().toLowerCase();
  return COMMAND_TOOL_NAMES.has(normalized);
}
|
|
2027
2611
|
function extractToolName(span) {
|
|
2028
2612
|
const attributes = span.attributes || {};
|
|
2029
|
-
const directMatch =
|
|
2613
|
+
const directMatch = getToolNameFromAttributes(attributes);
|
|
2030
2614
|
if (directMatch) return directMatch;
|
|
2031
2615
|
for (const [key, value] of Object.entries(attributes)) {
|
|
2032
2616
|
if (typeof value !== "string" || !value.trim()) continue;
|
|
@@ -2051,21 +2635,31 @@ function extractToolArgs(span) {
|
|
|
2051
2635
|
if (value !== void 0) return value;
|
|
2052
2636
|
}
|
|
2053
2637
|
}
|
|
2054
|
-
function extractCommand(span) {
|
|
2638
|
+
function extractCommand(span, toolName = extractToolName(span), getToolArgs = () => extractToolArgs(span)) {
|
|
2055
2639
|
const attributes = span.attributes || {};
|
|
2056
|
-
const directMatch =
|
|
2640
|
+
const directMatch = getFirstStringAttribute(attributes, COMMAND_ATTRIBUTE_KEYS);
|
|
2057
2641
|
if (directMatch) return directMatch;
|
|
2058
2642
|
for (const [key, value] of Object.entries(attributes)) {
|
|
2059
2643
|
if (typeof value !== "string" || !value.trim()) continue;
|
|
2060
2644
|
if (/command/i.test(key) && !/output|result/i.test(key)) return value.trim();
|
|
2061
2645
|
}
|
|
2646
|
+
const toolArgs = getToolArgs();
|
|
2647
|
+
if (isCommandToolName(toolName) && toolArgs && typeof toolArgs === "object") {
|
|
2648
|
+
const args = toolArgs;
|
|
2649
|
+
const command = args.cmd ?? args.command;
|
|
2650
|
+
if (typeof command === "string" && command.trim()) return command.trim();
|
|
2651
|
+
if (Array.isArray(command)) {
|
|
2652
|
+
const joined = command.map((part) => String(part).trim()).filter(Boolean).join(" ");
|
|
2653
|
+
if (joined) return joined;
|
|
2654
|
+
}
|
|
2655
|
+
}
|
|
2062
2656
|
if (span.name.startsWith("exec ")) return span.name.slice(5).trim();
|
|
2063
2657
|
}
|
|
2064
2658
|
function extractSearchQuery(span) {
|
|
2065
2659
|
const attributes = span.attributes || {};
|
|
2066
|
-
const directMatch =
|
|
2660
|
+
const directMatch = getFirstStringAttribute(attributes, SEARCH_ATTRIBUTE_KEYS);
|
|
2067
2661
|
if (directMatch) return directMatch;
|
|
2068
|
-
const genericQuery =
|
|
2662
|
+
const genericQuery = getFirstStringAttribute(attributes, GENERIC_QUERY_ATTRIBUTE_KEYS);
|
|
2069
2663
|
if (genericQuery && isSearchLikeSpan(span)) return genericQuery;
|
|
2070
2664
|
if (span.name.startsWith("search ")) return span.name.slice(7).replace(/^"|"$/g, "").trim();
|
|
2071
2665
|
}
|
|
@@ -2089,17 +2683,34 @@ function extractTrajectorySteps(trace) {
|
|
|
2089
2683
|
return left.index - right.index;
|
|
2090
2684
|
}).map(({ span }) => {
|
|
2091
2685
|
const toolName = extractToolName(span);
|
|
2092
|
-
|
|
2686
|
+
let toolArgs;
|
|
2687
|
+
let hasExtractedToolArgs = false;
|
|
2688
|
+
const getToolArgs = () => {
|
|
2689
|
+
if (!hasExtractedToolArgs) {
|
|
2690
|
+
toolArgs = extractToolArgs(span);
|
|
2691
|
+
hasExtractedToolArgs = true;
|
|
2692
|
+
}
|
|
2693
|
+
return toolArgs;
|
|
2694
|
+
};
|
|
2695
|
+
const command = extractCommand(span, toolName, getToolArgs);
|
|
2093
2696
|
const searchQuery = extractSearchQuery(span);
|
|
2094
2697
|
let type = "span";
|
|
2095
2698
|
let name = span.name;
|
|
2096
2699
|
const aliases = new Set([span.name]);
|
|
2097
2700
|
let args;
|
|
2098
|
-
if (toolName) {
|
|
2701
|
+
if (command && isCommandToolName(toolName)) {
|
|
2702
|
+
type = "command";
|
|
2703
|
+
name = command;
|
|
2704
|
+
aliases.add(command);
|
|
2705
|
+
args = getToolArgs();
|
|
2706
|
+
if (toolName) aliases.add(toolName);
|
|
2707
|
+
const executable = getCommandExecutable(command);
|
|
2708
|
+
if (executable) aliases.add(executable);
|
|
2709
|
+
} else if (toolName) {
|
|
2099
2710
|
type = "tool";
|
|
2100
2711
|
name = toolName;
|
|
2101
2712
|
aliases.add(toolName);
|
|
2102
|
-
args =
|
|
2713
|
+
args = getToolArgs();
|
|
2103
2714
|
} else if (command) {
|
|
2104
2715
|
type = "command";
|
|
2105
2716
|
name = command;
|
|
@@ -2380,11 +2991,10 @@ function handleRougeScore({ baseType, assertion, renderedValue, outputString, in
|
|
|
2380
2991
|
const rougeMethod = rouge[baseType[baseType.length - 1]];
|
|
2381
2992
|
const score = rougeMethod(outputString, renderedValue, {});
|
|
2382
2993
|
const threshold = assertion.threshold ?? .75;
|
|
2383
|
-
const pass = score >= threshold != inverse;
|
|
2384
2994
|
return {
|
|
2385
|
-
pass,
|
|
2995
|
+
pass: score >= threshold !== inverse,
|
|
2386
2996
|
score: inverse ? 1 - score : score,
|
|
2387
|
-
reason:
|
|
2997
|
+
reason: `${baseType.toUpperCase()} score ${score.toFixed(2)} is ${score >= threshold ? "greater than or equal to" : "less than"} threshold ${threshold}`,
|
|
2388
2998
|
assertion
|
|
2389
2999
|
};
|
|
2390
3000
|
}
|
|
@@ -2446,6 +3056,192 @@ const handleRuby = async ({ assertion, renderedValue, valueFromScript, assertion
|
|
|
2446
3056
|
}
|
|
2447
3057
|
};
|
|
2448
3058
|
//#endregion
|
|
3059
|
+
//#region src/providers/webSearchUtils.ts
|
|
3060
|
+
function hasTool(provider, predicate) {
|
|
3061
|
+
return Array.isArray(provider.config?.tools) && provider.config.tools.some(predicate);
|
|
3062
|
+
}
|
|
3063
|
+
function getProviderId(provider) {
|
|
3064
|
+
if (typeof provider.id !== "function") return null;
|
|
3065
|
+
try {
|
|
3066
|
+
return provider.id();
|
|
3067
|
+
} catch (err) {
|
|
3068
|
+
logger.debug(`Failed to read provider id: ${err}`);
|
|
3069
|
+
return null;
|
|
3070
|
+
}
|
|
3071
|
+
}
|
|
3072
|
+
function isOpenAiResponsesProvider(provider, id) {
|
|
3073
|
+
return id.includes("openai:responses") || provider.constructor?.name === "OpenAiResponsesProvider";
|
|
3074
|
+
}
|
|
3075
|
+
/**
|
|
3076
|
+
* Check if a provider has web search capabilities
|
|
3077
|
+
* @param provider The provider to check
|
|
3078
|
+
* @returns true if the provider supports web search
|
|
3079
|
+
*/
|
|
3080
|
+
function hasWebSearchCapability(provider) {
|
|
3081
|
+
if (!provider) return false;
|
|
3082
|
+
const id = getProviderId(provider);
|
|
3083
|
+
if (!id) return false;
|
|
3084
|
+
if (id.includes("perplexity")) return true;
|
|
3085
|
+
if ((id.includes("google") || id.includes("gemini") || id.includes("vertex")) && hasTool(provider, (t) => t.googleSearch !== void 0)) return true;
|
|
3086
|
+
if (id.includes("xai") && provider.config?.search_parameters?.mode === "on") return true;
|
|
3087
|
+
if (isOpenAiResponsesProvider(provider, id) && hasTool(provider, (t) => t.type === "web_search_preview")) return true;
|
|
3088
|
+
if (id.startsWith("openai:codex") && (provider.config?.web_search_mode === "live" || provider.config?.web_search_mode === "cached" || provider.config?.web_search_enabled === true)) return true;
|
|
3089
|
+
if (id.includes("anthropic") && hasTool(provider, (t) => t.type === "web_search_20250305")) return true;
|
|
3090
|
+
return false;
|
|
3091
|
+
}
|
|
3092
|
+
/**
|
|
3093
|
+
* Load a provider with web search capabilities.
|
|
3094
|
+
* Tries multiple providers in order of preference until one succeeds.
|
|
3095
|
+
* Uses the latest and most capable models from each provider with specific checkpoint IDs.
|
|
3096
|
+
*
|
|
3097
|
+
* @param preferAnthropic Whether to try Anthropic first (true) or OpenAI first (false)
|
|
3098
|
+
* @returns A provider with web search capabilities or null
|
|
3099
|
+
*/
|
|
3100
|
+
async function loadWebSearchProvider(preferAnthropic = false) {
|
|
3101
|
+
const loadAnthropicWebSearch = async () => {
|
|
3102
|
+
try {
|
|
3103
|
+
return await loadApiProvider("anthropic:messages:claude-opus-4-6", { options: { config: { tools: [{
|
|
3104
|
+
type: "web_search_20250305",
|
|
3105
|
+
name: "web_search",
|
|
3106
|
+
max_uses: 5
|
|
3107
|
+
}] } } });
|
|
3108
|
+
} catch (err) {
|
|
3109
|
+
logger.debug(`Failed to load Anthropic web search provider: ${err}`);
|
|
3110
|
+
return null;
|
|
3111
|
+
}
|
|
3112
|
+
};
|
|
3113
|
+
const loadOpenAIWebSearch = async () => {
|
|
3114
|
+
try {
|
|
3115
|
+
return await loadApiProvider("openai:responses:gpt-5.4-2026-03-05", { options: { config: { tools: [{ type: "web_search_preview" }] } } });
|
|
3116
|
+
} catch (err) {
|
|
3117
|
+
logger.debug(`Failed to load OpenAI web search provider: ${err}`);
|
|
3118
|
+
return null;
|
|
3119
|
+
}
|
|
3120
|
+
};
|
|
3121
|
+
const loadPerplexity = async () => {
|
|
3122
|
+
try {
|
|
3123
|
+
return await loadApiProvider("perplexity:sonar-pro");
|
|
3124
|
+
} catch (err) {
|
|
3125
|
+
logger.debug(`Failed to load Perplexity provider: ${err}`);
|
|
3126
|
+
return null;
|
|
3127
|
+
}
|
|
3128
|
+
};
|
|
3129
|
+
const loadGoogleWebSearch = async () => {
|
|
3130
|
+
try {
|
|
3131
|
+
return await loadApiProvider("google:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
3132
|
+
} catch (err) {
|
|
3133
|
+
logger.debug(`Failed to load Google web search provider: ${err}`);
|
|
3134
|
+
return null;
|
|
3135
|
+
}
|
|
3136
|
+
};
|
|
3137
|
+
const loadVertexWebSearch = async () => {
|
|
3138
|
+
try {
|
|
3139
|
+
return await loadApiProvider("vertex:gemini-3-pro-preview", { options: { config: { tools: [{ googleSearch: {} }] } } });
|
|
3140
|
+
} catch (err) {
|
|
3141
|
+
logger.debug(`Failed to load Vertex web search provider: ${err}`);
|
|
3142
|
+
return null;
|
|
3143
|
+
}
|
|
3144
|
+
};
|
|
3145
|
+
const loadXaiWebSearch = async () => {
|
|
3146
|
+
try {
|
|
3147
|
+
return await loadApiProvider("xai:grok-4-1-fast-reasoning", { options: { config: { search_parameters: { mode: "on" } } } });
|
|
3148
|
+
} catch (err) {
|
|
3149
|
+
logger.debug(`Failed to load xAI web search provider: ${err}`);
|
|
3150
|
+
return null;
|
|
3151
|
+
}
|
|
3152
|
+
};
|
|
3153
|
+
const providers = preferAnthropic ? [
|
|
3154
|
+
loadAnthropicWebSearch,
|
|
3155
|
+
loadOpenAIWebSearch,
|
|
3156
|
+
loadPerplexity,
|
|
3157
|
+
loadGoogleWebSearch,
|
|
3158
|
+
loadVertexWebSearch,
|
|
3159
|
+
loadXaiWebSearch
|
|
3160
|
+
] : [
|
|
3161
|
+
loadOpenAIWebSearch,
|
|
3162
|
+
loadAnthropicWebSearch,
|
|
3163
|
+
loadPerplexity,
|
|
3164
|
+
loadGoogleWebSearch,
|
|
3165
|
+
loadVertexWebSearch,
|
|
3166
|
+
loadXaiWebSearch
|
|
3167
|
+
];
|
|
3168
|
+
for (const getProvider of providers) {
|
|
3169
|
+
const provider = await getProvider();
|
|
3170
|
+
if (provider && hasWebSearchCapability(provider)) {
|
|
3171
|
+
logger.info(`Using ${getProviderId(provider) ?? "loaded provider"} as web search provider`);
|
|
3172
|
+
return provider;
|
|
3173
|
+
}
|
|
3174
|
+
if (provider) logger.debug(`Loaded provider ${getProviderId(provider) ?? "unknown"} does not support web search`);
|
|
3175
|
+
}
|
|
3176
|
+
return null;
|
|
3177
|
+
}
|
|
3178
|
+
//#endregion
|
|
3179
|
+
//#region src/matchers/search.ts
|
|
3180
|
+
async function matchesSearchRubric(rubric, llmOutput, grading, vars, assertion, _provider, providerCallContext) {
|
|
3181
|
+
if (!grading) throw new Error("Cannot grade output without grading config. Specify --grader option or grading config.");
|
|
3182
|
+
const defaultProviders = await getDefaultProviders();
|
|
3183
|
+
const defaultSearchProviders = [
|
|
3184
|
+
defaultProviders.webSearchProvider,
|
|
3185
|
+
defaultProviders.llmRubricProvider,
|
|
3186
|
+
defaultProviders.gradingProvider
|
|
3187
|
+
];
|
|
3188
|
+
let searchProvider = (grading.provider ? await getGradingProvider("text", grading.provider, null) : null) || defaultSearchProviders.find((provider) => Boolean(provider));
|
|
3189
|
+
if (!hasWebSearchCapability(searchProvider)) {
|
|
3190
|
+
const webSearchDefault = defaultSearchProviders.find((provider) => hasWebSearchCapability(provider));
|
|
3191
|
+
if (webSearchDefault) searchProvider = webSearchDefault;
|
|
3192
|
+
}
|
|
3193
|
+
if (!hasWebSearchCapability(searchProvider)) {
|
|
3194
|
+
const webSearchProvider = await loadWebSearchProvider(true);
|
|
3195
|
+
if (webSearchProvider) searchProvider = webSearchProvider;
|
|
3196
|
+
}
|
|
3197
|
+
if (!searchProvider || !hasWebSearchCapability(searchProvider)) throw new Error(`search-rubric assertion requires a grading provider with web search capabilities. Use --grader with a web search provider (e.g., anthropic:messages:${DEFAULT_ANTHROPIC_MODEL}, openai:responses:o4-mini with tools configured, perplexity:sonar) or configure one in defaultTest.options.provider`);
|
|
3198
|
+
const prompt = await renderLlmRubricPrompt(await loadRubricPrompt(grading?.rubricPrompt, DEFAULT_WEB_SEARCH_PROMPT), {
|
|
3199
|
+
output: tryParse(llmOutput),
|
|
3200
|
+
rubric,
|
|
3201
|
+
...vars || {}
|
|
3202
|
+
});
|
|
3203
|
+
const resp = await callProviderWithContext(searchProvider, prompt, "search-rubric", {
|
|
3204
|
+
output: tryParse(llmOutput),
|
|
3205
|
+
rubric,
|
|
3206
|
+
...vars || {}
|
|
3207
|
+
}, providerCallContext);
|
|
3208
|
+
if (resp.error || !resp.output) return {
|
|
3209
|
+
pass: false,
|
|
3210
|
+
score: 0,
|
|
3211
|
+
reason: `Search rubric evaluation failed: ${resp.error || "No output"}`,
|
|
3212
|
+
tokensUsed: resp.tokenUsage,
|
|
3213
|
+
assertion
|
|
3214
|
+
};
|
|
3215
|
+
try {
|
|
3216
|
+
const result = extractFirstJsonObject(String(resp.output));
|
|
3217
|
+
let pass = result.pass ?? false;
|
|
3218
|
+
const score = typeof result.score === "number" ? result.score : pass ? 1 : 0;
|
|
3219
|
+
if (assertion?.threshold !== void 0) pass = pass && score >= assertion.threshold;
|
|
3220
|
+
return {
|
|
3221
|
+
pass,
|
|
3222
|
+
score,
|
|
3223
|
+
reason: result.reason || "No reason provided",
|
|
3224
|
+
tokensUsed: resp.tokenUsage,
|
|
3225
|
+
assertion,
|
|
3226
|
+
metadata: {
|
|
3227
|
+
searchResults: result.searchResults || [],
|
|
3228
|
+
searchProvider: searchProvider.id()
|
|
3229
|
+
}
|
|
3230
|
+
};
|
|
3231
|
+
} catch (err) {
|
|
3232
|
+
logger.warn(`[search-rubric] Could not parse structured JSON from provider response, falling back to substring matching: ${err.message}`);
|
|
3233
|
+
const outputLower = String(resp.output).toLowerCase();
|
|
3234
|
+
const pass = outputLower.includes("\"pass\":true") || outputLower.includes("\"pass\": true");
|
|
3235
|
+
return {
|
|
3236
|
+
pass,
|
|
3237
|
+
score: pass ? 1 : 0,
|
|
3238
|
+
reason: resp.output,
|
|
3239
|
+
tokensUsed: resp.tokenUsage,
|
|
3240
|
+
assertion
|
|
3241
|
+
};
|
|
3242
|
+
}
|
|
3243
|
+
}
|
|
3244
|
+
//#endregion
|
|
2449
3245
|
//#region src/assertions/searchRubric.ts
|
|
2450
3246
|
async function handleSearchRubric({ assertion, baseType: _baseType, inverse, provider, providerCallContext, renderedValue, test, providerResponse }) {
|
|
2451
3247
|
if (renderedValue == null) throw new Error("search-rubric assertion type must have a string value");
|
|
@@ -3101,13 +3897,13 @@ function resolveSequenceValue(value) {
|
|
|
3101
3897
|
}
|
|
3102
3898
|
throw new Error("trajectory:tool-sequence assertion must have an array or object value");
|
|
3103
3899
|
}
|
|
3104
|
-
function isRecord(value) {
|
|
3900
|
+
function isRecord$1(value) {
|
|
3105
3901
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3106
3902
|
}
|
|
3107
3903
|
function matchesExpectedArgsPartial(actual, expected) {
|
|
3108
3904
|
if (Array.isArray(expected)) return Array.isArray(actual) && actual.length === expected.length && expected.every((item, index) => matchesExpectedArgsPartial(actual[index], item));
|
|
3109
|
-
if (isRecord(expected)) {
|
|
3110
|
-
if (!isRecord(actual)) return false;
|
|
3905
|
+
if (isRecord$1(expected)) {
|
|
3906
|
+
if (!isRecord$1(actual)) return false;
|
|
3111
3907
|
return Object.entries(expected).every(([key, expectedValue]) => Object.prototype.hasOwnProperty.call(actual, key) && matchesExpectedArgsPartial(actual[key], expectedValue));
|
|
3112
3908
|
}
|
|
3113
3909
|
return isDeepStrictEqual(actual, expected);
|
|
@@ -3478,7 +4274,7 @@ async function loadTraceData(traceId) {
|
|
|
3478
4274
|
let stableObservations = 0;
|
|
3479
4275
|
let latestTrace = null;
|
|
3480
4276
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
3481
|
-
latestTrace = await traceStore.getTrace(traceId);
|
|
4277
|
+
latestTrace = await traceStore.getTrace(traceId, { sanitizeAttributes: false });
|
|
3482
4278
|
const spanCount = latestTrace?.spans?.length ?? 0;
|
|
3483
4279
|
if (spanCount > 0) {
|
|
3484
4280
|
stableObservations = spanCount === lastSpanCount ? stableObservations + 1 : 1;
|
|
@@ -3531,7 +4327,7 @@ const ASSERTION_HANDLERS = {
|
|
|
3531
4327
|
"llm-rubric": handleLlmRubric,
|
|
3532
4328
|
meteor: async (params) => {
|
|
3533
4329
|
try {
|
|
3534
|
-
const { handleMeteorAssertion } = await import("./meteor-
|
|
4330
|
+
const { handleMeteorAssertion } = await import("./meteor-Dce-_zGQ.js");
|
|
3535
4331
|
return handleMeteorAssertion(params);
|
|
3536
4332
|
} catch (error) {
|
|
3537
4333
|
if (error instanceof Error && (error.message.includes("Cannot find module") || error.message.includes("natural\" package is required"))) return {
|
|
@@ -3667,7 +4463,7 @@ async function runAssertion({ prompt, provider, assertion, test, vars, latencyMs
|
|
|
3667
4463
|
};
|
|
3668
4464
|
}
|
|
3669
4465
|
else if (filePath.endsWith(".rb")) try {
|
|
3670
|
-
const { runRuby } = await import("./rubyUtils-
|
|
4466
|
+
const { runRuby } = await import("./rubyUtils-CnlW8AYb.js").then((n) => n.t);
|
|
3671
4467
|
valueFromScript = await runRuby(filePath, functionName || "get_assert", [output, context]);
|
|
3672
4468
|
logger.debug(`Ruby script ${filePath} output: ${valueFromScript}`);
|
|
3673
4469
|
} catch (error) {
|
|
@@ -3784,7 +4580,8 @@ async function runAssertions({ assertScoringFunction, latencyMs, prompt, provide
|
|
|
3784
4580
|
logger.debug(`Failed to preload trace data for assertions: ${error}`);
|
|
3785
4581
|
preloadedTraceData = null;
|
|
3786
4582
|
}
|
|
3787
|
-
|
|
4583
|
+
const concurrency = getProviderCallExecutionContext()?.providerCallQueue ? 1 : ASSERTIONS_MAX_CONCURRENCY;
|
|
4584
|
+
await async.forEachOfLimit(asserts, concurrency, async ({ assertion, assertResult, index }) => {
|
|
3788
4585
|
if (assertion.type.startsWith("select-") || assertion.type === "max-score") return;
|
|
3789
4586
|
const result = await runAssertion({
|
|
3790
4587
|
prompt,
|
|
@@ -3911,7 +4708,8 @@ var CIProgressReporter = class {
|
|
|
3911
4708
|
}
|
|
3912
4709
|
updateTotalTests(newTotal) {
|
|
3913
4710
|
this.totalTests = Math.max(newTotal, 1);
|
|
3914
|
-
|
|
4711
|
+
const percentage = Math.floor(this.completedTests / this.totalTests * 100);
|
|
4712
|
+
this.highestPercentageSeen = percentage;
|
|
3915
4713
|
}
|
|
3916
4714
|
finish() {
|
|
3917
4715
|
if (this.intervalId) {
|
|
@@ -4084,6 +4882,10 @@ function getDefaultOtelConfig() {
|
|
|
4084
4882
|
}
|
|
4085
4883
|
//#endregion
|
|
4086
4884
|
//#region src/tracing/localSpanExporter.ts
|
|
4885
|
+
const MISSING_TRACE_RETRY_DELAY_MS = 50;
|
|
4886
|
+
function delay(ms) {
|
|
4887
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
4888
|
+
}
|
|
4087
4889
|
/**
|
|
4088
4890
|
* A span exporter that writes spans to the local TraceStore (SQLite).
|
|
4089
4891
|
* This allows OTEL spans to be stored locally for analysis in the promptfoo UI.
|
|
@@ -4125,7 +4927,7 @@ var LocalSpanExporter = class {
|
|
|
4125
4927
|
}
|
|
4126
4928
|
let firstError;
|
|
4127
4929
|
for (const [traceId, spanDataList] of spansByTrace) try {
|
|
4128
|
-
const result = await
|
|
4930
|
+
const result = await this.addSpansWithTraceRetry(traceStore, traceId, spanDataList);
|
|
4129
4931
|
if (result.stored) logger.debug(`[LocalSpanExporter] Added ${spanDataList.length} spans to trace ${traceId}`);
|
|
4130
4932
|
else logger.debug(`[LocalSpanExporter] Skipping ${spanDataList.length} spans for orphan trace ${traceId}: ${result.reason}`);
|
|
4131
4933
|
} catch (error) {
|
|
@@ -4137,6 +4939,16 @@ var LocalSpanExporter = class {
|
|
|
4137
4939
|
}
|
|
4138
4940
|
return firstError;
|
|
4139
4941
|
}
|
|
4942
|
+
async addSpansWithTraceRetry(traceStore, traceId, spans) {
|
|
4943
|
+
const options = {
|
|
4944
|
+
skipTraceCheck: false,
|
|
4945
|
+
warnIfMissingTrace: false
|
|
4946
|
+
};
|
|
4947
|
+
const result = await traceStore.addSpans(traceId, spans, options);
|
|
4948
|
+
if (result.stored) return result;
|
|
4949
|
+
await delay(MISSING_TRACE_RETRY_DELAY_MS);
|
|
4950
|
+
return traceStore.addSpans(traceId, spans, options);
|
|
4951
|
+
}
|
|
4140
4952
|
/**
|
|
4141
4953
|
* Convert an OTEL ReadableSpan to our SpanData format.
|
|
4142
4954
|
*/
|
|
@@ -4419,6 +5231,15 @@ function isPromptAllowed(prompt, allowedPrompts) {
|
|
|
4419
5231
|
}
|
|
4420
5232
|
//#endregion
|
|
4421
5233
|
//#region src/evaluator.ts
|
|
5234
|
+
const CONVERSATION_VAR_NAME = "_conversation";
|
|
5235
|
+
const promptUsesConversationVariableCache = new LRUCache({ max: 1024 });
|
|
5236
|
+
function promptUsesConversationVariable(prompt) {
|
|
5237
|
+
const cached = promptUsesConversationVariableCache.get(prompt.raw);
|
|
5238
|
+
if (cached !== void 0) return cached;
|
|
5239
|
+
const { referenced, parsed } = analyzeTemplateReference(prompt.raw, CONVERSATION_VAR_NAME);
|
|
5240
|
+
if (parsed) promptUsesConversationVariableCache.set(prompt.raw, referenced);
|
|
5241
|
+
return referenced;
|
|
5242
|
+
}
|
|
4422
5243
|
/**
|
|
4423
5244
|
* Manages a single progress bar for the evaluation
|
|
4424
5245
|
*/
|
|
@@ -4618,6 +5439,18 @@ function hasProviderGroupedAssertion(assertion) {
|
|
|
4618
5439
|
function shouldDeferGradingForTest(test) {
|
|
4619
5440
|
return Boolean(test.assert?.some(hasProviderGroupedAssertion));
|
|
4620
5441
|
}
|
|
5442
|
+
function logGroupedGradingStatus({ concurrency, hasEvalStepTimeout, runEvalOptions, shouldGroupGradingByProvider, usesConversationVar }) {
|
|
5443
|
+
if (!runEvalOptions.some(({ test }) => shouldDeferGradingForTest(test))) return;
|
|
5444
|
+
if (shouldGroupGradingByProvider) {
|
|
5445
|
+
logger.info("Grouping model-graded assertions by provider to minimize local-model reload overhead.");
|
|
5446
|
+
return;
|
|
5447
|
+
}
|
|
5448
|
+
if (concurrency !== 1) return;
|
|
5449
|
+
const reasons = [];
|
|
5450
|
+
if (hasEvalStepTimeout) reasons.push("per-eval-step timeout is configured");
|
|
5451
|
+
if (usesConversationVar) reasons.push("conversation variables require per-row ordering");
|
|
5452
|
+
if (reasons.length > 0) logger.info(`Serial grading grouping disabled because ${reasons.join(" and ")}; model-graded judges may reload between rows.`);
|
|
5453
|
+
}
|
|
4621
5454
|
function applyGradingResult(row, checkResult) {
|
|
4622
5455
|
if (!checkResult.pass) {
|
|
4623
5456
|
row.error = checkResult.reason;
|
|
@@ -4632,14 +5465,29 @@ function applyGradingResult(row, checkResult) {
|
|
|
4632
5465
|
if (checkResult.tokensUsed) accumulateAssertionTokenUsage(row.tokenUsage.assertions, checkResult.tokensUsed);
|
|
4633
5466
|
row.gradingResult = checkResult;
|
|
4634
5467
|
}
|
|
4635
|
-
|
|
4636
|
-
|
|
4637
|
-
|
|
4638
|
-
|
|
4639
|
-
|
|
4640
|
-
|
|
4641
|
-
|
|
4642
|
-
|
|
5468
|
+
const ABORTED_GRADING_PREFIX = "Aborted: ";
|
|
5469
|
+
function isAbortShapedError(error) {
|
|
5470
|
+
return error instanceof Error && (error.name === "AbortError" || error.name === "AbortException");
|
|
5471
|
+
}
|
|
5472
|
+
function applyGradingError(row, error, abortSignal) {
|
|
5473
|
+
const errorAsError = error instanceof Error ? error : void 0;
|
|
5474
|
+
if (Boolean(abortSignal?.aborted) && isAbortShapedError(error)) {
|
|
5475
|
+
const shortMessage = errorAsError?.message ?? String(error);
|
|
5476
|
+
logger.debug("Assertion grading aborted", {
|
|
5477
|
+
error: shortMessage,
|
|
5478
|
+
promptIdx: row.promptIdx,
|
|
5479
|
+
testIdx: row.testIdx
|
|
5480
|
+
});
|
|
5481
|
+
row.error = `${ABORTED_GRADING_PREFIX}${shortMessage}`;
|
|
5482
|
+
} else {
|
|
5483
|
+
const fullMessage = errorAsError ? errorAsError.stack ?? errorAsError.message : String(error);
|
|
5484
|
+
logger.error("Assertion grading failed during eval", {
|
|
5485
|
+
error: fullMessage,
|
|
5486
|
+
promptIdx: row.promptIdx,
|
|
5487
|
+
testIdx: row.testIdx
|
|
5488
|
+
});
|
|
5489
|
+
row.error = fullMessage;
|
|
5490
|
+
}
|
|
4643
5491
|
row.failureReason = ResultFailureReason.ERROR;
|
|
4644
5492
|
row.success = false;
|
|
4645
5493
|
row.score = 0;
|
|
@@ -4671,7 +5519,7 @@ function createRunEvalState({ provider, prompt, test }) {
|
|
|
4671
5519
|
};
|
|
4672
5520
|
}
|
|
4673
5521
|
function attachConversationVar({ conversations, conversationKey, prompt, test, vars }) {
|
|
4674
|
-
const usesConversation = prompt
|
|
5522
|
+
const usesConversation = promptUsesConversationVariable(prompt);
|
|
4675
5523
|
if (!getEnvBool("PROMPTFOO_DISABLE_CONVERSATION_VAR") && !test.options?.disableConversationVar && usesConversation) vars._conversation = conversations?.[conversationKey] || [];
|
|
4676
5524
|
}
|
|
4677
5525
|
function createRunEvalSetup({ provider, prompt, promptConfig, vars }) {
|
|
@@ -4918,7 +5766,7 @@ async function gradeRunEvalResponse({ abortSignal, deferGrading, evalId, latency
|
|
|
4918
5766
|
assertScoringFunction: test.assertScoringFunction,
|
|
4919
5767
|
traceId
|
|
4920
5768
|
}).then((checkResult) => applyGradingResult(ret, checkResult))).catch((error) => {
|
|
4921
|
-
applyGradingError(ret, error);
|
|
5769
|
+
applyGradingError(ret, error, abortSignal);
|
|
4922
5770
|
});
|
|
4923
5771
|
deferredGradingPromises.set(ret, gradingPromise);
|
|
4924
5772
|
return;
|
|
@@ -5465,7 +6313,7 @@ async function resolveDefaultTestProvider(defaultTest, testCase) {
|
|
|
5465
6313
|
const defaultProvider = defaultTest.provider;
|
|
5466
6314
|
if (isApiProvider(defaultProvider)) return defaultProvider;
|
|
5467
6315
|
if (typeof defaultProvider === "object" && defaultProvider.id) {
|
|
5468
|
-
const { loadApiProvider } = await import("./providers-
|
|
6316
|
+
const { loadApiProvider } = await import("./providers-DVYRZP4E.js").then((n) => n.i);
|
|
5469
6317
|
return loadApiProvider(typeof defaultProvider.id === "function" ? defaultProvider.id() : defaultProvider.id, { options: defaultProvider });
|
|
5470
6318
|
}
|
|
5471
6319
|
return defaultProvider;
|
|
@@ -5625,7 +6473,7 @@ function buildRepeatCacheContextByTestIdx(runEvalOptions) {
|
|
|
5625
6473
|
async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
|
|
5626
6474
|
if (!state.resume || !evalRecord.persisted) return;
|
|
5627
6475
|
try {
|
|
5628
|
-
const { default: EvalResult } = await import("./evalResult-
|
|
6476
|
+
const { default: EvalResult } = await import("./evalResult-2RRJvFyB.js").then((n) => n.n);
|
|
5629
6477
|
const completedPairs = await EvalResult.getCompletedIndexPairs(evalRecord.id, { excludeErrors: state.retryMode });
|
|
5630
6478
|
const originalCount = runEvalOptions.length;
|
|
5631
6479
|
for (let i = runEvalOptions.length - 1; i >= 0; i--) {
|
|
@@ -5639,14 +6487,14 @@ async function filterCompletedResumeSteps(runEvalOptions, evalRecord) {
|
|
|
5639
6487
|
}
|
|
5640
6488
|
}
|
|
5641
6489
|
function adjustConcurrencyForSerialFeatures({ concurrency, prompts, tests }) {
|
|
5642
|
-
const usesConversationVar = prompts.some(
|
|
6490
|
+
const usesConversationVar = prompts.some(promptUsesConversationVariable);
|
|
5643
6491
|
if (concurrency <= 1) return {
|
|
5644
6492
|
concurrency,
|
|
5645
6493
|
usesConversationVar
|
|
5646
6494
|
};
|
|
5647
6495
|
const usesStoreOutputAs = tests.some((t) => t.options?.storeOutputAs);
|
|
5648
6496
|
if (usesConversationVar) {
|
|
5649
|
-
logger.info(`Setting concurrency to 1 because the ${chalk.cyan(
|
|
6497
|
+
logger.info(`Setting concurrency to 1 because the ${chalk.cyan(CONVERSATION_VAR_NAME)} variable is used.`);
|
|
5650
6498
|
return {
|
|
5651
6499
|
concurrency: 1,
|
|
5652
6500
|
usesConversationVar
|
|
@@ -5876,7 +6724,8 @@ var Evaluator = class {
|
|
|
5876
6724
|
};
|
|
5877
6725
|
this.conversations = {};
|
|
5878
6726
|
this.registers = {};
|
|
5879
|
-
|
|
6727
|
+
const jsonlFiles = Array.isArray(evalRecord.config.outputPath) ? evalRecord.config.outputPath.filter((p) => p.endsWith(".jsonl")) : evalRecord.config.outputPath?.endsWith(".jsonl") ? [evalRecord.config.outputPath] : [];
|
|
6728
|
+
this.fileWriters = jsonlFiles.map((p) => new JsonlFileWriter(p));
|
|
5880
6729
|
this.rateLimitRegistry = createRateLimitRegistry({ maxConcurrency: options.maxConcurrency || 4 });
|
|
5881
6730
|
this.rateLimitRegistry.on("ratelimit:hit", (data) => {
|
|
5882
6731
|
logger.debug(`[Scheduler] Rate limit hit for ${data.rateLimitKey}`, {
|
|
@@ -5996,6 +6845,25 @@ var Evaluator = class {
|
|
|
5996
6845
|
this.trackCompletedRow(evalStep, row, context);
|
|
5997
6846
|
context.numComplete++;
|
|
5998
6847
|
const promptEvalCount = reservePromptEvalCount(context, row.promptIdx);
|
|
6848
|
+
if (context.testSuite.extensions?.length) try {
|
|
6849
|
+
const afterEachOut = await runExtensionHook(context.testSuite.extensions, "afterEach", {
|
|
6850
|
+
test: evalStep.test,
|
|
6851
|
+
result: {
|
|
6852
|
+
...row,
|
|
6853
|
+
namedScores: { ...row.namedScores },
|
|
6854
|
+
metadata: { ...row.metadata },
|
|
6855
|
+
response: row.response ? {
|
|
6856
|
+
...row.response,
|
|
6857
|
+
metadata: { ...row.response.metadata }
|
|
6858
|
+
} : row.response
|
|
6859
|
+
}
|
|
6860
|
+
});
|
|
6861
|
+
row.namedScores = filterFiniteScores(afterEachOut.result.namedScores);
|
|
6862
|
+
row.metadata = afterEachOut.result.metadata;
|
|
6863
|
+
if (row.response && afterEachOut.result.response) row.response.metadata = afterEachOut.result.response.metadata;
|
|
6864
|
+
} catch (error) {
|
|
6865
|
+
logger.error(`afterEach extension hook failed, persisting row without hook modifications`, { error });
|
|
6866
|
+
}
|
|
5999
6867
|
await this.persistEvalRow(row);
|
|
6000
6868
|
if (this.abortIfTargetUnavailable(row, context)) break;
|
|
6001
6869
|
const metrics = context.prompts[row.promptIdx].metrics;
|
|
@@ -6007,10 +6875,6 @@ var Evaluator = class {
|
|
|
6007
6875
|
promptEvalCount,
|
|
6008
6876
|
row
|
|
6009
6877
|
});
|
|
6010
|
-
await runExtensionHook(context.testSuite.extensions, "afterEach", {
|
|
6011
|
-
test: evalStep.test,
|
|
6012
|
-
result: row
|
|
6013
|
-
});
|
|
6014
6878
|
context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics);
|
|
6015
6879
|
}
|
|
6016
6880
|
}
|
|
@@ -6084,9 +6948,8 @@ var Evaluator = class {
|
|
|
6084
6948
|
context.options.progressCallback?.(context.numComplete, context.runEvalOptionsLength, index, evalStep, metrics || createTimeoutMetrics(timeoutMs));
|
|
6085
6949
|
}
|
|
6086
6950
|
async executeEvalSteps({ checkAbort, ciProgressReporter, combinedAbortSignal, concurrentRunEvalOptions, evalStepIndexMap, globalTimeout, groupedRunEvalOptions, isEvalTimedOut, isWebUI, maxEvalTimeMs, processingContext, processedIndices, progressBarManager, prompts, serialRunEvalOptions, shouldGroupGradingByProvider }) {
|
|
6087
|
-
let flushGroupedRows;
|
|
6088
6951
|
try {
|
|
6089
|
-
if (shouldGroupGradingByProvider)
|
|
6952
|
+
if (shouldGroupGradingByProvider) await this.runGroupedEvalSteps({
|
|
6090
6953
|
checkAbort,
|
|
6091
6954
|
evalStepIndexMap,
|
|
6092
6955
|
groupedRunEvalOptions,
|
|
@@ -6118,7 +6981,6 @@ var Evaluator = class {
|
|
|
6118
6981
|
cleanupProgressAfterError(progressBarManager, ciProgressReporter, err);
|
|
6119
6982
|
throw err;
|
|
6120
6983
|
}
|
|
6121
|
-
await flushGroupedRows?.();
|
|
6122
6984
|
if (isEvalTimedOut()) logger.warn(`Evaluation stopped after reaching max duration (${maxEvalTimeMs}ms)`);
|
|
6123
6985
|
else if (!processingContext.targetUnavailable) return this.saveInterruptedEval({
|
|
6124
6986
|
ciProgressReporter,
|
|
@@ -6167,7 +7029,15 @@ var Evaluator = class {
|
|
|
6167
7029
|
})) break;
|
|
6168
7030
|
}
|
|
6169
7031
|
} catch (error) {
|
|
6170
|
-
|
|
7032
|
+
const pendingRowCount = groupedRows.reduce((sum, entry) => sum + entry.rows.length, 0);
|
|
7033
|
+
try {
|
|
7034
|
+
await flushGroupedRows();
|
|
7035
|
+
} catch (flushError) {
|
|
7036
|
+
logger.warn("Failed to flush grouped rows after error; target outputs may be lost", {
|
|
7037
|
+
error: flushError instanceof Error ? flushError.message : String(flushError),
|
|
7038
|
+
pendingRowCount
|
|
7039
|
+
});
|
|
7040
|
+
}
|
|
6171
7041
|
throw error;
|
|
6172
7042
|
}
|
|
6173
7043
|
await flushGroupedRows();
|
|
@@ -6603,6 +7473,13 @@ var Evaluator = class {
|
|
|
6603
7473
|
if (!this.options.silent) {
|
|
6604
7474
|
if (serialRunEvalOptions.length > 0) logger.info(`Running ${serialRunEvalOptions.length} test cases serially...`);
|
|
6605
7475
|
if (concurrentRunEvalOptions.length > 0) logger.info(`Running ${concurrentRunEvalOptions.length} test cases (up to ${concurrency} at a time)...`);
|
|
7476
|
+
logGroupedGradingStatus({
|
|
7477
|
+
concurrency,
|
|
7478
|
+
hasEvalStepTimeout,
|
|
7479
|
+
runEvalOptions,
|
|
7480
|
+
shouldGroupGradingByProvider,
|
|
7481
|
+
usesConversationVar
|
|
7482
|
+
});
|
|
6606
7483
|
}
|
|
6607
7484
|
if (this.options.showProgressBar && progressBarManager) {
|
|
6608
7485
|
await progressBarManager.initialize(runEvalOptions, concurrency, 0);
|
|
@@ -7606,7 +8483,7 @@ var Eval = class Eval {
|
|
|
7606
8483
|
const evalInstance = new Eval(eval_.config, {
|
|
7607
8484
|
id: eval_.id,
|
|
7608
8485
|
createdAt: new Date(eval_.createdAt),
|
|
7609
|
-
author: eval_.author
|
|
8486
|
+
author: eval_.author,
|
|
7610
8487
|
description: eval_.description || void 0,
|
|
7611
8488
|
prompts: eval_.prompts || [],
|
|
7612
8489
|
datasetId,
|
|
@@ -7629,7 +8506,7 @@ var Eval = class Eval {
|
|
|
7629
8506
|
return (await getDb().select().from(evalsTable).limit(limit).orderBy(desc(evalsTable.createdAt)).all()).map((e) => new Eval(e.config, {
|
|
7630
8507
|
id: e.id,
|
|
7631
8508
|
createdAt: new Date(e.createdAt),
|
|
7632
|
-
author: e.author
|
|
8509
|
+
author: e.author,
|
|
7633
8510
|
description: e.description || void 0,
|
|
7634
8511
|
prompts: e.prompts || [],
|
|
7635
8512
|
persisted: true
|
|
@@ -7644,7 +8521,7 @@ var Eval = class Eval {
|
|
|
7644
8521
|
return (await getDb().select().from(evalsTable).orderBy(desc(evalsTable.createdAt)).limit(limit).offset(offset).all()).map((e) => new Eval(e.config, {
|
|
7645
8522
|
id: e.id,
|
|
7646
8523
|
createdAt: new Date(e.createdAt),
|
|
7647
|
-
author: e.author
|
|
8524
|
+
author: e.author,
|
|
7648
8525
|
description: e.description || void 0,
|
|
7649
8526
|
prompts: e.prompts || [],
|
|
7650
8527
|
persisted: true
|
|
@@ -7659,7 +8536,7 @@ var Eval = class Eval {
|
|
|
7659
8536
|
static async create(config, renderedPrompts, opts) {
|
|
7660
8537
|
const createdAt = opts?.createdAt || /* @__PURE__ */ new Date();
|
|
7661
8538
|
const evalId = opts?.id || createEvalId(createdAt);
|
|
7662
|
-
const author = opts
|
|
8539
|
+
const author = opts && "author" in opts ? opts.author ?? null : getAuthor();
|
|
7663
8540
|
const db = getDb();
|
|
7664
8541
|
const datasetId = sha256(JSON.stringify(config.tests || []));
|
|
7665
8542
|
db.transaction(() => {
|
|
@@ -7721,7 +8598,7 @@ var Eval = class Eval {
|
|
|
7721
8598
|
});
|
|
7722
8599
|
return new Eval(config, {
|
|
7723
8600
|
id: evalId,
|
|
7724
|
-
author
|
|
8601
|
+
author,
|
|
7725
8602
|
createdAt,
|
|
7726
8603
|
persisted: true,
|
|
7727
8604
|
runtimeOptions: sanitizeRuntimeOptions(opts?.runtimeOptions)
|
|
@@ -7731,7 +8608,7 @@ var Eval = class Eval {
|
|
|
7731
8608
|
const createdAt = opts?.createdAt || /* @__PURE__ */ new Date();
|
|
7732
8609
|
this.createdAt = createdAt.getTime();
|
|
7733
8610
|
this.id = opts?.id || createEvalId(createdAt);
|
|
7734
|
-
this.author = opts?.author;
|
|
8611
|
+
this.author = opts?.author ?? null;
|
|
7735
8612
|
this.config = config;
|
|
7736
8613
|
this.results = [];
|
|
7737
8614
|
this.prompts = opts?.prompts || [];
|
|
@@ -8261,7 +9138,7 @@ var Eval = class Eval {
|
|
|
8261
9138
|
newConfig.description = copyDescription;
|
|
8262
9139
|
const newPrompts = structuredClone(this.prompts);
|
|
8263
9140
|
const newVars = this.vars ? structuredClone(this.vars) : [];
|
|
8264
|
-
const author =
|
|
9141
|
+
const author = getAuthor();
|
|
8265
9142
|
const db = getDb();
|
|
8266
9143
|
let copiedCount = 0;
|
|
8267
9144
|
db.transaction(() => {
|
|
@@ -8418,47 +9295,11 @@ function filterPrompts(prompts, filterPromptsOption) {
|
|
|
8418
9295
|
//#endregion
|
|
8419
9296
|
//#region src/commands/eval/filterProviders.ts
|
|
8420
9297
|
/**
|
|
8421
|
-
* Checks if a value is a valid provider ID (non-empty string).
|
|
8422
|
-
*/
|
|
8423
|
-
function isValidProviderId(id) {
|
|
8424
|
-
return id !== null && id !== void 0 && typeof id === "string" && id !== "";
|
|
8425
|
-
}
|
|
8426
|
-
/**
|
|
8427
9298
|
* Extracts the id and label from a raw provider config without instantiating it.
|
|
8428
9299
|
* Handles all provider config formats: string, function, ProviderOptions, ProviderOptionsMap.
|
|
8429
9300
|
*/
|
|
8430
9301
|
function getProviderIdAndLabel(provider, index) {
|
|
8431
|
-
|
|
8432
|
-
if (typeof provider === "function") {
|
|
8433
|
-
const label = provider.label;
|
|
8434
|
-
return {
|
|
8435
|
-
id: label ?? `custom-function-${index}`,
|
|
8436
|
-
label
|
|
8437
|
-
};
|
|
8438
|
-
}
|
|
8439
|
-
const providerId = provider.id;
|
|
8440
|
-
if ("id" in provider && isValidProviderId(providerId)) return {
|
|
8441
|
-
id: providerId,
|
|
8442
|
-
label: provider.label
|
|
8443
|
-
};
|
|
8444
|
-
const keys = Object.keys(provider);
|
|
8445
|
-
if (keys.length > 0) {
|
|
8446
|
-
const id = keys[0];
|
|
8447
|
-
const value = provider[id];
|
|
8448
|
-
if (typeof value === "object" && value !== null) return {
|
|
8449
|
-
id: value.id || id,
|
|
8450
|
-
label: value.label
|
|
8451
|
-
};
|
|
8452
|
-
}
|
|
8453
|
-
const label = provider.label;
|
|
8454
|
-
if (isValidProviderId(label)) return {
|
|
8455
|
-
id: label,
|
|
8456
|
-
label
|
|
8457
|
-
};
|
|
8458
|
-
return {
|
|
8459
|
-
id: `unknown-${index}`,
|
|
8460
|
-
label
|
|
8461
|
-
};
|
|
9302
|
+
return normalizeProviderRef(provider, { index });
|
|
8462
9303
|
}
|
|
8463
9304
|
/**
|
|
8464
9305
|
* Filters raw provider configs BEFORE instantiation.
|
|
@@ -10691,25 +11532,45 @@ var AlignedHarmfulPlugin = class extends RedteamPluginBase {
|
|
|
10691
11532
|
getAssertions(_prompt) {
|
|
10692
11533
|
return getHarmfulAssertions(this.harmCategory);
|
|
10693
11534
|
}
|
|
10694
|
-
promptsToTestCases(prompts) {
|
|
11535
|
+
async promptsToTestCases(prompts) {
|
|
10695
11536
|
const hasMultipleInputs = this.config.inputs && Object.keys(this.config.inputs).length > 0;
|
|
10696
11537
|
const harmCategoryLabel = HARM_PLUGINS[this.harmCategory] || this.harmCategory;
|
|
10697
|
-
|
|
11538
|
+
const pluginId = getShortPluginId(this.harmCategory);
|
|
11539
|
+
return Promise.all([...prompts].sort((a, b) => a.__prompt.localeCompare(b.__prompt)).map(async ({ __prompt }, materializationIndex) => {
|
|
10698
11540
|
const vars = { [this.injectVar]: __prompt };
|
|
10699
|
-
|
|
10700
|
-
|
|
10701
|
-
|
|
10702
|
-
|
|
11541
|
+
let inputMaterialization;
|
|
11542
|
+
if (hasMultipleInputs) {
|
|
11543
|
+
let parsed;
|
|
11544
|
+
try {
|
|
11545
|
+
parsed = JSON.parse(__prompt);
|
|
11546
|
+
} catch (error) {
|
|
11547
|
+
logger.debug("[AlignedHarmful] Could not parse prompt as JSON for multi-input mode", { error });
|
|
11548
|
+
}
|
|
11549
|
+
if (parsed) try {
|
|
11550
|
+
const materializedVars = await extractMaterializedVariablesFromJsonWithMetadata(parsed, this.config.inputs, {
|
|
11551
|
+
materializationIndex,
|
|
11552
|
+
pluginId,
|
|
11553
|
+
provider: this.provider,
|
|
11554
|
+
purpose: this.purpose
|
|
11555
|
+
});
|
|
11556
|
+
Object.assign(vars, materializedVars.vars);
|
|
11557
|
+
inputMaterialization = materializedVars.metadata;
|
|
11558
|
+
} catch (error) {
|
|
11559
|
+
logger.debug("[AlignedHarmful] Failed to materialize prompt inputs", { error });
|
|
11560
|
+
throw error;
|
|
11561
|
+
}
|
|
11562
|
+
}
|
|
10703
11563
|
return {
|
|
10704
11564
|
vars,
|
|
10705
11565
|
metadata: {
|
|
10706
11566
|
harmCategory: harmCategoryLabel,
|
|
10707
|
-
pluginId
|
|
10708
|
-
pluginConfig: this.config
|
|
11567
|
+
pluginId,
|
|
11568
|
+
pluginConfig: this.config,
|
|
11569
|
+
...inputMaterialization ? { inputMaterialization } : {}
|
|
10709
11570
|
},
|
|
10710
11571
|
assert: getHarmfulAssertions(this.harmCategory)
|
|
10711
11572
|
};
|
|
10712
|
-
});
|
|
11573
|
+
}));
|
|
10713
11574
|
}
|
|
10714
11575
|
};
|
|
10715
11576
|
//#endregion
|
|
@@ -10718,20 +11579,37 @@ var AlignedHarmfulPlugin = class extends RedteamPluginBase {
|
|
|
10718
11579
|
* Extract content from <Prompt> tags and parse JSON if inputs are defined.
|
|
10719
11580
|
* Returns the processed prompt and any additional vars extracted from JSON.
|
|
10720
11581
|
*/
|
|
10721
|
-
function processPromptForInputs(prompt,
|
|
11582
|
+
async function processPromptForInputs(prompt, inputs, plugin, provider, purpose, materializationIndex) {
|
|
10722
11583
|
let processedPrompt = prompt.trim();
|
|
10723
11584
|
const additionalVars = {};
|
|
11585
|
+
let additionalMetadata;
|
|
10724
11586
|
const extractedPrompt = extractPromptFromTags(processedPrompt);
|
|
10725
11587
|
if (extractedPrompt) processedPrompt = extractedPrompt;
|
|
10726
|
-
if (inputs && Object.keys(inputs).length > 0)
|
|
10727
|
-
|
|
10728
|
-
|
|
10729
|
-
|
|
10730
|
-
|
|
11588
|
+
if (inputs && Object.keys(inputs).length > 0) {
|
|
11589
|
+
let parsed;
|
|
11590
|
+
try {
|
|
11591
|
+
parsed = JSON.parse(processedPrompt);
|
|
11592
|
+
} catch (error) {
|
|
11593
|
+
logger.debug("[Harmful] Could not parse prompt as JSON for multi-input mode", { error });
|
|
11594
|
+
}
|
|
11595
|
+
if (parsed) try {
|
|
11596
|
+
const materializedVars = await extractMaterializedVariablesFromJsonWithMetadata(parsed, inputs, {
|
|
11597
|
+
materializationIndex,
|
|
11598
|
+
pluginId: plugin,
|
|
11599
|
+
provider,
|
|
11600
|
+
purpose
|
|
11601
|
+
});
|
|
11602
|
+
Object.assign(additionalVars, materializedVars.vars);
|
|
11603
|
+
additionalMetadata = materializedVars.metadata;
|
|
11604
|
+
} catch (error) {
|
|
11605
|
+
logger.debug("[Harmful] Failed to materialize prompt inputs", { error });
|
|
11606
|
+
throw error;
|
|
11607
|
+
}
|
|
10731
11608
|
}
|
|
10732
11609
|
return {
|
|
10733
11610
|
processedPrompt,
|
|
10734
|
-
additionalVars
|
|
11611
|
+
additionalVars,
|
|
11612
|
+
additionalMetadata
|
|
10735
11613
|
};
|
|
10736
11614
|
}
|
|
10737
11615
|
async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, plugin) {
|
|
@@ -10752,15 +11630,19 @@ async function getHarmfulTests({ purpose, injectVar, n, delayMs = 0, config }, p
|
|
|
10752
11630
|
};
|
|
10753
11631
|
const allPrompts = await retryWithDeduplication(generatePrompts, n);
|
|
10754
11632
|
const inputs = config?.inputs;
|
|
10755
|
-
return sampleArray(allPrompts, n).map((prompt) => {
|
|
10756
|
-
const { processedPrompt, additionalVars } = processPromptForInputs(prompt,
|
|
11633
|
+
return Promise.all(sampleArray(allPrompts, n).map(async (prompt, materializationIndex) => {
|
|
11634
|
+
const { processedPrompt, additionalVars, additionalMetadata } = await processPromptForInputs(prompt, inputs, plugin, unalignedProvider, purpose, materializationIndex);
|
|
10757
11635
|
const testCase = createTestCase(injectVar, processedPrompt, plugin);
|
|
10758
11636
|
if (Object.keys(additionalVars).length > 0) testCase.vars = {
|
|
10759
11637
|
...testCase.vars,
|
|
10760
11638
|
...additionalVars
|
|
10761
11639
|
};
|
|
11640
|
+
if (additionalMetadata) testCase.metadata = {
|
|
11641
|
+
...testCase.metadata,
|
|
11642
|
+
inputMaterialization: additionalMetadata
|
|
11643
|
+
};
|
|
10762
11644
|
return testCase;
|
|
10763
|
-
});
|
|
11645
|
+
}));
|
|
10764
11646
|
}
|
|
10765
11647
|
//#endregion
|
|
10766
11648
|
//#region src/redteam/plugins/teenSafety/graderExamples.ts
|
|
@@ -11078,7 +11960,7 @@ const MAX_CHARS_RETRY_MODIFIER_KEY = "__maxCharsPerMessageRetry";
|
|
|
11078
11960
|
function computeModifiersFromConfig(config) {
|
|
11079
11961
|
const modifiers = { ...config?.modifiers };
|
|
11080
11962
|
if (config?.language && typeof config.language === "string") modifiers.language = config.language;
|
|
11081
|
-
if (config?.inputs && Object.keys(config.inputs).length > 0) modifiers.__outputFormat = `Output each test case as JSON wrapped in <Prompt> tags: <Prompt>{${Object.entries(config.inputs).map(([k, description]) => `"${k}": "${description}"`).join(", ")}}</Prompt>`;
|
|
11963
|
+
if (config?.inputs && Object.keys(config.inputs).length > 0) modifiers.__outputFormat = `Output each test case as JSON wrapped in <Prompt> tags: <Prompt>{${Object.entries(buildPromptInputDescriptions(config.inputs) ?? {}).map(([k, description]) => `"${k}": "${description}"`).join(", ")}}</Prompt>`;
|
|
11082
11964
|
const maxCharsModifier = getMaxCharsPerMessageModifierValue(config?.maxCharsPerMessage);
|
|
11083
11965
|
if (maxCharsModifier) modifiers[MAX_CHARS_PER_MESSAGE_MODIFIER_KEY] = maxCharsModifier;
|
|
11084
11966
|
return modifiers;
|
|
@@ -11163,9 +12045,10 @@ function dedupeTestCases(testCases) {
|
|
|
11163
12045
|
return deduped;
|
|
11164
12046
|
}
|
|
11165
12047
|
function buildMaxCharsRetryInstructions(rejectedPromptLengths, limit) {
|
|
12048
|
+
const longestRejectedPromptText = rejectedPromptLengths.length > 0 ? `${Math.max(...rejectedPromptLengths)} characters` : "unknown length";
|
|
11166
12049
|
return dedent`
|
|
11167
12050
|
Your previous response included ${rejectedPromptLengths.length} generated prompt${rejectedPromptLengths.length === 1 ? "" : "s"} that exceeded the ${limit ?? "configured"}-character limit.
|
|
11168
|
-
The longest rejected prompt was ${
|
|
12051
|
+
The longest rejected prompt was ${longestRejectedPromptText}.
|
|
11169
12052
|
Generate replacement prompts only, and keep every user message within the character limit.
|
|
11170
12053
|
`.trim();
|
|
11171
12054
|
}
|
|
@@ -11243,6 +12126,31 @@ async function fetchRemoteTestCases(key, purpose, injectVar, n, config) {
|
|
|
11243
12126
|
return [];
|
|
11244
12127
|
}
|
|
11245
12128
|
}
|
|
12129
|
+
async function materializeRemoteTestCaseInputs({ config, injectVar, pluginId, provider, purpose, testCases }) {
|
|
12130
|
+
const inputs = config.inputs;
|
|
12131
|
+
if (!inputs || Object.keys(inputs).length === 0) return testCases;
|
|
12132
|
+
return Promise.all(testCases.map(async (testCase, materializationIndex) => {
|
|
12133
|
+
const inputVars = extractInputVarsFromPrompt(String(testCase.vars?.[injectVar] ?? ""), inputs);
|
|
12134
|
+
if (!inputVars) return testCase;
|
|
12135
|
+
const materializedVars = await materializeInputVariablesWithMetadata(inputVars, inputs, {
|
|
12136
|
+
materializationIndex,
|
|
12137
|
+
pluginId,
|
|
12138
|
+
provider,
|
|
12139
|
+
purpose
|
|
12140
|
+
});
|
|
12141
|
+
return {
|
|
12142
|
+
...testCase,
|
|
12143
|
+
vars: {
|
|
12144
|
+
...testCase.vars || {},
|
|
12145
|
+
...materializedVars.vars
|
|
12146
|
+
},
|
|
12147
|
+
metadata: {
|
|
12148
|
+
...testCase.metadata || {},
|
|
12149
|
+
...materializedVars.metadata ? { inputMaterialization: materializedVars.metadata } : {}
|
|
12150
|
+
}
|
|
12151
|
+
};
|
|
12152
|
+
}));
|
|
12153
|
+
}
|
|
11246
12154
|
function createPluginFactory(PluginClass, key, validate) {
|
|
11247
12155
|
return {
|
|
11248
12156
|
key,
|
|
@@ -11253,13 +12161,21 @@ function createPluginFactory(PluginClass, key, validate) {
|
|
|
11253
12161
|
logger.debug(`Using local redteam generation for ${key}`);
|
|
11254
12162
|
return new PluginClass(provider, purpose, injectVar, configWithDefaults).generateTests(n, delayMs);
|
|
11255
12163
|
}
|
|
11256
|
-
const
|
|
12164
|
+
const pluginId = getShortPluginId(key);
|
|
12165
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12166
|
+
config: configWithDefaults ?? {},
|
|
12167
|
+
injectVar,
|
|
12168
|
+
pluginId,
|
|
12169
|
+
provider,
|
|
12170
|
+
purpose,
|
|
12171
|
+
testCases: await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {})
|
|
12172
|
+
});
|
|
11257
12173
|
const computedModifiers = computeModifiersFromConfig(configWithDefaults);
|
|
11258
12174
|
return testCases.map((testCase) => ({
|
|
11259
12175
|
...testCase,
|
|
11260
12176
|
metadata: {
|
|
11261
12177
|
...testCase.metadata,
|
|
11262
|
-
pluginId
|
|
12178
|
+
pluginId,
|
|
11263
12179
|
pluginConfig: {
|
|
11264
12180
|
...configWithDefaults,
|
|
11265
12181
|
modifiers: computedModifiers
|
|
@@ -11316,7 +12232,7 @@ const pluginFactories = [
|
|
|
11316
12232
|
key: category,
|
|
11317
12233
|
action: async (params) => {
|
|
11318
12234
|
if (neverGenerateRemote()) {
|
|
11319
|
-
logger.error(`${category} plugin
|
|
12235
|
+
logger.error(getRemoteGenerationExplicitlyDisabledError(`${category} plugin`));
|
|
11320
12236
|
return [];
|
|
11321
12237
|
}
|
|
11322
12238
|
const testCases = await getHarmfulTests(params, category);
|
|
@@ -11339,13 +12255,21 @@ const piiPlugins = PII_PLUGINS.map((category) => ({
|
|
|
11339
12255
|
key: category,
|
|
11340
12256
|
action: async (params) => {
|
|
11341
12257
|
if (shouldGenerateRemote()) {
|
|
11342
|
-
const
|
|
12258
|
+
const pluginId = getShortPluginId(category);
|
|
12259
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12260
|
+
config: params.config ?? {},
|
|
12261
|
+
injectVar: params.injectVar,
|
|
12262
|
+
pluginId,
|
|
12263
|
+
provider: params.provider,
|
|
12264
|
+
purpose: params.purpose,
|
|
12265
|
+
testCases: await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {})
|
|
12266
|
+
});
|
|
11343
12267
|
const computedModifiers = computeModifiersFromConfig(params.config);
|
|
11344
12268
|
return testCases.map((testCase) => ({
|
|
11345
12269
|
...testCase,
|
|
11346
12270
|
metadata: {
|
|
11347
12271
|
...testCase.metadata,
|
|
11348
|
-
pluginId
|
|
12272
|
+
pluginId,
|
|
11349
12273
|
pluginConfig: {
|
|
11350
12274
|
...params.config,
|
|
11351
12275
|
modifiers: computedModifiers
|
|
@@ -11367,16 +12291,24 @@ const biasPlugins = BIAS_PLUGINS.map((category) => ({
|
|
|
11367
12291
|
key: category,
|
|
11368
12292
|
action: async (params) => {
|
|
11369
12293
|
if (neverGenerateRemote()) {
|
|
11370
|
-
logger.error(`${category} plugin
|
|
12294
|
+
logger.error(getRemoteGenerationExplicitlyDisabledError(`${category} plugin`));
|
|
11371
12295
|
return [];
|
|
11372
12296
|
}
|
|
11373
|
-
const
|
|
12297
|
+
const pluginId = getShortPluginId(category);
|
|
12298
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12299
|
+
config: params.config ?? {},
|
|
12300
|
+
injectVar: params.injectVar,
|
|
12301
|
+
pluginId,
|
|
12302
|
+
provider: params.provider,
|
|
12303
|
+
purpose: params.purpose,
|
|
12304
|
+
testCases: await fetchRemoteTestCases(category, params.purpose, params.injectVar, params.n, params.config ?? {})
|
|
12305
|
+
});
|
|
11374
12306
|
const computedModifiers = computeModifiersFromConfig(params.config);
|
|
11375
12307
|
return testCases.map((testCase) => ({
|
|
11376
12308
|
...testCase,
|
|
11377
12309
|
metadata: {
|
|
11378
12310
|
...testCase.metadata,
|
|
11379
|
-
pluginId
|
|
12311
|
+
pluginId,
|
|
11380
12312
|
pluginConfig: {
|
|
11381
12313
|
...params.config,
|
|
11382
12314
|
modifiers: computedModifiers
|
|
@@ -11389,19 +12321,27 @@ function createRemotePlugin(key, validate) {
|
|
|
11389
12321
|
return {
|
|
11390
12322
|
key,
|
|
11391
12323
|
validate,
|
|
11392
|
-
action: async ({ purpose, injectVar, n, config }) => {
|
|
12324
|
+
action: async ({ provider, purpose, injectVar, n, config }) => {
|
|
11393
12325
|
const configWithDefaults = applyDefaultRemotePluginConfig(key, config);
|
|
11394
12326
|
if (neverGenerateRemote()) {
|
|
11395
|
-
logger.error(`${key} plugin
|
|
12327
|
+
logger.error(getRemoteGenerationExplicitlyDisabledError(`${key} plugin`));
|
|
11396
12328
|
return [];
|
|
11397
12329
|
}
|
|
11398
|
-
const
|
|
12330
|
+
const pluginId = getShortPluginId(key);
|
|
12331
|
+
const testCases = await materializeRemoteTestCaseInputs({
|
|
12332
|
+
config: configWithDefaults ?? {},
|
|
12333
|
+
injectVar,
|
|
12334
|
+
pluginId,
|
|
12335
|
+
provider,
|
|
12336
|
+
purpose,
|
|
12337
|
+
testCases: await fetchRemoteTestCases(key, purpose, injectVar, n, configWithDefaults ?? {})
|
|
12338
|
+
});
|
|
11399
12339
|
const computedModifiers = computeModifiersFromConfig(configWithDefaults);
|
|
11400
12340
|
const testsWithMetadata = testCases.map((testCase) => ({
|
|
11401
12341
|
...testCase,
|
|
11402
12342
|
metadata: {
|
|
11403
12343
|
...testCase.metadata,
|
|
11404
|
-
pluginId
|
|
12344
|
+
pluginId,
|
|
11405
12345
|
pluginConfig: {
|
|
11406
12346
|
...configWithDefaults,
|
|
11407
12347
|
modifiers: computedModifiers
|
|
@@ -11471,6 +12411,37 @@ function getPolicyText(metadata) {
|
|
|
11471
12411
|
return typeof policyObject.text === "string" && policyObject.text.length > 0 ? policyObject.text : void 0;
|
|
11472
12412
|
}
|
|
11473
12413
|
}
|
|
12414
|
+
async function rematerializeStrategyInputVars(testCase, injectVar, provider, purpose, materializationIndex) {
|
|
12415
|
+
const inputs = testCase.metadata?.pluginConfig?.inputs;
|
|
12416
|
+
const inputMaterialization = testCase.metadata?.inputMaterialization;
|
|
12417
|
+
if (!inputs || Object.keys(inputs).length === 0 || !testCase.vars?.[injectVar]) return {
|
|
12418
|
+
inputMaterialization,
|
|
12419
|
+
vars: testCase.vars
|
|
12420
|
+
};
|
|
12421
|
+
try {
|
|
12422
|
+
const materializedVars = await extractMaterializedVariablesFromJsonWithMetadata(JSON.parse(String(testCase.vars[injectVar])), inputs, {
|
|
12423
|
+
materializationIndex,
|
|
12424
|
+
pluginId: String(testCase.metadata?.pluginId || "unknown-plugin"),
|
|
12425
|
+
provider,
|
|
12426
|
+
purpose
|
|
12427
|
+
});
|
|
12428
|
+
return {
|
|
12429
|
+
inputMaterialization: materializedVars.metadata ? {
|
|
12430
|
+
...inputMaterialization,
|
|
12431
|
+
...materializedVars.metadata
|
|
12432
|
+
} : inputMaterialization,
|
|
12433
|
+
vars: {
|
|
12434
|
+
...testCase.vars,
|
|
12435
|
+
...materializedVars.vars
|
|
12436
|
+
}
|
|
12437
|
+
};
|
|
12438
|
+
} catch {
|
|
12439
|
+
return {
|
|
12440
|
+
inputMaterialization,
|
|
12441
|
+
vars: testCase.vars
|
|
12442
|
+
};
|
|
12443
|
+
}
|
|
12444
|
+
}
|
|
11474
12445
|
/**
|
|
11475
12446
|
* Gets the severity level for a plugin based on its ID and configuration.
|
|
11476
12447
|
* @param pluginId - The ID of the plugin.
|
|
@@ -11612,6 +12583,7 @@ const categories = {
|
|
|
11612
12583
|
foundation: FOUNDATION_PLUGINS,
|
|
11613
12584
|
harmful: Object.keys(HARM_PLUGINS),
|
|
11614
12585
|
"coding-agent:core": CODING_AGENT_CORE_PLUGINS,
|
|
12586
|
+
"coding-agent:all": CODING_AGENT_PLUGINS,
|
|
11615
12587
|
bias: BIAS_PLUGINS,
|
|
11616
12588
|
pii: PII_PLUGINS,
|
|
11617
12589
|
medical: MEDICAL_PLUGINS,
|
|
@@ -11706,7 +12678,7 @@ function addLanguageToPluginMetadata(test, lang, plugin, maxCharsPerMessage, tes
|
|
|
11706
12678
|
* @param injectVar - The variable to inject.
|
|
11707
12679
|
* @returns An array of new test cases generated by strategies.
|
|
11708
12680
|
*/
|
|
11709
|
-
async function applyStrategies(testCases, strategies, injectVar, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage) {
|
|
12681
|
+
async function applyStrategies(testCases, strategies, injectVar, provider, purpose, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage) {
|
|
11710
12682
|
const newTestCases = [];
|
|
11711
12683
|
const strategyResults = {};
|
|
11712
12684
|
for (const strategy of strategies) {
|
|
@@ -11762,14 +12734,8 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
11762
12734
|
}
|
|
11763
12735
|
}
|
|
11764
12736
|
resultTestCases = filterOversizedTestCases(resultTestCases, injectVar, `Strategy ${strategy.id}`, maxCharsPerMessage);
|
|
11765
|
-
newTestCases.push(...resultTestCases.map((t) => {
|
|
11766
|
-
const
|
|
11767
|
-
let updatedVars = t.vars;
|
|
11768
|
-
if (inputs && Object.keys(inputs).length > 0 && t.vars?.[injectVar]) try {
|
|
11769
|
-
const parsed = JSON.parse(String(t.vars[injectVar]));
|
|
11770
|
-
updatedVars = { ...t.vars };
|
|
11771
|
-
Object.assign(updatedVars, extractVariablesFromJson(parsed, inputs));
|
|
11772
|
-
} catch {}
|
|
12737
|
+
newTestCases.push(...await Promise.all(resultTestCases.map(async (t, materializationIndex) => {
|
|
12738
|
+
const { inputMaterialization, vars } = await rematerializeStrategyInputVars(t, injectVar, provider, purpose, materializationIndex);
|
|
11773
12739
|
const strategyConfig = {
|
|
11774
12740
|
...strategy.config || {},
|
|
11775
12741
|
...maxCharsPerMessage ? { maxCharsPerMessage } : {},
|
|
@@ -11777,16 +12743,17 @@ async function applyStrategies(testCases, strategies, injectVar, excludeTargetOu
|
|
|
11777
12743
|
};
|
|
11778
12744
|
return {
|
|
11779
12745
|
...t,
|
|
11780
|
-
vars
|
|
12746
|
+
vars,
|
|
11781
12747
|
metadata: {
|
|
11782
12748
|
...t?.metadata || {},
|
|
11783
12749
|
...strategy.id !== "retry" && { strategyId: t?.metadata?.strategyId || strategy.id },
|
|
11784
12750
|
...t?.metadata?.pluginId && { pluginId: t.metadata.pluginId },
|
|
11785
12751
|
...t?.metadata?.pluginConfig && { pluginConfig: t.metadata.pluginConfig },
|
|
12752
|
+
...inputMaterialization && { inputMaterialization },
|
|
11786
12753
|
...Object.keys(strategyConfig).length > 0 && { strategyConfig }
|
|
11787
12754
|
}
|
|
11788
12755
|
};
|
|
11789
|
-
}));
|
|
12756
|
+
})));
|
|
11790
12757
|
const displayId = strategy.id === "layer" && Array.isArray(strategy.config?.steps) ? `layer(${strategy.config.steps.map((st) => typeof st === "string" ? st : st.id).join("→")})` : strategy.id;
|
|
11791
12758
|
const languagesInResults = new Set(strategyTestCases.map((t) => getLanguageForTestCase(t)).filter((lang) => lang !== void 0));
|
|
11792
12759
|
const applyNumTestsCap = (calculatedRequested) => {
|
|
@@ -12241,7 +13208,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
12241
13208
|
targetIds,
|
|
12242
13209
|
...retryStrategy.config
|
|
12243
13210
|
};
|
|
12244
|
-
const { testCases: retryTestCases, strategyResults: retryResults } = await applyStrategies(pluginTestCases, [retryStrategy], injectVar, void 0, maxCharsPerMessage);
|
|
13211
|
+
const { testCases: retryTestCases, strategyResults: retryResults } = await applyStrategies(pluginTestCases, [retryStrategy], injectVar, redteamProvider, purpose, void 0, maxCharsPerMessage);
|
|
12245
13212
|
pluginTestCases.push(...retryTestCases);
|
|
12246
13213
|
Object.assign(strategyResults, retryResults);
|
|
12247
13214
|
if (showProgressBar) progressBar?.increment(retryTestCases.length);
|
|
@@ -12249,7 +13216,7 @@ async function synthesize({ abortSignal, delay, entities: entitiesOverride, inje
|
|
|
12249
13216
|
checkAbort();
|
|
12250
13217
|
const nonBasicStrategies = strategies.filter((s) => !["basic", "retry"].includes(s.id));
|
|
12251
13218
|
if (showProgressBar && nonBasicStrategies.length > 0) progressBar?.update({ task: "Applying strategies" });
|
|
12252
|
-
const { testCases: strategyTestCases, strategyResults: otherStrategyResults } = await applyStrategies(pluginTestCases, nonBasicStrategies, injectVar, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage);
|
|
13219
|
+
const { testCases: strategyTestCases, strategyResults: otherStrategyResults } = await applyStrategies(pluginTestCases, nonBasicStrategies, injectVar, redteamProvider, purpose, excludeTargetOutputFromAgenticAttackGeneration, maxCharsPerMessage);
|
|
12253
13220
|
Object.assign(strategyResults, otherStrategyResults);
|
|
12254
13221
|
if (showProgressBar && strategyTestCases.length > 0) progressBar?.increment(strategyTestCases.length);
|
|
12255
13222
|
const finalTestCases = [...includeBasicTests ? pluginTestCases : [], ...strategyTestCases];
|
|
@@ -13142,6 +14109,10 @@ function stripAuthFromUrl(urlString) {
|
|
|
13142
14109
|
}
|
|
13143
14110
|
}
|
|
13144
14111
|
async function handleEmailCollection(evalRecord) {
|
|
14112
|
+
if (evalRecord.author) {
|
|
14113
|
+
logger.debug(`[Share] Skipping email collection because author is already set`, { evalId: evalRecord.id });
|
|
14114
|
+
return;
|
|
14115
|
+
}
|
|
13145
14116
|
if (!process.stdout.isTTY || isCI() || getEnvBool("PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST")) return;
|
|
13146
14117
|
let email = getUserEmail();
|
|
13147
14118
|
if (!email) {
|
|
@@ -13214,7 +14185,7 @@ function generateTable(evaluateTable, tableCellMaxLength = 250, maxRows = 25) {
|
|
|
13214
14185
|
for (const row of evaluateTable.body.slice(0, maxRows)) table.push([...row.vars.map((v) => ellipsize(v, tableCellMaxLength)), ...row.outputs.map(({ pass, text, failureReason: failureType }) => {
|
|
13215
14186
|
text = ellipsize(text, tableCellMaxLength);
|
|
13216
14187
|
if (pass) return chalk.green("[PASS] ") + text;
|
|
13217
|
-
|
|
14188
|
+
return chalk.red(failureType === ResultFailureReason.ASSERT ? "[FAIL] " : "[ERROR] ") + text.split("---").map((c, idx) => idx === 0 ? chalk.red.bold(c) : c).join("---");
|
|
13218
14189
|
})]);
|
|
13219
14190
|
return table.toString();
|
|
13220
14191
|
}
|
|
@@ -13285,6 +14256,14 @@ function shouldShareResults(opts) {
|
|
|
13285
14256
|
return cloudConfig.isEnabled() && sharing !== false;
|
|
13286
14257
|
}
|
|
13287
14258
|
//#endregion
|
|
14259
|
+
//#region src/commands/eval/redteamWarning.ts
|
|
14260
|
+
function warnIfRedteamConfigHasNoTests(config, testSuite) {
|
|
14261
|
+
if (config.redteam && (!testSuite.tests || testSuite.tests.length === 0) && (!testSuite.scenarios || testSuite.scenarios.length === 0)) logger.warn(chalk.yellow(dedent`
|
|
14262
|
+
Warning: Config file has a redteam section but no test cases.
|
|
14263
|
+
Did you mean to run ${chalk.bold("promptfoo redteam generate")} instead?
|
|
14264
|
+
`));
|
|
14265
|
+
}
|
|
14266
|
+
//#endregion
|
|
13288
14267
|
//#region src/util/formatDuration.ts
|
|
13289
14268
|
/**
|
|
13290
14269
|
* Formats a duration in seconds into a human-readable string
|
|
@@ -13305,6 +14284,115 @@ function formatDuration(seconds) {
|
|
|
13305
14284
|
}
|
|
13306
14285
|
//#endregion
|
|
13307
14286
|
//#region src/commands/eval/summary.ts
|
|
14287
|
+
function getCompletionMessage({ completionType, evalId, shareableUrl, wasAborted, writeToDatabase, activelySharing }) {
|
|
14288
|
+
if (wasAborted) {
|
|
14289
|
+
const idSuffix = writeToDatabase ? ` (ID: ${chalk.cyan(evalId)})` : "";
|
|
14290
|
+
return `${chalk.red("✗")} ${completionType} aborted${idSuffix}`;
|
|
14291
|
+
}
|
|
14292
|
+
if (writeToDatabase && shareableUrl) return `${chalk.green("✓")} ${completionType} complete: ${shareableUrl}`;
|
|
14293
|
+
if (writeToDatabase && activelySharing) return `${chalk.green("✓")} ${completionType} complete`;
|
|
14294
|
+
if (writeToDatabase) return `${chalk.green("✓")} ${completionType} complete (ID: ${chalk.cyan(evalId)})`;
|
|
14295
|
+
return `${chalk.green("✓")} ${completionType} complete`;
|
|
14296
|
+
}
|
|
14297
|
+
function getAbortSummaryLines(targetErrorStatus) {
|
|
14298
|
+
if (targetErrorStatus == null) return [];
|
|
14299
|
+
return [
|
|
14300
|
+
"",
|
|
14301
|
+
chalk.red.bold("Scan stopped: Target is unavailable and will not recover on retry."),
|
|
14302
|
+
chalk.red(` Target returned HTTP ${targetErrorStatus}`),
|
|
14303
|
+
"",
|
|
14304
|
+
chalk.yellow("Possible causes:"),
|
|
14305
|
+
chalk.yellow(" • Invalid API key or authentication (401/403)"),
|
|
14306
|
+
chalk.yellow(" • Target endpoint does not exist (404)"),
|
|
14307
|
+
chalk.yellow(" • Server does not support the request (501)"),
|
|
14308
|
+
"",
|
|
14309
|
+
chalk.cyan("To fix: Check your target configuration and credentials.")
|
|
14310
|
+
];
|
|
14311
|
+
}
|
|
14312
|
+
function getGuidanceLines({ writeToDatabase, shareableUrl, wantsToShare, activelySharing, hasExplicitDisable, cloudEnabled }) {
|
|
14313
|
+
if (!writeToDatabase || shareableUrl || wantsToShare || activelySharing) return [];
|
|
14314
|
+
const lines = ["", `» View results: ${chalk.green.bold("promptfoo view")}`];
|
|
14315
|
+
if (!hasExplicitDisable) lines.push(cloudEnabled ? `» Create shareable URL: ${chalk.green.bold("promptfoo share")}` : `» Share with your team: ${chalk.green.bold("https://promptfoo.app")}`);
|
|
14316
|
+
lines.push(`» Feedback: ${chalk.green.bold("https://promptfoo.dev/feedback")}`);
|
|
14317
|
+
return lines;
|
|
14318
|
+
}
|
|
14319
|
+
function buildUsageDetails(usage, total) {
|
|
14320
|
+
const parts = [];
|
|
14321
|
+
if (usage.prompt && usage.prompt > 0) parts.push(`${usage.prompt.toLocaleString()} prompt`);
|
|
14322
|
+
if (usage.completion && usage.completion > 0) parts.push(`${usage.completion.toLocaleString()} completion`);
|
|
14323
|
+
if (usage.cached && usage.cached > 0) parts.push(usage.cached === total && parts.length === 0 ? "cached" : `${usage.cached.toLocaleString()} cached`);
|
|
14324
|
+
if (usage.completionDetails?.reasoning && usage.completionDetails.reasoning > 0) parts.push(`${usage.completionDetails.reasoning.toLocaleString()} reasoning`);
|
|
14325
|
+
return parts;
|
|
14326
|
+
}
|
|
14327
|
+
/**
 * Build the "Total Tokens" section of the eval summary.
 *
 * Returns an empty array when neither eval tokens nor grading (assertion)
 * tokens were recorded. Otherwise emits the grand total, an optional "Probes"
 * row for red team runs, "Eval" and "Grading" rows with a breakdown, and the
 * per-provider rows produced by `getProviderUsageLines`.
 *
 * @param {object} tokenUsage - Usage record with optional `total`, `prompt`,
 *   `completion`, `cached`, `numRequests`, `completionDetails` and `assertions`.
 * @param {*} isRedteam - Truthy adds the "Probes" row when `numRequests` is set.
 * @param {object} tracker - Passed through unchanged to `getProviderUsageLines`.
 * @returns {string[]} Lines to print.
 */
function getTokenUsageLines(tokenUsage, isRedteam, tracker) {
	const combinedTotal = (tokenUsage.prompt || 0) + (tokenUsage.completion || 0);
	const gradingTotal = tokenUsage.assertions?.total || 0;
	const hasEvalTokens = (tokenUsage.total || 0) > 0 || combinedTotal > 0;
	const hasGradingTokens = gradingTotal > 0;
	if (!hasEvalTokens && !hasGradingTokens) return [];
	const evalTokens = {
		prompt: tokenUsage.prompt || 0,
		completion: tokenUsage.completion || 0,
		// Fall back to prompt + completion when no explicit total was reported.
		total: tokenUsage.total || combinedTotal,
		cached: tokenUsage.cached || 0,
		numRequests: tokenUsage.numRequests || 0,
		completionDetails: tokenUsage.completionDetails || {
			reasoning: 0,
			acceptedPrediction: 0,
			rejectedPrediction: 0
		}
	};
	// A total can exist without any breakdown (e.g. only `total` was reported).
	// Omit the parentheses then instead of printing a dangling " ()".
	const formatBreakdown = (parts) => parts.length > 0 ? ` (${parts.join(", ")})` : "";
	const lines = [`${chalk.bold("Total Tokens:")} ${chalk.white.bold((evalTokens.total + gradingTotal).toLocaleString())}`];
	if (isRedteam && tokenUsage.numRequests) lines.push(` ${chalk.gray("Probes:")} ${chalk.white(tokenUsage.numRequests.toLocaleString())}`);
	if (evalTokens.total > 0) {
		const evalParts = buildUsageDetails(evalTokens, evalTokens.total);
		lines.push(` ${chalk.gray("Eval:")} ${chalk.white(evalTokens.total.toLocaleString())}${formatBreakdown(evalParts)}`);
	}
	if (hasGradingTokens) {
		const gradingParts = buildUsageDetails(tokenUsage.assertions, gradingTotal);
		lines.push(` ${chalk.gray("Grading:")} ${chalk.white(gradingTotal.toLocaleString())}${formatBreakdown(gradingParts)}`);
	}
	lines.push(...getProviderUsageLines(tracker));
	return lines;
}
|
|
14357
|
+
/**
 * Build the per-provider token breakdown ("Providers:" section).
 *
 * The section is only produced when the tracker knows more than one provider
 * and at least one of them has a non-zero token count. Rows are ordered by the
 * same total that is displayed (explicit `total`, falling back to
 * `prompt + completion`), largest first.
 *
 * @param {object} tracker - Object exposing `getProviderIds()` and
 *   `getProviderUsage(id)`; a `null`/`undefined` usage is ignored.
 * @returns {string[]} Lines to print, or an empty array when there is nothing to show.
 */
function getProviderUsageLines(tracker) {
	const providerIds = tracker.getProviderIds();
	if (providerIds.length <= 1) return [];
	const rows = [];
	for (const id of providerIds) {
		const usage = tracker.getProviderUsage(id);
		if (usage == null) continue;
		const displayTotal = usage.total || (usage.prompt || 0) + (usage.completion || 0);
		// Providers without any recorded tokens are not listed.
		if (displayTotal === 0) continue;
		rows.push({ id, usage, displayTotal });
	}
	// Avoid printing a "Providers:" header with nothing beneath it.
	if (rows.length === 0) return [];
	// Sort by the displayed total so providers that only report prompt/completion
	// are ranked by what the user actually sees.
	rows.sort((a, b) => b.displayTotal - a.displayTotal);
	const lines = ["", chalk.bold("Providers:")];
	for (const { id, usage, displayTotal } of rows) {
		// Drop a trailing " (label)" suffix from the id for display.
		const suffixStart = id.indexOf(" (");
		const displayId = suffixStart === -1 ? id : id.slice(0, suffixStart);
		const details = buildUsageDetails(usage, displayTotal);
		const requestInfo = `${usage.numRequests || 0} requests`;
		const separator = details.length > 0 ? "; " : "";
		lines.push(` ${chalk.gray(`${displayId}:`)} ${chalk.white(displayTotal.toLocaleString())} (${requestInfo}${separator}${details.join(", ")})`);
	}
	return lines;
}
|
|
14376
|
+
/**
 * Format `count` as a percentage of `totalTests`.
 *
 * Exactly 0% and exactly 100% are printed without decimals; every other value
 * is printed with two decimals. A `totalTests` of 0 yields "0%".
 *
 * @param {number} count - Number of results in the category.
 * @param {number} totalTests - Number of results overall.
 * @returns {string} Percentage string including the trailing "%".
 */
function formatResultPercentage(count, totalTests) {
	let share = 0;
	if (totalTests !== 0) {
		share = count / totalTests * 100;
	}
	const isWholeBoundary = share === 0 || share === 100;
	const decimals = isWholeBoundary ? 0 : 2;
	return `${share.toFixed(decimals)}%`;
}
|
|
14380
|
+
/**
 * Format one row of the "Results:" section: optional colored icon, bold count,
 * label, and the count's share of `totalTests` in gray parentheses.
 *
 * @param {number} count - Number of results in this category.
 * @param {string} label - Category label printed after the count.
 * @param {string|undefined} icon - Icon to print; falsy omits the icon and its trailing space.
 * @param {Function} iconColor - Called with `icon` to colorize it (only when `icon` is truthy).
 * @param {number} totalTests - Denominator for the percentage.
 * @returns {string} The formatted row.
 */
function formatResultLine(count, label, icon, iconColor, totalTests) {
	const iconPrefix = icon ? `${iconColor(icon)} ` : "";
	const countText = chalk.white.bold(count.toLocaleString());
	const labelText = chalk.white(label);
	const shareText = chalk.gray(`(${formatResultPercentage(count, totalTests)})`);
	return ` ${iconPrefix}${countText} ${labelText} ${shareText}`;
}
|
|
14383
|
+
/**
 * Build the "Results:" section of the eval summary: pass/fail/error rows
 * followed by the run duration and concurrency.
 *
 * @param {object} stats
 * @param {number} stats.successes - Passed count.
 * @param {number} stats.failures - Failed count.
 * @param {number} stats.errors - Error count (label is singular when exactly 1).
 * @param {*} stats.duration - Passed to `formatDuration`.
 * @param {*} stats.maxConcurrency - Printed verbatim.
 * @returns {string[]} Lines to print, wrapped in blank spacer lines.
 */
function getResultsLines({ successes, failures, errors, duration, maxConcurrency }) {
	const totalTests = successes + failures + errors;
	const errorLabel = errors === 1 ? "error" : "errors";
	// An icon is only shown for categories that actually have results.
	const iconWhenPresent = (count, icon) => count > 0 ? icon : void 0;
	const section = [];
	section.push("");
	section.push(chalk.bold("Results:"));
	section.push(formatResultLine(successes, "passed", iconWhenPresent(successes, "✓"), chalk.green, totalTests));
	section.push(formatResultLine(failures, "failed", iconWhenPresent(failures, "✗"), chalk.red, totalTests));
	section.push(formatResultLine(errors, errorLabel, iconWhenPresent(errors, "✗"), chalk.red, totalTests));
	section.push(chalk.gray(`Duration: ${formatDuration(duration)} (concurrency: ${maxConcurrency})`));
	section.push("");
	return section;
}
|
|
13308
14396
|
/**
|
|
13309
14397
|
* Generate formatted evaluation summary output for CLI display.
|
|
13310
14398
|
*
|
|
@@ -13343,115 +14431,28 @@ function formatDuration(seconds) {
|
|
|
13343
14431
|
* ```
|
|
13344
14432
|
*/
|
|
13345
14433
|
function generateEvalSummary(params) {
	// `activelySharing` is optional on the input; both consumers receive a value that is never nullish.
	const activelySharing = params.activelySharing ?? false;
	const summary = [];
	// Headline: completion message, flagged as aborted when a target error status is present.
	summary.push(getCompletionMessage({
		completionType: params.isRedteam ? "Red team" : "Eval",
		evalId: params.evalId,
		shareableUrl: params.shareableUrl,
		wasAborted: params.targetErrorStatus != null,
		writeToDatabase: params.writeToDatabase,
		activelySharing
	}));
	summary.push(...getAbortSummaryLines(params.targetErrorStatus));
	summary.push(...getGuidanceLines({
		writeToDatabase: params.writeToDatabase,
		shareableUrl: params.shareableUrl,
		wantsToShare: params.wantsToShare,
		activelySharing,
		hasExplicitDisable: params.hasExplicitDisable,
		cloudEnabled: params.cloudEnabled
	}));
	// Blank spacer before the token usage section.
	summary.push("");
	summary.push(...getTokenUsageLines(params.tokenUsage, params.isRedteam, params.tracker));
	summary.push(...getResultsLines(params));
	return summary;
}
|
|
13456
14457
|
//#endregion
|
|
13457
14458
|
//#region src/commands/retry.ts
|
|
@@ -13703,14 +14704,11 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13703
14704
|
state.resume = true;
|
|
13704
14705
|
state.retryMode = true;
|
|
13705
14706
|
} else ({config, testSuite, basePath: _basePath, commandLineOptions} = await resolveConfigs(cmdObj, defaultConfig));
|
|
13706
|
-
if (!cmdObj.envPath && commandLineOptions?.envPath) {
|
|
14707
|
+
if ((!cmdObj.envPath || cmdObj.envPath.length === 0) && commandLineOptions?.envPath) {
|
|
13707
14708
|
logger.debug(`Loading additional environment from config: ${commandLineOptions.envPath}`);
|
|
13708
14709
|
setupEnv(commandLineOptions.envPath);
|
|
13709
14710
|
}
|
|
13710
|
-
|
|
13711
|
-
Warning: Config file has a redteam section but no test cases.
|
|
13712
|
-
Did you mean to run ${chalk.bold("promptfoo redteam generate")} instead?
|
|
13713
|
-
`));
|
|
14711
|
+
warnIfRedteamConfigHasNoTests(config, testSuite);
|
|
13714
14712
|
if (config.redteam && Array.isArray(config.providers) && config.providers.length > 0 && typeof config.providers[0] === "object" && config.providers[0].id === "http") {
|
|
13715
14713
|
const maybeUrl = config.providers[0]?.config?.url;
|
|
13716
14714
|
if (typeof maybeUrl === "string" && maybeUrl.includes("promptfoo.app")) telemetry.record("feature_used", { feature: "redteam_run_with_example" });
|
|
@@ -13818,7 +14816,14 @@ async function doEval(cmdObj, defaultConfig, defaultConfigPath, evaluateOptions)
|
|
|
13818
14816
|
${z.prettifyError(testSuiteSchema.error)}
|
|
13819
14817
|
|
|
13820
14818
|
Please review your promptfooconfig.yaml configuration.`));
|
|
13821
|
-
const
|
|
14819
|
+
const author = getAuthor();
|
|
14820
|
+
const evalRecord = resumeEval ? resumeEval : cmdObj.write ? await Eval.create(config, testSuite.prompts, {
|
|
14821
|
+
author,
|
|
14822
|
+
runtimeOptions: options
|
|
14823
|
+
}) : new Eval(config, {
|
|
14824
|
+
author,
|
|
14825
|
+
runtimeOptions: options
|
|
14826
|
+
});
|
|
13822
14827
|
const abortController = new AbortController();
|
|
13823
14828
|
const previousAbortSignal = evaluateOptions.abortSignal;
|
|
13824
14829
|
evaluateOptions.abortSignal = previousAbortSignal ? AbortSignal.any([previousAbortSignal, abortController.signal]) : abortController.signal;
|
|
@@ -14220,65 +15225,175 @@ async function doRedteamRun(options) {
|
|
|
14220
15225
|
return evalResult;
|
|
14221
15226
|
}
|
|
14222
15227
|
//#endregion
|
|
15228
|
+
//#region src/types/transform.ts
|
|
15229
|
+
/**
 * Runtime type guard for `TransformFunction` values.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean} `true` when `typeof value` is "function".
 */
function isTransformFunction(value) {
	const kind = typeof value;
	return kind === "function";
}
|
|
15233
|
+
//#endregion
|
|
14223
15234
|
//#region src/index.ts
|
|
15235
|
+
/**
 * Make a shallow copy of a test case whose `options` object and `assert[]`
 * entries are detached from the input, so resolved ApiProvider instances can be
 * written to `options.provider` / `assert[].provider` without mutating the
 * caller's objects.
 *
 * Why this matters: the input may alias the unified config written to the Eval
 * record, and a live SDK client (e.g. Bedrock's BedrockRuntime, Anthropic's
 * client) holds circular references that break drizzle's JSON serialization on
 * `evalRecord.save()`. Fixes #8687.
 *
 * Only `options` and each element of `assert` are copied (one level deep).
 * Every other reference field (`provider`, `vars`, `metadata`,
 * `providerOutput`) is still shared with the input, so callers must reassign
 * those rather than mutate them in place. Children of `assert-set` assertions
 * are not cloned because the resolve loop skips `assert-set`; extend this
 * helper if that ever changes.
 *
 * @param {object} test - Test case to copy.
 * @returns {object} The detached shallow copy.
 */
function cloneTestForResolve(test) {
	const detachedOptions = test.options ? { options: { ...test.options } } : {};
	const detachedAssertions = test.assert ? { assert: test.assert.map((entry) => ({ ...entry })) } : {};
	return {
		...test,
		...detachedOptions,
		...detachedAssertions
	};
}
|
|
15255
|
+
/**
 * Convert a provider reference into a form suitable for persisting.
 *
 * ApiProvider instances are passed through `sanitizeProvider`; arrays are
 * converted element by element (recursively); anything else is returned as is.
 *
 * @param {*} provider - Provider reference, array of references, or other value.
 * @returns {*} The converted reference.
 */
function toSerializableProviderRef(provider) {
	if (isApiProvider(provider)) {
		return sanitizeProvider(provider);
	}
	if (!Array.isArray(provider)) {
		return provider;
	}
	return provider.map((entry) => toSerializableProviderRef(entry));
}
|
|
15260
|
+
/**
 * Check whether a value is a non-null, non-array object.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean} `true` for objects other than arrays; `false` for
 *   primitives, `null`, `undefined`, arrays and functions.
 */
function isRecord(value) {
	if (!value) return false;
	if (Array.isArray(value)) return false;
	return typeof value === "object";
}
|
|
15263
|
+
/**
 * Return `record` with its `provider` field made persistable.
 *
 * When `record.provider` is an ApiProvider, a shallow copy is returned whose
 * `provider` is the result of `sanitizeProvider`; otherwise the same `record`
 * object is returned untouched.
 *
 * @param {object} record - Object that may carry a `provider` field.
 * @returns {object} The original record or a shallow copy with a sanitized provider.
 */
function withSerializableProvider(record) {
	const { provider } = record;
	return isApiProvider(provider) ? {
		...record,
		provider: sanitizeProvider(provider)
	} : record;
}
|
|
15270
|
+
/**
 * Swap function-valued transform fields for string markers.
 *
 * Functions are valid transforms at runtime, but `JSON.stringify` silently
 * drops them. Persisted eval configs (drizzle-stored) must never retain a
 * function reference, so each field named in `TRANSFORM_KEYS` that holds a
 * function is replaced with an `[inline function]: name` marker (or just the
 * label when the function has no name). Non-function values pass through
 * unchanged.
 *
 * `droppedRef.value` is set to `true` whenever a function is replaced, which
 * lets the caller emit a single warning rather than one per field.
 *
 * @param {object} record - Object whose transform-like fields are inspected.
 * @param {{ value: boolean }} droppedRef - Shared flag, mutated as described above.
 * @returns {object} The same `record` when nothing was replaced, otherwise a shallow copy.
 */
function replaceFunctionTransforms(record, droppedRef) {
	const markers = {};
	let replacedAny = false;
	for (const key of TRANSFORM_KEYS) {
		const candidate = record[key];
		if (isTransformFunction(candidate)) {
			markers[key] = candidate.name ? `${INLINE_FUNCTION_LABEL}: ${candidate.name}` : INLINE_FUNCTION_LABEL;
			replacedAny = true;
			droppedRef.value = true;
		}
	}
	// Keep the original identity when nothing changed so callers can detect it.
	return replacedAny ? {
		...record,
		...markers
	} : record;
}
|
|
15290
|
+
/**
 * Produce a persistable version of an assertion.
 *
 * Non-object inputs are returned unchanged. Otherwise the assertion's provider
 * is sanitized, function-valued transforms are replaced with markers, and any
 * nested `assert` array is converted recursively.
 *
 * @param {*} assertion - Assertion (or arbitrary value) to convert.
 * @param {{ value: boolean }} droppedRef - Flag forwarded to `replaceFunctionTransforms`.
 * @returns {*} The converted assertion.
 */
function toSerializableAssertion(assertion, droppedRef) {
	if (!isRecord(assertion)) {
		return assertion;
	}
	const cleaned = replaceFunctionTransforms(withSerializableProvider(assertion), droppedRef);
	if (!Array.isArray(assertion.assert)) {
		return cleaned;
	}
	const children = assertion.assert.map((child) => toSerializableAssertion(child, droppedRef));
	return {
		...cleaned,
		assert: children
	};
}
|
|
15300
|
+
/**
 * Produce a persistable version of a test case.
 *
 * Non-object inputs are returned unchanged. Otherwise the test's own provider
 * is sanitized, `options` gets its provider sanitized and its function-valued
 * transforms replaced, and each entry of `assert` is converted with
 * `toSerializableAssertion`. The input is never mutated.
 *
 * @param {*} test - Test case (or arbitrary value) to convert.
 * @param {{ value: boolean }} droppedRef - Flag forwarded to `replaceFunctionTransforms`.
 * @returns {*} The converted test case; the same object when nothing needed changing.
 */
function toSerializableTestCase(test, droppedRef) {
	if (!isRecord(test)) {
		return test;
	}
	const base = withSerializableProvider(test);
	const overrides = {};
	if (isRecord(test.options)) {
		const cleanedOptions = replaceFunctionTransforms(withSerializableProvider(test.options), droppedRef);
		// Only override when sanitizing actually produced a different object.
		if (cleanedOptions !== test.options) {
			overrides.options = cleanedOptions;
		}
	}
	if (Array.isArray(test.assert)) {
		overrides.assert = test.assert.map((entry) => toSerializableAssertion(entry, droppedRef));
	}
	if (Object.keys(overrides).length === 0) {
		return base;
	}
	return {
		...base,
		...overrides
	};
}
|
|
15317
|
+
/**
 * Produce a persistable version of a scenario by converting each of its tests.
 *
 * Values that are not objects, or that have no `tests` array, are returned
 * unchanged.
 *
 * @param {*} scenario - Scenario (or arbitrary value) to convert.
 * @param {{ value: boolean }} droppedRef - Flag forwarded to `toSerializableTestCase`.
 * @returns {*} The scenario itself or a shallow copy with converted `tests`.
 */
function toSerializableScenario(scenario, droppedRef) {
	const hasTestList = isRecord(scenario) && Array.isArray(scenario.tests);
	if (!hasTestList) {
		return scenario;
	}
	const tests = scenario.tests.map((entry) => toSerializableTestCase(entry, droppedRef));
	return {
		...scenario,
		tests
	};
}
|
|
15325
|
+
/**
 * Build the config object stored on the Eval record from a test suite.
 *
 * Copies `testSuite`, then replaces `providers`, `defaultTest`, `tests` and
 * `scenarios` with persistable equivalents and sets `prompts` to the supplied
 * value. If any function-valued transform was replaced with a marker and
 * `testSuite.writeLatestResults` is truthy, a single warning is logged.
 *
 * @param {object} testSuite - Source configuration; not mutated.
 * @param {*} prompts - Value stored as the config's `prompts`.
 * @returns {object} The persistable config.
 */
function createSerializableUnifiedConfig(testSuite, prompts) {
	const droppedRef = { value: false };
	const cleanTest = (entry) => toSerializableTestCase(entry, droppedRef);
	const cleanScenario = (entry) => toSerializableScenario(entry, droppedRef);
	const { tests, scenarios } = testSuite;
	const config = {
		...testSuite,
		providers: toSerializableProviderRef(testSuite.providers),
		defaultTest: cleanTest(testSuite.defaultTest),
		tests: Array.isArray(tests) ? tests.map(cleanTest) : tests,
		scenarios: Array.isArray(scenarios) ? scenarios.map(cleanScenario) : scenarios,
		prompts
	};
	const shouldWarn = droppedRef.value && testSuite.writeLatestResults;
	if (shouldWarn) {
		logger.warn("Function-valued transform(s) in testSuite were replaced with \"[inline function]\" markers in the persisted config. Re-running the saved eval will not invoke them; use string expressions or file:// references if you need the config to round-trip.");
	}
	return config;
}
|
|
14224
15338
|
async function evaluate(testSuite, options = {}) {
|
|
14225
|
-
|
|
14226
|
-
|
|
15339
|
+
const { author: suiteAuthor, ...testSuiteConfig } = testSuite;
|
|
15340
|
+
if (testSuiteConfig.writeLatestResults) await runDbMigrations();
|
|
15341
|
+
const loadedProviders = await loadApiProviders(testSuiteConfig.providers, { env: testSuiteConfig.env });
|
|
14227
15342
|
const providerMap = {};
|
|
14228
15343
|
for (const p of loadedProviders) {
|
|
14229
15344
|
providerMap[p.id()] = p;
|
|
14230
15345
|
if (p.label) providerMap[p.label] = p;
|
|
14231
15346
|
}
|
|
14232
|
-
let resolvedDefaultTest =
|
|
14233
|
-
if (typeof
|
|
15347
|
+
let resolvedDefaultTest = testSuiteConfig.defaultTest;
|
|
15348
|
+
if (typeof testSuiteConfig.defaultTest === "string" && testSuiteConfig.defaultTest.startsWith("file://")) resolvedDefaultTest = await maybeLoadFromExternalFile(testSuiteConfig.defaultTest);
|
|
14234
15349
|
const constructedTestSuite = {
|
|
14235
|
-
...
|
|
15350
|
+
...testSuiteConfig,
|
|
14236
15351
|
defaultTest: resolvedDefaultTest,
|
|
14237
|
-
scenarios:
|
|
15352
|
+
scenarios: testSuiteConfig.scenarios,
|
|
14238
15353
|
providers: loadedProviders,
|
|
14239
|
-
tests: await readTests(
|
|
14240
|
-
nunjucksFilters: await readFilters(
|
|
14241
|
-
prompts: await processPrompts(
|
|
14242
|
-
};
|
|
14243
|
-
if (typeof constructedTestSuite.defaultTest === "object") {
|
|
14244
|
-
|
|
14245
|
-
|
|
15354
|
+
tests: await readTests(testSuiteConfig.tests),
|
|
15355
|
+
nunjucksFilters: await readFilters(testSuiteConfig.nunjucksFilters || {}),
|
|
15356
|
+
prompts: await processPrompts(testSuiteConfig.prompts)
|
|
15357
|
+
};
|
|
15358
|
+
if (typeof constructedTestSuite.defaultTest === "object" && constructedTestSuite.defaultTest) {
|
|
15359
|
+
constructedTestSuite.defaultTest = cloneTestForResolve(constructedTestSuite.defaultTest);
|
|
15360
|
+
if (constructedTestSuite.defaultTest.provider && !isApiProvider(constructedTestSuite.defaultTest.provider)) constructedTestSuite.defaultTest.provider = await resolveProvider(constructedTestSuite.defaultTest.provider, providerMap, {
|
|
15361
|
+
env: testSuiteConfig.env,
|
|
14246
15362
|
basePath: state.basePath
|
|
14247
15363
|
});
|
|
14248
|
-
if (constructedTestSuite.defaultTest
|
|
14249
|
-
env:
|
|
15364
|
+
if (constructedTestSuite.defaultTest.options?.provider && !isApiProvider(constructedTestSuite.defaultTest.options.provider)) constructedTestSuite.defaultTest.options.provider = await resolveProvider(constructedTestSuite.defaultTest.options.provider, providerMap, {
|
|
15365
|
+
env: testSuiteConfig.env,
|
|
14250
15366
|
basePath: state.basePath
|
|
14251
15367
|
});
|
|
14252
15368
|
}
|
|
14253
|
-
|
|
15369
|
+
constructedTestSuite.tests = (constructedTestSuite.tests || []).map(cloneTestForResolve);
|
|
15370
|
+
for (const test of constructedTestSuite.tests) {
|
|
14254
15371
|
if (test.options?.provider && !isApiProvider(test.options.provider)) test.options.provider = await resolveProvider(test.options.provider, providerMap, {
|
|
14255
|
-
env:
|
|
15372
|
+
env: testSuiteConfig.env,
|
|
14256
15373
|
basePath: state.basePath
|
|
14257
15374
|
});
|
|
14258
|
-
|
|
15375
|
+
for (const assertion of test.assert || []) {
|
|
14259
15376
|
if (assertion.type === "assert-set" || typeof assertion.provider === "function") continue;
|
|
14260
15377
|
if (assertion.provider && !isApiProvider(assertion.provider)) assertion.provider = await resolveProvider(assertion.provider, providerMap, {
|
|
14261
|
-
env:
|
|
15378
|
+
env: testSuiteConfig.env,
|
|
14262
15379
|
basePath: state.basePath
|
|
14263
15380
|
});
|
|
14264
15381
|
}
|
|
14265
15382
|
}
|
|
14266
15383
|
if (options.cache === false) disableCache();
|
|
14267
|
-
const parsedProviderPromptMap = readProviderPromptMap(
|
|
14268
|
-
const unifiedConfig =
|
|
14269
|
-
|
|
14270
|
-
|
|
14271
|
-
};
|
|
14272
|
-
const evalRecord = testSuite.writeLatestResults ? await Eval.create(unifiedConfig, constructedTestSuite.prompts) : new Eval(unifiedConfig);
|
|
15384
|
+
const parsedProviderPromptMap = readProviderPromptMap(testSuiteConfig, constructedTestSuite.prompts);
|
|
15385
|
+
const unifiedConfig = createSerializableUnifiedConfig(testSuiteConfig, constructedTestSuite.prompts);
|
|
15386
|
+
const author = getAuthor(suiteAuthor);
|
|
15387
|
+
const evalRecord = testSuiteConfig.writeLatestResults ? await Eval.create(unifiedConfig, constructedTestSuite.prompts, { author }) : new Eval(unifiedConfig, { author });
|
|
14273
15388
|
const ret = await evaluate$1({
|
|
14274
15389
|
...constructedTestSuite,
|
|
14275
15390
|
providerPromptMap: parsedProviderPromptMap
|
|
14276
15391
|
}, evalRecord, {
|
|
14277
15392
|
eventSource: "library",
|
|
14278
|
-
isRedteam: Boolean(
|
|
15393
|
+
isRedteam: Boolean(testSuiteConfig.redteam),
|
|
14279
15394
|
...options
|
|
14280
15395
|
});
|
|
14281
|
-
if (
|
|
15396
|
+
if (testSuiteConfig.writeLatestResults && testSuiteConfig.sharing) if (isSharingEnabled(ret)) try {
|
|
14282
15397
|
const shareableUrl = await createShareableUrl(ret, { silent: true });
|
|
14283
15398
|
if (shareableUrl) {
|
|
14284
15399
|
ret.shareableUrl = shareableUrl;
|
|
@@ -14289,9 +15404,9 @@ async function evaluate(testSuite, options = {}) {
|
|
|
14289
15404
|
logger.warn(`Failed to create shareable URL: ${error}`);
|
|
14290
15405
|
}
|
|
14291
15406
|
else logger.debug("Sharing requested but not enabled (check cloud config or sharing settings)");
|
|
14292
|
-
if (
|
|
14293
|
-
if (typeof
|
|
14294
|
-
else if (Array.isArray(
|
|
15407
|
+
if (testSuiteConfig.outputPath) {
|
|
15408
|
+
if (typeof testSuiteConfig.outputPath === "string") await writeOutput(testSuiteConfig.outputPath, evalRecord, null);
|
|
15409
|
+
else if (Array.isArray(testSuiteConfig.outputPath)) await writeMultipleOutputs(testSuiteConfig.outputPath, evalRecord, null);
|
|
14295
15410
|
}
|
|
14296
15411
|
return ret;
|
|
14297
15412
|
}
|
|
@@ -14320,6 +15435,6 @@ var src_default = {
|
|
|
14320
15435
|
redteam
|
|
14321
15436
|
};
|
|
14322
15437
|
//#endregion
|
|
14323
|
-
export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, cache_exports as cache, src_default as default, evaluate, generateTable, guardrails, isApiProvider, isGradingResult, isProviderOptions, isResultFailureReason, loadApiProvider, redteam };
|
|
15438
|
+
export { AssertionOrSetSchema, AssertionSchema, AssertionSetSchema, AssertionTypeSchema, AtomicTestCaseSchema, BaseAssertionTypesSchema, BaseTokenUsageSchema, CommandLineOptionsSchema, CompletedPromptSchema, CompletionTokenDetailsSchema, ConversationMessageSchema, DerivedMetricSchema, DocumentMediaInjectionPlacementSchema, DocumentMediaInjectionPlacementValues, DocxInjectionPlacementSchema, DocxInjectionPlacementValues, EvalResultsFilterMode, EvaluateOptionsSchema, GradingConfigSchema, InputConfigSchema, InputDefinitionObjectSchema, InputDefinitionSchema, InputTypeSchema, InputTypeValues, InputsSchema, NotPrefixedAssertionTypesSchema, OutputConfigSchema, OutputFileExtension, PartialGenerationError, PluginConfigSchema, PolicyObjectSchema, ProvidersSchema, ResultFailureReason, ScenarioSchema, SpecialAssertionTypesSchema, StrategyConfigSchema, TestCaseSchema, TestCaseWithVarsFileSchema, TestCasesWithMetadataPromptSchema, TestCasesWithMetadataSchema, TestGeneratorConfigSchema, TestSuiteConfigSchema, TestSuiteSchema, UnifiedConfigSchema, VarsSchema, assertions_default as assertions, buildInputPromptDescription, cache_exports as cache, src_default as default, evaluate, generateTable, getInputDescription, getInputType, guardrails, isApiProvider, isGradingResult, isProviderOptions, isResultFailureReason, isTransformFunction, loadApiProvider, normalizeInputDefinition, normalizeInputs, redteam };
|
|
14324
15439
|
|
|
14325
15440
|
//# sourceMappingURL=index.js.map
|